{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random forest classifier results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import model_selection, preprocessing\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score,\n",
    "    classification_report,\n",
    "    confusion_matrix,\n",
    ")\n",
    "from sklearn.tree import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data\n",
    "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of turbines to plot\n",
    "list1 = [1]\n",
    "# list1 = list(df['turbine_id'].unique())\n",
    "# sort turbines in ascending order\n",
    "# list1 = sorted(list1, key=int)\n",
    "# list of categories to plot\n",
    "list2 = [11]\n",
    "# list2 = list(df1['TurbineCategory_id'].unique())\n",
    "# sort categories in ascending order\n",
    "# list2 = sorted(list2, key=int)\n",
    "# categories to remove from plot\n",
    "# list2 = [e for e in list2 if e not in (1, 12, 13, 15, 21, 22)]\n",
    "list3 = list(itertools.product(list1, list2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.899764844209\n",
      "[[  358     3     5     0     2     0     0     1     1    42]\n",
      " [    7    45    13     1     0     0     0     0     0   228]\n",
      " [    5     9    46     4     0     0     0     0     1   166]\n",
      " [    8     4     9    36     3     0     1     2     1   132]\n",
      " [    0     1     4     2    19     1     0     0     1   125]\n",
      " [    2     1     0     0     2    31     2     1     1   118]\n",
      " [    3     0     0     1     0     3    22     3     0    98]\n",
      " [    2     0     1     2     0     1     0    16     4   121]\n",
      " [    7     1     0     1     1     1     1     5    14   111]\n",
      " [    5     9   387     6     2     1     4     4    11 14718]]\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.90      0.87      0.89       412\n",
      "          6       0.62      0.15      0.25       294\n",
      "         12       0.10      0.20      0.13       231\n",
      "         18       0.68      0.18      0.29       196\n",
      "         24       0.66      0.12      0.21       153\n",
      "         30       0.82      0.20      0.32       158\n",
      "         36       0.73      0.17      0.28       130\n",
      "         42       0.50      0.11      0.18       147\n",
      "         48       0.41      0.10      0.16       142\n",
      "       9999       0.93      0.97      0.95     15147\n",
      "\n",
      "avg / total       0.89      0.90      0.89     17010\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for x, y in list3:\n",
    "    # filter only data for turbine x\n",
    "    dfx = df[(df[\"turbine_id\"] == x)].copy()\n",
    "\n",
    "    # sort values by timestamp in descending order\n",
    "    dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n",
    "\n",
    "    # copying fault to new column (mins) (fault when turbine category id is y)\n",
    "    def f(c):\n",
    "        if c[\"TurbineCategory_id\"] == y:\n",
    "            return 0\n",
    "        else:\n",
    "            return 1\n",
    "\n",
    "    dfx[\"mins\"] = dfx.apply(f, axis=1)\n",
    "\n",
    "    # reset index\n",
    "    dfx.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    # assigning value to first cell if it's not 0\n",
    "    if dfx.loc[0, \"mins\"] == 0:\n",
    "        dfx.set_value(0, \"mins\", 0)\n",
    "    else:\n",
    "        dfx.set_value(0, \"mins\", 999999999)\n",
    "\n",
    "    # using previous value's row to evaluate time\n",
    "    for i, e in enumerate(dfx[\"mins\"]):\n",
    "        if e == 1:\n",
    "            dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n",
    "\n",
    "    # sort in ascending order\n",
    "    dfx = dfx.sort_values(by=\"timestamp\")\n",
    "\n",
    "    # reset index\n",
    "    dfx.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    # convert to hours and round to nearest hour\n",
    "    dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n",
    "    dfx[\"hours\"] = dfx[\"hours\"] / 60\n",
    "    dfx[\"hours\"] = round(dfx[\"hours\"])\n",
    "    dfx[\"hours\"] = dfx[\"hours\"].astype(np.int64)\n",
    "\n",
    "    # > 48 hours - label as normal (9999)\n",
    "    def f1(c):\n",
    "        if c[\"hours\"] > 48:\n",
    "            return 9999\n",
    "        else:\n",
    "            return c[\"hours\"]\n",
    "\n",
    "    dfx[\"hours\"] = dfx.apply(f1, axis=1)\n",
    "\n",
    "    # filter out curtailment - curtailed when turbine is pitching outside\n",
    "    # 0deg <= normal <= 3.5deg\n",
    "    def f2(c):\n",
    "        if (\n",
    "            0 <= c[\"pitch\"] <= 3.5\n",
    "            or c[\"hours\"] != 9999\n",
    "            or (\n",
    "                (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n",
    "                and (\n",
    "                    c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n",
    "                    or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n",
    "                )\n",
    "            )\n",
    "        ):\n",
    "            return \"normal\"\n",
    "        else:\n",
    "            return \"curtailed\"\n",
    "\n",
    "    dfx[\"curtailment\"] = dfx.apply(f2, axis=1)\n",
    "\n",
    "    # filter unusual readings, i.e., for normal operation, power <= 0 in\n",
    "    # operating wind speeds, power > 100 before cut-in, runtime < 600\n",
    "    def f3(c):\n",
    "        if c[\"hours\"] == 9999 and (\n",
    "            (\n",
    "                3 < c[\"ws_av\"] < 25\n",
    "                and (\n",
    "                    c[\"ap_av\"] <= 0\n",
    "                    or c[\"runtime\"] < 600\n",
    "                    or c[\"EnvironmentalCategory_id\"] > 1\n",
    "                    or c[\"GridCategory_id\"] > 1\n",
    "                    or c[\"InfrastructureCategory_id\"] > 1\n",
    "                    or c[\"AvailabilityCategory_id\"] == 2\n",
    "                    or 12 <= c[\"TurbineCategory_id\"] <= 15\n",
    "                    or 21 <= c[\"TurbineCategory_id\"] <= 22\n",
    "                )\n",
    "            )\n",
    "            or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n",
    "        ):\n",
    "            # remove unusual readings, i.e., zero power at operating wind\n",
    "            # speeds, power > 0 before cut-in ...\n",
    "            return \"unusual\"\n",
    "        else:\n",
    "            return \"normal\"\n",
    "\n",
    "    dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n",
    "\n",
    "    def f4(c):\n",
    "        if 1 <= c[\"hours\"] <= 6:\n",
    "            return 6\n",
    "        elif 7 <= c[\"hours\"] <= 12:\n",
    "            return 12\n",
    "        elif 13 <= c[\"hours\"] <= 18:\n",
    "            return 18\n",
    "        elif 19 <= c[\"hours\"] <= 24:\n",
    "            return 24\n",
    "        elif 25 <= c[\"hours\"] <= 30:\n",
    "            return 30\n",
    "        elif 31 <= c[\"hours\"] <= 36:\n",
    "            return 36\n",
    "        elif 37 <= c[\"hours\"] <= 42:\n",
    "            return 42\n",
    "        elif 43 <= c[\"hours\"] <= 48:\n",
    "            return 48\n",
    "        else:\n",
    "            return c[\"hours\"]\n",
    "\n",
    "    dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n",
    "\n",
    "    # filter data\n",
    "    # normal w/o curtailment\n",
    "    df3 = dfx[dfx.curtailment == \"normal\"]\n",
    "    # normal w/o curtailment and unusual readings\n",
    "    df3 = df3[df3.unusual == \"normal\"]\n",
    "\n",
    "    df4 = df3[\n",
    "        [\n",
    "            \"ap_av\",\n",
    "            \"ws_av\",\n",
    "            \"wd_av\",\n",
    "            \"pitch\",\n",
    "            \"ap_max\",\n",
    "            \"ap_dev\",\n",
    "            \"reactive_power\",\n",
    "            \"rs_av\",\n",
    "            \"gen_sp\",\n",
    "            \"nac_pos\",\n",
    "            \"hours6\",\n",
    "        ]\n",
    "    ].copy()\n",
    "    df4 = df4.dropna()\n",
    "\n",
    "    # splitting data set\n",
    "    features = [\n",
    "        \"ap_av\",\n",
    "        \"ws_av\",\n",
    "        \"wd_av\",\n",
    "        \"pitch\",\n",
    "        \"ap_max\",\n",
    "        \"ap_dev\",\n",
    "        \"reactive_power\",\n",
    "        \"rs_av\",\n",
    "        \"gen_sp\",\n",
    "        \"nac_pos\",\n",
    "    ]\n",
    "    X = df4[features]\n",
    "    Y = df4[\"hours6\"]\n",
    "    Xn = preprocessing.normalize(X)\n",
    "    validation_size = 0.20\n",
    "    seed = 7\n",
    "    X_train, X_validation, Y_train, Y_validation = (\n",
    "        model_selection.train_test_split(\n",
    "            Xn, Y, test_size=validation_size, random_state=seed\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # fit using gini criterion\n",
    "    clf = RandomForestClassifier(class_weight=\"balanced\")\n",
    "    clf = clf.fit(X_train, Y_train)\n",
    "\n",
    "    predictions = clf.predict(X_validation)\n",
    "    print(accuracy_score(Y_validation, predictions))\n",
    "    print(confusion_matrix(Y_validation, predictions))\n",
    "    print(classification_report(Y_validation, predictions))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}