{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Decision tree classifier results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import subprocess\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import model_selection, preprocessing\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score,\n",
    "    classification_report,\n",
    "    confusion_matrix,\n",
    ")\n",
    "from sklearn.tree import DecisionTreeClassifier, export_graphviz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data\n",
    "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of turbines to plot\n",
    "list1 = [1]\n",
    "# list1 = list(df['turbine_id'].unique())\n",
    "# sort turbines in ascending order\n",
    "# list1 = sorted(list1, key=int)\n",
    "# list of categories to plot\n",
    "list2 = [11]\n",
    "# list2 = list(df1['TurbineCategory_id'].unique())\n",
    "# sort categories in ascending order\n",
    "# list2 = sorted(list2, key=int)\n",
    "# categories to remove from plot\n",
    "# list2 = [e for e in list2 if e not in (1, 12, 13, 15, 21, 22)]\n",
    "list3 = list(itertools.product(list1, list2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.849559082892\n",
      "[[  341     7     7     2     2     1     2     2     4    44]\n",
      " [    9    75    19     5     3     2     3     1     0   177]\n",
      " [    3    17    50    12     5     7     1     3     2   131]\n",
      " [    3     7    11    49     6     2     4     3     3   108]\n",
      " [    0     7     6    13    25     2     0     1     2    97]\n",
      " [    2     2     6     3     3    42     5     0     3    92]\n",
      " [    2     2     0     3     1     6    25     3     1    87]\n",
      " [    1     1     1     5     0     8     1    28     7    95]\n",
      " [    6     1     0     4     4     3     2     5    22    95]\n",
      " [   51   169   515   140   115    83   114    76    90 13794]]\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.82      0.83      0.82       412\n",
      "          6       0.26      0.26      0.26       294\n",
      "         12       0.08      0.22      0.12       231\n",
      "         18       0.21      0.25      0.23       196\n",
      "         24       0.15      0.16      0.16       153\n",
      "         30       0.27      0.27      0.27       158\n",
      "         36       0.16      0.19      0.17       130\n",
      "         42       0.23      0.19      0.21       147\n",
      "         48       0.16      0.15      0.16       142\n",
      "       9999       0.94      0.91      0.92     15147\n",
      "\n",
      "avg / total       0.87      0.85      0.86     17010\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for x, y in list3:\n",
    "    # filter only data for turbine x\n",
    "    dfx = df[(df[\"turbine_id\"] == x)].copy()\n",
    "\n",
    "    # sort values by timestamp in descending order\n",
    "    dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n",
    "\n",
    "    # copying fault to new column (mins) (fault when turbine category id is y)\n",
    "    def f(c):\n",
    "        if c[\"TurbineCategory_id\"] == y:\n",
    "            return 0\n",
    "        else:\n",
    "            return 1\n",
    "\n",
    "    dfx[\"mins\"] = dfx.apply(f, axis=1)\n",
    "\n",
    "    # reset index\n",
    "    dfx.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    # assigning value to first cell if it's not 0\n",
    "    if dfx.loc[0, \"mins\"] == 0:\n",
    "        dfx.set_value(0, \"mins\", 0)\n",
    "    else:\n",
    "        dfx.set_value(0, \"mins\", 999999999)\n",
    "\n",
    "    # using previous value's row to evaluate time\n",
    "    for i, e in enumerate(dfx[\"mins\"]):\n",
    "        if e == 1:\n",
    "            dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n",
    "\n",
    "    # sort in ascending order\n",
    "    dfx = dfx.sort_values(by=\"timestamp\")\n",
    "\n",
    "    # reset index\n",
    "    dfx.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    # convert to hours and round to nearest hour\n",
    "    dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n",
    "    dfx[\"hours\"] = dfx[\"hours\"] / 60\n",
    "    dfx[\"hours\"] = round(dfx[\"hours\"])\n",
    "    dfx[\"hours\"] = dfx[\"hours\"].astype(np.int64)\n",
    "\n",
    "    # > 48 hours - label as normal (9999)\n",
    "    def f1(c):\n",
    "        if c[\"hours\"] > 48:\n",
    "            return 9999\n",
    "        else:\n",
    "            return c[\"hours\"]\n",
    "\n",
    "    dfx[\"hours\"] = dfx.apply(f1, axis=1)\n",
    "\n",
    "    # filter out curtailment - curtailed when turbine is pitching outside\n",
    "    # 0deg <= normal <= 3.5deg\n",
    "    def f2(c):\n",
    "        if (\n",
    "            0 <= c[\"pitch\"] <= 3.5\n",
    "            or c[\"hours\"] != 9999\n",
    "            or (\n",
    "                (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n",
    "                and (\n",
    "                    c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n",
    "                    or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n",
    "                )\n",
    "            )\n",
    "        ):\n",
    "            return \"normal\"\n",
    "        else:\n",
    "            return \"curtailed\"\n",
    "\n",
    "    dfx[\"curtailment\"] = dfx.apply(f2, axis=1)\n",
    "\n",
    "    # filter unusual readings, i.e., for normal operation, power <= 0 in\n",
    "    # operating wind speeds, power > 100 before cut-in, runtime < 600\n",
    "    def f3(c):\n",
    "        if c[\"hours\"] == 9999 and (\n",
    "            (\n",
    "                3 < c[\"ws_av\"] < 25\n",
    "                and (\n",
    "                    c[\"ap_av\"] <= 0\n",
    "                    or c[\"runtime\"] < 600\n",
    "                    or c[\"EnvironmentalCategory_id\"] > 1\n",
    "                    or c[\"GridCategory_id\"] > 1\n",
    "                    or c[\"InfrastructureCategory_id\"] > 1\n",
    "                    or c[\"AvailabilityCategory_id\"] == 2\n",
    "                    or 12 <= c[\"TurbineCategory_id\"] <= 15\n",
    "                    or 21 <= c[\"TurbineCategory_id\"] <= 22\n",
    "                )\n",
    "            )\n",
    "            or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n",
    "        ):\n",
    "            # remove unusual readings, i.e., zero power at operating wind\n",
    "            # speeds, power > 0 before cut-in ...\n",
    "            return \"unusual\"\n",
    "        else:\n",
    "            return \"normal\"\n",
    "\n",
    "    dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n",
    "\n",
    "    def f4(c):\n",
    "        if 1 <= c[\"hours\"] <= 6:\n",
    "            return 6\n",
    "        elif 7 <= c[\"hours\"] <= 12:\n",
    "            return 12\n",
    "        elif 13 <= c[\"hours\"] <= 18:\n",
    "            return 18\n",
    "        elif 19 <= c[\"hours\"] <= 24:\n",
    "            return 24\n",
    "        elif 25 <= c[\"hours\"] <= 30:\n",
    "            return 30\n",
    "        elif 31 <= c[\"hours\"] <= 36:\n",
    "            return 36\n",
    "        elif 37 <= c[\"hours\"] <= 42:\n",
    "            return 42\n",
    "        elif 43 <= c[\"hours\"] <= 48:\n",
    "            return 48\n",
    "        else:\n",
    "            return c[\"hours\"]\n",
    "\n",
    "    dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n",
    "\n",
    "    # filter data\n",
    "    # normal w/o curtailment\n",
    "    df3 = dfx[dfx.curtailment == \"normal\"]\n",
    "    # normal w/o curtailment and unusual readings\n",
    "    df3 = df3[df3.unusual == \"normal\"]\n",
    "\n",
    "    df4 = df3[\n",
    "        [\n",
    "            \"ap_av\",\n",
    "            \"ws_av\",\n",
    "            \"wd_av\",\n",
    "            \"pitch\",\n",
    "            \"ap_max\",\n",
    "            \"ap_dev\",\n",
    "            \"reactive_power\",\n",
    "            \"rs_av\",\n",
    "            \"gen_sp\",\n",
    "            \"nac_pos\",\n",
    "            \"hours6\",\n",
    "        ]\n",
    "    ].copy()\n",
    "    df4 = df4.dropna()\n",
    "\n",
    "    # splitting data set\n",
    "    features = [\n",
    "        \"ap_av\",\n",
    "        \"ws_av\",\n",
    "        \"wd_av\",\n",
    "        \"pitch\",\n",
    "        \"ap_max\",\n",
    "        \"ap_dev\",\n",
    "        \"reactive_power\",\n",
    "        \"rs_av\",\n",
    "        \"gen_sp\",\n",
    "        \"nac_pos\",\n",
    "    ]\n",
    "    X = df4[features]\n",
    "    Y = df4[\"hours6\"]\n",
    "    Xn = preprocessing.normalize(X)\n",
    "    validation_size = 0.20\n",
    "    seed = 7\n",
    "    X_train, X_validation, Y_train, Y_validation = (\n",
    "        model_selection.train_test_split(\n",
    "            Xn, Y, test_size=validation_size, random_state=seed\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # fit using gini criterion\n",
    "    clf = DecisionTreeClassifier(class_weight=\"balanced\")\n",
    "    clf = clf.fit(X_train, Y_train)\n",
    "\n",
    "    predictions = clf.predict(X_validation)\n",
    "    print(accuracy_score(Y_validation, predictions))\n",
    "    print(confusion_matrix(Y_validation, predictions))\n",
    "    print(classification_report(Y_validation, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([9999, 9999, 9999, ..., 9999,   12, 9999], dtype=int64)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# after being fitted, the model can be used to predict the class of samples\n",
    "clf.predict(X_validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],\n",
       "       [ 0. ,  0. ,  0.1, ...,  0. ,  0. ,  0.9],\n",
       "       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],\n",
       "       ..., \n",
       "       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],\n",
       "       [ 0.3,  0.3,  0.4, ...,  0. ,  0. ,  0. ],\n",
       "       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternatively, the probability of each class can be predicted, which is the\n",
    "# fraction of training samples of the same class in a leaf\n",
    "clf.predict_proba(X_validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# using entropy\n",
    "clf1 = DecisionTreeClassifier(criterion=\"entropy\", class_weight=\"balanced\")\n",
    "clf1 = clf1.fit(X_train, Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.858083480306\n",
      "[[  342     4     9     6     3     1     0     3     2    42]\n",
      " [    9    67    20     5     4     2     5     3     3   176]\n",
      " [    3     8    50     8     7     5     2     2     5   141]\n",
      " [    9     3    11    54     7     5     2     2     1   102]\n",
      " [    0     1     7     1    33     7     3     0     2    99]\n",
      " [    2     2     2     5     2    42     5     0     1    97]\n",
      " [    3     3     2     2     2     3    33     5     1    76]\n",
      " [    1     1     0     2     1     1     2    34    12    93]\n",
      " [    6     3     1     0     5     2     5     2    37    81]\n",
      " [   31   172   499   104    94    91    92    86    74 13904]]\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.84      0.83      0.84       412\n",
      "          6       0.25      0.23      0.24       294\n",
      "         12       0.08      0.22      0.12       231\n",
      "         18       0.29      0.28      0.28       196\n",
      "         24       0.21      0.22      0.21       153\n",
      "         30       0.26      0.27      0.26       158\n",
      "         36       0.22      0.25      0.24       130\n",
      "         42       0.25      0.23      0.24       147\n",
      "         48       0.27      0.26      0.26       142\n",
      "       9999       0.94      0.92      0.93     15147\n",
      "\n",
      "avg / total       0.88      0.86      0.87     17010\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions1 = clf1.predict(X_validation)\n",
    "print(accuracy_score(Y_validation, predictions1))\n",
    "print(confusion_matrix(Y_validation, predictions1))\n",
    "print(classification_report(Y_validation, predictions1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# fault\n",
    "df4[\"fault\"] = df4[\"hours6\"].astype(str)\n",
    "\n",
    "\n",
    "def fx(c):\n",
    "    if c[\"fault\"] == \"9999\":\n",
    "        return \"normal\"\n",
    "    elif c[\"fault\"] == \"0\":\n",
    "        return \"faulty\"\n",
    "    else:\n",
    "        return \"up to \" + c[\"fault\"] + \" hr(s) before fault\"\n",
    "\n",
    "\n",
    "df4[\"fault\"] = df4.apply(fx, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# visualising\n",
    "\n",
    "\n",
    "def visualise_tree(tree, feature_names):\n",
    "    \"\"\"Create tree svg using graphviz.\n",
    "\n",
    "    Args\n",
    "    ----\n",
    "    tree -- scikit-learn DecisionTree.\n",
    "    feature_names -- list of feature names.\n",
    "    \"\"\"\n",
    "    with open(\"df.dot\", \"w\") as f:\n",
    "        export_graphviz(\n",
    "            tree,\n",
    "            out_file=f,\n",
    "            filled=True,\n",
    "            rounded=True,\n",
    "            feature_names=feature_names,\n",
    "            special_characters=True,\n",
    "            class_names=list(df4[\"fault\"].unique()),\n",
    "        )\n",
    "\n",
    "    command = [\"dot\", \"-Tpng\", \"df.dot\", \"-o\", \"df.png\"]\n",
    "    try:\n",
    "        subprocess.check_call(command)\n",
    "    except:\n",
    "        exit(\"Could not run dot, i.e., graphviz, to produce visualisation\")\n",
    "\n",
    "\n",
    "visualise_tree(clf1, features)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}