{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Decision tree classifier results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "import subprocess\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn import model_selection, preprocessing\n", "from sklearn.metrics import (\n", " accuracy_score,\n", " classification_report,\n", " confusion_matrix,\n", ")\n", "from sklearn.tree import DecisionTreeClassifier, export_graphviz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import data\n", "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# list of turbines to plot\n", "list1 = [1]\n", "# list1 = list(df['turbine_id'].unique())\n", "# sort turbines in ascending order\n", "# list1 = sorted(list1, key=int)\n", "# list of categories to plot\n", "list2 = [11]\n", "# list2 = list(df1['TurbineCategory_id'].unique())\n", "# sort categories in ascending order\n", "# list2 = sorted(list2, key=int)\n", "# categories to remove from plot\n", "# list2 = [e for e in list2 if e not in (1, 12, 13, 15, 21, 22)]\n", "list3 = list(itertools.product(list1, list2))" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.849559082892\n", "[[ 341 7 7 2 2 1 2 2 4 44]\n", " [ 9 75 19 5 3 2 3 1 0 177]\n", " [ 3 17 50 12 5 7 1 3 2 131]\n", " [ 3 7 11 49 6 2 4 3 3 108]\n", " [ 0 7 6 13 25 2 0 1 2 97]\n", " [ 2 2 6 3 3 42 5 0 3 92]\n", " [ 2 2 0 3 1 6 25 3 1 87]\n", " [ 1 1 1 5 0 8 1 28 7 95]\n", " [ 6 1 0 4 4 3 2 5 22 95]\n", " [ 51 169 515 140 115 83 114 76 90 13794]]\n", " precision recall f1-score support\n", "\n", " 0 0.82 0.83 0.82 412\n", " 6 0.26 0.26 0.26 294\n", " 12 0.08 0.22 0.12 231\n", " 18 0.21 0.25 0.23 196\n", " 24 0.15 0.16 0.16 153\n", " 30 0.27 0.27 0.27 158\n", " 36 0.16 0.19 0.17 130\n", " 42 0.23 0.19 0.21 147\n", " 48 0.16 0.15 0.16 142\n", " 9999 0.94 0.91 0.92 15147\n", "\n", "avg / total 0.87 0.85 0.86 17010\n", "\n" ] } ], "source": [ "for x, y in list3:\n", " # filter only data for turbine x\n", " dfx = df[(df[\"turbine_id\"] == x)].copy()\n", "\n", " # sort values by timestamp in descending order\n", " dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n", "\n", " # copying fault to new column (mins) (fault when turbine category id is y)\n", " def f(c):\n", " if c[\"TurbineCategory_id\"] == y:\n", " return 0\n", " else:\n", " return 1\n", "\n", " dfx[\"mins\"] = dfx.apply(f, axis=1)\n", "\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", "\n", " # assigning value to first cell if it's not 0\n", " if dfx.loc[0, \"mins\"] == 0:\n", " dfx.set_value(0, \"mins\", 0)\n", " else:\n", " dfx.set_value(0, \"mins\", 999999999)\n", "\n", " # using previous value's row to evaluate time\n", " for i, e in enumerate(dfx[\"mins\"]):\n", " if e == 1:\n", " dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n", "\n", " # sort in ascending order\n", " dfx = dfx.sort_values(by=\"timestamp\")\n", "\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", "\n", " # convert to hours and round to nearest hour\n", " dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n", " dfx[\"hours\"] = dfx[\"hours\"] / 60\n", " dfx[\"hours\"] = round(dfx[\"hours\"])\n", " dfx[\"hours\"] = dfx[\"hours\"].astype(np.int64)\n", "\n", " # > 48 hours - label as normal (9999)\n", " def f1(c):\n", " if c[\"hours\"] > 48:\n", " return 9999\n", " else:\n", " return c[\"hours\"]\n", "\n", " dfx[\"hours\"] = dfx.apply(f1, axis=1)\n", "\n", " # filter out curtailment - curtailed when turbine is pitching outside\n", " # 0deg <= normal <= 3.5deg\n", " def f2(c):\n", " if (\n", " 0 <= c[\"pitch\"] <= 3.5\n", " or c[\"hours\"] != 9999\n", " or (\n", " (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n", " and (\n", " c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n", " or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n", " )\n", " )\n", " ):\n", " return \"normal\"\n", " else:\n", " return \"curtailed\"\n", "\n", " dfx[\"curtailment\"] = dfx.apply(f2, axis=1)\n", "\n", " # filter unusual readings, i.e., for normal operation, power <= 0 in\n", " # operating wind speeds, power > 100 before cut-in, runtime < 600\n", " def f3(c):\n", " if c[\"hours\"] == 9999 and (\n", " (\n", " 3 < c[\"ws_av\"] < 25\n", " and (\n", " c[\"ap_av\"] <= 0\n", " or c[\"runtime\"] < 600\n", " or c[\"EnvironmentalCategory_id\"] > 1\n", " or c[\"GridCategory_id\"] > 1\n", " or c[\"InfrastructureCategory_id\"] > 1\n", " or c[\"AvailabilityCategory_id\"] == 2\n", " or 12 <= c[\"TurbineCategory_id\"] <= 15\n", " or 21 <= c[\"TurbineCategory_id\"] <= 22\n", " )\n", " )\n", " or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n", " ):\n", " # remove unusual readings, i.e., zero power at operating wind\n", " # speeds, power > 0 before cut-in ...\n", " return \"unusual\"\n", " else:\n", " return \"normal\"\n", "\n", " dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n", "\n", " def f4(c):\n", " if 1 <= c[\"hours\"] <= 6:\n", " return 6\n", " elif 7 <= c[\"hours\"] <= 12:\n", " return 12\n", " elif 13 <= c[\"hours\"] <= 18:\n", " return 18\n", " elif 19 <= c[\"hours\"] <= 24:\n", " return 24\n", " elif 25 <= c[\"hours\"] <= 30:\n", " return 30\n", " elif 31 <= c[\"hours\"] <= 36:\n", " return 36\n", " elif 37 <= c[\"hours\"] <= 42:\n", " return 42\n", " elif 43 <= c[\"hours\"] <= 48:\n", " return 48\n", " else:\n", " return c[\"hours\"]\n", "\n", " dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n", "\n", " # filter data\n", " # normal w/o curtailment\n", " df3 = dfx[dfx.curtailment == \"normal\"]\n", " # normal w/o curtailment and unusual readings\n", " df3 = df3[df3.unusual == \"normal\"]\n", "\n", " df4 = df3[\n", " [\n", " \"ap_av\",\n", " \"ws_av\",\n", " \"wd_av\",\n", " \"pitch\",\n", " \"ap_max\",\n", " \"ap_dev\",\n", " \"reactive_power\",\n", " \"rs_av\",\n", " \"gen_sp\",\n", " \"nac_pos\",\n", " \"hours6\",\n", " ]\n", " ].copy()\n", " df4 = df4.dropna()\n", "\n", " # splitting data set\n", " features = [\n", " \"ap_av\",\n", " \"ws_av\",\n", " \"wd_av\",\n", " \"pitch\",\n", " \"ap_max\",\n", " \"ap_dev\",\n", " \"reactive_power\",\n", " \"rs_av\",\n", " \"gen_sp\",\n", " \"nac_pos\",\n", " ]\n", " X = df4[features]\n", " Y = df4[\"hours6\"]\n", " Xn = preprocessing.normalize(X)\n", " validation_size = 0.20\n", " seed = 7\n", " X_train, X_validation, Y_train, Y_validation = (\n", " model_selection.train_test_split(\n", " Xn, Y, test_size=validation_size, random_state=seed\n", " )\n", " )\n", "\n", " # fit using gini criterion\n", " clf = DecisionTreeClassifier(class_weight=\"balanced\")\n", " clf = clf.fit(X_train, Y_train)\n", "\n", " predictions = clf.predict(X_validation)\n", " print(accuracy_score(Y_validation, predictions))\n", " print(confusion_matrix(Y_validation, predictions))\n", " print(classification_report(Y_validation, predictions))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([9999, 9999, 9999, ..., 9999, 12, 9999], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# after being fitted, the model can be used to predict the class of samples\n", "clf.predict(X_validation)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0. , 0. , 0. , ..., 0. , 0. , 1. ],\n", " [ 0. , 0. , 0.1, ..., 0. , 0. , 0.9],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 1. ],\n", " ..., \n", " [ 0. , 0. , 0. , ..., 0. , 0. , 1. ],\n", " [ 0.3, 0.3, 0.4, ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 1. ]])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternatively, the probability of each class can be predicted, which is the\n", "# fraction of training samples of the same class in a leaf\n", "clf.predict_proba(X_validation)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# using entropy\n", "clf1 = DecisionTreeClassifier(criterion=\"entropy\", class_weight=\"balanced\")\n", "clf1 = clf1.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.858083480306\n", "[[ 342 4 9 6 3 1 0 3 2 42]\n", " [ 9 67 20 5 4 2 5 3 3 176]\n", " [ 3 8 50 8 7 5 2 2 5 141]\n", " [ 9 3 11 54 7 5 2 2 1 102]\n", " [ 0 1 7 1 33 7 3 0 2 99]\n", " [ 2 2 2 5 2 42 5 0 1 97]\n", " [ 3 3 2 2 2 3 33 5 1 76]\n", " [ 1 1 0 2 1 1 2 34 12 93]\n", " [ 6 3 1 0 5 2 5 2 37 81]\n", " [ 31 172 499 104 94 91 92 86 74 13904]]\n", " precision recall f1-score support\n", "\n", " 0 0.84 0.83 0.84 412\n", " 6 0.25 0.23 0.24 294\n", " 12 0.08 0.22 0.12 231\n", " 18 0.29 0.28 0.28 196\n", " 24 0.21 0.22 0.21 153\n", " 30 0.26 0.27 0.26 158\n", " 36 0.22 0.25 0.24 130\n", " 42 0.25 0.23 0.24 147\n", " 48 0.27 0.26 0.26 142\n", " 9999 0.94 0.92 0.93 15147\n", "\n", "avg / total 0.88 0.86 0.87 17010\n", "\n" ] } ], "source": [ "predictions1 = clf1.predict(X_validation)\n", "print(accuracy_score(Y_validation, predictions1))\n", "print(confusion_matrix(Y_validation, predictions1))\n", "print(classification_report(Y_validation, predictions1))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# fault\n", "df4[\"fault\"] = df4[\"hours6\"].astype(str)\n", "\n", "\n", "def fx(c):\n", " if c[\"fault\"] == \"9999\":\n", " return \"normal\"\n", " elif c[\"fault\"] == \"0\":\n", " return \"faulty\"\n", " else:\n", " return \"up to \" + c[\"fault\"] + \" hr(s) before fault\"\n", "\n", "\n", "df4[\"fault\"] = df4.apply(fx, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# visualising\n", "\n", "\n", "def visualise_tree(tree, feature_names):\n", " \"\"\"Create tree svg using graphviz.\n", "\n", " Args\n", " ----\n", " tree -- scikit-learn DecisionTree.\n", " feature_names -- list of feature names.\n", " \"\"\"\n", " with open(\"df.dot\", \"w\") as f:\n", " export_graphviz(\n", " tree,\n", " out_file=f,\n", " filled=True,\n", " rounded=True,\n", " feature_names=feature_names,\n", " special_characters=True,\n", " class_names=list(df4[\"fault\"].unique()),\n", " )\n", "\n", " command = [\"dot\", \"-Tpng\", \"df.dot\", \"-o\", \"df.png\"]\n", " try:\n", " subprocess.check_call(command)\n", " except:\n", " exit(\"Could not run dot, i.e., graphviz, to produce visualisation\")\n", "\n", "\n", "visualise_tree(clf1, features)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }