{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Decision trees: finding optimal criterion ('gini' or 'entropy')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import TimeSeriesSplit\n", "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import data\n", "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# list of turbines to plot\n", "list1 = list(df[\"turbine_id\"].unique())\n", "# sort turbines in ascending order\n", "list1 = sorted(list1, key=int)\n", "# list of categories\n", "list2 = list(df[\"TurbineCategory_id\"].unique())\n", "# remove NaN from list\n", "list2 = [g for g in list2 if g >= 0]\n", "# sort categories in ascending order\n", "list2 = sorted(list2, key=int)\n", "# categories to remove\n", "list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]\n", "# empty list to hold optimal n values for all turbines\n", "num = []\n", "# empty list to hold minimum error readings for all turbines\n", "err = []" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# filter only data for turbine x\n", "for x in list1:\n", " dfx = df[(df[\"turbine_id\"] == x)].copy()\n", " # copying fault to new column (mins) (fault when turbine category id is y)\n", " for y in list2:\n", "\n", " def f(c):\n", " if c[\"TurbineCategory_id\"] == y:\n", " return 0\n", " else:\n", " return 1\n", "\n", " dfx[\"mins\"] = dfx.apply(f, axis=1)\n", "\n", " # sort values by timestamp in descending order\n", " dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", "\n", " # assigning value to first cell if it's not 0 with a large number\n", " if dfx.loc[0, \"mins\"] == 0:\n", " dfx.set_value(0, \"mins\", 0)\n", " else:\n", " # to allow the following loop to work\n", " dfx.set_value(0, \"mins\", 999999999)\n", "\n", " # using previous value's row to evaluate time\n", " for i, e in enumerate(dfx[\"mins\"]):\n", " if e == 1:\n", " dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n", "\n", " # sort in ascending order\n", " dfx = dfx.sort_values(by=\"timestamp\")\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", " # convert to hours, then round to nearest hour\n", " dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n", " dfx[\"hours\"] = dfx[\"hours\"] / 60\n", " # round to integer\n", " dfx[\"hours\"] = round(dfx[\"hours\"]).astype(np.int64)\n", "\n", " # > 48 hours - label as normal (999)\n", " def f1(c):\n", " if c[\"hours\"] > 48:\n", " return 999\n", " else:\n", " return c[\"hours\"]\n", "\n", " dfx[\"hours\"] = dfx.apply(f1, axis=1)\n", "\n", " # filter out curtailment - curtailed when turbine is pitching outside\n", " # 0deg <= normal <= 3.5deg\n", " def f2(c):\n", " if (\n", " 0 <= c[\"pitch\"] <= 3.5\n", " or c[\"hours\"] != 999\n", " or (\n", " (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n", " and (\n", " c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n", " or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n", " )\n", " )\n", " ):\n", " return \"normal\"\n", " else:\n", " return \"curtailed\"\n", "\n", " dfx[\"curtailment\"] = dfx.apply(f2, 
"\n", "        def f3(c):\n", "            # filter unusual readings: while labelled normal, power <= 0 at\n", "            # operating wind speeds, power > 100 before cut-in, runtime < 600,\n", "            # or other downtime categories\n", "            if c[\"hours\"] == 999 and (\n", "                (\n", "                    3 < c[\"ws_av\"] < 25\n", "                    and (\n", "                        c[\"ap_av\"] <= 0\n", "                        or c[\"runtime\"] < 600\n", "                        or c[\"EnvironmentalCategory_id\"] > 1\n", "                        or c[\"GridCategory_id\"] > 1\n", "                        or c[\"InfrastructureCategory_id\"] > 1\n", "                        or c[\"AvailabilityCategory_id\"] == 2\n", "                        or 12 <= c[\"TurbineCategory_id\"] <= 15\n", "                        or 21 <= c[\"TurbineCategory_id\"] <= 22\n", "                    )\n", "                )\n", "                or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n", "            ):\n", "                return \"unusual\"\n", "            else:\n", "                return \"normal\"\n", "\n", "        dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n", "\n",
"        # round to 6 hour intervals to reduce the number of classes\n", "        def f4(c):\n", "            if 1 <= c[\"hours\"] <= 6:\n", "                return 6\n", "            elif 7 <= c[\"hours\"] <= 12:\n", "                return 12\n", "            elif 13 <= c[\"hours\"] <= 18:\n", "                return 18\n", "            elif 19 <= c[\"hours\"] <= 24:\n", "                return 24\n", "            elif 25 <= c[\"hours\"] <= 30:\n", "                return 30\n", "            elif 31 <= c[\"hours\"] <= 36:\n", "                return 36\n", "            elif 37 <= c[\"hours\"] <= 42:\n", "                return 42\n", "            elif 43 <= c[\"hours\"] <= 48:\n", "                return 48\n", "            else:\n", "                return c[\"hours\"]\n", "\n", "        dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n", "\n",
"        # change the label for unusual and curtailed data (9999), if originally\n", "        # labelled as normal\n", "        def f5(c):\n", "            if c[\"unusual\"] == \"unusual\" or c[\"curtailment\"] == \"curtailed\":\n", "                return 9999\n", "            else:\n", "                return c[\"hours6\"]\n", "\n", "        # apply to a new column specific to each fault\n", "        dfx[\"hours_%s\" % y] = dfx.apply(f5, axis=1)\n", "\n", "        # drop unnecessary columns\n", "        dfx = dfx.drop(\"hours6\", axis=1)\n", "        dfx = dfx.drop(\"hours\", axis=1)\n", "        dfx = dfx.drop(\"mins\", axis=1)\n", "        dfx = dfx.drop(\"curtailment\", axis=1)\n", "        dfx = dfx.drop(\"unusual\", axis=1)\n", "\n",
"    features = [\n", "        \"ap_av\",\n", "        \"ws_av\",\n", "        \"wd_av\",\n", "        \"pitch\",\n", "        \"ap_max\",\n", "        \"ap_dev\",\n", "        \"reactive_power\",\n", "        \"rs_av\",\n", "        \"gen_sp\",\n", "        \"nac_pos\",\n", "    ]\n", "    # separate features from classes for classification\n", "    classes = [col for col in dfx.columns if \"hours\" in col]\n", "    # list of columns to copy into a new dataframe\n", "    list3 = features + classes + [\"timestamp\"]\n", "    df2 = dfx[list3].copy()\n", "    # drop NaNs\n", "    df2 = df2.dropna()\n", "    X = df2[features]\n", "    # normalise each row of features to unit norm\n", "    X = preprocessing.normalize(X)\n", "    Y = df2[classes]\n", "    # convert from pandas dataframe to numpy array\n", "    Y = Y.values\n", "\n",
"    criterion = [\"gini\", \"entropy\"]\n", "    scores = []\n", "    # cross validation using time series split\n", "    tscv = TimeSeriesSplit(n_splits=5)\n", "\n", "    # loop over each criterion c and define the classifier\n", "    for c in criterion:\n", "        dt = DecisionTreeClassifier(criterion=c)\n", "        # empty list to hold the score for each cross validation fold\n", "        p1 = []\n", "        # loop over each cross validation fold\n", "        for train_index, test_index in tscv.split(X):\n", "            # split train and test sets\n", "            X_train, X_test = X[train_index], X[test_index]\n", "            Y_train, Y_test = Y[train_index], Y[test_index]\n", "            # fit the classifier and predict\n", "            dt1 = dt.fit(X_train, Y_train)\n", "            pred = dt1.predict(X_test)\n", "            # accuracy score\n", "            p2 = np.sum(np.equal(Y_test, pred)) / Y_test.size\n", "            # add to list\n",
" p1.append(p2)\n", " # average score across all cross validation folds\n", " p = sum(p1) / len(p1)\n", " scores.append(p)\n", " # changing to misclassification error\n", " MSE = [1 - x for x in scores]\n", " # determining best n\n", " optimal = criterion[MSE.index(min(MSE))]\n", " num.append(optimal)\n", " err.append(min(MSE))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "d = pd.DataFrame(num, columns=[\"criterion\"])\n", "d[\"error\"] = err\n", "d[\"turbine\"] = list1" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
criterionerrorturbine
0entropy0.1941391
1entropy0.1776592
2entropy0.1072073
3entropy0.1154624
4entropy0.1368905
5entropy0.3578536
6entropy0.1572897
7entropy0.3166108
8entropy0.1047689
9gini0.17135910
10entropy0.20380511
11entropy0.31759212
12entropy0.11759613
13entropy0.80117014
14entropy0.16035015
15entropy0.15496616
16entropy0.15035917
17entropy0.69844818
18entropy0.11436219
19entropy0.50667120
20entropy0.54886521
21entropy0.11971522
22entropy0.40199223
23entropy0.39735824
24entropy0.61345825
\n", "
" ], "text/plain": [ " criterion error turbine\n", "0 entropy 0.194139 1\n", "1 entropy 0.177659 2\n", "2 entropy 0.107207 3\n", "3 entropy 0.115462 4\n", "4 entropy 0.136890 5\n", "5 entropy 0.357853 6\n", "6 entropy 0.157289 7\n", "7 entropy 0.316610 8\n", "8 entropy 0.104768 9\n", "9 gini 0.171359 10\n", "10 entropy 0.203805 11\n", "11 entropy 0.317592 12\n", "12 entropy 0.117596 13\n", "13 entropy 0.801170 14\n", "14 entropy 0.160350 15\n", "15 entropy 0.154966 16\n", "16 entropy 0.150359 17\n", "17 entropy 0.698448 18\n", "18 entropy 0.114362 19\n", "19 entropy 0.506671 20\n", "20 entropy 0.548865 21\n", "21 entropy 0.119715 22\n", "22 entropy 0.401992 23\n", "23 entropy 0.397358 24\n", "24 entropy 0.613458 25" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }