{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Random forest classifier results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn import model_selection, preprocessing\n", "from sklearn.metrics import (\n", " accuracy_score,\n", " classification_report,\n", " confusion_matrix,\n", ")\n", "from sklearn.tree import RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import data\n", "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# list of turbines to plot\n", "list1 = [1]\n", "# list1 = list(df['turbine_id'].unique())\n", "# sort turbines in ascending order\n", "# list1 = sorted(list1, key=int)\n", "# list of categories to plot\n", "list2 = [11]\n", "# list2 = list(df1['TurbineCategory_id'].unique())\n", "# sort categories in ascending order\n", "# list2 = sorted(list2, key=int)\n", "# categories to remove from plot\n", "# list2 = [e for e in list2 if e not in (1, 12, 13, 15, 21, 22)]\n", "list3 = list(itertools.product(list1, list2))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.899764844209\n", "[[ 358 3 5 0 2 0 0 1 1 42]\n", " [ 7 45 13 1 0 0 0 0 0 228]\n", " [ 5 9 46 4 0 0 0 0 1 166]\n", " [ 8 4 9 36 3 0 1 2 1 132]\n", " [ 0 1 4 2 19 1 0 0 1 125]\n", " [ 2 1 0 0 2 31 2 1 1 118]\n", " [ 3 0 0 1 0 3 22 3 0 98]\n", " [ 2 0 1 2 0 1 0 16 4 121]\n", " [ 7 1 0 1 1 1 1 5 14 111]\n", " [ 5 9 387 6 2 1 4 4 11 14718]]\n", " precision recall f1-score support\n", "\n", " 0 0.90 0.87 0.89 412\n", " 6 0.62 0.15 0.25 294\n", " 12 0.10 0.20 0.13 231\n", " 18 0.68 0.18 0.29 196\n", " 24 0.66 0.12 0.21 153\n", " 30 0.82 0.20 0.32 158\n", " 36 0.73 0.17 0.28 130\n", " 42 0.50 0.11 0.18 147\n", " 48 0.41 0.10 0.16 142\n", " 9999 0.93 0.97 0.95 15147\n", "\n", "avg / total 0.89 0.90 0.89 17010\n", "\n" ] } ], "source": [ "for x, y in list3:\n", " # filter only data for turbine x\n", " dfx = df[(df[\"turbine_id\"] == x)].copy()\n", "\n", " # sort values by timestamp in descending order\n", " dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n", "\n", " # copying fault to new column (mins) (fault when turbine category id is y)\n", " def f(c):\n", " if c[\"TurbineCategory_id\"] == y:\n", " return 0\n", " else:\n", " return 1\n", "\n", " dfx[\"mins\"] = dfx.apply(f, axis=1)\n", "\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", "\n", " # assigning value to first cell if it's not 0\n", " if dfx.loc[0, \"mins\"] == 0:\n", " dfx.set_value(0, \"mins\", 0)\n", " else:\n", " dfx.set_value(0, \"mins\", 999999999)\n", "\n", " # using previous value's row to evaluate time\n", " for i, e in enumerate(dfx[\"mins\"]):\n", " if e == 1:\n", " dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n", "\n", " # sort in ascending order\n", " dfx = dfx.sort_values(by=\"timestamp\")\n", "\n", " # reset index\n", " dfx.reset_index(drop=True, inplace=True)\n", "\n", " # convert to hours and round to nearest hour\n", " dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n", " dfx[\"hours\"] = dfx[\"hours\"] / 60\n", " dfx[\"hours\"] = round(dfx[\"hours\"])\n", " dfx[\"hours\"] = dfx[\"hours\"].astype(np.int64)\n", "\n", " # > 48 hours - label as normal (9999)\n", " def f1(c):\n", " if c[\"hours\"] > 48:\n", " return 9999\n", " else:\n", " return c[\"hours\"]\n", "\n", " dfx[\"hours\"] = dfx.apply(f1, axis=1)\n", "\n", " # filter out curtailment - curtailed when turbine is pitching outside\n", " # 0deg <= normal <= 3.5deg\n", " def f2(c):\n", " if (\n", " 0 <= c[\"pitch\"] <= 3.5\n", " or c[\"hours\"] != 9999\n", " or (\n", " (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n", " and (\n", " c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n", " or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n", " )\n", " )\n", " ):\n", " return \"normal\"\n", " else:\n", " return \"curtailed\"\n", "\n", " dfx[\"curtailment\"] = dfx.apply(f2, axis=1)\n", "\n", " # filter unusual readings, i.e., for normal operation, power <= 0 in\n", " # operating wind speeds, power > 100 before cut-in, runtime < 600\n", " def f3(c):\n", " if c[\"hours\"] == 9999 and (\n", " (\n", " 3 < c[\"ws_av\"] < 25\n", " and (\n", " c[\"ap_av\"] <= 0\n", " or c[\"runtime\"] < 600\n", " or c[\"EnvironmentalCategory_id\"] > 1\n", " or c[\"GridCategory_id\"] > 1\n", " or c[\"InfrastructureCategory_id\"] > 1\n", " or c[\"AvailabilityCategory_id\"] == 2\n", " or 12 <= c[\"TurbineCategory_id\"] <= 15\n", " or 21 <= c[\"TurbineCategory_id\"] <= 22\n", " )\n", " )\n", " or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n", " ):\n", " # remove unusual readings, i.e., zero power at operating wind\n", " # speeds, power > 0 before cut-in ...\n", " return \"unusual\"\n", " else:\n", " return \"normal\"\n", "\n", " dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n", "\n", " def f4(c):\n", " if 1 <= c[\"hours\"] <= 6:\n", " return 6\n", " elif 7 <= c[\"hours\"] <= 12:\n", " return 12\n", " elif 13 <= c[\"hours\"] <= 18:\n", " return 18\n", " elif 19 <= c[\"hours\"] <= 24:\n", " return 24\n", " elif 25 <= c[\"hours\"] <= 30:\n", " return 30\n", " elif 31 <= c[\"hours\"] <= 36:\n", " return 36\n", " elif 37 <= c[\"hours\"] <= 42:\n", " return 42\n", " elif 43 <= c[\"hours\"] <= 48:\n", " return 48\n", " else:\n", " return c[\"hours\"]\n", "\n", " dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n", "\n", " # filter data\n", " # normal w/o curtailment\n", " df3 = dfx[dfx.curtailment == \"normal\"]\n", " # normal w/o curtailment and unusual readings\n", " df3 = df3[df3.unusual == \"normal\"]\n", "\n", " df4 = df3[\n", " [\n", " \"ap_av\",\n", " \"ws_av\",\n", " \"wd_av\",\n", " \"pitch\",\n", " \"ap_max\",\n", " \"ap_dev\",\n", " \"reactive_power\",\n", " \"rs_av\",\n", " \"gen_sp\",\n", " \"nac_pos\",\n", " \"hours6\",\n", " ]\n", " ].copy()\n", " df4 = df4.dropna()\n", "\n", " # splitting data set\n", " features = [\n", " \"ap_av\",\n", " \"ws_av\",\n", " \"wd_av\",\n", " \"pitch\",\n", " \"ap_max\",\n", " \"ap_dev\",\n", " \"reactive_power\",\n", " \"rs_av\",\n", " \"gen_sp\",\n", " \"nac_pos\",\n", " ]\n", " X = df4[features]\n", " Y = df4[\"hours6\"]\n", " Xn = preprocessing.normalize(X)\n", " validation_size = 0.20\n", " seed = 7\n", " X_train, X_validation, Y_train, Y_validation = (\n", " model_selection.train_test_split(\n", " Xn, Y, test_size=validation_size, random_state=seed\n", " )\n", " )\n", "\n", " # fit using gini criterion\n", " clf = RandomForestClassifier(class_weight=\"balanced\")\n", " clf = clf.fit(X_train, Y_train)\n", "\n", " predictions = clf.predict(X_validation)\n", " print(accuracy_score(Y_validation, predictions))\n", " print(confusion_matrix(Y_validation, predictions))\n", " print(classification_report(Y_validation, predictions))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }