# %% [markdown]
# # k-NN: finding optimal weight function ('distance' or 'uniform')

# %%
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier

# %%
# import data
df = pd.read_csv("data/SCADA_downtime_merged.csv", skip_blank_lines=True)

# %%
# turbines to evaluate, in ascending order
list1 = sorted(df["turbine_id"].unique(), key=int)

# turbine fault categories to label, in ascending order:
# `g >= 0` is False for NaN, so missing categories are dropped here
list2 = [g for g in df["TurbineCategory_id"].unique() if g >= 0]
list2 = sorted(list2, key=int)
# categories that are not used as fault classes
list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]

# %%
# configuration shared by the labelling and classification steps

# counter value given to the newest row when it is not a fault, i.e. no
# later fault is known
NO_FAULT_MINS = 999999999
# amount added to the counter per row
# NOTE(review): presumably the SCADA sampling interval is 10 minutes - confirm
ROW_MINUTES = 10
# rows further than this many hours from the next fault are labelled normal
HORIZON_HOURS = 48
# width of the classes that the hours-to-fault values are rounded up to
BIN_HOURS = 6
# label for normal operation
NORMAL_LABEL = 999
# label for curtailed or unusual readings
EXCLUDED_LABEL = 9999

# feature columns used for classification
features = [
    "ap_av",
    "ws_av",
    "wd_av",
    "pitch",
    "ap_max",
    "ap_dev",
    "reactive_power",
    "rs_av",
    "gen_sp",
    "nac_pos",
]
# weight functions to compare
weights = ["uniform", "distance"]


# %%
def minutes_until_fault(is_fault):
    """Count, for each row, the minutes accumulated since the last fault row.

    Parameters
    ----------
    is_fault : sequence of bool
        Fault flags ordered newest-first (descending timestamp), so that
        "the previous row" is the next reading in time.

    Returns
    -------
    numpy.ndarray of int64
        0 for fault rows; otherwise the previous row's value plus
        ``ROW_MINUTES``. A non-fault first row starts at ``NO_FAULT_MINS``.
    """
    mins = []
    previous = None
    for fault in is_fault:
        if fault:
            current = 0
        elif previous is None:
            current = NO_FAULT_MINS
        else:
            current = previous + ROW_MINUTES
        mins.append(current)
        previous = current
    return np.array(mins, dtype=np.int64)


def fault_horizon_labels(frame, mins):
    """Turn the minutes counter into class labels for one fault category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Readings of a single turbine. Reads the columns ``pitch``, ``ap_av``,
        ``ws_av``, ``runtime``, ``TurbineCategory_id``,
        ``EnvironmentalCategory_id``, ``GridCategory_id``,
        ``InfrastructureCategory_id`` and ``AvailabilityCategory_id``.
    mins : numpy.ndarray of int64
        Minutes counter aligned row-by-row with ``frame``.

    Returns
    -------
    pandas.Series of int64, indexed like ``frame``
        0 for fault rows, 6, 12, ..., 48 for rows within ``HORIZON_HOURS`` of
        a fault, ``NORMAL_LABEL`` beyond that, and ``EXCLUDED_LABEL`` for
        normal rows that are curtailed or unusual.
    """
    # convert to hours and round to the nearest hour
    hours = pd.Series(np.round(mins / 60.0), index=frame.index).astype(np.int64)
    # > 48 hours - label as normal (999)
    hours = hours.where(hours <= HORIZON_HOURS, NORMAL_LABEL)
    is_normal = hours == NORMAL_LABEL

    pitch = frame["pitch"]
    power = frame["ap_av"]
    wind = frame["ws_av"]
    category = frame["TurbineCategory_id"]
    # hoisted: the maximum is the same for every row of the turbine
    power_max = power.max()

    # curtailment - a normal row is curtailed when the pitch is outside
    # 0deg <= pitch <= 3.5deg, unless the power is within the lowest or highest
    # 10 % of the turbine's maximum. Comparisons with NaN are False, so a
    # normal row with a missing pitch counts as curtailed.
    pitch_in_band = (pitch >= 0) & (pitch <= 3.5)
    pitch_out_of_band = (pitch > 3.5) | (pitch < 0)
    power_at_extreme = (power <= 0.1 * power_max) | (power >= 0.9 * power_max)
    curtailed = is_normal & ~pitch_in_band & ~(pitch_out_of_band & power_at_extreme)

    # unusual readings - for normal operation: power <= 0 at operating wind
    # speeds, runtime < 600, other downtime categories, or power > 100 below
    # the cut-in wind speed
    operating_wind = (wind > 3) & (wind < 25)
    downtime = (
        (power <= 0)
        | (frame["runtime"] < 600)
        | (frame["EnvironmentalCategory_id"] > 1)
        | (frame["GridCategory_id"] > 1)
        | (frame["InfrastructureCategory_id"] > 1)
        | (frame["AvailabilityCategory_id"] == 2)
        | category.between(12, 15)
        | category.between(21, 22)
    )
    unusual = is_normal & (
        (operating_wind & downtime) | ((wind < 3) & (power > 100))
    )

    # round 1..48 up to the next multiple of 6; 0 and 999 are left unchanged
    within_horizon = hours.between(1, HORIZON_HOURS)
    rounded_up = ((hours + BIN_HOURS - 1) // BIN_HOURS) * BIN_HOURS
    hours6 = hours.where(~within_horizon, rounded_up)

    # change label for unusual and curtailed data (9999)
    return hours6.where(~(unusual | curtailed), EXCLUDED_LABEL)


def time_series_cv_error(X, Y, weight, n_splits=5):
    """Average misclassification rate of k-NN over time-series CV folds.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per reading, in time order.
    Y : numpy.ndarray
        Label matrix with one column per fault category.
    weight : str
        Value passed to ``KNeighborsClassifier(weights=...)``.
    n_splits : int, default 5
        Number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    float
        One minus the mean fold accuracy, where accuracy is the fraction of
        individual label entries (all columns pooled) predicted correctly.
    """
    knn = KNeighborsClassifier(weights=weight, n_jobs=-1)
    fold_scores = []
    for train_index, test_index in TimeSeriesSplit(n_splits=n_splits).split(X):
        knn.fit(X[train_index], Y[train_index])
        pred = knn.predict(X[test_index])
        Y_test = Y[test_index]
        fold_scores.append(np.sum(np.equal(Y_test, pred)) / Y_test.size)
    return 1 - sum(fold_scores) / len(fold_scores)


# %%
# Reset the result lists in the same cell that fills them, so re-running this
# cell does not append a second set of results.
# optimal weight function for each turbine
num = []
# minimum misclassification error for each turbine
err = []

for x in list1:
    # filter only data for turbine x
    dfx = df[df["turbine_id"] == x].copy()

    # add one label column per fault category y
    for y in list2:
        # newest reading first, so the counter is built from the latest row
        # back through time
        dfx = dfx.sort_values(by="timestamp", ascending=False)
        dfx = dfx.reset_index(drop=True)
        dfx["mins"] = minutes_until_fault((dfx["TurbineCategory_id"] == y).values)

        # back to chronological order; "mins" travels with its rows
        dfx = dfx.sort_values(by="timestamp")
        dfx = dfx.reset_index(drop=True)
        dfx["hours_%s" % y] = fault_horizon_labels(dfx, dfx["mins"].values)

        # drop the helper column
        dfx = dfx.drop("mins", axis=1)

    # separate features from classes for classification
    classes = [col for col in dfx.columns if "hours" in col]
    # columns to copy into the new frame; rows with NaNs are dropped
    list3 = features + classes + ["timestamp"]
    df2 = dfx[list3].copy().dropna()

    # NOTE: preprocessing.normalize rescales each sample (row) to unit norm;
    # it is not a per-feature scaling to the range 0..1.
    X = preprocessing.normalize(df2[features])
    # convert from pd dataframe to np array (`.values` replaces the removed
    # `DataFrame.as_matrix()`)
    Y = df2[classes].values

    # misclassification error for each weight function
    errors = [time_series_cv_error(X, Y, w) for w in weights]
    # best weight function; `index` keeps the first entry on ties
    best = min(errors)
    num.append(weights[errors.index(best)])
    err.append(best)

# %%
# summary: best weight function and its error for each turbine
d = pd.DataFrame(num, columns=["weights"])
d["error"] = err
d["turbine"] = list1
\n", " | weights | \n", "error | \n", "turbine | \n", "
---|---|---|---|
0 | \n", "distance | \n", "0.208227 | \n", "1 | \n", "
1 | \n", "distance | \n", "0.134103 | \n", "2 | \n", "
2 | \n", "distance | \n", "0.108901 | \n", "3 | \n", "
3 | \n", "distance | \n", "0.125060 | \n", "4 | \n", "
4 | \n", "distance | \n", "0.093205 | \n", "5 | \n", "
5 | \n", "distance | \n", "0.116663 | \n", "6 | \n", "
6 | \n", "distance | \n", "0.215100 | \n", "7 | \n", "
7 | \n", "distance | \n", "0.145820 | \n", "8 | \n", "
8 | \n", "distance | \n", "0.120158 | \n", "9 | \n", "
9 | \n", "distance | \n", "0.117416 | \n", "10 | \n", "
10 | \n", "distance | \n", "0.132436 | \n", "11 | \n", "
11 | \n", "distance | \n", "0.138280 | \n", "12 | \n", "
12 | \n", "distance | \n", "0.142595 | \n", "13 | \n", "
13 | \n", "distance | \n", "0.074375 | \n", "14 | \n", "
14 | \n", "uniform | \n", "0.181361 | \n", "15 | \n", "
15 | \n", "distance | \n", "0.158894 | \n", "16 | \n", "
16 | \n", "distance | \n", "0.149808 | \n", "17 | \n", "
17 | \n", "distance | \n", "0.113547 | \n", "18 | \n", "
18 | \n", "distance | \n", "0.086192 | \n", "19 | \n", "
19 | \n", "distance | \n", "0.148732 | \n", "20 | \n", "
20 | \n", "distance | \n", "0.076828 | \n", "21 | \n", "
21 | \n", "distance | \n", "0.075347 | \n", "22 | \n", "
22 | \n", "distance | \n", "0.090167 | \n", "23 | \n", "
23 | \n", "distance | \n", "0.160592 | \n", "24 | \n", "
24 | \n", "distance | \n", "0.077421 | \n", "25 | \n", "