{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# k-NN: finding optimal weight function ('distance' or 'uniform')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data\n",
    "df = pd.read_csv(\"data/SCADA_downtime_merged.csv\", skip_blank_lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of turbines to plot\n",
    "list1 = list(df[\"turbine_id\"].unique())\n",
    "# sort turbines in ascending order\n",
    "list1 = sorted(list1, key=int)\n",
    "# list of categories\n",
    "list2 = list(df[\"TurbineCategory_id\"].unique())\n",
    "# remove NaN from list\n",
    "list2 = [g for g in list2 if g >= 0]\n",
    "# sort categories in ascending order\n",
    "list2 = sorted(list2, key=int)\n",
    "# categories to remove\n",
    "list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]\n",
    "# empty list to hold optimal n values for all turbines\n",
    "num = []\n",
    "# empty list to hold minimum error readings for all turbines\n",
    "err = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# filter only data for turbine x\n",
    "for x in list1:\n",
    "    dfx = df[(df[\"turbine_id\"] == x)].copy()\n",
    "    # copying fault to new column (mins) (fault when turbine category id is y)\n",
    "    for y in list2:\n",
    "\n",
    "        def f(c):\n",
    "            if c[\"TurbineCategory_id\"] == y:\n",
    "                return 0\n",
    "            else:\n",
    "                return 1\n",
    "\n",
    "        dfx[\"mins\"] = dfx.apply(f, axis=1)\n",
    "\n",
    "        # sort values by timestamp in descending order\n",
    "        dfx = dfx.sort_values(by=\"timestamp\", ascending=False)\n",
    "        # reset index\n",
    "        dfx.reset_index(drop=True, inplace=True)\n",
    "\n",
    "        # assigning value to first cell if it's not 0\n",
    "        if dfx.loc[0, \"mins\"] == 0:\n",
    "            dfx.set_value(0, \"mins\", 0)\n",
    "        else:\n",
    "            dfx.set_value(0, \"mins\", 999999999)\n",
    "\n",
    "        # using previous value's row to evaluate time\n",
    "        for i, e in enumerate(dfx[\"mins\"]):\n",
    "            if e == 1:\n",
    "                dfx.at[i, \"mins\"] = dfx.at[i - 1, \"mins\"] + 10\n",
    "\n",
    "        # sort in ascending order\n",
    "        dfx = dfx.sort_values(by=\"timestamp\")\n",
    "        # reset index\n",
    "        dfx.reset_index(drop=True, inplace=True)\n",
    "        # convert to hours, then round to nearest hour\n",
    "        dfx[\"hours\"] = dfx[\"mins\"].astype(np.int64)\n",
    "        dfx[\"hours\"] = dfx[\"hours\"] / 60\n",
    "        dfx[\"hours\"] = round(dfx[\"hours\"]).astype(np.int64)\n",
    "\n",
    "        # > 48 hours - label as normal (999)\n",
    "        def f1(c):\n",
    "            if c[\"hours\"] > 48:\n",
    "                return 999\n",
    "            else:\n",
    "                return c[\"hours\"]\n",
    "\n",
    "        dfx[\"hours\"] = dfx.apply(f1, axis=1)\n",
    "\n",
    "        # filter out curtailment - curtailed when turbine is pitching outside\n",
    "        # 0deg <= normal <= 3.5deg\n",
    "        def f2(c):\n",
    "            if (\n",
    "                0 <= c[\"pitch\"] <= 3.5\n",
    "                or c[\"hours\"] != 999\n",
    "                or (\n",
    "                    (c[\"pitch\"] > 3.5 or c[\"pitch\"] < 0)\n",
    "                    and (\n",
    "                        c[\"ap_av\"] <= (0.1 * dfx[\"ap_av\"].max())\n",
    "                        or c[\"ap_av\"] >= (0.9 * dfx[\"ap_av\"].max())\n",
    "                    )\n",
    "                )\n",
    "            ):\n",
    "                return \"normal\"\n",
    "            else:\n",
    "                return \"curtailed\"\n",
    "\n",
    "        dfx[\"curtailment\"] = dfx.apply(f2, axis=1)\n",
    "\n",
    "        # filter unusual readings, i.e., for normal operation, power <= 0 in\n",
    "        # operating wind speeds, power > 100 before cut-in, runtime < 600 and\n",
    "        # other downtime categories\n",
    "        def f3(c):\n",
    "            if c[\"hours\"] == 999 and (\n",
    "                (\n",
    "                    3 < c[\"ws_av\"] < 25\n",
    "                    and (\n",
    "                        c[\"ap_av\"] <= 0\n",
    "                        or c[\"runtime\"] < 600\n",
    "                        or c[\"EnvironmentalCategory_id\"] > 1\n",
    "                        or c[\"GridCategory_id\"] > 1\n",
    "                        or c[\"InfrastructureCategory_id\"] > 1\n",
    "                        or c[\"AvailabilityCategory_id\"] == 2\n",
    "                        or 12 <= c[\"TurbineCategory_id\"] <= 15\n",
    "                        or 21 <= c[\"TurbineCategory_id\"] <= 22\n",
    "                    )\n",
    "                )\n",
    "                or (c[\"ws_av\"] < 3 and c[\"ap_av\"] > 100)\n",
    "            ):\n",
    "                return \"unusual\"\n",
    "            else:\n",
    "                return \"normal\"\n",
    "\n",
    "        dfx[\"unusual\"] = dfx.apply(f3, axis=1)\n",
    "\n",
    "        # round to 6 hour intervals\n",
    "        def f4(c):\n",
    "            if 1 <= c[\"hours\"] <= 6:\n",
    "                return 6\n",
    "            elif 7 <= c[\"hours\"] <= 12:\n",
    "                return 12\n",
    "            elif 13 <= c[\"hours\"] <= 18:\n",
    "                return 18\n",
    "            elif 19 <= c[\"hours\"] <= 24:\n",
    "                return 24\n",
    "            elif 25 <= c[\"hours\"] <= 30:\n",
    "                return 30\n",
    "            elif 31 <= c[\"hours\"] <= 36:\n",
    "                return 36\n",
    "            elif 37 <= c[\"hours\"] <= 42:\n",
    "                return 42\n",
    "            elif 43 <= c[\"hours\"] <= 48:\n",
    "                return 48\n",
    "            else:\n",
    "                return c[\"hours\"]\n",
    "\n",
    "        dfx[\"hours6\"] = dfx.apply(f4, axis=1)\n",
    "\n",
    "        # change label for unusual and curtailed data (9999)\n",
    "        def f5(c):\n",
    "            if c[\"unusual\"] == \"unusual\" or c[\"curtailment\"] == \"curtailed\":\n",
    "                return 9999\n",
    "            else:\n",
    "                return c[\"hours6\"]\n",
    "\n",
    "        dfx[\"hours_%s\" % y] = dfx.apply(f5, axis=1)\n",
    "\n",
    "        # drop unnecessary columns\n",
    "        dfx = dfx.drop(\"hours6\", axis=1)\n",
    "        dfx = dfx.drop(\"hours\", axis=1)\n",
    "        dfx = dfx.drop(\"mins\", axis=1)\n",
    "        dfx = dfx.drop(\"curtailment\", axis=1)\n",
    "        dfx = dfx.drop(\"unusual\", axis=1)\n",
    "\n",
    "    # separate features from classes for classification\n",
    "    features = [\n",
    "        \"ap_av\",\n",
    "        \"ws_av\",\n",
    "        \"wd_av\",\n",
    "        \"pitch\",\n",
    "        \"ap_max\",\n",
    "        \"ap_dev\",\n",
    "        \"reactive_power\",\n",
    "        \"rs_av\",\n",
    "        \"gen_sp\",\n",
    "        \"nac_pos\",\n",
    "    ]\n",
    "    classes = [col for col in dfx.columns if \"hours\" in col]\n",
    "    # list of columns to copy into new df\n",
    "    list3 = features + classes + [\"timestamp\"]\n",
    "    df2 = dfx[list3].copy()\n",
    "    # drop NaNs\n",
    "    df2 = df2.dropna()\n",
    "    X = df2[features]\n",
    "    # normalise features to values b/w 0 and 1\n",
    "    X = preprocessing.normalize(X)\n",
    "    Y = df2[classes]\n",
    "    # convert from pd dataframe to np array\n",
    "    Y = Y.as_matrix()\n",
    "\n",
    "    # subsetting just the odd ones\n",
    "    weights = [\"uniform\", \"distance\"]\n",
    "    # empty list that will hold average cross validation scores for each n\n",
    "    scores = []\n",
    "    # cross validation using time series split\n",
    "    tscv = TimeSeriesSplit(n_splits=5)\n",
    "\n",
    "    # looping for each value of w and defining classifier\n",
    "    for w in weights:\n",
    "        knn = KNeighborsClassifier(weights=w, n_jobs=-1)\n",
    "        # empty list to hold score for each cross validation fold\n",
    "        p1 = []\n",
    "        # looping for each cross validation fold\n",
    "        for train_index, test_index in tscv.split(X):\n",
    "            # split train and test sets\n",
    "            X_train, X_test = X[train_index], X[test_index]\n",
    "            Y_train, Y_test = Y[train_index], Y[test_index]\n",
    "            # fit to classifier and predict\n",
    "            knn1 = knn.fit(X_train, Y_train)\n",
    "            pred = knn1.predict(X_test)\n",
    "            # accuracy score\n",
    "            p2 = np.sum(np.equal(Y_test, pred)) / Y_test.size\n",
    "            # add to list\n",
    "            p1.append(p2)\n",
    "        # average score across all cross validation folds\n",
    "        p = sum(p1) / len(p1)\n",
    "        scores.append(p)\n",
    "    # changing to misclassification error\n",
    "    MSE = [1 - x for x in scores]\n",
    "    # determining best n\n",
    "    optimal = weights[MSE.index(min(MSE))]\n",
    "    num.append(optimal)\n",
    "    err.append(min(MSE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = pd.DataFrame(num, columns=[\"weights\"])\n",
    "d[\"error\"] = err\n",
    "d[\"turbine\"] = list1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weights</th>\n",
       "      <th>error</th>\n",
       "      <th>turbine</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.208227</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.134103</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.108901</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.125060</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.093205</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.116663</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.215100</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.145820</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.120158</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.117416</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.132436</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.138280</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.142595</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.074375</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>uniform</td>\n",
       "      <td>0.181361</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.158894</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.149808</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.113547</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.086192</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.148732</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.076828</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.075347</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.090167</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.160592</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>distance</td>\n",
       "      <td>0.077421</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     weights     error  turbine\n",
       "0   distance  0.208227        1\n",
       "1   distance  0.134103        2\n",
       "2   distance  0.108901        3\n",
       "3   distance  0.125060        4\n",
       "4   distance  0.093205        5\n",
       "5   distance  0.116663        6\n",
       "6   distance  0.215100        7\n",
       "7   distance  0.145820        8\n",
       "8   distance  0.120158        9\n",
       "9   distance  0.117416       10\n",
       "10  distance  0.132436       11\n",
       "11  distance  0.138280       12\n",
       "12  distance  0.142595       13\n",
       "13  distance  0.074375       14\n",
       "14   uniform  0.181361       15\n",
       "15  distance  0.158894       16\n",
       "16  distance  0.149808       17\n",
       "17  distance  0.113547       18\n",
       "18  distance  0.086192       19\n",
       "19  distance  0.148732       20\n",
       "20  distance  0.076828       21\n",
       "21  distance  0.075347       22\n",
       "22  distance  0.090167       23\n",
       "23  distance  0.160592       24\n",
       "24  distance  0.077421       25"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}