Labelling data and comparing classification algorithms#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
# import data
df = pd.read_csv("data/SCADA_downtime_merged.csv", skip_blank_lines=True)
# list of turbines to plot
list1 = [1]
# list1 = list(df['turbine_id'].unique())
# sort turbines in ascending order
# list1 = sorted(list1, key=int)
# list of categories to plot
list2 = [10, 11]
# list2 = list(df['TurbineCategory_id'].unique())
# list2 = [g for g in list2 if g >= 0]
# sort categories in ascending order
# list2 = sorted(list2, key=int)
# categories to remove from plot
# list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]
for x in list1:
    # filter only data for turbine x
    dfx = df[(df["turbine_id"] == x)].copy()

    for y in list2:
        # copying fault to new column (mins) (fault when turbine category id
        # is y)
        def f(c):
            if c["TurbineCategory_id"] == y:
                return 0
            else:
                return 1

        dfx["mins"] = dfx.apply(f, axis=1)

        # sort values by timestamp in descending order
        dfx = dfx.sort_values(by="timestamp", ascending=False)

        # reset index
        dfx.reset_index(drop=True, inplace=True)

        # assigning value to first cell if it's not 0
        if dfx.loc[0, "mins"] == 0:
            dfx.set_value(0, "mins", 0)
        else:
            dfx.set_value(0, "mins", 999999999)

        # using previous value's row to evaluate time
        for i, e in enumerate(dfx["mins"]):
            if e == 1:
                dfx.at[i, "mins"] = dfx.at[i - 1, "mins"] + 10

        # sort in ascending order
        dfx = dfx.sort_values(by="timestamp")

        # reset index
        dfx.reset_index(drop=True, inplace=True)

        # convert to hours and round to nearest hour
        dfx["hours"] = dfx["mins"].astype(np.int64)
        dfx["hours"] = dfx["hours"] / 60
        dfx["hours"] = round(dfx["hours"]).astype(np.int64)

        # > 48 hours - label as normal (9999)
        def f1(c):
            if c["hours"] > 48:
                return 9999
            else:
                return c["hours"]

        dfx["hours"] = dfx.apply(f1, axis=1)

        # filter out curtailment - curtailed when turbine is pitching outside
        # 0deg <= normal <= 3.5deg
        def f2(c):
            if (
                0 <= c["pitch"] <= 3.5
                or c["hours"] != 9999
                or (
                    (c["pitch"] > 3.5 or c["pitch"] < 0)
                    and (
                        c["ap_av"] <= (0.1 * dfx["ap_av"].max())
                        or c["ap_av"] >= (0.9 * dfx["ap_av"].max())
                    )
                )
            ):
                return "normal"
            else:
                return "curtailed"

        dfx["curtailment"] = dfx.apply(f2, axis=1)

        # filter unusual readings, i.e. for normal operation, power <= 0 in
        # operating wind speeds, power > 100 before cut-in, runtime < 600 and
        # other downtime categories
        def f3(c):
            if c["hours"] == 9999 and (
                (
                    3 < c["ws_av"] < 25
                    and (
                        c["ap_av"] <= 0
                        or c["runtime"] < 600
                        or c["EnvironmentalCategory_id"] > 1
                        or c["GridCategory_id"] > 1
                        or c["InfrastructureCategory_id"] > 1
                        or c["AvailabilityCategory_id"] == 2
                        or 12 <= c["TurbineCategory_id"] <= 15
                        or 21 <= c["TurbineCategory_id"] <= 22
                    )
                )
                or (c["ws_av"] < 3 and c["ap_av"] > 100)
            ):
                # remove unusual readings, i.e., zero power at operating wind
                # speeds, power > 0 before cut-in ...
                return "unusual"
            else:
                return "normal"

        dfx["unusual"] = dfx.apply(f3, axis=1)

        # round to 6 hour intervals
        def f4(c):
            if 1 <= c["hours"] <= 6:
                return 6
            elif 7 <= c["hours"] <= 12:
                return 12
            elif 13 <= c["hours"] <= 18:
                return 18
            elif 19 <= c["hours"] <= 24:
                return 24
            elif 25 <= c["hours"] <= 30:
                return 30
            elif 31 <= c["hours"] <= 36:
                return 36
            elif 37 <= c["hours"] <= 42:
                return 42
            elif 43 <= c["hours"] <= 48:
                return 48
            else:
                return c["hours"]

        dfx["hours6"] = dfx.apply(f4, axis=1)

        # change label for unusual and curtailed data
        def f5(c):
            if c["unusual"] == "unusual" or c["curtailment"] == "curtailed":
                return -9999
            else:
                return c["hours6"]

        dfx["hours_%s" % y] = dfx.apply(f5, axis=1)

        # drop unnecessary columns
        dfx = dfx.drop("hours6", axis=1)
        dfx = dfx.drop("hours", axis=1)
        dfx = dfx.drop("mins", axis=1)
        dfx = dfx.drop("curtailment", axis=1)
        dfx = dfx.drop("unusual", axis=1)

    df2 = dfx[
        [
            "ap_av",
            "ws_av",
            "wd_av",
            "pitch",
            "ap_max",
            "ap_dev",
            "reactive_power",
            "rs_av",
            "gen_sp",
            "nac_pos",
            "hours_11",
        ]
    ].copy()
    df2 = df2.dropna()

    # splitting data set
    features = [
        "ap_av",
        "ws_av",
        "wd_av",
        "pitch",
        "ap_max",
        "ap_dev",
        "reactive_power",
        "rs_av",
        "gen_sp",
        "nac_pos",
    ]
    X = df2[features]
    Y = df2["hours_11"]
    Xn = preprocessing.normalize(X)
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = (
        model_selection.train_test_split(
            Xn, Y, test_size=validation_size, random_state=seed
        )
    )

    models = []
    models.append(("LR", LogisticRegression(class_weight="balanced")))
    models.append(("LDA", LinearDiscriminantAnalysis()))
    models.append(("KNN5", KNeighborsClassifier()))
    models.append(("KNN15", KNeighborsClassifier(n_neighbors=15)))
    models.append(("DT", DecisionTreeClassifier(class_weight="balanced")))
    models.append(("RF", RandomForestClassifier(class_weight="balanced")))
    models.append(("NB", GaussianNB()))
    models.append(("MLP", MLPClassifier()))

    # evaluate each model in turn
    results = []
    names = []
    msg1 = "Results for Turbine %s" % x + " with Turbine Category %s" % y
    print(msg1)
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring="accuracy"
        )
        results.append(cv_results)
        names.append(name)
        msg = "{}: {:f} ({:f})".format(
            name, cv_results.mean(), cv_results.std()
        )
        print(msg)

    # compare algorithms
    fig = plt.figure()
    fig.suptitle("Algorithm Comparison")
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
Results for Turbine 1 with Turbine Category 11
LR: 0.682084 (0.006069)
LDA: 0.765452 (0.005270)
KNN5: 0.889947 (0.002742)
KNN15: 0.888914 (0.003590)
DT: 0.851390 (0.003076)
RF: 0.899010 (0.002075)
NB: 0.683200 (0.006298)
MLP: 0.890631 (0.004285)
../../_images/254c709ad63fc96e377e2006935c4f4a288545893b39c1637887fe799ec0dbe4.png
clf2 = RandomForestClassifier(class_weight="balanced")
clf2.fit(X_train, Y_train)
predictions2 = clf2.predict(X_validation)
print(accuracy_score(Y_validation, predictions2))
print(confusion_matrix(Y_validation, predictions2))
print(classification_report(Y_validation, predictions2))
0.903127204326
[[  369     5     4     1     1     0     0     0     3    37]
 [    3    57    13     1     1     0     2     0     0   202]
 [    5     9    60     2     0     0     0     0     1   160]
 [    1     7     6    50     3     0     0     4     0   140]
 [    0     1     4     3    22     0     1     1     2   124]
 [    0     0     3     0     5    33     2     2     2    98]
 [    0     1     0     0     2     3    31     1     0   124]
 [    4     1     0     3     0     1     1    19     3   114]
 [    1     0     3     1     0     0     1     3    22   107]
 [    4    22   356     9     6     6     7     3     2 14701]]
             precision    recall  f1-score   support

          0       0.95      0.88      0.91       420
          6       0.55      0.20      0.30       279
         12       0.13      0.25      0.17       237
         18       0.71      0.24      0.36       211
         24       0.55      0.14      0.22       158
         30       0.77      0.23      0.35       145
         36       0.69      0.19      0.30       162
         42       0.58      0.13      0.21       146
         48       0.63      0.16      0.25       138
       9999       0.93      0.97      0.95     15116

avg / total       0.90      0.90      0.89     17012
clf = KNeighborsClassifier()
clf.fit(X_train, Y_train)
predictions = clf.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
0.913825534917
[[  361     6     2     3     0     0     1     0     0    47]
 [    6    28     7     3     5     1     2     1     0   226]
 [    7     9    40     4     3     0     0     0     0   174]
 [    3     9    11    40     3     1     0     2     1   141]
 [    1     0     0     7    15     3     0     2     0   130]
 [    3     3     2     1     3    28     3     2     1    99]
 [    2     2     0     2     1     3    30     1     1   120]
 [    4     0     2     6     0     1     4    23     3   103]
 [    2     0     2     1     1     0     1     1    19   111]
 [   15    36    19    21    13    15    16    11     8 14962]]
             precision    recall  f1-score   support

          0       0.89      0.86      0.88       420
          6       0.30      0.10      0.15       279
         12       0.47      0.17      0.25       237
         18       0.45      0.19      0.27       211
         24       0.34      0.09      0.15       158
         30       0.54      0.19      0.28       145
         36       0.53      0.19      0.27       162
         42       0.53      0.16      0.24       146
         48       0.58      0.14      0.22       138
       9999       0.93      0.99      0.96     15116

avg / total       0.89      0.91      0.89     17012
print(dfx.groupby("hours6").size())
hours6
0         2204
6         1719
12        1289
18        1070
24         913
30         787
36         756
42         727
48         684
9999    121179
dtype: int64