Decision tree classifier results#

import itertools
import subprocess

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# import data
df = pd.read_csv("data/SCADA_downtime_merged.csv", skip_blank_lines=True)
# list of turbines to plot
list1 = [1]
# list1 = list(df['turbine_id'].unique())
# sort turbines in ascending order
# list1 = sorted(list1, key=int)
# list of categories to plot
list2 = [11]
# list2 = list(df1['TurbineCategory_id'].unique())
# sort categories in ascending order
# list2 = sorted(list2, key=int)
# categories to remove from plot
# list2 = [e for e in list2 if e not in (1, 12, 13, 15, 21, 22)]
list3 = list(itertools.product(list1, list2))
for x, y in list3:
    # filter only data for turbine x
    dfx = df[(df["turbine_id"] == x)].copy()

    # sort values by timestamp in descending order
    dfx = dfx.sort_values(by="timestamp", ascending=False)

    # copying fault to new column (mins) (fault when turbine category id is y)
    def f(c):
        if c["TurbineCategory_id"] == y:
            return 0
        else:
            return 1

    dfx["mins"] = dfx.apply(f, axis=1)

    # reset index
    dfx.reset_index(drop=True, inplace=True)

    # assigning value to first cell if it's not 0
    if dfx.loc[0, "mins"] == 0:
        dfx.set_value(0, "mins", 0)
    else:
        dfx.set_value(0, "mins", 999999999)

    # using previous value's row to evaluate time
    for i, e in enumerate(dfx["mins"]):
        if e == 1:
            dfx.at[i, "mins"] = dfx.at[i - 1, "mins"] + 10

    # sort in ascending order
    dfx = dfx.sort_values(by="timestamp")

    # reset index
    dfx.reset_index(drop=True, inplace=True)

    # convert to hours and round to nearest hour
    dfx["hours"] = dfx["mins"].astype(np.int64)
    dfx["hours"] = dfx["hours"] / 60
    dfx["hours"] = round(dfx["hours"])
    dfx["hours"] = dfx["hours"].astype(np.int64)

    # > 48 hours - label as normal (9999)
    def f1(c):
        if c["hours"] > 48:
            return 9999
        else:
            return c["hours"]

    dfx["hours"] = dfx.apply(f1, axis=1)

    # filter out curtailment - curtailed when turbine is pitching outside
    # 0deg <= normal <= 3.5deg
    def f2(c):
        if (
            0 <= c["pitch"] <= 3.5
            or c["hours"] != 9999
            or (
                (c["pitch"] > 3.5 or c["pitch"] < 0)
                and (
                    c["ap_av"] <= (0.1 * dfx["ap_av"].max())
                    or c["ap_av"] >= (0.9 * dfx["ap_av"].max())
                )
            )
        ):
            return "normal"
        else:
            return "curtailed"

    dfx["curtailment"] = dfx.apply(f2, axis=1)

    # filter unusual readings, i.e., for normal operation, power <= 0 in
    # operating wind speeds, power > 100 before cut-in, runtime < 600
    def f3(c):
        if c["hours"] == 9999 and (
            (
                3 < c["ws_av"] < 25
                and (
                    c["ap_av"] <= 0
                    or c["runtime"] < 600
                    or c["EnvironmentalCategory_id"] > 1
                    or c["GridCategory_id"] > 1
                    or c["InfrastructureCategory_id"] > 1
                    or c["AvailabilityCategory_id"] == 2
                    or 12 <= c["TurbineCategory_id"] <= 15
                    or 21 <= c["TurbineCategory_id"] <= 22
                )
            )
            or (c["ws_av"] < 3 and c["ap_av"] > 100)
        ):
            # remove unusual readings, i.e., zero power at operating wind
            # speeds, power > 0 before cut-in ...
            return "unusual"
        else:
            return "normal"

    dfx["unusual"] = dfx.apply(f3, axis=1)

    def f4(c):
        if 1 <= c["hours"] <= 6:
            return 6
        elif 7 <= c["hours"] <= 12:
            return 12
        elif 13 <= c["hours"] <= 18:
            return 18
        elif 19 <= c["hours"] <= 24:
            return 24
        elif 25 <= c["hours"] <= 30:
            return 30
        elif 31 <= c["hours"] <= 36:
            return 36
        elif 37 <= c["hours"] <= 42:
            return 42
        elif 43 <= c["hours"] <= 48:
            return 48
        else:
            return c["hours"]

    dfx["hours6"] = dfx.apply(f4, axis=1)

    # filter data
    # normal w/o curtailment
    df3 = dfx[dfx.curtailment == "normal"]
    # normal w/o curtailment and unusual readings
    df3 = df3[df3.unusual == "normal"]

    df4 = df3[
        [
            "ap_av",
            "ws_av",
            "wd_av",
            "pitch",
            "ap_max",
            "ap_dev",
            "reactive_power",
            "rs_av",
            "gen_sp",
            "nac_pos",
            "hours6",
        ]
    ].copy()
    df4 = df4.dropna()

    # splitting data set
    features = [
        "ap_av",
        "ws_av",
        "wd_av",
        "pitch",
        "ap_max",
        "ap_dev",
        "reactive_power",
        "rs_av",
        "gen_sp",
        "nac_pos",
    ]
    X = df4[features]
    Y = df4["hours6"]
    Xn = preprocessing.normalize(X)
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = (
        model_selection.train_test_split(
            Xn, Y, test_size=validation_size, random_state=seed
        )
    )

    # fit using gini criterion
    clf = DecisionTreeClassifier(class_weight="balanced")
    clf = clf.fit(X_train, Y_train)

    predictions = clf.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
0.849559082892
[[  341     7     7     2     2     1     2     2     4    44]
 [    9    75    19     5     3     2     3     1     0   177]
 [    3    17    50    12     5     7     1     3     2   131]
 [    3     7    11    49     6     2     4     3     3   108]
 [    0     7     6    13    25     2     0     1     2    97]
 [    2     2     6     3     3    42     5     0     3    92]
 [    2     2     0     3     1     6    25     3     1    87]
 [    1     1     1     5     0     8     1    28     7    95]
 [    6     1     0     4     4     3     2     5    22    95]
 [   51   169   515   140   115    83   114    76    90 13794]]
             precision    recall  f1-score   support

          0       0.82      0.83      0.82       412
          6       0.26      0.26      0.26       294
         12       0.08      0.22      0.12       231
         18       0.21      0.25      0.23       196
         24       0.15      0.16      0.16       153
         30       0.27      0.27      0.27       158
         36       0.16      0.19      0.17       130
         42       0.23      0.19      0.21       147
         48       0.16      0.15      0.16       142
       9999       0.94      0.91      0.92     15147

avg / total       0.87      0.85      0.86     17010
# after being fitted, the model can be used to predict the class of samples
clf.predict(X_validation)
array([9999, 9999, 9999, ..., 9999,   12, 9999], dtype=int64)
# alternatively, the probability of each class can be predicted, which is the
# fraction of training samples of the same class in a leaf
clf.predict_proba(X_validation)
array([[ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],
       [ 0. ,  0. ,  0.1, ...,  0. ,  0. ,  0.9],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],
       [ 0.3,  0.3,  0.4, ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  1. ]])
# using entropy
clf1 = DecisionTreeClassifier(criterion="entropy", class_weight="balanced")
clf1 = clf1.fit(X_train, Y_train)
predictions1 = clf1.predict(X_validation)
print(accuracy_score(Y_validation, predictions1))
print(confusion_matrix(Y_validation, predictions1))
print(classification_report(Y_validation, predictions1))
0.858083480306
[[  342     4     9     6     3     1     0     3     2    42]
 [    9    67    20     5     4     2     5     3     3   176]
 [    3     8    50     8     7     5     2     2     5   141]
 [    9     3    11    54     7     5     2     2     1   102]
 [    0     1     7     1    33     7     3     0     2    99]
 [    2     2     2     5     2    42     5     0     1    97]
 [    3     3     2     2     2     3    33     5     1    76]
 [    1     1     0     2     1     1     2    34    12    93]
 [    6     3     1     0     5     2     5     2    37    81]
 [   31   172   499   104    94    91    92    86    74 13904]]
             precision    recall  f1-score   support

          0       0.84      0.83      0.84       412
          6       0.25      0.23      0.24       294
         12       0.08      0.22      0.12       231
         18       0.29      0.28      0.28       196
         24       0.21      0.22      0.21       153
         30       0.26      0.27      0.26       158
         36       0.22      0.25      0.24       130
         42       0.25      0.23      0.24       147
         48       0.27      0.26      0.26       142
       9999       0.94      0.92      0.93     15147

avg / total       0.88      0.86      0.87     17010
# fault
df4["fault"] = df4["hours6"].astype(str)


def fx(c):
    if c["fault"] == "9999":
        return "normal"
    elif c["fault"] == "0":
        return "faulty"
    else:
        return "up to " + c["fault"] + " hr(s) before fault"


df4["fault"] = df4.apply(fx, axis=1)
# visualising


def visualise_tree(tree, feature_names):
    """Create tree svg using graphviz.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.
    """
    with open("df.dot", "w") as f:
        export_graphviz(
            tree,
            out_file=f,
            filled=True,
            rounded=True,
            feature_names=feature_names,
            special_characters=True,
            class_names=list(df4["fault"].unique()),
        )

    command = ["dot", "-Tpng", "df.dot", "-o", "df.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, i.e., graphviz, to produce visualisation")


visualise_tree(clf1, features)