Random forest classifier results

import itertools

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
# RandomForestClassifier is provided by sklearn.ensemble; sklearn.tree does
# not export it, so importing it from there raises ImportError.
from sklearn.ensemble import RandomForestClassifier

# read the merged SCADA / downtime CSV; blank lines in the file are skipped
df = pd.read_csv(
    "data/SCADA_downtime_merged.csv",
    skip_blank_lines=True,
)

# Turbines to process. Only turbine 1 is selected here; to cover every
# turbine, build this list from the unique turbine_id values of the data,
# sorted in ascending order as integers.
list1 = [1]

# Fault categories to process. Only category 11 is selected here; the full
# list would be the unique TurbineCategory_id values sorted as integers,
# optionally leaving out categories 1, 12, 13, 15, 21 and 22.
list2 = [11]

# every (turbine, category) combination, turbines varying slowest
list3 = [(turbine, category) for turbine in list1 for category in list2]

for x, y in list3:
    # work on an independent copy of the rows that belong to turbine x
    turbine_rows = df["turbine_id"] == x
    dfx = df.loc[turbine_rows].copy()

    # newest readings first, so the rows can be walked backwards in time
    dfx = dfx.sort_values("timestamp", ascending=False)

    # fault flag for the new "mins" column: 0 marks a row whose turbine
    # category id equals y (the fault of interest), 1 marks any other row
    def f(c):
        return 0 if c["TurbineCategory_id"] == y else 1

    # flag every row: 0 where the fault category y is active, 1 otherwise
    dfx["mins"] = dfx.apply(f, axis=1)

    # relabel rows 0..n-1 so that label i - 1 is the row directly above row i
    dfx.reset_index(drop=True, inplace=True)

    # Seed the first (latest) row. A fault row keeps its 0; any other row has
    # no later fault to count towards, so it gets a huge sentinel that ends up
    # far above the 48-hour cut-off applied further down.
    # DataFrame.set_value was removed in pandas 1.0; .at is its replacement.
    if dfx.at[0, "mins"] != 0:
        dfx.at[0, "mins"] = 999999999

    # Walk from the latest row towards the earliest: every non-fault row is
    # the value of the row above it plus 10 (presumably the 10-minute sampling
    # interval of the data -- confirm), fault rows stay at 0.
    for i, e in enumerate(dfx["mins"]):
        if e == 1:
            dfx.at[i, "mins"] = dfx.at[i - 1, "mins"] + 10

    # back to chronological order, with a fresh 0..n-1 index
    dfx = dfx.sort_values(by="timestamp")
    dfx.reset_index(drop=True, inplace=True)

    # convert to hours and round to the nearest whole hour
    hours = dfx["mins"].astype(np.int64) / 60
    dfx["hours"] = hours.round().astype(np.int64)

    # > 48 hours - label as normal (9999)
    def f1(c):
        """Return the row's hour count, or 9999 when it exceeds 48."""
        return 9999 if c["hours"] > 48 else c["hours"]

    # collapse hour counts above 48 into the 9999 "normal" label
    dfx["hours"] = dfx.apply(f1, axis=1)

    # filter out curtailment - curtailed when turbine is pitching outside
    # 0deg <= normal <= 3.5deg
    # The power thresholds are identical for every row, so derive them once
    # here instead of rescanning the whole ap_av column on each call to f2.
    ap_av_max = dfx["ap_av"].max()
    ap_av_low = 0.1 * ap_av_max
    ap_av_high = 0.9 * ap_av_max

    def f2(c):
        """Classify row ``c`` as "normal" or "curtailed".

        A row is "normal" when any of the following holds:
        its pitch lies within 0 to 3.5 (inclusive); its hours value is not
        the 9999 label; or its pitch lies outside that range while ap_av is
        at or below 10 % or at or above 90 % of the column maximum.
        Every other row is "curtailed".
        """
        if (
            0 <= c["pitch"] <= 3.5
            or c["hours"] != 9999
            or (
                (c["pitch"] > 3.5 or c["pitch"] < 0)
                and (c["ap_av"] <= ap_av_low or c["ap_av"] >= ap_av_high)
            )
        ):
            return "normal"
        else:
            return "curtailed"

    dfx["curtailment"] = dfx.apply(f2, axis=1)

    # flag implausible readings among rows labelled 9999: power above 100
    # below a wind speed of 3, or, at wind speeds between 3 and 25, no power,
    # runtime under 600, or a category id that signals a non-normal state
    def f3(c):
        """Return "unusual" for an implausible 9999 row, else "normal"."""
        # only rows carrying the 9999 label are ever flagged
        if c["hours"] != 9999:
            return "normal"

        wind_speed = c["ws_av"]

        # below wind speed 3: flagged only when power is above 100
        if wind_speed < 3:
            return "unusual" if c["ap_av"] > 100 else "normal"

        # nothing is flagged outside the open interval (3, 25)
        if not 3 < wind_speed < 25:
            return "normal"

        # wind speed between 3 and 25: any one of these marks the row
        if c["ap_av"] <= 0 or c["runtime"] < 600:
            return "unusual"
        if (
            c["EnvironmentalCategory_id"] > 1
            or c["GridCategory_id"] > 1
            or c["InfrastructureCategory_id"] > 1
            or c["AvailabilityCategory_id"] == 2
        ):
            return "unusual"
        if (
            12 <= c["TurbineCategory_id"] <= 15
            or 21 <= c["TurbineCategory_id"] <= 22
        ):
            return "unusual"
        return "normal"

    # run f3 on every row and store its "unusual" / "normal" verdict
    dfx["unusual"] = dfx.apply(f3, axis="columns")

    def f4(c):
        """Map the row's hour count onto the upper edge of its 6-hour bin.

        Values 1-6 give 6, 7-12 give 12, and so on up to 43-48 giving 48.
        Anything that falls in none of these bins is returned unchanged.
        """
        for upper in range(6, 49, 6):
            if upper - 5 <= c["hours"] <= upper:
                return upper
        return c["hours"]

    # bin the hour counts into 6-hour classes
    dfx["hours6"] = dfx.apply(f4, axis=1)

    # keep only rows that are neither curtailed nor flagged as unusual
    is_clean = (dfx["curtailment"] == "normal") & (dfx["unusual"] == "normal")
    df3 = dfx[is_clean]

    # columns used for modelling: ten measurements plus the class label;
    # rows with a missing value in any of them are dropped
    model_columns = [
        "ap_av",
        "ws_av",
        "wd_av",
        "pitch",
        "ap_max",
        "ap_dev",
        "reactive_power",
        "rs_av",
        "gen_sp",
        "nac_pos",
        "hours6",
    ]
    df4 = df3[model_columns].dropna()

    # feature matrix (ten measurement columns) and class labels
    features = [
        "ap_av", "ws_av", "wd_av", "pitch", "ap_max",
        "ap_dev", "reactive_power", "rs_av", "gen_sp", "nac_pos",
    ]
    X = df4[features]
    Y = df4["hours6"]

    # scale each sample, then hold out 20 % of the rows for validation
    Xn = preprocessing.normalize(X)
    validation_size = 0.20
    seed = 7
    split = model_selection.train_test_split(
        Xn, Y, test_size=validation_size, random_state=seed
    )
    X_train, X_validation, Y_train, Y_validation = split

    # random forest with balanced class weights and the default split
    # criterion (gini)
    clf = RandomForestClassifier(class_weight="balanced").fit(X_train, Y_train)

    # score the held-out rows: accuracy, confusion matrix, per-class report
    predictions = clf.predict(X_validation)
    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(Y_validation, predictions))

0.899764844209
[[  358     3     5     0     2     0     0     1     1    42]
 [    7    45    13     1     0     0     0     0     0   228]
 [    5     9    46     4     0     0     0     0     1   166]
 [    8     4     9    36     3     0     1     2     1   132]
 [    0     1     4     2    19     1     0     0     1   125]
 [    2     1     0     0     2    31     2     1     1   118]
 [    3     0     0     1     0     3    22     3     0    98]
 [    2     0     1     2     0     1     0    16     4   121]
 [    7     1     0     1     1     1     1     5    14   111]
 [    5     9   387     6     2     1     4     4    11 14718]]
             precision    recall  f1-score   support

          0       0.90      0.87      0.89       412
          6       0.62      0.15      0.25       294
         12       0.10      0.20      0.13       231
         18       0.68      0.18      0.29       196
         24       0.66      0.12      0.21       153
         30       0.82      0.20      0.32       158
         36       0.73      0.17      0.28       130
         42       0.50      0.11      0.18       147
         48       0.41      0.10      0.16       142
       9999       0.93      0.97      0.95     15147

avg / total       0.89      0.90      0.89     17010

Random forest classifier results

This Page