Plots of results

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# decision trees
d = pd.read_csv("data/DT.csv")
# per turbine: summarise both score columns in one grouped pass
# (the mean is the plotted point; min and max bound its error bar)
summary = d.groupby("turbine")[["f1", "f"]].agg(["mean", "min", "max"])
x = summary.index.to_numpy()
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(list(range(1, 26)))
plt.xlabel("Turbine")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/1c504d5c31dd39df0cc2ffbb882368eb4fbfe5a53b2a51a51593a05d9732e1f7.png
# per turbine category: mean score per "fault" label with a min-max range
summary = d.groupby("fault")[["f1", "f"]].agg(["mean", "min", "max"])
# groupby orders its keys by their own dtype, which is lexicographic
# ("1", "10", "11", ..., "2") if the labels were read as strings. Put the
# rows themselves into numeric order so every point sits above the tick
# label it belongs to, instead of sorting only the labels.
summary = summary.loc[sorted(summary.index, key=int)]
categories = summary.index.tolist()
# one x position per category actually present in the data
x = np.arange(1, len(categories) + 1)
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(x, categories)
plt.xlabel("Turbine category")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/386b7f643f683a2119c4925b3882f428df5b58217a504e792c5227c30d3d57b7.png
# random forests
d = pd.read_csv("data/RF.csv")
# per turbine: summarise both score columns in one grouped pass
# (the mean is the plotted point; min and max bound its error bar)
summary = d.groupby("turbine")[["f1", "f"]].agg(["mean", "min", "max"])
x = summary.index.to_numpy()
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(list(range(1, 26)))
plt.xlabel("Turbine")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/c032fe07fc5841a21dba5ddbc42af89e4242caaa8e7f91a4c893e5f56c696ba7.png
# per turbine category: mean score per "fault" label with a min-max range
summary = d.groupby("fault")[["f1", "f"]].agg(["mean", "min", "max"])
# groupby orders its keys by their own dtype, which is lexicographic
# ("1", "10", "11", ..., "2") if the labels were read as strings. Put the
# rows themselves into numeric order so every point sits above the tick
# label it belongs to, instead of sorting only the labels.
summary = summary.loc[sorted(summary.index, key=int)]
categories = summary.index.tolist()
# one x position per category actually present in the data
x = np.arange(1, len(categories) + 1)
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(x, categories)
plt.xlabel("Turbine category")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/2eb493840c59bb5340c85c87b7b2e73aa8a18f179fd23d24704da0ebbc1cdbdc.png
# k nearest neighbours
# NOTE: `d` stays bound to this table; the "kNN optimised k" cells read it.
d = pd.read_csv("data/knn.csv")
# per turbine: summarise both score columns in one grouped pass
# (the mean is the plotted point; min and max bound its error bar)
summary = d.groupby("turbine")[["f1", "f"]].agg(["mean", "min", "max"])
x = summary.index.to_numpy()
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(list(range(1, 26)))
plt.xlabel("Turbine")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/a8fd35f29255c8c4ef9be5725cba62320ede18828b3af02f636ee4534e9f1df4.png
# per turbine category: mean score per "fault" label with a min-max range
summary = d.groupby("fault")[["f1", "f"]].agg(["mean", "min", "max"])
# groupby orders its keys by their own dtype, which is lexicographic
# ("1", "10", "11", ..., "2") if the labels were read as strings. Put the
# rows themselves into numeric order so every point sits above the tick
# label it belongs to, instead of sorting only the labels.
summary = summary.loc[sorted(summary.index, key=int)]
categories = summary.index.tolist()
# one x position per category actually present in the data
x = np.arange(1, len(categories) + 1)
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (score column, colour, legend label), drawn in this order
for column, colour, label in (
    ("f1", "#098A63", "balanced"),
    ("f", "#3F2B78", "imbalanced"),
):
    centre = summary[(column, "mean")].to_numpy()
    below = centre - summary[(column, "min")].to_numpy()
    above = summary[(column, "max")].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(x, categories)
plt.xlabel("Turbine category")
plt.ylabel("F1 score")
plt.legend(loc=4)
plt.show()
../../_images/66be7f77f5a1d000d52e91707099f0618df00aca245ca30050c18eeccfeff497.png
# kNN optimised k
d_k = pd.read_csv("data/knn-k.csv")
# per turbine: both score columns of the k-optimised table, plus the "f1"
# column of `d` (still the data/knn.csv table loaded in the kNN cell)
summary_k = d_k.groupby("turbine")[["f1", "f"]].agg(["mean", "min", "max"])
summary = d.groupby("turbine")["f1"].agg(["mean", "min", "max"])
# every series is plotted against the turbines found in d_k
x = summary_k.index.to_numpy()
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (mean/min/max table, colour, legend label), drawn in this order
for table, colour, label in (
    (summary, "#098A63", "balanced"),
    (summary_k["f1"], "#3F2B78", "imbalanced, without k optimisation"),
    (summary_k["f"], "C0", "imbalanced, with k optimisation"),
):
    centre = table["mean"].to_numpy()
    below = centre - table["min"].to_numpy()
    above = table["max"].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(list(range(1, 26)))
plt.xlabel("Turbine")
plt.ylabel("F1 score")
plt.legend()
plt.show()
../../_images/86da80d2d4e86db0ebdb09d01476b9afe24be237adf424ede0cc32e5bcfda6b8.png
# per turbine category: both score columns of d_k, plus the "f1" column of
# `d` (still the data/knn.csv table loaded in the kNN cell)
summary_k = d_k.groupby("fault")[["f1", "f"]].agg(["mean", "min", "max"])
summary = d.groupby("fault")["f1"].agg(["mean", "min", "max"])
# groupby orders its keys by their own dtype, which is lexicographic
# ("1", "10", "11", ..., "2") if the labels were read as strings. Put the
# rows themselves into numeric order so every point sits above the tick
# label it belongs to, instead of sorting only the labels.
summary_k = summary_k.loc[sorted(summary_k.index, key=int)]
summary = summary.loc[sorted(summary.index, key=int)]
categories = summary_k.index.tolist()
# one x position per category actually present in d_k
x = np.arange(1, len(categories) + 1)
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (mean/min/max table, colour, legend label), drawn in this order
for table, colour, label in (
    (summary, "#098A63", "balanced"),
    (summary_k["f1"], "#3F2B78", "imbalanced, without k optimisation"),
    (summary_k["f"], "C0", "imbalanced, with k optimisation"),
):
    centre = table["mean"].to_numpy()
    below = centre - table["min"].to_numpy()
    above = table["max"].to_numpy() - centre
    # the markers carry the legend entry; the bars come from a second call
    plt.errorbar(
        x, centre, linestyle="None", color=colour, marker="o", label=label
    )
    plt.errorbar(
        x, centre, [below, above], linestyle="None", ecolor=colour, capsize=3
    )
plt.xticks(x, categories)
plt.xlabel("Turbine category")
plt.ylabel("F1 score")
plt.legend()
plt.show()
../../_images/489211d96ad0bbbb0e5575dfc9c73dfe46c181c80b048e7401a9ebd6fd01dc52.png
# optimal k: one line per "k_*" column, plotted against the turbine column
d = pd.read_csv("data/knn-optimal.csv")

# x and y2 are read again by the F1-only figure that follows
x = d["turbine"]
y, y1, y2 = d["k_p"], d["k_r"], d["k_f"]

fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
# (values, colour, legend label), drawn in this order
for values, colour, label in (
    (y, "c", "k_precision"),
    (y1, "#098A63", "k_recall"),
    (y2, "#3F2B78", "k_F1-score"),
):
    plt.plot(x, values, color=colour, label=label, marker="o")
plt.xticks(range(1, 26))
plt.xlabel("Turbine")
plt.ylabel("Optimal k value")
plt.legend()
plt.show()
../../_images/bfa1872fe6d5d4d423ecf98b88781460cf0c0b55ca46aeab6782e13d3b3bbd64.png
# optimal k - F1 score
# Redraws only the y2 series on its own figure; x and y2 come from the
# "optimal k" cell above. No legend is drawn, so the label is not shown.
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
ax.plot(x, y2, color="#3F2B78", label="k_F1-score", marker="o")
ax.set_xlabel("Turbine")
ax.set_ylabel("Optimal k value")
ax.set_xticks(range(1, 26))
plt.show()
../../_images/b9c7fcbb80e8335a99ed8229f6bb98cbdd4cac8e68fb0b0c79a943d5216108aa.png