Process SCADA and downtime data#

This notebook merges the CSV files containing SCADA and downtime data into single dataframes. The SCADA data is split across four files: two older datasets and two newer ones, and the old and new datasets have most of their timestamps in common.

The older SCADA datasets were found to have some errors in the rotor speed readings. The merge replaces the old erroneous data points with the new ones and removes incomplete rows.

The downtime data is likewise merged, trimmed to the same time range as the SCADA data, and stripped of incomplete rows.

import glob
import os

import pandas as pd
# create directory to store processed data
os.makedirs("data/processed/", exist_ok=True)

Downtime categories#

# read and view data
data = pd.read_excel("data/Melogale Downtime Categories.xlsx")
data
Turbine Categories Unnamed: 1 Environmental Categories Unnamed: 3 Grid Categories Unnamed: 5 Infrastructure Categories Unnamed: 7 Availability Categories Unnamed: 9
0 id OLD name id name id name id name id name
1 0 Unknown 0 Unknown 0 Unknown 0 Unknown 0 Unknown
2 1 OK 1 OK 1 OK 1 OK 1 Available/Non-penalising
3 2 Anemometry 2 High Wind Shutdown 2 Planned Outage 2 Planned Outage 2 NOT available/penalising
4 3 Rotor Brake 3 Icing 3 Unplanned Outage 3 Unplanned Outage NaN NaN
5 4 Main Shaft 4 Turbulence NaN NaN NaN NaN NaN NaN
6 5 Gearbox 5 Work Halted NaN NaN NaN NaN NaN NaN
7 6 Generator 6 Lightning NaN NaN NaN NaN NaN NaN
8 7 Yaw System 7 Sector Management NaN NaN NaN NaN NaN NaN
9 8 Electrical Controls 8 Low Temperature NaN NaN NaN NaN NaN NaN
10 9 Hydraulics 11 Low Wind Shutdown NaN NaN NaN NaN NaN NaN
11 10 Electrical System 12 Shadow casting stop NaN NaN NaN NaN NaN NaN
12 11 Pitch Control 13 Ambient temperature high NaN NaN NaN NaN NaN NaN
13 12 Unlogged manual stop 14 Cable unwind NaN NaN NaN NaN NaN NaN
14 13 Customer Stop 15 Wildlife NaN NaN NaN NaN NaN NaN
15 14 Noise Constraints 16 Other NaN NaN NaN NaN NaN NaN
16 15 Scheduled Maintenance NaN NaN NaN NaN NaN NaN NaN NaN
17 16 Tower NaN NaN NaN NaN NaN NaN NaN NaN
18 17 Retrofit NaN NaN NaN NaN NaN NaN NaN NaN
19 18 Cable Unwind NaN NaN NaN NaN NaN NaN NaN NaN
20 19 Hub NaN NaN NaN NaN NaN NaN NaN NaN
21 20 Rotor Blades NaN NaN NaN NaN NaN NaN NaN NaN
22 21 Delayed Startup NaN NaN NaN NaN NaN NaN NaN NaN
23 22 Other NaN NaN NaN NaN NaN NaN NaN NaN
# drop the first row, which holds the secondary header (id/name)
data = data.drop([0])
# function to extract the id/name column pair for each category type
catData = {}


def categorise_data(cat, number):
    """Pull out one category type's id/name columns and label them."""
    catData[cat] = data.filter(
        items=[cat + " Categories", "Unnamed: " + str(number)]
    )
    catData[cat].rename(
        columns={
            cat + " Categories": "Category",
            "Unnamed: " + str(number): "Name",
        },
        inplace=True,
    )
    catData[cat]["Type"] = cat
    catData[cat].dropna(inplace=True)
# filtering
categorise_data("Turbine", 1)
categorise_data("Environmental", 3)
categorise_data("Grid", 5)
categorise_data("Infrastructure", 7)
categorise_data("Availability", 9)
# concatenate data
data = pd.concat(catData.values(), ignore_index=True)
data
Category Name Type
0 0 Unknown Turbine
1 1 OK Turbine
2 2 Anemometry Turbine
3 3 Rotor Brake Turbine
4 4 Main Shaft Turbine
5 5 Gearbox Turbine
6 6 Generator Turbine
7 7 Yaw System Turbine
8 8 Electrical Controls Turbine
9 9 Hydraulics Turbine
10 10 Electrical System Turbine
11 11 Pitch Control Turbine
12 12 Unlogged manual stop Turbine
13 13 Customer Stop Turbine
14 14 Noise Constraints Turbine
15 15 Scheduled Maintenance Turbine
16 16 Tower Turbine
17 17 Retrofit Turbine
18 18 Cable Unwind Turbine
19 19 Hub Turbine
20 20 Rotor Blades Turbine
21 21 Delayed Startup Turbine
22 22 Other Turbine
23 0 Unknown Environmental
24 1 OK Environmental
25 2 High Wind Shutdown Environmental
26 3 Icing Environmental
27 4 Turbulence Environmental
28 5 Work Halted Environmental
29 6 Lightning Environmental
30 7 Sector Management Environmental
31 8 Low Temperature Environmental
32 11 Low Wind Shutdown Environmental
33 12 Shadow casting stop Environmental
34 13 Ambient temperature high Environmental
35 14 Cable unwind Environmental
36 15 Wildlife Environmental
37 16 Other Environmental
38 0 Unknown Grid
39 1 OK Grid
40 2 Planned Outage Grid
41 3 Unplanned Outage Grid
42 0 Unknown Infrastructure
43 1 OK Infrastructure
44 2 Planned Outage Infrastructure
45 3 Unplanned Outage Infrastructure
46 0 Unknown Availability
47 1 Available/Non-penalising Availability
48 2 NOT available/penalising Availability
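The combined table can serve as a lookup for the category-ID columns that appear in the downtime data below. A minimal sketch on stand-in frames (the `events` rows and their values are hypothetical, not taken from the real data):

```python
import pandas as pd

# stand-ins: a slice of the combined category table and two hypothetical
# downtime rows carrying TurbineCategory_id values
categories = pd.DataFrame({
    "Category": [13, 18],
    "Name": ["Customer Stop", "Cable Unwind"],
    "Type": ["Turbine", "Turbine"],
})
events = pd.DataFrame({"TurbineCategory_id": [13, 18]})

# restrict the lookup to one category type, then attach names to the events
turbine_lookup = categories.loc[
    categories["Type"] == "Turbine", ["Category", "Name"]
]
labelled = events.merge(
    turbine_lookup,
    left_on="TurbineCategory_id",
    right_on="Category",
    how="left",
)
```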

Downtime time series#

dt = {}
# sort the glob results for a deterministic file order
dtList = sorted(glob.glob("data/*downtime*.csv"))
for num, filename in enumerate(dtList):
    dt[num] = pd.read_csv(filename)
    print(num, filename, dt[num].shape)
0 data/Last_six_months_downtime.csv (5367, 12)
1 data/Prior_two_years_downtime.csv (16958, 24)
dt[0].head(5)
id timestamp_start timestamp_end turbine_id alarm_id GridCategory_id InfrastructureCategory_id EnvironmentalCategory_id TurbineCategory_id AvailabilityCategory_id comment workorder_id
0 715463 2016-10-05 11:27:58.000 2016-10-05 11:28:37.000 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
1 715464 2016-10-05 11:28:37.000 2016-10-05 11:58:15.000 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
2 715465 2016-10-05 11:58:15.000 2016-10-05 12:00:04.000 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
3 715466 2016-10-05 12:00:50.000 2016-10-05 12:48:00.000 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
4 715467 2016-10-26 04:56:44.000 2016-10-26 05:20:00.000 18 391 1 1 1 18 1 FM1003Yaw CableRewind NaN
dt[1].head(5)
id timestamp_start timestamp_end turbine_id alarm_id GridCategory_id InfrastructureCategory_id EnvironmentalCategory_id TurbineCategory_id AvailabilityCategory_id ... timestamp_end.1 turbine_id.1 alarm_id.1 GridCategory_id.1 InfrastructureCategory_id.1 EnvironmentalCategory_id.1 TurbineCategory_id.1 AvailabilityCategory_id.1 comment.1 workorder_id.1
0 85851 2015-01-09 01:31:23.000 2015-01-09 01:42:07.000 6 389 1 1 2 1 1 ... 2015-01-09 01:42:07.000 6 389 1 1 2 1 1 NaN NaN
1 85852 2015-01-09 02:01:28.000 2015-01-09 02:06:43.000 6 389 1 1 2 1 1 ... 2015-01-09 02:06:43.000 6 389 1 1 2 1 1 NaN NaN
2 85853 2015-01-09 03:08:50.000 2015-01-09 03:20:00.000 6 389 1 1 2 1 1 ... 2015-01-09 03:20:00.000 6 389 1 1 2 1 1 NaN NaN
3 85854 2015-01-09 04:21:58.000 2015-01-09 05:30:00.000 6 389 1 1 2 1 1 ... 2015-01-09 05:30:00.000 6 389 1 1 2 1 1 NaN NaN
4 85855 2015-01-09 05:43:41.000 2015-01-09 05:54:57.000 6 389 1 1 2 1 1 ... 2015-01-09 05:54:57.000 6 389 1 1 2 1 1 NaN NaN

5 rows × 24 columns

dt[1].columns
Index(['id', 'timestamp_start', 'timestamp_end', 'turbine_id', 'alarm_id',
       'GridCategory_id', 'InfrastructureCategory_id',
       'EnvironmentalCategory_id', 'TurbineCategory_id',
       'AvailabilityCategory_id', 'comment', 'workorder_id', 'id.1',
       'timestamp_start.1', 'timestamp_end.1', 'turbine_id.1', 'alarm_id.1',
       'GridCategory_id.1', 'InfrastructureCategory_id.1',
       'EnvironmentalCategory_id.1', 'TurbineCategory_id.1',
       'AvailabilityCategory_id.1', 'comment.1', 'workorder_id.1'],
      dtype='object')
# drop duplicate columns (pandas appends a ".1" suffix to repeated names);
# escape the dot and anchor the pattern so only that suffix matches
dt[1].drop(columns=list(dt[1].filter(regex=r"\.1$")), inplace=True)
# convert timestamps to datetime data type
for key in dt.keys():
    for col in list(dt[key].filter(regex="timestamp")):
        dt[key][col] = pd.to_datetime(dt[key][col])
# concatenate data
data = pd.concat(dt.values(), join="outer")
data.head(5)
id timestamp_start timestamp_end turbine_id alarm_id GridCategory_id InfrastructureCategory_id EnvironmentalCategory_id TurbineCategory_id AvailabilityCategory_id comment workorder_id
0 715463 2016-10-05 11:27:58 2016-10-05 11:28:37 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
1 715464 2016-10-05 11:28:37 2016-10-05 11:58:15 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
2 715465 2016-10-05 11:58:15 2016-10-05 12:00:04 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
3 715466 2016-10-05 12:00:50 2016-10-05 12:48:00 18 5 1 1 1 13 1 Advanced rescue procedures 108128.0
4 715467 2016-10-26 04:56:44 2016-10-26 05:20:00 18 391 1 1 1 18 1 FM1003Yaw CableRewind NaN
data.shape
(22325, 12)
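With the timestamps parsed, per-event downtime durations follow from plain datetime subtraction, which yields a `Timedelta` column. A sketch on stand-in rows copied from the head shown above:

```python
import pandas as pd

# two events taken from the merged downtime data's head
events = pd.DataFrame({
    "timestamp_start": pd.to_datetime(
        ["2016-10-05 11:27:58", "2016-10-05 11:28:37"]
    ),
    "timestamp_end": pd.to_datetime(
        ["2016-10-05 11:28:37", "2016-10-05 11:58:15"]
    ),
})
# elementwise datetime subtraction gives a Timedelta column
events["duration"] = events["timestamp_end"] - events["timestamp_start"]
print(events["duration"].dt.total_seconds().tolist())
```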

SCADA time series#

# old SCADA data
scada = {}
# sort the glob results for a deterministic file order
scadaList = sorted(glob.glob("data/*SCADA.csv"))
for num, filename in enumerate(scadaList):
    scada[num] = pd.read_csv(filename)
    print(num, filename, scada[num].shape)
0 data/Last_six_months_SCADA.csv (651600, 17)
1 data/Prior_two_years_SCADA.csv (2550346, 17)
scada[0].head(5)
timestamp turbine ap_av ap_dev ap_max reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp rs_av nac_pos pitch runtime
0 01/11/2016 15:40 3 80.37572 29.46033 178.0 -9.292799 4.088211 4.126762 4.088211 290.4030 -17.849820 -4.077392 797.0712 0.0 294.4802 0.0 600.0
1 01/11/2016 15:40 4 141.68580 53.87635 242.0 1.339321 4.745911 5.071102 4.745911 316.6896 -15.205220 -6.955624 814.9814 0.0 323.6452 0.0 600.0
2 01/11/2016 15:40 5 14.85285 17.79933 91.0 -2.037041 2.886049 0.000000 2.886049 259.8634 0.000000 -2.367190 780.9207 0.0 262.2312 0.0 600.0
3 01/11/2016 15:40 6 139.71000 63.40798 273.0 -55.802610 4.517820 4.584790 4.517820 282.0574 -180.000000 -6.154136 816.2123 0.0 288.2119 0.0 600.0
4 01/11/2016 15:40 7 37.40505 40.17854 161.0 -145.587500 3.540881 3.540881 3.369349 228.9863 -3.396181 -4.141267 787.7513 0.0 232.3828 0.0 600.0
scada[1].head(5)
timestamp turbine ap_av ap_dev ap_min reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp rs_av nac_pos pitch runtime
0 2014-11-01 02:20:00 8 204.6824 74.51731 419.0 -38.03038 5.029291 5.029291 5.152810 204.4785 2.978069 -5.033260 836.9521 12.28072 201.5000 0.0 600.0
1 2014-11-01 02:20:00 9 228.1198 82.34712 400.0 91.85419 5.506918 5.506918 5.225697 215.6716 0.671585 -0.585945 845.9201 12.40917 215.0000 0.0 600.0
2 2014-11-01 02:20:00 10 159.3024 64.77182 302.0 -932.91080 5.442306 5.442306 5.653325 223.2563 2.356911 0.643579 822.5547 0.00000 220.9000 0.0 600.0
3 2014-11-01 02:20:00 11 119.3055 31.69469 199.0 -146.52910 4.281990 4.281990 4.095518 224.0360 0.852958 1.169835 808.5560 11.84853 223.1832 0.0 600.0
4 2014-11-01 02:20:00 12 168.3689 85.72588 387.0 77.63621 4.962106 4.962106 4.764067 240.3239 -3.776284 -5.501996 942.0538 13.78513 244.1000 0.0 600.0
# rename ap_min in scada[1] to ap_max so its columns match scada[0]
scada[1].rename(columns={"ap_min": "ap_max"}, inplace=True)
# fix rotor speed readings due to errors in the data


def fix_rs(c):
    """Keep the rotor speed reading for turbines 1-20; set others to NaN."""
    if c["turbine"] <= 20:
        return c["rs_av"]
    return None


for key in scada.keys():
    # preserve the filtered old readings under a new name, then drop rs_av
    scada[key]["rs_av_old"] = scada[key].apply(fix_rs, axis=1)
    scada[key] = scada[key].drop("rs_av", axis=1)
# concatenate old datasets
scadaOld = pd.concat(scada.values())
scadaOld.head(5)
timestamp turbine ap_av ap_dev ap_max reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp nac_pos pitch runtime rs_av_old
0 01/11/2016 15:40 3 80.37572 29.46033 178.0 -9.292799 4.088211 4.126762 4.088211 290.4030 -17.849820 -4.077392 797.0712 294.4802 0.0 600.0 0.0
1 01/11/2016 15:40 4 141.68580 53.87635 242.0 1.339321 4.745911 5.071102 4.745911 316.6896 -15.205220 -6.955624 814.9814 323.6452 0.0 600.0 0.0
2 01/11/2016 15:40 5 14.85285 17.79933 91.0 -2.037041 2.886049 0.000000 2.886049 259.8634 0.000000 -2.367190 780.9207 262.2312 0.0 600.0 0.0
3 01/11/2016 15:40 6 139.71000 63.40798 273.0 -55.802610 4.517820 4.584790 4.517820 282.0574 -180.000000 -6.154136 816.2123 288.2119 0.0 600.0 0.0
4 01/11/2016 15:40 7 37.40505 40.17854 161.0 -145.587500 3.540881 3.540881 3.369349 228.9863 -3.396181 -4.141267 787.7513 232.3828 0.0 600.0 0.0
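Row-wise `apply` is slow over a few million rows. Under the same logic as `fix_rs`, an equivalent vectorised form uses `Series.where`, which keeps values where the condition holds and inserts NaN elsewhere. A sketch on a tiny stand-in frame:

```python
import pandas as pd

# stand-in frame; turbine 21 plays the role of a reading to blank out
df = pd.DataFrame({"turbine": [3, 21], "rs_av": [12.3, 9.9]})

# keep rs_av where turbine <= 20, NaN elsewhere - same result as fix_rs,
# but computed in one vectorised pass
df["rs_av_old"] = df["rs_av"].where(df["turbine"] <= 20)
```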
# new SCADA data
scada = {}
# sort the glob results for a deterministic file order
scadaList = sorted(glob.glob("data/NS_SCADA*.csv"))
for num, filename in enumerate(scadaList):
    scada[num] = pd.read_csv(filename)
    print(num, filename, scada[num].shape)
0 data/NS_SCADA_2017_v2.csv (543425, 16)
1 data/NS_SCADA_v2.csv (3064604, 16)
scada[0].head(5)
timestamp turbine_id ap_av ap_dev ap_max reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp rs_av nac_pos pitch
0 2017-01-01 00:00:00.000 1 275.5676 79.71668 455 -128.126100 5.950170 6.065521 5.950170 347.89090 3.216685 3.490916 868.2094 12.78084 344.40000 0.0
1 2017-01-01 00:00:00.000 2 280.9523 58.14750 433 -141.462800 5.920139 5.920139 0.011563 125.38090 1.381044 4.931978 871.9911 12.73835 124.00000 0.0
2 2017-01-01 00:00:00.000 3 307.0966 54.00394 449 -72.826970 6.408066 5.701910 6.408066 12.70238 -9.518435 2.202345 878.9877 12.92154 10.50000 0.0
3 2017-01-01 00:00:00.000 4 348.8079 112.20640 633 1.584503 6.269817 6.569055 6.269817 18.58538 -6.821929 2.269212 909.9926 13.39747 16.31618 0.0
4 2017-01-01 00:00:00.000 5 315.8827 60.93330 538 -95.838270 5.848261 0.000000 5.848261 319.47270 0.000000 5.373057 882.6358 12.97570 314.10000 0.0
scada[1].head(5)
timestamp turbine_id ap_av ap_dev ap_max reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp rs_av nac_pos pitch
0 2015-01-01 00:00:00.000 1 1478.246 269.6151 1780 0.181361 10.90739 10.90739 11.467810 246.6747 2.730545 4.884112 1142.79500 16.754220 243.9438 3.475755
1 2015-01-01 00:00:00.000 2 2237.340 146.2053 2501 -8.899933 14.16283 14.16283 0.017031 211.1832 3.617529 9.506144 1155.91100 16.886540 207.5653 5.934744
2 2015-01-01 00:00:00.000 3 2076.400 465.8087 2769 334.474500 12.83834 12.83834 13.927340 242.8819 1.866124 9.465878 1141.49200 16.771280 241.0154 2.050259
3 2015-01-01 00:00:00.000 4 0.000 0.0000 0 0.000000 13.18276 13.18276 13.569020 164.8922 28.992340 35.905610 26.76113 0.481272 135.9000 87.000000
4 2015-01-01 00:00:00.000 5 1648.039 223.3687 1906 251.515400 11.84929 11.21665 11.849290 223.8225 -68.455830 2.734230 1146.72100 16.831610 221.0888 3.055791
# concatenate new datasets
scadaNew = pd.concat(scada.values())
scadaNew.head(5)
timestamp turbine_id ap_av ap_dev ap_max reactive_power ws_av ws_1 ws_2 wd_av wd_1 wd_2 gen_sp rs_av nac_pos pitch
0 2017-01-01 00:00:00.000 1 275.5676 79.71668 455 -128.126100 5.950170 6.065521 5.950170 347.89090 3.216685 3.490916 868.2094 12.78084 344.40000 0.0
1 2017-01-01 00:00:00.000 2 280.9523 58.14750 433 -141.462800 5.920139 5.920139 0.011563 125.38090 1.381044 4.931978 871.9911 12.73835 124.00000 0.0
2 2017-01-01 00:00:00.000 3 307.0966 54.00394 449 -72.826970 6.408066 5.701910 6.408066 12.70238 -9.518435 2.202345 878.9877 12.92154 10.50000 0.0
3 2017-01-01 00:00:00.000 4 348.8079 112.20640 633 1.584503 6.269817 6.569055 6.269817 18.58538 -6.821929 2.269212 909.9926 13.39747 16.31618 0.0
4 2017-01-01 00:00:00.000 5 315.8827 60.93330 538 -95.838270 5.848261 0.000000 5.848261 319.47270 0.000000 5.373057 882.6358 12.97570 314.10000 0.0
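A final alignment step would merge `scadaOld` and `scadaNew` so that the new rotor speed readings supersede the old ones. This is only a sketch on tiny stand-in frames: the key harmonisation shown here (day-first parsing of the old timestamps, renaming `turbine_id` to `turbine`) is an assumption about the data, not something established above.

```python
import pandas as pd

# stand-ins shaped like one row of each dataset
scadaOld = pd.DataFrame({
    "timestamp": ["01/11/2016 15:40"],
    "turbine": [3],
    "rs_av_old": [0.0],
})
scadaNew = pd.DataFrame({
    "timestamp": ["2016-11-01 15:40:00.000"],
    "turbine_id": [3],
    "rs_av": [11.5],
})

# harmonise the merge keys: parse timestamps (assuming the old data is
# day-first) and align the turbine column names
scadaOld["timestamp"] = pd.to_datetime(scadaOld["timestamp"], dayfirst=True)
scadaNew["timestamp"] = pd.to_datetime(scadaNew["timestamp"])
scadaNew = scadaNew.rename(columns={"turbine_id": "turbine"})

# outer merge keeps timestamps present in either dataset; where the new
# reading is missing, fall back to the old one
merged = scadaOld.merge(scadaNew, on=["timestamp", "turbine"], how="outer")
merged["rs_av"] = merged["rs_av"].fillna(merged["rs_av_old"])
```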