Python_Code: Added python code after running pylint
[thoth.git] / models / failure_prediction / python / lstm_correlation.py
1 # pylint: disable=C0103, C0116, W0621, E0401, W0104, W0105, R0913, E1136, W0612, E0102, C0301, W0611, C0411, W0311, W0404, E0602, C0326, C0330, W0106, C0412
2 # -*- coding: utf-8 -*-
3 """LSTM_correlation.ipynb
4
5 Automatically generated by Colaboratory.
6
7 Original file is located at
8     https://colab.research.google.com/drive/1pDIYGV2-FR7QJEhCt9HxlJfeIeqw8xBj
9
10 Contributors: Rohit Singh Rathaur, Girish L.
11
12 Copyright 2021 [Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka]
13
14 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
15
16 http://www.apache.org/licenses/LICENSE-2.0
17 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
18 """
19
20 import os
21 from keras.layers import Activation, Dense, Dropout
22 import seaborn as sns
23 import numpy as np
24 import pandas as pd
25 import matplotlib as mpl
26 import matplotlib.pyplot as plt
27 import tensorflow as tf
28 from google.colab import drive
29 drive.mount('/gdrive')
30
31 """We are importing the libraries:
32
33 - TensorFlow: to process and train the model
34 - Matplotlib: to plot the training anf loss curves
35 - Pandas: used for data analysis and it allows us to import data from various formats
36 - Numpy: For array computing
37 """
38
39 # Importing libraries
40
41 """We are reading the CSV file using `read_csv` function and storing it in a DataFrame named `df_Ellis`"""
42
43 df_Ellis = pd.read_csv(
44     "/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/Ellis_FinalTwoConditionwithOR.csv")
45 df_Ellis
46
47 df_Ellis.plot()
48
49 # we show here the hist
50 df_Ellis.hist(bins=100, figsize=(20, 15))
51 # save_fig("attribute_histogram_plots")
52 plt.show()
53
54 cpu_system_perc = df_Ellis[['ellis-cpu.system_perc']]
55 cpu_system_perc.rolling(12).mean().plot(
56     figsize=(20, 10), linewidth=5, fontsize=20)
57 plt.xlabel('Timestamp', fontsize=30)
58
59 load_avg_1_min = df_Ellis[['ellis-load.avg_1_min']]
60 load_avg_1_min.rolling(12).mean().plot(
61     figsize=(20, 10), linewidth=5, fontsize=20)
62 plt.xlabel('Timestamp', fontsize=30)
63
64 cpu_wait_perc = df_Ellis[['ellis-cpu.wait_perc']]
65 cpu_wait_perc.rolling(12).mean().plot(
66     figsize=(20, 10), linewidth=5, fontsize=20)
67 plt.xlabel('Year', fontsize=30)
68
69 df_dg = pd.concat([cpu_system_perc.rolling(12).mean(), load_avg_1_min.rolling(
70     12).mean(), cpu_wait_perc.rolling(12).mean()], axis=1)
71 df_dg.plot(figsize=(20, 10), linewidth=5, fontsize=20)
72 plt.xlabel('Year', fontsize=20)
73
74 # we establish the corrmartrice
75 color = sns.color_palette()
76 sns.set_style('darkgrid')
77
78 correaltionMatrice = df_Ellis.corr()
79 f, ax = plt.subplots(figsize=(20, 10))
80 sns.heatmap(
81     correaltionMatrice,
82     cbar=True,
83     vmin=0,
84     vmax=1,
85     square=True,
86     annot=True)
87 plt.show()
88
89 df_Ellis.corrwith(df_Ellis['ellis-load.avg_1_min'])
90
91 # using multivariate feature
92
93 features_3 = [
94     'ellis-cpu.wait_perc',
95     'ellis-load.avg_1_min',
96     'ellis-net.in_bytes_sec',
97     'Label']
98
99 features = df_Ellis[features_3]
100 features.index = df_Ellis['Timestamp']
101 features.head()
102
103 features.plot(subplots=True)
104
105 features = features.values
106
107 # standardize data
108 train_split = 141600
109 tf.random.set_seed(13)
110
111 # standardize data
112 features_mean = features[:train_split].mean()
113 features_std = features[:train_split].std()
114 features = (features - features_mean) / features_std
115
116 print(type(features))
117 print(features.shape)
118
119 # create mutlivariate data
120
121
122 def mutlivariate_data(
123         features,
124         target,
125         start_idx,
126         end_idx,
127         history_size,
128         target_size,
129         step,
130         single_step=False):
131     data = []
132     labels = []
133     start_idx = start_idx + history_size
134     if end_idx is None:
135         end_idx = len(features) - target_size
136     for i in range(start_idx, end_idx):
137         idxs = range(i - history_size, i, step)  # using step
138         data.append(features[idxs])
139         if single_step:
140             labels.append(target[i + target_size])
141         else:
142             labels.append(target[i:i + target_size])
143
144     return np.array(data), np.array(labels)
145
146 # generate multivariate data
147
148
149 history = 720
150 future_target = 72
151 STEP = 6
152
153 x_train_ss, y_train_ss = mutlivariate_data(
154     features, features[:, 1], 0, train_split, history, future_target, STEP, single_step=True)
155
156 x_val_ss, y_val_ss = mutlivariate_data(features, features[:, 1], train_split, None, history,
157                                        future_target, STEP, single_step=True)
158
159 print(x_train_ss.shape, y_train_ss.shape)
160 print(x_val_ss.shape, y_val_ss.shape)
161
162 # tensorflow dataset
163 batch_size = 256
164 buffer_size = 10000
165
166 train_ss = tf.data.Dataset.from_tensor_slices((x_train_ss, y_train_ss))
167 train_ss = train_ss.cache().shuffle(buffer_size).batch(batch_size).repeat()
168
169 val_ss = tf.data.Dataset.from_tensor_slices((x_val_ss, y_val_ss))
170 val_ss = val_ss.cache().shuffle(buffer_size).batch(batch_size).repeat()
171
172 print(train_ss)
173 print(val_ss)
174
175
176 def root_mean_squared_error(y_true, y_pred):
177     return K.sqrt(K.mean(K.square(y_pred - y_true)))
178
179
180 # Modelling using LSTM
181 steps = 50
182
183 EPOCHS = 20
184
185 single_step_model = tf.keras.models.Sequential()
186
187 single_step_model.add(tf.keras.layers.LSTM(
188     32, return_sequences=False, input_shape=x_train_ss.shape[-2:]))
189 single_step_model.add(tf.keras.layers.Dropout(0.3))
190 single_step_model.add(tf.keras.layers.Dense(1))
191 single_step_model.compile(
192     optimizer=tf.keras.optimizers.Adam(),
193     loss='mae',
194     metrics=[
195         tf.keras.metrics.RootMeanSquaredError(
196             name='rmse')])
197 #single_step_model.compile(loss='mse', optimizer='rmsprop')
198 single_step_model_history = single_step_model.fit(
199     train_ss,
200     epochs=EPOCHS,
201     steps_per_epoch=steps,
202     validation_data=val_ss,
203     validation_steps=50)
204 single_step_model.summary()
205
206 # plot train test loss
207
208
209 def plot_loss(history, title):
210     loss = history.history['loss']
211     val_loss = history.history['val_loss']
212
213     epochs = range(len(loss))
214     plt.figure()
215     plt.plot(epochs, loss, 'b', label='Train Loss')
216     plt.plot(epochs, val_loss, 'r', label='Validation Loss')
217     plt.title(title)
218     plt.legend()
219     plt.grid()
220     plt.show()
221
222
223 plot_loss(single_step_model_history,
224           'Single Step Training and validation loss')
225
226 # plot train test loss
227
228
229 def plot_loss(history, title):
230     loss = history.history['rmse']
231     val_loss = history.history['val_rmse']
232
233     epochs = range(len(loss))
234     plt.figure()
235     plt.plot(epochs, loss, 'b', label='Train RMSE')
236     plt.plot(epochs, val_loss, 'r', label='Validation RMSE')
237     plt.title(title)
238     plt.legend()
239     plt.grid()
240     plt.show()
241
242
243 plot_loss(single_step_model_history,
244           'Single Step Training and validation loss')
245
246 # fucntion to create time steps
247
248
249 def create_time_steps(length):
250     return list(range(-length, 0))
251
252 # function to plot time series data
253
254
255 def plot_time_series(plot_data, delta, title):
256     labels = ["History", 'True Future', 'Model Predcited']
257     marker = ['.-', 'rx', 'go']
258     time_steps = create_time_steps(plot_data[0].shape[0])
259
260     if delta:
261         future = delta
262     else:
263         future = 0
264     plt.title(title)
265     for i, x in enumerate(plot_data):
266         if i:
267             plt.plot(
268                 future,
269                 plot_data[i],
270                 marker[i],
271                 markersize=10,
272                 label=labels[i])
273         else:
274             plt.plot(
275                 time_steps,
276                 plot_data[i].flatten(),
277                 marker[i],
278                 label=labels[i])
279     plt.legend()
280     plt.xlim([time_steps[0], (future + 5) * 2])
281
282     plt.xlabel('Time_Step')
283     return plt
284
285 # Moving window average
286
287
288 def MWA(history):
289     return np.mean(history)
290
291 # plot time series and predicted values
292
293
294 for x, y in val_ss.take(5):
295     plot = plot_time_series([x[0][:, 1].numpy(), y[0].numpy(),
296                              single_step_model.predict(x)[0]], 12,
297                             'Single Step Prediction')
298     plot.show()
299
300 """# **MultiStep Forcasting**"""
301
302 future_target = 72  # 72 future values
303 x_train_multi, y_train_multi = mutlivariate_data(features, features[:, 1], 0,
304                                                  train_split, history,
305                                                  future_target, STEP)
306 x_val_multi, y_val_multi = mutlivariate_data(features, features[:, 1],
307                                              train_split, None, history,
308                                              future_target, STEP)
309
310 print(x_train_multi.shape)
311 print(y_train_multi.shape)
312
313 #  TF DATASET
314
315 train_data_multi = tf.data.Dataset.from_tensor_slices(
316     (x_train_multi, y_train_multi))
317 train_data_multi = train_data_multi.cache().shuffle(
318     buffer_size).batch(batch_size).repeat()
319
320 val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
321 val_data_multi = val_data_multi.batch(batch_size).repeat()
322
323 print(train_data_multi)
324 print(val_data_multi)
325
326 # plotting function
327
328
329 def multi_step_plot(history, true_future, prediction):
330     plt.figure(figsize=(12, 6))
331     num_in = create_time_steps(len(history))
332     num_out = len(true_future)
333     plt.grid()
334     plt.plot(num_in, np.array(history[:, 1]), label='History')
335     plt.plot(np.arange(num_out) / STEP, np.array(true_future), 'bo',
336              label='True Future')
337     if prediction.any():
338         plt.plot(np.arange(num_out) / STEP, np.array(prediction), 'ro',
339                  label='Predicted Future')
340     plt.legend(loc='upper left')
341     plt.show()
342
343
344 for x, y in train_data_multi.take(1):
345     multi_step_plot(x[0], y[0], np.array([0]))
346
347 multi_step_model = tf.keras.models.Sequential()
348 multi_step_model.add(tf.keras.layers.LSTM(
349     32, return_sequences=True, input_shape=x_train_multi.shape[-2:]))
350 multi_step_model.add(tf.keras.layers.LSTM(16, activation='relu'))
351 # aDD dropout layer (0.3)
352 multi_step_model.add(tf.keras.layers.Dense(72))  # for 72 outputs
353
354 multi_step_model.compile(
355     optimizer=tf.keras.optimizers.RMSprop(
356         clipvalue=1.0), loss='mae', metrics=[
357             tf.keras.metrics.RootMeanSquaredError(
358                 name='rmse')])
359
360 multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS,
361                                           steps_per_epoch=steps,
362                                           validation_data=val_data_multi,
363                                           validation_steps=50)
364
365 plot_loss(multi_step_history, 'Multi-Step Training and validation loss')
366
367 for x, y in val_data_multi.take(5):
368     multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])
369
370 scores = multi_step_model.evaluate(
371     x_train_multi,
372     y_train_multi,
373     verbose=1,
374     batch_size=200)
375 print('MAE: {}'.format(scores[1]))
376
377 scores_test = multi_step_model.evaluate(
378     x_val_multi, y_val_multi, verbose=1, batch_size=200)
379 print('MAE: {}'.format(scores[1]))