models/failure_prediction/python/bi_lstmstacked_lstm_correlation.py

   1 # pylint: disable=C0103, C0116, W0621, E0401, W0104, W0105, R0913, E1136, W0612, E0102, C0301, W0611, C0411
   2 # -*- coding: utf-8 -*-
   3 """Bi_LSTMstacked_LSTM_Correlation.ipynb
   4
   5 Automatically generated by Colaboratory.
   6
   7 Original file is located at
   8     https://colab.research.google.com/drive/1lwBt4E8mHUhRTWK94Y0KsUZ1jHgU1ePq
   9
  10 Contributors: **Rohit Singh Rathaur, Girish L.**
  11
  12 Copyright [2021](2021) [*Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka*]
  13
  14 Licensed under the Apache License, Version 2.0 (the "License");
  15 you may not use this file except in compliance with the License.
  16 You may obtain a copy of the License at
  17
  18     http://www.apache.org/licenses/LICENSE-2.0
  19
  20 Unless required by applicable law or agreed to in writing, software
  21 distributed under the License is distributed on an "AS IS" BASIS,
  22 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  23 See the License for the specific language governing permissions and
  24 limitations under the License.
  25 """
  26
  27 from keras import backend as K
  28 from keras.utils.vis_utils import plot_model
  29 import seaborn as sns
  30 import os
  31 import numpy as np
  32 import pandas as pd
  33 import matplotlib as mpl
  34 import matplotlib.pyplot as plt
  35 import tensorflow as tf
  36 from google.colab import drive
  37 drive.mount('/content/drive')
  38
  39 # Importing libraries
  40
  41 df_Ellis = pd.read_csv(
  42     "/content/drive/MyDrive/LFN Anuket/Analysis/data/Final/Ellis_FinalTwoConditionwithOR.csv")
  43 df_Ellis
  44
  45 df_Ellis.plot()
  46
  47 """We showed here the histograms of Ellis data"""
  48
  49 # we show here the hist
  50 df_Ellis.hist(bins=100, figsize=(20, 15))
  51 # save_fig("attribute_histogram_plots")
  52 plt.show()
  53
  54 cpu_system_perc = df_Ellis[['ellis-cpu.system_perc']]
  55 cpu_system_perc.rolling(12).mean().plot(
  56     figsize=(20, 10), linewidth=5, fontsize=20)
  57 plt.xlabel('Timestamp', fontsize=30)
  58
  59 load_avg_1_min = df_Ellis[['ellis-load.avg_1_min']]
  60 load_avg_1_min.rolling(12).mean().plot(
  61     figsize=(20, 10), linewidth=5, fontsize=20)
  62 plt.xlabel('Timestamp', fontsize=30)
  63
"""## Identifying trends in Time Series data
There are several ways to identify trends in a time series. One popular way is to take a rolling average: for each time point, we average the points inside a window ending at that point (pandas `rolling` uses a trailing window unless `center=True` is passed). The number of points is specified by a window size, which we need to choose.

Because we are averaging, the rolling mean tends to smooth out noise and seasonality. We will see that below with the rolling average of `'ellis-cpu.wait_perc'`, computed using the built-in `pandas` methods.

When it comes to determining the window size, we start with a window of twelve samples. This value comes from a monthly-data example, where twelve months capture yearly seasonality; it should be tuned to the sampling interval of this dataset.

Note that in these code chunks we use two sets of square brackets to extract a column such as `'ellis-cpu.wait_perc'` as a DataFrame; if we had used one set, like `df_Ellis['ellis-cpu.wait_perc']`, we would have created a pandas Series.

The code chunks also chain methods: methods are called on an object one after another. Method chaining is pretty popular, and pandas is one of the packages that really allows you to use that style of programming to the max!
"""
  75
  76 cpu_wait_perc = df_Ellis[['ellis-cpu.wait_perc']]
  77 cpu_wait_perc.rolling(12).mean().plot(
  78     figsize=(20, 10), linewidth=5, fontsize=20)
  79 plt.xlabel('Year', fontsize=30)
  80
  81 """We have successfully removed the seasonality and we saw an upward trend for `ellis-cpu.wait_perc`! But how do these two search terms compare?
  82
  83 We can figure this out by plotting the trends of `'ellis-cpu.wait_perc'`, `cpu_system_perc` and `'load_avg_1_min'` on a single figure:
  84 """
  85
  86 df_dg = pd.concat([cpu_system_perc.rolling(12).mean(), load_avg_1_min.rolling(
  87     12).mean(), cpu_wait_perc.rolling(12).mean()], axis=1)
  88 df_dg.plot(figsize=(20, 10), linewidth=5, fontsize=20)
  89 plt.xlabel('Year', fontsize=20)
  90
  91 """We established the correlation matrix for Ellis data.
  92 Seaborn has five built-in themes to style its plots: `darkgrid`, `whitegrid`, `dark`, `white`, and `ticks`. Seaborn defaults to using the darkgrid theme for its plots, but we can change this styling to better suit our presentation needs.
  93
  94 To use any of the preset themes pass the name of it to `sns.set_style()`.
  95 """
  96
  97 # we establish the corrmartrice
  98 color = sns.color_palette()
  99 sns.set_style('darkgrid')
 100
 101 correaltionMatrice = df_Ellis.corr()
 102 f, ax = plt.subplots(figsize=(20, 10))
 103 sns.heatmap(
 104     correaltionMatrice,
 105     cbar=True,
 106     vmin=0,
 107     vmax=1,
 108     square=True,
 109     annot=True)
 110 plt.show()
 111
"""Compute pairwise correlation between the rows or columns of two DataFrame objects (here: every column of `df_Ellis` against the `ellis-load.avg_1_min` series)."""
 113
 114 df_Ellis.corrwith(df_Ellis['ellis-load.avg_1_min'])
 115
 116 # using multivariate feature
 117
 118 features_3 = [
 119     'ellis-cpu.wait_perc',
 120     'ellis-load.avg_1_min',
 121     'ellis-net.in_bytes_sec',
 122     'Label']
 123
 124 features = df_Ellis[features_3]
 125 features.index = df_Ellis['Timestamp']
 126 features.head()
 127
 128 features.plot(subplots=True)
 129
 130 features = features.values
 131
 132 """train test split for simple time series moving window average"""
 133
 134 # standardize data
 135 train_split = 141600
 136 tf.random.set_seed(13)
 137
 138 # standardize data
 139 features_mean = features[:train_split].mean()
 140 features_std = features[:train_split].std()
 141 features = (features - features_mean) / features_std
 142
 143 print(type(features))
 144 print(features.shape)
 145
 146 """Created multivariate data"""
 147
 148 # create mutlivariate data
 149
 150
 151 def mutlivariate_data(
 152         features,
 153         target,
 154         start_idx,
 155         end_idx,
 156         history_size,
 157         target_size,
 158         step,
 159         single_step=False):
 160     data = []
 161     labels = []
 162     start_idx = start_idx + history_size
 163     if end_idx is None:
 164         end_idx = len(features) - target_size
 165     for i in range(start_idx, end_idx):
 166         idxs = range(i - history_size, i, step)  # using step
 167         data.append(features[idxs])
 168         if single_step:
 169             labels.append(target[i + target_size])
 170         else:
 171             labels.append(target[i:i + target_size])
 172
 173     return np.array(data), np.array(labels)
 174
 175 # generate multivariate data
 176
 177
 178 history = 720
 179 future_target = 72
 180 STEP = 6
 181
 182 x_train_ss, y_train_ss = mutlivariate_data(
 183     features, features[:, 1], 0, train_split, history, future_target, STEP, single_step=True)
 184
 185 x_val_ss, y_val_ss = mutlivariate_data(features, features[:, 1], train_split, None, history,
 186                                        future_target, STEP, single_step=True)
 187
 188 print(x_train_ss.shape, y_train_ss.shape)
 189 print(x_val_ss.shape, y_val_ss.shape)
 190
"""The `tf.data.Dataset` API supports writing descriptive and efficient input pipelines. Dataset usage follows a common pattern:
- Create a source dataset from the input data.
- Apply dataset transformations to preprocess the data.
- Iterate over the dataset and process the elements.
Note: Iteration happens in a streaming fashion, so the full dataset does not need to fit into memory.
Once we have a dataset, we can apply transformations to prepare the data for our model:
"""
 198
 199 # tensorflow dataset
 200 batch_size = 256
 201 buffer_size = 10000
 202
 203 train_ss = tf.data.Dataset.from_tensor_slices((x_train_ss, y_train_ss))
 204 train_ss = train_ss.cache().shuffle(buffer_size).batch(batch_size).repeat()
 205
 206 val_ss = tf.data.Dataset.from_tensor_slices((x_val_ss, y_val_ss))
 207 val_ss = val_ss.cache().shuffle(buffer_size).batch(batch_size).repeat()
 208
 209 print(train_ss)
 210 print(val_ss)
 211
 212 x_train_ss.shape[-2:]
 213
 214
 215 def root_mean_squared_error(y_true, y_pred):
 216     return K.sqrt(K.mean(K.square(y_pred - y_true)))
 217
 218
 219 # Modelling using LSTM
 220 steps = 50
 221
 222 EPOCHS = 20
 223
 224 single_step_model = tf.keras.models.Sequential()
 225
 226 single_step_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
 227     32, return_sequences=True, input_shape=x_train_ss.shape[-2:])))
 228 # single_step_model.add(tf.keras.layers.Dropout(0.3))
 229 single_step_model.add(tf.keras.layers.LSTM(units=100, return_sequences=False))
 230 # single_step_model.add(tf.keras.layers.Dropout(0.2))
 231 #model.add(Dense(units=1, activation='relu'))
 232 single_step_model.add(tf.keras.layers.Activation("relu"))
 233 single_step_model.add(tf.keras.layers.Dense(1))
 234 single_step_model.compile(
 235     optimizer=tf.keras.optimizers.Adam(),
 236     loss='mae',
 237     metrics=[
 238         tf.keras.metrics.RootMeanSquaredError(
 239             name='rmse')])
 240 #single_step_model.compile(loss='mse', optimizer='rmsprop')
 241
 242 single_step_model_history = single_step_model.fit(
 243     train_ss,
 244     epochs=EPOCHS,
 245     steps_per_epoch=steps,
 246     validation_data=val_ss,
 247     validation_steps=50)
 248
 249 plot_model(
 250     single_step_model,
 251     to_file='/content/drive/MyDrive/LFN Anuket/Analysis/data/Final/Bi-LSTM.png',
 252     show_shapes=True,
 253     show_layer_names=True)
 254 single_step_model.summary()
 255
 256
 257 # plot train test loss
 258
 259 def plot_loss(history, title):
 260     loss = history.history['loss']
 261     val_loss = history.history['val_loss']
 262
 263     epochs = range(len(loss))
 264     plt.figure()
 265     plt.plot(epochs, loss, 'b', label='Train Loss')
 266     plt.plot(epochs, val_loss, 'r', label='Validation Loss')
 267     plt.title(title)
 268     plt.legend()
 269     plt.grid()
 270     plt.show()
 271
 272
 273 plot_loss(single_step_model_history,
 274           'Single Step Training and validation loss')
 275
 276 # plot train test loss
 277
 278
 279 def plot_loss(history, title):
 280     loss = history.history['rmse']
 281     val_loss = history.history['val_rmse']
 282
 283     epochs = range(len(loss))
 284     plt.figure()
 285     plt.plot(epochs, loss, 'b', label='Train RMSE')
 286     plt.plot(epochs, val_loss, 'r', label='Validation RMSE')
 287     plt.title(title)
 288     plt.legend()
 289     plt.grid()
 290     plt.show()
 291
 292
 293 plot_loss(single_step_model_history,
 294           'Single Step Training and validation loss')
 295
 296 # fucntion to create time steps
 297
 298
 299 def create_time_steps(length):
 300     return list(range(-length, 0))
 301
 302 # function to plot time series data
 303
 304
 305 def plot_time_series(plot_data, delta, title):
 306     labels = ["History", 'True Future', 'Model Predcited']
 307     marker = ['.-', 'rx', 'go']
 308     time_steps = create_time_steps(plot_data[0].shape[0])
 309
 310     if delta:
 311         future = delta
 312     else:
 313         future = 0
 314     plt.title(title)
 315     for i, x in enumerate(plot_data):
 316         if i:
 317             plt.plot(
 318                 future,
 319                 plot_data[i],
 320                 marker[i],
 321                 markersize=10,
 322                 label=labels[i])
 323         else:
 324             plt.plot(
 325                 time_steps,
 326                 plot_data[i].flatten(),
 327                 marker[i],
 328                 label=labels[i])
 329     plt.legend()
 330     plt.xlim([time_steps[0], (future + 5) * 2])
 331
 332     plt.xlabel('Time_Step')
 333     return plt
 334
 335 # Moving window average
 336
 337
 338 def MWA(history):
 339     return np.mean(history)
 340
 341 # plot time series and predicted values
 342
 343
 344 for x, y in val_ss.take(5):
 345     plot = plot_time_series([x[0][:, 1].numpy(), y[0].numpy(),
 346                              single_step_model.predict(x)[0]], 12,
 347                             'Single Step Prediction')
 348     plot.show()
 349
"""# **MultiStep Forecasting**"""
 351
 352 future_target = 72  # 72 future values
 353 x_train_multi, y_train_multi = mutlivariate_data(features, features[:, 1], 0,
 354                                                  train_split, history,
 355                                                  future_target, STEP)
 356 x_val_multi, y_val_multi = mutlivariate_data(features, features[:, 1],
 357                                              train_split, None, history,
 358                                              future_target, STEP)
 359
 360 print(x_train_multi.shape)
 361 print(y_train_multi.shape)
 362
 363 #  TF DATASET
 364
 365 train_data_multi = tf.data.Dataset.from_tensor_slices(
 366     (x_train_multi, y_train_multi))
 367 train_data_multi = train_data_multi.cache().shuffle(
 368     buffer_size).batch(batch_size).repeat()
 369
 370 val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
 371 val_data_multi = val_data_multi.batch(batch_size).repeat()
 372
 373 print(train_data_multi)
 374 print(val_data_multi)
 375
 376 # plotting function
 377
 378
 379 def multi_step_plot(history, true_future, prediction):
 380     plt.figure(figsize=(12, 6))
 381     num_in = create_time_steps(len(history))
 382     num_out = len(true_future)
 383     plt.grid()
 384     plt.plot(num_in, np.array(history[:, 1]), label='History')
 385     plt.plot(np.arange(num_out) / STEP, np.array(true_future), 'bo',
 386              label='True Future')
 387     if prediction.any():
 388         plt.plot(np.arange(num_out) / STEP, np.array(prediction), 'ro',
 389                  label='Predicted Future')
 390     plt.legend(loc='upper left')
 391     plt.show()
 392
 393
 394 for x, y in train_data_multi.take(1):
 395     multi_step_plot(x[0], y[0], np.array([0]))
 396
 397 """Bi-directional LSTM:
 398 On some sequence prediction problems, it can be beneficial to allow the LSTM model to learn the input sequence both forward and backwards and concatenate both interpretations. This is known as bidirectional.
 399
 400 Here, `tf.keras.layers.Bidirectional` is a bidirectional wrapper for RNNs which inherits from `Wrapper`, `Layer`, and `module`
 401 """
 402
 403 multi_step_model = tf.keras.models.Sequential()
 404 multi_step_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
 405     32, return_sequences=True, input_shape=x_train_multi.shape[-2:])))
 406 multi_step_model.add(tf.keras.layers.Dropout(0.2))
 407 multi_step_model.add(tf.keras.layers.LSTM(units=100, return_sequences=False))
 408 multi_step_model.add(tf.keras.layers.Dropout(0.2))
 409 #model.add(Dense(units=1, activation='relu'))
 410 multi_step_model.add(tf.keras.layers.Activation("relu"))
 411 # aDD dropout layer (0.3)
 412 multi_step_model.add(tf.keras.layers.Dense(72))  # for 72 outputs
 413
 414 multi_step_model.compile(
 415     optimizer=tf.keras.optimizers.RMSprop(
 416         clipvalue=1.0), loss='mae', metrics=[
 417             tf.keras.metrics.RootMeanSquaredError(
 418                 name='rmse')])
 419
 420 multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS,
 421                                           steps_per_epoch=steps,
 422                                           validation_data=val_data_multi,
 423                                           validation_steps=50)
 424
 425 plot_loss(multi_step_history, 'Multi-Step Training and validation loss')
 426
 427 for x, y in val_data_multi.take(5):
 428     multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])
 429
 430 scores = multi_step_model.evaluate(
 431     x_train_multi,
 432     y_train_multi,
 433     verbose=1,
 434     batch_size=200)
 435 print('MAE: {}'.format(scores[1]))
 436
 437 scores_test = multi_step_model.evaluate(
 438     x_val_multi, y_val_multi, verbose=1, batch_size=200)
 439 print('MAE: {}'.format(scores[1]))
 440
 441 y_pred_test = multi_step_model.predict(x_val_multi, verbose=0)
 442
 443 plt.figure(figsize=(10, 5))
 444 plt.plot(y_pred_test)
 445 plt.plot(y_val_multi)
 446 plt.ylabel("Value")
 447 plt.xlabel("Timestap")
 448 plt.legend(loc='upper left')
 449 plt.show()