models/failure_prediction/python/decision_tree.py

   1 # pylint: disable=C0103, C0116, W0621, E0401, W0104, W0105, R0913, E1136, W0612, E0102, C0301, W0611, C0411, W0311, C0326, C0330, W0106, C0412
   2 # -*- coding: utf-8 -*-
   3 """Decision_Tree.ipynb
   4
   5 Automatically generated by Colaboratory.
   6
   7 Original file is located at
   8     https://colab.research.google.com/drive/1TdQCHMWu8lPA53-jFhxXDUPQdjqufrL1
   9
  10 Contributors: **Rohit Singh Rathaur, Girish L.**
  11
  12 Copyright [2021](2021) [*Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka*]
  13
  14 Licensed under the Apache License, Version 2.0 (the "License");
  15 you may not use this file except in compliance with the License.
  16 You may obtain a copy of the License at
  17
  18     http://www.apache.org/licenses/LICENSE-2.0
  19
  20 Unless required by applicable law or agreed to in writing, software
  21 distributed under the License is distributed on an "AS IS" BASIS,
  22 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  23 See the License for the specific language governing permissions and
  24 limitations under the License.
  25
  26 We mounted the drive to access the data
  27 """
  28
  29 import sklearn.metrics as metrics
  30 from sklearn.metrics import classification_report
  31 import seaborn as sns
  32 from sklearn import tree
  33 from sklearn.linear_model import LogisticRegression
  34 from sklearn.model_selection import train_test_split
  35 import os
  36 import numpy as np
  37 import pandas as pd
  38 import matplotlib as mpl
  39 import matplotlib.pyplot as plt
  40 import tensorflow as tf
  41 from google.colab import drive
  42 drive.mount('/content/drive')
  43
  44 """We are importing libraries to read the CSV and to train the models"""
  45
  46 # Importing libraries
  47
  48 """We are reading CSV file using `read_csv` function and dropping the `Timestamp` column and storing it in a DataFrame called `df_Ellis`."""
  49
  50 df_Ellis = pd.read_csv(
  51     "/content/drive/MyDrive/Failure/lstm/Ellis_FinalTwoConditionwithOR.csv")
  52 df_Ellis = df_Ellis.drop(columns='Timestamp')
  53 df_Ellis
  54
"""First we list the feature names in `feature_cols`, then define `X`, the feature matrix holding those columns, and `y`, the vector holding the target values."""
  56
  57 # define X and y
  58 feature_cols = [
  59     'ellis-cpu.wait_perc',
  60     'ellis-load.avg_1_min',
  61     'ellis-net.in_bytes_sec',
  62     'ellis-cpu.system_perc',
  63     'ellis-mem.free_mb']
  64
  65 # X is a matrix, hence we use [] to access the features we want in feature_cols
  66 X = df_Ellis[feature_cols]
  67
  68 # y is a vector, hence we use dot to access 'label'
  69 y = df_Ellis.Label
  70
"""We split `X` and `y` into `X_train`, `X_test`, `y_train`, and `y_test` using the `train_test_split` function."""
  72
  73 # split X and y into training and testing sets
  74 X_train, X_test, y_train, y_test = train_test_split(
  75     X, y, test_size=0.30, random_state=5)
  76
  77 """We are training the model with Decision Tree."""
  78
  79 # train a logistic regression model on the training set
  80
  81 # instantiate model
  82 logreg = tree.DecisionTreeClassifier()
  83
  84 # fit model
  85 logreg.fit(X_train, y_train)
  86
  87 """We are making predictions for test set"""
  88
  89 # make class predictions for the testing set
  90 y_pred_class = logreg.predict(X_test)
  91
  92 """Here, we are calculating the accuracy using `sklearn` library"""
  93
  94 # calculate accuracy
  95 print(metrics.accuracy_score(y_test, y_pred_class))
  96
  97 """We are examining the class distribution of the testing set using a `pandas` series method"""
  98
  99 # examine the class distribution of the testing set (using a Pandas Series
 100 # method)
 101 y_test.value_counts()
 102
"""We count the occurrences of each label in the training set"""
 104
 105 y_train.value_counts()
 106
 107 """We are calculating the percentage of ones because `y_test` only contains ones and zeroes, we can simply calculate the mean = percentage of ones"""
 108
 109 # calculate the percentage of ones
 110 # because y_test only contains ones and zeros, we can simply calculate the
 111 # mean = percentage of ones
 112 y_test.mean()
 113
 114 """We are calculating the percentage of zeros"""
 115
 116 # calculate the percentage of zeros
 117 1 - y_test.mean()
 118
 119 # calculate null accuracy in a single line of code
 120 # only for binary classification problems coded as 0/1
 121 max(y_test.mean(), 1 - y_test.mean())
 122
 123 # calculate null accuracy (for multi-class classification problems)
 124 y_test.value_counts().head(1) / len(y_test)
 125
 126 # print the first 25 true and predicted responses
 127 print('True:', y_test.values[0:50])
 128 print('False:', y_pred_class[0:50])
 129
 130 # IMPORTANT: first argument is true values, second argument is predicted values
 131 # this produces a 2x2 numpy array (matrix)
 132 print(metrics.confusion_matrix(y_test, y_pred_class))
 133
 134 # save confusion matrix and slice into four pieces
 135 confusion = metrics.confusion_matrix(y_test, y_pred_class)
 136 print(confusion)
 137 #[row, column]
 138 TP = confusion[1, 1]
 139 TN = confusion[0, 0]
 140 FP = confusion[0, 1]
 141 FN = confusion[1, 0]
 142
 143 # use float to perform true division, not integer division
 144 print((TP + TN) / float(TP + TN + FP + FN))
 145 print(metrics.accuracy_score(y_test, y_pred_class))
 146
"""We are defining a function `print_results` that prints the F1 score, the classification report and the ROC AUC, and plots the confusion matrix and the ROC curve for `y_test` and `y_pred`."""
 148
 149
 150 def print_results(y_test, y_pred):
 151
 152     # f1-score
 153     f1 = metrics.f1_score(y_test, y_pred)
 154     print("F1 Score: ", f1)
 155     print(classification_report(y_test, y_pred))
 156
 157     conf_matrix = metrics.confusion_matrix(y_test, y_pred)
 158     plt.figure(figsize=(12, 12))
 159     plt.subplot(221)
 160     sns.heatmap(conf_matrix, fmt="d", annot=True, cmap='Blues')
 161     b, t = plt.ylim()
 162     plt.ylim(b + 0.5, t - 0.5)
 163     plt.title('Confuion Matrix')
 164     plt.ylabel('True Values')
 165     plt.xlabel('Predicted Values')
 166
 167     # roc_auc_score
 168     model_roc_auc = metrics.roc_auc_score(y_test, y_pred)
 169     print("Area under curve : ", model_roc_auc, "\n")
 170     fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
 171     gmeans = np.sqrt(tpr * (1 - fpr))
 172     ix = np.argmax(gmeans)
 173     threshold = np.round(thresholds[ix], 3)
 174
 175     plt.subplot(222)
 176     plt.plot(
 177         fpr,
 178         tpr,
 179         color='darkorange',
 180         lw=1,
 181         label="Auc : %.3f" %
 182         model_roc_auc)
 183     plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
 184     plt.scatter(
 185         fpr[ix],
 186         tpr[ix],
 187         marker='o',
 188         color='black',
 189         label='Best Threshold:' +
 190         str(threshold))
 191     plt.xlim([0.0, 1.0])
 192     plt.ylim([0.0, 1.05])
 193     plt.xlabel('False Positive Rate')
 194     plt.ylabel('True Positive Rate')
 195     plt.title('Receiver operating characteristic')
 196     plt.legend(loc="lower right")
 197
 198
 199 print_results(y_test, y_pred_class)