1 # pylint: disable=C0103, C0116, W0621, E0401, W0104, W0105, R0913, E1136, W0612, E0102, C0301, W0611, C0411, W0311, C0326, C0330, W0106, C0412
2 # -*- coding: utf-8 -*-
5 Automatically generated by Colaboratory.
7 Original file is located at
8 https://colab.research.google.com/drive/1TdQCHMWu8lPA53-jFhxXDUPQdjqufrL1
10 Contributors: **Rohit Singh Rathaur, Girish L.**
12 Copyright [2021](2021) [*Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka*]
14 Licensed under the Apache License, Version 2.0 (the "License");
15 you may not use this file except in compliance with the License.
16 You may obtain a copy of the License at
18 http://www.apache.org/licenses/LICENSE-2.0
20 Unless required by applicable law or agreed to in writing, software
21 distributed under the License is distributed on an "AS IS" BASIS,
22 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 See the License for the specific language governing permissions and
24 limitations under the License.
26 We mounted the drive to access the data
29 import sklearn.metrics as metrics
30 from sklearn.metrics import classification_report
32 from sklearn import tree
33 from sklearn.linear_model import LogisticRegression
34 from sklearn.model_selection import train_test_split
38 import matplotlib as mpl
39 import matplotlib.pyplot as plt
40 import tensorflow as tf
41 from google.colab import drive
# Colab-only step: mount Google Drive at /content/drive so the CSV path used below resolves.
42 drive.mount('/content/drive')
44 """We are importing libraries to read the CSV and to train the models"""
48 """We are reading CSV file using `read_csv` function and dropping the `Timestamp` column and storing it in a DataFrame called `df_Ellis`."""
# NOTE(review): `pd` is not imported in the visible import lines — confirm `import pandas as pd` exists.
50 df_Ellis = pd.read_csv(
51 "/content/drive/MyDrive/Failure/lstm/Ellis_FinalTwoConditionwithOR.csv")
# Remove the `Timestamp` column; the result replaces `df_Ellis`.
52 df_Ellis = df_Ellis.drop(columns='Timestamp')
55 """First we store the feature names in `feature_cols`, then define `X`, the feature matrix built from those columns, and `y`, the vector of target values."""
# NOTE(review): the `feature_cols = [` opener and the closing `]` are not among the visible lines — confirm they surround these entries.
59 'ellis-cpu.wait_perc',
60 'ellis-load.avg_1_min',
61 'ellis-net.in_bytes_sec',
62 'ellis-cpu.system_perc',
65 # X is a matrix, hence we use [] to access the features we want in feature_cols
66 X = df_Ellis[feature_cols]
68 # y is a vector, hence we use dot to access 'label'
# NOTE(review): the assignment of `y` is not visible here, yet `y` is passed to train_test_split — presumably `y = df_Ellis.label`; confirm.
71 """We split `X` and `y` into `X_train`, `X_test`, `y_train`, and `y_test` using the `train_test_split` function."""
73 # split X and y into training and testing sets
# test_size=0.30 holds out 30% of the rows for testing; random_state=5 fixes the shuffle so the split is reproducible.
74 X_train, X_test, y_train, y_test = train_test_split(
75 X, y, test_size=0.30, random_state=5)
77 """We are training the model with Decision Tree."""
79 # train a decision tree classifier on the training set
# NOTE: the estimator is bound to the name `logreg` although it is a DecisionTreeClassifier, not a logistic regression.
82 logreg = tree.DecisionTreeClassifier()
85 logreg.fit(X_train, y_train)
87 """We are making predictions for test set"""
89 # make class predictions for the testing set
90 y_pred_class = logreg.predict(X_test)
92 """Here, we are calculating the accuracy using `sklearn` library"""
# accuracy_score takes the true labels first and the predicted labels second.
95 print(metrics.accuracy_score(y_test, y_pred_class))
97 """We are examining the class distribution of the testing set using a `pandas` series method"""
99 # examine the class distribution of the testing set (using a Pandas Series method)
# NOTE: bare expression — its value is only displayed as the last expression of a notebook cell; nothing is printed when run as a script.
101 y_test.value_counts()
103 """We count the number of samples for each label in the training set"""
105 y_train.value_counts()
107 """We are calculating the percentage of ones because `y_test` only contains ones and zeroes, we can simply calculate the mean = percentage of ones"""
109 # calculate the percentage of ones
110 # because y_test only contains ones and zeros, we can simply calculate the
111 # mean = percentage of ones
# NOTE(review): the statement these comments describe is not visible here (presumably `y_test.mean()`); confirm.
114 """We are calculating the percentage of zeros"""
116 # calculate the percentage of zeros
# NOTE(review): likewise the statement is not visible here (presumably `1 - y_test.mean()`); confirm.
119 # calculate null accuracy in a single line of code
120 # only for binary classification problems coded as 0/1
# Null accuracy = the accuracy obtained by always predicting the more frequent class.
121 max(y_test.mean(), 1 - y_test.mean())
123 # calculate null accuracy (for multi-class classification problems)
# value_counts() orders classes by descending count, so head(1) is the most frequent class.
124 y_test.value_counts().head(1) / len(y_test)
126 # print the first 25 true and predicted responses
127 print('True:', y_test.values[0:50])
128 print('False:', y_pred_class[0:50])
130 # IMPORTANT: first argument is true values, second argument is predicted values
131 # this produces a 2x2 numpy array (matrix)
132 print(metrics.confusion_matrix(y_test, y_pred_class))
134 # save confusion matrix and slice into four pieces
135 confusion = metrics.confusion_matrix(y_test, y_pred_class)
# NOTE(review): TP, TN, FP and FN are used below but their assignments are not visible here — presumably sliced from `confusion`; confirm.
143 # use float to perform true division, not integer division
# Accuracy computed by hand from the four cells; the next line prints sklearn's accuracy_score for comparison.
144 print((TP + TN) / float(TP + TN + FP + FN))
145 print(metrics.accuracy_score(y_test, y_pred_class))
147 """We define a function `print_results` that prints the F1 score, classification report and ROC AUC for `y_test` and `y_pred`, and plots the confusion matrix and ROC curve."""
150 def print_results(y_test, y_pred):
153 f1 = metrics.f1_score(y_test, y_pred)
154 print("F1 Score: ", f1)
155 print(classification_report(y_test, y_pred))
157 conf_matrix = metrics.confusion_matrix(y_test, y_pred)
158 plt.figure(figsize=(12, 12))
160 sns.heatmap(conf_matrix, fmt="d", annot=True, cmap='Blues')
162 plt.ylim(b + 0.5, t - 0.5)
163 plt.title('Confuion Matrix')
164 plt.ylabel('True Values')
165 plt.xlabel('Predicted Values')
168 model_roc_auc = metrics.roc_auc_score(y_test, y_pred)
169 print("Area under curve : ", model_roc_auc, "\n")
170 fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
171 gmeans = np.sqrt(tpr * (1 - fpr))
172 ix = np.argmax(gmeans)
173 threshold = np.round(thresholds[ix], 3)
183 plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
189 label='Best Threshold:' +
192 plt.ylim([0.0, 1.05])
193 plt.xlabel('False Positive Rate')
194 plt.ylabel('True Positive Rate')
195 plt.title('Receiver operating characteristic')
196 plt.legend(loc="lower right")
# NOTE(review): y_pred_class comes from logreg.predict(), i.e. hard class labels, so the ROC/AUC
# inside print_results are computed from labels rather than scores — confirm this is intended.
199 print_results(y_test, y_pred_class)