6 "name": "FeatureCreation.ipynb",
11 "display_name": "Python 3"
19 "cell_type": "markdown",
24 "Contributors: **Rohit Singh Rathaur, Girish L.** \n",
26 "Copyright 2021 *Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka*\n",
28 "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
29 "you may not use this file except in compliance with the License.\n",
30 "You may obtain a copy of the License at\n",
32 " http://www.apache.org/licenses/LICENSE-2.0\n",
34 "Unless required by applicable law or agreed to in writing, software\n",
35 "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
36 "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
37 "See the License for the specific language governing permissions and\n",
38 "limitations under the License."
47 "# Import libraries used for visualization and analysis\n",
48 "import pandas as pd\n",
49 "import numpy as np\n",
51 "%matplotlib inline\n",
52 "import matplotlib\n",
53 "import matplotlib.pyplot as plt\n",
55 "from pandas import Series,DataFrame\n",
56 "import matplotlib.pyplot as plt\n",
57 "import seaborn as sns\n",
58 "from sklearn.preprocessing import scale\n",
59 "from sklearn.decomposition import PCA\n",
60 "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
61 "from scipy import stats\n",
62 "from IPython.display import display, HTML"
64 "execution_count": null,
71 "base_uri": "https://localhost:8080/"
74 "outputId": "2b3ef633-a851-4c53-80eb-6b1bf4ffcc1c"
77 "from google.colab import drive\n",
78 "drive.mount('/gdrive')"
80 "execution_count": null,
83 "output_type": "stream",
85 "Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount(\"/gdrive\", force_remount=True).\n"
92 "cell_type": "markdown",
97 "# **Loading the Data**"
106 "df_Ellis = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/Final.csv\")\n",
107 "#df_Bono = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/matrices/df_Bono.csv\", error_bad_lines=False)\n",
108 "#df_Sprout = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/matrices/df_Sprout.csv\", error_bad_lines=False)\n",
109 "#df_Homer = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/matrices/df_Homer.csv\", error_bad_lines=False)\n",
110 "#df_Homestead = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/matrices/df_Homestead.csv\", error_bad_lines=False)\n",
111 "#df_Ralf = pd.read_csv(\"/gdrive/MyDrive/LFN Anuket/Analysis/data/matrices/df_Ralf.csv\", error_bad_lines=False)"
113 "execution_count": null,
120 "base_uri": "https://localhost:8080/",
123 "id": "dpy8jAm-TsCs",
124 "outputId": "d8ad2072-1fa3-4b3c-fb55-b5128767b349"
129 "execution_count": null,
132 "output_type": "execute_result",
137 " .dataframe tbody tr th:only-of-type {\n",
138 " vertical-align: middle;\n",
141 " .dataframe tbody tr th {\n",
142 " vertical-align: top;\n",
145 " .dataframe thead th {\n",
146 " text-align: right;\n",
149 "<table border=\"1\" class=\"dataframe\">\n",
151 " <tr style=\"text-align: right;\">\n",
153 " <th>Timestamp</th>\n",
154 " <th>ellis-cpu.system_perc</th>\n",
155 " <th>ellis-cpu.wait_perc</th>\n",
156 " <th>ellis-load.avg_1_min</th>\n",
157 " <th>ellis-mem.free_mb</th>\n",
158 " <th>ellis-net.in_bytes_sec</th>\n",
159 " <th>ellis-net.out_packets_sec</th>\n",
165 " <td>14/09/2016 0:00</td>\n",
170 " <td>5413.200</td>\n",
171 " <td>62.067</td>\n",
175 " <td>14/09/2016 0:00</td>\n",
180 " <td>5201.667</td>\n",
181 " <td>59.567</td>\n",
185 " <td>14/09/2016 0:01</td>\n",
190 " <td>5370.733</td>\n",
191 " <td>61.200</td>\n",
195 " <td>14/09/2016 0:01</td>\n",
200 " <td>5292.467</td>\n",
201 " <td>60.400</td>\n",
205 " <td>14/09/2016 0:02</td>\n",
210 " <td>5318.167</td>\n",
211 " <td>61.700</td>\n",
218 " Timestamp ... ellis-net.out_packets_sec\n",
219 "0 14/09/2016 0:00 ... 62.067\n",
220 "1 14/09/2016 0:00 ... 59.567\n",
221 "2 14/09/2016 0:01 ... 61.200\n",
222 "3 14/09/2016 0:01 ... 60.400\n",
223 "4 14/09/2016 0:02 ... 61.700\n",
225 "[5 rows x 7 columns]"
231 "execution_count": 264
239 "base_uri": "https://localhost:8080/",
242 "id": "dJa9FgJNgqpI",
243 "outputId": "54d6c43d-489f-4347-93e5-12e4a4da2066"
246 "df_Ellis.describe()"
248 "execution_count": null,
251 "output_type": "execute_result",
256 " .dataframe tbody tr th:only-of-type {\n",
257 " vertical-align: middle;\n",
260 " .dataframe tbody tr th {\n",
261 " vertical-align: top;\n",
264 " .dataframe thead th {\n",
265 " text-align: right;\n",
268 "<table border=\"1\" class=\"dataframe\">\n",
270 " <tr style=\"text-align: right;\">\n",
272 " <th>ellis-cpu.system_perc</th>\n",
273 " <th>ellis-cpu.wait_perc</th>\n",
274 " <th>ellis-load.avg_1_min</th>\n",
275 " <th>ellis-mem.free_mb</th>\n",
276 " <th>ellis-net.in_bytes_sec</th>\n",
277 " <th>ellis-net.out_packets_sec</th>\n",
283 " <td>177000.000000</td>\n",
284 " <td>177000.000000</td>\n",
285 " <td>177000.000000</td>\n",
286 " <td>177000.000000</td>\n",
287 " <td>1.770000e+05</td>\n",
288 " <td>177000.000000</td>\n",
292 " <td>2.315540</td>\n",
293 " <td>1.024163</td>\n",
294 " <td>0.198842</td>\n",
295 " <td>4206.847232</td>\n",
296 " <td>1.855987e+07</td>\n",
297 " <td>1336.694851</td>\n",
301 " <td>1.170977</td>\n",
302 " <td>3.127178</td>\n",
303 " <td>0.262227</td>\n",
304 " <td>173.364297</td>\n",
305 " <td>5.612164e+06</td>\n",
306 " <td>2220.146124</td>\n",
310 " <td>0.100000</td>\n",
311 " <td>0.000000</td>\n",
312 " <td>0.000000</td>\n",
313 " <td>2320.000000</td>\n",
314 " <td>0.000000e+00</td>\n",
315 " <td>0.000000</td>\n",
319 " <td>1.500000</td>\n",
320 " <td>0.200000</td>\n",
321 " <td>0.095000</td>\n",
322 " <td>4095.000000</td>\n",
323 " <td>1.797602e+07</td>\n",
324 " <td>182.033000</td>\n",
328 " <td>1.700000</td>\n",
329 " <td>0.200000</td>\n",
330 " <td>0.140000</td>\n",
331 " <td>4214.000000</td>\n",
332 " <td>2.087674e+07</td>\n",
333 " <td>200.067000</td>\n",
337 " <td>3.500000</td>\n",
338 " <td>0.400000</td>\n",
339 " <td>0.198000</td>\n",
340 " <td>4331.000000</td>\n",
341 " <td>2.160859e+07</td>\n",
342 " <td>1069.667000</td>\n",
346 " <td>16.700000</td>\n",
347 " <td>22.400000</td>\n",
348 " <td>2.580000</td>\n",
349 " <td>4633.000000</td>\n",
350 " <td>2.339041e+07</td>\n",
351 " <td>7887.552000</td>\n",
358 " ellis-cpu.system_perc ... ellis-net.out_packets_sec\n",
359 "count 177000.000000 ... 177000.000000\n",
360 "mean 2.315540 ... 1336.694851\n",
361 "std 1.170977 ... 2220.146124\n",
362 "min 0.100000 ... 0.000000\n",
363 "25% 1.500000 ... 182.033000\n",
364 "50% 1.700000 ... 200.067000\n",
365 "75% 3.500000 ... 1069.667000\n",
366 "max 16.700000 ... 7887.552000\n",
368 "[8 rows x 6 columns]"
374 "execution_count": 265
384 "#df_Ellis['SLO1'] = 0\n",
385 "#print('Column names are: ',list(df_Ellis.columns))"
387 "execution_count": null,
394 "base_uri": "https://localhost:8080/"
396 "id": "b-F_gA61xowR",
397 "outputId": "f9bd6232-2603-40ad-ccff-18887839e2da"
400 "df4 = df_Ellis[\"ellis-load.avg_1_min\"] > 2.45\n",
402 "df4.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/EllisLoadAvgLabel_lessthan0198.csv')\n",
405 "execution_count": null,
408 "output_type": "execute_result",
461 "Name: ellis-load.avg_1_min, dtype: bool"
467 "execution_count": 267
475 "base_uri": "https://localhost:8080/"
477 "id": "8xcPRerCz8nA",
478 "outputId": "fb66f20e-7365-40ec-857a-9dd9a8072401"
481 "df3 = df_Ellis[\"ellis-cpu.wait_perc\"] > 5\n",
483 "df3.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/ellis-cpu>5.csv')\n",
486 "execution_count": null,
489 "output_type": "execute_result",
542 "Name: ellis-cpu.wait_perc, dtype: bool"
548 "execution_count": 268
555 "id": "EED56Wiq_NjM",
557 "base_uri": "https://localhost:8080/"
559 "outputId": "20b06258-c5ba-457b-a022-cf5823217cbf"
562 "df5 = df_Ellis[\"ellis-net.out_packets_sec\"] > 1000\n",
564 "df5.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/ellis-net.in_bytes_sec21139.csv')\n",
567 "execution_count": null,
570 "output_type": "execute_result",
623 "Name: ellis-net.out_packets_sec, dtype: bool"
629 "execution_count": 269
637 "base_uri": "https://localhost:8080/"
639 "id": "phlI40_y0mug",
640 "outputId": "7fa177b9-bf9a-4b96-db65-7402f7f6cf32"
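643 "# Apply a logical OR between df4 and df3. NOTE(review): the slice 0:176999 keeps 176999 rows while describe() reports 177000, so the last row is excluded - confirm this is intended\n",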
644 "df6 = (df4[0:176999]) | (df3[0:176999])\n",
647 "execution_count": null,
650 "output_type": "execute_result",
709 "execution_count": 270
717 "base_uri": "https://localhost:8080/"
719 "id": "9xKYzZcLAZGy",
720 "outputId": "bc15e547-c791-4104-8bb2-8ed4d3288ac1"
723 "df6.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/OR_TwoCondition(2).csv')\n",
726 "execution_count": null,
729 "output_type": "execute_result",
788 "execution_count": 271
795 "id": "wRADpDibBZo5",
797 "base_uri": "https://localhost:8080/"
799 "outputId": "dfc6dc79-3d9f-4979-8210-e62e77b1aa6e"
802 "df7 = (df6[0:176999]) | (df5[0:176999])\n",
805 "execution_count": null,
808 "output_type": "execute_result",
867 "execution_count": 272
874 "id": "w6BrDjX4CODn",
876 "base_uri": "https://localhost:8080/"
878 "outputId": "a6c956e7-6aed-4bdd-f37f-505a994de51a"
881 "df7.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/FinalORLabel8.5.csv')\n",
884 "execution_count": null,
887 "output_type": "execute_result",
946 "execution_count": 273
956 "df_Ellis.insert (7, \"Label\", df7)"
958 "execution_count": null,
967 "#df_Ellis.insert (8, \"Label\", df7)"
969 "execution_count": null,
978 "# df6 is the logical OR of two conditions (df3 and df4); the Label column inserted into df_Ellis comes from df7, which also ORs in df5. Save the labelled frame\n",
980 "df_Ellis.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/Final/Ellis_FinalTwoConditionwithOR.csv')"
982 "execution_count": null,
988 "id": "3rEy1vtp67M9",
990 "base_uri": "https://localhost:8080/",
993 "outputId": "4e2175cc-dccb-4aaf-a152-e2452de241b0"
998 "execution_count": null,
1001 "output_type": "execute_result",
1006 " .dataframe tbody tr th:only-of-type {\n",
1007 " vertical-align: middle;\n",
1010 " .dataframe tbody tr th {\n",
1011 " vertical-align: top;\n",
1014 " .dataframe thead th {\n",
1015 " text-align: right;\n",
1018 "<table border=\"1\" class=\"dataframe\">\n",
1020 " <tr style=\"text-align: right;\">\n",
1022 " <th>Timestamp</th>\n",
1023 " <th>ellis-cpu.system_perc</th>\n",
1024 " <th>ellis-cpu.wait_perc</th>\n",
1025 " <th>ellis-load.avg_1_min</th>\n",
1026 " <th>ellis-mem.free_mb</th>\n",
1027 " <th>ellis-net.in_bytes_sec</th>\n",
1028 " <th>ellis-net.out_packets_sec</th>\n",
1029 " <th>Label</th>\n",
1035 " <td>14/09/2016 0:00</td>\n",
1040 " <td>5413.200</td>\n",
1041 " <td>62.067</td>\n",
1046 " <td>14/09/2016 0:00</td>\n",
1051 " <td>5201.667</td>\n",
1052 " <td>59.567</td>\n",
1057 " <td>14/09/2016 0:01</td>\n",
1062 " <td>5370.733</td>\n",
1063 " <td>61.200</td>\n",
1068 " <td>14/09/2016 0:01</td>\n",
1073 " <td>5292.467</td>\n",
1074 " <td>60.400</td>\n",
1079 " <td>14/09/2016 0:02</td>\n",
1084 " <td>5318.167</td>\n",
1085 " <td>61.700</td>\n",
1101 " <td>14/09/2016 0:47</td>\n",
1106 " <td>5187.133</td>\n",
1107 " <td>60.100</td>\n",
1112 " <td>14/09/2016 0:48</td>\n",
1117 " <td>5223.100</td>\n",
1118 " <td>60.233</td>\n",
1123 " <td>14/09/2016 0:48</td>\n",
1128 " <td>5335.200</td>\n",
1129 " <td>60.667</td>\n",
1134 " <td>14/09/2016 0:49</td>\n",
1139 " <td>5185.733</td>\n",
1140 " <td>60.367</td>\n",
1145 " <td>14/09/2016 0:49</td>\n",
1150 " <td>5204.233</td>\n",
1151 " <td>59.600</td>\n",
1156 "<p>100 rows × 8 columns</p>\n",
1160 " Timestamp ellis-cpu.system_perc ... ellis-net.out_packets_sec Label\n",
1161 "0 14/09/2016 0:00 0.5 ... 62.067 True\n",
1162 "1 14/09/2016 0:00 0.4 ... 59.567 True\n",
1163 "2 14/09/2016 0:01 0.4 ... 61.200 True\n",
1164 "3 14/09/2016 0:01 0.4 ... 60.400 True\n",
1165 "4 14/09/2016 0:02 0.5 ... 61.700 True\n",
1166 ".. ... ... ... ... ...\n",
1167 "95 14/09/2016 0:47 0.5 ... 60.100 True\n",
1168 "96 14/09/2016 0:48 0.5 ... 60.233 True\n",
1169 "97 14/09/2016 0:48 0.6 ... 60.667 True\n",
1170 "98 14/09/2016 0:49 0.6 ... 60.367 True\n",
1171 "99 14/09/2016 0:49 0.6 ... 59.600 True\n",
1173 "[100 rows x 8 columns]"
1179 "execution_count": 277
1184 "cell_type": "code",
1187 "base_uri": "https://localhost:8080/"
1189 "id": "11Qu45RY0HNG",
1190 "outputId": "305c5dd5-ec61-48a8-abb6-e29bbc4b9e42"
1193 "# Count how many rows carry each distinct value of the Label column\n",
1194 "df_Ellis['Label'].value_counts()"
1196 "execution_count": null,
1199 "output_type": "execute_result",
1204 "Name: Label, dtype: int64"
1210 "execution_count": 278
1215 "cell_type": "code",
1217 "id": "0sB-W_Ny4eHk"
1220 "#final.to_csv('/gdrive/MyDrive/LFN Anuket/Analysis/data/New/FinalLabel.csv')"
1222 "execution_count": null,
1226 "cell_type": "code",
1228 "id": "ERsufys7wcSg"
1231 "#df_Ellis.loc[(df_Ellis[\"ellis-cpu.wait_perc\"] > 5) & (df_Ellis[\"ellis-load.avg_1_min\"] > 2)]"
1233 "execution_count": null,
1237 "cell_type": "markdown",
1239 "id": "9le7MwnDhlnH"
1242 "# **Creating New Features**"
1246 "cell_type": "code",
1248 "id": "090QXGpPlEF6"
1253 "execution_count": null,