4 "cell_type": "markdown",
7 "# Logs and Metrics Analysis Notebook\n",
9 "#### Used to capture anomalies in the logs and analyse / visualize the metrics in the vicinity of that time\n",
11 "##### Contributors:\n",
13 "- Adarsh Yadav <adiyadav0509@gmail.com> \n",
15 " Log Analysis and Anomaly Finding\n",
20 "- Aditya Srivastava <adityasrivastava301199@gmail.com>\n",
22 " Metrics Analysis and Visualization"
26 "cell_type": "markdown",
29 "### Metrics Analysis and Visualization"
34 "execution_count": null,
38 "import pandas as pd\n",
39 "import matplotlib.pyplot as plt\n",
40 "import matplotlib.dates as mdates\n",
41 "import numpy as np\n",
47 "from pprint import pprint\n",
49 "from datetime import datetime, timedelta\n",
51 "from elasticsearch import Elasticsearch\n",
52 "from elasticsearch_dsl import Search\n",
53 "from elasticsearch.connection import create_ssl_context\n",
60 "execution_count": null,
64 "PROMETHEUS = 'http://10.10.120.211:30902/' #do not change, unless sure"
68 "cell_type": "markdown",
76 "execution_count": null,
80 "#function to make DF out of query json\n",
82 "def convert_to_df(res_json):\n",
84 " data_list = res_json['data']['result']\n",
85 " res_df = pd.DataFrame()\n",
86 " if not data_list:\n",
90 " headers = data_list[0]\n",
91 " for data in data_list:\n",
92 " metrics = data['metric']\n",
93 " for metric in metrics.keys():\n",
94 " res_df[metric] = np.nan\n",
95 " res_df['value'] = 0\n",
97 " # filling the df\n",
98 " for data in data_list:\n",
99 " metrics = data['metric']\n",
100 " metrics['value'] = data['value'][-1]\n",
101 " res_df = res_df.append(metrics, ignore_index=True) \n",
105 "def convert_to_df_range(res_json):\n",
107 " data_list = res_json['data']['result']\n",
108 " res_df = pd.DataFrame()\n",
109 " if not data_list:\n",
112 " # filling the df\n",
113 " for data in data_list:\n",
114 " metrics = data['metric']\n",
115 " values = np.array(data['values'])\n",
116 " for time, value in values:\n",
117 " metrics['timestamp'] = time\n",
118 " metrics['value'] = value\n",
119 " res_df = res_df.append(metrics, ignore_index=True) \n",
126 "execution_count": null,
130 "# functions to query\n",
132 "def convert_to_timestamp(s):\n",
133 " return time.mktime(datetime.strptime(s, \"%Y-%m-%d %H:%M:%S\").timetuple())\n",
135 "def query_current(params={}):\n",
136 " # input: params\n",
138 " # Example: {'query': 'container_cpu_user_seconds_total'}\n",
140 " # Output: dict, loaded json response of the query\n",
142 " res = requests.get(PROMETHEUS + '/api/v1/query', \n",
144 " return json.loads(res.text)\n",
147 "def query_range(start, end, params={}, steps = '30s'):\n",
148 " # input: params\n",
150 " # Example: {'query': 'container_cpu_user_seconds_total'}\n",
152 " # Output: dict, loaded json response of the query\n",
153 " params[\"start\"] = convert_to_timestamp(start)\n",
154 " params[\"end\"] = convert_to_timestamp(end)\n",
155 " params[\"step\"] = steps\n",
157 " # print(params)\n",
159 " res = requests.get(PROMETHEUS + '/api/v1/query_range', \n",
163 " return json.loads(res.text)\n"
167 "cell_type": "markdown",
174 "cell_type": "markdown",
177 "## Analysis Function"
181 "cell_type": "markdown",
189 "execution_count": null,
193 "# CPU Unused Cores\n",
194 "def unused_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
196 " if csv is not None:\n",
197 " df = pd.read_csv(csv)\n",
200 " if start is None or end is None or node is None:\n",
201 " return \"Start, end and Node name required when fetching from prometheus\"\n",
203 " params = {'query' : \"collectd_cpu_percent{exported_instance='\" + node + \"'}\"}\n",
205 " target_cpu_usage_range = query_range(start, end, params, steps)\n",
206 " df = convert_to_df_range(target_cpu_usage_range)\n",
208 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
209 " groups = df.groupby(['cpu'])\n",
210 " if verbose: print(\"Unused Cores :\")\n",
211 " unused_cores = []\n",
212 " for key, item in groups:\n",
214 " idle_row = curr_df.loc[curr_df['type'] == 'idle']\n",
215 " if idle_row['value'].iloc[0] == '100':\n",
216 " if verbose: print(\"Core: \",key)\n",
217 " unused_cores.append(int(key))\n",
219 " print(\"Number of unused cores: \", len(unused_cores))\n",
220 " return unused_cores\n",
223 "#CPU fully used cores\n",
224 "def fully_used_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
226 " if csv is not None:\n",
227 " df = pd.read_csv(csv)\n",
230 " if start is None or end is None or node is None:\n",
231 " return \"Start, end and Node name required when fetching from prometheus\"\n",
233 " params = {'query' : \"collectd_cpu_percent{exported_instance='\" + node + \"'}\"}\n",
235 " target_cpu_usage_range = query_range(start, end, params, steps)\n",
236 " df = convert_to_df_range(target_cpu_usage_range)\n",
238 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
239 " groups = df.groupby(['cpu'])\n",
240 " if verbose: print(\"Fully Used Cores :\")\n",
241 " fully_used_cores = []\n",
242 " for key, item in groups:\n",
244 " idle_row = curr_df.loc[curr_df['type'] == 'idle']\n",
245 " if idle_row['value'].iloc[0] == '0':\n",
246 " if verbose: print(\"Core: \",key)\n",
247 " fully_used_cores.append(int(key))\n",
248 " print(\"Number of fully used cores: \", len(fully_used_cores))\n",
249 " return fully_used_cores\n",
252 "# CPU used cores plots\n",
253 "def plot_used_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
255 " if csv is not None:\n",
256 " df = pd.read_csv(csv)\n",
259 " if start is None or end is None or node is None:\n",
260 " return \"Start, end and Node name required when fetching from prometheus\"\n",
262 " params = {'query' : \"collectd_cpu_percent{exported_instance='\" + node + \"'}\"}\n",
264 " target_cpu_usage_range = query_range(start, end, params, steps)\n",
265 " df = convert_to_df_range(target_cpu_usage_range)\n",
267 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
268 " groups = df.groupby(['cpu'])\n",
269 " used_cores = []\n",
271 " for key, item in groups:\n",
273 " user_row = curr_df.loc[curr_df['type'] == 'user']\n",
274 " sys_row = curr_df.loc[curr_df['type'] == 'system']\n",
277 " if np.any(sys_row != '0') or np.any(user_row != '0'):\n",
278 " used_cores.append(key)\n",
279 " type_grps = curr_df.groupby('type')\n",
280 " fig = plt.figure(figsize=(24,6), facecolor='oldlace', edgecolor='red')\n",
282 " for type_key, new_item in type_grps:\n",
284 " if type_key == 'system':\n",
285 " ax1 = fig.add_subplot(131)\n",
286 " ax1.title.set_text(type_key)\n",
287 " ax1.plot(new_item['timestamp'], new_item['value'])\n",
288 " elif type_key == 'user':\n",
289 " ax2 = fig.add_subplot(132)\n",
290 " ax2.title.set_text(type_key)\n",
291 " ax2.plot(new_item['timestamp'], new_item['value'])\n",
292 " elif type_key == 'wait':\n",
293 " ax3 = fig.add_subplot(133)\n",
294 " ax3.title.set_text(type_key)\n",
295 " ax3.plot(new_item['timestamp'], new_item['value'])\n",
297 " plt.suptitle('Used CPU Core {}'.format(key), fontsize=14)\n",
299 " print(\"Number of used cores: \", len(used_cores))\n",
304 "cell_type": "markdown",
312 "execution_count": null,
316 "# Interface Dropped (both type 1 and 2, i.e rx and tx)\n",
317 "#TODO: Change this to separate functions later\n",
318 "def interface_dropped(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
320 " if csv is not None:\n",
321 " df = pd.read_csv(csv)\n",
322 " df_0 = df #TODO: Change this\n",
323 " df_1 = df #TODO: Change this\n",
325 " if start is None or end is None or node is None:\n",
326 " return \"Start, end and Node name required when fetching from prometheus\"\n",
328 " params = {'query' : \"collectd_interface_if_dropped_0_total{exported_instance='\" + node + \"'}\"}\n",
330 " interface_dropped_0 = query_range(start, end, params, steps)\n",
331 " df_0 = convert_to_df_range(interface_dropped_0)\n",
333 " params = {'query' : \"collectd_interface_if_dropped_1_total{exported_instance='\" + node + \"'}\"}\n",
334 " interface_dropped_1 = query_range(start, end, params, steps)\n",
335 " df_1 = convert_to_df_range(interface_dropped_1)\n",
338 " #df_0 : interfaces_dropped_0_df\n",
339 " df_0 = df_0.drop(['__name__', 'instance', 'job'], axis = 1)\n",
341 " #df_1 : interfaces_dropped_1_df\n",
342 " df_1 = df_1.drop(['__name__', 'instance', 'job'], axis = 1)\n",
344 " groups_0 = df_0.groupby(['interface'])\n",
345 " groups_1 = df_1.groupby(['interface'])\n",
347 " groups = [groups_0, groups_1]\n",
348 " dropped_interfaces= []\n",
350 " color = ['oldlace', 'mistyrose']\n",
351 " plot_iter = 111\n",
352 " for group in groups:\n",
355 " for key, item in group:\n",
357 " if np.any(curr_df['value'] == '1'):\n",
358 " dropped_row = curr_df.loc[curr_df['value'] == '1']\n",
359 " dropped.append([key, dropped_row['timestamp'].iloc[0]])\n",
360 " fig = plt.figure(figsize=(24,6), facecolor=color[drop_type], edgecolor='red')\n",
361 " ax = fig.add_subplot(plot_iter)\n",
362 " ax.title.set_text(\"Interface: {}\".format(key))\n",
363 " ax.plot(item['timestamp'], item['value'])\n",
364 " dropped_interfaces.append(dropped)\n",
365 " plt.suptitle('Interfaces Drop type {}'.format(drop_type), fontsize=14)\n",
368 " return dropped_interfaces\n",
371 "# Interface Errors (both type 1 and 2, i.e rx and tx)\n",
372 "#TODO: Change this to separate functions later\n",
373 "def interface_errors(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
375 " if csv is not None:\n",
376 " df = pd.read_csv(csv)\n",
377 " df_0 = df #TODO: Change this\n",
378 " df_1 = df #TODO: Change this\n",
380 " if start is None or end is None or node is None:\n",
381 " return \"Start, end and Node name required when fetching from prometheus\"\n",
383 " params = {'query' : \"collectd_interface_if_errors_0_total{exported_instance='\" + node + \"'}\"}\n",
384 " interfaces_errors_0 = query_range(start, end, params, steps)\n",
385 " df_0 = convert_to_df_range(interfaces_errors_0)\n",
387 " params = {'query' : \"collectd_interface_if_errors_1_total{exported_instance='\" + node + \"'}\"}\n",
388 " interface_errors_1 = query_range(start, end, params, steps)\n",
389 " df_1 = convert_to_df_range(interface_errors_1)\n",
392 " #df_0 : interfaces_errors_0_df\n",
393 " df_0 = df_0.drop(['__name__', 'instance', 'job'], axis = 1)\n",
395 " #df_1 : interfaces_dropped_1_df\n",
396 " df_1 = df_1.drop(['__name__', 'instance', 'job'], axis = 1)\n",
398 " groups_0 = df_0.groupby(['interface'])\n",
399 " groups_1 = df_1.groupby(['interface'])\n",
401 " groups = [groups_0, groups_1]\n",
402 " err_interfaces= []\n",
404 " color = ['oldlace', 'mistyrose']\n",
405 " for group in groups:\n",
408 " for key, item in group:\n",
411 " if np.any(curr_df['value'] == '1'):\n",
412 " err_row = curr_df.loc[curr_df['value'] == '1']\n",
413 " erros.append([key, err_row['timestamp'].iloc[0]])\n",
415 " fig = plt.figure(figsize=(24,6), facecolor=color[err_type], edgecolor='red')\n",
416 " ax = fig.add_subplot(111)\n",
417 " ax.title.set_text(\"Interface: {}\".format(key))\n",
418 " ax.plot(item['timestamp'], item['value'])\n",
420 " err_interfaces.append(errors)\n",
421 " plt.suptitle('Interfaces Error type {}'.format(err_type), fontsize=14)\n",
425 " return err_interfaces"
429 "cell_type": "markdown",
437 "execution_count": null,
441 "# L3 cache bytes\n",
442 "def plot_rdt_bytes(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
444 " if csv is not None:\n",
445 " df = pd.read_csv(csv)\n",
447 " if start is None or end is None or node is None:\n",
448 " return \"Start, end and Node name required when fetching from prometheus\"\n",
450 " params = {'query' : \"collectd_intel_rdt_bytes{exported_instance='\" + node + \"'}\"}\n",
451 " intel_rdt_bytes = query_range(start, end, params, steps)\n",
452 " df = convert_to_df_range(intel_rdt_bytes)\n",
454 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
455 " groups = df.groupby(['intel_rdt'])\n",
456 " for key, item in groups:\n",
458 " fig = plt.figure(figsize=(24,6), facecolor='oldlace', edgecolor='red')\n",
459 " ax1 = fig.add_subplot(111)\n",
460 " ax1.title.set_text(\"Intel RDT Number: {}\".format(key))\n",
461 " ax1.plot(item['timestamp'], item['value'])\n",
467 "def plot_rdt_ipc(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
469 " if csv is not None:\n",
470 " df = pd.read_csv(csv)\n",
472 " if start is None or end is None or node is None:\n",
473 " return \"Start, end and Node name required when fetching from prometheus\"\n",
475 " params = {'query' : \"collectd_intel_rdt_ipc{exported_instance='\" + node + \"'}\"}\n",
476 " intel_rdt_ipc = query_range(start, end, params, steps)\n",
477 " df = convert_to_df_range(intel_rdt_ipc)\n",
479 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
480 " groups = df.groupby(['intel_rdt'])\n",
481 " for key, item in groups:\n",
483 " fig = plt.figure(figsize=(24,6), facecolor='oldlace', edgecolor='red')\n",
484 " ax1 = fig.add_subplot(111)\n",
485 " ax1.title.set_text(\"Intel RDT Number: {}, IPC value\".format(key))\n",
486 " ax1.plot(item['timestamp'], item['value'])\n",
491 "# memeory bandwidtdh\n",
492 "def get_rdt_memory_bandwidth(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
494 " if csv is not None:\n",
495 " df = pd.read_csv(csv)\n",
498 " if start is None or end is None or node is None:\n",
499 " return \"Start, end and Node name required when fetching from prometheus\"\n",
501 " params = {'query' : \"collectd_intel_rdt_memory_bandwidth_total{exported_instance='\" + node + \"'}\"}\n",
502 " intel_rdt_mem_bw = query_range(start, end, params, steps)\n",
503 " df = convert_to_df_range(intel_rdt_mem_bw)\n",
505 " df = df.drop(['__name__', 'instance', 'job'], axis = 1)\n",
511 "cell_type": "markdown",
519 "execution_count": null,
525 "def get_memory_usage(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):\n",
527 " if csv is not None:\n",
528 " df = pd.read_csv(csv)\n",
530 " if start is None or end is None or node is None:\n",
531 " return \"Start, end and Node name required when fetching from prometheus\"\n",
533 " params = {'query' : \"collectd_memory{exported_instance='\" + node + \"'} / (1024*1024*1024) \"} \n",
534 " target_memory_usage_range = query_range(start, end, params, steps)\n",
535 " df = convert_to_df_range(target_memory_usage_range)\n",
537 " df = df.drop(['instance', 'job'], axis = 1)\n",
538 " groups = df.groupby(['memory'])\n",
539 " for key, item in groups:\n",
541 " fig = plt.figure(figsize=(24,6), facecolor='oldlace', edgecolor='red')\n",
542 " ax1 = fig.add_subplot(111)\n",
543 " ax1.title.set_text(\"Memory Type: {}\".format(key))\n",
544 " ax1.plot(item['timestamp'], item['value'])\n",
550 "cell_type": "markdown",
558 "execution_count": null,
562 "get_memory_usage('2020-08-03 08:00:12', '2020-08-03 08:01:12', 'pod12-node4')"
567 "execution_count": null,
571 "def analyse(timestamp, node):\n",
572 " ts = datetime.strptime(timestamp.split(',')[0], \"%Y-%m-%d %H:%M:%S\")\n",
573 " start = ts - timedelta(seconds=10)\n",
574 " end = ts + timedelta(seconds=10)\n",
576 " start = str(start)\n",
580 " print(\"Starting Analysis from\",start,\"to\",end,'\\n\\n')\n",
582 " if \"node4\" in node:\n",
583 " node = 'pod12-node4'\n",
586 " print(\"=====CPU ANALYSIS=====\\n\")\n",
587 " unused = unused_cores(start, end, node, steps)\n",
588 " print(\"Unused Cores:\", unused)\n",
589 " fully_used = fully_used_cores(start, end, node, steps)\n",
590 " print(\"Fully Used Cores:\", fully_used)\n",
591 " print(\"Plotting used cores:\")\n",
592 " used_cores = plot_used_cores(start, end, node, steps)\n",
594 " #interface analysis\n",
595 " print(\"=====Interfaces Dropped / Errors=====\\n\")\n",
596 " dropped_interfaces = interface_dropped(start, end, node, steps)\n",
597 " err_interfaces = interface_errors(start, end, node, steps)\n",
600 " print(\"=====RDT Analysis=====\\n\")\n",
601 " plot_rdt_bytes(start, end, node, steps)\n",
602 " plot_rdt_ipc(start, end, node, steps)\n",
603 " mem_bandwidht = get_rdt_memory_bandwidth(start, end, node, steps)\n",
605 " #Memory Analysis:\n",
606 " print(\"=====Memory Analysis=====\\n\")\n",
607 " mem = get_memory_usage(start, end, node, steps)"
611 "cell_type": "markdown",
614 "## Usage / Examples\n",
619 "- For calling cpu unsued cores\n",
622 "# Fetching from prometheus\n",
623 "cores = unused_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
627 "- For finding fully used cores\n",
630 "# Fetching from prometheus\n",
631 "fully_used = fully_used_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
635 "- Similarly for plotting used cores\n",
639 "plot_used_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
642 "# use Analysis-Monitoring-Local Notebook for correct analysis \n",
643 "plot_used_cores(csv='metrics_data/cpu-0/cpu-user-2020-06-02')\n",
650 "- Interface Dropped \n",
653 "# Fetching from prom\n",
654 "dropped_interfaces = interface_dropped('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
658 "- Interface Errors\n",
661 "# Fetching from prom\n",
662 "interface_errors('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
671 "plot_rdt_bytes('2020-07-31 08:00:12', '2020-07-31 08:01:12','pod12-node4')\n",
674 "- Plot ipc values\n",
678 "plot_rdt_ipc('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
681 "- Memory bandwidth\n",
685 "get_rdt_memory_bandwidth('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')\n",
693 "get_memory_usage('2020-08-03 08:00:12', '2020-08-03 08:01:12', 'pod12-node4')\n",
696 "##### Analyse everything\n",
699 "# example alert_time: 2020-08-03 08:00:12\n",
700 "# example index: 'pod12-node4'\n",
701 "analyse(alert_time,index)\n",
706 "cell_type": "markdown",
709 "#### Checking Anomaly in logs"
714 "execution_count": null,
719 "foldername = \"results_2020-08-07_03-39-57\"\n",
720 "#Give index name - \"node1*\" or \"node4*\"\n",
726 "execution_count": null,
732 "ssl_context = create_ssl_context()\n",
733 "ssl_context.check_hostname = False\n",
734 "ssl_context.verify_mode = ssl.CERT_NONE\n",
735 "urllib3.disable_warnings()\n",
736 "client = Elasticsearch(['https://elasticsearch:password123@10.10.120.211:31111'],verify_certs=False,ssl_context=ssl_context)"
741 "execution_count": null,
745 "vsperf = \"vsperf-overall_\"+ foldername[8:] +\".log\"\n",
746 "s = Search(index=index).using(client).query(\"exists\", field=\"alert\").query(\"match_phrase\", log_path=vsperf)\n",
747 "for hits in s.scan():\n",
748 " alert_time = hits.alert_time\n",
755 "execution_count": null,
759 "analyse(alert_time,index)"
765 "display_name": "Python 3",
766 "language": "python",
774 "file_extension": ".py",
775 "mimetype": "text/x-python",
777 "nbconvert_exporter": "python",
778 "pygments_lexer": "ipython3",