Tools: Add K8s monitoring cluster 74/70874/3
authorAditya Srivastava <adityasrivastava301199@gmail.com>
Sun, 23 Aug 2020 20:23:02 +0000 (01:53 +0530)
committerAditya Srivastava <adityasrivastava301199@gmail.com>
Thu, 17 Sep 2020 11:27:18 +0000 (16:57 +0530)
This patch adds k8s monitoring cluster deployment using ansible for
both client and server side. Also adds scripts (ansible roles) to clean
(remove) the K8S cluster completely.

Signed-off-by: Aditya Srivastava <adityasrivastava301199@gmail.com>
Change-Id: I1115869c0a3e72a20047b31994f3d27e5fdae6c6

47 files changed:
tools/lma/ansible-client/ansible.cfg [new file with mode: 0644]
tools/lma/ansible-client/hosts [new file with mode: 0644]
tools/lma/ansible-client/playbooks/clean.yaml [new file with mode: 0644]
tools/lma/ansible-client/roles/clean-collectd/main.yml [new file with mode: 0644]
tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2 [new file with mode: 0644]
tools/lma/ansible-client/roles/collectd/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/ansible.cfg [new file with mode: 0644]
tools/lma/ansible-server/group_vars/all.yml [new file with mode: 0644]
tools/lma/ansible-server/hosts [new file with mode: 0644]
tools/lma/ansible-server/playbooks/clean.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml [new file with mode: 0644]
tools/lma/ansible-server/roles/monitoring/tasks/main.yml [new file with mode: 0644]
tools/lma/metrics/dashboard/cpu_usage_using.json [new file with mode: 0644]
tools/lma/metrics/dashboard/memory_using.json [new file with mode: 0644]
tools/lma/metrics/dashboard/ovs_stats_using.json [new file with mode: 0644]
tools/lma/metrics/dashboard/rdt_using.json [new file with mode: 0644]

diff --git a/tools/lma/ansible-client/ansible.cfg b/tools/lma/ansible-client/ansible.cfg
new file mode 100644 (file)
index 0000000..307ef45
--- /dev/null
@@ -0,0 +1,17 @@
+[defaults]
+inventory = ./hosts
+host_key_checking = false
+
+# additional path to search for roles in
+roles_path = roles
+
+# enable logging
+log_path = ./ansible.log
+
+[privilege_escalation]
+become=True
+become_method=sudo
+become_user=root
+
+[ssh_connection]
+pipelining = True
diff --git a/tools/lma/ansible-client/hosts b/tools/lma/ansible-client/hosts
new file mode 100644 (file)
index 0000000..eba586c
--- /dev/null
@@ -0,0 +1,2 @@
+[all]
+127.0.0.1 ansible_connection=local
diff --git a/tools/lma/ansible-client/playbooks/clean.yaml b/tools/lma/ansible-client/playbooks/clean.yaml
new file mode 100644 (file)
index 0000000..4f77b06
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#clean td-agent
+- name: clean td-agent
+  hosts: all
+  roles:
+    - clean-td-agent
+
+#clean collectd
+- name: clean collectd
+  hosts: all
+  roles:
+    - clean-collectd
diff --git a/tools/lma/ansible-client/roles/clean-collectd/tasks/main.yml b/tools/lma/ansible-client/roles/clean-collectd/tasks/main.yml
new file mode 100644 (file)
index 0000000..97100ca
--- /dev/null
@@ -0,0 +1,44 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# Task list of the clean-collectd role. playbooks/clean.yaml loads it via
+# "roles:" and supplies the hosts, so this file holds plain tasks, not a play.
+
+- name: Check and install dependencies
+  yum:
+    name: docker
+    state: present
+
+- name: Install python sdk
+  yum:
+    name: python-docker-py
+    state: present
+
+- name: Stopping collectd container
+  docker_container:
+    name: collectd
+    state: stopped
+
+- name: Removing collectd container
+  docker_container:
+    name: collectd
+    state: absent
+
+# Removes the image (not recommended)
+# - name: Remove image
+#   docker_image:
+#     state: absent
+#     name: opnfv/barometer-collectd
+#     tag: latest
diff --git a/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2 b/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2
new file mode 100644 (file)
index 0000000..ba953e3
--- /dev/null
@@ -0,0 +1,44 @@
+Hostname "{{ host_name }}"
+Interval     10
+LoadPlugin intel_rdt
+LoadPlugin processes
+LoadPlugin interface
+LoadPlugin network
+LoadPlugin ovs_stats
+LoadPlugin cpu
+LoadPlugin memory
+#LoadPlugin csv
+#LoadPlugin write_http
+#LoadPlugin dpdkstat
+##############################################################################
+# Plugin configuration                                                       #
+##############################################################################
+<Plugin processes>
+        ProcessMatch "ovs-vswitchd" "ovs-vswitchd"
+        ProcessMatch "ovsdb-server" "ovsdb-server"
+        ProcessMatch "collectd" "collectd"
+</Plugin>
+
+<Plugin cpu>
+  ReportByCpu true
+  ReportByState true
+  ValuesPercentage true
+  ReportNumCpu true
+  ReportGuestState false
+  SubtractGuestState false
+</Plugin>
+
+<Plugin network>
+  Server "10.10.120.211" "30826"
+</Plugin>
+
+<Plugin ovs_stats>
+  Port "6640"
+  Address "127.0.0.1"
+  Socket "/usr/local/var/run/openvswitch/db.sock"
+  Bridges "vsperf-br0"
+</Plugin>
+
+<Plugin "intel_rdt">
+  Cores "2" "4-5" "6-7" "8" "9" "22" "23" "24" "25" "26" "27"
+</Plugin>
diff --git a/tools/lma/ansible-client/roles/collectd/tasks/main.yml b/tools/lma/ansible-client/roles/collectd/tasks/main.yml
new file mode 100644 (file)
index 0000000..0befb22
--- /dev/null
@@ -0,0 +1,60 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+# Dependency check
+- name: Check and install dependencies
+  yum:
+    name: ['docker', 'python-docker-py']
+    state: present
+
+- name: Install pip
+  yum:
+    name: python-pip
+    state: present
+
+- name: install docker-py
+  pip: name=docker-py
+
+- name: Cloning barometer
+  git:
+    repo: https://gerrit.opnfv.org/gerrit/barometer
+    dest: /tmp/barometer
+
+- name: Create Folder
+  file:
+    path: /tmp/barometer/docker/src/collectd_sample_configs
+    state: directory
+
+# Build collectd image from the barometer checkout cloned to /tmp/barometer
+- name: Download and Build Image
+  command: docker build -t opnfv/barometer-collectd -f barometer/docker/barometer-collectd/Dockerfile barometer/docker/barometer-collectd
+  args:
+    chdir: /tmp/
+  become: true
+
+# Configuring collectd
+- name: Ensure collectd is configured
+  template:
+    src: ../files/collectd.conf.j2
+    dest: /tmp/barometer/docker/src/collectd_sample_configs/collectd.conf
+
+# Running Collectd container #####################
+- name: Running collectd
+  command : chdir=/tmp/ {{ item }}
+  become: true
+  with_items:
+  - docker run -tid --name collectd --net=host -v /tmp/barometer/docker/src/collectd_sample_configs:/opt/collectd/etc/collectd.conf.d -v /var/run:/var/run -v /tmp:/tmp --privileged opnfv/barometer-collectd /run_collectd.sh
+  - docker ps
diff --git a/tools/lma/ansible-server/ansible.cfg b/tools/lma/ansible-server/ansible.cfg
new file mode 100644 (file)
index 0000000..307ef45
--- /dev/null
@@ -0,0 +1,17 @@
+[defaults]
+inventory = ./hosts
+host_key_checking = false
+
+# additional path to search for roles in
+roles_path = roles
+
+# enable logging
+log_path = ./ansible.log
+
+[privilege_escalation]
+become=True
+become_method=sudo
+become_user=root
+
+[ssh_connection]
+pipelining = True
diff --git a/tools/lma/ansible-server/group_vars/all.yml b/tools/lma/ansible-server/group_vars/all.yml
new file mode 100644 (file)
index 0000000..b0725ff
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#apiserver advertise address
+ad_addr: 10.10.120.211
+
+#pod network cidr
+pod_cidr: 192.168.0.0/16
+
+#token generated by master
+token_file: join_token
+
+#give hostname
+vm3: 'vm3'
+vm2: 'vm2'
+vm1: 'vm1'
diff --git a/tools/lma/ansible-server/hosts b/tools/lma/ansible-server/hosts
new file mode 100644 (file)
index 0000000..0a13d75
--- /dev/null
@@ -0,0 +1,12 @@
+[master]
+10.10.120.211
+
+[worker-nodes]
+10.10.120.203
+10.10.120.204
+# NOTE(review): credentials below are stored in plain text; consider ansible-vault or --ask-pass.
+[all:vars]
+ansible_connection=ssh
+ansible_ssh_user=root
+ansible_sudo_pass=P@ssw0rd
+ansible_ssh_pass=P@ssw0rd
\ No newline at end of file
diff --git a/tools/lma/ansible-server/playbooks/clean.yaml b/tools/lma/ansible-server/playbooks/clean.yaml
new file mode 100644 (file)
index 0000000..b4da66d
--- /dev/null
@@ -0,0 +1,52 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# clean monitoring
+- name: Clean PAG setup
+  hosts: master
+  roles:
+    - clean-monitoring
+
+#clean logging
+- name: Clean EFK setup
+  hosts: master
+  roles:
+    - clean-logging
+
+#IF KUBELET IS RUNNING THEN RUN THIS
+#clean k8s cluster
+- name: Clean k8s cluster
+  hosts: master
+  roles:
+    - clean-k8s-cluster
+
+#reset worker-nodes
+- name: Reset worker-nodes
+  hosts: worker-nodes
+  roles:
+    - clean-k8s-worker-reset
+
+#uninstall pre-requisites for k8s (runs the clean-k8s-pre role on every host)
+- name: uninstall pre-requisites for k8s
+  hosts: all
+  roles:
+    - clean-k8s-pre
+
+#*************************************************************************************************************
+#THIS WILL DELETE DATA OF ELASTICSEARCH
+#*************************************************************************************************************
+# - name: Clean nfs server
+#   hosts: all
+#   roles:
+#     - clean-nfs
diff --git a/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
new file mode 100644 (file)
index 0000000..83ac086
--- /dev/null
@@ -0,0 +1,34 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+#probe kubelet: rc 0 = running, rc 3 = installed but not running, "could not be found" = not installed; none of these is an error
+- name: check for kubelet
+  shell: "systemctl status kubelet"
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+#IF KUBELET IS RUNNING, THEN
+#reset k8s: drain and delete every node, reset kubeadm, drop the kubeconfig (rm -f: the file may already be gone)
+- name: reset k8s
+  shell: |
+    kubectl drain {{vm3}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm2}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm1}} --delete-local-data --force --ignore-daemonsets
+    kubectl delete node {{vm3}}
+    kubectl delete node {{vm2}}
+    kubectl delete node {{vm1}}
+    sudo kubeadm reset -f
+    sudo rm -f $HOME/.kube/config
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
new file mode 100644 (file)
index 0000000..6d12bd5
--- /dev/null
@@ -0,0 +1,65 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+#Uninstalling K8s
+- name: Uninstalling K8s
+  yum:
+   name: ['kubeadm', 'kubectl', 'kubelet', 'docker-ce']
+   state: absent
+
+#Uncommenting Swap entries (the capture group keeps the leading "/" of the device path)
+- name: Uncommenting Swap entries in /etc/fstab
+  replace:
+   path: /etc/fstab
+   regexp: '^# (/.*swap.*)'
+   replace: '\1'
+
+#Enabling Swap (must run after /etc/fstab is restored: swapon -a reads its entries from there)
+- name: Enabling Swap on all nodes
+  shell: swapon -a
+  ignore_errors: yes
+
+
+#Starting firewalld
+- name: 'Starting firewall'
+  service:
+   name: firewalld
+   state: started
+   enabled: yes
+
+# Enabling SELinux
+- name: Enabling SELinux on all nodes
+  shell: |
+    setenforce 1
+    sudo sed -i 's/^SELINUX=permissive$/SELINUX=enforcing/' /etc/selinux/config
+
+#removing Docker repo
+- name: removing Docker repo
+  command: yum-config-manager --disable docker-ce-stable
+
+#removing K8s repo
+- name: removing repository details in Kubernetes repo file.
+  blockinfile:
+   path: /etc/yum.repos.d/kubernetes.repo
+   state: absent
+   block: |
+    [kubernetes]
+    name=Kubernetes
+    baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+    enabled=1
+    gpgcheck=1
+    repo_gpgcheck=1
+    gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
+      https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
diff --git a/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
new file mode 100644 (file)
index 0000000..3ba9c9e
--- /dev/null
@@ -0,0 +1,26 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+#probe kubelet: rc 0 = running, rc 3 = installed but not running, "could not be found" = not installed; none of these is an error
+- name: check for kubelet
+  shell: "systemctl status kubelet"
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+#IF KUBELET IS RUNNING, THEN
+#reset k8s
+- name: reset k8s
+  command: kubeadm reset -f
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
new file mode 100644 (file)
index 0000000..49943ec
--- /dev/null
@@ -0,0 +1,48 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+#Deleting PAG setup from k8s cluster
+
+#probe kubelet: rc 0 = running, rc 3 = installed but not running, "could not be found" = not installed; none of these is an error
+- name: check for kubelet
+  shell: "systemctl status kubelet"
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+#***********************************************************************************************************
+#copy namespace yaml to /tmp/monitoring-namespace.yaml
+#***********************************************************************************************************
+- name: copy namespace yaml to /tmp/files/
+  copy:
+    src: ../../monitoring/files/monitoring-namespace.yaml
+    dest: /tmp/monitoring-namespace.yaml
+
+#***********************************************************************************************************
+#Deleting Namespace
+#***********************************************************************************************************
+- name: Deleting Namespace
+  k8s:
+    state: absent
+    src: /tmp/monitoring-namespace.yaml
+    namespace: monitoring
+  when: "_svc_kubelet.rc == 0"
+
+#***********************************************************************************************************
+#removing /tmp/monitoring-namespace.yaml
+#***********************************************************************************************************
+- name: Removing /tmp/monitoring-namespace.yaml
+  file:
+    path: "/tmp/monitoring-namespace.yaml"
+    state: absent
diff --git a/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
new file mode 100644 (file)
index 0000000..157db84
--- /dev/null
@@ -0,0 +1,44 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+#Edit /etc/exports
+- name: Edit /etc/export file for NFS
+  lineinfile:
+    path: /etc/exports
+    line: "{{item.line}}"
+    state: absent
+  with_items:
+    - {line: "/srv/nfs/master   *(rw,sync,no_root_squash,no_subtree_check)"}
+    - {line: "/srv/nfs/data     *(rw,sync,no_root_squash,no_subtree_check)"}
+    - {line: "/usr/share/monitoring_data/grafana     *(rw,sync,no_root_squash,no_subtree_check)"}
+
+#uninstall NFS server
+- name: Uninstalling NFS server utils
+  yum:
+    name: nfs-utils
+    state: absent
+
+#remove Elasticsearch data
+- name: Removing Directory for elasticsearch
+  file:
+    path: "/srv/nfs/{{item}}"
+    state: absent
+  with_items:
+    - ['data', 'master']
+
+#remove Grafana data
+- name: Removing Directory for grafana
+  file:
+    path: "/usr/share/monitoring_data/grafana"
+    state: absent
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
new file mode 100644 (file)
index 0000000..7b9abc4
--- /dev/null
@@ -0,0 +1,37 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: alertmanager-config
+  namespace: monitoring
+data:
+  config.yml: |-
+    global:
+    route:
+      receiver: "webhook"
+      group_by: ['alertname', 'priority']
+      group_wait: 1s
+      group_interval: 5s
+      repeat_interval: 5s
+      routes:
+      - match:
+          severity: critical
+
+    receivers:
+    - name: "webhook"
+      webhook_configs:
+      - url: 'http://10.10.120.20/alertmanager'
+        send_resolved: true
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
new file mode 100644 (file)
index 0000000..f1c3d78
--- /dev/null
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+  name: alertmanager
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+      adi10hero.monitoring: alertmanager
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      name: alertmanager
+      labels:
+        app: alertmanager
+        adi10hero.monitoring: alertmanager
+    spec:
+      containers:
+      - name: alertmanager
+        image: prom/alertmanager
+        args:
+        - --config.file=/etc/alertmanager/config.yml
+        - --storage.path=/alertmanager
+        - --cluster.peer=alertmanager1:6783
+        - --cluster.listen-address=0.0.0.0:6783
+        ports:
+        - containerPort: 9093
+        - containerPort: 6783
+        securityContext:
+          runAsUser: 0
+        volumeMounts:
+        - name: config-volume
+          mountPath: /etc/alertmanager
+        - name: alertmanager
+          mountPath: /alertmanager
+      restartPolicy: Always
+      volumes:
+      - name: config-volume
+        configMap:
+          name: alertmanager-config
+      - name: alertmanager
+        emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
new file mode 100644 (file)
index 0000000..c67517d
--- /dev/null
@@ -0,0 +1,41 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager
+    app: alertmanager
+  name: alertmanager
+  namespace: monitoring
+  # Scrape hints: Alertmanager exposes its metrics on the web port (9093) under /metrics
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics
+    prometheus.io/port: '9093'
+spec:
+  selector:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+  type: NodePort
+  ports:
+  - name: "9093"
+    port: 9093
+    targetPort: 9093
+    nodePort: 30930
+  - name: "6783"
+    port: 6783
+    targetPort: 6783
+    nodePort: 30679
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
new file mode 100644 (file)
index 0000000..18b7645
--- /dev/null
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+  name: alertmanager1
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager1
+      adi10hero.monitoring: alertmanager1
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      name: alertmanager1
+      labels:
+        app: alertmanager1
+        adi10hero.monitoring: alertmanager1
+    spec:
+      containers:
+      - name: alertmanager1
+        image: prom/alertmanager
+        args:
+        - --config.file=/etc/alertmanager/config.yml
+        - --storage.path=/alertmanager
+        - --cluster.peer=alertmanager:6783
+        - --cluster.listen-address=0.0.0.0:6783
+        ports:
+        - containerPort: 9093
+        - containerPort: 6783
+        securityContext:
+          runAsUser: 0
+        volumeMounts:
+        - name: config-volume
+          mountPath: /etc/alertmanager
+        - name: alertmanager
+          mountPath: /alertmanager
+      restartPolicy: Always
+      volumes:
+      - name: config-volume
+        configMap:
+          name: alertmanager-config
+      - name: alertmanager
+        emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
new file mode 100644 (file)
index 0000000..66d0d2b
--- /dev/null
@@ -0,0 +1,42 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # exposes the alertmanager1 pods on fixed node ports
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager1
+    app: alertmanager1
+  name: alertmanager1
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics  # Alertmanager publishes its metrics here, not at "/"
+    prometheus.io/port: '9093'  # must be a port the pod listens on; 8080 is not exposed anywhere
+
+spec:
+  selector:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+  type: NodePort
+  ports:
+  - name: "9093"  # Alertmanager HTTP port
+    port: 9093
+    targetPort: 9093
+    nodePort: 30931
+  - name: "6783"  # cluster peering port (see --cluster.listen-address in the deployment)
+    port: 6783
+    targetPort: 6783
+    nodePort: 30678
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
new file mode 100644 (file)
index 0000000..6a62985
--- /dev/null
@@ -0,0 +1,79 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet  # one cAdvisor pod per node, reading host state through hostPath mounts
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    matchLabels:
+      app: cadvisor
+      adi10hero.monitoring: cadvisor
+  template:
+    metadata:
+      name: cadvisor
+      labels:
+        adi10hero.monitoring: cadvisor
+        app: cadvisor
+    spec:
+      containers:
+      - image: gcr.io/google-containers/cadvisor
+        name: cadvisor
+        ports:
+        - containerPort: 8080
+        securityContext:
+          runAsUser: 0
+        volumeMounts:  # every host directory except /var/run is mounted read-only
+        - mountPath: /rootfs
+          name: cadvisor-hostpath0
+          readOnly: true
+        - mountPath: /var/run
+          name: cadvisor-hostpath1
+        - mountPath: /sys
+          name: cadvisor-hostpath2
+          readOnly: true
+        - mountPath: /sys/fs/cgroup
+          name: cadvisor-hostpath3
+          readOnly: true
+        - mountPath: /dev/disk
+          name: cadvisor-hostpath4
+          readOnly: true
+        - mountPath: /var/lib/docker
+          name: cadvisor-hostpath5
+          readOnly: true
+      restartPolicy: Always
+      volumes:
+      - hostPath:
+          path: /
+        name: cadvisor-hostpath0
+      - hostPath:
+          path: /var/run
+        name: cadvisor-hostpath1
+      - hostPath:
+          path: /sys
+        name: cadvisor-hostpath2
+      - hostPath:
+          path: /sys/fs/cgroup  # was /cgroup, which is absent on Kubernetes nodes and hid the real cgroup tree
+        name: cadvisor-hostpath3
+      - hostPath:
+          path: /dev/disk/
+        name: cadvisor-hostpath4
+      - hostPath:
+          path: /var/lib/docker/
+        name: cadvisor-hostpath5
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
new file mode 100644 (file)
index 0000000..734240b
--- /dev/null
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # no type is set, so this is a cluster-internal (ClusterIP) service
+apiVersion: v1
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+  ports:
+  - name: '8080'
+    targetPort: 8080
+    port: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
new file mode 100644 (file)
index 0000000..b6bfe0b
--- /dev/null
@@ -0,0 +1,51 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # single collectd-exporter pod
+apiVersion: apps/v1
+metadata:
+  labels:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+  name: collectd-exporter
+  namespace: monitoring
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      adi10hero.monitoring: collectd-exporter
+      app: collectd-exporter
+  template:
+    metadata:
+      labels:
+        adi10hero.monitoring: collectd-exporter
+        app: collectd-exporter
+      name: collectd-exporter
+    spec:
+      restartPolicy: Always
+      volumes: null  # explicitly empty: the container declares no volumeMounts
+      containers:
+      - name: collectd-exporter
+        image: prom/collectd-exporter
+        securityContext:
+          runAsUser: 0
+        args:
+        - --collectd.listen-address=0.0.0.0:25826
+        ports:
+        - containerPort: 9103
+        - protocol: UDP  # same port number as the collectd listen address passed in args
+          containerPort: 25826
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
new file mode 100644 (file)
index 0000000..5609d04
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # publishes collectd-exporter on fixed node ports
+apiVersion: v1
+metadata:
+  labels:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+  name: collectd-exporter
+  namespace: monitoring
+spec:
+  type: NodePort
+  selector:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+  ports:  # targetPort is omitted, so each entry forwards to the same-numbered pod port
+  - name: '9103'
+    nodePort: 30103
+    port: 9103
+  - name: '25826'
+    nodePort: 30826
+    protocol: UDP
+    port: 25826
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
new file mode 100644 (file)
index 0000000..e2b8c9f
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap  # datasource provisioning file consumed by the grafana deployment
+metadata:
+  name: grafana-datasources
+  namespace: monitoring
+data:  # the prometheus.yaml value is literal file content; do not add comments inside it
+  prometheus.yaml: |-
+    {
+        "apiVersion": 1,
+        "datasources": [
+            {
+               "access":"proxy",
+                "editable": true,
+                "name": "prometheus",
+                "orgId": 1,
+                "type": "prometheus",
+                "url": "http://prometheus-main:9090",
+                "version": 1
+            }
+        ]
+    }
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
new file mode 100644 (file)
index 0000000..afb0094
--- /dev/null
@@ -0,0 +1,68 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # single Grafana pod whose data directory lives on the grafana-pvc claim
+apiVersion: apps/v1
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app: grafana
+    adi10hero.monitoring: grafana
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: grafana
+      adi10hero.monitoring: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+        adi10hero.monitoring: grafana
+      name: grafana
+    spec:
+      restartPolicy: Always
+      volumes:
+      - persistentVolumeClaim:
+          claimName: grafana-pvc
+        name: grafana-storage
+      - configMap:
+          name: grafana-datasources
+          defaultMode: 420  # decimal form of octal 0644
+        name: grafana-datasources
+      containers:
+      - name: grafana
+        image: grafana/grafana
+        securityContext:
+          runAsUser: 0
+        ports:
+        - containerPort: 3000
+        volumeMounts:
+        - name: grafana-storage
+          mountPath: /var/lib/grafana
+        - name: grafana-datasources
+          readOnly: false
+          mountPath: /etc/grafana/provisioning/datasources
+        env:  # NOTE(review): admin credentials and server address are hard-coded in plain text
+        - name: GF_SECURITY_ADMIN_PASSWORD
+          value: admin
+        - name: GF_SECURITY_ADMIN_USER
+          value: admin
+        - name: GF_SERVER_DOMAIN
+          value: 10.10.120.20
+        - name: GF_SERVER_ROOT_URL
+          value: "%(protocol)s://%(domain)s:/metrics"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
new file mode 100644 (file)
index 0000000..06bcc31
--- /dev/null
@@ -0,0 +1,31 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: PersistentVolume  # NFS-backed volume carrying the labels that grafana-pvc selects on
+apiVersion: v1
+metadata:
+  labels:
+    adi10hero.monitoring: grafana-pv
+    app: grafana-pv
+  name: grafana-pv
+  namespace: monitoring  # NOTE(review): PersistentVolume is a cluster-scoped kind; confirm this field is wanted
+spec:
+  accessModes:
+  - ReadWriteMany
+  capacity:
+    storage: 5Gi
+  storageClassName: monitoring
+  nfs:
+    path: /usr/share/monitoring_data/grafana
+    server: 10.10.120.211
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
new file mode 100644 (file)
index 0000000..2c2955c
--- /dev/null
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: PersistentVolumeClaim  # storage claim referenced by the grafana deployment
+apiVersion: v1
+metadata:
+  labels:
+    adi10hero.monitoring: grafana-pvc
+    app: grafana-pvc
+  name: grafana-pvc
+  namespace: monitoring
+spec:
+  storageClassName: monitoring
+  selector:  # bind only to a volume carrying the grafana-pv labels
+    matchLabels:
+      adi10hero.monitoring: grafana-pv
+      app: grafana-pv
+  resources:
+    requests:
+      storage: 4Gi
+  accessModes:
+  - ReadWriteMany
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
new file mode 100644 (file)
index 0000000..d1c9c9c
--- /dev/null
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # publishes Grafana (container port 3000) on node port 30000
+apiVersion: v1
+metadata:
+  annotations:
+    prometheus.io/port: "3000"
+    prometheus.io/scrape: "true"
+  labels:
+    adi10hero.monitoring: grafana
+    app: grafana
+  name: grafana
+  namespace: monitoring
+spec:
+  type: NodePort
+  ports:
+  - name: '3000'
+    nodePort: 30000
+    targetPort: 3000
+    port: 3000
+  selector:
+    adi10hero.monitoring: grafana
+    app: grafana
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
new file mode 100644 (file)
index 0000000..af3c546
--- /dev/null
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # one kube-state-metrics pod, placed in kube-system rather than monitoring
+apiVersion: apps/v1
+metadata:
+  namespace: kube-system
+  name: kube-state-metrics
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: kube-state-metrics
+  template:
+    metadata:
+      labels:
+        app: kube-state-metrics
+    spec:
+      #serviceAccountName: prometheus  (disabled, so no explicit service account is requested)
+      containers:
+      - image: quay.io/coreos/kube-state-metrics:v1.2.0
+        name: kube-state-metrics
+        ports:
+        - name: monitoring
+          containerPort: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
new file mode 100644 (file)
index 0000000..8d29439
--- /dev/null
@@ -0,0 +1,26 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # cluster-internal endpoint for the kube-state-metrics pod
+metadata:
+  namespace: kube-system
+  name: kube-state-metrics
+spec:
+  ports:
+  - targetPort: 8080
+    port: 8080
+    protocol: TCP
+  selector:
+    app: kube-state-metrics
diff --git a/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
new file mode 100644 (file)
index 0000000..f1c9b88
--- /dev/null
@@ -0,0 +1,18 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Namespace  # namespace referenced by the monitoring manifests of this role
+apiVersion: v1
+metadata:
+  name: monitoring
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
new file mode 100644 (file)
index 0000000..9334b2f
--- /dev/null
@@ -0,0 +1,80 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet  # one privileged node-exporter pod per node, sharing the host's namespaces
+metadata:
+  name: node-exporter-daemonset
+  namespace: monitoring
+  labels:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+spec:
+  selector:
+    matchLabels:
+      app: node-exporter
+      adi10hero.monitoring: node-exporter
+  template:
+    metadata:
+      labels:
+        app: node-exporter
+        adi10hero.monitoring: node-exporter
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9100"
+    spec:
+      hostPID: true
+      hostIPC: true
+      hostNetwork: true  # the exporter listens directly on the node's own address
+      containers:
+        - ports:
+            - containerPort: 9100
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 0.15
+          securityContext:
+            runAsUser: 0
+            privileged: true
+          image: prom/node-exporter:v0.15.2
+          args:  # passed to the binary verbatim (no shell), so values must not carry shell quoting
+            - --path.procfs
+            - /host/proc
+            - --path.sysfs
+            - /host/sys
+            - --collector.filesystem.ignored-mount-points
+            - '^/(sys|proc|dev|host|etc)($|/)'  # no inner double quotes: they made the regex unmatchable
+          name: node-exporter
+          volumeMounts:
+            - name: dev
+              mountPath: /host/dev
+            - name: proc
+              mountPath: /host/proc
+            - name: sys
+              mountPath: /host/sys
+            - name: rootfs
+              mountPath: /rootfs
+      volumes:  # host directories exposed to the exporter under /host/* and /rootfs
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: dev
+          hostPath:
+            path: /dev
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: rootfs
+          hostPath:
+            path: /
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
new file mode 100644 (file)
index 0000000..dd0aea4
--- /dev/null
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # no type is set, so this is a cluster-internal (ClusterIP) service
+apiVersion: v1
+metadata:
+  name: node-exporter
+  namespace: monitoring
+  annotations:
+    prometheus.io/port: '9100'
+    prometheus.io/scrape: 'true'
+  labels:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+spec:
+  selector:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+  ports:
+  - name: node-exporter
+    targetPort: 9100
+    port: 9100
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
new file mode 100644 (file)
index 0000000..58b220a
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # publishes port 9090 of the selected pods on node port 30902
+apiVersion: v1
+metadata:
+  name: prometheus-main
+  namespace: monitoring
+  annotations:
+    prometheus.io/port: "9090"
+    prometheus.io/scrape: "true"
+  labels:
+    app: prometheus-main
+    adi10hero.monitoring: prometheus-main
+spec:
+  type: NodePort
+  selector:  # NOTE(review): selects prometheus/prometheus1 labels, not this Service's own prometheus-main labels; confirm intended
+    app: prometheus
+    adi10hero.monitoring: prometheus1
+  ports:
+  - name: prometheus-main
+    nodePort: 30902
+    port: 9090
+    protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
new file mode 100644 (file)
index 0000000..917f978
--- /dev/null
@@ -0,0 +1,609 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ConfigMap "prometheus-config" (namespace "monitoring").
+# It carries two files as literal block scalars:
+#   alert.rules    - Prometheus alerting rule groups
+#   prometheus.yml - Prometheus server configuration
+# prometheus.yml loads the rules from /etc/prometheus/alert.rules.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  # Alerting rules. The {{ ... }} placeholders are left as-is for Prometheus'
+  # alert templating. The first group ("targets") holds two generic checks:
+  # a scrape target reporting up == 0, and node_load1 above 1.9.
+  alert.rules: |-
+    groups:
+    - name: targets
+      rules:
+      - alert: MonitorServiceDown
+        expr: up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Monitor service non-operational"
+          description: "Service {{ $labels.instance }} is down."
+      - alert: HighCpuLoad
+        expr: node_load1 > 1.9
+        for: 15s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service under high load"
+          description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+    # Host-level alerts evaluated on node_* series (CPU, memory, swap, disk,
+    # network, temperature, kernel, EDAC). Every alert name in this group is
+    # defined exactly once.
+    - name: host and hardware
+      rules:
+      - alert: HostHighCpuLoad
+        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host high CPU load (instance {{ $labels.instance }})"
+          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostSwapIsFillingUp
+        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host swap is filling up (instance {{ $labels.instance }})"
+          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HighMemoryLoad
+        expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          summary: "Server memory is almost full"
+          description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+      - alert: HighStorageLoad
+        expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"}  * 100 > 85
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          summary: "Server storage is almost full"
+          description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+      - alert: HostOutOfMemory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of memory (instance {{ $labels.instance }})"
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostMemoryUnderMemoryPressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostUnusualNetworkThroughputIn
+        expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
+          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostUnusualNetworkThroughputOut
+        expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
+          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostUnusualDiskRateRead
+        expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostUnusualDiskRateWrite
+        expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
+          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostOutOfDiskSpace
+        expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"}  * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of disk space (instance {{ $labels.instance }})"
+          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostDiskWillFillIn4Hours
+        expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
+          description: "Disk will fill in 4 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostPhysicalComponentTooHot
+        expr: node_hwmon_temp_celsius > 75
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host physical component too hot (instance {{ $labels.instance }})"
+          description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostNodeOvertemperatureAlarm
+        expr: node_hwmon_temp_alarm == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
+          description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostKernelVersionDeviations
+        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host kernel version deviations (instance {{ $labels.instance }})"
+          description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostOomKillDetected
+        expr: increase(node_vmstat_oom_kill[5m]) > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host OOM kill detected (instance {{ $labels.instance }})"
+          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostEdacCorrectableErrorsDetected
+        expr: increase(node_edac_correctable_errors_total[5m]) > 0
+        for: 5m
+        labels:
+          severity: info
+        annotations:
+          summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
+          description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostEdacUncorrectableErrorsDetected
+        expr: node_edac_uncorrectable_errors_total > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
+          description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: HostNetworkReceiveErrors
+        expr: increase(node_network_receive_errs_total[5m]) > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
+          description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last five minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      # Transmit-side counterpart of HostNetworkReceiveErrors.
+      - alert: HostNetworkTransmitErrors
+        expr: increase(node_network_transmit_errs_total[5m]) > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+          description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+    # Per-container alerts evaluated on container_* series.
+    - name: container
+      rules:
+      - alert: ContainerKilled
+        expr: time() - container_last_seen > 60
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container killed (instance {{ $labels.instance }})"
+          description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: ContainerCpuUsage
+        expr: sum by(instance, name)  (rate(container_cpu_usage_seconds_total[3m]) * 100 > 80)
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container CPU usage (instance {{ $labels.instance }})"
+          description: "Container CPU usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      # Usage as a percentage of the configured memory limit; containers
+      # without a limit (limit == 0) are filtered out of the denominator.
+      # The threshold is 80 so that it agrees with the description below.
+      - alert: ContainerMemoryUsage
+        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Memory usage (instance {{ $labels.instance }})"
+          description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      # Used-inode percentage: (1 - free/total) * 100. The subtraction must be
+      # parenthesised before scaling; `*` binds tighter than `-` in PromQL, so
+      # `1 - (free/total) * 100` would never exceed 1.
+      - alert: ContainerVolumeUsage
+        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Volume usage (instance {{ $labels.instance }})"
+          description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: ContainerVolumeIoUsage
+        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
+          description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: ContainerHighThrottleRate
+        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container high throttle rate (instance {{ $labels.instance }})"
+          description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+    # Cluster-state alerts. Most rules read kube_* series; a few read
+    # kubelet_*, apiserver_* and rest_client_* series.
+    # NOTE(review): the scrape_configs in prometheus.yml below define no job
+    # named "apiserver", so rules filtering on job="apiserver" may never
+    # match - confirm where those series are expected to come from.
+    - name: kubernetes
+      rules:
+      - alert: KubernetesNodeReady
+        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Node ready (instance {{ $labels.instance }})"
+          description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes memory pressure (instance {{ $labels.instance }})"
+          description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes disk pressure (instance {{ $labels.instance }})"
+          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes out of disk (instance {{ $labels.instance }})"
+          description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes Job failed (instance {{ $labels.instance }})"
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesCronjobSuspended
+        expr: kube_cronjob_spec_suspend != 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes CronJob suspended (instance {{ $labels.instance }})"
+          description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesPersistentvolumeclaimPending
+        expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})"
+          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesVolumeOutOfDiskSpace
+        expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes Volume out of disk space (instance {{ $labels.instance }})"
+          description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesVolumeFullInFourDays
+        expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Volume full in four days (instance {{ $labels.instance }})"
+          description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesPersistentvolumeError
+        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
+          description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesStatefulsetDown
+        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
+          description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      # `condition` carries the condition name and `status` its truth value,
+      # the same layout as kube_node_status_condition above.
+      - alert: KubernetesHpaScalingAbility
+        expr: kube_hpa_status_condition{status="false", condition="AbleToScale"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA scaling ability (instance {{ $labels.instance }})"
+          description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesHpaMetricAvailability
+        expr: kube_hpa_status_condition{status="false", condition="ScalingActive"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA metric availability (instance {{ $labels.instance }})"
+          description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesHpaScaleCapability
+        expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA scale capability (instance {{ $labels.instance }})"
+          description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesPodNotHealthy
+        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
+          description: "Pod has been in a non-ready state for longer than an hour.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes pod crash looping (instance {{ $labels.instance }})"
+          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesReplicassetMismatch
+        expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})"
+          description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesDeploymentReplicasMismatch
+        expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})"
+          description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesStatefulsetReplicasMismatch
+        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})"
+          description: "A StatefulSet has not matched the expected number of replicas for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesDeploymentGenerationMismatch
+        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})"
+          description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesStatefulsetGenerationMismatch
+        expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})"
+          description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesStatefulsetUpdateNotRolledOut
+        expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})"
+          description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesDaemonsetRolloutStuck
+        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})"
+          description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesDaemonsetMisscheduled
+        expr: kube_daemonset_status_number_misscheduled > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})"
+          description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesCronjobTooLong
+        expr: time() - kube_cronjob_next_schedule_time > 3600
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})"
+          description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesJobCompletion
+        expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes job completion (instance {{ $labels.instance }})"
+          description: "Kubernetes Job failed to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesApiServerErrors
+        expr: sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[2m])) / sum(rate(apiserver_request_count{job="apiserver"}[2m])) * 100 > 3
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes API server errors (instance {{ $labels.instance }})"
+          description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesApiClientErrors
+        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[2m])) by (instance, job) / sum(rate(rest_client_requests_total[2m])) by (instance, job)) * 100 > 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes API client errors (instance {{ $labels.instance }})"
+          description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes client certificate expires next week (instance {{ $labels.instance }})"
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesClientCertificateExpiresSoon
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes client certificate expires soon (instance {{ $labels.instance }})"
+          description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+      - alert: KubernetesApiServerLatency
+        expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes API server latency (instance {{ $labels.instance }})"
+          description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
+
+
+  # Prometheus server configuration:
+  #   - scrape and rule evaluation default to 15s; each job overrides the
+  #     scrape interval (5s or 10s);
+  #   - alerting rules are read from /etc/prometheus/alert.rules, i.e. the
+  #     `alert.rules` key of this ConfigMap once it is mounted at
+  #     /etc/prometheus/ (as the prometheus-deployment manifest does);
+  #   - every scrape job uses a single static target (a Service DNS name, or
+  #     localhost for Prometheus itself);
+  #   - alerts are sent over HTTP to alertmanager:9093 and alertmanager1:9093.
+  # NOTE(review): scraping a Service name gives one target per job; whether
+  #   that reaches every node-exporter/cadvisor pod depends on how those
+  #   Services are defined - confirm.
+  prometheus.yml: |-
+    global:
+      scrape_interval:     15s
+      evaluation_interval: 15s
+
+    rule_files:
+      - "/etc/prometheus/alert.rules"
+
+    scrape_configs:
+      - job_name: 'collectd-exporter'
+        scrape_interval: 5s
+        static_configs:
+          - targets: ['collectd-exporter:9103']
+
+      - job_name: 'cadvisor'
+        scrape_interval: 5s
+        static_configs:
+          - targets: ['cadvisor:8080']
+
+      - job_name: 'node-exporter'
+        scrape_interval: 5s
+        static_configs:
+          - targets: ['node-exporter:9100']
+
+      - job_name: 'prometheus'
+        scrape_interval: 10s
+        static_configs:
+          - targets: ['localhost:9090']
+
+      - job_name: 'kube-state-metrics'
+        scrape_interval: 10s
+        static_configs:
+          - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']
+
+    alerting:
+      alertmanagers:
+      - scheme: http
+        static_configs:
+        - targets: ['alertmanager:9093', 'alertmanager1:9093']
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
new file mode 100644 (file)
index 0000000..5b98b15
--- /dev/null
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Single-replica Prometheus server in the "monitoring" namespace.
+# Configuration comes from the prometheus-config ConfigMap and the TSDB is
+# stored on the prometheus-pvc claim.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-deployment
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: prometheus
+    app: prometheus
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus
+      adi10hero.monitoring: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+        adi10hero.monitoring: prometheus
+    spec:
+      restartPolicy: Always
+      # Hard requirement: schedule only on the node whose hostname is "vm2".
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - vm2
+      volumes:
+        - name: prometheus-config-volume
+          configMap:
+            name: prometheus-config
+            # 420 decimal == 0644 octal.
+            defaultMode: 420
+        - name: prometheus-storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-pvc
+      containers:
+        - name: prometheus
+          # NOTE(review): no image tag is given, so the version that runs
+          # depends on what the registry serves at pull time - consider pinning.
+          image: prom/prometheus
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus"
+            - "--storage.tsdb.retention.size=3GB"
+            - "--storage.tsdb.retention.time=30d"
+            - "--web.console.libraries=/etc/prometheus/console_libraries"
+            - "--web.console.templates=/etc/prometheus/consoles"
+          ports:
+            - containerPort: 9090
+          # Runs as UID 0 (root).
+          # NOTE(review): presumably needed to write to the backing volume - confirm.
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            # The ConfigMap is mounted over the whole /etc/prometheus/ directory.
+            - name: prometheus-config-volume
+              mountPath: /etc/prometheus/
+            - name: prometheus-storage-volume
+              mountPath: /prometheus/
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
new file mode 100644 (file)
index 0000000..f10cd07
--- /dev/null
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolume  # statically defined, hostPath-backed volume for Prometheus data
+metadata:
+  name: prometheus-pv
+  # PersistentVolumes are cluster-scoped, so no namespace is set on this object.
+  labels:  # the selector in prometheus-pvc.yaml matches on these two labels
+    app: prometheus-pv
+    adi10hero.monitoring: prometheus-pv
+spec:
+  storageClassName: monitoring
+  capacity:
+    storage: 6Gi
+  accessModes:
+  - ReadWriteMany
+  hostPath:  # a directory on whichever node the consuming pod runs on
+    path: "/usr/share/monitoring_data/prometheus"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
new file mode 100644 (file)
index 0000000..812fcc7
--- /dev/null
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim mounted by the Prometheus deployments as "prometheus-pvc"
+metadata:
+  namespace: monitoring
+  name: prometheus-pvc
+  labels:
+    adi10hero.monitoring: prometheus-pvc
+    app: prometheus-pvc
+spec:
+  storageClassName: monitoring
+  selector:
+    matchLabels:  # restricts binding to volumes labelled like prometheus-pv.yaml
+      adi10hero.monitoring: prometheus-pv
+      app: prometheus-pv
+  accessModes:
+  - ReadWriteMany
+  resources:
+    requests:
+      storage: 3Gi  # minimum size requested; the matching volume may be larger
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
new file mode 100644 (file)
index 0000000..5be76d3
--- /dev/null
@@ -0,0 +1,34 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # exposes pods labelled adi10hero.monitoring=prometheus on node port 30900
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+    adi10hero.monitoring: prometheus
+  annotations:  # presumably read by a Prometheus service-discovery rule; that config is not visible here
+    prometheus.io/port: "9090"
+    prometheus.io/scrape: "true"
+spec:
+  type: NodePort
+  selector:
+    adi10hero.monitoring: prometheus
+  ports:
+    - name: prometheus
+      port: 9090  # no targetPort is given, so it defaults to this port
+      nodePort: 30900
+      protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
new file mode 100644 (file)
index 0000000..149bea8
--- /dev/null
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # runs a single Prometheus pod pinned to the node "vm3"
+metadata:
+  namespace: monitoring
+  name: prometheus1-deployment
+  labels:
+    adi10hero.monitoring: prometheus1
+    app: prometheus1
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # the old pod is stopped before its replacement is started
+  selector:
+    matchLabels:  # both labels are carried by the pod template below
+      app: prometheus1
+      adi10hero.monitoring: prometheus1
+  template:
+    metadata:
+      labels:
+        app: prometheus1
+        adi10hero.monitoring: prometheus1
+    spec:
+      restartPolicy: Always
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:  # hard scheduling requirement
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - vm3
+      volumes:
+        - name: prometheus-config-volume
+          configMap:
+            name: prometheus-config  # same ConfigMap name that prometheus-deployment.yaml mounts
+            defaultMode: 420  # decimal 420 is octal 0644
+        - name: prometheus-storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-pvc  # same claim name that prometheus-deployment.yaml mounts
+      containers:
+        - name: prometheus
+          image: prom/prometheus  # NOTE(review): no tag given, so "latest" is pulled; consider pinning
+          securityContext:
+            runAsUser: 0  # the container process runs as root (UID 0)
+          ports:
+            - containerPort: 9090
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus"
+            - "--storage.tsdb.retention.size=3GB"
+            - "--storage.tsdb.retention.time=30d"
+            - "--web.console.libraries=/etc/prometheus/console_libraries"
+            - "--web.console.templates=/etc/prometheus/consoles"
+          volumeMounts:
+            - mountPath: /etc/prometheus/  # presumably supplies prometheus.yml; the ConfigMap is not visible here
+              name: prometheus-config-volume
+            - mountPath: /prometheus/  # same directory as --storage.tsdb.path above
+              name: prometheus-storage-volume
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
new file mode 100644 (file)
index 0000000..439deec
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # exposes the pods of prometheus1-deployment on node port 30901
+metadata:
+  name: prometheus1
+  namespace: monitoring
+  annotations:  # presumably read by a Prometheus service-discovery rule; that config is not visible here
+    prometheus.io/port: "9090"
+    prometheus.io/scrape: "true"
+  labels:
+    app: prometheus1
+    adi10hero.monitoring: prometheus1
+spec:
+  type: NodePort
+  selector:
+    app: prometheus1
+    adi10hero.monitoring: prometheus1
+  ports:
+    - name: prometheus1
+      port: 9090  # no targetPort is given, so it defaults to this port
+      nodePort: 30901
+      protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml
new file mode 100644 (file)
index 0000000..cd4e6ac
--- /dev/null
@@ -0,0 +1,273 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# PAG (Prometheus, Alertmanager, Grafana) setup in the k8s cluster
+
+#***********************************************************************************************************
+# Stage the role's manifests: copy every YAML file to /tmp/files/ so the k8s tasks below can read them
+#***********************************************************************************************************
+- name: copy all yaml to /tmp/files/
+  copy:
+    dest: /tmp/files/
+    src: ../files/
+
+#***********************************************************************************************************
+# Namespace "monitoring", used by most of the objects created below
+#***********************************************************************************************************
+- name: Creating Monitoring Namespace
+  k8s:
+    src: /tmp/files/monitoring-namespace.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Persistent Volume backing the Prometheus data directory
+#***********************************************************************************************************
+- name: creating Persistent Volume for Prometheus
+  k8s:
+    src: /tmp/files/prometheus/prometheus-pv.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Persistent Volume for Grafana
+#***********************************************************************************************************
+- name: creating Persistent Volume for Grafana
+  k8s:
+    src: /tmp/files/grafana/grafana-pv.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Persistent Volume Claim that the Prometheus deployments mount
+#***********************************************************************************************************
+- name: creating Persistent Volume Claim for Prometheus
+  k8s:
+    src: /tmp/files/prometheus/prometheus-pvc.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Persistent Volume Claim for Grafana
+#***********************************************************************************************************
+- name: creating Persistent Volume Claim for Grafana
+  k8s:
+    src: /tmp/files/grafana/grafana-pvc.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Creating the cAdvisor daemonset (the manifest file name keeps its original "deamonset" spelling)
+#***********************************************************************************************************
+- name: Creating cAdvisor daemonset
+  k8s:
+    state: present
+    src: /tmp/files/cadvisor/cadvisor-deamonset.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Creating the cAdvisor service
+#***********************************************************************************************************
+- name: Starting cAdvisor service
+  k8s:
+    state: present
+    src: /tmp/files/cadvisor/cadvisor-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Deploying kube-state-metrics and its service (both go into the kube-system namespace)
+#***********************************************************************************************************
+- name: Deploying kube-state-metrics
+  k8s:
+    state: present
+    src: /tmp/files/kube-state-metrics/kube-state-metrics-deployment.yaml
+    namespace: kube-system
+
+- name: Starting kube-state-metrics service
+  k8s:
+    state: present
+    src: /tmp/files/kube-state-metrics/kube-state-metrics-service.yaml
+    namespace: kube-system
+
+#***********************************************************************************************************
+# Creating the NodeExporter daemonset
+#***********************************************************************************************************
+- name: Creating NodeExporter daemonset
+  k8s:
+    state: present
+    src: /tmp/files/node-exporter/nodeexporter-daemonset.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Creating the NodeExporter service
+#***********************************************************************************************************
+- name: Starting NodeExporter service
+  k8s:
+    state: present
+    src: /tmp/files/node-exporter/nodeexporter-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Creating the collectd-exporter deployment
+#***********************************************************************************************************
+- name: Creating collectd-exporter deployment
+  k8s:
+    state: present
+    src: /tmp/files/collectd-exporter/collectd-exporter-deployment.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Creating the collectd-exporter service
+#***********************************************************************************************************
+- name: Creating collectd-exporter service
+  k8s:
+    state: present
+    src: /tmp/files/collectd-exporter/collectd-exporter-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+# Placeholder: webhook tasks are expected to be added here
+#***********************************************************************************************************
+
+#***********************************************************************************************************
+# ConfigMap holding the Alertmanager configuration
+#***********************************************************************************************************
+- name: Creating config map for Alertmanagers
+  k8s:
+    src: /tmp/files/alertmanager/alertmanager-config.yaml
+    namespace: monitoring
+    state: present
+
+# Disabled: separate config map for the second Alertmanager.
+# - name: Creating config map for Alertmanagers
+#   k8s:
+#     state: present
+#     src: /tmp/files/alertmanager1-config.yaml
+
+#***********************************************************************************************************
+# First Alertmanager: deployment
+#***********************************************************************************************************
+- name: Creating 1st alertmanager deployment
+  k8s:
+    src: /tmp/files/alertmanager/alertmanager-deployment.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# First Alertmanager: service
+#***********************************************************************************************************
+- name: Creating 1st alertmanager service
+  k8s:
+    src: /tmp/files/alertmanager/alertmanager-service.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Second Alertmanager: deployment
+#***********************************************************************************************************
+- name: Creating 2nd alertmanager deployment
+  k8s:
+    src: /tmp/files/alertmanager/alertmanager1-deployment.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Second Alertmanager: service
+#***********************************************************************************************************
+- name: Creating 2nd alertmanager service
+  k8s:
+    src: /tmp/files/alertmanager/alertmanager1-service.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# ConfigMap holding the Prometheus configuration
+#***********************************************************************************************************
+- name: Creating 1st Prometheus Config
+  k8s:
+    src: /tmp/files/prometheus/prometheus-config.yaml
+    namespace: monitoring
+    state: present
+
+# Disabled: separate config for the second Prometheus instance.
+# - name: Creating 2nd Prometheus Config
+#   k8s:
+#     state: present
+#     src: /tmp/files/prometheus1-config.yaml
+
+#***********************************************************************************************************
+# Prometheus deployments (two instances)
+#***********************************************************************************************************
+- name: Starting Prometheus 1
+  k8s:
+    src: /tmp/files/prometheus/prometheus-deployment.yaml
+    namespace: monitoring
+    state: present
+
+- name: Starting Prometheus 2
+  k8s:
+    src: /tmp/files/prometheus/prometheus1-deployment.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Prometheus services: one per instance, plus the "main" service
+#***********************************************************************************************************
+- name: Starting Prometheus 1 Service
+  k8s:
+    src: /tmp/files/prometheus/prometheus-service.yaml
+    namespace: monitoring
+    state: present
+
+- name: Starting Prometheus 2 Service
+  k8s:
+    src: /tmp/files/prometheus/prometheus1-service.yaml
+    namespace: monitoring
+    state: present
+
+- name: Starting Main Prometheus Service
+  k8s:
+    src: /tmp/files/prometheus/main-prometheus-service.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Grafana: datasource config, deployment and service
+#***********************************************************************************************************
+- name: Creating Grafana Datasource Config
+  k8s:
+    src: /tmp/files/grafana/grafana-datasource-config.yaml
+    namespace: monitoring
+    state: present
+
+- name: Starting Grafana
+  k8s:
+    src: /tmp/files/grafana/grafana-deployment.yaml
+    namespace: monitoring
+    state: present
+
+- name: Starting Grafana Service
+  k8s:
+    src: /tmp/files/grafana/grafana-service.yaml
+    namespace: monitoring
+    state: present
+
+#***********************************************************************************************************
+# Clean up: delete the staged manifests in /tmp/files
+#***********************************************************************************************************
+- name: Removing /tmp/files
+  file:
+    state: absent
+    path: /tmp/files
diff --git a/tools/lma/metrics/dashboard/cpu_usage_using.json b/tools/lma/metrics/dashboard/cpu_usage_using.json
new file mode 100644 (file)
index 0000000..85f7f12
--- /dev/null
@@ -0,0 +1,750 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "prometheus",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "limit": 100,
+        "name": "Monitoring",
+        "showIn": 0,
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": 4,
+  "iteration": 1596637894836,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "editable": true,
+      "error": false,
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 7,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "hiddenSeries": false,
+      "id": 3,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "hideZero": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "connected",
+      "percentage": false,
+      "pluginVersion": "7.1.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "collectd_cpu_percent{exported_instance='$host'}",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "CPU Usage",
+      "tooltip": {
+        "msResolution": true,
+        "shared": true,
+        "sort": 0,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 7,
+        "w": 24,
+        "x": 0,
+        "y": 7
+      },
+      "hiddenSeries": false,
+      "id": 4,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "collectd_cpu_percent{cpu='$core', exported_instance='$host'}",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "CPU utilization per core",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 7,
+        "w": 24,
+        "x": 0,
+        "y": 14
+      },
+      "hiddenSeries": false,
+      "id": 5,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "collectd_cpu_percent{cpu='$core',exported_instance='$host'}",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "CPU Usage per core",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 26,
+  "style": "dark",
+  "tags": [
+    "monitoring"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": true,
+          "text": "prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": false,
+          "text": "pod12-node4",
+          "value": "pod12-node4"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "host",
+        "options": [
+          {
+            "selected": true,
+            "text": "pod12-node4",
+            "value": "pod12-node4"
+          }
+        ],
+        "query": "pod12-node4,",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "text": "0",
+          "value": "0"
+        },
+        "hide": 0,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "core",
+        "options": [
+          {
+            "selected": false,
+            "text": "All",
+            "value": "$__all"
+          },
+          {
+            "selected": true,
+            "text": "0",
+            "value": "0"
+          },
+          {
+            "selected": false,
+            "text": "1",
+            "value": "1"
+          },
+          {
+            "selected": false,
+            "text": "2",
+            "value": "2"
+          },
+          {
+            "selected": false,
+            "text": "3",
+            "value": "3"
+          },
+          {
+            "selected": false,
+            "text": "4",
+            "value": "4"
+          },
+          {
+            "selected": false,
+            "text": "5",
+            "value": "5"
+          },
+          {
+            "selected": false,
+            "text": "6",
+            "value": "6"
+          },
+          {
+            "selected": false,
+            "text": "7",
+            "value": "7"
+          },
+          {
+            "selected": false,
+            "text": "8",
+            "value": "8"
+          },
+          {
+            "selected": false,
+            "text": "9",
+            "value": "9"
+          },
+          {
+            "selected": false,
+            "text": "10",
+            "value": "10"
+          },
+          {
+            "selected": false,
+            "text": "11",
+            "value": "11"
+          },
+          {
+            "selected": false,
+            "text": "12",
+            "value": "12"
+          },
+          {
+            "selected": false,
+            "text": "13",
+            "value": "13"
+          },
+          {
+            "selected": false,
+            "text": "14",
+            "value": "14"
+          },
+          {
+            "selected": false,
+            "text": "15",
+            "value": "15"
+          },
+          {
+            "selected": false,
+            "text": "16",
+            "value": "16"
+          },
+          {
+            "selected": false,
+            "text": "17",
+            "value": "17"
+          },
+          {
+            "selected": false,
+            "text": "18",
+            "value": "18"
+          },
+          {
+            "selected": false,
+            "text": "19",
+            "value": "19"
+          },
+          {
+            "selected": false,
+            "text": "20",
+            "value": "20"
+          },
+          {
+            "selected": false,
+            "text": "21",
+            "value": "21"
+          },
+          {
+            "selected": false,
+            "text": "22",
+            "value": "22"
+          },
+          {
+            "selected": false,
+            "text": "23",
+            "value": "23"
+          },
+          {
+            "selected": false,
+            "text": "24",
+            "value": "24"
+          },
+          {
+            "selected": false,
+            "text": "25",
+            "value": "25"
+          },
+          {
+            "selected": false,
+            "text": "26",
+            "value": "26"
+          },
+          {
+            "selected": false,
+            "text": "27",
+            "value": "27"
+          },
+          {
+            "selected": false,
+            "text": "28",
+            "value": "28"
+          },
+          {
+            "selected": false,
+            "text": "29",
+            "value": "29"
+          },
+          {
+            "selected": false,
+            "text": "30",
+            "value": "30"
+          },
+          {
+            "selected": false,
+            "text": "31",
+            "value": "31"
+          },
+          {
+            "selected": false,
+            "text": "32",
+            "value": "32"
+          },
+          {
+            "selected": false,
+            "text": "33",
+            "value": "33"
+          },
+          {
+            "selected": false,
+            "text": "34",
+            "value": "34"
+          },
+          {
+            "selected": false,
+            "text": "35",
+            "value": "35"
+          },
+          {
+            "selected": false,
+            "text": "36",
+            "value": "36"
+          },
+          {
+            "selected": false,
+            "text": "37",
+            "value": "37"
+          },
+          {
+            "selected": false,
+            "text": "38",
+            "value": "38"
+          },
+          {
+            "selected": false,
+            "text": "39",
+            "value": "39"
+          },
+          {
+            "selected": false,
+            "text": "40",
+            "value": "40"
+          },
+          {
+            "selected": false,
+            "text": "41",
+            "value": "41"
+          },
+          {
+            "selected": false,
+            "text": "42",
+            "value": "42"
+          },
+          {
+            "selected": false,
+            "text": "43",
+            "value": "43"
+          },
+          {
+            "selected": false,
+            "text": "44",
+            "value": "44"
+          },
+          {
+            "selected": false,
+            "text": "45",
+            "value": "45"
+          },
+          {
+            "selected": false,
+            "text": "46",
+            "value": "46"
+          },
+          {
+            "selected": false,
+            "text": "47",
+            "value": "47"
+          },
+          {
+            "selected": false,
+            "text": "48",
+            "value": "48"
+          },
+          {
+            "selected": false,
+            "text": "49",
+            "value": "49"
+          },
+          {
+            "selected": false,
+            "text": "50",
+            "value": "50"
+          },
+          {
+            "selected": false,
+            "text": "51",
+            "value": "51"
+          },
+          {
+            "selected": false,
+            "text": "52",
+            "value": "52"
+          },
+          {
+            "selected": false,
+            "text": "53",
+            "value": "53"
+          },
+          {
+            "selected": false,
+            "text": "54",
+            "value": "54"
+          },
+          {
+            "selected": false,
+            "text": "55",
+            "value": "55"
+          },
+          {
+            "selected": false,
+            "text": "56",
+            "value": "56"
+          },
+          {
+            "selected": false,
+            "text": "57",
+            "value": "57"
+          },
+          {
+            "selected": false,
+            "text": "58",
+            "value": "58"
+          },
+          {
+            "selected": false,
+            "text": "59",
+            "value": "59"
+          },
+          {
+            "selected": false,
+            "text": "60",
+            "value": "60"
+          },
+          {
+            "selected": false,
+            "text": "61",
+            "value": "61"
+          },
+          {
+            "selected": false,
+            "text": "62",
+            "value": "62"
+          },
+          {
+            "selected": false,
+            "text": "63",
+            "value": "63"
+          },
+          {
+            "selected": false,
+            "text": "64",
+            "value": "64"
+          }
+        ],
+        "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "CPU Usage",
+  "uid": "XeDwSiSGk",
+  "version": 13
+}
\ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/memory_using.json b/tools/lma/metrics/dashboard/memory_using.json
new file mode 100644 (file)
index 0000000..3b92d8f
--- /dev/null
@@ -0,0 +1,337 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "prometheus",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "limit": 100,
+        "name": "Monitoring",
+        "showIn": 0,
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": 6,
+  "iteration": 1597616052316,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 15,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "hiddenSeries": false,
+      "id": 1,
+      "interval": "1s",
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": false,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.3",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "span": 12,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "avg_over_time(collectd_memory{exported_instance='$host', memory='$type'}[$range])",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Bytes",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 26,
+  "style": "dark",
+  "tags": [
+    "monitoring"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": false,
+          "text": "pod12-node4",
+          "value": "pod12-node4"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "host",
+        "options": [
+          {
+            "selected": true,
+            "text": "pod12-node4",
+            "value": "pod12-node4"
+          }
+        ],
+        "query": "pod12-node4",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      },
+      {
+        "auto": false,
+        "auto_count": 30,
+        "auto_min": "10s",
+        "current": {
+          "selected": false,
+          "text": "30s",
+          "value": "30s"
+        },
+        "hide": 0,
+        "label": null,
+        "name": "range",
+        "options": [
+          {
+            "selected": true,
+            "text": "30s",
+            "value": "30s"
+          },
+          {
+            "selected": false,
+            "text": "1m",
+            "value": "1m"
+          },
+          {
+            "selected": false,
+            "text": "5m",
+            "value": "5m"
+          },
+          {
+            "selected": false,
+            "text": "10m",
+            "value": "10m"
+          },
+          {
+            "selected": false,
+            "text": "30m",
+            "value": "30m"
+          },
+          {
+            "selected": false,
+            "text": "1h",
+            "value": "1h"
+          },
+          {
+            "selected": false,
+            "text": "6h",
+            "value": "6h"
+          },
+          {
+            "selected": false,
+            "text": "12h",
+            "value": "12h"
+          },
+          {
+            "selected": false,
+            "text": "1d",
+            "value": "1d"
+          },
+          {
+            "selected": false,
+            "text": "7d",
+            "value": "7d"
+          },
+          {
+            "selected": false,
+            "text": "14d",
+            "value": "14d"
+          },
+          {
+            "selected": false,
+            "text": "30d",
+            "value": "30d"
+          }
+        ],
+        "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+        "queryValue": "",
+        "refresh": 2,
+        "skipUrlSync": false,
+        "type": "interval"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "text": "used",
+          "value": "used"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "type",
+        "options": [
+          {
+            "selected": false,
+            "text": "buffered",
+            "value": "buffered"
+          },
+          {
+            "selected": false,
+            "text": "cached",
+            "value": "cached"
+          },
+          {
+            "selected": false,
+            "text": "free",
+            "value": "free"
+          },
+          {
+            "selected": false,
+            "text": "slab_recl",
+            "value": "slab_recl"
+          },
+          {
+            "selected": false,
+            "text": "slab_unrecl",
+            "value": "slab_unrecl"
+          },
+          {
+            "selected": true,
+            "text": "used",
+            "value": "used"
+          }
+        ],
+        "query": "buffered,cached,free,slab_recl,slab_unrecl,used",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Memory",
+  "uid": "kuro-mem",
+  "version": 4
+}
\ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/ovs_stats_using.json b/tools/lma/metrics/dashboard/ovs_stats_using.json
new file mode 100644 (file)
index 0000000..1e679fb
--- /dev/null
@@ -0,0 +1,854 @@
+{
+    "annotations": {
+      "list": [
+        {
+          "builtIn": 1,
+          "datasource": "prometheus",
+          "enable": true,
+          "hide": true,
+          "iconColor": "rgba(0, 211, 255, 1)",
+          "limit": 100,
+          "name": "Monitoring",
+          "showIn": 0,
+          "type": "dashboard"
+        }
+      ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "id": 6,
+    "iteration": 1596643135141,
+    "links": [],
+    "panels": [
+      {
+        "aliasColors": {},
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fieldConfig": {
+          "defaults": {
+            "custom": {}
+          },
+          "overrides": []
+        },
+        "fill": 1,
+        "fillGradient": 0,
+        "gridPos": {
+          "h": 6,
+          "w": 24,
+          "x": 0,
+          "y": 0
+        },
+        "hiddenSeries": false,
+        "id": 1,
+        "interval": "1s",
+        "legend": {
+          "alignAsTable": true,
+          "avg": true,
+          "current": true,
+          "max": true,
+          "min": true,
+          "rightSide": true,
+          "show": true,
+          "total": false,
+          "values": true
+        },
+        "lines": true,
+        "linewidth": 1,
+        "links": [],
+        "nullPointMode": "null",
+        "percentage": false,
+        "pluginVersion": "7.1.1",
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [],
+        "spaceLength": 10,
+        "span": 12,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "rate(collectd_ovs_stats_if_rx_octets_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [],
+        "timeFrom": null,
+        "timeRegions": [],
+        "timeShift": null,
+        "title": "Average RX values",
+        "tooltip": {
+          "shared": true,
+          "sort": 0,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": []
+        },
+        "yaxes": [
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          }
+        ],
+        "yaxis": {
+          "align": false,
+          "alignLevel": null
+        }
+      },
+      {
+        "aliasColors": {},
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fieldConfig": {
+          "defaults": {
+            "custom": {}
+          },
+          "overrides": []
+        },
+        "fill": 1,
+        "fillGradient": 0,
+        "gridPos": {
+          "h": 6,
+          "w": 24,
+          "x": 0,
+          "y": 6
+        },
+        "hiddenSeries": false,
+        "id": 2,
+        "interval": "1s",
+        "legend": {
+          "alignAsTable": true,
+          "avg": true,
+          "current": true,
+          "max": true,
+          "min": true,
+          "rightSide": true,
+          "show": true,
+          "total": false,
+          "values": true
+        },
+        "lines": true,
+        "linewidth": 1,
+        "links": [],
+        "nullPointMode": "null",
+        "percentage": false,
+        "pluginVersion": "7.1.1",
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [],
+        "spaceLength": 10,
+        "span": 12,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "rate(collectd_ovs_stats_if_tx_octets_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [],
+        "timeFrom": null,
+        "timeRegions": [],
+        "timeShift": null,
+        "title": "Average TX values",
+        "tooltip": {
+          "shared": true,
+          "sort": 0,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": []
+        },
+        "yaxes": [
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          }
+        ],
+        "yaxis": {
+          "align": false,
+          "alignLevel": null
+        }
+      },
+      {
+        "aliasColors": {},
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fieldConfig": {
+          "defaults": {
+            "custom": {}
+          },
+          "overrides": []
+        },
+        "fill": 1,
+        "fillGradient": 0,
+        "gridPos": {
+          "h": 5,
+          "w": 24,
+          "x": 0,
+          "y": 12
+        },
+        "hiddenSeries": false,
+        "id": 3,
+        "interval": "1s",
+        "legend": {
+          "alignAsTable": true,
+          "avg": true,
+          "current": true,
+          "max": true,
+          "min": true,
+          "rightSide": true,
+          "show": true,
+          "total": false,
+          "values": true
+        },
+        "lines": true,
+        "linewidth": 1,
+        "links": [],
+        "nullPointMode": "null",
+        "percentage": false,
+        "pluginVersion": "7.1.1",
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [],
+        "spaceLength": 10,
+        "span": 12,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "rate(collectd_ovs_stats_if_collisions_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(collectd_ovs_stats_if_dropped_0_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "B"
+          },
+          {
+            "expr": "rate(collectd_ovs_stats_if_dropped_1_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "C"
+          },
+          {
+            "expr": "rate(collectd_ovs_stats_if_errors_0_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "D"
+          },
+          {
+            "expr": "rate(collectd_ovs_stats_if_errors_1_total{exported_instance='$host'}[$range])",
+            "interval": "",
+            "legendFormat": "",
+            "refId": "E"
+          }
+        ],
+        "thresholds": [],
+        "timeFrom": null,
+        "timeRegions": [],
+        "timeShift": null,
+        "title": "Average Collisions, Drops and Error values",
+        "tooltip": {
+          "shared": true,
+          "sort": 0,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": []
+        },
+        "yaxes": [
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          }
+        ],
+        "yaxis": {
+          "align": false,
+          "alignLevel": null
+        }
+      }
+    ],
+    "refresh": "30s",
+    "schemaVersion": 26,
+    "style": "dark",
+    "tags": [
+      "monitoring"
+    ],
+    "templating": {
+      "list": [
+        {
+          "current": {
+            "selected": false,
+            "text": "prometheus",
+            "value": "prometheus"
+          },
+          "hide": 0,
+          "includeAll": false,
+          "label": null,
+          "multi": false,
+          "name": "datasource",
+          "options": [],
+          "query": "prometheus",
+          "queryValue": "",
+          "refresh": 1,
+          "regex": "",
+          "skipUrlSync": false,
+          "type": "datasource"
+        },
+        {
+          "allValue": null,
+          "current": {
+            "selected": false,
+            "text": "pod12-node4",
+            "value": "pod12-node4"
+          },
+          "hide": 0,
+          "includeAll": false,
+          "label": null,
+          "multi": false,
+          "name": "host",
+          "options": [
+            {
+              "selected": true,
+              "text": "pod12-node4",
+              "value": "pod12-node4"
+            }
+          ],
+          "query": "pod12-node4",
+          "queryValue": "",
+          "skipUrlSync": false,
+          "type": "custom"
+        },
+        {
+          "allValue": null,
+          "current": {
+            "selected": true,
+            "text": "0",
+            "value": "0"
+          },
+          "hide": 0,
+          "includeAll": true,
+          "label": null,
+          "multi": false,
+          "name": "core",
+          "options": [
+            {
+              "selected": false,
+              "text": "All",
+              "value": "$__all"
+            },
+            {
+              "selected": true,
+              "text": "0",
+              "value": "0"
+            },
+            {
+              "selected": false,
+              "text": "1",
+              "value": "1"
+            },
+            {
+              "selected": false,
+              "text": "2",
+              "value": "2"
+            },
+            {
+              "selected": false,
+              "text": "3",
+              "value": "3"
+            },
+            {
+              "selected": false,
+              "text": "4",
+              "value": "4"
+            },
+            {
+              "selected": false,
+              "text": "5",
+              "value": "5"
+            },
+            {
+              "selected": false,
+              "text": "6",
+              "value": "6"
+            },
+            {
+              "selected": false,
+              "text": "7",
+              "value": "7"
+            },
+            {
+              "selected": false,
+              "text": "8",
+              "value": "8"
+            },
+            {
+              "selected": false,
+              "text": "9",
+              "value": "9"
+            },
+            {
+              "selected": false,
+              "text": "10",
+              "value": "10"
+            },
+            {
+              "selected": false,
+              "text": "11",
+              "value": "11"
+            },
+            {
+              "selected": false,
+              "text": "12",
+              "value": "12"
+            },
+            {
+              "selected": false,
+              "text": "13",
+              "value": "13"
+            },
+            {
+              "selected": false,
+              "text": "14",
+              "value": "14"
+            },
+            {
+              "selected": false,
+              "text": "15",
+              "value": "15"
+            },
+            {
+              "selected": false,
+              "text": "16",
+              "value": "16"
+            },
+            {
+              "selected": false,
+              "text": "17",
+              "value": "17"
+            },
+            {
+              "selected": false,
+              "text": "18",
+              "value": "18"
+            },
+            {
+              "selected": false,
+              "text": "19",
+              "value": "19"
+            },
+            {
+              "selected": false,
+              "text": "20",
+              "value": "20"
+            },
+            {
+              "selected": false,
+              "text": "21",
+              "value": "21"
+            },
+            {
+              "selected": false,
+              "text": "22",
+              "value": "22"
+            },
+            {
+              "selected": false,
+              "text": "23",
+              "value": "23"
+            },
+            {
+              "selected": false,
+              "text": "24",
+              "value": "24"
+            },
+            {
+              "selected": false,
+              "text": "25",
+              "value": "25"
+            },
+            {
+              "selected": false,
+              "text": "26",
+              "value": "26"
+            },
+            {
+              "selected": false,
+              "text": "27",
+              "value": "27"
+            },
+            {
+              "selected": false,
+              "text": "28",
+              "value": "28"
+            },
+            {
+              "selected": false,
+              "text": "29",
+              "value": "29"
+            },
+            {
+              "selected": false,
+              "text": "30",
+              "value": "30"
+            },
+            {
+              "selected": false,
+              "text": "31",
+              "value": "31"
+            },
+            {
+              "selected": false,
+              "text": "32",
+              "value": "32"
+            },
+            {
+              "selected": false,
+              "text": "33",
+              "value": "33"
+            },
+            {
+              "selected": false,
+              "text": "34",
+              "value": "34"
+            },
+            {
+              "selected": false,
+              "text": "35",
+              "value": "35"
+            },
+            {
+              "selected": false,
+              "text": "36",
+              "value": "36"
+            },
+            {
+              "selected": false,
+              "text": "37",
+              "value": "37"
+            },
+            {
+              "selected": false,
+              "text": "38",
+              "value": "38"
+            },
+            {
+              "selected": false,
+              "text": "39",
+              "value": "39"
+            },
+            {
+              "selected": false,
+              "text": "40",
+              "value": "40"
+            },
+            {
+              "selected": false,
+              "text": "41",
+              "value": "41"
+            },
+            {
+              "selected": false,
+              "text": "42",
+              "value": "42"
+            },
+            {
+              "selected": false,
+              "text": "43",
+              "value": "43"
+            },
+            {
+              "selected": false,
+              "text": "44",
+              "value": "44"
+            },
+            {
+              "selected": false,
+              "text": "45",
+              "value": "45"
+            },
+            {
+              "selected": false,
+              "text": "46",
+              "value": "46"
+            },
+            {
+              "selected": false,
+              "text": "47",
+              "value": "47"
+            },
+            {
+              "selected": false,
+              "text": "48",
+              "value": "48"
+            },
+            {
+              "selected": false,
+              "text": "49",
+              "value": "49"
+            },
+            {
+              "selected": false,
+              "text": "50",
+              "value": "50"
+            },
+            {
+              "selected": false,
+              "text": "51",
+              "value": "51"
+            },
+            {
+              "selected": false,
+              "text": "52",
+              "value": "52"
+            },
+            {
+              "selected": false,
+              "text": "53",
+              "value": "53"
+            },
+            {
+              "selected": false,
+              "text": "54",
+              "value": "54"
+            },
+            {
+              "selected": false,
+              "text": "55",
+              "value": "55"
+            },
+            {
+              "selected": false,
+              "text": "56",
+              "value": "56"
+            },
+            {
+              "selected": false,
+              "text": "57",
+              "value": "57"
+            },
+            {
+              "selected": false,
+              "text": "58",
+              "value": "58"
+            },
+            {
+              "selected": false,
+              "text": "59",
+              "value": "59"
+            },
+            {
+              "selected": false,
+              "text": "60",
+              "value": "60"
+            },
+            {
+              "selected": false,
+              "text": "61",
+              "value": "61"
+            },
+            {
+              "selected": false,
+              "text": "62",
+              "value": "62"
+            },
+            {
+              "selected": false,
+              "text": "63",
+              "value": "63"
+            },
+            {
+              "selected": false,
+              "text": "64",
+              "value": "64"
+            }
+          ],
+          "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+          "queryValue": "",
+          "skipUrlSync": false,
+          "type": "custom"
+        },
+        {
+          "auto": false,
+          "auto_count": 30,
+          "auto_min": "10s",
+          "current": {
+            "selected": false,
+            "text": "30s",
+            "value": "30s"
+          },
+          "hide": 0,
+          "label": null,
+          "name": "range",
+          "options": [
+            {
+              "selected": true,
+              "text": "30s",
+              "value": "30s"
+            },
+            {
+              "selected": false,
+              "text": "1m",
+              "value": "1m"
+            },
+            {
+              "selected": false,
+              "text": "5m",
+              "value": "5m"
+            },
+            {
+              "selected": false,
+              "text": "10m",
+              "value": "10m"
+            },
+            {
+              "selected": false,
+              "text": "30m",
+              "value": "30m"
+            },
+            {
+              "selected": false,
+              "text": "1h",
+              "value": "1h"
+            },
+            {
+              "selected": false,
+              "text": "6h",
+              "value": "6h"
+            },
+            {
+              "selected": false,
+              "text": "12h",
+              "value": "12h"
+            },
+            {
+              "selected": false,
+              "text": "1d",
+              "value": "1d"
+            },
+            {
+              "selected": false,
+              "text": "7d",
+              "value": "7d"
+            },
+            {
+              "selected": false,
+              "text": "14d",
+              "value": "14d"
+            },
+            {
+              "selected": false,
+              "text": "30d",
+              "value": "30d"
+            }
+          ],
+          "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+          "queryValue": "",
+          "refresh": 2,
+          "skipUrlSync": false,
+          "type": "interval"
+        }
+      ]
+    },
+    "time": {
+      "from": "now-5m",
+      "to": "now"
+    },
+    "timepicker": {
+      "refresh_intervals": [
+        "10s",
+        "30s",
+        "1m",
+        "5m",
+        "15m",
+        "30m",
+        "1h",
+        "2h",
+        "1d"
+      ],
+      "time_options": [
+        "5m",
+        "15m",
+        "1h",
+        "6h",
+        "12h",
+        "24h",
+        "2d",
+        "7d",
+        "30d"
+      ]
+    },
+    "timezone": "browser",
+    "title": "OVS Stats",
+    "uid": "K1N5ciIGz",
+    "version": 7
+  }
\ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/rdt_using.json b/tools/lma/metrics/dashboard/rdt_using.json
new file mode 100644 (file)
index 0000000..a0ce798
--- /dev/null
@@ -0,0 +1,833 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "prometheus",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "limit": 100,
+        "name": "Monitoring",
+        "showIn": 0,
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": 7,
+  "iteration": 1597615840124,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "hiddenSeries": false,
+      "id": 1,
+      "interval": "1s",
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": false,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.3",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "span": 12,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(collectd_intel_rdt_bytes{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "RDT Bytes",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 24,
+        "x": 0,
+        "y": 6
+      },
+      "hiddenSeries": false,
+      "id": 2,
+      "interval": "1s",
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": false,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.3",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "span": 12,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(collectd_intel_rdt_ipc{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])",
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "IPC values",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 5,
+        "w": 24,
+        "x": 0,
+        "y": 12
+      },
+      "hiddenSeries": false,
+      "id": 3,
+      "interval": "1s",
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": true,
+        "show": false,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pluginVersion": "7.1.3",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "span": 12,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='local'}[$range])",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='remote'}[$range])",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Memory Bandwidth Total",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 26,
+  "style": "dark",
+  "tags": [
+    "monitoring"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": false,
+          "text": "pod12-node4",
+          "value": "pod12-node4"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "host",
+        "options": [
+          {
+            "selected": true,
+            "text": "pod12-node4",
+            "value": "pod12-node4"
+          }
+        ],
+        "query": "pod12-node4,",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      },
+      {
+        "auto": false,
+        "auto_count": 30,
+        "auto_min": "10s",
+        "current": {
+          "selected": false,
+          "text": "30s",
+          "value": "30s"
+        },
+        "hide": 0,
+        "label": null,
+        "name": "range",
+        "options": [
+          {
+            "selected": true,
+            "text": "30s",
+            "value": "30s"
+          },
+          {
+            "selected": false,
+            "text": "1m",
+            "value": "1m"
+          },
+          {
+            "selected": false,
+            "text": "5m",
+            "value": "5m"
+          },
+          {
+            "selected": false,
+            "text": "10m",
+            "value": "10m"
+          },
+          {
+            "selected": false,
+            "text": "30m",
+            "value": "30m"
+          },
+          {
+            "selected": false,
+            "text": "1h",
+            "value": "1h"
+          },
+          {
+            "selected": false,
+            "text": "6h",
+            "value": "6h"
+          },
+          {
+            "selected": false,
+            "text": "12h",
+            "value": "12h"
+          },
+          {
+            "selected": false,
+            "text": "1d",
+            "value": "1d"
+          },
+          {
+            "selected": false,
+            "text": "7d",
+            "value": "7d"
+          },
+          {
+            "selected": false,
+            "text": "14d",
+            "value": "14d"
+          },
+          {
+            "selected": false,
+            "text": "30d",
+            "value": "30d"
+          }
+        ],
+        "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+        "queryValue": "",
+        "refresh": 2,
+        "skipUrlSync": false,
+        "type": "interval"
+      },
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "text": "2",
+          "value": "2"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": null,
+        "multi": false,
+        "name": "intel_rdt",
+        "options": [
+          {
+            "selected": false,
+            "text": "0",
+            "value": "0"
+          },
+          {
+            "selected": false,
+            "text": "1",
+            "value": "1"
+          },
+          {
+            "selected": true,
+            "text": "2",
+            "value": "2"
+          },
+          {
+            "selected": false,
+            "text": "3",
+            "value": "3"
+          },
+          {
+            "selected": false,
+            "text": "4",
+            "value": "4"
+          },
+          {
+            "selected": false,
+            "text": "5",
+            "value": "5"
+          },
+          {
+            "selected": false,
+            "text": "6",
+            "value": "6"
+          },
+          {
+            "selected": false,
+            "text": "7",
+            "value": "7"
+          },
+          {
+            "selected": false,
+            "text": "8",
+            "value": "8"
+          },
+          {
+            "selected": false,
+            "text": "9",
+            "value": "9"
+          },
+          {
+            "selected": false,
+            "text": "10",
+            "value": "10"
+          },
+          {
+            "selected": false,
+            "text": "11",
+            "value": "11"
+          },
+          {
+            "selected": false,
+            "text": "12",
+            "value": "12"
+          },
+          {
+            "selected": false,
+            "text": "13",
+            "value": "13"
+          },
+          {
+            "selected": false,
+            "text": "14",
+            "value": "14"
+          },
+          {
+            "selected": false,
+            "text": "15",
+            "value": "15"
+          },
+          {
+            "selected": false,
+            "text": "16",
+            "value": "16"
+          },
+          {
+            "selected": false,
+            "text": "17",
+            "value": "17"
+          },
+          {
+            "selected": false,
+            "text": "18",
+            "value": "18"
+          },
+          {
+            "selected": false,
+            "text": "19",
+            "value": "19"
+          },
+          {
+            "selected": false,
+            "text": "20",
+            "value": "20"
+          },
+          {
+            "selected": false,
+            "text": "21",
+            "value": "21"
+          },
+          {
+            "selected": false,
+            "text": "22",
+            "value": "22"
+          },
+          {
+            "selected": false,
+            "text": "23",
+            "value": "23"
+          },
+          {
+            "selected": false,
+            "text": "24",
+            "value": "24"
+          },
+          {
+            "selected": false,
+            "text": "25",
+            "value": "25"
+          },
+          {
+            "selected": false,
+            "text": "26",
+            "value": "26"
+          },
+          {
+            "selected": false,
+            "text": "27",
+            "value": "27"
+          },
+          {
+            "selected": false,
+            "text": "28",
+            "value": "28"
+          },
+          {
+            "selected": false,
+            "text": "29",
+            "value": "29"
+          },
+          {
+            "selected": false,
+            "text": "30",
+            "value": "30"
+          },
+          {
+            "selected": false,
+            "text": "31",
+            "value": "31"
+          },
+          {
+            "selected": false,
+            "text": "32",
+            "value": "32"
+          },
+          {
+            "selected": false,
+            "text": "33",
+            "value": "33"
+          },
+          {
+            "selected": false,
+            "text": "34",
+            "value": "34"
+          },
+          {
+            "selected": false,
+            "text": "35",
+            "value": "35"
+          },
+          {
+            "selected": false,
+            "text": "36",
+            "value": "36"
+          },
+          {
+            "selected": false,
+            "text": "37",
+            "value": "37"
+          },
+          {
+            "selected": false,
+            "text": "38",
+            "value": "38"
+          },
+          {
+            "selected": false,
+            "text": "39",
+            "value": "39"
+          },
+          {
+            "selected": false,
+            "text": "40",
+            "value": "40"
+          },
+          {
+            "selected": false,
+            "text": "41",
+            "value": "41"
+          },
+          {
+            "selected": false,
+            "text": "42",
+            "value": "42"
+          },
+          {
+            "selected": false,
+            "text": "43",
+            "value": "43"
+          },
+          {
+            "selected": false,
+            "text": "44",
+            "value": "44"
+          },
+          {
+            "selected": false,
+            "text": "45",
+            "value": "45"
+          },
+          {
+            "selected": false,
+            "text": "46",
+            "value": "46"
+          },
+          {
+            "selected": false,
+            "text": "47",
+            "value": "47"
+          },
+          {
+            "selected": false,
+            "text": "48",
+            "value": "48"
+          },
+          {
+            "selected": false,
+            "text": "49",
+            "value": "49"
+          },
+          {
+            "selected": false,
+            "text": "50",
+            "value": "50"
+          },
+          {
+            "selected": false,
+            "text": "51",
+            "value": "51"
+          },
+          {
+            "selected": false,
+            "text": "52",
+            "value": "52"
+          },
+          {
+            "selected": false,
+            "text": "53",
+            "value": "53"
+          },
+          {
+            "selected": false,
+            "text": "54",
+            "value": "54"
+          },
+          {
+            "selected": false,
+            "text": "55",
+            "value": "55"
+          },
+          {
+            "selected": false,
+            "text": "56",
+            "value": "56"
+          },
+          {
+            "selected": false,
+            "text": "57",
+            "value": "57"
+          },
+          {
+            "selected": false,
+            "text": "58",
+            "value": "58"
+          },
+          {
+            "selected": false,
+            "text": "59",
+            "value": "59"
+          },
+          {
+            "selected": false,
+            "text": "60",
+            "value": "60"
+          },
+          {
+            "selected": false,
+            "text": "61",
+            "value": "61"
+          },
+          {
+            "selected": false,
+            "text": "62",
+            "value": "62"
+          },
+          {
+            "selected": false,
+            "text": "63",
+            "value": "63"
+          },
+          {
+            "selected": false,
+            "text": "64",
+            "value": "64"
+          }
+        ],
+        "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+        "queryValue": "",
+        "skipUrlSync": false,
+        "type": "custom"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "RDT (L3 Cache)",
+  "uid": "kuro-rdt",
+  "version": 9
+}
\ No newline at end of file