From 570c42174e3503ece1443ff6f44b5c7fda1798e4 Mon Sep 17 00:00:00 2001 From: SAUCEZ Damien <damien.saucez@inria.fr> Date: Fri, 23 Aug 2024 17:27:18 +0200 Subject: [PATCH] First version with new structure --- 5g.yaml | 13 + Dockerfile | 10 + README.md | 115 +- bp-monitoring.yml | 18 + collections/requirements.yml | 3 +- flexric.yaml | 6 + inventories/production/group_vars/all | 3 + .../production/group_vars/lowlatency_kernel | 48 + inventories/production/hosts | 40 + inventories/staging/group_vars/all | 3 + .../staging/group_vars/lowlatency_kernel | 48 + inventories/staging/hosts | 36 + k8s-cluster.yaml | 73 + k8s-ready.yaml | 8 + lowlatency_kernel.yaml | 6 + lpg.yaml | 10 + params.5g.yaml | 67 + params.production.yaml | 42 + params.yaml | 43 + requirements.txt | 6 + roles/5g/base/files/gen_oai.py | 160 ++ roles/5g/base/tasks/main.yaml | 42 + roles/5g/base/templates/manifest.yaml | 25 + .../oai-5g-core/oai-5g-basic/config.yaml.j2 | 178 ++ .../oai-5g-core/oai-5g-basic/values.yaml.j2 | 196 ++ .../oai-nrf/templates/service.yaml | 21 + .../oai-5g-ran/oai-cu-cp/values.yaml.j2 | 131 + .../oai-5g-ran/oai-cu-up/values.yaml.j2 | 129 + .../charts/oai-5g-ran/oai-cu/values.yaml.j2 | 135 + .../charts/oai-5g-ran/oai-du/values.yaml.j2 | 128 + .../oai-gnb/templates/configmap.yaml | 294 ++ .../oai-gnb/templates/deployment.yaml | 377 +++ .../charts/oai-5g-ran/oai-gnb/values.yaml.j2 | 137 + .../oai-5g-ran/oai-nr-ue/values.yaml.j2 | 89 + roles/5g/core/tasks/main.yaml | 28 + roles/5g/flexric/tasks/build_flexric.yaml | 108 + roles/5g/flexric/tasks/main.yaml | 18 + roles/5g/flexric/templates/values.yaml.j2 | 37 + roles/5g/ran/tasks/main.yaml | 65 + roles/5g/ue/tasks/main.yaml | 28 + roles/cluster_monitoring/tasks/main.yaml | 9 + roles/common/README.md | 26 + roles/common/defaults/main.yaml | 24 + roles/common/tasks/main.yaml | 26 + roles/common/tasks/ubuntu.yaml | 21 + roles/docker-cri/README.md | 25 + roles/docker-cri/defaults/main.yaml | 1 + roles/docker-cri/tasks/main.yaml | 60 + 
roles/docker/README.md | 51 + roles/docker/defaults/main.yaml | 18 + roles/docker/meta/main.yaml | 2 + roles/docker/tasks/main.yaml | 116 + roles/docker/templates/daemon.json.j2 | 18 + roles/docker/templates/docker.service.d.j2 | 2 + roles/grafana-loki-prometheus/README.md | 428 +++ .../grafana/defaults/main.yaml | 1 + .../files/dashboards/cluster-monitoring.json | 1443 ++++++++++ .../cluster_monitoring_per_node.json | 2384 +++++++++++++++++ .../files/dashboards/k8s_monitoring.json | 2199 +++++++++++++++ .../files/dashboards/loki_general.json | 284 ++ .../files/dashboards/node_exporter.json | 1222 +++++++++ .../files/dashboards/oai_monitoring.json | 552 ++++ .../grafana/tasks/main.yml | 107 + .../grafana/templates/datasources.json | 22 + .../grafana/templates/datasources2.json | 1 + .../grafana/templates/grafana.ini.j2 | 1641 ++++++++++++ .../loki/defaults/main.yaml | 1 + .../loki/tasks/main.yml | 41 + .../loki/templates/loki-local-config.yaml.j2 | 48 + .../prometheus/defaults/main.yaml | 1 + .../prometheus/files/genpass.py | 10 + .../prometheus/tasks/main.yml | 43 + .../prometheus/templates/prometheus.yml.j2 | 20 + .../prometheus/templates/web.yml.j2 | 2 + roles/ha/README.md | 29 + roles/ha/tasks/main.yaml | 16 + roles/ha/templates/haproxy.cfg.j2 | 53 + roles/k8s/auth/README.md | 72 + roles/k8s/auth/tasks/main.yaml | 40 + roles/k8s/base/README.md | 84 + roles/k8s/base/defaults/main.yaml | 16 + roles/k8s/base/meta/main.yaml | 4 + roles/k8s/base/tasks/install.yaml | 38 + roles/k8s/base/tasks/main.yaml | 46 + .../base/templates/startup.swapoff.service.j2 | 8 + roles/k8s/base/vars/main.yaml | 2 + roles/k8s/create/README.md | 55 + roles/k8s/create/defaults/main.yaml | 7 + roles/k8s/create/meta/main.yaml | 2 + roles/k8s/create/tasks/cni_calico.yaml | 23 + roles/k8s/create/tasks/cni_flannel.yaml | 19 + roles/k8s/create/tasks/main.yaml | 58 + .../create/templates/kubeadm_config.yaml.j2 | 31 + .../create/templates/tigera_operator.yaml.j2 | 12 + roles/k8s/join/README.md | 
67 + roles/k8s/join/meta/main.yaml | 2 + roles/k8s/join/tasks/main.yaml | 33 + .../k8s/join/templates/kubeadm_config.yaml.j2 | 22 + roles/k8s/metallb/README.md | 31 + roles/k8s/metallb/tasks/main.yaml | 19 + roles/k8s/metallb/templates/metallb.yaml.j2 | 16 + roles/kernel/README.md | 48 + roles/kernel/defaults/main.yaml | 7 + roles/kernel/tasks/custom_kernel.yaml | 33 + roles/kernel/tasks/main.yaml | 61 + roles/packages/README.md | 37 + roles/packages/tasks/main.yaml | 52 + roles/post-5g-bp-cluster-monitoring/README.md | 73 + .../cadvisor/tasks/main.yml | 13 + .../cadvisor/templates/cadvisor.yaml.j2 | 97 + .../kube-state-metrics/tasks/main.yml | 13 + .../templates/kube-state-metrics.yaml.j2 | 128 + .../node-exporter/tasks/main.yml | 13 + .../templates/node-exporter.yaml.j2 | 103 + .../prometheus/tasks/main.yml | 33 + .../prometheus/templates/prometheus.yaml.j2 | 107 + .../promtail/tasks/main.yml | 13 + .../promtail/templates/promtail.yaml.j2 | 138 + 118 files changed, 15467 insertions(+), 80 deletions(-) create mode 100644 5g.yaml create mode 100644 Dockerfile create mode 100644 bp-monitoring.yml create mode 100644 flexric.yaml create mode 100644 inventories/production/group_vars/all create mode 100644 inventories/production/group_vars/lowlatency_kernel create mode 100644 inventories/production/hosts create mode 100644 inventories/staging/group_vars/all create mode 100644 inventories/staging/group_vars/lowlatency_kernel create mode 100644 inventories/staging/hosts create mode 100644 k8s-cluster.yaml create mode 100644 k8s-ready.yaml create mode 100644 lowlatency_kernel.yaml create mode 100644 lpg.yaml create mode 100644 params.5g.yaml create mode 100644 params.production.yaml create mode 100644 params.yaml create mode 100644 requirements.txt create mode 100644 roles/5g/base/files/gen_oai.py create mode 100644 roles/5g/base/tasks/main.yaml create mode 100644 roles/5g/base/templates/manifest.yaml create mode 100644 
roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/config.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-nrf/templates/service.yaml create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-du/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/configmap.yaml create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/deployment.yaml create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/values.yaml.j2 create mode 100644 roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue/values.yaml.j2 create mode 100644 roles/5g/core/tasks/main.yaml create mode 100644 roles/5g/flexric/tasks/build_flexric.yaml create mode 100644 roles/5g/flexric/tasks/main.yaml create mode 100644 roles/5g/flexric/templates/values.yaml.j2 create mode 100644 roles/5g/ran/tasks/main.yaml create mode 100644 roles/5g/ue/tasks/main.yaml create mode 100644 roles/cluster_monitoring/tasks/main.yaml create mode 100644 roles/common/README.md create mode 100644 roles/common/defaults/main.yaml create mode 100644 roles/common/tasks/main.yaml create mode 100644 roles/common/tasks/ubuntu.yaml create mode 100644 roles/docker-cri/README.md create mode 100644 roles/docker-cri/defaults/main.yaml create mode 100644 roles/docker-cri/tasks/main.yaml create mode 100644 roles/docker/README.md create mode 100644 roles/docker/defaults/main.yaml create mode 100644 roles/docker/meta/main.yaml create mode 100644 
roles/docker/tasks/main.yaml create mode 100644 roles/docker/templates/daemon.json.j2 create mode 100644 roles/docker/templates/docker.service.d.j2 create mode 100644 roles/grafana-loki-prometheus/README.md create mode 100644 roles/grafana-loki-prometheus/grafana/defaults/main.yaml create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/cluster-monitoring.json create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/cluster_monitoring_per_node.json create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/k8s_monitoring.json create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/loki_general.json create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/node_exporter.json create mode 100755 roles/grafana-loki-prometheus/grafana/files/dashboards/oai_monitoring.json create mode 100644 roles/grafana-loki-prometheus/grafana/tasks/main.yml create mode 100644 roles/grafana-loki-prometheus/grafana/templates/datasources.json create mode 100644 roles/grafana-loki-prometheus/grafana/templates/datasources2.json create mode 100644 roles/grafana-loki-prometheus/grafana/templates/grafana.ini.j2 create mode 100644 roles/grafana-loki-prometheus/loki/defaults/main.yaml create mode 100644 roles/grafana-loki-prometheus/loki/tasks/main.yml create mode 100644 roles/grafana-loki-prometheus/loki/templates/loki-local-config.yaml.j2 create mode 100644 roles/grafana-loki-prometheus/prometheus/defaults/main.yaml create mode 100644 roles/grafana-loki-prometheus/prometheus/files/genpass.py create mode 100644 roles/grafana-loki-prometheus/prometheus/tasks/main.yml create mode 100644 roles/grafana-loki-prometheus/prometheus/templates/prometheus.yml.j2 create mode 100644 roles/grafana-loki-prometheus/prometheus/templates/web.yml.j2 create mode 100644 roles/ha/README.md create mode 100644 roles/ha/tasks/main.yaml create mode 100644 roles/ha/templates/haproxy.cfg.j2 create mode 100644 roles/k8s/auth/README.md 
create mode 100644 roles/k8s/auth/tasks/main.yaml create mode 100644 roles/k8s/base/README.md create mode 100644 roles/k8s/base/defaults/main.yaml create mode 100644 roles/k8s/base/meta/main.yaml create mode 100644 roles/k8s/base/tasks/install.yaml create mode 100644 roles/k8s/base/tasks/main.yaml create mode 100644 roles/k8s/base/templates/startup.swapoff.service.j2 create mode 100644 roles/k8s/base/vars/main.yaml create mode 100644 roles/k8s/create/README.md create mode 100644 roles/k8s/create/defaults/main.yaml create mode 100644 roles/k8s/create/meta/main.yaml create mode 100644 roles/k8s/create/tasks/cni_calico.yaml create mode 100644 roles/k8s/create/tasks/cni_flannel.yaml create mode 100644 roles/k8s/create/tasks/main.yaml create mode 100644 roles/k8s/create/templates/kubeadm_config.yaml.j2 create mode 100644 roles/k8s/create/templates/tigera_operator.yaml.j2 create mode 100644 roles/k8s/join/README.md create mode 100644 roles/k8s/join/meta/main.yaml create mode 100644 roles/k8s/join/tasks/main.yaml create mode 100644 roles/k8s/join/templates/kubeadm_config.yaml.j2 create mode 100644 roles/k8s/metallb/README.md create mode 100644 roles/k8s/metallb/tasks/main.yaml create mode 100644 roles/k8s/metallb/templates/metallb.yaml.j2 create mode 100644 roles/kernel/README.md create mode 100644 roles/kernel/defaults/main.yaml create mode 100644 roles/kernel/tasks/custom_kernel.yaml create mode 100644 roles/kernel/tasks/main.yaml create mode 100644 roles/packages/README.md create mode 100644 roles/packages/tasks/main.yaml create mode 100644 roles/post-5g-bp-cluster-monitoring/README.md create mode 100644 roles/post-5g-bp-cluster-monitoring/cadvisor/tasks/main.yml create mode 100644 roles/post-5g-bp-cluster-monitoring/cadvisor/templates/cadvisor.yaml.j2 create mode 100644 roles/post-5g-bp-cluster-monitoring/kube-state-metrics/tasks/main.yml create mode 100644 roles/post-5g-bp-cluster-monitoring/kube-state-metrics/templates/kube-state-metrics.yaml.j2 create mode 100644 
roles/post-5g-bp-cluster-monitoring/node-exporter/tasks/main.yml create mode 100644 roles/post-5g-bp-cluster-monitoring/node-exporter/templates/node-exporter.yaml.j2 create mode 100644 roles/post-5g-bp-cluster-monitoring/prometheus/tasks/main.yml create mode 100644 roles/post-5g-bp-cluster-monitoring/prometheus/templates/prometheus.yaml.j2 create mode 100644 roles/post-5g-bp-cluster-monitoring/promtail/tasks/main.yml create mode 100644 roles/post-5g-bp-cluster-monitoring/promtail/templates/promtail.yaml.j2 diff --git a/5g.yaml b/5g.yaml new file mode 100644 index 0000000..e9a86ad --- /dev/null +++ b/5g.yaml @@ -0,0 +1,13 @@ +- name: Deploy Core, RAN, and UE + hosts: masters[0] + + roles: + - role: 5g/base + - role: 5g/core + when: GCN.core.present | default(false) + - role: 5g/flexric + when: GCN.flexric.present | default(false) + - role: 5g/ran + when: GCN.RAN.present | default(false) + - role: 5g/ue + when: GCN.UE.present | default(false) \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f251765 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:24.04 +RUN apt update && apt install -y python3-pip git rsync vim nano emacs +RUN python3 -m pip install --break-system-packages --user ansible-core==2.17.2 +RUN echo "export PATH=$PATH:~/.local/bin" >> ~/.bashrc +RUN mkdir /reqs/ +COPY collections/requirements.yml /reqs/ansible-galaxy.yml +COPY requirements.txt /reqs/python3.txt +RUN ~/.local/bin/ansible-galaxy install -r /reqs/ansible-galaxy.yml +RUN pip3 install --break-system-packages -r /reqs/python3.txt +WORKDIR /blueprint \ No newline at end of file diff --git a/README.md b/README.md index f6d9753..052b118 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,50 @@ -# reference_implementation +# Deploy the environment +## Deployment node - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? 
Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: - -``` -cd existing_repo -git remote add origin https://gitlab.inria.fr/slices-ri/blueprints/post-5g/reference_implementation.git -git branch -M main -git push -uf origin main +```console +docker build -t deployment_node -f Dockerfile . ``` -## Integrate with your tools - -- [ ] [Set up project integrations](https://gitlab.inria.fr/slices-ri/blueprints/post-5g/reference_implementation/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. 
- -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README - -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README - -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. - -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. 
- -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. +```console +docker run --rm -it -v "$(pwd)":/blueprint -v ${HOME}/.ssh/id_rsa:/private_key deployment_node +``` -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. +## Create secrets file -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. +We recommend encrypting the secrets file; for that, we use Ansible Vault. To +create the file and vault it, use the following command: -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. +```console +EDITOR=nano ansible-vault edit secrets.yaml +``` -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. +In this secret file you have to define -## Contributing -State if you are open to contributions and what your requirements are for accepting them. 
+```yaml +secrets: + prometheus_basic_auth_password: REDACTED + grafana_password: REDACTED +``` -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. +This will create the encrypted file `secrets.yaml` that we can use later to +access critical data. -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. +## deploy the monitoring collector +```console +ansible-playbook -i inventories/staging --extra-vars "@params.yaml" --extra-vars "@secrets.yaml" --ask-vault-pass lpg.yaml +``` -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. +> If you don't want to type the vault password, you can write your vault +> password in a file (e.g., `a_password_file`) and get it from there by +> replacing `--ask-vault-pass` with `--vault-id dev@a_password_file`. -## License -For open source projects, say how it is licensed. +## deploy the cluster +```console +ansible-playbook -i inventories/staging --extra-vars "@params.yaml" --extra-vars "@secrets.yaml" --ask-vault-pass k8s-cluster.yaml +``` -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. 
+## Deploy 5G network +```console +ansible-playbook -i inventories/staging --extra-vars "@params.yaml" --extra-vars "@params.5g.yaml" --extra-vars "@secrets.yaml" --ask-vault-pass 5g.yaml +``` \ No newline at end of file diff --git a/bp-monitoring.yml b/bp-monitoring.yml new file mode 100644 index 0000000..64d5171 --- /dev/null +++ b/bp-monitoring.yml @@ -0,0 +1,18 @@ +- name: Install cluster monitoring + hosts: masters[0] + vars: + remote_write_address: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_address'] }}" + remote_write_port: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_port'] }}" + remote_write_user: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_user'] }}" + remote_write_pass: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_pass'] }}" + remote_data_label: "{{ k8s['cluster_monitoring']['prometheus']['remote_data_label'] }}" + loki_address: "{{ k8s['cluster_monitoring']['promtail']['loki_address'] }}" + loki_port: "{{ k8s['cluster_monitoring']['promtail']['loki_port'] }}" + loki_label: "{{ k8s['cluster_monitoring']['promtail']['loki_label'] }}" + + roles: + - post-5g-bp-cluster-monitoring/prometheus + - post-5g-bp-cluster-monitoring/promtail + - post-5g-bp-cluster-monitoring/cadvisor + - post-5g-bp-cluster-monitoring/node-exporter + - post-5g-bp-cluster-monitoring/kube-state-metrics \ No newline at end of file diff --git a/collections/requirements.yml b/collections/requirements.yml index bc0a48c..e172f66 100644 --- a/collections/requirements.yml +++ b/collections/requirements.yml @@ -5,4 +5,5 @@ collections: - name: kubernetes.core - name: community.crypto - name: community.general - - name: community.docker \ No newline at end of file + - name: community.docker + - name: community.grafana \ No newline at end of file diff --git a/flexric.yaml b/flexric.yaml new file mode 100644 index 0000000..5bb6b04 --- /dev/null +++ b/flexric.yaml @@ -0,0 +1,6 @@ +--- +- name: Deploy FlexRIC + hosts: masters[0] + + roles: + - 
role: 5g/flexric \ No newline at end of file diff --git a/inventories/production/group_vars/all b/inventories/production/group_vars/all new file mode 100644 index 0000000..fb7b6f0 --- /dev/null +++ b/inventories/production/group_vars/all @@ -0,0 +1,3 @@ +ansible_ssh_common_args: '-o StrictHostKeyChecking=no -oKexAlgorithms=+diffie-hellman-group1-sha1' +ansible_ssh_private_key_file: /private_key +ansible_user: dsaucezi diff --git a/inventories/production/group_vars/lowlatency_kernel b/inventories/production/group_vars/lowlatency_kernel new file mode 100644 index 0000000..d55dcad --- /dev/null +++ b/inventories/production/group_vars/lowlatency_kernel @@ -0,0 +1,48 @@ +install_custom_kernel: true + +tcp_rmem: "4096 87380 {{ 67108864 if ansible_memtotal_mb >= 64000 else 33554432 if ansible_memtotal_mb >= 32000 else 16777216 }}" +tcp_wmem: "4096 87380 {{ 67108864 if ansible_memtotal_mb >= 64000 else 33554432 if ansible_memtotal_mb >= 32000 else 16777216 }}" + +kernel: + image: lowlatency + GRUB_CMDLINE_LINUX_DEFAULT: quiet splash elevator=bfq + attributes: + - option: vm.swappiness + value: 1 + - option: net.core.somaxconn + value: 65535 + - option: net.ipv4.tcp_tw_reuse + value: 1 + - option: net.ipv4.tcp_fin_timeout + value: 30 + - option: net.ipv4.tcp_keepalive_time + value: 1200 + - option: net.ipv4.ip_local_port_range + value: '1024 65000' + - option: net.ipv4.tcp_syncookies + value: 1 + - option: net.ipv4.tcp_synack_retries + value: 2 + - option: net.ipv4.tcp_timestamps + value: 1 + - option: net.ipv4.tcp_max_syn_backlog + value: 65535 + - option: net.core.netdev_max_backlog + value: 65535 + - option: net.ipv4.tcp_rmem + value: '{{ tcp_rmem }}' + - option: net.ipv4.tcp_wmem + value: '{{ tcp_wmem }}' + - option: net.core.rmem_max + value: '{{ tcp_rmem.split()[-1] }}' + - option: net.core.wmem_max + value: '{{ tcp_wmem.split()[-1] }}' + limits: + - domain: '*' + type: 'hard' + item: 'nofile' + value: '4194304' + - domain: '*' + type: 'soft' + item: 'nofile' + value: 
'4194304' \ No newline at end of file diff --git a/inventories/production/hosts b/inventories/production/hosts new file mode 100644 index 0000000..24af363 --- /dev/null +++ b/inventories/production/hosts @@ -0,0 +1,40 @@ +all: + hosts: + 192.168.0.1: + name: node0 + apiserver_advertise_address: 192.168.0.1 + 192.168.0.3: + name: node1 + apiserver_advertise_address: 192.168.0.3 + 192.168.0.2: + name: node2 + apiserver_advertise_address: 192.168.0.2 + 192.168.0.4: + name: node3 + apiserver_advertise_address: 192.168.0.4 + 192.168.0.5: + name: io + apiserver_advertise_address: 192.168.0.5 + KUBELET_EXTRA_ARGS: + cpu-manager-policy: "static" + kube-reserved=cpu: "2" + children: + workers: + hosts: + 192.168.0.4: + masters: + hosts: + 192.168.0.3: + 192.168.0.2: + lowlatency_kernel: + hosts: + # 192.168.0.: + flexric: + hosts: + # 192.168.0.: + ha: + hosts: + 192.168.0.1: + central_monitor: + hosts: + 192.168.0.1: diff --git a/inventories/staging/group_vars/all b/inventories/staging/group_vars/all new file mode 100644 index 0000000..b3fdb07 --- /dev/null +++ b/inventories/staging/group_vars/all @@ -0,0 +1,3 @@ +ansible_ssh_common_args: '-o StrictHostKeyChecking=no -oKexAlgorithms=+diffie-hellman-group1-sha1' +ansible_ssh_private_key_file: /private_key +ansible_user: ubuntu diff --git a/inventories/staging/group_vars/lowlatency_kernel b/inventories/staging/group_vars/lowlatency_kernel new file mode 100644 index 0000000..d55dcad --- /dev/null +++ b/inventories/staging/group_vars/lowlatency_kernel @@ -0,0 +1,48 @@ +install_custom_kernel: true + +tcp_rmem: "4096 87380 {{ 67108864 if ansible_memtotal_mb >= 64000 else 33554432 if ansible_memtotal_mb >= 32000 else 16777216 }}" +tcp_wmem: "4096 87380 {{ 67108864 if ansible_memtotal_mb >= 64000 else 33554432 if ansible_memtotal_mb >= 32000 else 16777216 }}" + +kernel: + image: lowlatency + GRUB_CMDLINE_LINUX_DEFAULT: quiet splash elevator=bfq + attributes: + - option: vm.swappiness + value: 1 + - option: net.core.somaxconn + 
value: 65535 + - option: net.ipv4.tcp_tw_reuse + value: 1 + - option: net.ipv4.tcp_fin_timeout + value: 30 + - option: net.ipv4.tcp_keepalive_time + value: 1200 + - option: net.ipv4.ip_local_port_range + value: '1024 65000' + - option: net.ipv4.tcp_syncookies + value: 1 + - option: net.ipv4.tcp_synack_retries + value: 2 + - option: net.ipv4.tcp_timestamps + value: 1 + - option: net.ipv4.tcp_max_syn_backlog + value: 65535 + - option: net.core.netdev_max_backlog + value: 65535 + - option: net.ipv4.tcp_rmem + value: '{{ tcp_rmem }}' + - option: net.ipv4.tcp_wmem + value: '{{ tcp_wmem }}' + - option: net.core.rmem_max + value: '{{ tcp_rmem.split()[-1] }}' + - option: net.core.wmem_max + value: '{{ tcp_wmem.split()[-1] }}' + limits: + - domain: '*' + type: 'hard' + item: 'nofile' + value: '4194304' + - domain: '*' + type: 'soft' + item: 'nofile' + value: '4194304' \ No newline at end of file diff --git a/inventories/staging/hosts b/inventories/staging/hosts new file mode 100644 index 0000000..6d4cc2f --- /dev/null +++ b/inventories/staging/hosts @@ -0,0 +1,36 @@ +all: + hosts: + 192.168.64.15: + name: node1 + # bindPort: 6443 + apiserver_advertise_address: 192.168.64.15 + 192.168.64.16: + name: node2 + # bindPort: 6443 + apiserver_advertise_address: 192.168.64.16 + 192.168.64.41: + name: monitor + # bindPort: 6443 + apiserver_advertise_address: 192.168.64.41 + # KUBELET_EXTRA_ARGS: + # cpu-manager-policy: "static" + # kube-reserved=cpu: "2" + children: + ha: + hosts: + # 192.168.64.92: + masters: + hosts: + 192.168.64.15: + workers: + hosts: + 192.168.64.16: + lowlatency_kernel: + hosts: + 192.168.64.16: + kernel: + image: lowlatency + GRUB_CMDLINE_LINUX_DEFAULT: quiet splash elevator=bfq + central_monitor: + hosts: + 192.168.64.41: diff --git a/k8s-cluster.yaml b/k8s-cluster.yaml new file mode 100644 index 0000000..a8c15b9 --- /dev/null +++ b/k8s-cluster.yaml @@ -0,0 +1,73 @@ +--- +- name: HA Preparation + hosts: ha + + roles: + - ha + +- name: Deploy first master + 
hosts: masters[0] + + roles: + - k8s/create + - k8s/auth + + post_tasks: + - name: Wait for the node to be ready + ansible.builtin.include_tasks: k8s-ready.yaml + +- name: Setup cluster monitoring + hosts: masters[0] + vars: + remote_write_address: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_address'] }}" + remote_write_port: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_port'] }}" + remote_write_user: "{{ k8s['cluster_monitoring']['prometheus']['remote_write_user'] }}" + remote_write_pass: "{{ secrets['prometheus_basic_auth_password'] }}" + remote_data_label: "{{ k8s['cluster_monitoring']['prometheus']['remote_data_label'] }}" + loki_address: "{{ k8s['cluster_monitoring']['promtail']['loki_address'] }}" + loki_port: "{{ k8s['cluster_monitoring']['promtail']['loki_port'] }}" + loki_label: "{{ k8s['cluster_monitoring']['promtail']['loki_label'] }}" + + roles: + - cluster_monitoring + +- name: Attach masters + hosts: masters[1:] + vars: + token: "{{ hostvars['ansible_dummy_host']['_token'] }}" + ca_cert_hash: "{{ hostvars['ansible_dummy_host']['_ca_cert_hash'] }}" + certificate_key: "{{ hostvars['ansible_dummy_host']['_certificate_key'] }}" + kube_config_local_path: "{{ hostvars['ansible_dummy_host']['_kube_config'] }}" + master: "{{ k8s['apiserver_advertise_address'] }}" + is_control_plane_node: true + environment: + PATH: '{{ansible_env.PATH }}:/usr/local/bin/' + + roles: + - role: k8s/join + post_tasks: + - name: Wait for the node to be ready + ansible.builtin.include_tasks: k8s-ready.yaml + +- name: Install k8s nodes + hosts: workers:!masters + vars: + token: "{{ hostvars['ansible_dummy_host']['_token'] }}" + ca_cert_hash: "{{ hostvars['ansible_dummy_host']['_ca_cert_hash'] }}" + kube_config_local_path: "{{ hostvars['ansible_dummy_host']['_kube_config'] }}" + master: "{{ k8s['apiserver_advertise_address'] }}" + environment: + PATH: '{{ansible_env.PATH }}:/usr/local/bin/' + + roles: + - role: k8s/join + post_tasks: + - name: Wait for the 
node to be ready + ansible.builtin.include_tasks: k8s-ready.yaml + +- name: Install metallb + hosts: masters[0] + + roles: + - role: k8s/metallb + when: k8s.metallb is defined \ No newline at end of file diff --git a/k8s-ready.yaml b/k8s-ready.yaml new file mode 100644 index 0000000..8a2d219 --- /dev/null +++ b/k8s-ready.yaml @@ -0,0 +1,8 @@ +- name: Wait for the node to be ready + ansible.builtin.shell: 'kubectl wait --timeout=300s --all-namespaces --for=condition=Ready nodes {{ hostvars[inventory_hostname]["name"] }}' + register: node_wait + retries: 10 + until: node_wait is succeeded + +- name: Wait for the pods to be ready + ansible.builtin.shell: 'kubectl wait --timeout=300s --all-namespaces --for=condition=Ready pods --field-selector spec.nodeName={{ hostvars[inventory_hostname]["name"] }}' \ No newline at end of file diff --git a/lowlatency_kernel.yaml b/lowlatency_kernel.yaml new file mode 100644 index 0000000..37d4c99 --- /dev/null +++ b/lowlatency_kernel.yaml @@ -0,0 +1,6 @@ +--- +- name: Low Latency kernel + hosts: lowlatency_kernel + + roles: + - role: kernel \ No newline at end of file diff --git a/lpg.yaml b/lpg.yaml new file mode 100644 index 0000000..4a840cf --- /dev/null +++ b/lpg.yaml @@ -0,0 +1,10 @@ +- hosts: central_monitor + vars: + prometheus_basic_auth_user: "{{ central_monitor['prometheus']['prometheus_basic_auth_user'] }}" + prometheus_basic_auth_password: "{{ secrets['prometheus_basic_auth_password'] }}" + grafana_password: "{{ secrets['grafana_password'] }}" + roles: + - grafana-loki-prometheus/prometheus + - grafana-loki-prometheus/loki + - grafana-loki-prometheus/grafana + \ No newline at end of file diff --git a/params.5g.yaml b/params.5g.yaml new file mode 100644 index 0000000..727fffd --- /dev/null +++ b/params.5g.yaml @@ -0,0 +1,67 @@ +# 5G config +GCN: + config_files: oai-cn5g-fed/ # make sure the path ends with a `/` + core: + present: true + namespace: core + nrfLoadBalancerIP: "192.0.2.250" + cleanup: true + RAN: + present: 
true + namespace: ran + cleanup: true + split: + f1: true + e1: true + UE: + present: true + cleanup: false + namespace: ran + + flexric: + present: true + + multus: + network: "192.0.2.0/24" + hostInterface: 'ran0' + # routes: + # - dst: 69.0.1.0/24 + # gw: 192.0.2.1 + + mcc: '001' + mnc: '01' + tac: '0x0001' + sst: 1 + + dnns: + - dnn: oai + pdu_session_type: IPV4 + ipv4_subnet: 12.1.1.0/24 + - dnn: ims + pdu_session_type: IPV4V6 + ipv4_subnet: 14.1.1.0/24 + + slices: + - snssai: + sst: 1 + plnms: + - mcc: '001' + mnc: '01' + dnns: + - oai + qos_profile: + 5qi: 5 + session_ambr_ul: 200Mbps + session_ambr_dl: 400Mbps + - snssai: + sst: 1 + sd: "FFFFFF" + plnms: + - mcc: '001' + mnc: '01' + dnns: + - ims + qos_profile: + 5qi: 2 + session_ambr_ul: 100Mbps + session_ambr_dl: 200Mbps diff --git a/params.production.yaml b/params.production.yaml new file mode 100644 index 0000000..68819b6 --- /dev/null +++ b/params.production.yaml @@ -0,0 +1,42 @@ +--- +# docker configuration +docker: + # insecure_registries: + # - 192.0.2.1:5000 + registry_mirrors: + - http://192.168.0.1:5000 + # data_root: "/docker_data" + +# k8s configuration +k8s: + clusterName: vwall-production + # runtime: docker + # cni_plugin: calico + podSubnet: 10.244.0.0/16 + serviceSubnet: 10.96.0.0/16 + dnsDomain: cluster.local + bindPort: 6443 + controlPlaneEndpoint: 192.168.0.69:6443 + calico: + nodeAddressAutodetectionV4: + cidrs: + - 192.168.0.0/24 + encapsulation: VXLAN + metallb: + addresses: + - 172.29.0.250/32 + + cluster_monitoring: + prometheus: + remote_write_address: "192.168.0.1" + remote_write_port: "9090" + remote_write_user: "admin" + remote_data_label: "centralhub" + promtail: + loki_address: 192.168.0.1 + loki_port: 3100 + loki_label: "centralhub" + +central_monitor: + prometheus: + prometheus_basic_auth_user: 'admin' diff --git a/params.yaml b/params.yaml new file mode 100644 index 0000000..90a6a01 --- /dev/null +++ b/params.yaml @@ -0,0 +1,43 @@ +--- +# docker configuration +docker: + # 
insecure_registries: + # - 192.0.2.1:5000 + #registry_mirrors: + # - http://192.168.0.1:5000 + # data_root: "/docker_data" + +# k8s configuration +k8s: + clusterName: sopode-stagging + # runtime: docker + # cni_plugin: calico + podSubnet: 10.244.0.0/16 + serviceSubnet: 10.96.0.0/16 + dnsDomain: cluster.local + bindPort: 6443 + # controlPlaneEndpoint: 192.168.64.92:6443 + calico: + nodeAddressAutodetectionV4: + cidrs: + - 192.168.64.0/24 + encapsulation: VXLAN + metallb: + addresses: + - 192.0.2.250/32 + + + cluster_monitoring: + prometheus: + remote_write_address: "192.168.64.41" + remote_write_port: "9090" + remote_write_user: "admin" + remote_data_label: "sopnode-stagging" + promtail: + loki_address: 192.168.64.41 + loki_port: 3100 + loki_label: "sopnode-stagging" + +central_monitor: + prometheus: + prometheus_basic_auth_user: 'admin' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5a75a41 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +netaddr==0.8.0 +PyYAML==6.0.1 +requests==2.22.0 +bcrypt==3.2.2 +Jinja2==3.0.3 +ipaddr==2.2.0 \ No newline at end of file diff --git a/roles/5g/base/files/gen_oai.py b/roles/5g/base/files/gen_oai.py new file mode 100644 index 0000000..de2b458 --- /dev/null +++ b/roles/5g/base/files/gen_oai.py @@ -0,0 +1,160 @@ +import os +import yaml +from jinja2 import Environment, FileSystemLoader, BaseLoader +import ipaddr + +def to_yaml(data): + return yaml.safe_dump(data).strip() + +def createFile(content, filename): + with open(filename, mode="w", encoding="utf-8") as file: + file.write(content) +########### + +def genIPs(gcn): + net = ipaddr.IPNetwork(gcn['multus']['network']) + base_ip = net.ip + 2 + hostInterface = gcn['multus']['hostInterface'] + + ips = { + "amf": { + "n2": {'ip': str(base_ip), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "smf": { + "n4": {'ip': str(base_ip + 1), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "upf": { + "n3": {'ip': 
str(base_ip + 2), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n4": {'ip': str(base_ip + 3), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n6": {'ip': str(base_ip + 4), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "gnb": { + "n2": {'ip': str(base_ip + 5), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n3": {'ip': str(base_ip + 6), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + }, + "du": { + "f1": {'ip': str(base_ip + 7), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "cu": { + "f1": {'ip': str(base_ip + 8), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n2": {'ip': str(base_ip + 9), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n3": {'ip': str(base_ip + 10), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "cucp": { + "e1": {'ip': str(base_ip + 11), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n2": {'ip': str(base_ip + 12), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "f1": {'ip': str(base_ip + 13), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "cuup": { + "e1": {'ip': str(base_ip + 14), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "n3": {'ip': str(base_ip + 15), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface}, + "f1": {'ip': str(base_ip + 16), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface} + }, + "trafficserver": { + 'ip': str(base_ip + 17), 'prefixlen': net.prefixlen, 'hostInterface': hostInterface + }, + "nrf": { + "loadBalancerIP": gcn['core']['nrfLoadBalancerIP'] + }, + } + return ips + +def plmnSupportList(gcn): + slices = gcn['slices'] + # Returns a single PLMN entry (mcc/mnc/tac) listing the S-NSSAI of every slice + + nssais = [] + for _slice in slices: + nssais.append(_slice['snssai']) + + return [ + { + 'mcc': gcn['mcc'], + 'mnc': gcn['mnc'], + 'tac': gcn['tac'], + 'nssai': nssais + } + ] + +def smfInfo(gcn): + slices = gcn['slices'] + + items = list() + for 
_slice in slices: + item = { + 'sNssai': _slice['snssai'], + 'dnnSmfInfoList': [] + } + for dnn in _slice['dnns']: + item['dnnSmfInfoList'].append({'dnn': dnn}) + items.append(item) + return {'sNssaiSmfInfoList': items} + +def servedGuamiList(gcn): + return [ + { + 'mcc': gcn['mcc'], + 'mnc': gcn['mnc'], + 'amf_region_id': '01', + 'amf_set_id': '001', + 'amf_pointer': '01' + } + ] + + +def readFile(path): + with open(path, "r") as file: + content = file.read() + return content + +def render(templatepath, gcn): + template = environment.get_template(templatepath) + + slices = gcn['slices'] + dnns = gcn['dnns'] + + smf_info = smfInfo(gcn) + plmn_support_list = plmnSupportList(gcn) + served_guami_list = servedGuamiList(gcn) + + content = template.render( + mcc = gcn['mcc'], + mnc = gcn['mnc'], + tac = gcn['tac'], + slices = slices, + dnns = dnns, + smf_info = smf_info, + plmn_support_list = plmn_support_list, + served_guami_list = served_guami_list, + network = gcn['multus'], + core = gcn + ) + + return content + +# ============================================================================== +# load configurations +with open('roles/5g/templates/manifest.yaml', "r") as file: + tpls = yaml.safe_load(file) + +core = yaml.safe_load(os.environ['params_5g']) +gcn = core['GCN'] + +# Prepare rendering environment +environment = Environment(loader=FileSystemLoader(tpls["output_dir"])) +environment.filters.update({'to_yaml': to_yaml}) + +if 'multus' in gcn: + if "ips" not in gcn['multus']: + gcn['multus']['ips'] = genIPs(gcn) + +# Render files +for item in tpls["templates"]: + path = item['template'] + torender = True if 'render' not in item.keys() else item['render'] + + if torender: + content = render(path, gcn) + else: + content = readFile("/".join([tpls["output_dir"], path ])) + createFile(content, item['output']) \ No newline at end of file diff --git a/roles/5g/base/tasks/main.yaml b/roles/5g/base/tasks/main.yaml new file mode 100644 index 0000000..ca6a7dc --- /dev/null 
+++ b/roles/5g/base/tasks/main.yaml @@ -0,0 +1,42 @@ +- block: + - name: Retrieve OAI + ansible.builtin.git: + repo: https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed.git + dest: oai-cn5g-fed + version: v2.0.1 + force: yes + - name: Synchronize all git submodules + shell: "./scripts/syncComponents.sh" + args: + chdir: oai-cn5g-fed + + - name: generate OAI files + ansible.builtin.shell: python3 roles/5g/files/gen_oai.py + environment: + params_5g: + GCN: '{{ GCN }}' + delegate_to: localhost + +- name: Custom chart values + ansible.posix.synchronize: + src: '{{ GCN.config_files }}' + dest: oai-cn5g-fed/ + recursive: true + checksum: true + when: GCN.config_files is defined + + +# # Deploy the core +# - name: Deploy the core +# ansible.builtin.include_tasks: 'core.yaml' +# when: GCN.core.present | default(false) + +# # Deploy the RAN +# - name: Deploy the RAN +# ansible.builtin.include_tasks: 'ran.yaml' +# when: GCN.RAN.present | default(false) + +# # Deploy the UE +# - name: Deploy the UE +# ansible.builtin.include_tasks: 'ue.yaml' +# when: GCN.UE is defined and GCN.UE.present \ No newline at end of file diff --git a/roles/5g/base/templates/manifest.yaml b/roles/5g/base/templates/manifest.yaml new file mode 100644 index 0000000..1907319 --- /dev/null +++ b/roles/5g/base/templates/manifest.yaml @@ -0,0 +1,25 @@ +output_dir: roles/5g/templates/oai-cn5g-fed/charts +templates: + - template: oai-5g-core/oai-5g-basic/config.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/config.yaml + - template: oai-5g-core/oai-5g-basic/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/values.yaml + - template: oai-5g-core/oai-nrf/templates/service.yaml + output: ./oai-cn5g-fed/charts/oai-5g-core/oai-nrf/templates/service.yaml + render: false + - template: oai-5g-ran/oai-gnb/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/values.yaml + - template: oai-5g-ran/oai-du/values.yaml.j2 + output: 
./oai-cn5g-fed/charts/oai-5g-ran/oai-du/values.yaml + - template: oai-5g-ran/oai-cu/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu/values.yaml + - template: oai-5g-ran/oai-cu-cp/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp/values.yaml + - template: oai-5g-ran/oai-cu-up/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up/values.yaml + - template: oai-5g-ran/oai-nr-ue/values.yaml.j2 + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue/values.yaml + - template: oai-5g-ran/oai-gnb/templates/configmap.yaml + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/configmap.yaml + - template: oai-5g-ran/oai-gnb/templates/deployment.yaml + output: ./oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/deployment.yaml \ No newline at end of file diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/config.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/config.yaml.j2 new file mode 100644 index 0000000..6346e57 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/config.yaml.j2 @@ -0,0 +1,178 @@ +############# Common configuration + +# Log level for all the NFs +log_level: + general: info + +# If you enable registration, the other NFs will use the NRF discovery mechanism +register_nf: + general: yes + +http_version: 2 + +############## SBI Interfaces +### Each NF takes its local SBI interfaces and remote interfaces from here, unless it gets them using NRF discovery mechanisms +nfs: + amf: + host: {{ network.ips.amf.n2.ip }} #oai-amf + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + n2: + interface_name: n2 + port: 38412 + smf: + host: {{ network.ips.smf.n4.ip }} #oai-smf + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + n4: + interface_name: n4 + port: 8805 + upf: + host: {{ network.ips.upf.n3.ip }} #oai-upf + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + n3: + interface_name: n3 + port: 
2152 + n4: + interface_name: n4 + port: 8805 + n6: + interface_name: n6 + n9: + interface_name: eth0 + port: 2152 + udm: + host: oai-udm + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + udr: + host: oai-udr + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + ausf: + host: oai-ausf + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + nrf: + host: {{ network.ips.nrf.loadBalancerIP }} #oai-nrf + sbi: + port: 80 + api_version: v1 + interface_name: eth0 + +#### Common for UDR and AMF +database: + host: mysql + user: test + type: mysql + password: test + database_name: oai_db + generate_random: true + connection_timeout: 300 # seconds + +############## NF-specific configuration +amf: + amf_name: "OAI-AMF" + # This really depends on if we want to keep the "mini" version or not + support_features_options: + enable_simple_scenario: no # "no" by default with the normal deployment scenarios with AMF/SMF/UPF/AUSF/UDM/UDR/NRF. + # set it to "yes" to use with the minimalist deployment scenario (including only AMF/SMF/UPF) by using the internal AUSF/UDM implemented inside AMF. + # There's no NRF in this scenario, SMF info is taken from "nfs" section. 
+ enable_nssf: no + enable_smf_selection: yes + relative_capacity: 30 + statistics_timer_interval: 20 # in seconds + emergency_support: false + served_guami_list: +{%- filter indent (width=2*2) %} +{{ served_guami_list | to_yaml }} +{%- endfilter %} + plmn_support_list: +{%- filter indent (width=2*2) %} +{{ plmn_support_list | to_yaml }} +{%- endfilter %} + supported_integrity_algorithms: + - "NIA1" + - "NIA2" + supported_encryption_algorithms: + - "NEA0" + - "NEA1" + - "NEA2" + +smf: + ue_mtu: 1500 + support_features: + use_local_subscription_info: no # Use infos from local_subscription_info or from UDM + use_local_pcc_rules: yes # Use infos from local_pcc_rules or from PCF + # we resolve from NRF, this is just to configure usage_reporting + upfs: + - host: oai-upf + config: + enable_usage_reporting: no + ue_dns: + primary_ipv4: "10.3.2.200" + primary_ipv6: "2001:4860:4860::8888" + secondary_ipv4: "8.8.8.8" + secondary_ipv6: "2001:4860:4860::8888" + ims: + pcscf_ipv4: "192.168.70.139" + pcscf_ipv6: "fe80::7915:f408:1787:db8b" + # the DNN you configure here should be configured in "dnns" + # follows the SmfInfo datatype from 3GPP TS 29.510 + smf_info: +{%- filter indent(width=2*2) %} +{{ smf_info | to_yaml}} +{%- endfilter %} + + local_subscription_infos: +{%- for slice in slices %} +{%- for dnn in slice.dnns %} + - single_nssai: +{%- filter indent(width=4*2) %} +{{ slice.snssai | to_yaml }} +{%- endfilter %} + dnn: "{{ dnn }}" + qos_profile: +{%- filter indent(width=4*2) %} +{{ slice.qos_profile | to_yaml }} +{%- endfilter %} +{%- endfor %} +{%- endfor %} + +upf: + support_features: + enable_bpf_datapath: no # If "on": BPF is used as datapath else simpleswitch is used, DEFAULT= off + enable_snat: yes # If "on": Source natting is done for UE, DEFAULT= off + remote_n6_gw: 127.0.0.1 # Dummy host since simple-switch does not use N6 GW + upf_info: + sNssaiUpfInfoList: +{%- for slice in slices %} + - sNssai: +{%- filter indent(width=4*2) %} +{{ slice.snssai | to_yaml }} 
+{%- endfilter %} + dnnUpfInfoList: +{%- for dnn in slice.dnns %} + - dnn: {{ dnn}} +{%- endfor %} +{%- endfor %} + +## DNN configuration +dnns: +{%- filter indent(width=1*2) %} +{{ dnns | to_yaml }} +{%- endfilter %} + diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/values.yaml.j2 new file mode 100644 index 0000000..17f2116 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic/values.yaml.j2 @@ -0,0 +1,196 @@ +global: + nfConfigurationConfigMap: oai-5g-basic + clusterIpServiceIpAllocation: true #this will allocate ip-address to cluster Ip service + waitForNRF: true + loadBalancerIP: {{ network.ips.nrf.loadBalancerIP }} + http2Param: "--http2-prior-knowledge" # if waitForNRF is true and http2 is used for all NF then this param is needed + timeout: 1 +mysql: + enabled: true + imagePullPolicy: IfNotPresent + oai5gdatabase: basic + imagePullSecrets: + - name: "regcred" + persistence: + enabled: false +oai-nrf: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-nrf ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + config: + logLevel: "debug" #allowed info/error/debug + nodeSelector: {} +oai-udr: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-udr ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to 
be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + config: + logLevel: "debug" #allowed info/error/debug + nodeSelector: {} +oai-udm: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-udm ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + config: + logLevel: "debug" #allowed info/error/debug + nodeSelector: {} +oai-ausf: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-ausf ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + config: + logLevel: "debug" #allowed info/error/debug + nodeSelector: {} +oai-amf: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-amf ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false 
#only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + multus: + ## If you don't want to add a default route in your pod then replace this field with "" + defaultGateway: "" + n2Interface: + create: true + Ipadd: "{{ network.ips.amf.n2.ip }}" + Netmask: "{{ network.ips.amf.n2.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n2" + ## If you do not have a gateway leave the field empty + Gateway: + ## If you do not want to add any routes in your pod then leave this field empty + routes: {{ network.routes | default("")}} + hostInterface: {{ network.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + nodeSelector: {} +oai-upf: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-upf ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + imagePullSecrets: + - name: "regcred" + # create an extra interface for N3 incase the gNB is outside the cluster network or there is a need to have dedicated interface for N3 + ## Change these ip-addresses according to your environment + ## N4, N6 are optional only if you want that UPF uses different subnets for different 3gpp interfaces. 
+ multus: + ## If you don't want to add a default route in your pod then replace this field with "" + defaultGateway: "" + n3Interface: + create: true + Ipadd: "{{ network.ips.upf.n3.ip }}" + Netmask: "{{ network.ips.upf.n3.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n3" + ## If you do not have a gateway leave the field empty + Gateway: "" + ## If you don't want to add a default route in your pod then replace this field with "" + routes: {{ network.routes | default("") }} + hostInterface: {{ network.hostInterface | default("bond0") }} + ## For n4 it is better to re-use eth0 interface inside the pod (primary CNI of Kubernetes) + n4Interface: + create: true + Ipadd: "{{ network.ips.upf.n4.ip }}" + Netmask: "{{ network.ips.upf.n4.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n4" + ## If you do not have a gateway leave the field empty + Gateway: "" + ## If you do not want to add any routes in your pod then leave this field empty + routes: "" + hostInterface: {{ network.hostInterface | default("bond0") }} + n6Interface: + create: true + Ipadd: "{{ network.ips.upf.n6.ip }}" + Netmask: "{{ network.ips.upf.n6.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n6" + ## If you do not have a gateway leave the field empty + Gateway: "" + ## If you do not want to add any routes in your pod then leave this field empty + routes: "" + hostInterface: {{ network.hostInterface | default("bond0") }} + #NOTE: If the interface you selected for n6If is wrong then traffic will not be routed towards internet + nodeSelector: {} +# oai-traffic-server: +# enabled: false +# weight: 6 +# kubernetesType: Vanilla +# trafficServer: +# repository: docker.io/oaisoftwarealliance/trf-gen-cn5g +# version: latest +# #pullPolicy: IfNotPresent or Never or Always +# pullPolicy: IfNotPresent +# #Without multus traffic server won't be able to forward packets to spgwu if can't use server then add an extra container in spgwu with 
iperf3 or install iperf3 package in tcpdump container +# multus: +# create: true +# IPadd: "172.21.6.99" +# Netmask: "22" +# defaultGateway: "172.21.7.254" +# hostInterface: {{ network.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled +# config: +# ueroute: 12.1.1.0/24 +# upfIpadd: 172.21.6.95 +# noOfIperf3Server: 10 +oai-smf: + enabled: true + kubernetesType: Vanilla #Openshift/Vanilla Vanilla for Upstream Kubernetes + nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-smf ## The image will be pulled from dockerhub + version: v2.0.1 ## The branch to be used to pull from dockerhub + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + includeTcpDumpContainer: false #only for taking pcaps inside network function pod + multus: + ## If you don't want to add a default route in your pod then replace this field with "" + defaultGateway: "" + n4Interface: + create: true + Ipadd: "{{ network.ips.smf.n4.ip }}" + Netmask: "{{ network.ips.smf.n4.prefixlen }}" + ## If you do not have a gateway leave the field empty + Gateway: "" + ## If you do not want to add any routes in your pod then leave this field empty + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + routes: {{ network.routes | default("") }} + hostInterface: {{ network.hostInterface | default("bond0") }} + nodeSelector: {} + imagePullSecrets: + - name: "regcred" + # nodeSelector: {} diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-nrf/templates/service.yaml b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-nrf/templates/service.yaml new file mode 100644 index 0000000..97dcd62 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-core/oai-nrf/templates/service.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Chart.Name }} + labels: + {{- 
include "oai-nrf.labels" . | nindent 4 }} +spec: + type: LoadBalancer + {{- if .Values.global }}{{ if .Values.global.loadBalancerIP }} + loadBalancerIP: {{ .Values.global.loadBalancerIP }} + {{- end }}{{ end }} + ports: + - name: http + # Port accessible outside cluster + port: {{ .Values.exposedPorts.sbi }} + # Port to forward config inside the pod + targetPort: {{ .Values.exposedPorts.sbi }} + protocol: TCP + selector: + {{- include "oai-nrf.selectorLabels" . | nindent 4 }} + diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp/values.yaml.j2 new file mode 100644 index 0000000..3461381 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp/values.yaml.j2 @@ -0,0 +1,131 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-gnb ## dockerhub oaisoftwarealliance/oai-gnb + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "oai-cu-cp-sa" + +## CU-CP can work with 1 virtual ip-address here we are using three different virtual interfaces. 
+## In case you are using 1 ip-address for all 3 logical interfaces then please change it template/configmap.yaml +## Change these ip-addresses according to your environment + +multus: + # to remove the default gateway change it with "" + defaultGateway: "" + e1Interface: + create: true + IPadd: "{{ network.ips.cucp.e1.ip }}" + Netmask: "{{ network.ips.cucp.e1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "e1" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + routes: {{ network.routes | default("")}} + hostInterface: {{ network.ips.cucp.e1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + n2Interface: + create: true + IPadd: "{{ network.ips.cucp.n2.ip }}" + Netmask: "{{ network.ips.cucp.n2.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n2" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: + hostInterface: {{ network.ips.cucp.n2.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + f1cInterface: + create: true + IPadd: "{{ network.ips.cucp.f1.ip }}" + Netmask: "{{ network.ips.cucp.f1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "f1c" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: + hostInterface: {{ network.ips.cucp.f1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + +## configuration file is in template/config.yaml +## It is taken from https://gitlab.eurecom.fr/oai/openairinterface5g/-/blob/develop/ci-scripts/conf_files/gnb-cucp.sa.f1.conf +config: + timeZone: "Europe/Paris" + useAdditionalOptions: "--sa --log_config.global_log_options level,nocolor,time" + cucpName: "oai-cu-cp" + mcc: "{{ core.mcc }}" # check the information with AMF, SMF, UPF + mnc: "{{ core.mnc 
}}" # check the information with AMF, SMF, UPF + tac: "{{ core.tac}}" # check the information with AMF + sst: "{{ core.sst }}" #currently only 4 standard values are allowed 1,2,3,4 + amfhost: "{{ network.ips.amf.n2.ip }}" # amf ip-address or service-name oai-amf-svc or 172.21.6.94 + n2IfName: "n2" # if multus.n2Interface.create is true then use n2 + n3IfName: "n3" #if multus.n3Interface.create is true then use n3 or you can only use 1 interface n2 or eth0 + f1IfName: "f1" #if multus.f1cInterface.create is true then use f1 + e1IfName: "e1" #if multus.e1Interface.create is true then use e1 + f1cuPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be used if f1 and n3 interface are different + f1duPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be used if f1 and n3 interface are different + +# Debugging section +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +start: + gnbcucp: true + tcpdump: false + +includeTcpDumpContainer: false #If true it will add a tcpdump container inside network function pod for debugging + +## For openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. +## To know more about request and limit it is better to understand how Kubernetes QoS works. 
+## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + requests: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + + +tolerations: [] + +affinity: {} + +terminationGracePeriodSeconds: 5 + +nodeSelector: {} + +nodeName: diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up/values.yaml.j2 new file mode 100644 index 0000000..295809f --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up/values.yaml.j2 @@ -0,0 +1,129 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-nr-cuup ## dockerhub oaisoftwarealliance/oai-gnb + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "oai-cu-up-sa" + +## CU-UP can work with 1 virtual ip-address here we are using three different virtual interfaces. +## In case you are using 1 ip-address for all 3 logical interfaces then please change it template/configmap.yaml +## Change these ip-addresses according to your environment +multus: + #if defaultGateway is empty then it will be removed + defaultGateway: "" + e1Interface: + create: true + IPadd: "{{ network.ips.cuup.e1.ip }}" + Netmask: "{{ network.ips.cuup.e1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "e1" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + routes: {{ network.routes | default("")}} + hostInterface: {{ network.ips.cuup.e1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + n3Interface: + create: true + IPadd: "{{ network.ips.cuup.n3.ip }}" + Netmask: "{{ network.ips.cuup.n3.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n3" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: [] + hostInterface: {{ network.ips.cuup.n3.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + f1uInterface: + create: true + IPadd: "{{ network.ips.cuup.f1.ip }}" + Netmask: "{{ network.ips.cuup.f1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "f1u" + # if gatway is empty then it will be removed + #Gateway: "" + #routes: [] + hostInterface: {{ network.ips.cuup.f1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + +## configuration file is in template/config.yaml +## It is taken from https://gitlab.eurecom.fr/oai/openairinterface5g/-/blob/develop/ci-scripts/conf_files/gnb-cuup.sa.f1.conf +config: + 
timeZone: "Europe/Paris" + useAdditionalOptions: "--sa" + cuupName: "oai-cuup" + mcc: "{{ core.mcc }}" # check the information with AMF, SMF, UPF + mnc: "{{ core.mnc }}" # check the information with AMF, SMF, UPF + tac: "{{ core.tac}}" # check the information with AMF + sst: "{{ core.sst }}" #currently only 4 standard values are allowed 1,2,3,4 + cuCpHost: "{{ network.ips.cucp.e1.ip }}" # + n2IfName: "n2" # if multus.n2Interface.create is true then use n2 + n3IfName: "n3" #if multus.n3Interface.create is true then use n3 or you can only use 1 interface n2 or eth0 + f1IfName: "f1" #if multus.f1uInterface.create is true then use f1 + e1IfName: "e1" #if multus.e1Interface.create is true then use e1 + f1cuPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be use if f1 and n3 interface are different + f1duPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be use if f1 and n3 interface are different + +## Debugging section +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +start: + gnbcuup: true #If false the network function container will run in sleep mode for manually testing + tcpdump: false + +includeTcpDumpContainer: false #If true it will add a tcpdump container inside network function pod for debugging + +## For openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. +## To know more about request and limit it is better to understand that how Kubernetes QoS works. 
+## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + requests: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + +tolerations: [] + +affinity: {} + +terminationGracePeriodSeconds: 5 + +nodeSelector: {} + +nodeName: diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu/values.yaml.j2 new file mode 100644 index 0000000..31b8aad --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-cu/values.yaml.j2 @@ -0,0 +1,135 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-gnb ## dockerhub oaisoftwarealliance/oai-gnb + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "oai-cu-sa" + +# oai-cu can be configured with multiple interface which may correspond to 3GPP logical interfaces. 
There can be one to one or one to many. +# In case of one to one mapping f1, n2, n3 all can have separate virtual interfaces. +# In one to many mapping f1,n2 and n3 all can be mapped to one virtual interface. +# Interface mapping is strictly based on your networking environment +## Change these ip-addresses according to your environment +multus: + # if default gateway is empty then it will be removed + defaultGateway: "" + f1Interface: + create: true + IPadd: "{{ network.ips.cu.f1.ip }}" + Netmask: "{{ network.ips.cu.f1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "f1" + # if gateway is commented then it will be removed + #Gateway: "172.21.19.254" + #routes: + routes: {{ network.routes | default("")}} + hostInterface: {{ network.ips.cu.f1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + n2Interface: + create: true + IPadd: "{{ network.ips.cu.n2.ip }}" + #name inside the pod is hardcoded right now + # name: "n2" + Netmask: "{{ network.ips.cu.n2.prefixlen }}" + # if gateway is empty then it will be removed + #Gateway: "172.21.7.254" + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + hostInterface: {{ network.ips.cu.n2.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + n3Interface: + create: true + IPadd: "{{ network.ips.cu.n3.ip }}" + Netmask: "{{ network.ips.cu.n3.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n3" + # if gateway is commented then it will be removed + #Gateway: "172.21.11.254" + #routes: + hostInterface: {{ network.ips.cu.n3.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + +## If you want to change more configuration parameters then you should mount the config file +# in templates/configmap.yaml +# Example config files -->
https://gitlab.eurecom.fr/oai/openairinterface5g/-/tree/develop/targets/PROJECTS/GENERIC-NR-5GC/CONF +config: + mountConfig: false #If config file is mounted then please edit mount.conf in configmap.yaml properly + timeZone: "Europe/Paris" + useAdditionalOptions: "--sa --log_config.global_log_options level,nocolor,time" + # If mounting the configuration file then below parameters are not used + cuName: "oai-cu" + mcc: "{{ core.mcc }}" # check the information with AMF, SMF, UPF + mnc: "{{ core.mnc }}" # check the information with AMF, SMF, UPF + tac: "{{ core.tac}}" # check the information with AMF + sst: "{{ core.sst }}" #currently only 4 standard values are allowed 1,2,3,4 + usrp: rfsim #allowed values rfsim, b2xx, n3xx or x3xx + amfhost: "{{ network.ips.amf.n2.ip }}" # amf ip-address or service-name oai-amf-svc or 172.21.6.94 + n2IfName: "n2" # if multus.n2Interface.create is true then use n2 + n3IfName: "n3" #if multus.n3Interface.create is true then use n3 or you can only use 1 interface n2 or eth0 + f1IfName: "f1" #if multus.f1Interface.create is true then use multus.f1Interface.Ipadd + f1cuPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be use if f1 and n3 interface are different + f1duPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be use if f1 and n3 interface are different + +## Debugging section +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +start: + gnbcu: true #If false the network function container will run in sleep mode for manually testing + tcpdump: false + +includeTcpDumpContainer: false #If true it will add a tcpdump container inside network function pod for debugging + +## For openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. 
+## To know more about request and limit it is better to understand that how Kubernetes QoS works. +## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + requests: + nf: + cpu: 100m + memory: 128Mi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + + +tolerations: [] + +affinity: {} + +terminationGracePeriodSeconds: 5 + +nodeSelector: {} + +nodeName: diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-du/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-du/values.yaml.j2 new file mode 100644 index 0000000..fd7c074 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-du/values.yaml.j2 @@ -0,0 +1,128 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-gnb ## dockerhub oaisoftwarealliance/oai-gnb + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "oai-du-sa" + + +# oai-du helm-charts can be used in RFSimulated mode which does not require multiple interfaces. +# In case you use the charts with a RU/USRP you need a dedicated interface with the RU. +## Change these ip-addresses according to your environment + +multus: + # if default gateway is left blank then it will be removed + defaultGateway: "" + f1Interface: + create: true + IPadd: "{{ network.ips.du.f1.ip }}" + Netmask: "{{ network.ips.du.f1.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "f1" + # if gateway is empty then it will be removed + #Gateway: "172.21.7.254" + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + routes: {{ network.routes | default("")}} + hostInterface: {{ network.ips.du.f1.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + ruInterface: #Only needed if using an Ethernet-based RU/USRP + create: false + IPadd: "192.168.80.90" + Netmask: "24" + # #name inside the pod is hardcoded right now + # name: "ru" + # if gateway is commented then it will be removed + #Gateway: "192.168.80.1" #In case you don't have a gateway remove it from here + ## The value must be [0, master's MTU].
If commented it will use the master's MTU + #mtu: 9000 + hostInterface: "bond0" # Interface of the host machine on which this pod will be scheduled + +## If you want to change more configuration parameters then you should mount the config file +# in templates/configmap.yaml +# Example config files --> https://gitlab.eurecom.fr/oai/openairinterface5g/-/tree/develop/targets/PROJECTS/GENERIC-NR-5GC/CONF +config: + mountConfig: false #If config file is mounted then please edit mount.conf in templates/configmap.yaml properly + timeZone: "Europe/Paris" + useAdditionalOptions: "--sa --rfsim --log_config.global_log_options level,nocolor,time" + duName: "oai-du-rfsim" + mcc: "{{ core.mcc }}" # check the information with AMF, SMF, UPF + mnc: "{{ core.mnc }}" # check the information with AMF, SMF, UPF + tac: "{{ core.tac}}" # check the information with AMF + sst: "{{ core.sst }}" #currently only 4 standard values are allowed 1,2,3,4 + usrp: rfsim #allowed values rfsim, b2xx, n3xx or x3xx + f1IfName: "f1" #if multus.f1Interface.create is true then use f1 + cuHost: "{{ network.ips.cu.f1.ip }}" ## Ip-address or hostname + f1cuPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be used if f1 and n3 interface are different + f1duPort: "2153" #2153 if using same interface for f1 and n3 else standard port 2152 should be used if f1 and n3 interface are different + + + # usrp is already set earlier in this mapping; repeating it here would be a duplicate YAML key + amfhost: "{{ network.ips.amf.n2.ip }}" # amf ip-address or service-name oai-amf-svc or 172.21.6.94 + n2IfName: "n2" # if multus.n2Interface.create is true then use n2 + n3IfName: "n3" #if multus.n3Interface.create is true then use n3 or you can only use 1 interface n2 or eth0 + +## Debugging section +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +start: + gnbdu: true #If false the network function container will run in sleep mode for manually testing + tcpdump: false + +includeTcpDumpContainer: false #If true it will add a tcpdump
container inside network function pod for debugging + +## For openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. +## To know more about request and limit it is better to understand that how Kubernetes QoS works. +## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 2000m + memory: 2Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 200m + memory: 128Mi + requests: + nf: + cpu: 2000m + memory: 2Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + +tolerations: [] + +affinity: {} + +terminationGracePeriodSeconds: 5 + +nodeSelector: {} + +nodeName: \ No newline at end of file diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/configmap.yaml b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/configmap.yaml new file mode 100644 index 0000000..40ede8c --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/configmap.yaml @@ -0,0 +1,294 @@ +{%- if core.flexric is defined and (core.flexric.present | default(false)) %} +--- +#https://gitlab.eurecom.fr/oai/openairinterface5g/-/blob/develop/ci-scripts/conf_files/gnb.sa.band78.106prb.rfsim.conf +--- +### Configuration file for band77 and 51 PRB tested with B210 +apiVersion: v1 +kind: ConfigMap +metadata: + name: conf-flexric +data: + flexric.conf: | + [NEAR-RIC] + NEAR_RIC_IP = @FLEXRIC_IP@ + [XAPP] + DB_PATH = /flexric/db/ + DB_NAME = xapp_rnis_db +{% endif -%} +{% raw %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Chart.Name }}-configmap +data: + gnb.conf: | + Active_gNBs = ( 
"{{ .Values.config.gnbName}}"); + # Asn1_verbosity, choice in: none, info, annoying + Asn1_verbosity = "none"; + gNBs = + ( + { + ////////// Identification parameters: + gNB_ID = 0xe00; + gNB_name = "{{ .Values.config.gnbName}}"; + // Tracking area code, 0x0000 and 0xfffe are reserved values + tracking_area_code = {{ .Values.config.tac}} ; + plmn_list = ({ mcc = {{ .Values.config.mcc}}; mnc = {{ .Values.config.mnc}}; mnc_length = 2; snssaiList = ({ sst = {{ .Values.config.sst}} }) }); + + nr_cellid = 12345678L + + # tr_s_preference = "local_mac" + + ////////// Physical parameters: + + min_rxtxtime = 6; + + servingCellConfigCommon = ( + { + #spCellConfigCommon + + physCellId = 0; + + # downlinkConfigCommon + #frequencyInfoDL + # this is 3600 MHz + 43 PRBs@30kHz SCS (same as initial BWP) + absoluteFrequencySSB = 641280; + dl_frequencyBand = 78; + # this is 3600 MHz + dl_absoluteFrequencyPointA = 640008; + #scs-SpecificCarrierList + dl_offstToCarrier = 0; + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + dl_subcarrierSpacing = 1; + dl_carrierBandwidth = 106; + #initialDownlinkBWP + #genericParameters + # this is RBstart=27,L=48 (275*(L-1))+RBstart + initialDLBWPlocationAndBandwidth = 28875; # 6366 12925 12956 28875 12952 + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + initialDLBWPsubcarrierSpacing = 1; + #pdcch-ConfigCommon + initialDLBWPcontrolResourceSetZero = 12; + initialDLBWPsearchSpaceZero = 0; + + #uplinkConfigCommon + #frequencyInfoUL + ul_frequencyBand = 78; + #scs-SpecificCarrierList + ul_offstToCarrier = 0; + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + ul_subcarrierSpacing = 1; + ul_carrierBandwidth = 106; + pMax = 20; + #initialUplinkBWP + #genericParameters + initialULBWPlocationAndBandwidth = 28875; + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + initialULBWPsubcarrierSpacing = 1; + #rach-ConfigCommon + #rach-ConfigGeneric + prach_ConfigurationIndex = 98; + #prach_msg1_FDM + #0 = one, 1=two, 
2=four, 3=eight + prach_msg1_FDM = 0; + prach_msg1_FrequencyStart = 0; + zeroCorrelationZoneConfig = 13; + preambleReceivedTargetPower = -96; + #preamblTransMax (0...10) = (3,4,5,6,7,8,10,20,50,100,200) + preambleTransMax = 6; + #powerRampingStep + # 0=dB0,1=dB2,2=dB4,3=dB6 + powerRampingStep = 1; + #ra_ReponseWindow + #1,2,4,8,10,20,40,80 + ra_ResponseWindow = 4; + #ssb_perRACH_OccasionAndCB_PreamblesPerSSB_PR + #1=oneeighth,2=onefourth,3=half,4=one,5=two,6=four,7=eight,8=sixteen + ssb_perRACH_OccasionAndCB_PreamblesPerSSB_PR = 4; + #oneHalf (0..15) 4,8,12,16,...60,64 + ssb_perRACH_OccasionAndCB_PreamblesPerSSB = 14; + #ra_ContentionResolutionTimer + #(0..7) 8,16,24,32,40,48,56,64 + ra_ContentionResolutionTimer = 7; + rsrp_ThresholdSSB = 19; + #prach-RootSequenceIndex_PR + #1 = 839, 2 = 139 + prach_RootSequenceIndex_PR = 2; + prach_RootSequenceIndex = 1; + # SCS for msg1, can only be 15 for 30 kHz < 6 GHz, takes precendence over the one derived from prach-ConfigIndex + # + msg1_SubcarrierSpacing = 1, + # restrictedSetConfig + # 0=unrestricted, 1=restricted type A, 2=restricted type B + restrictedSetConfig = 0, + + msg3_DeltaPreamble = 1; + p0_NominalWithGrant =-90; + + # pucch-ConfigCommon setup : + # pucchGroupHopping + # 0 = neither, 1= group hopping, 2=sequence hopping + pucchGroupHopping = 0; + hoppingId = 40; + p0_nominal = -90; + # ssb_PositionsInBurs_BitmapPR + # 1=short, 2=medium, 3=long + ssb_PositionsInBurst_PR = 2; + ssb_PositionsInBurst_Bitmap = 1; + + # ssb_periodicityServingCell + # 0 = ms5, 1=ms10, 2=ms20, 3=ms40, 4=ms80, 5=ms160, 6=spare2, 7=spare1 + ssb_periodicityServingCell = 2; + + # dmrs_TypeA_position + # 0 = pos2, 1 = pos3 + dmrs_TypeA_Position = 0; + + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + subcarrierSpacing = 1; + + + #tdd-UL-DL-ConfigurationCommon + # subcarrierSpacing + # 0=kHz15, 1=kHz30, 2=kHz60, 3=kHz120 + referenceSubcarrierSpacing = 1; + # pattern1 + # dl_UL_TransmissionPeriodicity + # 0=ms0p5, 1=ms0p625, 
2=ms1, 3=ms1p25, 4=ms2, 5=ms2p5, 6=ms5, 7=ms10 + dl_UL_TransmissionPeriodicity = 6; + nrofDownlinkSlots = 7; + nrofDownlinkSymbols = 6; + nrofUplinkSlots = 2; + nrofUplinkSymbols = 4; + + ssPBCH_BlockPower = -25; + } + + ); + + # ------- SCTP definitions + SCTP : + { + # Number of streams to use in input/output + SCTP_INSTREAMS = 2; + SCTP_OUTSTREAMS = 2; + }; + + ////////// AMF parameters: + amf_ip_address = ( { ipv4 = "@AMF_IP_ADDRESS@"; + ipv6 = "192:168:30::17"; + active = "yes"; + preference = "ipv4"; + } + ); + + + NETWORK_INTERFACES : + { + GNB_INTERFACE_NAME_FOR_NG_AMF = "{{ .Values.config.n2IfName}}"; + GNB_IPV4_ADDRESS_FOR_NG_AMF = "@N2_IP_ADDRESS@"; + GNB_INTERFACE_NAME_FOR_NGU = "{{ .Values.config.n3IfName}}"; + GNB_IPV4_ADDRESS_FOR_NGU = "@N3_IP_ADDRESS@"; + GNB_PORT_FOR_S1U = 2152; # Spec 2152 + }; + } + ); + + MACRLCs = ( + { + num_cc = 1; + tr_s_preference = "local_L1"; + tr_n_preference = "local_RRC"; + pusch_TargetSNRx10 = 200; + pucch_TargetSNRx10 = 200; + } + ); + + L1s = ( + { + num_cc = 1; + tr_n_preference = "local_mac"; + prach_dtx_threshold = 200; + # pucch0_dtx_threshold = 150; + } + ); + + RUs = ( + { + local_rf = "yes" + nb_tx = 1 + nb_rx = 1 + att_tx = 0 + att_rx = 0; + bands = [78]; + max_pdschReferenceSignalPower = -27; + max_rxgain = 75; + eNB_instances = [0]; + ##beamforming 1x2 matrix: 1 layer x 2 antennas + bf_weights = [0x00007fff, 0x0000]; + ##beamforming 1x4 matrix: 1 layer x 4 antennas + #bf_weights = [0x00007fff, 0x0000,0x0000, 0x0000]; + ## beamforming 2x2 matrix: + # bf_weights = [0x00007fff, 0x00000000, 0x00000000, 0x00007fff]; + ## beamforming 4x4 matrix: + #bf_weights = [0x00007fff, 0x0000, 0x0000, 0x0000, 0x00000000, 0x00007fff, 0x0000, 0x0000, 0x0000, 0x0000, 0x00007fff, 0x0000, 0x0000, 0x0000, 0x0000, 0x00007fff]; + sf_extension = 0 + sdr_addrs = "serial=XXXXXXX" + } + ); + + THREAD_STRUCT = ( + { + #three config for level of parallelism "PARALLEL_SINGLE_THREAD", "PARALLEL_RU_L1_SPLIT", or "PARALLEL_RU_L1_TRX_SPLIT" + 
parallel_config = "PARALLEL_SINGLE_THREAD"; + #two option for worker "WORKER_DISABLE" or "WORKER_ENABLE" + worker_config = "WORKER_ENABLE"; + } + ); + + rfsimulator: { + serveraddr = "server"; + }; + + security = { + # preferred ciphering algorithms + # the first one of the list that an UE supports in chosen + # valid values: nea0, nea1, nea2, nea3 + ciphering_algorithms = ( "nea0" ); + + # preferred integrity algorithms + # the first one of the list that an UE supports in chosen + # valid values: nia0, nia1, nia2, nia3 + integrity_algorithms = ( "nia2", "nia0" ); + + # setting 'drb_ciphering' to "no" disables ciphering for DRBs, no matter + # what 'ciphering_algorithms' configures; same thing for 'drb_integrity' + drb_ciphering = "yes"; + drb_integrity = "no"; + }; + + log_config : + { + global_log_level ="info"; + hw_log_level ="info"; + phy_log_level ="info"; + mac_log_level ="info"; + rlc_log_level ="info"; + pdcp_log_level ="info"; + rrc_log_level ="info"; + f1ap_log_level ="debug"; + }; +{% endraw %} +{%- if core.flexric is defined and (core.flexric.present | default(false)) %} + e2_agent : + { + near_ric_ip_addr = "@FLEXRIC_IP@"; + sm_dir = "/usr/local/lib/flexric/"; + }; +{% endif -%} \ No newline at end of file diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/deployment.yaml b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/deployment.yaml new file mode 100644 index 0000000..565a13e --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/templates/deployment.yaml @@ -0,0 +1,377 @@ +{%- if core.flexric is defined and (core.flexric.present | default(false)) %} +{% raw%} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Chart.Name }} + labels: + {{- include "oai-gnb.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "oai-gnb.selectorLabels" . 
| nindent 6 }} + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "oai-gnb.selectorLabels" . | nindent 8 }} + app: oai-gnb + {{- if .Values.multus.n2Interface.create }} + annotations: + k8s.v1.cni.cncf.io/networks: >- + [{ + "name": "{{ .Chart.Name }}-n2", + "interface": "n2" + {{- if .Values.multus.defaultGateway }} + ,"default-route": ["{{ .Values.multus.defaultGateway }}"] + {{- end }} + } + {{- if .Values.multus.n3Interface.create }} + ,{ + "name": "{{ .Chart.Name }}-n3", + "interface": "n3" + {{- if .Values.multus.n3Interface.Gateway }} + ,"gateway": {{ .Values.multus.n3Interface.Gateway }} + {{- end }} + } + {{- end }} + {{- if .Values.multus.ruInterface.create }} + ,{ + "name": "{{ .Chart.Name }}-ru", + "interface": "ru" + {{- if .Values.multus.ruInterface.Gateway }} + ,"gateway": {{ .Values.multus.ruInterface.Gateway }} + {{- end }} + } + {{- end }} + ] + {{- end }} + spec: + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | indent 8 }} + {{- end }} + containers: + - name: gnb + image: "{{ .Values.nfimage.repository }}:{{ .Values.nfimage.version }}" + imagePullPolicy: {{ .Values.nfimage.pullPolicy }} + volumeMounts: + - mountPath: /opt/oai-gnb/etc + name: configuration + # subPath: gnb.conf + - name: conf-flexric + mountPath: /usr/local/etc/flexric/flexric.conf + subPath: flexric.conf + - name: shared-volume + mountPath: /usr/local/lib/flexric/ + # USRP B210/B200 Mini + {{- if eq .Values.config.usrp "b2xx"}} + - mountPath: /dev/bus/usb/ + name: usrp + {{- end}} + {{- if .Values.resources.define}} + resources: + requests: + memory: {{ .Values.resources.requests.nf.memory | quote }} + cpu: {{ .Values.resources.requests.nf.cpu | quote }} + limits: + memory: {{ .Values.resources.limits.nf.memory | quote }} + cpu: {{ .Values.resources.limits.nf.cpu | quote }} + {{- end}} + securityContext: + privileged: true + # 
capabilities: + # add: + # - NET_ADMIN + # drop: + # - ALL + ports: + - containerPort: 2152 + name: n3 + protocol: UDP + - containerPort: 36412 + name: n2 + protocol: SCTP + {{- if .Values.start.gnb}} + command: [ "/bin/bash", "-c" ] + args: + - AMF_IP_ADDRESS=$(getent hosts $AMF_HOST | awk '{print $1}'); + AMF_IP_ADDRESS=$(if [[ $AMF_IP_ADDRESS ]]; then echo $AMF_IP_ADDRESS; else echo $AMF_HOST;fi); + FLEXRIC_IP=$(getent hosts $FLEXRIC_HOST | awk '{print $1}'); + FLEXRIC_IP=$(if [[ $FLEXRIC_IP ]]; then echo $FLEXRIC_IP; else echo $FLEXRIC_HOST;fi); + N2_IP_ADDRESS=$(ip -f inet addr show $N2_IF_NAME | grep -o "inet [0-9]*\.[0-9]*\.[0-9]*\.[0-9]*" | grep -o "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*"); + N3_IP_ADDRESS=$(ip -f inet addr show $N3_IF_NAME | grep -o "inet [0-9]*\.[0-9]*\.[0-9]*\.[0-9]*" | grep -o "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*"); + sed -e s/@N2_IP_ADDRESS@/$N2_IP_ADDRESS/g + -e s/@N3_IP_ADDRESS@/$N3_IP_ADDRESS/g + -e s/@AMF_IP_ADDRESS@/$AMF_IP_ADDRESS/g + -e s/@FLEXRIC_IP@/$FLEXRIC_IP/g + /opt/oai-gnb/etc/gnb.conf | tee /tmp/gnb.conf; + {{- if eq .Values.config.usrp "b2xx"}} + /opt/oai-gnb/bin/uhd_images_downloader.py -t b2xx; + {{- else if eq .Values.config.usrp "n3xx" }} + /opt/oai-gnb/bin/uhd_images_downloader.py -t n3xx; + {{- else if eq .Values.config.usrp "x3xx" }} + /opt/oai-gnb/bin/uhd_images_downloader.py -t x3xx; + {{- end }} + exec /opt/oai-gnb/bin/nr-softmodem -O /tmp/gnb.conf $USE_ADDITIONAL_OPTIONS; + {{- else}} + command: + - /bin/sleep + - infinity + {{- end}} + env: + - name: TZ + value: {{ .Values.config.timeZone }} + - name: USE_ADDITIONAL_OPTIONS + value: {{ .Values.config.useAdditionalOptions }} + - name: AMF_HOST + value: {{ .Values.config.amfhost }} + - name: FLEXRIC_HOST + value: {{ .Values.config.flexrichost }} + - name: N2_IF_NAME + value: {{ .Values.config.n2IfName }} + - name: N3_IF_NAME + value: {{ .Values.config.n3IfName }} + {{- if .Values.includeTcpDumpContainer }} + - name: tcpdump + image: "{{ .Values.tcpdumpimage.repository 
}}:{{ .Values.tcpdumpimage.version }}" + imagePullPolicy: {{ .Values.tcpdumpimage.pullPolicy }} + {{- if .Values.resources.define}} + resources: + requests: + memory: {{ .Values.resources.requests.tcpdump.memory | quote }} + cpu: {{ .Values.resources.requests.tcpdump.cpu | quote }} + limits: + memory: {{ .Values.resources.limits.tcpdump.memory | quote }} + cpu: {{ .Values.resources.limits.tcpdump.cpu | quote }} + {{- end}} + securityContext: + privileged: true + capabilities: + add: + - NET_ADMIN + drop: + - ALL + {{- if .Values.start.tcpdump}} + command: [ "/bin/sh", "-c" ] # run through a shell so the `date` backticks in the file name are expanded + args: + - /usr/sbin/tcpdump -i any -w /tmp/pcap/{{ .Chart.Name }}_`date +%Y-%m-%d_%H_%M-%S-%Z`.pcap + {{- else}} + command: + - /bin/sleep + - infinity + {{- end}} + {{- end }} + volumes: + - configMap: + name: {{ .Chart.Name }}-configmap + name: configuration + - configMap: + name: conf-flexric + name: conf-flexric + - name: shared-volume + hostPath: + path: /mnt/flexric/ + {{- if eq .Values.config.usrp "b2xx"}} + - name: usrp + hostPath: + path: /dev/bus/usb/ + {{- end}} + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccountName: {{ .Values.serviceAccount.name }} + terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} + {{- if .Values.nodeSelector}} + nodeSelector: + {{- toYaml .Values.nodeSelector | nindent 12 }} + {{- end }} + {{- if .Values.nodeName}} + nodeName: {{ .Values.nodeName }} + {{- end }} +{% endraw %} +{% else %} ################################################################################################################################################################################################################################################################################### +{% raw%} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Chart.Name }} + labels: + {{- include "oai-gnb.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "oai-gnb.selectorLabels" .
| nindent 6 }} + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "oai-gnb.selectorLabels" . | nindent 8 }} + app: oai-gnb + {{- if .Values.multus.n2Interface.create }} + annotations: + k8s.v1.cni.cncf.io/networks: >- + [{ + "name": "{{ .Chart.Name }}-n2", + "interface": "n2" + {{- if .Values.multus.defaultGateway }} + ,"default-route": ["{{ .Values.multus.defaultGateway }}"] + {{- end }} + } + {{- if .Values.multus.n3Interface.create }} + ,{ + "name": "{{ .Chart.Name }}-n3", + "interface": "n3" + {{- if .Values.multus.n3Interface.Gateway }} + ,"gateway": {{ .Values.multus.n3Interface.Gateway }} + {{- end }} + } + {{- end }} + {{- if .Values.multus.ruInterface.create }} + ,{ + "name": "{{ .Chart.Name }}-ru", + "interface": "ru" + {{- if .Values.multus.ruInterface.Gateway }} + ,"gateway": {{ .Values.multus.ruInterface.Gateway }} + {{- end }} + } + {{- end }} + ] + {{- end }} + spec: + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | indent 8 }} + {{- end }} + containers: + - name: gnb + image: "{{ .Values.nfimage.repository }}:{{ .Values.nfimage.version }}" + imagePullPolicy: {{ .Values.nfimage.pullPolicy }} + volumeMounts: + - mountPath: /opt/oai-gnb/etc + name: configuration + # subPath: gnb.conf + # USRP B210/B200 Mini + {{- if eq .Values.config.usrp "b2xx"}} + - mountPath: /dev/bus/usb/ + name: usrp + {{- end}} + {{- if .Values.resources.define}} + resources: + requests: + memory: {{ .Values.resources.requests.nf.memory | quote }} + cpu: {{ .Values.resources.requests.nf.cpu | quote }} + limits: + memory: {{ .Values.resources.limits.nf.memory | quote }} + cpu: {{ .Values.resources.limits.nf.cpu | quote }} + {{- end}} + securityContext: + privileged: true + # capabilities: + # add: + # - NET_ADMIN + # drop: + # - ALL + ports: + - containerPort: 2152 + name: n3 + protocol: UDP + - containerPort: 36412 + name: n2 + 
protocol: SCTP + {{- if .Values.start.gnb}} + command: [ "/bin/bash", "-c" ] + args: + - AMF_IP_ADDRESS=$(getent hosts $AMF_HOST | awk '{print $1}'); + AMF_IP_ADDRESS=$(if [[ $AMF_IP_ADDRESS ]]; then echo $AMF_IP_ADDRESS; else echo $AMF_HOST;fi); + N2_IP_ADDRESS=$(ip -f inet addr show $N2_IF_NAME | grep -o "inet [0-9]*\.[0-9]*\.[0-9]*\.[0-9]*" | grep -o "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*"); + N3_IP_ADDRESS=$(ip -f inet addr show $N3_IF_NAME | grep -o "inet [0-9]*\.[0-9]*\.[0-9]*\.[0-9]*" | grep -o "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*"); + sed -e s/@N2_IP_ADDRESS@/$N2_IP_ADDRESS/g + -e s/@N3_IP_ADDRESS@/$N3_IP_ADDRESS/g + -e s/@AMF_IP_ADDRESS@/$AMF_IP_ADDRESS/g + /opt/oai-gnb/etc/gnb.conf | tee /tmp/gnb.conf; + {{- if eq .Values.config.usrp "b2xx"}} + /opt/oai-gnb/bin/uhd_images_downloader.py -t b2xx; + {{- else if eq .Values.config.usrp "n3xx" }} + /opt/oai-gnb/bin/uhd_images_downloader.py -t n3xx; + {{- else if eq .Values.config.usrp "x3xx" }} + /opt/oai-gnb/bin/uhd_images_downloader.py -t x3xx; + {{- end }} + exec /opt/oai-gnb/bin/nr-softmodem -O /tmp/gnb.conf $USE_ADDITIONAL_OPTIONS; + {{- else}} + command: + - /bin/sleep + - infinity + {{- end}} + env: + - name: TZ + value: {{ .Values.config.timeZone }} + - name: USE_ADDITIONAL_OPTIONS + value: {{ .Values.config.useAdditionalOptions }} + - name: AMF_HOST + value: {{ .Values.config.amfhost }} + - name: N2_IF_NAME + value: {{ .Values.config.n2IfName }} + - name: N3_IF_NAME + value: {{ .Values.config.n3IfName }} + {{- if .Values.includeTcpDumpContainer }} + - name: tcpdump + image: "{{ .Values.tcpdumpimage.repository }}:{{ .Values.tcpdumpimage.version }}" + imagePullPolicy: {{ .Values.tcpdumpimage.pullPolicy }} + {{- if .Values.resources.define}} + resources: + requests: + memory: {{ .Values.resources.requests.tcpdump.memory | quote }} + cpu: {{ .Values.resources.requests.tcpdump.cpu | quote }} + limits: + memory: {{ .Values.resources.limits.tcpdump.memory | quote }} + cpu: {{ .Values.resources.limits.tcpdump.cpu | quote 
}} + {{- end}} + securityContext: + privileged: true + capabilities: + add: + - NET_ADMIN + drop: + - ALL + {{- if .Values.start.tcpdump}} + command: + - /bin/sh + - - /usr/sbin/tcpdump -i any -w /tmp/pcap/{{ .Chart.Name }}_`date +%Y-%m-%d_%H_%M-%S-%Z`.pcap + {{- else}} + command: + - /bin/sleep + - infinity + {{- end}} + {{- end }} + volumes: + - configMap: + name: {{ .Chart.Name }}-configmap + name: configuration + {{- if eq .Values.config.usrp "b2xx"}} + - name: usrp + hostPath: + path: /dev/bus/usb/ + {{- end}} + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccountName: {{ .Values.serviceAccount.name }} + terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} + {{- if .Values.nodeSelector}} + nodeSelector: + {{- toYaml .Values.nodeSelector | nindent 12 }} + {{- end }} + {{- if .Values.nodeName}} + nodeName: {{ .Values.nodeName }} + {{- end }} +{% endraw %} +{% endif %} \ No newline at end of file diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/values.yaml.j2 new file mode 100644 index 0000000..ac994e0 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-gnb/values.yaml.j2 @@ -0,0 +1,137 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: # image name either locally present or in a public/private repository + repository: docker.io/oaisoftwarealliance/oai-gnb + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +# good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +serviceAccount: + # Specifies 
whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "oai-gnb-sa" + +#service type is fixed to clusterIP, it is only support for non multus interface (eth0) + +# These helm-charts can be used for two purpose +# 1. RFSimulated oai-gNB: Only requires 1 interface for N2 and N3. You can use Kubernetes default interface eth0 +# 2. Physical gNB with Ethernet based USRP or RRU: You need to use ruInterface to connect with RU. Make sure that the ip-address you choose can reach to RU. +# Using a dedicated interface for N2 or N3 is optional you can still use eth0. +# 3. Another thing you can chose to use same virtual interface for N2/N3 its optional to have two different + +## NOTE: To use these charts with USRP B210 you need to modify and mount usb from the host. +## Change these ip-addresses according to your environment + + +multus: + # if default gatway is empty then it will be removed + defaultGateway: "" + n2Interface: + create: true + # #name inside the pod is hardcoded right now + # name: "n2" + IPadd: "{{ network.ips.gnb.n2.ip }}" + Netmask: "{{ network.ips.gnb.n2.prefixlen }}" + # if gatway is empty then it will be removed + #Gateway: "172.21.7.254" + #routes: [{'dst': '10.8.0.0/24','gw': '172.21.7.254'}, {'dst': '10.9.0.0/24','gw': '172.21.7.254'}] + routes: {{ network.routes | default("")}} + hostInterface: {{ network.ips.gnb.n2.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + n3Interface: + create: true + IPadd: "{{ network.ips.gnb.n3.ip }}" + Netmask: "{{ network.ips.gnb.n3.prefixlen }}" + # #name inside the pod is hardcoded right now + # name: "n3" + # if gatway is empty then it will be removed + #Gateway: "172.21.11.254" + routes: {{ network.routes | default("")}} + hostInterface: {{ 
network.ips.gnb.n3.hostInterface | default("bond0") }} # Interface of the host machine on which this pod will be scheduled + ruInterface: #Only needed if using a ethernet based RU/USRP + create: false + IPadd: "192.168.80.90" + # #name inside the pod is hardcoded right now + # name: "ru" + Netmask: "24" + # if gatway is commented then it will be removed + Gateway: "192.168.80.1" #In case you don't have a gateway remove it from here + ## The value must be [0, master's MTU]. If commented it will masters MTU + #mtu: 1500 + hostInterface: "bond0" # Interface of the host machine on which this pod will be scheduled + +## If you want to change more configuration parameters then you should mount the config file +# in templates/configmap.yaml +# Example config files --> https://gitlab.eurecom.fr/oai/openairinterface5g/-/tree/develop/targets/PROJECTS/GENERIC-NR-5GC/CONF +config: + timeZone: "Europe/Paris" + useAdditionalOptions: "--sa --rfsim --log_config.global_log_options level,nocolor,time" + gnbName: "oai-gnb-rfsim" + mcc: "{{ core.mcc }}" # check the information with AMF, SMF, UPF + mnc: "{{ core.mnc }}" # check the information with AMF, SMF, UPF + tac: "{{ core.tac}}" # check the information with AMF + sst: "{{ core.sst }}" #currently only 4 standard values are allowed 1,2,3,4 + usrp: rfsim #allowed values rfsim, b2xx, n3xx or x3xx + amfhost: "{{ network.ips.amf.n2.ip }}" # amf ip-address or service-name oai-amf-svc or 172.21.6.94 + flexrichost: "oai-flexric" # DSA + n2IfName: "n2" # if multus.n2Interface.create is true then use n2 + n3IfName: "n3" #if multus.n3Interface.create is true then use n3 or you can only use 1 interface n2 or eth0 + +## Debugging section +start: + gnb: true #If false the network function container will run in sleep mode for manually testing + tcpdump: false + +includeTcpDumpContainer: false #If true it will add a tcpdump container inside network function pod for debugging + +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +## For 
openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. +## To know more about request and limit it is better to understand that how Kubernetes QoS works. +## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 2000m + memory: 2Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 200m + memory: 128Mi + requests: + nf: + cpu: 2000m + memory: 2Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + +tolerations: [] +affinity: {} + +terminationGracePeriodSeconds: 5 + +nodeSelector: {} + +nodeName: diff --git a/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue/values.yaml.j2 b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue/values.yaml.j2 new file mode 100644 index 0000000..7322c16 --- /dev/null +++ b/roles/5g/base/templates/oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue/values.yaml.j2 @@ -0,0 +1,89 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift + +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift +nfimage: + repository: docker.io/oaisoftwarealliance/oai-nr-ue # dockehub oaisoftwarealliance/oai-nr-ue + version: 2023.w49 # image tag or develop + # pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "oai-nr-ue-sa" + +# good to use when pulling images from docker-hub mention +imagePullSecrets: + - name: "regcred" + +## Change these ip-addresses according to your environment +multus: + create: false + ipadd: "172.21.6.27" # subnet should be able to reach the gNB + netmask: "22" + # if default route is not needed then leave the field empty + defaultGateway: "172.21.7.254" + hostInterface: "bond0" + +config: + timeZone: "Europe/Paris" + rfSimServer: "oai-ran" # ip-address of rfsim or service name oai-gnb or oai-du + fullImsi: "001010000000100" # make sure all the below entries are present in the subscriber database + fullKey: "fec86ba6eb707ed08905757b1bb44b8f" + opc: "C42449363BBAD02B66D16BC975D77CC1" + dnn: "oai" + sst: "1" # configure according to gnb and amf, smf and upf + sd: "16777215" + usrp: "rfsim" # allowed rfsim, b2xx, n3xx, x3xx + useAdditionalOptions: "--sa --rfsim -r 106 --numerology 1 -C 3619200000 --nokrnmod --log_config.global_log_options level,nocolor,time" + +podSecurityContext: + runAsUser: 0 + runAsGroup: 0 + +start: + nrue: true + tcpdump: false # if false then started in sleep mode else it will start capturing packets, nr ue will have a lot of packets per sec better avoid this or add a filter in the deployment.yaml + +includeTcpDumpContainer: false #If true it will add a tcpdump container inside network function pod for debugging + +## For openshift you can use rhel8/support-tools:8.7-13 +tcpdumpimage: + repository: docker.io/corfr/tcpdump + version: latest + #pullPolicy: IfNotPresent or Never or Always + pullPolicy: IfNotPresent + +## NF is the network function and tcpdump is the tcpdump container. +## To know more about request and limit it is better to understand that how Kubernetes QoS works. 
+## https://kubernetes.io/docs/concepts/configuration/manage-resources-containers +## https://kubernetes.io/docs/concepts/workloads/pods/pod-qos +resources: + define: false + limits: + nf: + cpu: 1500m + memory: 1Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 200m + memory: 128Mi + requests: + nf: + cpu: 1500m + memory: 1Gi + #If tcpdump container is disabled this value will not be used + tcpdump: + cpu: 100m + memory: 128Mi + +terminationGracePeriodSeconds: 0 + +nodeSelector: {} + +nodeName: diff --git a/roles/5g/core/tasks/main.yaml b/roles/5g/core/tasks/main.yaml new file mode 100644 index 0000000..39fe880 --- /dev/null +++ b/roles/5g/core/tasks/main.yaml @@ -0,0 +1,28 @@ +# Deploy the core +- name: Cleanup core blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.core.namespace }}' + api_version: v1 + kind: Namespace + wait: yes + state: absent + wait_condition: + type: Complete + status: "True" + when: GCN.core.cleanup is defined and GCN.core.cleanup + +- name: Create core blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.core.namespace }}' + api_version: v1 + kind: Namespace + state: present + +- name: Deploy core + kubernetes.core.helm: + name: basic + chart_ref: ./oai-cn5g-fed/charts/oai-5g-core/oai-5g-basic + release_namespace: '{{ GCN.core.namespace }}' + dependency_update: true + wait: true + force: true \ No newline at end of file diff --git a/roles/5g/flexric/tasks/build_flexric.yaml b/roles/5g/flexric/tasks/build_flexric.yaml new file mode 100644 index 0000000..767221f --- /dev/null +++ b/roles/5g/flexric/tasks/build_flexric.yaml @@ -0,0 +1,108 @@ +- block: + - name: Install development tools and libraries + include_role: + name: packages + vars: + packages: + system: + - libboost-all-dev + - libusb-1.0-0-dev + - doxygen + - python3-docutils + - python3-mako + - python3-numpy + - python3-requests + - python3-ruamel.yaml + - python3-setuptools + - python3-dev + - build-essential + - libsctp-dev + - 
tar + - m4 + - automake + - ccache + - libtool + - cmake + - cmake-curses-gui + - bison + - flex + - gdb + - libpcre2-dev + - gcc-12 + - g++-12 + - mold + - ninja-build + become: true + +- name: Check installed GCC version + shell: gcc --version | grep ^gcc | sed 's/^.* //g' + register: gcc_version + ignore_errors: true + +- name: Print GCC version + debug: + msg: "Installed GCC version is {{ gcc_version.stdout }}" + +- block: + - name: Set alternative for GCC + command: update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 + when: gcc_version.stdout is not search("12") and ansible_facts['distribution'] == 'Ubuntu' + become: true + +- name: Clone SWIG repository + ansible.builtin.git: + repo: https://github.com/swig/swig.git + dest: "swig" + version: release-4.1 + depth: 1 + +- name: Build SWIG + ansible.builtin.shell: | + ./autogen.sh + ./configure --prefix=/usr/ + make -j8 + args: + chdir: "swig" + +- block: + - name: Install SWIG + ansible.builtin.shell: make install + args: + chdir: "swig" + - name: Configure dynamic linker run-time bindings + ansible.builtin.shell: + ldconfig + args: + chdir: "swig" + become: true + + +- name: Clone FlexRIC repository + git: + repo: 'https://gitlab.eurecom.fr/mosaic5g/flexric' + dest: "flexric" + version: "dev" + +- name: Create FlexRIC build directory + ansible.builtin.file: + path: "flexric/build" + state: directory + +- name: Configure FlexRIC + command: + cmd: cmake .. 
+ chdir: "flexric/build" + +- name: Build FlexRIC + command: + cmd: make -j8 + chdir: "flexric/build" + +- block: + - name: Install FlexRIC + command: + cmd: make install + chdir: "flexric/build" + become: true + + diff --git a/roles/5g/flexric/tasks/main.yaml b/roles/5g/flexric/tasks/main.yaml new file mode 100644 index 0000000..7e71409 --- /dev/null +++ b/roles/5g/flexric/tasks/main.yaml @@ -0,0 +1,18 @@ +- name: Retrieve BP-Flexric + ansible.builtin.git: + repo: https://github.com/teo-tsou/oai-flexric.git + dest: bp-flexric + version: master + force: yes + +- name: Configure flexric + ansible.builtin.template: + src: values.yaml.j2 + dest: 'bp-flexric/oai-flexric/values.yaml' + +- name: Deploy flexRIC + kubernetes.core.helm: + name: oai-flexric + chart_ref: bp-flexric/oai-flexric + release_namespace: '{{ GCN.core.namespace }}' + wait: true \ No newline at end of file diff --git a/roles/5g/flexric/templates/values.yaml.j2 b/roles/5g/flexric/templates/values.yaml.j2 new file mode 100644 index 0000000..9e6f7f9 --- /dev/null +++ b/roles/5g/flexric/templates/values.yaml.j2 @@ -0,0 +1,37 @@ +kubernetesType: Vanilla #Vanilla for community kubernetes distribution else Openshift for Openshift +## In case of using these charts on Openshift then please use UBI images +## To know more about them follow this tutorial https://gitlab.eurecom.fr/oai/cn5g/oai-cn5g-fed/-/tree/master/openshift + +replicaCount: 1 + +containerImage: + repository: docker.io/ttsourdinis/bp-flexric + tag: new # image tag or develop + pullPolicy: IfNotPresent # pullPolicy: IfNotPresent or Never or Always + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + name: "oai-flexric" + +service: + ports: + gnb: + port: 36421 + targetPort: 36421 + protocol: SCTP + xapp: + port: 36422 + targetPort: 36422 + protocol: SCTP + +volume: # Specify the shared volume path + sharedVolume: + path: /mnt/flexric/ + 
+restartPolicy: Always +nodeSelector: {} +tolerations: [] +affinity: {} \ No newline at end of file diff --git a/roles/5g/ran/tasks/main.yaml b/roles/5g/ran/tasks/main.yaml new file mode 100644 index 0000000..a4b5e79 --- /dev/null +++ b/roles/5g/ran/tasks/main.yaml @@ -0,0 +1,65 @@ +# Deploy the RAN +- name: Cleanup RAN blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.RAN.namespace }}' + api_version: v1 + kind: Namespace + wait: yes + state: absent + wait_condition: + type: Complete + status: "True" + when: GCN.RAN.cleanup is defined and GCN.RAN.cleanup + +- name: Create RAN blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.RAN.namespace }}' + api_version: v1 + kind: Namespace + state: present + +- name: Deploy gNB + kubernetes.core.helm: + name: oai-gnb + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-gnb + release_namespace: '{{ GCN.RAN.namespace }}' + wait: true + force: true + when: not GCN.RAN.split.f1 and not GCN.RAN.split.e1 + +- block: + - name: Deploy gNB-CU + kubernetes.core.helm: + name: cu + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu + release_namespace: '{{ GCN.RAN.namespace }}' + wait: true + force: true + when: GCN.RAN.split.f1 and not GCN.RAN.split.e1 + + - block: + - name: Deploy gNB-CU-cp + kubernetes.core.helm: + name: cucp + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu-cp + release_namespace: '{{ GCN.RAN.namespace }}' + wait: true + force: true + - name: Deploy gNB-CU-up + kubernetes.core.helm: + name: cuup + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-cu-up + release_namespace: '{{ GCN.RAN.namespace }}' + wait: true + force: true + when: GCN.RAN.split.f1 and GCN.RAN.split.e1 + + - name: Deploy gNB-DU + kubernetes.core.helm: + name: du + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-du + release_namespace: '{{ GCN.RAN.namespace }}' + wait: true + force: true + when: GCN.RAN.split.f1 + when: GCN.RAN.split.f1 or GCN.RAN.split.e1 \ No newline at end of file diff --git a/roles/5g/ue/tasks/main.yaml 
b/roles/5g/ue/tasks/main.yaml new file mode 100644 index 0000000..dd26488 --- /dev/null +++ b/roles/5g/ue/tasks/main.yaml @@ -0,0 +1,28 @@ +# Deploy the UE +- name: Cleanup UE blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.UE.namespace }}' + api_version: v1 + kind: Namespace + wait: yes + state: absent + wait_condition: + type: Complete + status: "True" + when: GCN.UE.cleanup is defined and GCN.UE.cleanup + +- name: Create UE blueprint namespace + kubernetes.core.k8s: + name: '{{ GCN.UE.namespace }}' + api_version: v1 + kind: Namespace + state: present + +- name: Deploy UE + kubernetes.core.helm: + name: nrue + chart_ref: ./oai-cn5g-fed/charts/oai-5g-ran/oai-nr-ue + release_namespace: '{{ GCN.UE.namespace }}' + wait: true + force: true + diff --git a/roles/cluster_monitoring/tasks/main.yaml b/roles/cluster_monitoring/tasks/main.yaml new file mode 100644 index 0000000..38fa64c --- /dev/null +++ b/roles/cluster_monitoring/tasks/main.yaml @@ -0,0 +1,9 @@ +- name: Set parameters + ansible.builtin.include_role: + name: '{{ item }}' + loop: + - post-5g-bp-cluster-monitoring/prometheus + - post-5g-bp-cluster-monitoring/promtail + - post-5g-bp-cluster-monitoring/cadvisor + - post-5g-bp-cluster-monitoring/node-exporter + - post-5g-bp-cluster-monitoring/kube-state-metrics diff --git a/roles/common/README.md b/roles/common/README.md new file mode 100644 index 0000000..888c9e3 --- /dev/null +++ b/roles/common/README.md @@ -0,0 +1,26 @@ +# Install common software + +Install common software and tools assumed to be present by default on all the +machines. + +To get the list of installed software and packages check the content of files in +`defaults/`. + +## Depends + +## Parameters +### Variables +#### Required +#### Optionals + +* `go_version`: version of go to be installed (see https://go.dev/dl/).
Default +to `default_go_version` + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults +* `default_go_version`: 1.22.5 + +## Returns \ No newline at end of file diff --git a/roles/common/defaults/main.yaml b/roles/common/defaults/main.yaml new file mode 100644 index 0000000..4cfc97c --- /dev/null +++ b/roles/common/defaults/main.yaml @@ -0,0 +1,24 @@ +# inspired by https://stackoverflow.com/a/73096785 + https://wiki.debian.org/ArchitectureSpecificsMemo#Summary +deb_architecture: + "arm": "arm" + "aarch64": "arm64" + "x86_64": "amd64" + +ubuntu_dependencies: + - git + - conntrack + - python3-pip + - curl + - apt-transport-https + - ca-certificates + - lsb-release + - patch + - wget + - openssl + - openvpn + +python_packages: + - docker==7.1.0 + - kubernetes==30.1.0 + +default_go_version: 1.22.5 \ No newline at end of file diff --git a/roles/common/tasks/main.yaml b/roles/common/tasks/main.yaml new file mode 100644 index 0000000..e382559 --- /dev/null +++ b/roles/common/tasks/main.yaml @@ -0,0 +1,26 @@ +--- +- include_tasks: "{{ (ansible_distribution | lower) + '.yaml' }}" + +- block: + - name: Install dependencies + include_role: + name: packages + +# Install Go: download the release tarball, then replace /usr/local/go with its content + - name: Download Go + ansible.builtin.get_url: + url: https://go.dev/dl/go{{ go_version | default(default_go_version) }}.linux-{{[ansible_architecture] | map('extract', deb_architecture) | first}}.tar.gz + dest: ./go{{ go_version | default(default_go_version) }}.tar.gz # versioned file name: get_url skips the download when dest already exists, so a fixed name would keep serving a stale tarball after go_version changes + + - block: + - name: Remove previous Go installation + ansible.builtin.file: + path: /usr/local/go + state: absent + + - name: Install Go + ansible.builtin.unarchive: + src: ./go{{ go_version | default(default_go_version) }}.tar.gz + dest: /usr/local + remote_src: true # the tarball already sits on the target host; replaces the deprecated `copy: false` + become: yes \ No newline at end of file diff --git a/roles/common/tasks/ubuntu.yaml b/roles/common/tasks/ubuntu.yaml new file mode 100644 index 0000000..62706ca --- /dev/null +++ b/roles/common/tasks/ubuntu.yaml @@ -0,0 +1,21 @@ +--- +- name: OS version supported + ansible.builtin.assert: + fail_msg: Ubuntu {{ansible_distribution_version}} not
supported + that: + - ansible_distribution | lower == "ubuntu" + - ansible_distribution_version is version("20.04", '>=') + - ansible_distribution_version is version('24.04', '<=') + +- name: Create directory /etc/apt/keyrings + ansible.builtin.file: + path: /etc/apt/keyrings + state: directory + mode: '0755' + become: true + +- name: Set dependencies + set_fact: + packages: + system: "{{ lookup('vars', (ansible_distribution | lower)+ '_dependencies')}}" + python: "{{ python_packages }}" \ No newline at end of file diff --git a/roles/docker-cri/README.md b/roles/docker-cri/README.md new file mode 100644 index 0000000..8431730 --- /dev/null +++ b/roles/docker-cri/README.md @@ -0,0 +1,25 @@ +# Install cri-dockerd + +Download, build and install cri-dockerd, runs it as a service. + +## Depends + + +## Parameters +### Variables +#### Required + +#### Optionals +* `docker_cri_version`: string. cri-dockerd version to be installed (see +https://github.com/Mirantis/cri-dockerd/releases). Default to +`default_docker_cri_version`. 
+ + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults +* `default_docker_cri_version`: v0.3.14 + +## Returns \ No newline at end of file diff --git a/roles/docker-cri/defaults/main.yaml b/roles/docker-cri/defaults/main.yaml new file mode 100644 index 0000000..5627a97 --- /dev/null +++ b/roles/docker-cri/defaults/main.yaml @@ -0,0 +1 @@ +default_docker_cri_version: v0.3.14 \ No newline at end of file diff --git a/roles/docker-cri/tasks/main.yaml b/roles/docker-cri/tasks/main.yaml new file mode 100644 index 0000000..1a97641 --- /dev/null +++ b/roles/docker-cri/tasks/main.yaml @@ -0,0 +1,60 @@ +- name: Retrieve cri-dockerd (1/2) + ansible.builtin.git: + repo: https://github.com/Mirantis/cri-dockerd.git + dest: cri-dockerd + version: "{{ docker_cri_version | default(default_docker_cri_version) }}" + force: yes +- name: Retrieve cri-dockerd (2/2) + ansible.builtin.file: + path: ./cri-dockerd/bin + state: directory + +- name: Build cri-dockerd + ansible.builtin.shell: 'go build -o bin/cri-dockerd' + args: + chdir: ./cri-dockerd + environment: + GOPATH: /usr/local/go/bin + PATH: "/usr/local/go/bin:{{ ansible_env.PATH }}" + +- block: # must be root + - name: Install cri-dockerd (1/4) + ansible.builtin.file: + path: /usr/local/bin + state: directory + + - name: Install cri-dockerd (2/4) + ansible.builtin.shell: 'install -o root -g root -m 0755 bin/cri-dockerd /usr/local/bin/cri-dockerd' + args: + chdir: ./cri-dockerd + + - name: Install cri-dockerd (3/4) + ansible.builtin.shell: 'cp -a packaging/systemd/* /etc/systemd/system' + args: + chdir: ./cri-dockerd + + - name: Install cri-dockerd (4/4) + ansible.builtin.shell: "sed -i -e 's,/usr/bin/cri-dockerd,/usr/local/bin/cri-dockerd,' /etc/systemd/system/cri-docker.service" + args: + chdir: ./cri-dockerd + + - name: Enable and start cri-docker service + ansible.builtin.systemd: + name: cri-docker.service + state: restarted + enabled: yes + daemon_reload: yes + + - name: Enable and start cri-docker socket + 
ansible.builtin.systemd: + name: cri-docker.socket + state: restarted + enabled: yes + daemon_reload: yes + become: true + +- name: Wait for cri-dockerd to be ready + ansible.builtin.wait_for: + path: /var/run/cri-dockerd.sock + state: present + timeout: 30 \ No newline at end of file diff --git a/roles/docker/README.md b/roles/docker/README.md new file mode 100644 index 0000000..10e43c3 --- /dev/null +++ b/roles/docker/README.md @@ -0,0 +1,51 @@ +# Install docker + +Install docker and docker compose. Necessary kernel modules are loaded and IP +forwarding is activated. + +The current user is added to the `docker` group in order for it to have access +to docker. + +The installation allows using docker in a ramdisk environment. + +The installation allows using a specific docker data root. + +> **WARNING**: all previous docker installation and configuration files will be +> permanently removed during the installation. If docker is already installed, +> it will be stopped and uninstalled. There is no warranty that the containers +> that were running with the previous docker installation will still work. +> The content of `/etc/systemd/system/docker.service.d/` and `/etc/docker/` will +> be removed before the installation, potentially causing the definitive loss of +> data. If a docker root directory is provided, the directory will be removed +> before the installation of docker, potentially causing the definitive loss of +> data if the provided directory already existed. + +## Depends +Role: `common` + +## Parameters +### Variables +#### Required + +#### Optionals +* `docker.insecure_registries`: list of insecure registries for docker, e.g., +`"192.0.2.1:5000"` (see: https://docs.docker.com/reference/cli/dockerd/#insecure-registries). +Ignored if not defined. + +* `docker.registry_mirrors`: list of docker registry mirrors, e.g., +`"http://192.0.2.1:5001"` (see, https://docs.docker.com/reference/cli/dockerd/#daemon-configuration-file). +Ignored if not defined.
+ +* `docker.data_root`: string. Path to the directory for root of persisted +Docker data. Ignored if not defined. + +### Inventory hostvars +#### Required +#### Optionals +* `ramdisk`: bool. Define if docker is installed on a system launched in ramdisk + or not. If true the machine is using ramdisk. Otherwise, it is not. Default to + false. + +## Defaults + +## Returns diff --git a/roles/docker/defaults/main.yaml b/roles/docker/defaults/main.yaml new file mode 100644 index 0000000..8703c0f --- /dev/null +++ b/roles/docker/defaults/main.yaml @@ -0,0 +1,18 @@ +# inspired by https://stackoverflow.com/a/73096785 + https://wiki.debian.org/ArchitectureSpecificsMemo#Summary +deb_architecture: + "arm": "arm" + "aarch64": "arm64" + "x86_64": "amd64" + +docker_kernel: + modules: + - 'sctp' + - 'overlay' + - 'br_netfilter' + attributes: + - option: net.bridge.bridge-nf-call-ip6tables + value: 1 + - option: net.bridge.bridge-nf-call-iptables + value: 1 + - option: net.ipv4.ip_forward + value: 1 \ No newline at end of file diff --git a/roles/docker/meta/main.yaml b/roles/docker/meta/main.yaml new file mode 100644 index 0000000..1afe99d --- /dev/null +++ b/roles/docker/meta/main.yaml @@ -0,0 +1,2 @@ +dependencies: + - role: common \ No newline at end of file diff --git a/roles/docker/tasks/main.yaml b/roles/docker/tasks/main.yaml new file mode 100644 index 0000000..f5a2c24 --- /dev/null +++ b/roles/docker/tasks/main.yaml @@ -0,0 +1,116 @@ +# Prepare the machine +- name: Setup kernel + ansible.builtin.include_role: + name: kernel + vars: + kernel: "{{docker_kernel}}" + +# Install docker runtime +- block: # must be root + + - name: Stop docker # Make sure docker is stopped + ansible.builtin.service: + name: docker + state: stopped + ignore_errors: true + + - name: Cleanup docker data root directory + block: + - name: Remove docker data root path + ansible.builtin.file: + path: '{{ docker.data_root }}' + state: absent + - name: Create empty docker data root directory + 
ansible.builtin.file: + path: '{{ docker.data_root }}' + state: directory + when: (docker.data_root is defined) + + - name: Download Docker key + ansible.builtin.get_url: + url: https://download.docker.com/linux/ubuntu/gpg + dest: /etc/apt/keyrings/docker.asc + mode: '0644' + + - name: Add repository into sources list + ansible.builtin.apt_repository: + repo: "deb [arch={{ [ansible_architecture] | map('extract', deb_architecture) | first }} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/{{ ansible_system | lower }}/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable" + state: present + filename: docker + + - name: Install docker + include_role: + name: packages + vars: + packages: + system_remove: + - docker.io + - docker-doc + - docker-compose + - docker-compose-v2 + - podman-docker + - containerd + - runc + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + system: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose + - docker-compose-plugin + become: true + +# Configure docker +- block: # must be root + - name: Operate docker on ramdisk machines + block: + - name: Cleanup docker systemd directory + ansible.builtin.file: + path: '/etc/systemd/system/docker.service.d/' + state: absent + - name: Create docker systemd directory + ansible.builtin.file: + path: '/etc/systemd/system/docker.service.d/' + state: directory + - name: Support ramdisk + ansible.builtin.template: + src: docker.service.d.j2 + dest: '/etc/systemd/system/docker.service.d/10-ramdisk.conf' + when: hostvars[inventory_hostname]['ramdisk'] | default(false) + + - name: Custom docker systemd parameters + block: + - name: Cleanup docker directory + ansible.builtin.file: + path: '/etc/docker/' + state: absent + - name: Create docker directory + ansible.builtin.file: + path: '/etc/docker/' + state: directory + - name: Set daemon + ansible.builtin.template: + src: 
daemon.json.j2 + dest: '/etc/docker/daemon.json' + when: (docker is defined) + + - name: Restart docker # Restart with a fresh docker + ansible.builtin.service: + name: docker + state: restarted + + - name: Add '{{ ansible_user }}' to the docker group + ansible.builtin.user: + name: "{{ ansible_user }}" + groups: docker + append: yes + + - name: Reset SSH connection to allow user group changes to affect 'current login user' + ansible.builtin.meta: reset_connection + become: true + diff --git a/roles/docker/templates/daemon.json.j2 b/roles/docker/templates/daemon.json.j2 new file mode 100644 index 0000000..24183fa --- /dev/null +++ b/roles/docker/templates/daemon.json.j2 @@ -0,0 +1,10 @@ +{# Render /etc/docker/daemon.json from the `docker` variable. Only the supported options are emitted; any other key of `docker` is ignored. The options are collected in a dict and serialised once, so separators never depend on which keys are present. #} +{% set daemon_options = {} %} +{% if docker is mapping %} +{% for source_key, json_key in [('insecure_registries', 'insecure-registries'), ('registry_mirrors', 'registry-mirrors'), ('data_root', 'data-root')] %} +{% if source_key in docker %} +{% set _ = daemon_options.update({json_key: docker[source_key]}) %} +{% endif %} +{% endfor %} +{% endif %} +{{ daemon_options | tojson }} \ No newline at end of file diff --git a/roles/docker/templates/docker.service.d.j2 b/roles/docker/templates/docker.service.d.j2 new file mode 100644 index 0000000..5e59194 --- /dev/null +++ b/roles/docker/templates/docker.service.d.j2 @@ -0,0 +1,2 @@ +[Service] +Environment=DOCKER_RAMDISK=true \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/README.md b/roles/grafana-loki-prometheus/README.md new file mode 100644 index 0000000..bca42a0 --- /dev/null +++ b/roles/grafana-loki-prometheus/README.md @@ -0,0 +1,428 @@ +# Ansible Playbooks for Grafana, Prometheus, and Loki # + +This repository contains Ansible playbooks and roles to automate the deployment of Grafana, Prometheus, and Loki on multiple servers.
+ +### Directory Structure ### +``` +grafana_loki_prometheus/ +├── grafana/ +│ ├── tasks/ +│ │ └── main.yml +│ ├── templates/ +│ │ ├── grafana.list.j2 +│ │ └── grafana.ini.j2 +│ └── vars/ +│ └── main.yml +├── prometheus/ +│ ├── tasks/ +│ │ └── main.yml +│ ├── templates/ +│ │ ├── prometheus.yml.j2 +│ │ ├── prometheus.service.j2 +│ └── vars/ +│ └── main.yml +├── loki/ +│ ├── tasks/ +│ │ └── main.yml +│ ├── templates/ +│ │ ├── loki-local-config.yaml.j2 +│ │ └── loki.service.j2 +│ └── vars/ +│ └── main.yml +├── site.yml +└── hosts +``` + +## File Explanations ## +##### lpg.yml ##### + +This is the main playbook that includes all the roles needed for deploying Grafana, Prometheus, and Loki. It runs these roles on all specified hosts. + +``` +- hosts: all + become: yes + roles: + - grafana + - prometheus + - loki +``` + +##### hosts ##### + +This inventory file lists the target servers where the playbooks will be executed. It specifies the server addresses and the SSH key to use for connecting. + +``` +[servers] +server1 ansible_host=your_server_ip_1 ansible_user=your_user ansible_ssh_private_key_file=/path/to/your/private/key +server2 ansible_host=your_server_ip_2 ansible_user=your_user ansible_ssh_private_key_file=/path/to/your/private/key +```` + +#### grafana/ #### +#### grafana/tasks/main.yml #### + +This playbook installs and configures Grafana on the target servers: + * Installs dependencies. + * Adds the Grafana GPG key and repository. + * Installs Grafana. + * Configures Grafana using a template. + * Starts and enables the Grafana service. 
+``` +- name: Install dependencies + apt: + name: + - gnupg2 + - apt-transport-https + - software-properties-common + - wget + state: present + +- name: Add Grafana GPG key + get_url: + url: https://packages.grafana.com/gpg.key + dest: /tmp/grafana.key + +- name: Add Grafana APT repository + shell: | + cat /tmp/grafana.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/grafana.gpg > /dev/null + echo 'deb [signed-by=/etc/apt/trusted.gpg.d/grafana.gpg] https://packages.grafana.com/oss/deb stable main' | tee /etc/apt/sources.list.d/grafana.list + +- name: Update APT cache + apt: + update_cache: yes + +- name: Install Grafana + apt: + name: grafana + state: present + +- name: Configure Grafana + template: + src: grafana.ini.j2 + dest: /etc/grafana/grafana.ini + +- name: Start and enable Grafana service + systemd: + name: grafana-server + state: started + enabled: yes +``` + +#### grafana/templates/grafana.list.j2 #### + +This template file adds the Grafana APT repository. + +``` +deb [signed-by=/etc/apt/trusted.gpg.d/grafana.gpg] https://packages.grafana.com/oss/deb stable main +``` + +#### grafana/templates/grafana.ini.j2 #### + +This template file configures Grafana to bind to all interfaces and sets the HTTP port to 3000. + +``` +[server] +http_addr = 0.0.0.0 +http_port = 3000 +``` + +#### grafana/vars/main.yml #### + +Variables for the Grafana role. Currently empty but can be used to store configuration variables if needed. + +#### prometheus/ #### +#### prometheus/tasks/main.yml #### + +This playbook installs and configures Prometheus on the target servers: +* Creates Prometheus user and group. +* Creates necessary directories. +* Downloads and installs Prometheus. +* Sets permissions for directories. +* Installs Apache2 utils for htpasswd. +* Configures Prometheus using templates. +* Creates and configures the Prometheus systemd service. +* Starts and enables the Prometheus service. 
+ +``` +- name: Create Prometheus group and user + group: + name: prometheus + system: yes + +- name: Create Prometheus user + user: + name: prometheus + shell: /sbin/nologin + group: prometheus + system: yes + +- name: Create directories for Prometheus + file: + path: "{{ item }}" + state: directory + owner: prometheus + group: prometheus + mode: 0775 + with_items: + - /var/lib/prometheus + - /etc/prometheus + - /etc/prometheus/rules + - /etc/prometheus/rules.d + - /etc/prometheus/files_sd + +- name: Download Prometheus + shell: | + curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest | grep browser_download_url | grep linux-amd64 | cut -d '"' -f 4 | wget -qi - + tar xvf prometheus*.tar.gz + mv prometheus*/prometheus prometheus*/promtool /usr/local/bin/ + mv prometheus*/consoles prometheus*/console_libraries prometheus*/prometheus.yml /etc/prometheus/ + +- name: Set permissions for Prometheus directories + file: + path: "{{ item }}" + owner: prometheus + group: prometheus + recurse: yes + with_items: + - /etc/prometheus/rules + - /etc/prometheus/rules.d + - /etc/prometheus/files_sd + - /var/lib/prometheus + +- name: Install Apache2 utils + apt: + name: apache2-utils + state: present + +- name: Set up Prometheus basic auth + shell: htpasswd -nbB {{ prometheus_basic_auth_user }} {{ prometheus_basic_auth_password }} > /etc/prometheus/.htpasswd + +- name: Configure Prometheus + template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + +- name: Create systemd service for Prometheus + template: + src: prometheus.service.j2 + dest: /etc/systemd/system/prometheus.service + +- name: Reload systemd + systemd: + daemon_reload: yes + +- name: Start and enable Prometheus service + systemd: + name: prometheus + state: started + enabled: yes +``` + +#### prometheus/templates/prometheus.yml.j2 #### + +This template configures Prometheus, including the global settings, alerting configuration, and scrape configurations. 
It also sets the basic-authentication credentials that Prometheus uses when scraping its own endpoint. + +``` +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'gateway' + +alerting: + alertmanagers: + - static_configs: + - targets: [] + +rule_files: [] + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + basic_auth: + username: '{{ prometheus_basic_auth_user }}' + password: '{{ prometheus_basic_auth_password }}' +``` + +#### prometheus/templates/prometheus.service.j2 #### + +This template creates the systemd service file for Prometheus, specifying how to start the Prometheus service and which configuration file to use. +``` +[Unit] +Description=Prometheus +Documentation=https://prometheus.io/docs/introduction/overview/ +Wants=network-online.target +After=network-online.target + +[Service] +Type=simple +User=prometheus +Group=prometheus +ExecReload=/bin/kill -HUP $MAINPID +ExecStart=/usr/local/bin/prometheus \ + --config.file=/etc/prometheus/prometheus.yml \ + --storage.tsdb.path=/var/lib/prometheus \ + --web.console.templates=/etc/prometheus/consoles \ + --web.console.libraries=/etc/prometheus/console_libraries \ + --web.listen-address=0.0.0.0:9090 + +SyslogIdentifier=prometheus +Restart=always + +[Install] +WantedBy=multi-user.target +``` + +#### prometheus/vars/main.yml #### + +Variables for the Prometheus role. These are the basic-authentication credentials used to protect access to Prometheus; the values shown are examples and should be changed before deployment. + +``` +prometheus_basic_auth_user: 'admin' +prometheus_basic_auth_password: 'test1234' +``` + +#### loki/ #### +#### loki/tasks/main.yml #### + +This playbook installs and configures Loki on the target servers.
+ +``` +- name: Update APT cache + apt: + update_cache: yes + +- name: Download Loki + shell: | + LOKI_VERSION=$(curl -s "https://api.github.com/repos/grafana/loki/releases/latest" | grep -Po '"tag_name": "v\K[0-9.]+') + mkdir -p /opt/loki + wget -qO /opt/loki/loki.gz "https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip" + gunzip /opt/loki/loki.gz + chmod a+x /opt/loki/loki + ln -s /opt/loki/loki /usr/local/bin/loki + wget -qO /opt/loki/loki-local-config.yaml "https://raw.githubusercontent.com/grafana/loki/v${LOKI_VERSION}/cmd/loki/loki-local-config.yaml" + +- name: Configure Loki + template: + src: loki-local-config.yaml.j2 + dest: /opt/loki/loki-local-config.yaml + +- name: Create systemd service for Loki + template: + src: loki.service.j2 + dest: /etc/systemd/system/loki.service + +- name: Reload systemd + systemd: + daemon_reload: yes + +- name: Start and enable Loki service + systemd: + name: loki + state: started + enabled: yes +``` + +#### loki/templates/loki-local-config.yaml.j2 #### + +This template configures Loki, specifying the server settings, storage paths, and other configurations. + +``` +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 0.0.0.0 + path_prefix: /tmp/loki + storage: + filesystem: + chunks_directory: /tmp/loki/chunks + rules_directory: /tmp/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 + +# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration +# analytics to Grafana Labs. 
These statistics are sent to https://stats.grafana.org/ +# Statistics help us better understand how Loki is used, and they show us performance +# levels for most users. This helps us prioritize features and documentation. +# For more information on what's sent, look at +# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go +# Refer to the buildReport method to see what goes into a report. +# If you would like to disable reporting, uncomment the following lines: +#analytics: +# reporting_enabled: false +``` + +#### loki/templates/loki.service.j2 #### + +This template creates the systemd service file for Loki, specifying how to start the Loki service and which configuration file to use. + +``` +[Unit] +Description=Loki log aggregation system +After=network.target + +[Service] +ExecStart=/opt/loki/loki -config.file=/opt/loki/loki-local-config.yaml +Restart=always + +[Install] +WantedBy=multi-user.target +``` + +#### loki/vars/main.yml #### + +Variables for the Loki role. Currently empty but can be used to store configuration variables if needed. + +## How to Run the Playbooks ## + +Ensure Ansible is installed on your control machine: + +``` +sudo apt update +sudo apt install ansible -y +``` + +Prepare your inventory file (hosts) with the target servers. 
+ +```ansible-playbook -i hosts lpg.yml``` + +Verify the deployment by checking the status of the services on your target servers: + +``` +sudo systemctl status grafana-server +sudo systemctl status prometheus +sudo systemctl status loki +``` +### Contact ### +Nikos Makris - nimakris@uth.gr \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/defaults/main.yaml b/roles/grafana-loki-prometheus/grafana/defaults/main.yaml new file mode 100644 index 0000000..2cc24d7 --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/defaults/main.yaml @@ -0,0 +1 @@ +default_grafana_version: '11.0.2' \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster-monitoring.json b/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster-monitoring.json new file mode 100755 index 0000000..fb5baf4 --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster-monitoring.json @@ -0,0 +1,1443 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard provides cluster admins with the ability to monitor nodes and identify workload bottlenecks. 
It can be deployed with PSPs enabled using the following helm chart - https://github.com/pivotal-cf/charts-grafana", + "editable": false, + "fiscalYearStartMonth": 0, + "gnetId": 10000, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 34, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 4, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Cluster memory 
usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 6, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Cluster CPU usage ($interval avg)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + 
"color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 7, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_usage_bytes{id=\"/\"}) / sum (container_fs_limit_bytes{id=\"/\"}) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "title": "Cluster filesystem usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 6 + }, + "id": 9, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum 
(container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 6 + }, + "id": 10, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 6 + }, + "id": 11, + "maxDataPoints": 100, + 
"options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[$interval]))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 6 + }, + "id": 12, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 6 + }, + "id": 13, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_usage_bytes{id=\"/\"})", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 6 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + 
"textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_limit_bytes{id=\"/\"})", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 35, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Memory", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 200 + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + 
} + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "title": "Pods memory usage", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 37, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "CPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "cores", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", 
+ "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod_name)", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "title": "Pods CPU usage ($interval avg)", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 33, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Network I/O", + "type": "row" + }, + { + "aliasColors": {}, + "autoMigrateFrom": "graph", + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 26 + }, + "height": "200px", + "id": 32, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[$interval]))", + "format": 
"time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[$interval]))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "title": "Network I/O pressure", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "timeseries", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "Bps", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "autoMigrateFrom": "graph", + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod_name)", + 
"format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> {{ pod_name }}", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod_name)", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- {{ pod_name }}", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "title": "Pods network I/O ($interval avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "timeseries", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "kubernetes" + ], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 20, + "auto_min": "2m", + "current": { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + "hide": 2, + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": 
false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "Node", + "options": [], + "query": "label_values(kubernetes_io_hostname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "BP Cluster Monitoring", + "uid": "JABGX_-mz", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster_monitoring_per_node.json b/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster_monitoring_per_node.json new file mode 100755 index 0000000..6dd6ed7 --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/cluster_monitoring_per_node.json @@ -0,0 +1,2384 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" 
+ } + ] + }, + "description": "Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 315, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 33, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Network I/O pressure", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false, + "width": 200 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "title": "Network I/O pressure", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 34, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Total usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 4, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + 
"wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Cluster memory usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 6, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Cluster CPU usage (1m avg)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + 
"defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 65 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 7, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_usage_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "title": "Cluster filesystem usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 13 + }, + "id": 9, + "maxDataPoints": 100, + "options": { 
+ "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 13 + }, + "id": 10, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + 
"options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 13 + }, + "id": 11, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 13 + }, + "id": 12, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 13 + }, + "id": 13, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_usage_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + 
"y": 13 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_fs_limit_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "title": "Total", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 35, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Pods CPU usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "cores", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "title": "Pods CPU usage (1m avg)", + "type": "timeseries" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 36, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 24 + }, + "height": "", + "id": 23, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)", 
+ "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ systemd_service_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "title": "System services CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "System services CPU usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 37, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 25 + }, + "height": "", + "id": 24, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, 
pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "container_cpu", + "refId": "B", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "container_cpu", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "title": "Containers CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Containers CPU usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 38, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 3, + "editable": true, + 
"error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 20, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ id }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "title": "All processes CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "All processes CPU usage", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 39, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Pods memory usage", + "type": "row" + }, + { + "aliasColors": {}, + "autoMigrateFrom": "graph", + "bars": false, + "datasource": { + "type": "prometheus", + 
"uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "title": "Pods memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "timeseries", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 40, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 26, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, 
+ "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ systemd_service_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "title": "System services memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "System services memory usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 41, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 27, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + 
"sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (container_name, pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "container_memory_usage:sort_desc", + "refId": "B", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "title": "Containers memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": 
"short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Containers memory usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 42, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 28, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ id }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "title": "All processes memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, 
+ "refId": "A" + } + ], + "title": "All processes memory usage", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 43, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Pods network I/O", + "type": "row" + }, + { + "aliasColors": {}, + "autoMigrateFrom": "graph", + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> {{ pod_name }}", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- {{ pod_name }}", + 
"metric": "network", + "refId": "B", + "step": 10 + } + ], + "title": "Pods network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "timeseries", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 44, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 30, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> pod: {{ pod_name }} | {{ container_name }}", + "metric": "network", + "refId": "B", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate 
(container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- pod: {{ pod_name }} | {{ container_name }}", + "metric": "network", + "refId": "D", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "network", + "refId": "C", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "network", + "refId": "E", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "hide": false, + "interval": 
"10s", + "intervalFactor": 1, + "legendFormat": "<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "network", + "refId": "F", + "step": 10 + } + ], + "thresholds": [], + "title": "Containers network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "Containers network I/O", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 45, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 29, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> {{ id }}", + "metric": "network", + "refId": "A", 
+ "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "expr": "- sum (rate (container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- {{ id }}", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "title": "All processes network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ] + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "refId": "A" + } + ], + "title": "All processes network I/O", + "type": "row" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "kubernetes" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "Node", + "options": [], + "query": "label_values(kubernetes_io_hostname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "BP cluster monitoring (per node)", + "uid": "ddqmmwrqyuhvke", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/k8s_monitoring.json 
b/roles/grafana-loki-prometheus/grafana/files/dashboards/k8s_monitoring.json new file mode 100755 index 0000000..47b258c --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/k8s_monitoring.json @@ -0,0 +1,2199 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "to monitor pod cpu, memory, I/O, RX/TX and cluster cpu, memory request/limit/real usage, RX/TX, Disk I/O ", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 18283, + "graphTooltip": 0, + "id": 5, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 35, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(1-rate(node_cpu_seconds_total{mode=\"idle\", job=\"node-exporter\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real", + "range": true, + 
"refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{unit=\"core\"}) / sum(machine_cpu_cores)", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{unit=\"core\"}) / sum(machine_cpu_cores)", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + } + ], + "title": "Global CPU Usage", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 37, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\"} - node_memory_MemAvailable_bytes{job=\"node-exporter\"}) / sum(node_memory_MemTotal_bytes{job=\"node-exporter\"})\n", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + 
"expr": "sum(kube_pod_container_resource_requests{unit=\"byte\"}) / sum(machine_memory_bytes)", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{unit=\"byte\"}) / sum(machine_memory_bytes)", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + } + ], + "title": "Global RAM Usage", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 0 + }, + "id": 39, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "exemplar": true, + "expr": "count(up{job=\"node-exporter\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 14, + "y": 0 + }, + "id": 24, + "options": { + "displayLabels": [ + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", 
+ "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "count by(namespace) (rate(kube_pod_container_status_running{namespace=~\"$namespace\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Running Pods", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 4 + }, + "id": 41, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "expr": "count(kube_namespace_created)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 10, + "options": { + 
"colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(1-rate(node_cpu_seconds_total{mode=\"idle\", job=\"node-exporter\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{unit=\"core\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{unit=\"core\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores)", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "D" + } + ], + "title": "CPU Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 12, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + 
"calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\"} - node_memory_MemAvailable_bytes{job=\"node-exporter\"})", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{unit=\"byte\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{unit=\"byte\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes)", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "D" + } + ], + "title": "RAM Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 8 + }, + "id": 43, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + 
"pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "expr": "sum(kube_pod_status_phase{phase='Running'})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 14, + "y": 8 + }, + "id": 2, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "avg(1-rate(node_cpu_seconds_total{mode=\"idle\", job=\"node-exporter\"}[$__rate_interval]))\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Cluster CPU Utilization", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 19, + "y": 8 + }, + "id": 4, + 
"options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\"} - node_memory_MemAvailable_bytes{job=\"node-exporter\"}) / sum(node_memory_MemTotal_bytes{job=\"node-exporter\"})\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Cluster Memory Utilization", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, 
+ "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "sum by(id) (rate(container_network_receive_bytes_total{instance=~\".+\", id=\"/\", interface=~\".+\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "sum by(id) (rate(container_network_transmit_bytes_total{instance=~\".+\", id=\"/\", interface=~\".+\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Total Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes per second", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": 
"right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "sum by(device) (rate(node_disk_read_bytes_total{device=\"sda\", instance=~\".+\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum by(device) (rate(node_disk_written_bytes_total{device=\"sda\", instance=~\".+\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 6, + "panels": [], + "title": "Row title", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 16, + "options": { + 
"legend": { + "calcs": [ + "min", + "max", + "mean", + "sum" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean", + "sum" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": 
"sum by(namespace) (rate(container_memory_working_set_bytes{namespace=~\"$namespace\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Memory Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "code", + "expr": "sum(rate(coredns_forward_requests_total[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CoreDNS - Total Forward Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 48, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "exemplar": true, + "expr": "sum(coredns_cache_entries) by (type)", + "interval": "", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "title": "CoreDNS - Cache Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "sum by(interface) (rate(container_network_receive_bytes_total{id=\"/\", interface=~\"$netface\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod Network Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 24, 
+ "x": 0, + "y": 48 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "avg by(pod) (rate(container_fs_writes_total{id=~\".+\", device=~\".+\", pod=~\"$pod\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod Read/Write Iops", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Cores", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "max by(pod) 
(kube_pod_container_resource_limits{pod=~\".+\", resource=\"cpu\", unit=\"core\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod CPU Limits", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 67 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "max by(pod) (kube_pod_container_resource_limits{pod=~\".+\", resource=\"memory\", unit=\"byte\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod Memory Limits", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 76 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "max by(pod) (kube_pod_container_resource_requests{pod=~\".+\", resource=\"cpu\", unit=\"core\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod CPU Request", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-yellow", + "mode": "palette-classic", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + 
"viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "mytbedge-757d7fb6c7-n95sm" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 84 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "editorMode": "builder", + "expr": "max by(pod) (kube_pod_container_resource_requests{pod=~\".+\", resource=\"memory\", unit=\"byte\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pod Memory Requests", + "type": "timeseries" + } + ], + "refresh": "auto", + "schemaVersion": 39, + "tags": [ + "kubernetes", + "prometheus", + "cadvisor" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "definition": "label_values(kube_pod_info, namespace)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "namespace", + "options": [], + "query": { + 
"query": "label_values(kube_pod_info, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "definition": "label_values(kube_pod_info, pod)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(kube_pod_info, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "prometheus", + "value": "cdqmmombh8vswd" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "bdp9e4eexwp34c" + }, + "definition": "container_network_transmit_bytes_total{kubernetes_io_os=\"linux\"}", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "netface", + "options": [], + "query": { + "query": "container_network_transmit_bytes_total{kubernetes_io_os=\"linux\"}", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.*interface=\"([^\"]*).*/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "Asia/Yangon", + "title": "Kubernetes Dashboard 2", + "uid": "e1RXnCbVs", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/loki_general.json b/roles/grafana-loki-prometheus/grafana/files/dashboards/loki_general.json new file mode 100755 index 
0000000..8312216 --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/loki_general.json @@ -0,0 +1,284 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Loki dashboard with quick search and timeline.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 13186, + "graphTooltip": 0, + "id": 8, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "expr": "sum(count_over_time({namespace=\"$namespace\", app=\"$workload\", instance=~\"$pod\"} |~ 
\"$search\"[$__interval]))", + "refId": "A" + } + ], + "title": "Timeline", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 25, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "expr": "{namespace=\"$namespace\", app=\"$workload\", instance=~\"$pod\"} |~ \"$search\"", + "refId": "A" + } + ], + "title": "Logs", + "type": "logs" + } + ], + "refresh": false, + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(kube_pod_info, namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(mixin_pod_workload{namespace=\"$namespace\"}, workload)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "workload", + "options": [], + "query": "label_values(mixin_pod_workload{namespace=\"$namespace\"}, workload)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": 
{ + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(mixin_pod_workload{namespace=\"$namespace\", workload=\"$workload\"}, pod)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(mixin_pod_workload{namespace=\"$namespace\", workload=\"$workload\"}, pod)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "name": "search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Loki Dashboard", + "uid": "liz0yRCZz", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/node_exporter.json b/roles/grafana-loki-prometheus/grafana/files/dashboards/node_exporter.json new file mode 100755 index 0000000..14f1607 --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/node_exporter.json @@ -0,0 +1,1222 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 3, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 
+ }, + "id": 10, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "CPU", + "type": "row" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "(\n (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))\n/ ignoring(cpu) group_left\n count without (cpu, mode) (node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n", + "format": "time_series", + "intervalFactor": 5, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": 
"timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "1m load average", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5m load average", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "15m load average", + 
"refId": "C" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "logical cores", + "refId": "D" + } + ], + "title": "Load Average", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 11, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Memory", + "type": "row" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 18, + "x": 0, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": 
"(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": 
"", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) /\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"})\n* 100\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "gauge" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 12, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Disk", + "type": "row" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/ io time/" + }, + "properties": [ + { + "id": "unit", + "value": 
"percentunit" + }, + { + "id": "custom.axisPlacement", + "value": "right" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} read", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} written", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} io time", + "refId": "C" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "decbytes" + }, + "overrides": 
[ + { + "matcher": { + "id": "byName", + "options": "Mounted on" + }, + "properties": [ + { + "id": "custom.width", + "value": 260 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Size" + }, + "properties": [ + { + "id": "custom.width", + "value": 93 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "custom.width", + "value": 72 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Available" + }, + "properties": [ + { + "id": "custom.width", + "value": 88 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used, %" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 1 + }, + { + "id": "min", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B" + } + ], + "title": "Disk Space Usage", + "transformations": [ + { + "id": "groupBy", + "options": { + "fields": { + "Value #A": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + 
"Value #B": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "mountpoint": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "calculateField", + "options": { + "alias": "Used", + "binary": { + "left": "Value #A (lastNotNull)", + "operator": "-", + "reducer": "sum", + "right": "Value #B (lastNotNull)" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "Used, %", + "binary": { + "left": "Used", + "operator": "/", + "reducer": "sum", + "right": "Value #A (lastNotNull)" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": {}, + "renameByName": { + "Value #A (lastNotNull)": "Size", + "Value #B (lastNotNull)": "Available", + "mountpoint": "Mounted on" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Mounted on" + } + ] + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 13, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Network received (bits/s)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "title": "Network Received", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Network transmitted (bits/s)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, 
+ "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "title": "Network Transmitted", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "node-exporter-mixin" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "uth-cluster1", + "value": "uth-cluster1" + }, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "", + "hide": 2, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": "label_values(node_uname_info{job=\"node-exporter\", sysname!=\"Darwin\"}, cluster)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "10.64.45.224:9100", + "value": "10.64.45.224:9100" + }, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Instance", + "multi": false, + "name": 
"instance", + "options": [], + "query": "label_values(node_uname_info{job=\"node-exporter\", cluster=\"$cluster\", sysname!=\"Darwin\"}, instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Node Exporter / Nodes", + "uid": "fdpcb45jhimm8d", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/files/dashboards/oai_monitoring.json b/roles/grafana-loki-prometheus/grafana/files/dashboards/oai_monitoring.json new file mode 100755 index 0000000..b88decf --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/files/dashboards/oai_monitoring.json @@ -0,0 +1,552 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Loki dashboard with quick search and timeline.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 13186, + "graphTooltip": 0, + "id": 9, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "expr": "sum(count_over_time({namespace=\"$namespace\", app=\"$workload\", instance=~\"$pod\"} |~ \"$search\"[$__interval]))", + "refId": "A" + } + ], + "title": "Timeline", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 2, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-upf\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "UPF Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 7, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + 
"showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-amf\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "AMF Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 8, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-smf\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "SMF Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 9, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-udr\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "UDR Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 10, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + 
"prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-ausf\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "AUSF Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 11, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-nrf\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "NRF Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 12, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"oai-udm\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "UDM Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 13, + "maxDataPoints": "", + "options": { + "dedupStrategy": 
"none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cdqn4n5sjeqdcf" + }, + "editorMode": "code", + "expr": "{app_kubernetes_io_name=\"basic-mysql\",namespace=\"blueprint\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "MySQL Logs", + "type": "logs" + } + ], + "refresh": false, + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(kube_pod_info, namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(mixin_pod_workload{namespace=\"$namespace\"}, workload)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "workload", + "options": [], + "query": "label_values(mixin_pod_workload{namespace=\"$namespace\"}, workload)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "cdqmmombh8vswd" + }, + "definition": "label_values(mixin_pod_workload{namespace=\"$namespace\", workload=\"$workload\"}, pod)", + "hide": 0, + 
"includeAll": true, + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(mixin_pod_workload{namespace=\"$namespace\", workload=\"$workload\"}, pod)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "name": "search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "OAI Logs Dashboard", + "uid": "ddqnhm5avauioa", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/tasks/main.yml b/roles/grafana-loki-prometheus/grafana/tasks/main.yml new file mode 100644 index 0000000..83211bb --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/tasks/main.yml @@ -0,0 +1,107 @@ +- name: Set parameters + ansible.builtin.set_fact: + _grafana_version: "{{ default_grafana_version }}" # https://api.github.com/repos/prometheus/prometheus/releases + +- name: Create directories for Grafana + ansible.builtin.file: + path: grafana + state: directory + mode: 0700 + +- name: Configure Grafana + template: + src: grafana.ini.j2 + dest: '{{ ansible_env.HOME }}/grafana/grafana.ini' + +- name: Copy dashboard files + ansible.builtin.copy: + src: dashboards + dest: "{{ ansible_env.HOME }}/grafana/" + +- name: Deploy Grafana + docker_container: + name: grafana-server + image: "grafana/grafana-enterprise:{{_grafana_version}}" + restart: true + state: started + restart_policy: always + ports: + - "3000:3000" + networks: + - name: host + volumes: + - "{{ ansible_env.HOME }}/grafana/grafana.ini:/etc/grafana/grafana.ini" + - 
"{{ ansible_env.HOME }}/grafana/dashboards:/tmp/dashboards" + +- name: Wait for grafana to be ready + ansible.builtin.uri: + url: http://localhost:3000/api/health + method: GET + status_code: 200 + register: grafana_wait + retries: 10 + delay: 5 + until: grafana_wait is succeeded + +- name: Reset Grafana admin password + community.docker.docker_container_exec: + container: grafana-server + command: "grafana-cli admin reset-admin-password {{ grafana_password }}" + +- name: Load data sources from JSON file + set_fact: + datasources: "{{ lookup('file', '{{ playbook_dir }}/roles/grafana-loki-prometheus/grafana/templates/datasources2.json') | from_json }}" + +- name: Ensure Grafana data sources are present + community.grafana.grafana_datasource: + url: http://localhost:3000 + url_username: admin + url_password: "{{ grafana_password }}" + user: admin + password: "{{ grafana_password }}" + validate_certs: no + state: present + name: "{{ item.name }}" + ds_type: "{{ item.type }}" + ds_url: "{{ item.url }}" + access: "{{ item.access }}" + is_default: "{{ item.isDefault | default(false) }}" + additional_json_data: "{{ item.jsonData | default({}) }}" + loop: "{{ datasources }}" + loop_control: + label: "{{ item.name }}" + register: datasource_result + +# - name: Debug datasource result +# debug: +# var: datasource_result + +- name: Find JSON files in the dashboard directory + ansible.builtin.find: + paths: "{{ ansible_env.HOME }}/grafana/dashboards/" + patterns: "*.json" + recurse: yes + register: dashboard_files + +# - name: Debug found dashboard files +# debug: +# var: dashboard_files.files + +- name: Ensure Grafana dashboards are present + community.grafana.grafana_dashboard: + grafana_url: http://localhost:3000 + grafana_user: admin + grafana_password: "{{ grafana_password }}" + validate_certs: no + state: present + folder: "General" + overwrite: true + path: "{{ item.path }}" + loop: "{{ dashboard_files.files }}" + loop_control: + label: "{{ item.path }}" + register: 
dashboard_result + +# - name: Debug dashboard result +# debug: +# var: dashboard_result diff --git a/roles/grafana-loki-prometheus/grafana/templates/datasources.json b/roles/grafana-loki-prometheus/grafana/templates/datasources.json new file mode 100644 index 0000000..320e5ea --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/templates/datasources.json @@ -0,0 +1,22 @@ +[ + { + "name": "Prometheus", + "type": "prometheus", + "url": "http://localhost:9090", + "access": "proxy", + "isDefault": true, + "basicAuth": false, + "jsonData": { + "httpMethod": "POST" + } + }, + { + "name": "Loki", + "type": "loki", + "url": "http://localhost:3100", + "access": "proxy", + "isDefault": false, + "basicAuth": false, + "jsonData": {} + } +] diff --git a/roles/grafana-loki-prometheus/grafana/templates/datasources2.json b/roles/grafana-loki-prometheus/grafana/templates/datasources2.json new file mode 100644 index 0000000..c29aeed --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/templates/datasources2.json @@ -0,0 +1 @@ +[{"id":2,"uid":"cdqn4n5sjeqdcf","orgId":1,"name":"loki","type":"loki","typeName":"Loki","typeLogoUrl":"public/app/plugins/datasource/loki/img/loki_icon.svg","access":"proxy","url":"http://localhost:3100","user":"","database":"","basicAuth":false,"isDefault":false,"jsonData":{},"readOnly":false},{"id":1,"uid":"cdqmmombh8vswd","orgId":1,"name":"prometheus","type":"prometheus","typeName":"Prometheus","typeLogoUrl":"public/app/plugins/datasource/prometheus/img/prometheus_logo.svg","access":"proxy","url":"http://localhost:9090","user":"","database":"","basicAuth":false,"isDefault":true,"jsonData":{"httpMethod":"POST"},"readOnly":false} ] \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/grafana/templates/grafana.ini.j2 b/roles/grafana-loki-prometheus/grafana/templates/grafana.ini.j2 new file mode 100644 index 0000000..8d1aebe --- /dev/null +++ b/roles/grafana-loki-prometheus/grafana/templates/grafana.ini.j2 @@ -0,0 +1,1641 @@ 
+##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +;app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +;instance_name = ${HOSTNAME} + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +;data = /var/lib/grafana + +# Temporary files in `data` directory older than given duration will be removed +;temp_data_lifetime = 24h + +# Directory where grafana can store logs +;logs = /var/log/grafana + +# Directory where grafana will automatically scan and look for plugins +;plugins = /var/lib/grafana/plugins + +# folder that contains provisioning config files that grafana will apply on startup and while running. +;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, h2, socket) +;protocol = http + +# This is the minimum TLS version allowed. By default, this value is empty. Accepted values are: TLS1.2, TLS1.3. If nothing is set TLS1.2 would be taken +;min_tls_version = "" + +# The ip address to bind to, empty will bind to all interfaces +http_addr = 0.0.0.0 + +# The http port to use +http_port = 3000 + +# The public facing domain name used to access grafana from a browser +;domain = grafana.example.com + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +;root_url = %(protocol)s://%(domain)s:%(http_port)s/ + +# Serve Grafana from subpath specified in `root_url` setting. 
By default it is set to `false` for compatibility reasons. +;serve_from_sub_path = false + +# Log web requests +;router_logging = false + +# the path to the static front-end files, relative to the working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +;cert_file = +;cert_key = + +# Certificates file watch interval +;certs_watch_interval = + +# Unix socket gid +# Changing the gid of a file without privileges requires that the target group is in the group of the process and that the process is the file owner +# It is recommended to set the gid as http server user gid +# Not set when the value is -1 +;socket_gid = + +# Unix socket mode +;socket_mode = + +# Unix socket path +;socket = + +# CDN Url +;cdn_url = + +# Sets the maximum time using a duration format (5s/5m/5ms) before timing out read of an incoming request and closing idle connections. +# `0` means there is no timeout for reading the request. +;read_timeout = 0 + +# This setting enables you to specify additional headers that the server adds to HTTP(S) responses. +[server.custom_response_headers] +#exampleHeader1 = exampleValue1 +#exampleHeader2 = exampleValue2 + +#################################### GRPC Server ######################### +;[grpc_server] +;network = "tcp" +;address = "127.0.0.1:10000" +;use_tls = false +;cert_file = +;key_file = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as one string using the url property. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes.
Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres", use either "disable", "require" or "verify-full" +# For "mysql", use either "true", "false", or "skip-verify". +;ssl_mode = disable + +# For "postgres", use either "1" to enable or "0" to disable SNI +;ssl_sni = + +# Database drivers may support different transaction isolation levels. +# Currently, only "mysql" driver supports isolation levels. +# If the value is empty - driver's default isolation level is applied. +# For "mysql" use "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ" or "SERIALIZABLE". +;isolation_level = + +;ca_cert_path = +;client_key_path = +;client_cert_path = +;server_cert_name = + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (means not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +;log_queries = + +# For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) +;cache_mode = private + +# For "sqlite3" only. Enable/disable Write-Ahead Logging, https://sqlite.org/wal.html. Default is false. +;wal = false + +# For "mysql" and "postgres" only. Lock the database for the migrations, default is true. +;migration_locking = true + +# For "mysql" and "postgres" only. How many seconds to wait before failing to lock the database for the migrations, default is 0. +;locking_attempt_timeout_sec = 0 + +# For "sqlite" only. How many times to retry query in case of database is locked failures. Default is 0 (disabled). +;query_retries = 0 + +# For "sqlite" only. How many times to retry transaction in case of database is locked failures. Default is 5.
+;transaction_retries = 5 + +# Set to true to add metrics and tracing for database queries. +;instrument_queries = false + +################################### Data sources ######################### +[datasources] +# Upper limit of data sources that Grafana will return. This limit is a temporary configuration and it will be deprecated when pagination will be introduced on the list data sources API. +;datasource_limit = 5000 + +#################################### Cache server ############################# +[remote_cache] +# Either "redis", "memcached" or "database" default is "database" +;type = database + +# cache connectionstring options +# database: will use Grafana primary database. +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. +# memcache: 127.0.0.1:11211 +;connstr = + +# prefix prepended to all the keys in the remote cache +; prefix = + +# This enables encryption of values stored in the remote cache +;encryption = + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +# How long the data proxy waits to read the headers of the response before timing out, default is 30 seconds. +# This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. +;timeout = 30 + +# How long the data proxy waits to establish a TCP connection before timing out, default is 10 seconds. +;dialTimeout = 10 + +# How many seconds the data proxy waits before sending a keepalive probe request. +;keep_alive_seconds = 30 + +# How many seconds the data proxy waits for a successful TLS Handshake before timing out. 
+;tls_handshake_timeout_seconds = 10 + +# How many seconds the data proxy will wait for a server's first response headers after +# fully writing the request headers if the request has an "Expect: 100-continue" +# header. A value of 0 will result in the body being sent immediately, without +# waiting for the server to approve. +;expect_continue_timeout_seconds = 1 + +# Optionally limits the total number of connections per host, including connections in the dialing, +# active, and idle states. On limit violation, dials will block. +# A value of zero (0) means no limit. +;max_conns_per_host = 0 + +# The maximum number of idle connections that Grafana will keep alive. +;max_idle_connections = 100 + +# How many seconds the data proxy keeps an idle connection open before timing out. +;idle_conn_timeout_seconds = 90 + +# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. +;send_user_header = false + +# Limit the amount of bytes that will be read/accepted from responses of outgoing HTTP requests. +;response_limit = 0 + +# Limits the number of rows that Grafana will process from SQL data sources. +;row_limit = 1000000 + +# Sets a custom value for the `User-Agent` header for outgoing data proxy requests. If empty, the default value is `Grafana/<BuildVersion>` (for example `Grafana/9.0.0`). +;user_agent = + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +;reporting_enabled = true + +# The name of the distributor of the Grafana instance. 
Ex hosted-grafana, grafana-labs +;reporting_distributor = grafana-labs + +# Set to false to disable all checks to https://grafana.com +# for new versions of grafana. The check is used +# in some UI views to notify that a grafana update exists. +# This option does not cause any auto updates, nor send any information +# only a GET request to https://grafana.com/api/grafana/versions/stable to get the latest version. +;check_for_updates = true + +# Set to false to disable all checks to https://grafana.com +# for new versions of plugins. The check is used +# in some UI views to notify that a plugin update exists. +# This option does not cause any auto updates, nor send any information +# only a GET request to https://grafana.com to get the latest versions. +;check_for_plugin_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +# Google Analytics 4 tracking code, only enabled if you specify an id here +;google_analytics_4_id = + +# When Google Analytics 4 Enhanced event measurement is enabled, we will try to avoid sending duplicate events and let Google Analytics 4 detect navigation changes, etc. +;google_analytics_4_send_manual_page_views = false + +# Google Tag Manager ID, only enabled if you specify an id here +;google_tag_manager_id = + +# Rudderstack write key, enabled only if rudderstack_data_plane_url is also set +;rudderstack_write_key = + +# Rudderstack data plane url, enabled only if rudderstack_write_key is also set +;rudderstack_data_plane_url = + +# Rudderstack SDK url, optional, only valid if rudderstack_write_key and rudderstack_data_plane_url is also set +;rudderstack_sdk_url = + +# Rudderstack Config url, optional, used by Rudderstack SDK to fetch source config +;rudderstack_config_url = + +# Rudderstack Integrations URL, optional. 
Only valid if you pass the SDK version 1.1 or higher +;rudderstack_integrations_url = + +# Intercom secret, optional, used to hash user_id before passing to Intercom via Rudderstack +;intercom_secret = + +# Controls if the UI contains any links to user feedback forms +;feedback_links_enabled = true + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +;disable_initial_admin_creation = false + +# default admin user, created on startup +;admin_user = admin + +# default admin password, can be changed before first start of grafana, or in profile settings +;admin_password = admin + +# default admin email, created on startup +;admin_email = admin@localhost + +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# current key provider used for envelope encryption, default to static value specified by secret_key +;encryption_provider = secretKey.v1 + +# list of configured key providers, space separated (Enterprise only): e.g., awskms.v1 azurekv.v1 +;available_encryption_providers = + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +# set to true if you host Grafana behind HTTPS. default is false. +;cookie_secure = false + +# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" +;cookie_samesite = lax + +# set to true if you want to allow browsers to render Grafana in a <frame>, <iframe>, <embed> or <object>. default is false. +;allow_embedding = false + +# Set to true if you want to enable http strict transport security (HSTS) response header. +# HSTS tells browsers that the site should only be accessed using HTTPS. 
+;strict_transport_security = false + +# Sets how long a browser should cache HSTS. Only applied if strict_transport_security is enabled. +;strict_transport_security_max_age_seconds = 86400 + +# Set to true to enable the HSTS preloading option. Only applied if strict_transport_security is enabled. +;strict_transport_security_preload = false + +# Set to true to enable the HSTS includeSubDomains option. Only applied if strict_transport_security is enabled. +;strict_transport_security_subdomains = false + +# Set to true to enable the X-Content-Type-Options response header. +# The X-Content-Type-Options response HTTP header is a marker used by the server to indicate that the MIME types advertised +# in the Content-Type headers should be followed and not changed. +;x_content_type_options = true + +# Set to true to enable the X-XSS-Protection header, which tells browsers to stop pages from loading +# when they detect reflected cross-site scripting (XSS) attacks. +;x_xss_protection = true + +# Enable adding the Content-Security-Policy header to your requests. +# CSP lets you control which resources the user agent is allowed to load and helps prevent XSS attacks. +;content_security_policy = false + +# Set Content Security Policy template used when adding the Content-Security-Policy header to your requests. +# $NONCE in the template includes a random nonce. +# $ROOT_PATH is server.root_url without the protocol. +;content_security_policy_template = """script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';""" + +# Enable adding the Content-Security-Policy-Report-Only header to your requests. +# Allows you to monitor the effects of a policy without enforcing it. 
+;content_security_policy_report_only = false + +# Set Content Security Policy Report Only template used when adding the Content-Security-Policy-Report-Only header to your requests. +# $NONCE in the template includes a random nonce. +# $ROOT_PATH is server.root_url without the protocol. +;content_security_policy_report_only_template = """script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';""" +# Controls if old angular plugins are supported or not. +;angular_support_enabled = false + +# List of additional allowed URLs to pass by the CSRF check, separated by spaces. Suggested when authentication comes from an IdP. +;csrf_trusted_origins = example.com + +# List of allowed headers to be set by the user, separated by spaces. Suggested to use for if authentication lives behind reverse proxies. +;csrf_additional_headers = + +# The CSRF check will be executed even if the request has no login cookie. +;csrf_always_check = false + +# Comma-separated list of plugins ids that won't be loaded inside the frontend sandbox +;disable_frontend_sandbox_for_plugins = + +[security.encryption] +# Defines the time-to-live (TTL) for decrypted data encryption keys stored in memory (cache). +# Please note that small values may cause performance issues due to a high frequency decryption operations. +;data_keys_cache_ttl = 15m + +# Defines the frequency of data encryption keys cache cleanup interval. +# On every interval, decrypted data encryption keys that reached the TTL are removed from the cache. 
+;data_keys_cache_cleanup_interval = 1m + +#################################### Snapshots ########################### +[snapshots] +# set to false to remove snapshot functionality +;enabled = true + +# snapshot sharing options +;external_enabled = true +;external_snapshot_url = https://snapshots.raintank.io +;external_snapshot_name = Publish to snapshots.raintank.io + +# Set to true to enable this Grafana instance to act as an external snapshot server and allow unauthenticated requests for +# creating and deleting snapshots. +;public_mode = false + +# remove expired snapshot +;snapshot_remove_expired = true + +#################################### Dashboards History ################## +[dashboards] +# Number of dashboard versions to keep (per dashboard). Default: 20, Minimum: 1 +;versions_to_keep = 20 + +# Minimum dashboard refresh interval. When set, this prevents users from setting a dashboard refresh interval lower than the given interval. The default is 5 seconds. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;min_refresh_interval = 5s + +# Path to the default home dashboard. 
If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json" +;default_home_dashboard_path = + +#################################### Users ############################### +[users] +# disable user signup / registration +;allow_sign_up = true + +# Allow non admin users to create organizations +;allow_org_create = true + +# Set to true to automatically assign new users to the default organization (id 1) +;auto_assign_org = true + +# Set this value to automatically add new users to the provided organization (if auto_assign_org above is set to true) +;auto_assign_org_id = 1 + +# Default role new users will be automatically assigned +;auto_assign_org_role = Viewer + +# Require email validation before sign up completes +;verify_email_enabled = false + +# Redirect to default OrgId after login +;login_default_org_id = + +# Background text for the user field on the login page +;login_hint = email or username +;password_hint = password + +# Default UI theme ("dark" or "light") +;default_theme = dark + +# Default UI language (supported IETF language tag, such as en-US) +;default_language = en-US + +# Path to a custom home page. Users are only redirected to this if the default home dashboard is used. It should match a frontend route and contain a leading slash. +;home_page = + +# External user management, these options affect the organization users view +;external_manage_link_url = +;external_manage_link_name = +;external_manage_info = + +# Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard. +;viewers_can_edit = false + +# Editors can administrate dashboard, folders and teams they create +;editors_can_admin = false + +# The duration in time a user invitation remains valid before expiring. This setting should be expressed as a duration. Examples: 6h (hours), 2d (days), 1w (week). Default is 24h (24 hours). The minimum supported duration is 15m (15 minutes). 
+;user_invite_max_lifetime_duration = 24h + +# The duration in time a verification email, used to update the email address of a user, remains valid before expiring. This setting should be expressed as a duration. Examples: 6h (hours), 2d (days), 1w (week). Default is 1h (1 hour). +;verification_email_max_lifetime_duration = 1h + +# Enter a comma-separated list of user logins to hide them in the Grafana UI. These users are shown to Grafana admins and themselves. +; hidden_users = + +[secretscan] +# Enable secretscan feature +;enabled = false + +# Interval to check for token leaks +;interval = 5m + +# base URL of the grafana token leak check service +;base_url = https://secret-scanning.grafana.net + +# URL to send outgoing webhooks to in case of detection +;oncall_url = + +# Whether to revoke the token if a leak is detected or just send a notification +;revoke = true + +[service_accounts] +# Service account maximum expiration date in days. +# When set, Grafana will not allow the creation of tokens with expiry greater than this setting. +; token_expiration_day_limit = + +[auth] +basic_auth_enabled = true + +# Login cookie name +;login_cookie_name = grafana_session + +# Disable usage of Grafana built-in login solution. +;disable_login = false + +# The maximum lifetime (duration) an authenticated user can be inactive before being required to login at next visit. Default is 7 days (7d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). The lifetime resets at each successful token rotation. +;login_maximum_inactive_lifetime_duration = + +# The maximum lifetime (duration) an authenticated user can be logged in since login time before being required to login. Default is 30 days (30d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). 
+;login_maximum_lifetime_duration = + +# How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes. +;token_rotation_interval_minutes = 10 + +# Set to true to disable (hide) the login form, useful if you use OAuth, defaults to false +;disable_login_form = false + +# Set to true to disable the sign out link in the side menu. Useful if you use auth.proxy or auth.jwt, defaults to false +;disable_signout_menu = false + +# URL to redirect the user to after sign out +;signout_redirect_url = + +# Set to true to attempt login with OAuth automatically, skipping the login screen. +# This setting is ignored if multiple OAuth providers are configured. +# Deprecated, use auto_login option for specific provider instead. +;oauth_auto_login = false + +# OAuth state max age cookie duration in seconds. Defaults to 600 seconds. +;oauth_state_cookie_max_age = 600 + +# limit of api_key seconds to live before expiration +;api_key_max_seconds_to_live = -1 + +# Set to true to enable SigV4 authentication option for HTTP-based datasources. +;sigv4_auth_enabled = false + +# Set to true to enable verbose logging of SigV4 request signing +;sigv4_verbose_logging = false + +# Set to true to enable Azure authentication option for HTTP-based datasources. +;azure_auth_enabled = false + +# Set to skip the organization role from JWT login and use system's role assignment instead. +; skip_org_role_sync = false + +# Use email lookup in addition to the unique ID provided by the IdP +;oauth_allow_insecure_email_lookup = false + +# Set to true to include id of identity as a response header +;id_response_header_enabled = false + +# Prefix used for the id response header, X-Grafana-Identity-Id +;id_response_header_prefix = X-Grafana + +# List of identity namespaces to add id response headers for, separated by space. +# Available namespaces are user, api-key and service-account. 
+# The header value will encode the namespace ("user:<id>", "api-key:<id>", "service-account:<id>") +;id_response_header_namespaces = user api-key service-account + +#################################### Anonymous Auth ###################### +[auth.anonymous] +# enable anonymous access +;enabled = false + +# specify organization name that should be used for unauthenticated users +;org_name = Main Org. + +# specify role for unauthenticated users +;org_role = Viewer + +# mask the Grafana version number for unauthenticated users +;hide_version = false + +#################################### GitHub Auth ########################## +[auth.github] +;name = GitHub +;icon = github +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_id +;client_secret = some_secret +;scopes = user:email,read:org +;auth_url = https://github.com/login/oauth/authorize +;token_url = https://github.com/login/oauth/access_token +;api_url = https://api.github.com/user +;signout_redirect_url = +;allowed_domains = +;team_ids = +;allowed_organizations = +;role_attribute_path = +;role_attribute_strict = false +;allow_assign_grafana_admin = false +;skip_org_role_sync = false + +#################################### GitLab Auth ######################### +[auth.gitlab] +;name = GitLab +;icon = gitlab +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_id +;client_secret = some_secret +;scopes = openid email profile +;auth_url = https://gitlab.com/oauth/authorize +;token_url = https://gitlab.com/oauth/token +;api_url = https://gitlab.com/api/v4 +;signout_redirect_url = +;allowed_domains = +;allowed_groups = +;role_attribute_path = +;role_attribute_strict = false +;allow_assign_grafana_admin = false +;skip_org_role_sync = false +;tls_skip_verify_insecure = false +;tls_client_cert = +;tls_client_key = +;tls_client_ca = +;use_pkce = true + +#################################### Google Auth ########################## +[auth.google] +;name = Google +;icon = 
google +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_client_id +;client_secret = some_client_secret +;scopes = openid email profile +;auth_url = https://accounts.google.com/o/oauth2/v2/auth +;token_url = https://oauth2.googleapis.com/token +;api_url = https://openidconnect.googleapis.com/v1/userinfo +;signout_redirect_url = +;allowed_domains = +;validate_hd = +;hosted_domain = +;allowed_groups = +;role_attribute_path = +;role_attribute_strict = false +;allow_assign_grafana_admin = false +;skip_org_role_sync = false +;use_pkce = true + +#################################### Grafana.com Auth #################### +[auth.grafana_com] +;name = Grafana.com +;icon = grafana +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_id +;client_secret = some_secret +;scopes = user:email +;allowed_organizations = +;skip_org_role_sync = false + +#################################### Azure AD OAuth ####################### +[auth.azuread] +;name = Microsoft +;icon = microsoft +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_client_id +;client_secret = some_client_secret +;scopes = openid email profile +;auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize +;token_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token +;signout_redirect_url = +;allowed_domains = +;allowed_groups = +;allowed_organizations = +;role_attribute_strict = false +;allow_assign_grafana_admin = false +;use_pkce = true +# prevent synchronizing users organization roles +;skip_org_role_sync = false + +#################################### Okta OAuth ####################### +[auth.okta] +;name = Okta +;enabled = false +;allow_sign_up = true +;auto_login = false +;client_id = some_id +;client_secret = some_secret +;scopes = openid profile email groups +;auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize +;token_url = https://<tenant-id>.okta.com/oauth2/v1/token +;api_url = 
https://<tenant-id>.okta.com/oauth2/v1/userinfo +;signout_redirect_url = +;allowed_domains = +;allowed_groups = +;role_attribute_path = +;role_attribute_strict = false +;allow_assign_grafana_admin = false +;skip_org_role_sync = false +;use_pkce = true + +#################################### Generic OAuth ########################## +[auth.generic_oauth] +;enabled = false +;name = OAuth +;allow_sign_up = true +;auto_login = false +;client_id = some_id +;client_secret = some_secret +;scopes = user:email,read:org +;empty_scopes = false +;email_attribute_name = email:primary +;email_attribute_path = +;login_attribute_path = +;name_attribute_path = +;id_token_attribute_name = +;auth_url = https://foo.bar/login/oauth/authorize +;token_url = https://foo.bar/login/oauth/access_token +;api_url = https://foo.bar/user +;signout_redirect_url = +;teams_url = +;allowed_domains = +;team_ids = +;allowed_organizations = +;role_attribute_path = +;role_attribute_strict = false +;groups_attribute_path = +;team_ids_attribute_path = +;tls_skip_verify_insecure = false +;tls_client_cert = +;tls_client_key = +;tls_client_ca = +;use_pkce = false +;auth_style = +;allow_assign_grafana_admin = false + +#################################### Basic Auth ########################## +[auth.basic] +enabled = true +;password_policy = false + +#################################### Auth Proxy ########################## +[auth.proxy] +;enabled = false +;header_name = X-WEBAUTH-USER +;header_property = username +;auto_sign_up = true +;sync_ttl = 60 +;whitelist = 192.168.1.1, 192.168.2.1 +;headers = Email:X-User-Email, Name:X-User-Name +# Non-ASCII strings in header values are encoded using quoted-printable encoding +;headers_encoded = false +# Read the auth proxy docs for details on what the setting below enables +;enable_login_token = false + +#################################### Auth JWT ########################## +[auth.jwt] +;enabled = true +;header_name = X-JWT-Assertion +;email_claim = sub 
+;username_claim = sub +;email_attribute_path = jmespath.email +;username_attribute_path = jmespath.username +;jwk_set_url = https://foo.bar/.well-known/jwks.json +;jwk_set_file = /path/to/jwks.json +;cache_ttl = 60m +;expect_claims = {"aud": ["foo", "bar"]} +;key_file = /path/to/key/file +# Use in conjunction with key_file in case the JWT token's header specifies a key ID in "kid" field +;key_id = some-key-id +;role_attribute_path = +;groups_attribute_path = +;role_attribute_strict = false +;auto_sign_up = false +;url_login = false +;allow_assign_grafana_admin = false + +#################################### Auth LDAP ########################## +[auth.ldap] +;enabled = false +;config_file = /etc/grafana/ldap.toml +;allow_sign_up = true +# prevent synchronizing ldap users organization roles +;skip_org_role_sync = false + +# LDAP background sync (Enterprise only) +# At 1 am every day +;sync_cron = "0 1 * * *" +;active_sync_enabled = true + +#################################### AWS ########################### +[aws] +# Enter a comma-separated list of allowed AWS authentication providers. +# Options are: default (AWS SDK Default), keys (Access && secret key), credentials (Credentials field), ec2_iam_role (EC2 IAM Role) +; allowed_auth_providers = default,keys,credentials + +# Allow AWS users to assume a role using temporary security credentials. +# If true, assume role will be enabled for all AWS authentication providers that are specified in allowed_auth_providers +; assume_role_enabled = true + +# Specify the maximum number of pages to be returned by the ListMetricPages API +; list_metrics_page_limit = 500 + +# Experimental, for use in Grafana Cloud only. Please do not set. +; external_id = + +# Sets the expiry duration of an assumed role. +# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). 
+; session_duration = "15m" + +# Set the plugins that will receive AWS settings for each request (via plugin context) +# By default this will include all Grafana Labs owned AWS plugins, or those that make use of AWS settings (ElasticSearch, Prometheus). +; forward_settings_to_plugins = cloudwatch, grafana-athena-datasource, grafana-redshift-datasource, grafana-x-ray-datasource, grafana-timestream-datasource, grafana-iot-sitewise-datasource, grafana-iot-twinmaker-app, grafana-opensearch-datasource, aws-datasource-provisioner, elasticsearch, prometheus + +#################################### Azure ############################### +[azure] +# Azure cloud environment where Grafana is hosted +# Possible values are AzureCloud, AzureChinaCloud, AzureUSGovernment and AzureGermanCloud +# Default value is AzureCloud (i.e. public cloud) +;cloud = AzureCloud + +# Specifies whether Grafana hosted in Azure service with Managed Identity configured (e.g. Azure Virtual Machines instance) +# If enabled, the managed identity can be used for authentication of Grafana in Azure services +# Disabled by default, needs to be explicitly enabled +;managed_identity_enabled = false + +# Client ID to use for user-assigned managed identity +# Should be set for user-assigned identity and should be empty for system-assigned identity +;managed_identity_client_id = + +# Specifies whether Azure AD Workload Identity authentication should be enabled in datasources that support it +# For more documentation on Azure AD Workload Identity, review this documentation: +# https://azure.github.io/azure-workload-identity/docs/ +# Disabled by default, needs to be explicitly enabled +;workload_identity_enabled = false + +# Tenant ID of the Azure AD Workload Identity +# Allows to override default tenant ID of the Azure AD identity associated with the Kubernetes service account +;workload_identity_tenant_id = + +# Client ID of the Azure AD Workload Identity +# Allows to override default client ID of the Azure AD 
identity associated with the Kubernetes service account +;workload_identity_client_id = + +# Custom path to token file for the Azure AD Workload Identity +# Allows to set a custom path to the projected service account token file +;workload_identity_token_file = + +# Specifies whether user identity authentication (on behalf of currently signed-in user) should be enabled in datasources +# that support it (requires AAD authentication) +# Disabled by default, needs to be explicitly enabled +;user_identity_enabled = false + +# Specifies whether user identity authentication fallback credentials should be enabled in data sources +# Enabling this allows data source creators to provide fallback credentials for backend initiated requests +# e.g. alerting, recorded queries etc. +# Enabled by default, needs to be explicitly disabled +# Will not have any effect if user identity is disabled above +;user_identity_fallback_credentials_enabled = true + +# Override token URL for Azure Active Directory +# By default is the same as token URL configured for AAD authentication settings +;user_identity_token_url = + +# Override AAD application ID which would be used to exchange the user's token for an access token for the datasource +# By default is the same as used in AAD authentication or can be set to another application (for OBO flow) +;user_identity_client_id = + +# Override the AAD application client secret +# By default is the same as used in AAD authentication or can be set to another application (for OBO flow) +;user_identity_client_secret = + +# Set the plugins that will receive Azure settings for each request (via plugin context) +# By default this will include all Grafana Labs owned Azure plugins, or those that make use of Azure settings (Azure Monitor, Azure Data Explorer, Prometheus, MSSQL). 
+;forward_settings_to_plugins = grafana-azure-monitor-datasource, prometheus, grafana-azure-data-explorer-datasource, mssql + +#################################### Role-based Access Control ########### +[rbac] +;permission_cache = true + +# Reset basic roles permissions on boot +# Warning: if left set to true, basic roles permissions will be reset on every boot +#reset_basic_roles = false + +# Validate permissions' action and scope on role creation and update +; permission_validation_enabled = true + +#################################### SMTP / Emailing ########################## +[smtp] +;enabled = false +;host = localhost:25 +;user = +# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" +;password = +;cert_file = +;key_file = +;skip_verify = false +;from_address = admin@grafana.localhost +;from_name = Grafana +# EHLO identity in SMTP dialog (defaults to instance_name) +;ehlo_identity = dashboard.example.com +# SMTP startTLS policy (defaults to 'OpportunisticStartTLS') +;startTLS_policy = NoStartTLS +# Enable trace propagation in e-mail headers, using the 'traceparent', 'tracestate' and (optionally) 'baggage' fields (defaults to false) +;enable_tracing = false + +[smtp.static_headers] +# Include custom static headers in all outgoing emails +;Foo-Header = bar +;Foo = bar + +[emails] +;welcome_email_on_sign_up = false +;templates_pattern = emails/*.html, emails/*.txt +;content_types = text/html + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use space to separate multiple modes, e.g. "console file" +;mode = console file + +# Either "debug", "info", "warn", "error", "critical", default is "info" +;level = info + +# optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug +;filters = + +# Set the default error message shown to users. 
This message is displayed instead of sensitive backend errors which should be obfuscated. Default is the same as the sample value. +;user_facing_default_error = "please inspect Grafana server log for details" + +# For "console" mode only +[log.console] +;level = + +# log line format, valid options are text, console and json +;format = console + +# For "file" mode only +[log.file] +;level = + +# log line format, valid options are text, console and json +;format = text + +# This enables automatic log rotation (master switch for the following options), default is true +;log_rotate = true + +# Maximum number of lines in a single file, default is 1000000 +;max_lines = 1000000 + +# Max size shift of a single file, default is 28, which means 1 << 28 bytes (256MB) +;max_size_shift = 28 + +# Segment log daily, default is true +;daily_rotate = true + +# Number of days to keep a log file (deleted after max_days), default is 7 +;max_days = 7 + +[log.syslog] +;level = + +# log line format, valid options are text, console and json +;format = text + +# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used. +;network = +;address = + +# Syslog facility. user, daemon and local0 through local7 are valid. +;facility = + +# Syslog tag. By default, the process' argv[0] is used. +;tag = + +[log.frontend] +# Should Faro javascript agent be initialized +;enabled = false + +# Custom HTTP endpoint to send events to. Default will log the events to stdout. +;custom_endpoint = /log-grafana-javascript-agent + +# Requests per second limit enforced over an extended period, for Grafana backend log ingestion endpoint (/log). +;log_endpoint_requests_per_second_limit = 3 + +# Max requests accepted per short interval of time for Grafana backend log ingestion endpoint (/log). 
+;log_endpoint_burst_limit = 15 + +# Should error instrumentation be enabled, only affects Grafana Javascript Agent +;instrumentations_errors_enabled = true + +# Should console instrumentation be enabled, only affects Grafana Javascript Agent +;instrumentations_console_enabled = false + +# Should webvitals instrumentation be enabled, only affects Grafana Javascript Agent +;instrumentations_webvitals_enabled = false + +# Api Key, only applies to Grafana Javascript Agent provider +;api_key = testApiKey + +#################################### Usage Quotas ######################## +[quota] +; enabled = false + +#### set quotas to -1 to make unlimited. #### +# limit number of users per Org. +; org_user = 10 + +# limit number of dashboards per Org. +; org_dashboard = 100 + +# limit number of data_sources per Org. +; org_data_source = 10 + +# limit number of api_keys per Org. +; org_api_key = 10 + +# limit number of alerts per Org. +;org_alert_rule = 100 + +# limit number of orgs a user can create. +; user_org = 10 + +# Global limit of users. +; global_user = -1 + +# global limit of orgs. +; global_org = -1 + +# global limit of dashboards +; global_dashboard = -1 + +# global limit of api_keys +; global_api_key = -1 + +# global limit on number of logged in users. +; global_session = -1 + +# global limit of alerts +;global_alert_rule = -1 + +# global limit of correlations +; global_correlations = -1 + +# Limit of the number of alert rules per rule group. +# This is not strictly enforced yet, but will be enforced over time. +;alerting_rule_group_rules = 100 + +#################################### Unified Alerting #################### +[unified_alerting] +#Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. 
Previous data is preserved to enable backwards compatibility but new data is removed. +;enabled = true + +# Comma-separated list of organization IDs for which to disable unified alerting. Only supported if unified alerting is enabled. +;disabled_orgs = + +# Specify the frequency of polling for admin config changes. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;admin_config_poll_interval = 60s + +# Specify the frequency of polling for Alertmanager config changes. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;alertmanager_config_poll_interval = 60s + +# The redis server address that should be connected to. +;ha_redis_address = + +# The username that should be used to authenticate with the redis server. +;ha_redis_username = + +# The password that should be used to authenticate with the redis server. +;ha_redis_password = + +# The redis database, by default it's 0. +;ha_redis_db = + +# A prefix that is used for every key or channel that is created on the redis server +# as part of HA for alerting. +;ha_redis_prefix = + +# The name of the cluster peer that will be used as identifier. If none is +# provided, a random one will be generated. +;ha_redis_peer_name = + +# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`. +;ha_listen_address = "0.0.0.0:9094" + +# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP. 
+;ha_advertise_address = "" + +# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting. +;ha_peers = "" + +# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will +# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long +# each instance should wait before sending the notification to take into account replication lag. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_peer_timeout = "15s" + +# The label is an optional string to include on each packet and stream. +# It uniquely identifies the cluster and prevents cross-communication +# issues when sending gossip messages in an environment with multiple clusters. +;ha_label = + +# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated +# across the cluster more quickly at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_gossip_interval = "200ms" + +# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds +# across larger clusters at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_push_pull_interval = "60s" + +# Enable or disable alerting rule execution. The alerting UI remains visible. +;execute_alerts = true + +# Alert evaluation timeout when fetching data from the datasource. +# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. 
+;evaluation_timeout = 30s + +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1. +;max_attempts = 1 + +# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;min_interval = 10s + +# This is an experimental option to add parallelization to saving alert states in the database. +# It configures the maximum number of concurrent queries per rule evaluated. The default value is 1 +# (concurrent queries per rule disabled). +;max_state_save_concurrency = 1 + +# If the feature flag 'alertingSaveStatePeriodic' is enabled, this is the interval that is used to persist the alerting instances to the database. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;state_periodic_save_interval = 5m + +# Disables the smoothing of alert evaluations across their evaluation window. +# Rules will evaluate in sync. +;disable_jitter = false + +[unified_alerting.reserved_labels] +# Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled. +# For example: `disabled_labels=grafana_folder` +;disabled_labels = + +[unified_alerting.state_history] +# Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI. +; enabled = true + +# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple" +# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once. +# Defaults to "annotations". 
+; backend = "multiple" + +# For "multiple" only. +# Indicates the main backend used to serve state history queries. +# Either "annotations" or "loki" +; primary = "loki" + +# For "multiple" only. +# Comma-separated list of additional backends to write state history data to. +; secondaries = "annotations" + +# For "loki" only. +# URL of the external Loki instance. +# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend. +; loki_remote_url = "http://loki:3100" + +# For "loki" only. +# URL of the external Loki's read path. To be used in configurations where Loki has separated read and write URLs. +# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend. +; loki_remote_read_url = "http://loki-querier:3100" + +# For "loki" only. +# URL of the external Loki's write path. To be used in configurations where Loki has separated read and write URLs. +# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend. +; loki_remote_write_url = "http://loki-distributor:3100" + +# For "loki" only. +# Optional tenant ID to attach to requests sent to Loki. +; loki_tenant_id = 123 + +# For "loki" only. +# Optional username for basic authentication on requests sent to Loki. Can be left blank to disable basic auth. +; loki_basic_auth_username = "myuser" + +# For "loki" only. +# Optional password for basic authentication on requests sent to Loki. Can be left blank. +; loki_basic_auth_password = "mypass" + +# For "loki" only. +# Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value. +; loki_max_query_length = 360h + +[unified_alerting.state_history.external_labels] +# Optional extra labels to attach to outbound state history records or log streams. +# Any number of label key-value-pairs can be provided. 
+; mylabelkey = mylabelvalue + +[unified_alerting.state_history.annotations] +# This section controls retention of annotations automatically created while evaluating alert rules +# when alerting state history backend is configured to be annotations (the [unified_alerting.state_history].backend setting). + +# Configures for how long alert annotations are stored. Default is 0, which keeps them forever. +# This setting should be expressed as a duration. Ex 6h (hours), 10d (days), 2w (weeks), 1M (month). +max_age = + +# Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations. +max_annotations_to_keep = + +#################################### Annotations ######################### +[annotations] +# Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations. +;cleanupjob_batchsize = 100 + +# Enforces the maximum allowed length of the tags for any newly introduced annotations. It can be between 500 and 4096 inclusive (which is the respective's column length). Default value is 500. +# Setting it to a higher value would impact performance therefore is not recommended. +;tags_length = 500 + +[annotations.dashboard] +# Dashboard annotations means that annotations are associated with the dashboard they are created on. + +# Configures how long dashboard annotations are stored. Default is 0, which keeps them forever. +# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). +;max_age = + +# Configures max number of dashboard annotations that Grafana stores. Default value is 0, which keeps all dashboard annotations. +;max_annotations_to_keep = + +[annotations.api] +# API annotations means that the annotations have been created using the API without any +# association with a dashboard. + +# Configures how long Grafana stores API annotations. Default is 0, which keeps them forever.
+# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). +;max_age = + +# Configures max number of API annotations that Grafana keeps. Default value is 0, which keeps all API annotations. +;max_annotations_to_keep = + +#################################### Explore ############################# +[explore] +# Enable the Explore section +;enabled = true + +#################################### Help ############################# +[help] +# Enable the Help section +;enabled = true + +#################################### Profile ############################# +[profile] +# Enable the Profile section +;enabled = true + +#################################### News ############################# +[news] +# Enable the news feed section +; news_feed_enabled = true + +#################################### Query ############################# +[query] +# Set the number of data source queries that can be executed concurrently in mixed queries. Default is the number of CPUs. +;concurrent_query_limit = + +#################################### Query History ############################# +[query_history] +# Enable the Query history +;enabled = true + +#################################### Internal Grafana Metrics ########################## +# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId +[metrics] +# Disable / Enable internal metrics +;enabled = true +# Graphite Publish interval +;interval_seconds = 10 +# Disable total stats (stat_totals_*) metrics to be generated +;disable_total_stats = false +# The interval at which the total stats collector will update the stats. Default is 1800 seconds. +;total_stats_collector_interval_seconds = 1800 + +#If both are set, basic auth will be required for the metrics endpoints. +; basic_auth_username = +; basic_auth_password = + +# Metrics environment info adds dimensions to the `grafana_environment_info` metric, which +# can expose more information about the Grafana instance. 
+[metrics.environment_info] +#exampleLabel1 = exampleValue1 +#exampleLabel2 = exampleValue2 + +# Send internal metrics to Graphite +[metrics.graphite] +# Enable by setting the address setting (ex localhost:2003) +;address = +;prefix = prod.grafana.%(instance_name)s. + +#################################### Grafana.com integration ########################## +# Url used to import dashboards directly from Grafana.com +[grafana_com] +;url = https://grafana.com +;api_url = https://grafana.com/api + +#################################### Distributed tracing ############ +# Opentracing is deprecated use opentelemetry instead +[tracing.jaeger] +# Enable by setting the address sending traces to jaeger (ex localhost:6831) +;address = localhost:6831 +# Tag that will always be included in when creating new spans. ex (tag1:value1,tag2:value2) +;always_included_tag = tag1:value1 +# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote +;sampler_type = const +# jaeger samplerconfig param +# for "const" sampler, 0 or 1 for always false/true respectively +# for "probabilistic" sampler, a probability between 0 and 1 +# for "rateLimiting" sampler, the number of spans per second +# for "remote" sampler, param is the same as for "probabilistic" +# and indicates the initial sampling rate before the actual one +# is received from the mothership +;sampler_param = 1 +# sampling_server_url is the URL of a sampling manager providing a sampling strategy. +;sampling_server_url = +# Whether or not to use Zipkin propagation (x-b3- HTTP headers). +;zipkin_propagation = false +# Setting this to true disables shared RPC spans. +# Not disabling is the most common setting when using Zipkin elsewhere in your infrastructure. +;disable_shared_zipkin_spans = false + +[tracing.opentelemetry] +# attributes that will always be included in when creating new spans. 
ex (key1:value1,key2:value2) +;custom_attributes = key1:value1,key2:value2 +# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote +; sampler_type = remote +# Sampler configuration parameter +# for "const" sampler, 0 or 1 for always false/true respectively +# for "probabilistic" sampler, a probability between 0.0 and 1.0 +# for "rateLimiting" sampler, the number of spans per second +# for "remote" sampler, param is the same as for "probabilistic" +# and indicates the initial sampling rate before the actual one +# is received from the sampling server (set at sampling_server_url) +; sampler_param = 0.5 +# specifies the URL of the sampling server when sampler_type is remote +; sampling_server_url = http://localhost:5778/sampling + +[tracing.opentelemetry.jaeger] +# jaeger destination (ex http://localhost:14268/api/traces) +; address = http://localhost:14268/api/traces +# Propagation specifies the text map propagation format: w3c, jaeger +; propagation = jaeger + +# This is a configuration for OTLP exporter with GRPC protocol +[tracing.opentelemetry.otlp] +# otlp destination (ex localhost:4317) +; address = localhost:4317 +# Propagation specifies the text map propagation format: w3c, jaeger +; propagation = w3c + +#################################### External image storage ########################## +[external_image_storage] +# Used for uploading images to public servers so they can be included in slack/email messages. 
+# you can choose between (s3, webdav, gcs, azure_blob, local) +;provider = + +[external_image_storage.s3] +;endpoint = +;path_style_access = +;bucket = +;region = +;path = +;access_key = +;secret_key = + +[external_image_storage.webdav] +;url = +;public_url = +;username = +;password = + +[external_image_storage.gcs] +;key_file = +;bucket = +;path = + +[external_image_storage.azure_blob] +;account_name = +;account_key = +;container_name = +;sas_token_expiration_days = + +[external_image_storage.local] +# does not require any configuration + +[rendering] +# Options to configure a remote HTTP image rendering service, e.g. using https://github.com/grafana/grafana-image-renderer. +# URL to a remote HTTP image renderer service, e.g. http://localhost:8081/render, will enable Grafana to render panels and dashboards to PNG-images using HTTP requests to an external service. +;server_url = +# If the remote HTTP image renderer service runs on a different server than the Grafana server you may have to configure this to a URL where Grafana is reachable, e.g. http://grafana.domain/. +;callback_url = +# An auth token that will be sent to and verified by the renderer. The renderer will deny any request without an auth token matching the one configured on the renderer side. +;renderer_token = - +# Concurrent render request limit affects when the /render HTTP endpoint is used. Rendering many images at the same time can overload the server, +# which this setting can help protect against by only allowing a certain amount of concurrent requests. +;concurrent_render_request_limit = 30 +# Determines the lifetime of the render key used by the image renderer to access and render Grafana. +# This setting should be expressed as a duration. Examples: 10s (seconds), 5m (minutes), 2h (hours). +# Default is 5m. This should be more than enough for most deployments. +# Change the value only if image rendering is failing and you see `Failed to get the render key from cache` in Grafana logs. 
+;render_key_lifetime = 5m + +[panels] +# If set to true Grafana will allow script tags in text panels. Not recommended as it enable XSS vulnerabilities. +;disable_sanitize_html = false + +[plugins] +;enable_alpha = false +;app_tls_skip_verify_insecure = false +# Enter a comma-separated list of plugin identifiers to identify plugins to load even if they are unsigned. Plugins with modified signatures are never loaded. +;allow_loading_unsigned_plugins = +# Enable or disable installing / uninstalling / updating plugins directly from within Grafana. +;plugin_admin_enabled = false +;plugin_admin_external_manage_enabled = false +;plugin_catalog_url = https://grafana.com/grafana/plugins/ +# Enter a comma-separated list of plugin identifiers to hide in the plugin catalog. +;plugin_catalog_hidden_plugins = +# Log all backend requests for core and external plugins. +;log_backend_requests = false +# Disable download of the public key for verifying plugin signature. +; public_key_retrieval_disabled = false +# Force download of the public key for verifying plugin signature on startup. If disabled, the public key will be retrieved every 10 days. +# Requires public_key_retrieval_disabled to be false to have any effect. +; public_key_retrieval_on_startup = false +# Enter a comma-separated list of plugin identifiers to avoid loading (including core plugins). These plugins will be hidden in the catalog. +; disable_plugins = + +#################################### Grafana Live ########################################## +[live] +# max_connections to Grafana Live WebSocket endpoint per Grafana server instance. See Grafana Live docs +# if you are planning to make it higher than default 100 since this can require some OS and infrastructure +# tuning. 0 disables Live, -1 means unlimited connections. +;max_connections = 100 + +# allowed_origins is a comma-separated list of origins that can establish connection with Grafana Live. +# If not set then origin will be matched over root_url. 
Supports wildcard symbol "*". +;allowed_origins = + +# engine defines an HA (high availability) engine to use for Grafana Live. By default no engine used - in +# this case Live features work only on a single Grafana server. Available options: "redis". +# Setting ha_engine is an EXPERIMENTAL feature. +;ha_engine = + +# ha_engine_address sets a connection address for Live HA engine. Depending on engine type address format can differ. +# For now we only support Redis connection address in "host:port" format. +# This option is EXPERIMENTAL. +;ha_engine_address = "127.0.0.1:6379" + +# ha_engine_password allows setting an optional password to authenticate with the engine +;ha_engine_password = "" + +#################################### Grafana Image Renderer Plugin ########################## +[plugin.grafana-image-renderer] +# Instruct headless browser instance to use a default timezone when not provided by Grafana, e.g. when rendering panel image of alert. +# See ICU’s metaZones.txt (https://cs.chromium.org/chromium/src/third_party/icu/source/data/misc/metaZones.txt) for a list of supported +# timezone IDs. Fallbacks to TZ environment variable if not set. +;rendering_timezone = + +# Instruct headless browser instance to use a default language when not provided by Grafana, e.g. when rendering panel image of alert. +# Please refer to the HTTP header Accept-Language to understand how to format this value, e.g. 'fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5'. +;rendering_language = + +# Instruct headless browser instance to use a default device scale factor when not provided by Grafana, e.g. when rendering panel image of alert. +# Default is 1. Using a higher value will produce more detailed images (higher DPI), but will require more disk space to store an image. +;rendering_viewport_device_scale_factor = + +# Instruct headless browser instance whether to ignore HTTPS errors during navigation. Per default HTTPS errors are not ignored. 
Due to +# the security risk it's not recommended to ignore HTTPS errors. +;rendering_ignore_https_errors = + +# Instruct headless browser instance whether to capture and log verbose information when rendering an image. Default is false and will +# only capture and log error messages. When enabled, debug messages are captured and logged as well. +# For the verbose information to be included in the Grafana server log you have to adjust the rendering log level to debug, configure +# [log].filter = rendering:debug. +;rendering_verbose_logging = + +# Instruct headless browser instance whether to output its debug and error messages into running process of remote rendering service. +# Default is false. This can be useful to enable (true) when troubleshooting. +;rendering_dumpio = + +# Additional arguments to pass to the headless browser instance. Default is --no-sandbox. The list of Chromium flags can be found +# here (https://peter.sh/experiments/chromium-command-line-switches/). Multiple arguments is separated with comma-character. +;rendering_args = + +# You can configure the plugin to use a different browser binary instead of the pre-packaged version of Chromium. +# Please note that this is not recommended, since you may encounter problems if the installed version of Chrome/Chromium is not +# compatible with the plugin. +;rendering_chrome_bin = + +# Instruct how headless browser instances are created. Default is 'default' and will create a new browser instance on each request. +# Mode 'clustered' will make sure that only a maximum of browsers/incognito pages can execute concurrently. +# Mode 'reusable' will have one browser instance and will create a new incognito page on each request. +;rendering_mode = + +# When rendering_mode = clustered, you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser' +# and will cluster using browser instances. +# Mode 'context' will cluster using incognito pages. 
+;rendering_clustering_mode = +# When rendering_mode = clustered, you can define the maximum number of browser instances/incognito pages that can execute concurrently. Default is '5'. +;rendering_clustering_max_concurrency = +# When rendering_mode = clustered, you can specify the duration a rendering request can take before it will time out. Default is `30` seconds. +;rendering_clustering_timeout = + +# Limit the maximum viewport width, height and device scale factor that can be requested. +;rendering_viewport_max_width = +;rendering_viewport_max_height = +;rendering_viewport_max_device_scale_factor = + +# Change the listening host and port of the gRPC server. Default host is 127.0.0.1 and default port is 0 and will automatically assign +# a port not in use. +;grpc_host = +;grpc_port = + +[support_bundles] +# Enable support bundle creation (default: true) +#enabled = true +# Only server admins can generate and view support bundles (default: true) +#server_admin_only = true +# If set, bundles will be encrypted with the provided public keys separated by whitespace +#public_keys = "" + +[enterprise] +# Path to a valid Grafana Enterprise license.jwt file +;license_path = + +[feature_toggles] +# there are currently two ways to enable feature toggles in the `grafana.ini`. +# you can either pass an array of feature you want to enable to the `enable` field or +# configure each toggle by setting the name of the toggle to true/false. Toggles set to true/false +# will take precedence over toggles in the `enable` list.
+ +;enable = feature1,feature2 + +;feature1 = true +;feature2 = false + +[date_formats] +# For information on what formatting patterns that are supported https://momentjs.com/docs/#/displaying/ + +# Default system date format used in time range picker and other places where full time is displayed +;full_date = YYYY-MM-DD HH:mm:ss + +# Used by graph and other places where we only show small intervals +;interval_second = HH:mm:ss +;interval_minute = HH:mm +;interval_hour = MM/DD HH:mm +;interval_day = MM/DD +;interval_month = YYYY-MM +;interval_year = YYYY + +# Experimental feature +;use_browser_locale = false + +# Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc. +;default_timezone = browser + +[expressions] +# Enable or disable the expressions functionality. +;enabled = true + +[geomap] +# Set the JSON configuration for the default basemap +;default_baselayer_config = `{ +; "type": "xyz", +; "config": { +; "attribution": "Open street map", +; "url": "https://tile.openstreetmap.org/{z}/{x}/{y}.png" +; } +;}` + +# Enable or disable loading other base map layers +;enable_custom_baselayers = true + +# Move an app plugin referenced by its id (including all its pages) to a specific navigation section +[navigation.app_sections] +# The following will move an app plugin with the id of `my-app-id` under the `cfg` section +# my-app-id = cfg + +# Move a specific app plugin page (referenced by its `path` field) to a specific navigation section +[navigation.app_standalone_pages] +# The following will move the page with the path "/a/my-app-id/my-page" from `my-app-id` to the `cfg` section +# /a/my-app-id/my-page = cfg + +#################################### Secure Socks5 Datasource Proxy ##################################### +[secure_socks_datasource_proxy] +; enabled = false +; root_ca_cert = +; client_key = +; client_cert = +; server_name = +# The 
address of the socks5 proxy datasources should connect to +; proxy_address = +; show_ui = true +; allow_insecure = false + +################################## Feature Management ############################################## +[feature_management] +# Options to configure the experimental Feature Toggle Admin Page feature, which is behind the `featureToggleAdminPage` feature toggle. Use at your own risk. +# Allow editing of feature toggles in the feature management page +;allow_editing = false +# Allow customization of URL for the controller that manages feature toggles +;update_webhook = +# Allow configuring an auth token for feature management update requests +;update_webhook_token = +# Hide specific feature toggles from the feature management page +;hidden_toggles = +# Disable updating specific feature toggles in the feature management page +;read_only_toggles = + +#################################### Public Dashboards ##################################### +[public_dashboards] +# Set to false to disable public dashboards +;enabled = true diff --git a/roles/grafana-loki-prometheus/loki/defaults/main.yaml b/roles/grafana-loki-prometheus/loki/defaults/main.yaml new file mode 100644 index 0000000..3d374f7 --- /dev/null +++ b/roles/grafana-loki-prometheus/loki/defaults/main.yaml @@ -0,0 +1 @@ +default_loki_version: "3.1.0" \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/loki/tasks/main.yml b/roles/grafana-loki-prometheus/loki/tasks/main.yml new file mode 100644 index 0000000..71d858b --- /dev/null +++ b/roles/grafana-loki-prometheus/loki/tasks/main.yml @@ -0,0 +1,41 @@ +- name: Set parameters + ansible.builtin.set_fact: + _loki_version: "{{ default_loki_version }}" # https://api.github.com/repos/grafana/loki/releases + +- name: Create directories for Loki + ansible.builtin.file: + path: loki + state: directory + mode: 0700 + +- name: Configure Loki (1/2) + template: + src: loki-local-config.yaml.j2 + dest: '{{ ansible_env.HOME 
}}/loki/loki-local-config.yaml' + +- name: Deploy Loki + docker_container: + name: loki-server + image: "grafana/loki:{{_loki_version}}" + restart: true + state: started + restart_policy: always + ports: + - "3100:3100" + networks: + - name: host + volumes: + - "{{ ansible_env.HOME }}/loki/loki-local-config.yaml:/mnt/config/loki-config.yaml" + command: + - '-config.file=/mnt/config/loki-config.yaml' + +- name: Wait for loki to be ready + ansible.builtin.uri: + url: http://localhost:3100/ready + method: GET + status_code: 200 + register: loki_wait + retries: 10 + delay: 5 + until: loki_wait is succeeded + \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/loki/templates/loki-local-config.yaml.j2 b/roles/grafana-loki-prometheus/loki/templates/loki-local-config.yaml.j2 new file mode 100644 index 0000000..fa81b58 --- /dev/null +++ b/roles/grafana-loki-prometheus/loki/templates/loki-local-config.yaml.j2 @@ -0,0 +1,48 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 0.0.0.0 + path_prefix: /tmp/loki + storage: + filesystem: + chunks_directory: /tmp/loki/chunks + rules_directory: /tmp/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 + +# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration +# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/ +# Statistics help us better understand how Loki is used, and they show us performance +# levels for most users. This helps us prioritize features and documentation. 
+# For more information on what's sent, look at +# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go +# Refer to the buildReport method to see what goes into a report. +# If you would like to disable reporting, uncomment the following lines: +#analytics: +# reporting_enabled: false diff --git a/roles/grafana-loki-prometheus/prometheus/defaults/main.yaml b/roles/grafana-loki-prometheus/prometheus/defaults/main.yaml new file mode 100644 index 0000000..c0581fa --- /dev/null +++ b/roles/grafana-loki-prometheus/prometheus/defaults/main.yaml @@ -0,0 +1 @@ +default_prometheus_version: v2.53.1 \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/prometheus/files/genpass.py b/roles/grafana-loki-prometheus/prometheus/files/genpass.py new file mode 100644 index 0000000..bb06f33 --- /dev/null +++ b/roles/grafana-loki-prometheus/prometheus/files/genpass.py @@ -0,0 +1,10 @@ +import sys +import bcrypt + +if len(sys.argv) != 2: + # Exit with status 1 and the message on stderr; the former raise made the trailing exit(1) unreachable. + sys.exit("password not provided") + +password = sys.argv[1] +hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()) +print(hashed_password.decode()) \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/prometheus/tasks/main.yml b/roles/grafana-loki-prometheus/prometheus/tasks/main.yml new file mode 100644 index 0000000..e529515 --- /dev/null +++ b/roles/grafana-loki-prometheus/prometheus/tasks/main.yml @@ -0,0 +1,43 @@ +- name: Set parameters + ansible.builtin.set_fact: + _prometheus_version: "{{ default_prometheus_version }}" # https://api.github.com/repos/prometheus/prometheus/releases + +- name: Gen passwd + delegate_to: localhost + ansible.builtin.shell: python3 {{role_path}}/files/genpass.py {{ prometheus_basic_auth_password | quote }} + register: _passwd + +- name: Create directories for Prometheus + ansible.builtin.file: + path: prometheus + state: directory + mode: 0700 + +- name: Configure Prometheus (1/2) + template: + src: prometheus.yml.j2 + dest: '{{
ansible_env.HOME }}/prometheus/prometheus.yml' + +- name: Configure Prometheus (2/2) + template: + src: web.yml.j2 + dest: '{{ ansible_env.HOME }}/prometheus/web.yml' + +- name: Deploy Prometheus + docker_container: + name: prometheus-server + image: "prom/prometheus:{{_prometheus_version}}" + restart: true + state: started + restart_policy: always + ports: + - "9090:9090" + networks: + - name: host + volumes: + - "{{ ansible_env.HOME }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml" + - "{{ ansible_env.HOME }}/prometheus/web.yml:/etc/prometheus/web.yml" + command: + - '--web.enable-remote-write-receiver' + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.config.file=/etc/prometheus/web.yml' \ No newline at end of file diff --git a/roles/grafana-loki-prometheus/prometheus/templates/prometheus.yml.j2 b/roles/grafana-loki-prometheus/prometheus/templates/prometheus.yml.j2 new file mode 100644 index 0000000..8444d14 --- /dev/null +++ b/roles/grafana-loki-prometheus/prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,20 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'gateway' + +alerting: + alertmanagers: + - static_configs: + - targets: [] + +rule_files: [] + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + basic_auth: + username: '{{ prometheus_basic_auth_user }}' + password: '{{ prometheus_basic_auth_password }}' diff --git a/roles/grafana-loki-prometheus/prometheus/templates/web.yml.j2 b/roles/grafana-loki-prometheus/prometheus/templates/web.yml.j2 new file mode 100644 index 0000000..d375d24 --- /dev/null +++ b/roles/grafana-loki-prometheus/prometheus/templates/web.yml.j2 @@ -0,0 +1,2 @@ +basic_auth_users: + {{ prometheus_basic_auth_user }}: {{ _passwd.stdout }} \ No newline at end of file diff --git a/roles/ha/README.md b/roles/ha/README.md new file mode 100644 index 0000000..2749f3c --- /dev/null +++ b/roles/ha/README.md @@ -0,0 +1,29 @@ +# Deploy HA 
Proxy for k8s API + +Deploys HA Proxy as a container. All hosts belonging to the Ansible inventory +`masters` group are added in the load balancer pool. + +The HA Proxy listens on `*:6443`, as such any IP of the host can be used to +reach the k8s cluster. + +> **NOTE:** It is assumed that default k8s bind port `6443` is used for all +> control-plane nodes API. + +> **WARNING**: the hostvar `apiserver_advertise_address` must be defined for all +> nodes in the pool. Load balancing is performed to the IP address specified +> in the hostvar `apiserver_advertise_address`. + +## Depends + +## Parameters +### Variables +#### Required +#### Optionals + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults + +## Returns \ No newline at end of file diff --git a/roles/ha/tasks/main.yaml b/roles/ha/tasks/main.yaml new file mode 100644 index 0000000..657b92d --- /dev/null +++ b/roles/ha/tasks/main.yaml @@ -0,0 +1,16 @@ +- name: Create HAProxy configuration + ansible.builtin.template: + src: haproxy.cfg.j2 + dest: "{{ ansible_env.HOME }}/haproxy.cfg" + +- name: Deploy HAProxy + docker_container: + name: load-balancer + image: haproxytech/haproxy-ubuntu + restart: true + state: started + restart_policy: always + ports: + - "6443:6443" + volumes: + - "{{ ansible_env.HOME }}/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg" diff --git a/roles/ha/templates/haproxy.cfg.j2 b/roles/ha/templates/haproxy.cfg.j2 new file mode 100644 index 0000000..7c3fef1 --- /dev/null +++ b/roles/ha/templates/haproxy.cfg.j2 @@ -0,0 +1,53 @@ +global + log 127.0.0.1 local2 + + chroot /var/lib/haproxy + pidfile /var/run/haproxy.pid + maxconn 4000 + user haproxy + group haproxy + # daemon + + # turn on stats unix socket + stats socket /var/lib/haproxy/stats + +defaults + mode http + log global + option httplog + option dontlognull + option http-server-close + option forwardfor except 127.0.0.0/8 + option redispatch + retries 3 + timeout http-request 10s + timeout queue 1m + timeout connect 10s
+ timeout client 1m + timeout server 1m + timeout http-keep-alive 10s + timeout check 10s + maxconn 3000 + +#--------------------------------------------------------------------- +# main frontend which proxies to the backends +#--------------------------------------------------------------------- +frontend main + bind *:6443 + mode tcp + option tcplog + default_backend k8s + +#--------------------------------------------------------------------- +# round robin balancing between the various backends +#--------------------------------------------------------------------- +backend k8s + mode tcp + option tcplog + option tcp-check + default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100 + balance roundrobin + +{% for host in groups['masters'] %} + server {{ hostvars[host]['name'] | default(host) }} {{ hostvars[host]['apiserver_advertise_address'] }}:6443 check +{% endfor %} diff --git a/roles/k8s/auth/README.md b/roles/k8s/auth/README.md new file mode 100644 index 0000000..d6bcadc --- /dev/null +++ b/roles/k8s/auth/README.md @@ -0,0 +1,72 @@ +# Get authenticated access to kubernetes cluster + +> **DANGER ZONE**: This role creates files and memory variables on the ansible +> control node that can be used to gain full access to the kubernetes cluster. +> Make sure that this role is executed in a secured environment. + +Get information necessary to connect to a kubernetes cluster and manage it with +`kubeadm`. A kubernetes bootstrap token of 1h validity is generated (see +https://kubernetes.io/docs/reference/access-authn-authz/bootstrap-tokens/). + +The provided information is: +* k8s bootstrap token +* kubeconfig file +* hash of the cluster CA certificate +* control-plane certificate key + +A temporary directory is created where the *kubeconfig* file is saved (see +https://docs.ansible.com/ansible/latest/collections/ansible/builtin/tempfile_module.html +for details on how the temporary files are created). 
+ +> **WARNING**: the kubeconfig file provides full administrative access to the +> cluster, make sure it does not leak to unauthorized persons. + +For automation purposes, a dummy host called `ansible_dummy_host` is created in +the runtime inventory with the following variables: + +* `_token`: the k8s bootstrap token + +* `_kube_config`: path where to find the *kubeconfig* file on the Ansible + control node. + +* `_ca_cert_hash`: the hash of the cluster CA certificate + +* `_certificate_key`: the control-plane certificate key + +* `_tempdir`: the absolute path to the temporary directory created on the +Ansible control node. + +> **NOTE**: The location of the kubeconfig file on the Ansible control node is +> printed while executing the role. + +## Depends +This role must be run on a control-plane kubernetes node created with the role +`k8s/create`. + +## Parameters +### Variables +#### Required +#### Optionals + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults + +## Returns + +A dummy host called `ansible_dummy_host` is created in the runtime inventory +with the following variables: + +* `_token`: the k8s bootstrap token + +* `_kube_config`: path where to find the *kubeconfig* file on the Ansible + control node. + +* `_ca_cert_hash`: the hash of the cluster CA certificate + +* `_certificate_key`: the control-plane certificate key + +* `_tempdir`: the absolute path to the temporary directory created on the +Ansible control node. 
\ No newline at end of file diff --git a/roles/k8s/auth/tasks/main.yaml b/roles/k8s/auth/tasks/main.yaml new file mode 100644 index 0000000..bbdf124 --- /dev/null +++ b/roles/k8s/auth/tasks/main.yaml @@ -0,0 +1,40 @@ +- name: Create temporary build directory + ansible.builtin.tempfile: + state: directory + suffix: _k8s + delegate_to: localhost + register: tempfile + +- name: Get configuration + ansible.builtin.fetch: + src: /etc/kubernetes/admin.conf + dest: "{{ tempfile.path }}/kube_config" + flat: yes + become: true + +- name: Location of kube_config on local node + debug: + msg: "{{ tempfile.path }}/kube_config" + +- name: Create the token + shell: kubeadm token create --ttl 1h --description "Generated by Ansible to add nodes" + register: admin_token + +- name: Get certificate key + shell: cat certificate-key.pass + register: certificate_key + +# Credit: inspired from https://medium.com/@kosta709/kubernetes-by-kubeadm-config-yamls-94e2ee11244 +- name: Compute CA certificate hash + shell: "openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* /sha256:/'" + register: ca_cert_hash + +# Credit: inspired from https://stackoverflow.com/a/47811099 +- name: "Save information to a dummy host" + add_host: + name: "ansible_dummy_host" + _token: "{{ admin_token.stdout_lines[0] }}" + _kube_config: "{{ tempfile.path }}/kube_config" + _ca_cert_hash: "{{ ca_cert_hash.stdout_lines[0] }}" + _certificate_key: "{{ certificate_key.stdout_lines[0] }}" + _tempdir: "{{ tempfile.path }}" diff --git a/roles/k8s/base/README.md b/roles/k8s/base/README.md new file mode 100644 index 0000000..2383931 --- /dev/null +++ b/roles/k8s/base/README.md @@ -0,0 +1,84 @@ +# Prepare host for kubernetes cluster + +Prepare the host to be able to take part in a kubernetes cluster. It permanently +disables swap and installs `kubectl`, `kubelet`, `kubeadm`, and `helm` on the +host. 
Useful information for later operations is registered in Ansible +host-level variables. + +> **WARNING**: when multiple control plane nodes are used (i.e., HA), then +> `k8s.controlPlaneEndpoint` must be defined and it must be the address of the +> load balancer instance in charge of the cluster management. + +## Depends +Roles: `common`, `docker`, and `docker-cri` + +## Parameters +### Variables +#### Required + +#### Optionals +* `k8s.version`: string. k8s version to install (see https://kubernetes.io/releases/). Default to `default_k8s_version`. Supported versions are `1.28`, +`1.29`, and `1.30`. + +* `k8s.runtime`: string. runtime to use in the cluster. Default to +`default_k8s_runtime`. Only `docker` is supported. + +* `k8s.serviceSubnet`: string. IPv4 subnet to use for services. Default to `default_k8s_serviceSubnet`. + +* `k8s.podSubnet`: string. IPv4 subnet to use for pods. Defaults to `default_k8s_podSubnet`. + +* `k8s.dnsDomain`: string. DNS domain to use in the cluster. Default to `default_k8s_dnsDomain`. + +* `k8s.bindPort`: string. Bind port for K8S API for the host. Default to +`default_k8s_bindPort`. + +* `k8s.controlPlaneEndpoint`: string. The control plane endpoint to be used in +the cluster (see https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/). +Default to `{masters[0]['apiserver_advertise_address']}:{k8s.bindPort}`. + +### Inventory hostvars +#### Required +#### Optionals +* `name`: string. Hostname to be used in the cluster. Default to +`inventory_hostname` (see +https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html). + +* `apiserver_advertise_address`: string. IP address for the API server to +advertise (see https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/#kubeadm-k8s-io-v1beta3-APIEndpoint). Ignored if not defined. + +* `bindPort`: int. Bind port for K8S API. Default to `k8s.bindPort`. + +* `KUBELET_EXTRA_ARGS`: yaml. 
Extra args for kubelet (see + https://cluster-api.sigs.k8s.io/tasks/bootstrap/kubeadm-bootstrap/kubelet-config#set-kubelet-flags-via-kubeadmconfigspeckubeletextraargs). + Ignored if not defined. + + Example: + ```yaml + KUBELET_EXTRA_ARGS: + cpu-manager-policy: "static" + kube-reserved=cpu: "2" + ``` + +## Defaults +* `default_k8s_runtime`: docker +* `default_k8s_version`: "1.29" +* `default_k8s_serviceSubnet`: '10.96.0.0/16' +* `default_k8s_podSubnet`: '10.244.0.0/16' +* `default_k8s_bindPort`: 6443 +* `default_k8s_dnsDomain`: 'cluster.local' + +## Returns +The following Ansible host-level variables are registered according to the definitions +above (see above for details): + +* `_k8s_version`: k8s version to install. +* `_kubelet_extra_args`: Extra args for kubelet. +* `_name`: Hostname to be used in the cluster. +* `_advertiseAddress`: IP address for the API server to advertise. +* `_bindPort`: Bind port for K8S API for the host. +* `_controlPlaneBindPort`: Bind port for K8S API for the control plane. +* `_criSocket`: Path to the CRI socket to connect (see +https://kubernetes.io/docs/reference/setup-tools/kubeadm/kubeadm-init/ for details). +* `_serviceSubnet`: IPv4 subnet to use for services. +* `_podSubnet`: IPv4 subnet to use for pods. +* `_dnsDomain`: DNS domain to use in the cluster. 
diff --git a/roles/k8s/base/defaults/main.yaml b/roles/k8s/base/defaults/main.yaml new file mode 100644 index 0000000..9f4de31 --- /dev/null +++ b/roles/k8s/base/defaults/main.yaml @@ -0,0 +1,16 @@ +runtime: + docker: + cri: + socket: unix:///var/run/cri-dockerd.sock + +k8s_tools: + - kubectl + - kubeadm + - kubelet + +default_k8s_runtime: docker +default_k8s_version: "1.29" +default_k8s_serviceSubnet: '10.96.0.0/16' +default_k8s_podSubnet: '10.244.0.0/16' +default_k8s_bindPort: 6443 +default_k8s_dnsDomain: 'cluster.local' \ No newline at end of file diff --git a/roles/k8s/base/meta/main.yaml b/roles/k8s/base/meta/main.yaml new file mode 100644 index 0000000..47e044b --- /dev/null +++ b/roles/k8s/base/meta/main.yaml @@ -0,0 +1,4 @@ +dependencies: + - role: common + - role: docker + - role: docker-cri \ No newline at end of file diff --git a/roles/k8s/base/tasks/install.yaml b/roles/k8s/base/tasks/install.yaml new file mode 100644 index 0000000..f331df9 --- /dev/null +++ b/roles/k8s/base/tasks/install.yaml @@ -0,0 +1,38 @@ +--- +- block: # must be root + - name: Remove existing GPG key file to ensure it can be updated + ansible.builtin.file: + path: /etc/apt/keyrings/kubernetes-apt-keyring.gpg + state: absent + + - name: Download Kubernetes GPG key and place it in /etc/apt/keyrings/kubernetes-apt-keyring.gpg + become: yes + ansible.builtin.shell: + cmd: | + curl -fsSL https://pkgs.k8s.io/core:/stable:/v{{ _k8s_version }}/deb/Release.key | gpg --batch --yes --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + executable: /bin/bash + args: + creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg + + - name: Create /etc/apt/sources.list.d/kubernetes.list file + copy: + content: | + deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v{{ _k8s_version }}/deb/ / + dest: /etc/apt/sources.list.d/kubernetes.list + + - name: Install k8s tools + include_role: + name: packages + vars: + packages: + system_remove: "{{k8s_tools}}" 
+ system: "{{k8s_tools}}" + + - name: Install helm (1/2) + ansible.builtin.get_url: + url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + dest: /tmp/get_helm.sh + mode: '0555' + - name: Install helm (2/2) + ansible.builtin.shell: "/tmp/get_helm.sh" + become: true diff --git a/roles/k8s/base/tasks/main.yaml b/roles/k8s/base/tasks/main.yaml new file mode 100644 index 0000000..d991814 --- /dev/null +++ b/roles/k8s/base/tasks/main.yaml @@ -0,0 +1,46 @@ +- name: Set parameters + ansible.builtin.set_fact: + _k8s_version: "{{ k8s.version | default(default_k8s_version) }}" + _kubelet_extra_args: "{{ hostvars[inventory_hostname]['KUBELET_EXTRA_ARGS'] | default({}) }}" + _name: "{{ hostvars[inventory_hostname]['name'] | default(inventory_hostname) }}" + _advertiseAddress: "{{ hostvars[inventory_hostname]['apiserver_advertise_address'] | default(omit) }}" + _bindPort: "{{ hostvars[inventory_hostname]['bindPort'] | default(k8s.bindPort) | default(default_k8s_bindPort) }}" + _controlPlaneBindPort: "{{ hostvars[groups['masters'][0]]['bindPort'] | default(k8s.bindPort) | default(default_k8s_bindPort) }}" + _criSocket: "{{ runtime[k8s.runtime | default(default_k8s_runtime)].cri.socket }}" + _serviceSubnet: "{{ k8s.serviceSubnet | default(default_k8s_serviceSubnet) }}" + _podSubnet: "{{ k8s.podSubnet | default(default_k8s_podSubnet) }}" + _dnsDomain: "{{ k8s.dnsDomain | default(default_k8s_dnsDomain) }}" + +- name: Determine control plane endpoint + ansible.builtin.set_fact: + _controlPlaneEndpoint: "{{ k8s.controlPlaneEndpoint | default( hostvars[groups['masters'][0]]['apiserver_advertise_address'] +':' + _controlPlaneBindPort) }}" + +- name: k8s version supported + ansible.builtin.assert: + fail_msg: Kubernetes {{ _k8s_version }} not supported + that: + - _k8s_version is version('1.28', '>=') + - _k8s_version is version('1.30', '<=') + +- block: # Must be root + - name: Disable swap (1/2) + # NOTE:XXX: it would be beneficial to also remove all swap 
partitions to + # release disk space as we won't use it anymore. + ansible.builtin.template: + src: startup.swapoff.service.j2 + dest: '/etc/systemd/system/startup.swapoff.service' + - name: Disable swap (2/2) + ansible.builtin.systemd: + name: startup.swapoff.service + state: restarted + enabled: yes + daemon_reload: yes + become: true + +- include_tasks: "install.yaml" + + +- name: Create ~/.kube + ansible.builtin.file: + path: "{{ ansible_env.HOME }}/.kube/" + state: directory \ No newline at end of file diff --git a/roles/k8s/base/templates/startup.swapoff.service.j2 b/roles/k8s/base/templates/startup.swapoff.service.j2 new file mode 100644 index 0000000..605c5f7 --- /dev/null +++ b/roles/k8s/base/templates/startup.swapoff.service.j2 @@ -0,0 +1,8 @@ +[Unit] +Description=Swap OFF + +[Service] +ExecStart=/usr/sbin/swapoff -a + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/roles/k8s/base/vars/main.yaml b/roles/k8s/base/vars/main.yaml new file mode 100644 index 0000000..1fbb118 --- /dev/null +++ b/roles/k8s/base/vars/main.yaml @@ -0,0 +1,2 @@ +k8s: + version: "1.29" \ No newline at end of file diff --git a/roles/k8s/create/README.md b/roles/k8s/create/README.md new file mode 100644 index 0000000..0cd8910 --- /dev/null +++ b/roles/k8s/create/README.md @@ -0,0 +1,55 @@ +# Create a kubernetes cluster + +Create a k8s cluster. Either calico or flannel CNI is installed and configured, +depending on the value of `k8s.cni_plugin`. Multus CNI is installed in the +cluster. + +Kubeconfig file with privileged credentials to access the cluster are added in +the file `${HOME}/.kube/config` on the host. + +> **WARNING**: the kubeconfig file provides full administrative access to the +> cluster, make sure it does not leak to unauthorized persons. + +## Depends +Role: `k8s/base` + +## Parameters +### Variables +#### Required + +#### Optionals +* `k8s.clusterName`: string. Name to give to the cluster. Default to + `default_k8s_clusterName`. 
+ +* `k8s.cni_plugin`: string. The CNI plugin to use for the cluster. Default to + `default_k8s_cni_plugin`. Supported values: `calico` or `flannel`. + +* `k8s.calico.nodeAddressAutodetectionV4`: dict. IPv4 autodetection used by + calico (see https://docs.tigera.io/calico/latest/networking/ipam/ip-autodetection). + Default to `default_k8s_calico_nodeAddressAutodetectionV4`. + +* `k8s.calico.encapsulation`: string. Encapsulation to be used by calico (see + https://docs.tigera.io/calico/latest/reference/installation/api#operator.tigera.io/v1.EncapsulationType). + Default to `default_k8s_calico_encapsulation`. + +* `k8s.calico.version`: string. Version of calico to be installed (see + https://github.com/projectcalico/calico/releases). Default to + `default_k8s_calico_version`. + +* `k8s.flannel.version`: string. Version of flannel to be installed (see + https://github.com/flannel-io/flannel/releases). Default to + `default_k8s_flannel_version`. + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults +* `default_k8s_clusterName`: 'kubernetes' +* `default_k8s_cni_plugin`: 'calico' +* `default_k8s_calico_encapsulation`: 'VXLAN' +* `default_k8s_calico_nodeAddressAutodetectionV4`: `kubernetes: NodeInternalIP` +* `default_k8s_calico_version`: 'v3.24.1' +* `default_k8s_flannel_version`: 'v0.24.4' + +## Returns diff --git a/roles/k8s/create/defaults/main.yaml b/roles/k8s/create/defaults/main.yaml new file mode 100644 index 0000000..c4d0d76 --- /dev/null +++ b/roles/k8s/create/defaults/main.yaml @@ -0,0 +1,7 @@ +default_k8s_clusterName: 'kubernetes' +default_k8s_cni_plugin: 'calico' +default_k8s_calico_encapsulation: 'VXLAN' +default_k8s_calico_nodeAddressAutodetectionV4: + kubernetes: NodeInternalIP +default_k8s_calico_version: v3.24.1 +default_k8s_flannel_version: v0.24.4 \ No newline at end of file diff --git a/roles/k8s/create/meta/main.yaml b/roles/k8s/create/meta/main.yaml new file mode 100644 index 0000000..ef8ee12 --- /dev/null +++ 
b/roles/k8s/create/meta/main.yaml @@ -0,0 +1,2 @@ +dependencies: + - role: k8s/base \ No newline at end of file diff --git a/roles/k8s/create/tasks/cni_calico.yaml b/roles/k8s/create/tasks/cni_calico.yaml new file mode 100644 index 0000000..2cbb05f --- /dev/null +++ b/roles/k8s/create/tasks/cni_calico.yaml @@ -0,0 +1,23 @@ +- name: Set parameters + ansible.builtin.set_fact: + _k8s_calico_nodeAddressAutodetectionV4: "{{ k8s.calico.nodeAddressAutodetectionV4 | default(default_k8s_calico_nodeAddressAutodetectionV4) }}" + _k8s_calico_encapsulation: "{{ k8s.calico.encapsulation | default(default_k8s_calico_encapsulation) }}" + _k8s_calico_version: "{{ k8s.calico.version | default(default_k8s_calico_version) }}" + +- name: Create tigera-operator namespace + kubernetes.core.k8s: + name: tigera-operator + api_version: v1 + kind: Namespace + state: present + +- name: Set Calico installation instructions + ansible.builtin.template: + src: tigera_operator.yaml.j2 + dest: tigera_operator.yaml + +- name: Register project-calico helm repo + shell: 'helm repo add projectcalico https://projectcalico.docs.tigera.io/charts' + +- name: Install Calico + shell: 'helm install calico projectcalico/tigera-operator --version {{ _k8s_calico_version }} -f tigera_operator.yaml --namespace tigera-operator' \ No newline at end of file diff --git a/roles/k8s/create/tasks/cni_flannel.yaml b/roles/k8s/create/tasks/cni_flannel.yaml new file mode 100644 index 0000000..17896de --- /dev/null +++ b/roles/k8s/create/tasks/cni_flannel.yaml @@ -0,0 +1,19 @@ +- name: Set parameters + ansible.builtin.set_fact: + _k8s_flannel_version: "{{ k8s.flannel.version | default(default_k8s_flannel_version) }}" + +- name: Create kube-flannel namespace + kubernetes.core.k8s: + name: kube-flannel + api_version: v1 + kind: Namespace + state: present + +- name: Set kube-flannel privileges + shell: kubectl label --overwrite ns kube-flannel pod-security.kubernetes.io/enforce=privileged + +- name: Register kube-flannel helm 
repo + shell: 'helm repo add flannel https://flannel-io.github.io/flannel/' + +- name: Install kube-flannel + shell: helm install flannel --set podCidr="{{ _podSubnet }}" --version {{ _k8s_flannel_version }} --namespace kube-flannel flannel/flannel \ No newline at end of file diff --git a/roles/k8s/create/tasks/main.yaml b/roles/k8s/create/tasks/main.yaml new file mode 100644 index 0000000..282b340 --- /dev/null +++ b/roles/k8s/create/tasks/main.yaml @@ -0,0 +1,58 @@ +- name: Set parameters + ansible.builtin.set_fact: + _k8s_cni_plugin: "{{ k8s.cni_plugin | default(default_k8s_cni_plugin) }}" + _k8s_clusterName: "{{ k8s.clusterName | default(default_k8s_clusterName) }}" + +- name: Create a certificate key + shell: 'kubeadm certs certificate-key | tee certificate-key.pass' + register: kubeadm_certificate_key + +- name: Create kubeadm configuration + ansible.builtin.template: + src: kubeadm_config.yaml.j2 + dest: ./kubeadm_config.yaml + vars: + - certificate_key: '{{ kubeadm_certificate_key.stdout_lines[0] }}' + - kubelet_extra_args: "{{ {'enable-controller-attach-detach': 'false'} | combine(_kubelet_extra_args) }}" +- name: Reset k8s + shell: "kubeadm reset -f --cri-socket {{ _criSocket }}" + become: true + +- name: Initialize the master node + shell: 'kubeadm init --upload-certs --config kubeadm_config.yaml' + register: kubeadm_init + become: true + +- name: Create kubeconfig directory + file: + path: "{{ ansible_env.HOME }}/.kube" + state: directory + mode: "0755" + owner: "{{ ansible_user }}" + # group: "{{ ansible_user }}" +- name: Copy kubeconfig file + ansible.builtin.copy: + src: /etc/kubernetes/admin.conf + dest: "{{ ansible_env.HOME }}/.kube/config" + owner: "{{ ansible_user }}" + # group: "{{ ansible_user }}" + remote_src: yes + become: true + +- name: Deploy CNI + include_tasks: "cni_{{_k8s_cni_plugin}}.yaml" + +- name: Retrieve multus + ansible.builtin.git: + repo: https://github.com/k8snetworkplumbingwg/multus-cni.git + dest: multus-cni + version: 
v4.0.1 + force: yes + +- name: Install multus + kubernetes.core.k8s: + state: present + src: multus-cni/deployments/multus-daemonset.yml + +- name: Install Node Feature Discovery + shell: kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.15.4 diff --git a/roles/k8s/create/templates/kubeadm_config.yaml.j2 b/roles/k8s/create/templates/kubeadm_config.yaml.j2 new file mode 100644 index 0000000..2b19458 --- /dev/null +++ b/roles/k8s/create/templates/kubeadm_config.yaml.j2 @@ -0,0 +1,31 @@ +apiVersion: kubeadm.k8s.io/v1beta3 +kind: InitConfiguration +nodeRegistration: + name: "{{ _name }}" + criSocket: "{{ _criSocket }}" + kubeletExtraArgs: + {{ kubelet_extra_args }} +localAPIEndpoint: + bindPort: {{ _bindPort | int }} +{% if _advertiseAddress is defined %} + advertiseAddress: "{{ _advertiseAddress }}" +{% endif %} +certificateKey: "{{ certificate_key }}" +--- +apiVersion: kubeadm.k8s.io/v1beta3 +kind: ClusterConfiguration +clusterName: "{{ _k8s_clusterName }}" +networking: + serviceSubnet: "{{ _serviceSubnet }}" + podSubnet: "{{ _podSubnet }}" + dnsDomain: "{{ _dnsDomain }}" +{% if _controlPlaneEndpoint is defined %} +controlPlaneEndpoint: "{{ _controlPlaneEndpoint }}" +{% endif %} +--- +apiVersion: kubelet.config.k8s.io/v1beta1 +kind: KubeletConfiguration +# kubelet specific options here +kind: KubeletConfiguration +apiVersion: kubelet.config.k8s.io/v1beta1 +cgroupDriver: systemd diff --git a/roles/k8s/create/templates/tigera_operator.yaml.j2 b/roles/k8s/create/templates/tigera_operator.yaml.j2 new file mode 100644 index 0000000..607fd25 --- /dev/null +++ b/roles/k8s/create/templates/tigera_operator.yaml.j2 @@ -0,0 +1,12 @@ +installation: + cni: + type: Calico + serviceCIDRs: + - {{ _serviceSubnet }} + calicoNetwork: + bgp: Disabled + nodeAddressAutodetectionV4: + {{ _k8s_calico_nodeAddressAutodetectionV4| to_yaml}} + ipPools: + - cidr: '{{ _podSubnet }}' + encapsulation: {{ _k8s_calico_encapsulation }} diff 
--git a/roles/k8s/join/README.md b/roles/k8s/join/README.md new file mode 100644 index 0000000..fcb6eef --- /dev/null +++ b/roles/k8s/join/README.md @@ -0,0 +1,67 @@ +# Join a kubernetes cluster + +Join a k8s cluster created with the role `k8s/create`. A kubeconfig file with +privileged credentials to access the cluster is added in the file +`${HOME}/.kube/config` on the host. + +The following authentication information must be provided: +* the k8s bootstrap token (see `k8s/auth` for details). +* the hash of the cluster CA certificate (see `k8s/auth` for details). +* the control-plane certificate key (see `k8s/auth` for details). +* path where to find the *kubeconfig* file on the Ansible + control node (see `k8s/auth` for details). + +For details on these parameters, see the `k8s/auth` role documentation. The +easiest way to have the parameters right is to run the `k8s/auth` role +beforehand. + +In addition, the API address of the cluster must be provided. + +> **WARNING**: the kubeconfig file provides full administrative access to the +> cluster, make sure it does not leak to unauthorized persons. + +## Depends +Role: `k8s/base` + +> **WARNING**: It is assumed that the cluster to join has been created with the +> role `k8s/create`. + +## Parameters +### Variables +#### Required +* `token`: the k8s bootstrap token (see `k8s/auth` for details). + +* `ca_cert_hash`: the hash of the cluster CA certificate (see `k8s/auth` for details). + +* `certificate_key`: the control-plane certificate key (see `k8s/auth` for details). + +* `kube_config_local_path`: path where to find the *kubeconfig* file on the Ansible + control node (see `k8s/auth` for details). + +* `master`: string. API address of the cluster to join. 
+ +If the `k8s/auth` role is run on a control-plane node of the cluster +beforehand, an easy way to get the variables to their right values is to define them +with + +```yaml +token: "{{ hostvars['ansible_dummy_host']['_token'] }}" +ca_cert_hash: "{{ hostvars['ansible_dummy_host']['_ca_cert_hash'] }}" +certificate_key: "{{ hostvars['ansible_dummy_host']['_certificate_key'] }}" +kube_config_local_path: "{{ hostvars['ansible_dummy_host']['_kube_config'] }}" +``` + +#### Optionals + * `control_plane_node`: bool. If true, the node is added to the cluster with + the control-plane role. Otherwise, it is added as a worker node. Default + to false. + + * `k8s.controlPlaneEndpoint`: IP or domain name to the API server from which + information will be fetched (see + https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/#kubeadm-k8s-io-v1beta3-BootstrapTokenDiscovery). Ignored if not defined. + +### Inventory hostvars +#### Required +#### Optionals + +## Returns diff --git a/roles/k8s/join/meta/main.yaml b/roles/k8s/join/meta/main.yaml new file mode 100644 index 0000000..ef8ee12 --- /dev/null +++ b/roles/k8s/join/meta/main.yaml @@ -0,0 +1,2 @@ +dependencies: + - role: k8s/base \ No newline at end of file diff --git a/roles/k8s/join/tasks/main.yaml b/roles/k8s/join/tasks/main.yaml new file mode 100644 index 0000000..958ef04 --- /dev/null +++ b/roles/k8s/join/tasks/main.yaml @@ -0,0 +1,33 @@ +--- +- name: Set parameters + ansible.builtin.set_fact: + _is_control_plane_node: "{{ control_plane_node | default(false) }}" + _controlPlaneEndpoint: "{{ k8s.controlPlaneEndpoint | default(omit) }}" + +- name: Copy Kube config + ansible.builtin.copy: + src: '{{ kube_config_local_path }}' + dest: ~/.kube/config + +- name: Create kubeadm configuration + ansible.builtin.template: + src: kubeadm_config.yaml.j2 + dest: ./kubeadm_config.yaml + vars: + - kubelet_extra_args: "{{ {'enable-controller-attach-detach': 'false'} | combine(_kubelet_extra_args) }}" + +- name: Reset 
k8s + shell: "kubeadm reset -f --cri-socket {{ _criSocket }}" + become: true + +- name: Retrieve PKI from master (1/2) + ansible.builtin.file: + path: /etc/kubernetes/pki/ + state: directory + mode: "0755" + when: _is_control_plane_node + become: true + +- name: Join k8s cluster + ansible.builtin.shell: 'kubeadm join --config ./kubeadm_config.yaml' + become: true \ No newline at end of file diff --git a/roles/k8s/join/templates/kubeadm_config.yaml.j2 b/roles/k8s/join/templates/kubeadm_config.yaml.j2 new file mode 100644 index 0000000..3913c4b --- /dev/null +++ b/roles/k8s/join/templates/kubeadm_config.yaml.j2 @@ -0,0 +1,22 @@ +apiVersion: kubeadm.k8s.io/v1beta3 +kind: JoinConfiguration +nodeRegistration: + name: "{{ _name }}" + criSocket: "{{ _criSocket }}" + kubeletExtraArgs: + {{ kubelet_extra_args }} +discovery: + bootstrapToken: + apiServerEndpoint: "{{ _controlPlaneEndpoint }}" + token: '{{ token }}' + caCertHashes: + - '{{ ca_cert_hash }}' +{% if _is_control_plane_node | default(false) | bool %} +controlPlane: + certificateKey: "{{ certificate_key }}" + localAPIEndpoint: + bindPort: {{ _bindPort | int }} +{%if _advertiseAddress is defined %} + advertiseAddress: "{{ _advertiseAddress }}" +{% endif %} +{% endif %} diff --git a/roles/k8s/metallb/README.md b/roles/k8s/metallb/README.md new file mode 100644 index 0000000..009037b --- /dev/null +++ b/roles/k8s/metallb/README.md @@ -0,0 +1,31 @@ +# Install and configure MetalLB on a kubernetes cluster + +Install MetalLB in the kubernetes cluster and configure it to run in layer 2 +mode. An IPAddressPool named `blueprint-pool` is created with all IPs provided +in `k8s.metallb.addresses`. + +## Depends + +This role must be run on the machine that has `kubectl` installed and configured +to get admin access to the kubernetes cluster. + +> **NOTE**: The kubernetes cluster must have at least one node where workload +> can be scheduled. 
+ +## Parameters +### Variables +#### Required + +* `k8s.metallb.addresses`: list of addresses to be added to the IP address pool. +Addresses can be defined by CIDR, by range, and both IPv4 and IPv6 addresses +can be assigned (see https://metallb.universe.tf/configuration/ for details). + +#### Optionals + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults + +## Returns diff --git a/roles/k8s/metallb/tasks/main.yaml b/roles/k8s/metallb/tasks/main.yaml new file mode 100644 index 0000000..aed23de --- /dev/null +++ b/roles/k8s/metallb/tasks/main.yaml @@ -0,0 +1,19 @@ +- name: Retrieve metallb + ansible.builtin.get_url: + url: https://raw.githubusercontent.com/metallb/metallb/v0.14.4/config/manifests/metallb-native.yaml + dest: ./metallb-native.yaml + +- name: Install metallb + shell: 'kubectl apply -f metallb-native.yaml' + +- name: Create metallb configuration + ansible.builtin.template: + src: metallb.yaml.j2 + dest: ./metallb.yaml + +- name: Wait for metallb to be ready + ansible.builtin.shell: 'kubectl wait --timeout=300s -n metallb-system --for=condition=Ready $(kubectl get pod -o name -n metallb-system)' + +- name: Setup metallb pool + shell: 'kubectl apply -f metallb.yaml' + diff --git a/roles/k8s/metallb/templates/metallb.yaml.j2 b/roles/k8s/metallb/templates/metallb.yaml.j2 new file mode 100644 index 0000000..024700b --- /dev/null +++ b/roles/k8s/metallb/templates/metallb.yaml.j2 @@ -0,0 +1,16 @@ +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + name: blueprint-pool + namespace: metallb-system +spec: + addresses: +{% for address in k8s.metallb.addresses %} + - {{ address }} +{% endfor %} +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + name: blueprint-l2adv + namespace: metallb-system diff --git a/roles/kernel/README.md b/roles/kernel/README.md new file mode 100644 index 0000000..eba665d --- /dev/null +++ b/roles/kernel/README.md @@ -0,0 +1,48 @@ +# Setup kernel + +This role customizes the kernel. 
+ +It is used to +* load kernel modules; +* set kernel attributes; +* set PAM limits; +* install and load a specific kernel version. + +All changes are persistent such that customizations remain to their new values +even after reboot. + +> **WARNING**: if a different kernel version than the currently running one is +> requested the system will reboot. + +## Parameters +### Variables +#### Required +* `kernel.modules`: list of modules to load. Default to `default_kernel_modules`. + +* `kernel.attributes`: list of kernel attributes to set. Default to `default_kernel_attributes`. + +* `kernel.limits`: list of PAM limits to set. Default to `default_kernel_limits`. + +#### Optionals +* `kernel.hugepage.number`: number of hugepages. If absent, it is ignored. + +* `install_custom_kernel`: boolean. Determine if a custom kernel needs to be +installed and configured. If absent, it is ignored. + +### Inventory hostvars +#### Required +#### Optionals +* `kernel.image`: string. Kernel version to install. For example, to install the +`linux-image-6.5.0-44-lowlatency` kernel, set this to `6.5.0-44-lowlatency`. The +kernel version must be available on the package repository. If absent, it is +ignored. + +* `kernel.GRUB_CMDLINE_LINUX_DEFAULT`: string. GRUB linux command line (see +https://help.ubuntu.com/community/Grub2/Setup). If absent, it is ignored. 
+ +## Defaults +* `default_kernel_modules`: [] + +* `default_kernel_attributes`: [] + +* `default_kernel_limits`: [] \ No newline at end of file diff --git a/roles/kernel/defaults/main.yaml b/roles/kernel/defaults/main.yaml new file mode 100644 index 0000000..a95ccf4 --- /dev/null +++ b/roles/kernel/defaults/main.yaml @@ -0,0 +1,7 @@ +default_kernel_modules: [] +default_kernel_attributes: [] +default_kernel_limits: [] + +# kernel: +# hugepage: +# number: 0 \ No newline at end of file diff --git a/roles/kernel/tasks/custom_kernel.yaml b/roles/kernel/tasks/custom_kernel.yaml new file mode 100644 index 0000000..617bf84 --- /dev/null +++ b/roles/kernel/tasks/custom_kernel.yaml @@ -0,0 +1,33 @@ +- block: # must be root + - block: + - name: Install kernel + include_role: + name: packages + vars: + packages: + system: + - linux-image-{{ hostvars[inventory_hostname]['kernel']['image'] }} + + - name: Shall reboot + ansible.builtin.set_fact: + _reboot: true + when: hostvars[inventory_hostname]['kernel']['image'] not in ansible_kernel + + - block: + - name: Modify GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub + ansible.builtin.lineinfile: + path: /etc/default/grub + regexp: '^GRUB_CMDLINE_LINUX_DEFAULT=' + line: GRUB_CMDLINE_LINUX_DEFAULT="{{ hostvars[inventory_hostname]['kernel']['GRUB_CMDLINE_LINUX_DEFAULT'] }}" + backup: yes + register: grub + when: hostvars[inventory_hostname]['kernel']['GRUB_CMDLINE_LINUX_DEFAULT'] is defined + + - block: + - name: Update grub + shell: update-grub + - name: Shall reboot + ansible.builtin.set_fact: + _reboot: true + when: grub.changed | default(false) + become: true \ No newline at end of file diff --git a/roles/kernel/tasks/main.yaml b/roles/kernel/tasks/main.yaml new file mode 100644 index 0000000..a285f1c --- /dev/null +++ b/roles/kernel/tasks/main.yaml @@ -0,0 +1,61 @@ +- name: Set parameters + ansible.builtin.set_fact: + _kernel_modules: "{{ kernel.modules | default(default_kernel_modules) }}" + _kernel_attributes: "{{ 
kernel.attributes | default(default_kernel_attributes) }}" + _kernel_limits: "{{ kernel.limits | default(default_kernel_limits) }}" + _install_custom_kernel: "{{ hostvars[inventory_hostname]['kernel']['image'] is defined }}" # boolean: true only when the host sets kernel.image + +- include_tasks: custom_kernel.yaml + when: _install_custom_kernel | bool + +- block: # must be root + - name: Load kernel modules + community.general.modprobe: + name: '{{ item }}' + state: present + persistent: present + with_items: "{{ _kernel_modules }}" + + - name: Set kernel attributes + ansible.posix.sysctl: + name: '{{ item.option }}' + value: '{{ item.value }}' + sysctl_set: yes + state: present + reload: yes + with_items: "{{ _kernel_attributes }}" + + - name: Set PAM limits + community.general.pam_limits: + domain: '{{ item.domain }}' + limit_type: '{{ item.type }}' + limit_item: '{{ item.item }}' + value: '{{ item.value }}' + with_items: "{{ _kernel_limits }}" + + - name: Setting hugepages + ansible.posix.sysctl: + name: vm.nr_hugepages + value: "{{ kernel.hugepage.number }}" + sysctl_set: yes + state: present + reload: yes + when: kernel.hugepage.number is defined # guard on the exact value used above + + - block: + - name: Reboot the system + reboot: + post_reboot_delay: 120 + connect_timeout: 5 + reboot_timeout: 400 + + - name: Wait for the system to come back online + wait_for_connection: + connect_timeout: 100 + sleep: 5 + delay: 5 + timeout: 300 + when: _reboot | default(false) # _reboot is set by custom_kernel.yaml when the kernel or GRUB changed + + become: true + \ No newline at end of file diff --git a/roles/packages/README.md b/roles/packages/README.md new file mode 100644 index 0000000..7bb2a97 --- /dev/null +++ b/roles/packages/README.md @@ -0,0 +1,37 @@ +# Linux and Python packages + +Manage system and python packages. This role allows to install or remove: +* system packages; +* python packages. + +Packages are installed system wide and accessible to all users. + +## Parameters +### Variables +#### Required +* `packages.system_remove`: system packages to be removed on the entire + system before installing the rest. 
List of package names (need to be the name + as supported by the distribution). Version can be added to the name if a + specific one has to be removed. + +* `packages.system`: system packages to be installed on the entire system. + List of package names (need to be the name as supported by the distribution). + Version can be added to the name if a specific one has to be installed. + +* `packages.python_remove`: python packages to be removed on the entire system + before installing the rest. List of package names (need to be the name as + supported by the python installation). Version can be added to the name if a + specific one has to be removed. + +* `packages.python`: python packages to be installed on the entire system. + List of package names (need to be the name as supported by the python + installation). Version can be added to the name if a specific one has to be + installed. + +#### Optionals + +### Inventory hostvars +#### Required +#### Optionals + +## Defaults diff --git a/roles/packages/tasks/main.yaml b/roles/packages/tasks/main.yaml new file mode 100644 index 0000000..1086c45 --- /dev/null +++ b/roles/packages/tasks/main.yaml @@ -0,0 +1,52 @@ +--- +- block: # must be root + - block: + - name: Remove package (system) + ansible.builtin.package: + name: "{{ item }}" + state: absent + with_items: "{{ packages.system_remove }}" + when: packages.system_remove is defined and packages.system_remove | length > 0 + + - block: + - name: Remove package (Python) + ansible.builtin.pip: + name: "{{item}}" + break_system_packages: true + state: absent + with_items: "{{ packages.python_remove }}" + when: packages.python_remove is defined and packages.python_remove | length > 0 + + - block: # system packages + - name: Gather package (system) + ansible.builtin.package_facts: + manager: auto + + ## OS specific + - name: Update Package Cache (apt/Ubuntu/Debian) + ansible.builtin.apt: + update_cache: yes + changed_when: false + when: ansible_distribution == "Ubuntu" or 
ansible_distribution == "Debian" + + ## Generic installation + - name: Install package (system) + ansible.builtin.package: + name: "{{ item }}" + state: present + with_items: "{{ packages.system }}" + when: packages.system is defined and packages.system | length > 0 + + - block: # python packages + - name: Gather package (Python) + community.general.pip_package_info: + clients: [pip3, pip] + register: installed_pip_packages + + - name: Install package (Python) + ansible.builtin.pip: + name: "{{item}}" + break_system_packages: true + with_items: "{{ packages.python }}" + when: packages.python is defined and packages.python | length > 0 + become: yes diff --git a/roles/post-5g-bp-cluster-monitoring/README.md b/roles/post-5g-bp-cluster-monitoring/README.md new file mode 100644 index 0000000..8791b8d --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/README.md @@ -0,0 +1,73 @@ +# Blueprint Monitoring Charts + +This repository contains Ansible scripts and configurations for setting up a comprehensive monitoring solution using Prometheus, Promtail, Node Exporter, Cadvisor, and Kube State Metrics. + +## Table of Contents + +- [Project Structure](#project-structure) +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [Usage](#usage) +- [Contributing](#contributing) +- [License](#license) + +## Project Structure + +The repository is structured as follows: +``` +blueprint-monitoring-charts/ +├── prometheus/ +├── promtail/ +├── node-exporter/ +├── cadvisor/ +├── kube-state-metrics/ +├── bp-monitoring.yml +└── hosts +``` + +- **prometheus/**: Contains configurations and setup files for Prometheus. +- **promtail/**: Contains configurations and setup files for Promtail. +- **node-exporter/**: Contains configurations and setup files for Node Exporter. +- **cadvisor/**: Contains configurations and setup files for cAdvisor. +- **kube-state-metrics/**: Contains configurations and setup files for Kube State Metrics. 
+- **bp-monitoring.yml**: Ansible playbook for deploying the monitoring stack. +- **hosts**: Inventory file for Ansible. + +## Prerequisites + +Before you begin, ensure you have met the following requirements: + +- Ansible installed on your local machine. +- Access to the servers where you want to deploy the monitoring stack. +- SSH access to the target servers configured in the `hosts` file. +- K8s is installed and configured on the target servers. The deployments will be done in the "default" namespace + +## Installation + +1. Clone the repository to your local machine and cd to the directory: + +```bash +cd blueprint-monitoring-charts +``` + +2. Update the hosts file with the target server details. The servers should be reachable from the deployment node and accessible without requiring password (e.g. rsa-key based access) + +3. Update the respective vars files for each ansible role. + - prometheus/vars/main.yml +``` +remote_write_address: "10.64.45.85" # The address where a central prometheus instance is configured for collecting information from all the clusters +remote_write_port: "9090" # The port on which the central prometheus instance is listening to +remote_write_user: "admin" # The username used to access the remote prometheus instance +remote_write_pass: "test1234" # The password used to access the remote prometheus instance +remote_data_label: "uth" # A label for annotating the data generated within the k8s cluster, before pushing them to the central prometheus instance +``` + - promtail/vars/main.yml +``` +loki_address: 10.64.45.125 # A central Grafana LOKI address used to collect all the logs from the different clusters +loki_port: 3100 # The port on which the central LOKI instance is listening to +``` + +4. 
Run the Ansible playbook to deploy the monitoring stack +``` +ansible-playbook -i hosts bp-monitoring.yml +``` diff --git a/roles/post-5g-bp-cluster-monitoring/cadvisor/tasks/main.yml b/roles/post-5g-bp-cluster-monitoring/cadvisor/tasks/main.yml new file mode 100644 index 0000000..47c6a38 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/cadvisor/tasks/main.yml @@ -0,0 +1,13 @@ +- name: Create directories for node cadvisor + ansible.builtin.file: + path: cadvisor + state: directory + mode: 0700 + +- name: Apply cAdvisor deployment + template: + src: cadvisor.yaml.j2 + dest: cadvisor/cadvisor-deployment.yaml + +- name: Apply cAdvisor YAML to the cluster + shell: kubectl apply -f cadvisor/cadvisor-deployment.yaml diff --git a/roles/post-5g-bp-cluster-monitoring/cadvisor/templates/cadvisor.yaml.j2 b/roles/post-5g-bp-cluster-monitoring/cadvisor/templates/cadvisor.yaml.j2 new file mode 100644 index 0000000..3827fdd --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/cadvisor/templates/cadvisor.yaml.j2 @@ -0,0 +1,97 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cadvisor + labels: + app: cadvisor +spec: + replicas: 1 + selector: + matchLabels: + app: cadvisor + template: + metadata: + labels: + app: cadvisor + spec: + containers: + - name: cadvisor + image: gcr.io/cadvisor/cadvisor:v0.49.1 + ports: + - containerPort: 8080 + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - name: rootfs + mountPath: /rootfs + readOnly: true + - name: var-run + mountPath: /var/run + readOnly: true + - name: sys + mountPath: /sys + readOnly: true + - name: docker + mountPath: /var/lib/docker + readOnly: true + volumes: + - name: rootfs + hostPath: + path: / + - name: var-run + hostPath: + path: /var/run + - name: sys + hostPath: + path: /sys + - name: docker + hostPath: + path: /var/lib/docker +--- +apiVersion: v1 +kind: Service +metadata: + name: cadvisor + labels: + app: cadvisor +spec: + ports: + - name: 
cadvisor + port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app: cadvisor +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: cadvisor + labels: + release: prometheus +spec: + selector: + matchLabels: + app: cadvisor + endpoints: + - port: cadvisor + interval: 15s +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus + labels: +spec: + selector: + matchLabels: + namespaceSelector: + any: true + endpoints: + - path: /metrics/cadvisor + - port: cadvisor diff --git a/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/tasks/main.yml b/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/tasks/main.yml new file mode 100644 index 0000000..2e3ecf2 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/tasks/main.yml @@ -0,0 +1,13 @@ +- name: Create directories for kube state metrics + ansible.builtin.file: + path: kube_state_metrics + state: directory + mode: 0700 + +- name: Apply Kube-State-Metrics deployment + template: + src: kube-state-metrics.yaml.j2 + dest: kube_state_metrics/kube-state-metrics-deployment.yaml + +- name: Apply Kube-State-Metrics YAML to the cluster + shell: kubectl apply -f kube_state_metrics/kube-state-metrics-deployment.yaml diff --git a/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/templates/kube-state-metrics.yaml.j2 b/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/templates/kube-state-metrics.yaml.j2 new file mode 100644 index 0000000..7faf666 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/kube-state-metrics/templates/kube-state-metrics.yaml.j2 @@ -0,0 +1,128 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: default + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: 
kube-state-metrics + image: quay.io/coreos/kube-state-metrics:v2.0.0-rc.0 + ports: + - name: ksm-port + containerPort: 8080 + - name: telemetry + containerPort: 8081 +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: default + labels: + app: kube-state-metrics +spec: + ports: + - name: ksm-port + port: 8080 + targetPort: ksm-port + selector: + app: kube-state-metrics +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: [""] + resources: + - pods + - nodes + - namespaces + - replicationcontrollers + - resourcequotas + - services + - endpoints + - persistentvolumeclaims + - persistentvolumes + - events + - configmaps + - secrets + verbs: ["list", "watch"] +- apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: ["list", "watch"] +- apiGroups: ["apps"] + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] +- apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: default +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-state-metrics + namespace: default + labels: + release: prometheus +spec: + selector: + matchLabels: + 
app: kube-state-metrics + endpoints: + - port: ksm-port + interval: 30s diff --git a/roles/post-5g-bp-cluster-monitoring/node-exporter/tasks/main.yml b/roles/post-5g-bp-cluster-monitoring/node-exporter/tasks/main.yml new file mode 100644 index 0000000..7337719 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/node-exporter/tasks/main.yml @@ -0,0 +1,13 @@ +- name: Create directories for node exporter + ansible.builtin.file: + path: node_exporter + state: directory + mode: 0700 + +- name: Apply Node Exporter DaemonSet + template: + src: node-exporter.yaml.j2 + dest: node_exporter/node-exporter.yaml + +- name: Apply Node Exporter YAML to the cluster + shell: kubectl apply -f node_exporter/node-exporter.yaml diff --git a/roles/post-5g-bp-cluster-monitoring/node-exporter/templates/node-exporter.yaml.j2 b/roles/post-5g-bp-cluster-monitoring/node-exporter/templates/node-exporter.yaml.j2 new file mode 100644 index 0000000..bd5cf6e --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/node-exporter/templates/node-exporter.yaml.j2 @@ -0,0 +1,103 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: node-exporter + name: node-exporter + namespace: default +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "true" + labels: + app: node-exporter + spec: + containers: + - args: + - --web.listen-address=0.0.0.0:9100 + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + image: quay.io/prometheus/node-exporter:v1.8.1 + imagePullPolicy: IfNotPresent + name: node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 + name: metrics + protocol: TCP + resources: + limits: + cpu: 200m + memory: 50Mi + requests: + cpu: 100m + memory: 30Mi + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: true + - mountPath: /host/sys + name: sys + readOnly: true + hostNetwork: true + hostPID: true + restartPolicy: Always + tolerations: + - effect: NoSchedule + 
operator: Exists + - effect: NoExecute + operator: Exists + volumes: + - hostPath: + path: /proc + type: "" + name: proc + - hostPath: + path: /sys + type: "" + name: sys +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: node-exporter + name: node-exporter + namespace: default +spec: + ports: + - name: node-exporter + port: 9100 + protocol: TCP + targetPort: 9100 + selector: + app: node-exporter + sessionAffinity: None + type: ClusterIP +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app: node-exporter + serviceMonitorSelector: prometheus + name: node-exporter + namespace: default +spec: + endpoints: + - honorLabels: true + interval: 30s + path: /metrics + targetPort: 9100 + jobLabel: node-exporter + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + app: node-exporter diff --git a/roles/post-5g-bp-cluster-monitoring/prometheus/tasks/main.yml b/roles/post-5g-bp-cluster-monitoring/prometheus/tasks/main.yml new file mode 100644 index 0000000..74bfcd9 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/prometheus/tasks/main.yml @@ -0,0 +1,33 @@ +- name: Create directories for Prometheus + ansible.builtin.file: + path: prometheus + state: directory + mode: 0700 + +- name: Download CRDs + ansible.builtin.get_url: + url: https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/master/bundle.yaml + dest: prometheus/bundle.yaml + mode: '0644' + +- name: Apply Prometheus deployment + template: + src: prometheus.yaml.j2 + dest: prometheus/prometheus.yaml + +- name: Make sure that no secrets exist for accessing prometheus + shell: kubectl delete secret kubepromsecret --ignore-not-found + ignore_errors: true + +- name: Create secrets for accessing remote write prometheus + shell: kubectl create secret generic kubepromsecret --from-literal=username={{ remote_write_user | quote }} --from-literal=password={{ remote_write_pass | quote }} + +- name: Make sure CRDs do not exist + ansible.builtin.shell: kubectl delete -f 
prometheus/bundle.yaml + ignore_errors: true + +- name: Create CRDs + ansible.builtin.shell: kubectl create -f prometheus/bundle.yaml + +- name: Apply Prometheus YAML to the cluster + shell: kubectl apply -f prometheus/prometheus.yaml diff --git a/roles/post-5g-bp-cluster-monitoring/prometheus/templates/prometheus.yaml.j2 b/roles/post-5g-bp-cluster-monitoring/prometheus/templates/prometheus.yaml.j2 new file mode 100644 index 0000000..52aa18e --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/prometheus/templates/prometheus.yaml.j2 @@ -0,0 +1,107 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +#apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus + labels: + app: prometheus +spec: + image: quay.io/prometheus/prometheus:v2.53.0 + nodeSelector: + kubernetes.io/os: linux + replicas: 2 + resources: + requests: + memory: 400Mi + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: prometheus + version: v2.53.0 + serviceMonitorSelector: {} + remoteWrite: + - url: "http://{{ remote_write_address }}:{{ remote_write_port }}/api/v1/write" + basicAuth: + username: + name: kubepromsecret + key: username + password: + name: 
kubepromsecret + key: password + replicaExternalLabelName: "__replica__" + externalLabels: + cluster: {{ remote_data_label }} +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + labels: + app: prometheus +spec: + ports: + - name: web + port: 9090 + targetPort: web + selector: + app.kubernetes.io/name: prometheus + sessionAffinity: ClientIP +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus-self + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + port: web + selector: + matchLabels: + app: prometheus +--- diff --git a/roles/post-5g-bp-cluster-monitoring/promtail/tasks/main.yml b/roles/post-5g-bp-cluster-monitoring/promtail/tasks/main.yml new file mode 100644 index 0000000..1326b27 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/promtail/tasks/main.yml @@ -0,0 +1,13 @@ +- name: Create directories for node promtail + ansible.builtin.file: + path: promtail + state: directory + mode: 0700 + +- name: Apply Promtail DaemonSet + template: + src: promtail.yaml.j2 + dest: promtail/promtail-daemonset.yaml + +- name: Apply Promtail DaemonSet YAML to the cluster + shell: kubectl apply -f promtail/promtail-daemonset.yaml diff --git a/roles/post-5g-bp-cluster-monitoring/promtail/templates/promtail.yaml.j2 b/roles/post-5g-bp-cluster-monitoring/promtail/templates/promtail.yaml.j2 new file mode 100644 index 0000000..75d1ec4 --- /dev/null +++ b/roles/post-5g-bp-cluster-monitoring/promtail/templates/promtail.yaml.j2 @@ -0,0 +1,138 @@ +--- # Daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: promtail-daemonset +spec: + selector: + matchLabels: + name: promtail + template: + metadata: + labels: + name: promtail + spec: + serviceAccount: promtail-serviceaccount + containers: + - name: promtail-container + image: grafana/promtail + args: + - -config.file=/etc/promtail/promtail.yaml + env: + - name: 'HOSTNAME' # needed when using kubernetes_sd_configs + valueFrom: + fieldRef: + 
fieldPath: 'spec.nodeName' + - name: 'CLUSTER_NAME' + value: '{{ loki_label }}' + volumeMounts: + - name: logs + mountPath: /var/log + - name: promtail-config + mountPath: /etc/promtail + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + volumes: + - name: logs + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: promtail-config + configMap: + name: promtail-config +--- # configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: promtail-config +data: + promtail.yaml: | + server: + http_listen_port: 9080 + grpc_listen_port: 0 + + clients: + - url: http://{{ loki_address }}:{{ loki_port }}/loki/api/v1/push + + positions: + filename: /tmp/positions.yaml + target_config: + sync_period: 10s + scrape_configs: + - job_name: pod-logs + kubernetes_sd_configs: + - role: pod + pipeline_stages: + - docker: {} + - labels: + cluster: ${CLUSTER_NAME} + relabel_configs: + - source_labels: + - __meta_kubernetes_pod_node_name + target_label: __host__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + replacement: $1 + separator: / + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_pod_name + target_label: job + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: replace + source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - replacement: /var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + +--- # Clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: promtail-clusterrole +rules: + - apiGroups: [""] + resources: + - nodes + - services + - pods + verbs: + - get + - watch + - list + +--- # ServiceAccount.yaml 
+apiVersion: v1 +kind: ServiceAccount +metadata: + name: promtail-serviceaccount + +--- # Rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: promtail-clusterrolebinding +subjects: + - kind: ServiceAccount + name: promtail-serviceaccount + namespace: default +roleRef: + kind: ClusterRole + name: promtail-clusterrole + apiGroup: rbac.authorization.k8s.io -- GitLab