如何通过NVIDIA GPU Exporter、Prometheus和Grafana来监测GPU性能
最编程
2024-01-23 14:32:54
...
# vim /etc/systemd/system/nvidia_gpu_exporter.service
# systemd unit that runs nvidia_gpu_exporter as a long-lived, sandboxed service.
[Unit]
Description=Nvidia GPU Exporter
# After= only orders startup; Wants= is what actually pulls network-online.target
# into the boot transaction, so the two are used together.
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
# Run under a dedicated unprivileged account; the user and group must already exist.
User=nvidia_gpu_exporter
Group=nvidia_gpu_exporter
ExecStart=/usr/local/bin/nvidia_gpu_exporter
SyslogIdentifier=nvidia_gpu_exporter
# Restart on every exit, waiting one second between attempts.
Restart=always
RestartSec=1
# Sandboxing: no privilege escalation, no access to home directories,
# read-only file system hierarchy, and protected kernel interfaces.
NoNewPrivileges=yes
ProtectHome=yes
ProtectSystem=strict
ProtectControlGroups=true
ProtectKernelModules=true
ProtectKernelTunables=yes
ProtectHostname=yes
ProtectKernelLogs=yes
# ProtectProc= is not a boolean: it accepts noaccess, invisible, ptraceable or
# default. "yes" fails to parse and the setting is ignored, so use "invisible"
# to hide other users' processes in /proc. Requires systemd 247 or newer;
# older versions ignore the directive as unknown.
ProtectProc=invisible
[Install]
WantedBy=multi-user.target
# systemctl daemon-reload
[root@k8s-gpu4 ~]# systemctl enable nvidia_gpu_exporter
[root@k8s-gpu4 ~]# systemctl start nvidia_gpu_exporter.service
[root@k8s-gpu4 ~]# systemctl status nvidia_gpu_exporter.service
● nvidia_gpu_exporter.service - Nvidia GPU Exporter
Loaded: loaded (/etc/systemd/system/nvidia_gpu_exporter.service; enabled; vendor preset: disabled)
Active: active (running) since Fri 2022-05-13 17:36:03 CST; 5s ago
Main PID: 80178 (nvidia_gpu_expo)
Tasks: 6
Memory: 5.6M
CGroup: /system.slice/nvidia_gpu_exporter.service
└─80178 /usr/local/bin/nvidia_gpu_exporter
May 13 17:36:03 k8s-gpu4 systemd[1]: Started Nvidia GPU Exporter.
May 13 17:36:04 k8s-gpu4 nvidia_gpu_exporter[80178]: ts=2022-05-13T09:36:04.005Z caller=main.go:68 level=info msg="Listening on add...=:9835
May 13 17:36:04 k8s-gpu4 nvidia_gpu_exporter[80178]: ts=2022-05-13T09:36:04.006Z caller=tls_config.go:195 level=info msg="TLS is di...=false
Hint: Some lines were ellipsized, use -l to show in full.
# systemd unit that runs nvidia_gpu_exporter as a long-lived, sandboxed service.
[Unit]
Description=Nvidia GPU Exporter
# After= only orders startup; Wants= is what actually pulls network-online.target
# into the boot transaction, so the two are used together.
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
# Run under a dedicated unprivileged account; the user and group must already exist.
User=nvidia_gpu_exporter
Group=nvidia_gpu_exporter
ExecStart=/usr/local/bin/nvidia_gpu_exporter
SyslogIdentifier=nvidia_gpu_exporter
# Restart on every exit, waiting one second between attempts.
Restart=always
RestartSec=1
# Sandboxing: no privilege escalation, no access to home directories,
# read-only file system hierarchy, and protected kernel interfaces.
NoNewPrivileges=yes
ProtectHome=yes
ProtectSystem=strict
ProtectControlGroups=true
ProtectKernelModules=true
ProtectKernelTunables=yes
ProtectHostname=yes
ProtectKernelLogs=yes
# ProtectProc= is not a boolean: it accepts noaccess, invisible, ptraceable or
# default. "yes" fails to parse and the setting is ignored, so use "invisible"
# to hide other users' processes in /proc. Requires systemd 247 or newer;
# older versions ignore the directive as unknown.
ProtectProc=invisible
[Install]
WantedBy=multi-user.target
# systemctl daemon-reload
[root@k8s-gpu4 ~]# systemctl enable nvidia_gpu_exporter
[root@k8s-gpu4 ~]# systemctl start nvidia_gpu_exporter.service
[root@k8s-gpu4 ~]# systemctl status nvidia_gpu_exporter.service
● nvidia_gpu_exporter.service - Nvidia GPU Exporter
Loaded: loaded (/etc/systemd/system/nvidia_gpu_exporter.service; enabled; vendor preset: disabled)
Active: active (running) since Fri 2022-05-13 17:36:03 CST; 5s ago
Main PID: 80178 (nvidia_gpu_expo)
Tasks: 6
Memory: 5.6M
CGroup: /system.slice/nvidia_gpu_exporter.service
└─80178 /usr/local/bin/nvidia_gpu_exporter
May 13 17:36:03 k8s-gpu4 systemd[1]: Started Nvidia GPU Exporter.
May 13 17:36:04 k8s-gpu4 nvidia_gpu_exporter[80178]: ts=2022-05-13T09:36:04.005Z caller=main.go:68 level=info msg="Listening on add...=:9835
May 13 17:36:04 k8s-gpu4 nvidia_gpu_exporter[80178]: ts=2022-05-13T09:36:04.006Z caller=tls_config.go:195 level=info msg="TLS is di...=false
Hint: Some lines were ellipsized, use -l to show in full.