// NVIDIA GPU loadable plugin: source.

// - `pendingEnabled` (bool): `true` if ECC will be enabled on the next reboot, `false` if it will be disabled.
/*
** Copyright 2001-2024 Zabbix SIA
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
**/
package nvml
// Runner defines the interface for an NVML runner: library initialization
// and shutdown, device enumeration and lookup, and version queries.
//
//nolint:interfacebloat
type Runner interface {
// Init initializes the NVML library using the older NVML interface.
Init() error
// InitV2 initializes the NVML library using the NVML v2 interface.
InitV2() error
// GetDeviceCount retrieves the number of NVIDIA devices using the standard NVML interface.
GetDeviceCount() (uint, error)
// GetDeviceCountV2 retrieves the number of NVIDIA devices using the NVML v2 interface.
GetDeviceCountV2() (uint, error)
// GetDeviceByIndexV2 retrieves a handle to an NVIDIA device by its index using the NVML v2 interface.
GetDeviceByIndexV2(index uint) (Device, error)
// GetDeviceByUUID retrieves a handle to an NVIDIA device by its UUID.
GetDeviceByUUID(uuid string) (Device, error)
// GetNVMLVersion retrieves the version of the NVML library currently in use.
GetNVMLVersion() (string, error)
// GetDriverVersion retrieves the version of the NVIDIA driver currently in use.
GetDriverVersion() (string, error)
// ShutdownNVML shuts down NVML and cleans up resources.
ShutdownNVML() error
// Close releases the resources associated with the loaded library in the Runner.
Close() error
}
// Device defines the methods for interacting with a GPU device.
//
//nolint:interfacebloat
type Device interface {
// GetTemperature retrieves the temperature of the NVIDIA device using the default sensor.
GetTemperature() (int, error)
// GetMemoryInfo retrieves memory information for the NVIDIA device.
GetMemoryInfo() (*MemoryInfo, error)
// GetBAR1MemoryInfo retrieves BAR1 memory information for the NVIDIA device.
GetBAR1MemoryInfo() (*MemoryInfo, error)
// GetMemoryInfoV2 retrieves detailed memory information for the NVIDIA device using the NVML v2 interface.
GetMemoryInfoV2() (*MemoryInfoV2, error)
// GetFanSpeed retrieves the current fan speed of the NVIDIA device as a percentage of its maximum speed.
GetFanSpeed() (uint, error)
// GetPCIeThroughput retrieves the PCIe throughput for the NVIDIA device, based on the specified metric type.
GetPCIeThroughput(metricType PcieMetricType) (uint, error)
// GetUtilizationRates retrieves the GPU and memory utilization rates for the device.
// The GPU utilization represents the percentage of time over the past sampling period
// that the GPU was actively processing, while the memory utilization indicates the
// percentage of time the memory was being accessed.
//
// Returns:
// - gpuUtilization (uint): The GPU utilization rate as a percentage (0-100).
// - memoryUtilization (uint): The memory utilization rate as a percentage (0-100).
// - error: An error object if there is a failure in verifying the NVML symbol existence
// or in retrieving the utilization rates from NVML.
GetUtilizationRates() (uint, uint, error)
// GetUUID retrieves the UUID of the NVIDIA device.
GetUUID() (string, error)
// GetName retrieves the name of the NVIDIA device.
GetName() (string, error)