/* ** Zabbix ** Copyright (C) 2001-2024 Zabbix SIA ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** ** http://www.apache.org/licenses/LICENSE-2.0 ** ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. **/ package nvml // Runner defines the interface for an NVML runner. // //nolint:interfacebloat type Runner interface { // InitNVML initializes the NVML library using the older NVML interface. Init() error // InitNVMLv2 initializes the NVML library using the NVML v2 interface. InitV2() error // GetDeviceCount retrieves the number of NVIDIA devices using the standard NVML interface. GetDeviceCount() (uint, error) // GetDeviceCountV2 retrieves the number of NVIDIA devices using the NVML v2 interface. GetDeviceCountV2() (uint, error) // GetDeviceByIndexV2 retrieves a handle to an NVIDIA device by its index using the NVML v2 interface. GetDeviceByIndexV2(index uint) (Device, error) // GetDeviceByUUID retrieves a handle to an NVIDIA device by its UUID. GetDeviceByUUID(uuid string) (Device, error) // GetNVMLVersion retrieves the version of the NVML library currently in use. GetNVMLVersion() (string, error) // GetDriverVersion retrieves the version of the NVIDIA driver currently in use. GetDriverVersion() (string, error) // Shutdown NVML and clean up resources ShutdownNVML() error // Close releases the resources associated with the loaded library in the Runner. Close() error } // Device defines the methods for interacting with a GPU device. // //nolint:interfacebloat type Device interface { // GetTemperature retrieves the temperature of the NVIDIA device using the default sensor. GetTemperature() (int, error) // GetMemoryInfo retrieves memory information for the NVIDIA device. GetMemoryInfo() (*MemoryInfo, error) // GetBAR1MemoryInfo retrieves BAR1 memory information for the NVIDIA device. GetBAR1MemoryInfo() (*MemoryInfo, error) // GetMemoryInfoV2 retrieves detailed memory information for the NVIDIA device using the NVML v2 interface. GetMemoryInfoV2() (*MemoryInfoV2, error) // GetFanSpeed retrieves the current fan speed of the NVIDIA device as a percentage of its maximum speed. GetFanSpeed() (uint, error) // GetPCIeThroughput retrieves the PCIe throughput for the NVIDIA device, based on the specified metric type. GetPCIeThroughput(metricType PcieMetricType) (uint, error) // GetUtilizationRates retrieves the GPU and memory utilization rates for the device. // The GPU utilization represents the percentage of time over the past sampling period // that the GPU was actively processing, while the memory utilization indicates the // percentage of time the memory was being accessed. // // Returns: // - gpuUtilization (uint): The GPU utilization rate as a percentage (0-100). // - memoryUtilization (uint): The memory utilization rate as a percentage (0-100). // - error: An error object if there is a failure in verifying the NVML symbol existence // or in retrieving the utilization rates from NVML. GetUtilizationRates() (uint, uint, error) // GetUUID retrieves the UUID of the NVIDIA device. GetUUID() (string, error) // GetName retrieves the name of the NVIDIA device. GetName() (string, error) // GetSerial retrieves the serial number of the NVIDIA device. GetSerial() (string, error) // GetPowerUsage retrieves the power usage of the NVIDIA device in milliwatts. GetPowerUsage() (uint, error) // GetPerformanceState retrieves the performance state (P-state) of the NVIDIA device. GetPerformanceState() (uint, error) // GetClockInfo retrieves the clock rate for the specified clock type of the NVIDIA device. GetClockInfo(clockType ClockType) (uint, error) // GetPowerManagementLimit retrieves the power management limit of the NVIDIA device in milliwatts. GetPowerManagementLimit() (uint, error) // GetTotalEnergyConsumption retrieves the total energy consumption of the NVIDIA device in millijoules. GetTotalEnergyConsumption() (uint64, error) // GetEncoderStats retrieves statistics related to the encoder activity on the device. // It returns the following statistics: // - sessionCount: the number of active encoder sessions. // - averageFps: the average frames per second across all active encoder sessions. // - averageLatency: the average latency (in milliseconds) across all active encoder sessions. GetEncoderStats() (uint, uint, uint, error) // GetEncoderUtilization retrieves the encoder utilization statistics for the device. // It returns the following values: // - utilization: the percentage of time over the past sampling period during which the encoder was active. // - samplingPeriodUs: the sampling period duration in microseconds, // indicating how long the utilization metric was measured. GetEncoderUtilization() (uint, uint, error) // GetDecoderUtilization retrieves the decoder utilization statistics for the device. // It returns the following values: // - utilization: the percentage of time over the past sampling period during which the decoder was active. // - samplingPeriodUs: the sampling period duration in microseconds, indicating how long the utilization // metric was measured. GetDecoderUtilization() (uint, uint, error) // GetMemoryErrorCounter retrieves the ECC memory error count for the specified error type, // memory location, and counter type. GetMemoryErrorCounter( errorType MemoryErrorType, memoryLocation MemoryLocation, counterType EccCounterType, ) (uint64, error) // GetEccMode retrieves the current and pending ECC (Error Correction Code) modes for the device. // ECC mode indicates whether error correction is enabled or disabled on the device. // // Returns: // - `currentEnabled` (bool): `true` if ECC is currently enabled, `false` if disabled. // - `pendingEnabled` (bool): `true` if ECC will be enabled on the next reboot, `false` if it will be disabled. // - `error` (error): An error if the function fails to retrieve the ECC mode, otherwise `nil`. GetEccMode() (bool, bool, error) } // MemoryInfoV2 represents the memory information of the device (in bytes). type MemoryInfoV2 struct { Total uint64 `json:"total_memory_bytes"` // Total memory available Reserved uint64 `json:"reserved_memory_bytes"` // Memory reserved by the system Free uint64 `json:"free_memory_bytes"` // Free memory available Used uint64 `json:"used_memory_bytes"` // Memory currently being used + reserved } // MemoryInfo represents the memory information of the device (in bytes). type MemoryInfo struct { Total uint64 `json:"total_memory_bytes"` // Total memory available Free uint64 `json:"free_memory_bytes"` // Free memory available Used uint64 `json:"used_memory_bytes"` // Memory currently being used }