NVIDIA GPU loadable plugin
Source
xxxxxxxxxx
1
1
package nvml
2
2
3
3
// Runner defines the interface for an NVML runner.
4
4
//
5
5
//nolint:interfacebloat
6
6
type Runner interface {
7
-
InitNVML() error
7
+
// Init initializes the NVML library using the older NVML interface.
8
+
Init() error
9
+
// InitV2 initializes the NVML library using the NVML v2 interface.
10
+
InitV2() error
8
11
9
-
InitNVMLv2() error
10
-
// Initialize NVML and any necessary resources
12
+
// GetDeviceCount retrieves the number of NVIDIA devices using the standard NVML interface.
11
13
GetDeviceCount() (uint, error)
12
14
13
-
// Initialize NVML and any necessary resources
15
+
// GetDeviceCountV2 retrieves the number of NVIDIA devices using the NVML v2 interface.
14
16
GetDeviceCountV2() (uint, error)
15
17
16
-
// Get a device by index
18
+
// GetDeviceByIndexV2 retrieves a handle to an NVIDIA device by its index using the NVML v2 interface.
17
19
GetDeviceByIndexV2(index uint) (*NVMLDevice, error)
18
20
21
+
// GetDeviceByUUID retrieves a handle to an NVIDIA device by its UUID.
19
22
GetDeviceByUUID(uuid string) (*NVMLDevice, error)
20
23
21
-
// Get NVML version
24
+
// GetNVMLVersion retrieves the version of the NVML library currently in use.
22
25
GetNVMLVersion() (string, error)
23
26
24
-
// Get Driver version
27
+
// GetDriverVersion retrieves the version of the NVIDIA driver currently in use.
25
28
GetDriverVersion() (string, error)
26
29
27
30
// ShutdownNVML shuts down NVML and cleans up resources.
28
31
ShutdownNVML() error
29
32
33
+
// Close releases the resources associated with the loaded library in the Runner.
30
34
Close() error
31
35
}
32
36
33
37
// Device defines the methods for interacting with a GPU device.
34
38
//
35
39
//nolint:interfacebloat
36
40
type Device interface {
37
-
// Get the temperature of the device.
41
+
// GetTemperature retrieves the temperature of the NVIDIA device using the default sensor.
38
42
GetTemperature() (int, error)
39
43
40
-
// Get the memory information for the device.
44
+
// GetMemoryInfo retrieves memory information for the NVIDIA device.
41
45
GetMemoryInfo() (*MemoryInfo, error)
42
46
47
+
// GetBAR1MemoryInfo retrieves BAR1 memory information for the NVIDIA device.
43
48
GetBAR1MemoryInfo() (*MemoryInfo, error)
44
49
45
-
// Get the memory information for the device.
50
+
// GetMemoryInfoV2 retrieves detailed memory information for the NVIDIA device using the NVML v2 interface.
46
51
GetMemoryInfoV2() (*MemoryInfoV2, error)
47
52
48
-
// Get the fan speed of the device.
53
+
// GetFanSpeed retrieves the current fan speed of the NVIDIA device as a percentage of its maximum speed.
49
54
GetFanSpeed() (uint, error)
50
55
51
-
// Get PCIe throughput (TX or RX).
52
-
GetPcieThroughput(metricType PcieMetricType) (uint, error)
53
-
54
-
// GetUtilizationRates returns the GPU and memory utilization in that order (GPU, Memory).
56
+
// GetPCIeThroughput retrieves the PCIe throughput for the NVIDIA device, based on the specified metric type.
57
+
GetPCIeThroughput(metricType PcieMetricType) (uint, error)
58
+
59
+
// GetUtilizationRates retrieves the GPU and memory utilization rates for the device.
60
+
// The GPU utilization represents the percentage of time over the past sampling period
61
+
// that the GPU was actively processing, while the memory utilization indicates the
62
+
// percentage of time the memory was being accessed.
63
+
//
64
+
// Returns:
65
+
// - gpuUtilization (uint): The GPU utilization rate as a percentage (0-100).
66
+
// - memoryUtilization (uint): The memory utilization rate as a percentage (0-100).
67
+
// - error: An error object if there is a failure in verifying the NVML symbol existence
68
+
// or in retrieving the utilization rates from NVML.
55
69
GetUtilizationRates() (uint, uint, error)
56
70
57
-
// Get the UUID of the device.
71
+
// GetUUID retrieves the UUID of the NVIDIA device.
58
72
GetUUID() (string, error)
59
73
60
-
// Get the name of the device.
74
+
// GetName retrieves the name of the NVIDIA device.
61
75
GetName() (string, error)
62
76
63
-
// Get the serial number of the device.
77
+
// GetSerial retrieves the serial number of the NVIDIA device.
64
78
GetSerial() (string, error)
65
79
80
+
// GetPowerUsage retrieves the power usage of the NVIDIA device in milliwatts.
66
81
GetPowerUsage() (uint, error)
67
82
83
+
// GetPerformanceState retrieves the performance state (P-state) of the NVIDIA device.
68
84
GetPerformanceState() (uint, error)
69
85
86
+
// GetClockInfo retrieves the clock rate for the specified clock type of the NVIDIA device.
70
87
GetClockInfo(clockType ClockType) (uint, error)
71
88
89
+
// GetPowerManagementLimit retrieves the power management limit of the NVIDIA device in milliwatts.
72
90
GetPowerManagementLimit() (uint, error)
73
91
74
92
GetTotalEnergyConsumption() (uint64, error)
75
93
94
+
// GetEncoderStats retrieves statistics related to the encoder activity on the device.
95
+
// It returns the following statistics:
96
+
// - sessionCount: the number of active encoder sessions.
97
+
// - averageFps: the average frames per second across all active encoder sessions.
98
+
// - averageLatency: the average encode latency across all active encoder sessions (NVML reports this value in microseconds).
76
99
GetEncoderStats() (uint, uint, uint, error)
77
100
101
+
// GetEncoderUtilization retrieves the encoder utilization statistics for the device.
102
+
// It returns the following values:
103
+
// - utilization: the percentage of time over the past sampling period during which the encoder was active.
104
+
// - samplingPeriodUs: the sampling period duration in microseconds,
105
+
// indicating how long the utilization metric was measured.
78
106
GetEncoderUtilization() (uint, uint, error)
79
107
108
+
// GetDecoderUtilization retrieves the decoder utilization statistics for the device.
109
+
// It returns the following values:
110
+
// - utilization: the percentage of time over the past sampling period during which the decoder was active.
111
+
// - samplingPeriodUs: the sampling period duration in microseconds, indicating how long the utilization
112
+
// metric was measured.
80
113
GetDecoderUtilization() (uint, uint, error)
81
114
115
+
// GetMemoryErrorCounter retrieves the ECC memory error count for the specified error type,
116
+
// memory location, and counter type.
82
117
GetMemoryErrorCounter(
83
118
errorType MemoryErrorType,
84
119
memoryLocation MemoryLocation,
85
120
counterType EccCounterType,
86
121
) (uint64, error)
87
122
123
+
// GetEccMode retrieves the current and pending ECC (Error Correction Code) modes for the device.
124
+
// ECC mode indicates whether error correction is enabled or disabled on the device.
125
+
//
126
+
// Returns:
127
+
// - `currentEnabled` (bool): `true` if ECC is currently enabled, `false` if disabled.
128
+
// - `pendingEnabled` (bool): `true` if ECC will be enabled on the next reboot, `false` if it will be disabled.
129
+
// - `error` (error): An error if the function fails to retrieve the ECC mode, otherwise `nil`.
88
130
GetEccMode() (bool, bool, error)
89
131
}
90
132
91
133
// MemoryInfoV2 represents the memory information of the device (in bytes).
92
134
type MemoryInfoV2 struct {
93
135
Total uint64 `json:"total_memory_bytes"` // Total memory available
94
136
Reserved uint64 `json:"reserved_memory_bytes"` // Memory reserved by the system
95
137
Free uint64 `json:"free_memory_bytes"` // Free memory available
96
138
Used uint64 `json:"used_memory_bytes"` // Memory currently being used + reserved
97
139
}