Commits
86
86
GPU uint `json:"device"`
87
87
Memory uint `json:"memory"`
88
88
}
89
89
90
90
// ECCMode holds the current and pending ECC status as two boolean flags.
// The struct tags map the fields to the JSON keys "current" and "pending".
91
91
type ECCMode struct {
92
92
// Current is the ECC state in effect now (presumably true means enabled;
// confirm against the code that populates it).
Current bool `json:"current"`
93
93
// Pending is the ECC state that is pending (presumably applied later;
// confirm against the code that populates it).
Pending bool `json:"pending"`
94
94
}
95
95
96
-
// New returns a Handler that uses nvmlRunner for its NVML calls.
// The handler starts with an empty device cache, a fresh mutex stored
// alongside it, and concurrentDeviceDiscoverys set to -1.
97
-
func New(nvmlRunner nvml.Runner) *Handler {
98
-
return &Handler{
99
-
// A negative value indicates no limit.
100
-
concurrentDeviceDiscoverys: -1,
101
-
nvmlRunner: nvmlRunner,
102
-
deviceCacheMux: &sync.Mutex{},
103
-
deviceCache: make(map[string]nvml.Device),
104
-
}
105
-
}
106
-
107
96
// GetNVMLVersion returns the NVML version reported by the handler's NVML runner.
// The context, metric parameters and extra parameters are ignored.
// On failure it returns an empty string together with the wrapped error.
108
97
func (h *Handler) GetNVMLVersion(_ context.Context, _ map[string]string, _ ...string) (any, error) {
109
98
version, err := h.nvmlRunner.GetNVMLVersion()
110
99
if err != nil {
111
100
return "", errs.Wrap(err, "failed to get NVML version")
112
101
}
113
102
114
103
return version, nil
115
104
}
116
105
459
448
}
460
449
461
450
ecc := ECCErrors{
462
451
Corrected: corrected,
463
452
Uncorrected: uncorrected,
464
453
}
465
454
466
455
return ecc, nil
467
456
}
468
457
469
-
// WithJSONResponse wraps handler so that its result is marshaled to JSON
470
-
// and returned as a string. The returned HandlerFunc forwards ctx,
// metricParams and extraParams to handler unchanged. If handler fails, or
// if its result cannot be marshaled, the wrapped error is returned with a
// nil result.
471
-
func WithJSONResponse(handler HandlerFunc) HandlerFunc {
472
-
return func(
473
-
ctx context.Context, metricParams map[string]string, extraParams ...string,
474
-
) (any, error) {
475
-
res, err := handler(ctx, metricParams, extraParams...)
476
-
if err != nil {
477
-
return nil, errs.Wrap(err, "failed to receive the result")
478
-
}
479
-
480
-
jsonRes, err := json.Marshal(res)
481
-
if err != nil {
482
-
return nil, errs.Wrap(err, "failed to marshal result to JSON")
483
-
}
484
-
485
-
// json.Marshal yields a byte slice; hand it back as a string.
return string(jsonRes), nil
486
-
}
487
-
}
488
-
489
458
// GetPCIeThroughput retrieves the PCIe receive and transmit throughput for the NVIDIA device in KB/s.
490
459
func (h *Handler) GetPCIeThroughput(_ context.Context, metricParams map[string]string, _ ...string) (any, error) {
491
460
uuid, ok := metricParams[params.DeviceUUIDParamName]
492
461
if !ok {
493
462
return nil, errs.New("failed to find param for UUID")
494
463
}
495
464
496
465
device, err := h.getDeviceByUUID(uuid)
497
466
if err != nil {
498
467
return nil, errs.Wrap(err, "failed to get device by UUID")
706
675
}
707
676
708
677
mode := ECCMode{
709
678
Current: current,
710
679
Pending: pending,
711
680
}
712
681
713
682
return mode, nil
714
683
}
715
684
685
+
// WithJSONResponse wraps handler so that its result is marshaled to JSON
686
+
// and returned as a string. The returned HandlerFunc forwards ctx,
// metricParams and extraParams to handler unchanged. If handler fails, or
// if its result cannot be marshaled, the wrapped error is returned with a
// nil result.
687
+
func WithJSONResponse(handler HandlerFunc) HandlerFunc {
688
+
return func(
689
+
ctx context.Context, metricParams map[string]string, extraParams ...string,
690
+
) (any, error) {
691
+
res, err := handler(ctx, metricParams, extraParams...)
692
+
if err != nil {
693
+
return nil, errs.Wrap(err, "failed to receive the result")
694
+
}
695
+
696
+
jsonRes, err := json.Marshal(res)
697
+
if err != nil {
698
+
return nil, errs.Wrap(err, "failed to marshal result to JSON")
699
+
}
700
+
701
+
// json.Marshal yields a byte slice; hand it back as a string.
return string(jsonRes), nil
702
+
}
703
+
}
704
+
705
+
// New returns a Handler that uses nvmlRunner for its NVML calls.
// The handler starts with an empty device cache, a fresh mutex stored
// alongside it, and concurrentDeviceDiscoverys set to -1.
706
+
func New(nvmlRunner nvml.Runner) *Handler {
707
+
return &Handler{
708
+
// A negative value indicates no limit.
709
+
concurrentDeviceDiscoverys: -1,
710
+
nvmlRunner: nvmlRunner,
711
+
deviceCacheMux: &sync.Mutex{},
712
+
deviceCache: make(map[string]nvml.Device),
713
+
}
714
+
}
715
+
716
716
// getDeviceByUUID accesses devices from device cache of runner,
717
717
// if the device with the requested UUID is not cached, it requests it from NVML.
718
718
// On success it caches the device for future requests; otherwise it returns an error.
719
719
//
720
720
//nolint:ireturn
721
721
func (h *Handler) getDeviceByUUID(uuid string) (nvml.Device, error) {
722
722
h.deviceCacheMux.Lock()
723
723
defer h.deviceCacheMux.Unlock()
724
724
725
725
device, ok := h.deviceCache[uuid]