v1

package
v0.7.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 2, 2025 License: Apache-2.0 Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ComponentEvents added in v0.5.0

// ComponentEvents pairs a component name with the events reported for it,
// together with a StartTime/EndTime pair.
// NOTE(review): StartTime and EndTime presumably bound the window the events
// were collected for; whether the bounds are inclusive is not visible here —
// confirm against the producer.
type ComponentEvents struct {
	Component string    `json:"component"`
	StartTime time.Time `json:"startTime"`
	EndTime   time.Time `json:"endTime"`
	Events    Events    `json:"events"`
}

type ComponentHealthStates added in v0.5.0

// ComponentHealthStates pairs a component name with the health states
// reported for it. Both fields are always present in the JSON encoding.
type ComponentHealthStates struct {
	Component string       `json:"component"`
	States    HealthStates `json:"states"`
}

type ComponentInfo added in v0.5.0

// ComponentInfo pairs a component name with its Info (health states, events
// and metrics) and a StartTime/EndTime pair.
// NOTE(review): StartTime and EndTime presumably bound the window the Info
// covers — confirm against the producer.
type ComponentInfo struct {
	Component string    `json:"component"`
	StartTime time.Time `json:"startTime"`
	EndTime   time.Time `json:"endTime"`
	Info      Info      `json:"info"`
}

type ComponentMetrics added in v0.5.0

// ComponentMetrics pairs a component name with the metrics reported for it.
// Both fields are always present in the JSON encoding.
type ComponentMetrics struct {
	Component string  `json:"component"`
	Metrics   Metrics `json:"metrics"`
}

type ComponentType added in v0.5.0

// ComponentType is the string-typed kind of a component.
// The only named value is ComponentTypeCustomPlugin; the HealthState field
// comment documents the empty string as a plain (non-plugin) component.
type ComponentType string

ComponentType defines the type of a component.

// Named ComponentType values.
const (
	// ComponentTypeCustomPlugin marks a component that is a custom plugin of GPUd.
	// Its wire value is "custom-plugin".
	ComponentTypeCustomPlugin ComponentType = "custom-plugin"
)

type Event added in v0.5.0

// Event is a single timestamped occurrence reported by a component.
// Every field except Time carries omitempty and is dropped from the JSON
// encoding when empty.
type Event struct {
	// Component is the name of the component that generated the event.
	Component string `json:"component,omitempty"`

	// Time is when the event happened.
	Time metav1.Time `json:"time"`

	// Name is the name of the event.
	Name string `json:"name,omitempty"`

	// Type classifies the event; see the EventType constants.
	Type EventType `json:"type,omitempty"`

	// Message is the detailed message of the event.
	Message string `json:"message,omitempty"`
}

Event represents an event that happened in a component at a specific time. A single event itself does not dictate whether the component is healthy or not. The healthiness of the component is evaluated at the component health state level.

type EventType added in v0.5.0

// EventType classifies an Event. The named values are declared below.
type EventType string
// Named EventType values. After EventTypeUnknown, the descriptions below run
// from "no action needed" up to "immediate action required".
const (
	// EventTypeUnknown is the value for an event whose type is not known.
	// NOTE(review): presumably what EventTypeFromString returns for an
	// unrecognized string — confirm against its implementation.
	EventTypeUnknown EventType = "Unknown"

	// EventTypeInfo represents a general event that requires no action.
	// Info - Informative, no further action needed.
	EventTypeInfo EventType = "Info"

	// EventTypeWarning represents an event that may impact workloads.
	// Warning - Some issue happened but no further action is needed;
	// automatic recovery is expected.
	EventTypeWarning EventType = "Warning"

	// EventTypeCritical represents an event that is definitely impacting workloads
	// and requires immediate attention.
	// Critical - Some critical issue happened and action is required;
	// it is not a hardware issue.
	EventTypeCritical EventType = "Critical"

	// EventTypeFatal represents a fatal event that impacts wide systems
	// and requires immediate attention and action.
	// Fatal - A fatal/hardware issue occurred and immediate action is required;
	// it may require a reboot or hardware repair.
	EventTypeFatal EventType = "Fatal"
)

func EventTypeFromString added in v0.5.0

func EventTypeFromString(s string) EventType

type Events added in v0.5.0

// Events is a slice of Event values.
type Events []Event

type GPUdComponentEvents added in v0.5.0

// GPUdComponentEvents is a slice of ComponentEvents, each entry carrying the
// events of the component named in its Component field.
type GPUdComponentEvents []ComponentEvents

type GPUdComponentHealthStates added in v0.5.0

// GPUdComponentHealthStates is a slice of ComponentHealthStates, each entry
// carrying the health states of the component named in its Component field.
type GPUdComponentHealthStates []ComponentHealthStates

type GPUdComponentInfos added in v0.5.0

// GPUdComponentInfos is a slice of ComponentInfo, each entry carrying the
// Info of the component named in its Component field.
type GPUdComponentInfos []ComponentInfo

type GPUdComponentMetrics added in v0.5.0

// GPUdComponentMetrics is a slice of ComponentMetrics, each entry carrying the
// metrics of the component named in its Component field.
type GPUdComponentMetrics []ComponentMetrics

type GossipRequest added in v0.5.0

// GossipRequest is the payload of a gossip request.
// MachineID is always serialized; MachineInfo is a pointer tagged omitempty
// and is left out of the JSON encoding when nil.
type GossipRequest struct {
	MachineID   string       `json:"machineID"`
	MachineInfo *MachineInfo `json:"machineInfo,omitempty"`
}

GossipRequest is the payload of a gossip request.

type GossipResponse added in v0.5.0

// GossipResponse is the payload returned for a gossip request.
// Both fields are tagged omitempty, so an empty Status or Error is left out
// of the JSON encoding (unlike JoinResponse, which always emits both keys).
type GossipResponse struct {
	Status string `json:"status,omitempty"`
	Error  string `json:"error,omitempty"`
}

GossipResponse is the response for the gossip request.

type HealthState added in v0.5.0

// HealthState is one health result reported for a component, with the
// supporting detail (reason, error, suggested actions, raw output).
// Every field except Time is tagged omitempty.
type HealthState struct {
	// Time is the timestamp attached to this health state.
	// NOTE(review): the previous comment said "when the event happened",
	// which reads as copied from Event — confirm whether this is the time
	// the check ran.
	Time metav1.Time `json:"time"`

	// Component represents the component name.
	Component string `json:"component,omitempty"`
	// ComponentType represents the type of the component.
	// It is either "" (just 'component') or "custom-plugin"
	// (ComponentTypeCustomPlugin).
	ComponentType ComponentType `json:"component_type,omitempty"`

	// Name is the name of the state,
	// can be different from the component name.
	Name string `json:"name,omitempty"`

	// RunMode is the run mode of the state.
	// "manual" (RunModeTypeManual) requires a manual trigger to run the check;
	// an empty value means the check runs periodically.
	// NOTE(review): RunModeTypeAuto ("auto") also exists — confirm whether
	// producers emit "auto" or leave this empty for periodic checks.
	RunMode RunModeType `json:"run_mode,omitempty"`

	// Health represents the health level of the state: one of
	// HealthStateTypeHealthy, HealthStateTypeUnhealthy,
	// HealthStateTypeDegraded or HealthStateTypeInitializing.
	// Degraded is similar to Unhealthy in that it can also trigger alerts
	// for users or operators, but it means that the detected issue does
	// not affect users' workloads.
	Health HealthStateType `json:"health,omitempty"`

	// Reason represents what happened or was detected by GPUd if it isn't healthy.
	Reason string `json:"reason,omitempty"`

	// Error represents the detailed error information, which will be shown
	// as More Information to help analyze why it isn't healthy.
	Error string `json:"error,omitempty"`

	// SuggestedActions represents the suggested actions to mitigate the issue.
	// It is omitted from the JSON encoding when nil.
	SuggestedActions *SuggestedActions `json:"suggested_actions,omitempty"`

	// ExtraInfo represents the extra information of the state.
	ExtraInfo map[string]string `json:"extra_info,omitempty"`

	// RawOutput represents the raw output of the health checker.
	// e.g., If a custom plugin runs a Python script, the raw output
	// is the stdout/stderr of the script.
	// The maximum length of the raw output is 4096 bytes.
	RawOutput string `json:"raw_output,omitempty"`
}

HealthState represents the health state of a component. The healthiness of the component is already evaluated at the component level, so the health state here is to provide more details about the healthiness, and other data for the control plane to decide how to alert and remediate the issue.

type HealthStateType added in v0.5.0

// HealthStateType is the string-typed health level carried in
// HealthState.Health; see the HealthStateType* constants for the named values.
type HealthStateType string

HealthStateType defines the health state of a component.

// Named HealthStateType values.
// Per the HealthState.Health field comment, Degraded can trigger alerts like
// Unhealthy but marks an issue that does not affect users' workloads.
// NOTE(review): the meaning of Initializing is not documented here;
// presumably the component has not produced its first result yet — confirm.
const (
	HealthStateTypeHealthy      HealthStateType = "Healthy"
	HealthStateTypeUnhealthy    HealthStateType = "Unhealthy"
	HealthStateTypeDegraded     HealthStateType = "Degraded"
	HealthStateTypeInitializing HealthStateType = "Initializing"
)

type HealthStates added in v0.5.0

// HealthStates is a slice of HealthState values.
type HealthStates []HealthState

type Info added in v0.5.0

// Info bundles health states, events and metrics.
// None of the fields is tagged omitempty, so all three keys are always
// present in the JSON encoding (a nil slice encodes as null under
// encoding/json).
type Info struct {
	States  HealthStates `json:"states"`
	Events  Events       `json:"events"`
	Metrics Metrics      `json:"metrics"`
}

type JoinRequest added in v0.5.0

// JoinRequest is the payload of a join request.
// ID is always serialized; every other field is tagged omitempty.
// JSON keys here are snake_case, unlike the camelCase keys of LoginRequest.
// NOTE(review): ExtraInfo is a plain string here (HealthState.ExtraInfo is a
// map) and its format is not visible; the unit of TotalCPU is likewise not
// shown — confirm both with the consumer.
type JoinRequest struct {
	ID                 string `json:"id"`
	ClusterName        string `json:"cluster_name,omitempty"`
	PublicIP           string `json:"public_ip,omitempty"`
	PrivateIP          string `json:"private_ip,omitempty"`
	Provider           string `json:"provider,omitempty"`
	ProviderInstanceID string `json:"provider_instance_id,omitempty"`
	ProviderGPUShape   string `json:"provider_gpu_shape,omitempty"`
	TotalCPU           int64  `json:"total_cpu,omitempty"`
	NodeGroup          string `json:"node_group,omitempty"`
	ExtraInfo          string `json:"extra_info,omitempty"`
	Region             string `json:"region,omitempty"`
}

JoinRequest is the payload of a join request.

type JoinResponse added in v0.5.0

// JoinResponse is the payload returned for a join request.
// Neither field is tagged omitempty, so both keys are always present in the
// JSON encoding, as empty strings when unset.
type JoinResponse struct {
	Error  string `json:"error"`
	Status string `json:"status"`
}

JoinResponse is the response for the join request.

type LoginRequest added in v0.5.0

// LoginRequest is the payload of a login request.
// Token, MachineID, Provider and ProviderInstanceID are always serialized;
// NodeGroup, Network, Location, MachineInfo and Resources are tagged
// omitempty and are left out when empty or nil.
// NOTE(review): the key/value schema of Resources is not visible here —
// confirm with the consumer.
type LoginRequest struct {
	Token              string            `json:"token"`
	MachineID          string            `json:"machineID"`
	NodeGroup          string            `json:"nodeGroup,omitempty"`
	Network            *MachineNetwork   `json:"network,omitempty"`
	Location           *MachineLocation  `json:"location,omitempty"`
	Provider           string            `json:"provider"`
	ProviderInstanceID string            `json:"providerInstanceID"`
	MachineInfo        *MachineInfo      `json:"machineInfo,omitempty"`
	Resources          map[string]string `json:"resources,omitempty"`
}

LoginRequest is the payload of a login request.

type LoginResponse added in v0.5.0

// LoginResponse is the payload returned for a login request.
type LoginResponse struct {
	// Status is the status of the login request.
	Status string `json:"status,omitempty"`

	// Error is the error message for the login request.
	// The error is only returned if the machine ID retrieval fails.
	Error string `json:"error,omitempty"`

	// MachineID is the ID of the machine generated by the login control plane.
	// Only set upon successful login. The tag has no omitempty, so the key
	// is still emitted (as an empty string) when login fails.
	MachineID string `json:"machineID"`

	// Token is the token used to report data from the machine.
	Token string `json:"token,omitempty"`

	// ValidationResults holds the results of the validation that the
	// control plane performs on the login request.
	ValidationResults []ValidationResult `json:"validationResults,omitempty"`
}

LoginResponse is the response for the login request.

type MachineCPUInfo added in v0.5.0

// MachineCPUInfo describes the CPU of a machine: type, manufacturer,
// architecture and logical core count. All fields are tagged omitempty,
// so a zero LogicalCores is left out of the JSON encoding.
type MachineCPUInfo struct {
	Type         string `json:"type,omitempty"`
	Manufacturer string `json:"manufacturer,omitempty"`
	Architecture string `json:"architecture,omitempty"`
	LogicalCores int64  `json:"logicalCores,omitempty"`
}

type MachineDiskDevice added in v0.5.0

// MachineDiskDevice describes one disk device, including the names of its
// parent and child devices. All fields are tagged omitempty; in particular a
// false Rota is omitted, so a reader cannot tell "false" from "not reported".
// NOTE(review): the field names mirror lsblk columns (ROTA, WWN, REV,
// PARTUUID, ...), so this is presumably populated from lsblk output, with
// Size and Used in bytes — confirm against the collector.
type MachineDiskDevice struct {
	Name       string   `json:"name,omitempty"`
	Type       string   `json:"type,omitempty"`
	Size       int64    `json:"size,omitempty"`
	Used       int64    `json:"used,omitempty"`
	Rota       bool     `json:"rota,omitempty"`
	Serial     string   `json:"serial,omitempty"`
	WWN        string   `json:"wwn,omitempty"`
	Vendor     string   `json:"vendor,omitempty"`
	Model      string   `json:"model,omitempty"`
	Rev        string   `json:"rev,omitempty"`
	MountPoint string   `json:"mountPoint,omitempty"`
	FSType     string   `json:"fsType,omitempty"`
	PartUUID   string   `json:"partUUID,omitempty"`
	Parents    []string `json:"parents,omitempty"`
	Children   []string `json:"children,omitempty"`
}

type MachineDiskInfo added in v0.5.0

// MachineDiskInfo is the disk info of a machine: its disk devices plus the
// name of the device backing the container root.
type MachineDiskInfo struct {
	// BlockDevices lists the disk devices of the machine.
	BlockDevices []MachineDiskDevice `json:"blockDevices,omitempty"`
	// ContainerRootDisk is the disk device name that mounts the container root (such as "/var/lib/kubelet" mount point).
	ContainerRootDisk string `json:"containerRootDisk,omitempty"`
}

func (*MachineDiskInfo) RenderTable added in v0.5.0

func (di *MachineDiskInfo) RenderTable(wr io.Writer)

type MachineGPUInfo added in v0.5.0

// MachineGPUInfo is the GPU info of a machine: product-level attributes plus
// one MachineGPUInstance per entry in GPUs. All fields are tagged omitempty.
type MachineGPUInfo struct {
	// Product may be "NVIDIA-Graphics-Device" for NVIDIA GB200.
	Product string `json:"product,omitempty"`

	// Manufacturer is "NVIDIA" for NVIDIA GPUs (same as Brand).
	// NOTE(review): this struct has no Brand field; "Brand" presumably
	// refers to a value in the data source — confirm.
	Manufacturer string `json:"manufacturer,omitempty"`

	// Architecture is "blackwell" for NVIDIA GB200.
	Architecture string `json:"architecture,omitempty"`

	// Memory is the GPU memory, carried as a string.
	// NOTE(review): the unit and format are not visible here — confirm.
	Memory string `json:"memory,omitempty"`

	// GPUs lists the individual GPU instances of the machine.
	GPUs []MachineGPUInstance `json:"gpus,omitempty"`
}

func (*MachineGPUInfo) RenderTable added in v0.5.0

func (gi *MachineGPUInfo) RenderTable(wr io.Writer)

type MachineGPUInstance added in v0.5.0

// MachineGPUInstance identifies a single GPU. All fields are tagged omitempty.
type MachineGPUInstance struct {
	// UUID is the GPU UUID from the nvml API.
	// e.g., "GPU-46a3bbe2-3e87-3dde-b464-a03eba0c21d7"
	UUID string `json:"uuid,omitempty"`

	// BusID is the GPU bus ID from the nvml API.
	// e.g., "0000:0f:00.0"
	BusID string `json:"busID,omitempty"`

	// SN, MinorID and BoardID are further identifiers of the GPU.
	// NOTE(review): presumably the serial number, device minor number and
	// board ID — confirm against the collector.
	SN      string `json:"sn,omitempty"`
	MinorID string `json:"minorID,omitempty"`
	BoardID uint32 `json:"boardID,omitempty"`
}

type MachineInfo added in v0.5.0

// MachineInfo describes a machine: software versions, identifiers, and
// optional per-subsystem details (CPU, memory, GPU, disk, NIC).
// Every field is tagged omitempty; the pointer fields are omitted when nil.
type MachineInfo struct {
	// GPUdVersion is the current version of GPUd.
	GPUdVersion string `json:"gpudVersion,omitempty"`
	// GPUDriverVersion is the version of the installed GPU driver.
	GPUDriverVersion string `json:"gpuDriverVersion,omitempty"`
	// CUDAVersion is the current version of the CUDA library.
	CUDAVersion string `json:"cudaVersion,omitempty"`
	// ContainerRuntimeVersion is the container runtime version reported by the
	// node through the runtime remote API (e.g. containerd://1.4.2).
	ContainerRuntimeVersion string `json:"containerRuntimeVersion,omitempty"`
	// KernelVersion is the kernel version reported by the node from
	// 'uname -r' (e.g. 3.16.0-0.bpo.4-amd64).
	KernelVersion string `json:"kernelVersion,omitempty"`
	// OSImage is the OS image reported by the node from /etc/os-release
	// (e.g. Debian GNU/Linux 7 (wheezy)).
	OSImage string `json:"osImage,omitempty"`
	// OperatingSystem is the operating system reported by the node.
	OperatingSystem string `json:"operatingSystem,omitempty"`
	// SystemUUID comes from https://github.com/google/cadvisor/blob/master/utils/sysfs/sysfs.go#L442
	SystemUUID string `json:"systemUUID,omitempty"`
	// MachineID is collected by GPUd. It comes from /etc/machine-id or /var/lib/dbus/machine-id
	MachineID string `json:"machineID,omitempty"`
	// BootID is collected by GPUd.
	BootID string `json:"bootID,omitempty"`
	// Hostname is the current hostname of the machine.
	Hostname string `json:"hostname,omitempty"`
	// Uptime is a timestamp, not a duration.
	// NOTE(review): presumably the time the machine came up (boot time) — confirm.
	// NOTE(review): if metav1.Time is the Kubernetes apimachinery struct type,
	// omitempty has no effect here under encoding/json (struct values are
	// never "empty"), so the key is always emitted — confirm intent.
	Uptime metav1.Time `json:"uptime,omitempty"`

	// CPUInfo is the CPU info of the machine.
	CPUInfo *MachineCPUInfo `json:"cpuInfo,omitempty"`
	// MemoryInfo is the memory info of the machine.
	MemoryInfo *MachineMemoryInfo `json:"memoryInfo,omitempty"`
	// GPUInfo is the GPU info of the machine.
	GPUInfo *MachineGPUInfo `json:"gpuInfo,omitempty"`
	// DiskInfo is the disk info of the machine.
	DiskInfo *MachineDiskInfo `json:"diskInfo,omitempty"`
	// NICInfo is the network interface info of the machine.
	NICInfo *MachineNICInfo `json:"nicInfo,omitempty"`
}

func (*MachineInfo) RenderTable added in v0.5.0

func (i *MachineInfo) RenderTable(wr io.Writer)

type MachineLocation added in v0.5.0

// MachineLocation is the location of a machine, expressed as a region and a
// zone. Both fields are tagged omitempty.
type MachineLocation struct {
	Region string `json:"region,omitempty"`
	Zone   string `json:"zone,omitempty"`
}

MachineLocation is the location info of the machine.

type MachineMemoryInfo added in v0.5.0

// MachineMemoryInfo is the memory info of a machine.
type MachineMemoryInfo struct {
	// TotalBytes is the total memory in bytes. The tag has no omitempty, so
	// a zero value is still serialized.
	TotalBytes uint64 `json:"totalBytes"`
}

type MachineNICInfo added in v0.5.0

// MachineNICInfo holds the network interface info of a machine.
type MachineNICInfo struct {
	// PrivateIPInterfaces lists the machine's network interfaces that carry
	// a private IP address.
	PrivateIPInterfaces []MachineNetworkInterface `json:"privateIPInterfaces,omitempty"`
}

MachineNICInfo consists of the network info of the machine.

type MachineNetwork added in v0.5.0

// MachineNetwork holds the public and private IP addresses of a machine.
// Both fields are tagged omitempty.
type MachineNetwork struct {
	// PublicIP is the public IP address of the machine.
	PublicIP string `json:"publicIP,omitempty"`
	// PrivateIP is the first private IPv4 address detected on the local
	// host. The user may override it with a private IP address of their
	// choice.
	PrivateIP string `json:"privateIP,omitempty"`
}

MachineNetwork is the network info of the machine.

type MachineNetworkInterface added in v0.5.0

// MachineNetworkInterface describes one network interface of a machine.
type MachineNetworkInterface struct {
	// Interface is the name of the network interface.
	Interface string `json:"interface,omitempty"`

	// MAC is the MAC address of the interface.
	MAC string `json:"mac,omitempty"`

	// IP is the string representation of Addr.
	IP string `json:"ip,omitempty"`

	// Addr is the netip.Addr of the interface. It is tagged json:"-" and is
	// therefore never serialized; only IP carries the address over JSON.
	Addr netip.Addr `json:"-"`
}

MachineNetworkInterface is the network interface info of the machine.

type Metric added in v0.5.0

// Metric is a single named sample: a float64 Value with optional Labels and a
// timestamp. Labels is omitted from the JSON encoding when empty; the other
// fields are always serialized.
// NOTE(review): UnixSeconds is presumably seconds since the Unix epoch, as
// the name says — confirm that producers do not emit milliseconds.
type Metric struct {
	UnixSeconds int64             `json:"unix_seconds"`
	Name        string            `json:"name"`
	Labels      map[string]string `json:"labels,omitempty"`
	Value       float64           `json:"value"`
}

type Metrics added in v0.5.0

// Metrics is a slice of Metric values.
type Metrics []Metric

type NotificationRequest added in v0.5.0

// NotificationRequest is the payload of a notification: an ID plus the
// NotificationType being reported. Both fields are always serialized.
// NOTE(review): what ID identifies (presumably the machine) is not visible
// here — confirm.
type NotificationRequest struct {
	ID   string           `json:"id"`
	Type NotificationType `json:"type"`
}

type NotificationResponse added in v0.5.0

// NotificationResponse is the payload returned for a NotificationRequest.
// Neither field is tagged omitempty, so both keys are always present in the
// JSON encoding, as empty strings when unset.
type NotificationResponse struct {
	Error  string `json:"error"`
	Status string `json:"status"`
}

type NotificationType added in v0.5.0

// NotificationType is the string-typed kind of a notification, carried in
// NotificationRequest.Type.
type NotificationType string
// Named NotificationType values: "shutdown" and "startup".
const (
	NotificationTypeShutdown NotificationType = "shutdown"
	NotificationTypeStartup  NotificationType = "startup"
)

type PackagePhase added in v0.5.0

// PackagePhase is the string-typed phase of a package, carried in
// PackageStatus.Phase.
type PackagePhase string
// Named PackagePhase values. Unlike the other constant groups in this
// package, these names put the type word last (InstalledPhase, not
// PackagePhaseInstalled).
const (
	InstalledPhase  PackagePhase = "Installed"
	InstallingPhase PackagePhase = "Installing"
	UnknownPhase    PackagePhase = "Unknown"
)

type PackageStatus added in v0.5.0

// PackageStatus reports the state of one package: its name, phase, a status
// string and its current version. All four fields are always serialized.
// NOTE(review): the format of Status is not visible here — confirm with the
// producer.
type PackageStatus struct {
	Name           string       `json:"name"`
	Phase          PackagePhase `json:"phase"`
	Status         string       `json:"status"`
	CurrentVersion string       `json:"current_version"`
}

type RepairActionType added in v0.5.0

// RepairActionType names a kind of remediation that can be suggested for an
// issue; SuggestedActions.RepairActions carries a list of these values.
type RepairActionType string
// Named RepairActionType values.
const (
	// RepairActionTypeIgnoreNoActionRequired represents a suggested action to ignore the issue,
	// meaning no action is needed until further notice.
	RepairActionTypeIgnoreNoActionRequired RepairActionType = "IGNORE_NO_ACTION_REQUIRED"

	// RepairActionTypeRebootSystem represents a suggested action to reboot the system.
	// Specific to NVIDIA GPUs, this implies GPU reset by rebooting the system.
	RepairActionTypeRebootSystem RepairActionType = "REBOOT_SYSTEM"

	// RepairActionTypeHardwareInspection represents a suggested action for hardware inspection
	// and repair if any issue is found. This often involves data center (or cloud provider) support
	// to physically check/repair the machine.
	RepairActionTypeHardwareInspection RepairActionType = "HARDWARE_INSPECTION"

	// RepairActionTypeCheckUserAppAndGPU represents a suggested action to check
	// both the user application and the GPU.
	// For instance, NVIDIA may report XID 45 as a user app error, but the underlying GPU might have other issues
	// and thus requires further diagnosis of the application and the GPU.
	RepairActionTypeCheckUserAppAndGPU RepairActionType = "CHECK_USER_APP_AND_GPU"
)

type RunModeType added in v0.5.0

// RunModeType is the string-typed run mode of a component, carried in
// HealthState.RunMode; see RunModeTypeAuto and RunModeTypeManual.
type RunModeType string

RunModeType defines the run mode of a component.

// Named RunModeType values.
// NOTE(review): the HealthState.RunMode field comment describes an empty
// value as the periodic mode; confirm whether "" and "auto" are treated the
// same by consumers.
const (
	// RunModeTypeAuto is the run mode that runs automatically with the specified interval
	// when enabled as a component.
	RunModeTypeAuto RunModeType = "auto"
	// RunModeTypeManual is the run mode that requires a manual trigger to run the check.
	RunModeTypeManual RunModeType = "manual"
)

type SuggestedActions added in v0.5.0

// SuggestedActions is a description of an issue plus the repair actions
// suggested to mitigate it. Neither field is tagged omitempty.
type SuggestedActions struct {
	// Description describes the issue in detail.
	Description string `json:"description"`

	// RepairActions is the list of repair actions to mitigate the issue.
	// Without omitempty, a nil slice encodes as JSON null under
	// encoding/json rather than being left out.
	RepairActions []RepairActionType `json:"repair_actions"`
}

SuggestedActions represents a set of suggested actions to mitigate an issue.

func (*SuggestedActions) DescribeActions added in v0.5.0

func (sa *SuggestedActions) DescribeActions() string

type ValidationResult added in v0.5.0

// ValidationResult is one named validation outcome returned in
// LoginResponse.ValidationResults: whether the check passed (Valid), plus a
// Reason and a Suggestion string. All four fields are always serialized.
// NOTE(review): presumably Reason and Suggestion are only meaningful when
// Valid is false — confirm with the control plane.
type ValidationResult struct {
	Name       string `json:"name"`
	Valid      bool   `json:"valid"`
	Reason     string `json:"reason"`
	Suggestion string `json:"suggestion"`
}

ValidationResult is the validation result for the login request.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL