config

package
v1.0.0-rc.6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 11, 2026 License: Apache-2.0 Imports: 13 Imported by: 0

Documentation

Index

Constants

View Source
// Default values for the health server's API version and bind address.
const (
	// DefaultAPIVersion is the default API version for the health server.
	DefaultAPIVersion = "v1"

	// DefaultHealthPort is the default port for health metrics export.
	DefaultHealthPort = 15133

	// DefaultListenHost is the default host to bind to (localhost only for security).
	DefaultListenHost = "127.0.0.1"
)

Variables

View Source
// Default values derived from the constants above, plus the default data retention window.
var (
	// DefaultListenAddress is DefaultListenHost and DefaultHealthPort joined as
	// "host:port"; it is the address the server binds to by default.
	DefaultListenAddress = fmt.Sprintf("%s:%d", DefaultListenHost, DefaultHealthPort)

	// DefaultClientURL is the default URL for client commands to connect to the server.
	// It is an http:// URL on DefaultHealthPort.
	// NOTE(review): this uses the hostname "localhost" while the server default binds to
	// DefaultListenHost ("127.0.0.1") — confirm "localhost" always resolves to that address.
	DefaultClientURL = fmt.Sprintf("http://localhost:%d", DefaultHealthPort)

	// DefaultRetentionPeriod is the default time to keep health data: 24 hours.
	DefaultRetentionPeriod = metav1.Duration{Duration: 24 * time.Hour}
)

Functions

func DefaultStateFile

func DefaultStateFile() (string, error)

DefaultStateFile returns the default path for the health state database

Types

type AttestationConfig

// AttestationConfig holds configuration for the attestation process:
// how often it runs and whether its schedule is jittered.
type AttestationConfig struct {
	// Interval is how often to run attestation (default: 24 hours).
	Interval metav1.Duration `json:"interval"`

	// JitterEnabled controls whether to add random jitter to the attestation schedule.
	JitterEnabled bool `json:"jitter_enabled"`
}

AttestationConfig holds configuration for the attestation process

type Config

// Config provides configuration for the health metrics exporter.
// All exported fields are serialized to JSON under the snake_case keys in their tags.
type Config struct {
	// APIVersion is the API version for the health server.
	APIVersion string `json:"api_version"`

	// Address is the address for the health server to listen on.
	Address string `json:"address"`

	// State is the state file that persists health status and metrics.
	// If empty, states are not persisted to file.
	State string `json:"state"`

	// RetentionPeriod is the amount of time to retain health states/metrics.
	// Once elapsed, old data is automatically purged.
	RetentionPeriod metav1.Duration `json:"retention_period"`

	// NvidiaToolOverwrites holds NVIDIA tool command paths that overwrite the defaults.
	NvidiaToolOverwrites pkgconfigcommon.ToolOverwrites `json:"nvidia_tool_overwrites"`

	// Components specifies which health components to enable.
	// Leave empty, "*", or "all" to enable all components.
	// Prefix component names with "-" to disable them.
	Components []string `json:"components"`

	// EnableDCGMPolicy enables DCGM policy violation monitoring.
	// All policies (XID, PCIe, DBE, NVLink, Power, Thermal, Page Retirement) are disabled by default.
	// PCIe, DBE, and NVLink have false-positive issues with monotonic counter checks.
	EnableDCGMPolicy bool `json:"enable_dcgm_policy"`

	// EnableFaultInjection enables the /inject-fault endpoint for testing.
	// This endpoint allows injecting faults (kernel messages, component errors, events) into the system.
	// SECURITY: Only accessible from localhost (127.0.0.0/8 or ::1). Disabled by default.
	EnableFaultInjection bool `json:"enable_fault_injection"`

	// HealthExporter is the health exporter configuration.
	// It is a pointer tagged omitempty, so a nil value is left out of the JSON output.
	HealthExporter *HealthExporterConfig `json:"health_exporter,omitempty"`
	// contains filtered or unexported fields
}

Config provides configuration for the health metrics exporter

func Default

func Default(ctx context.Context, opts ...OpOption) (*Config, error)

Default creates a default health configuration

func (*Config) ShouldDisable

func (config *Config) ShouldDisable(componentName string) bool

ShouldDisable returns true if the component should be disabled. If no components are explicitly disabled, all components are enabled by default.

func (*Config) ShouldEnable

func (config *Config) ShouldEnable(componentName string) bool

ShouldEnable returns true if the component should be enabled. If no components are specified, all components are enabled by default.

func (*Config) ToConfigEntries

func (config *Config) ToConfigEntries(allComponentNames []string) []ConfigEntry

ToConfigEntries converts the Config struct into a slice of ConfigEntry for export.

func (*Config) Validate

func (config *Config) Validate() error

Validate checks if the configuration is valid

type ConfigEntry

// ConfigEntry represents a single configuration option as a key-value pair.
// Both parts are plain strings.
type ConfigEntry struct {
	// Key identifies the configuration option.
	Key   string
	// Value is the option's value in string form.
	Value string
}

ConfigEntry represents a single configuration option as a key-value pair

type HealthExporterConfig

// HealthExporterConfig holds configuration for the health data exporter:
// where data is sent, how often, what is included, and the offline-mode settings.
type HealthExporterConfig struct {
	// MetricsEndpoint is the specific endpoint for sending metrics data.
	MetricsEndpoint string `json:"metrics_endpoint"`

	// LogsEndpoint is the specific endpoint for sending logs/events data.
	LogsEndpoint string `json:"logs_endpoint"`

	// Attestation is the attestation configuration.
	Attestation AttestationConfig `json:"attestation"`

	// AuthToken is the authentication token for HTTP requests.
	// It is tagged omitempty, so an empty token is left out of the JSON output.
	AuthToken string `json:"auth_token,omitempty"`

	// Interval is how often to export health data.
	Interval metav1.Duration `json:"interval"`

	// Timeout is the timeout for HTTP requests to the global health endpoint.
	Timeout metav1.Duration `json:"timeout"`

	// IncludeMetrics controls whether to include metrics data in exports.
	IncludeMetrics bool `json:"include_metrics"`

	// IncludeEvents controls whether to include events data in exports.
	IncludeEvents bool `json:"include_events"`

	// IncludeMachineInfo controls whether to include machine hardware info in exports.
	IncludeMachineInfo bool `json:"include_machine_info"`

	// IncludeComponentData controls whether to include actual component data/numbers in exports.
	IncludeComponentData bool `json:"include_component_data"`

	// MetricsLookback determines how far back to look for metrics data.
	MetricsLookback metav1.Duration `json:"metrics_lookback"`

	// EventsLookback determines how far back to look for events data.
	EventsLookback metav1.Duration `json:"events_lookback"`

	// HealthCheckInterval determines how often individual components perform their health checks.
	// Valid range: 1 second (minimum) to 24 hours (maximum), default is 1 minute.
	HealthCheckInterval metav1.Duration `json:"health_check_interval"`

	// RetryMaxAttempts is the maximum number of retry attempts for failed requests.
	RetryMaxAttempts int `json:"retry_max_attempts"`

	// Offline mode configuration.
	// OfflineMode controls whether to use offline mode (write to files instead of HTTP endpoint).
	OfflineMode bool `json:"offline_mode"`

	// OutputPath is the directory path where files will be written (required when OfflineMode is true).
	OutputPath string `json:"output_path"`

	// OutputFormat specifies the format for offline mode output files: "json" (default) or "csv".
	OutputFormat string `json:"output_format"`

	// Duration is how long to collect telemetry data in offline mode.
	// NOTE(review): unlike every other duration field in this struct, this is a plain
	// time.Duration rather than metav1.Duration, so it presumably (de)serializes as an
	// integer nanosecond count instead of a duration string — confirm this is intended.
	Duration time.Duration `json:"duration"`
}

HealthExporterConfig holds configuration for the health data exporter

type Op

// Op contains options for health configuration.
// It embeds pkgconfigcommon.ToolOverwrites, so that type's fields are promoted onto Op.
type Op struct {
	pkgconfigcommon.ToolOverwrites
}

Op contains options for health configuration

func (*Op) ApplyOpts

func (op *Op) ApplyOpts(opts []OpOption) error

ApplyOpts applies all the provided options to the Op struct

type OpOption

// OpOption is a function that modifies health configuration options
// by mutating the *Op it receives.
type OpOption func(*Op)

OpOption is a function that modifies health configuration options

func WithInfinibandClassRootDir

func WithInfinibandClassRootDir(p string) OpOption

WithInfinibandClassRootDir specifies the root directory of the InfiniBand class

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL