Documentation
¶
Index ¶
Constants ¶
// Default values for the health server.
const (
	// DefaultAPIVersion is the default API version for the health server.
	DefaultAPIVersion = "v1"

	// DefaultHealthPort is the default port for health metrics export.
	DefaultHealthPort = 15133

	// DefaultListenHost is the default host to bind to (localhost only, for security).
	DefaultListenHost = "127.0.0.1"
)
Variables ¶
var ( // DefaultListenAddress combines the default host and port (for server binding) DefaultListenAddress = fmt.Sprintf("%s:%d", DefaultListenHost, DefaultHealthPort) // DefaultClientURL is the default URL for client commands to connect to the server DefaultClientURL = fmt.Sprintf("http://localhost:%d", DefaultHealthPort) // DefaultRetentionPeriod - keep health data for 24 hours by default DefaultRetentionPeriod = metav1.Duration{Duration: 24 * time.Hour} )
Functions ¶
func DefaultStateFile ¶
DefaultStateFile returns the default path for the health state database
Types ¶
type AttestationConfig ¶
// AttestationConfig holds configuration for the attestation process.
type AttestationConfig struct {
// Interval is how often to run attestation (default: 24 hours).
// NOTE(review): where the 24-hour default is applied is not visible in
// this declaration — confirm against the code that builds the config.
Interval metav1.Duration `json:"interval"`
// JitterEnabled controls whether random jitter is added to the
// attestation schedule.
JitterEnabled bool `json:"jitter_enabled"`
}
AttestationConfig holds configuration for the attestation process
type Config ¶
// Config provides configuration for the health metrics exporter.
type Config struct {
// APIVersion is the API version for the health server.
APIVersion string `json:"api_version"`
// Address is the address for the health server to listen on.
Address string `json:"address"`
// State is the path of the state file that persists health status and
// metrics. If empty, states are not persisted to file.
State string `json:"state"`
// RetentionPeriod is the amount of time to retain health states/metrics.
// Once elapsed, old data is automatically purged.
RetentionPeriod metav1.Duration `json:"retention_period"`
// NvidiaToolOverwrites holds NVIDIA tool command paths that overwrite
// the defaults.
NvidiaToolOverwrites pkgconfigcommon.ToolOverwrites `json:"nvidia_tool_overwrites"`
// Components specifies which health components to enable.
// Leave empty, or use "*" or "all", to enable all components.
// Prefix a component name with "-" to disable it.
Components []string `json:"components"`
// EnableDCGMPolicy enables DCGM policy violation monitoring.
// All policies (XID, PCIe, DBE, NVLink, Power, Thermal, Page Retirement) are disabled by default.
// PCIe, DBE, and NVLink have false-positive issues with monotonic counter checks.
EnableDCGMPolicy bool `json:"enable_dcgm_policy"`
// EnableFaultInjection enables the /inject-fault endpoint for testing.
// This endpoint allows injecting faults (kernel messages, component errors, events) into the system.
// SECURITY: Only accessible from localhost (127.0.0.0/8 or ::1). Disabled by default.
EnableFaultInjection bool `json:"enable_fault_injection"`
// HealthExporter is the health exporter configuration. The field is a
// pointer tagged omitempty, so a nil value is left out of the JSON output.
HealthExporter *HealthExporterConfig `json:"health_exporter,omitempty"`
// contains filtered or unexported fields
}
Config provides configuration for the health metrics exporter
func (*Config) ShouldDisable ¶
ShouldDisable returns true if the component should be disabled. If no components are explicitly disabled, all components are enabled by default.
func (*Config) ShouldEnable ¶
ShouldEnable returns true if the component should be enabled. If no components are specified, all components are enabled by default.
func (*Config) ToConfigEntries ¶
func (config *Config) ToConfigEntries(allComponentNames []string) []ConfigEntry
ToConfigEntries converts the Config struct into a slice of ConfigEntry for export.
type ConfigEntry ¶
ConfigEntry represents a single configuration option as a key-value pair
type HealthExporterConfig ¶
// HealthExporterConfig holds configuration for the health data exporter.
type HealthExporterConfig struct {
// MetricsEndpoint is the specific endpoint for sending metrics data.
MetricsEndpoint string `json:"metrics_endpoint"`
// LogsEndpoint is the specific endpoint for sending logs/events data.
LogsEndpoint string `json:"logs_endpoint"`
// Attestation is the attestation configuration.
Attestation AttestationConfig `json:"attestation"`
// AuthToken is the authentication token for HTTP requests.
// It is omitted from the JSON output when empty.
AuthToken string `json:"auth_token,omitempty"`
// Interval is how often to export health data.
Interval metav1.Duration `json:"interval"`
// Timeout is the timeout for HTTP requests to the global health endpoint.
Timeout metav1.Duration `json:"timeout"`
// IncludeMetrics controls whether to include metrics data in exports.
IncludeMetrics bool `json:"include_metrics"`
// IncludeEvents controls whether to include events data in exports.
IncludeEvents bool `json:"include_events"`
// IncludeMachineInfo controls whether to include machine hardware info in exports.
IncludeMachineInfo bool `json:"include_machine_info"`
// IncludeComponentData controls whether to include actual component data/numbers in exports.
IncludeComponentData bool `json:"include_component_data"`
// MetricsLookback determines how far back to look for metrics data.
MetricsLookback metav1.Duration `json:"metrics_lookback"`
// EventsLookback determines how far back to look for events data.
EventsLookback metav1.Duration `json:"events_lookback"`
// HealthCheckInterval determines how often individual components perform their health checks.
// Valid range: 1 second (minimum) to 24 hours (maximum), default is 1 minute.
// NOTE(review): the range check and the default are not enforced by this
// declaration — confirm where they are applied.
HealthCheckInterval metav1.Duration `json:"health_check_interval"`
// RetryMaxAttempts is the maximum number of retry attempts for failed requests.
RetryMaxAttempts int `json:"retry_max_attempts"`
// Offline mode configuration.
// OfflineMode controls whether to use offline mode (write to files instead of an HTTP endpoint).
OfflineMode bool `json:"offline_mode"`
// OutputPath is the directory path where files will be written (required when OfflineMode is true).
OutputPath string `json:"output_path"`
// OutputFormat specifies the format for offline mode output files: "json" (default) or "csv".
OutputFormat string `json:"output_format"`
// Duration is how long to collect telemetry data in offline mode.
// NOTE(review): unlike every other duration field in this struct, this is a
// plain time.Duration rather than metav1.Duration. Unless custom marshalling
// is defined elsewhere, encoding/json presumably treats it as an integer
// nanosecond count instead of a duration string — confirm this is intended.
Duration time.Duration `json:"duration"`
}
HealthExporterConfig holds configuration for the health data exporter
type Op ¶
// Op contains options for health configuration.
type Op struct {
// ToolOverwrites is embedded, so its fields are promoted onto Op.
pkgconfigcommon.ToolOverwrites
}
Op contains options for health configuration