controller

package
v0.2.0-rc12 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 24, 2026 License: MIT Imports: 47 Imported by: 0

Documentation

Index

Constants

View Source
// Annotation keys, the GpuWorkload finalizer name, lease settings, fallback
// defaults and environment variable names used by the GPU preemption feature.
const (
	// Annotation keys: each one is AnnotationPrefix followed by a short suffix.
	// The finalizer name, the evaluation lease name/duration and the idle
	// requeue interval are declared in the same group.
	// NOTE(review): presumably the annotations are set on workload objects to
	// override the Default* values below — confirm against the consumer.
	AnnotationPrefix        = "kaiwo.silogen.ai/gpu-preemption."
	AnnotationEnabled       = AnnotationPrefix + "enabled"
	AnnotationThreshold     = AnnotationPrefix + "threshold"
	AnnotationGracePeriod   = AnnotationPrefix + "grace-period"
	AnnotationPolicy        = AnnotationPrefix + "policy"
	AnnotationAggregation   = AnnotationPrefix + "aggregation"
	AnnotationTTL           = AnnotationPrefix + "ttl"
	GpuWorkloadFinalizer    = "kaiwo.silogen.ai/gpu-workload-protection"
	PreemptionEvalLeaseName = "gpu-preemption-eval"
	PreemptionEvalLeaseDur  = 30 * time.Second
	IdleRequeueInterval     = 60 * time.Second

	// GpuResourcePrefix matches all AMD device-plugin resources (amd.com/gpu,
	// amd.com/gpu-0, etc.). NVIDIA resources are intentionally excluded.
	GpuResourcePrefix = "amd.com/"

	// Fallback values.
	// NOTE(review): the unit of DefaultUtilizationThreshold is not visible
	// here (presumably percent GPU utilization) — confirm.
	DefaultOperatorNamespace    = "kaiwo-system"
	DefaultUtilizationThreshold = 5.0
	DefaultGracePeriod          = 10 * time.Minute
	DefaultTTL                  = 24 * time.Hour

	// Environment variable names: each one is EnvGpuPreemptionPrefix followed
	// by a suffix.
	// NOTE(review): presumably these supply operator-wide settings that the
	// matching annotations can override per workload — confirm.
	EnvGpuPreemptionPrefix = "GPU_PREEMPTION_"
	EnvEnabled             = EnvGpuPreemptionPrefix + "ENABLED"
	EnvDefaultThreshold    = EnvGpuPreemptionPrefix + "DEFAULT_THRESHOLD"
	EnvDefaultGracePeriod  = EnvGpuPreemptionPrefix + "DEFAULT_GRACE_PERIOD"
	EnvDefaultPolicy       = EnvGpuPreemptionPrefix + "DEFAULT_POLICY"
	EnvDefaultAggregation  = EnvGpuPreemptionPrefix + "DEFAULT_AGGREGATION"
	EnvDefaultTTL          = EnvGpuPreemptionPrefix + "DEFAULT_TTL"
	EnvOperatorNamespace   = EnvGpuPreemptionPrefix + "OPERATOR_NAMESPACE"
)
View Source
// Environment variable names for the metrics endpoint and the polling
// interval; both are built on EnvGpuPreemptionPrefix.
// NOTE(review): presumably read by NewGpuMetricsScraper, which is documented
// as creating a scraper from environment variables — confirm.
const (
	EnvMetricsEndpoint = EnvGpuPreemptionPrefix + "METRICS_ENDPOINT"
	EnvPollingInterval = EnvGpuPreemptionPrefix + "POLLING_INTERVAL"
)

Variables

This section is empty.

Functions

func GpuWorkloadName added in v0.2.1

func GpuWorkloadName(kind string, name string, uid types.UID) string

GpuWorkloadName generates a deterministic name for a GpuWorkload CR from the root owner: <lowercase-kind>-<name>-<first-8-chars-of-uid>.

func IsGpuPreemptionEnabled added in v0.2.1

func IsGpuPreemptionEnabled() bool

func JobStatusChangedPredicate added in v0.1.6

func JobStatusChangedPredicate() predicate.Predicate

JobStatusChangedPredicate returns a predicate that evaluates to true only if Failed, Succeeded, or the lengths of UncountedTerminatedPods.Failed / .Succeeded have changed.

Types

type GpuMetricsScraper added in v0.2.1

type GpuMetricsScraper struct {
	// contains filtered or unexported fields
}

GpuMetricsScraper is a manager.Runnable that periodically scrapes AMD GPU metrics and writes per-pod utilization data into GpuWorkload CR statuses.

func NewGpuMetricsScraper added in v0.2.1

func NewGpuMetricsScraper(c client.Client) (*GpuMetricsScraper, error)

NewGpuMetricsScraper creates a scraper from environment variables.

func (*GpuMetricsScraper) Start added in v0.2.1

func (s *GpuMetricsScraper) Start(ctx context.Context) error

Start implements manager.Runnable.

type GpuWorkloadReconciler added in v0.2.1

type GpuWorkloadReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
	// contains filtered or unexported fields
}

func (*GpuWorkloadReconciler) Reconcile added in v0.2.1

func (r *GpuWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

func (*GpuWorkloadReconciler) SetupWithManager added in v0.2.1

func (r *GpuWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error

type KaiwoJobReconciler

// KaiwoJobReconciler reconciles a KaiwoJob object. It exposes Reconcile and
// SetupWithManager methods.
// NOTE(review): Scheme and Recorder are exported and presumably populated by
// the caller before SetupWithManager is invoked — confirm at the call site.
type KaiwoJobReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

KaiwoJobReconciler reconciles a KaiwoJob object

func (*KaiwoJobReconciler) Reconcile

func (r *KaiwoJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

func (*KaiwoJobReconciler) SetupWithManager

func (r *KaiwoJobReconciler) SetupWithManager(mgr ctrl.Manager) error

type KaiwoQueueConfigReconciler

// KaiwoQueueConfigReconciler reconciles a KaiwoQueueConfig object. Besides
// Reconcile and SetupWithManager it exposes CreateTopology,
// EnsureKaiwoQueueConfig and SyncKueueResources.
// NOTE(review): Scheme and Recorder are exported and presumably populated by
// the caller before SetupWithManager is invoked — confirm at the call site.
type KaiwoQueueConfigReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

KaiwoQueueConfigReconciler reconciles a KaiwoQueueConfig object

func (*KaiwoQueueConfigReconciler) CreateTopology added in v0.1.6

func (r *KaiwoQueueConfigReconciler) CreateTopology(ctx context.Context) error

func (*KaiwoQueueConfigReconciler) EnsureKaiwoQueueConfig added in v0.1.4

func (r *KaiwoQueueConfigReconciler) EnsureKaiwoQueueConfig(ctx context.Context, kaiwoQueueConfigName string, clusterQueueName string, cohort string, queueConfig *kaiwo.KaiwoQueueConfig) error

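func (*KaiwoQueueConfigReconciler) Reconcile

func (r *KaiwoQueueConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)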

func (*KaiwoQueueConfigReconciler) SetupWithManager

func (r *KaiwoQueueConfigReconciler) SetupWithManager(mgr ctrl.Manager) error

func (*KaiwoQueueConfigReconciler) SyncKueueResources

func (r *KaiwoQueueConfigReconciler) SyncKueueResources(ctx context.Context, queueConfig *kaiwo.KaiwoQueueConfig) error

type KaiwoServiceReconciler

// KaiwoServiceReconciler reconciles a KaiwoService object. It exposes
// Reconcile and SetupWithManager methods.
// NOTE(review): Scheme and Recorder are exported and presumably populated by
// the caller before SetupWithManager is invoked — confirm at the call site.
type KaiwoServiceReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

KaiwoServiceReconciler reconciles a KaiwoService object

func (*KaiwoServiceReconciler) Reconcile

func (r *KaiwoServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

func (*KaiwoServiceReconciler) SetupWithManager

func (r *KaiwoServiceReconciler) SetupWithManager(mgr ctrl.Manager) error

type RootOwnerResult added in v0.2.1

// RootOwnerResult holds the resolved root owner and the owner chain string,
// as returned by ResolveRootOwner.
//
// Ref is the reference to the resolved root owner and OwnerChain is the owner
// chain string.
// NOTE(review): the OwnerChain format is not visible here, and Namespace is
// presumably the namespace of the root owner — confirm against
// ResolveRootOwner.
type RootOwnerResult struct {
	Ref        kaiwo.WorkloadReference
	OwnerChain string
	Namespace  string
}

RootOwnerResult holds the resolved root owner and the owner chain string.

func ResolveRootOwner added in v0.2.1

func ResolveRootOwner(ctx context.Context, c client.Client, namespace string, name string, kind string, apiVersion string, uid types.UID) (*RootOwnerResult, error)

ResolveRootOwner walks the controller ownerReferences chain from a pod up to the root owner (the resource with no controller owner). It uses the unstructured client so it works with arbitrary resource types without importing their Go types.

Directories

Path Synopsis
aim

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL