Documentation
¶
Index ¶
- Constants
- func GpuWorkloadName(kind string, name string, uid types.UID) string
- func IsGpuPreemptionEnabled() bool
- func JobStatusChangedPredicate() predicate.Predicate
- type GpuMetricsScraper
- type GpuWorkloadReconciler
- type KaiwoJobReconciler
- type KaiwoQueueConfigReconciler
- func (r *KaiwoQueueConfigReconciler) CreateTopology(ctx context.Context) error
- func (r *KaiwoQueueConfigReconciler) EnsureKaiwoQueueConfig(ctx context.Context, kaiwoQueueConfigName string, clusterQueueName string, ...) error
- func (r *KaiwoQueueConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *KaiwoQueueConfigReconciler) SetupWithManager(mgr ctrl.Manager) error
- func (r *KaiwoQueueConfigReconciler) SyncKueueResources(ctx context.Context, queueConfig *kaiwo.KaiwoQueueConfig) error
- type KaiwoServiceReconciler
- type RootOwnerResult
Constants ¶
const (
	AnnotationPrefix      = "kaiwo.silogen.ai/gpu-preemption."
	AnnotationEnabled     = AnnotationPrefix + "enabled"
	AnnotationThreshold   = AnnotationPrefix + "threshold"
	AnnotationGracePeriod = AnnotationPrefix + "grace-period"
	AnnotationPolicy      = AnnotationPrefix + "policy"
	AnnotationAggregation = AnnotationPrefix + "aggregation"
	AnnotationTTL         = AnnotationPrefix + "ttl"

	GpuWorkloadFinalizer    = "kaiwo.silogen.ai/gpu-workload-protection"
	PreemptionEvalLeaseName = "gpu-preemption-eval"
	PreemptionEvalLeaseDur  = 30 * time.Second
	IdleRequeueInterval     = 60 * time.Second

	// GpuResourcePrefix matches all AMD device-plugin resources (amd.com/gpu,
	// amd.com/gpu-0, etc.). NVIDIA resources are intentionally excluded.
	GpuResourcePrefix = "amd.com/"

	DefaultOperatorNamespace    = "kaiwo-system"
	DefaultUtilizationThreshold = 5.0
	DefaultGracePeriod          = 10 * time.Minute
	DefaultTTL                  = 24 * time.Hour

	EnvGpuPreemptionPrefix = "GPU_PREEMPTION_"
	EnvEnabled             = EnvGpuPreemptionPrefix + "ENABLED"
	EnvDefaultThreshold    = EnvGpuPreemptionPrefix + "DEFAULT_THRESHOLD"
	EnvDefaultGracePeriod  = EnvGpuPreemptionPrefix + "DEFAULT_GRACE_PERIOD"
	EnvDefaultPolicy       = EnvGpuPreemptionPrefix + "DEFAULT_POLICY"
	EnvDefaultAggregation  = EnvGpuPreemptionPrefix + "DEFAULT_AGGREGATION"
	EnvDefaultTTL          = EnvGpuPreemptionPrefix + "DEFAULT_TTL"
	EnvOperatorNamespace   = EnvGpuPreemptionPrefix + "OPERATOR_NAMESPACE"
)
const (
	EnvMetricsEndpoint = EnvGpuPreemptionPrefix + "METRICS_ENDPOINT"
	EnvPollingInterval = EnvGpuPreemptionPrefix + "POLLING_INTERVAL"
)
Variables ¶
This section is empty.
Functions ¶
func GpuWorkloadName ¶ added in v0.2.1
func GpuWorkloadName(kind string, name string, uid types.UID) string
GpuWorkloadName generates a deterministic name for a GpuWorkload CR from the root owner: <lowercase-kind>-<name>-<first-8-chars-of-uid>.
func IsGpuPreemptionEnabled ¶ added in v0.2.1
func IsGpuPreemptionEnabled() bool
func JobStatusChangedPredicate ¶ added in v0.1.6
func JobStatusChangedPredicate() predicate.Predicate
JobStatusChangedPredicate returns a predicate that is true only if Failed, Succeeded, or the lengths of UncountedTerminatedPods.Failed / UncountedTerminatedPods.Succeeded have changed.
Types ¶
type GpuMetricsScraper ¶ added in v0.2.1
type GpuMetricsScraper struct {
// contains filtered or unexported fields
}
GpuMetricsScraper is a manager.Runnable that periodically scrapes AMD GPU metrics and writes per-pod utilization data into GpuWorkload CR statuses.
func NewGpuMetricsScraper ¶ added in v0.2.1
func NewGpuMetricsScraper(c client.Client) (*GpuMetricsScraper, error)
NewGpuMetricsScraper creates a scraper from environment variables.
type GpuWorkloadReconciler ¶ added in v0.2.1
type GpuWorkloadReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
// contains filtered or unexported fields
}
func (*GpuWorkloadReconciler) SetupWithManager ¶ added in v0.2.1
func (r *GpuWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error
type KaiwoJobReconciler ¶
type KaiwoJobReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
KaiwoJobReconciler reconciles a KaiwoJob object
func (*KaiwoJobReconciler) SetupWithManager ¶
func (r *KaiwoJobReconciler) SetupWithManager(mgr ctrl.Manager) error
type KaiwoQueueConfigReconciler ¶
type KaiwoQueueConfigReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
KaiwoQueueConfigReconciler reconciles a KaiwoQueueConfig object
func (*KaiwoQueueConfigReconciler) CreateTopology ¶ added in v0.1.6
func (r *KaiwoQueueConfigReconciler) CreateTopology(ctx context.Context) error
func (*KaiwoQueueConfigReconciler) EnsureKaiwoQueueConfig ¶ added in v0.1.4
func (r *KaiwoQueueConfigReconciler) EnsureKaiwoQueueConfig(ctx context.Context, kaiwoQueueConfigName string, clusterQueueName string, cohort string, queueConfig *kaiwo.KaiwoQueueConfig) error
func (*KaiwoQueueConfigReconciler) SetupWithManager ¶
func (r *KaiwoQueueConfigReconciler) SetupWithManager(mgr ctrl.Manager) error
func (*KaiwoQueueConfigReconciler) SyncKueueResources ¶
func (r *KaiwoQueueConfigReconciler) SyncKueueResources(ctx context.Context, queueConfig *kaiwo.KaiwoQueueConfig) error
type KaiwoServiceReconciler ¶
type KaiwoServiceReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
KaiwoServiceReconciler reconciles a KaiwoService object
func (*KaiwoServiceReconciler) SetupWithManager ¶
func (r *KaiwoServiceReconciler) SetupWithManager(mgr ctrl.Manager) error
type RootOwnerResult ¶ added in v0.2.1
type RootOwnerResult struct {
Ref kaiwo.WorkloadReference
OwnerChain string
Namespace string
}
RootOwnerResult holds the resolved root owner and the owner chain string.
func ResolveRootOwner ¶ added in v0.2.1
func ResolveRootOwner(ctx context.Context, c client.Client, namespace string, name string, kind string, apiVersion string, uid types.UID) (*RootOwnerResult, error)
ResolveRootOwner walks the controller ownerReferences chain from a pod up to the root owner (the resource with no controller owner). It uses the unstructured client so it works with arbitrary resource types without importing their Go types.