Documentation
¶
Index ¶
- Constants
- func GetGPUDiscoveryFailureReason(err error) string
- func IsDeploymentReady(deployment *appsv1.Deployment) bool
- func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool
- type CheckpointReconciler
- type ComponentReconcileResult
- type DynamoComponentDeploymentReconciler
- func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, ...) error
- func (r *DynamoComponentDeploymentReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
- func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
- type DynamoGraphDeploymentReconciler
- func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error
- func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
- func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
- type DynamoGraphDeploymentRequestReconciler
- func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error
- func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error
- type DynamoGraphDeploymentScalingAdapterReconciler
- type DynamoModelReconciler
- type FailoverCascadeReconciler
- type Message
- type RBACManager
- type Reason
- type ReconcileResult
- type Resource
Constants ¶
const ( // Condition types ConditionTypeEndpointsReady = "EndpointsReady" ConditionTypeServicesFound = "ServicesFound" // Condition reasons ReasonAllEndpointsReady = "AllEndpointsReady" ReasonEndpointsDiscovered = "EndpointsDiscovered" ReasonNotReady = "NotReady" ReasonNoEndpoints = "NoEndpoints" ReasonServicesFound = "ServicesFound" ReasonNoServicesFound = "NoServicesFound" )
const ( DefaultClusterName = "default" DefaultServiceAccountName = "default" KubeAnnotationDeploymentStrategy = "nvidia.com/deployment-strategy" KubeAnnotationDeploymentRollingUpdateMaxSurge = "nvidia.com/deployment-rolling-update-max-surge" )
const ( // Job naming JobNamePrefixOnline = "profile-online-" JobNamePrefixAIC = "profile-aic-" // Container names ContainerNameProfiler = "profiler" ContainerNameOutputCopier = "output-copier" // ServiceAccount ServiceAccountProfilingJob = "dgdr-profiling-job" // ConfigMap naming ConfigMapOutputPrefix = "dgdr-output-" // Annotation keys AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources" // Annotation keys for v1alpha1 round-trip compatibility. // The conversion layer stores v1alpha1 fields that have no v1beta1 spec equivalent // as annotations so the controller can still honour them for converted resources. AnnotationConfigMapRef = "nvidia.com/dgdr-config-map-ref" AnnotationOutputPVC = "nvidia.com/dgdr-output-pvc" // Size limits MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit // Sidecar image SidecarImage = "bitnami/kubectl:latest" // Volume names VolumeNameProfilingOutput = "profiling-output" VolumeNameProfilingConfig = "profiling-config" VolumeNameModelCache = "model-cache" VolumeNameOutputCopierKubeAPIAccess = "output-copier-kube-api-access" ConfigMapNameKubeRootCA = "kube-root-ca.crt" ServiceAccountTokenExpirationSeconds = 3600 // Volume paths ProfilingOutputPath = "/data" ProfilingOutputFile = "final_config.yaml" ProfilingConfigMountPath = "/config" ProfilingConfigDefaultKey = "disagg.yaml" DefaultModelCacheMountPath = "/opt/model-cache" ServiceAccountTokenPath = "/var/run/secrets/kubernetes.io/serviceaccount" // Command line arguments ArgModel = "--model" ArgBackend = "--backend" ArgTTFT = "--ttft" ArgITL = "--itl" ArgConfig = "--config" // Messages MessageInitialized = "DGDR initialized successfully" MessageDiscoveringHardware = "Discovering GPU hardware and preparing profiling job" MessageProfilingJobCreated = "Profiling job created" MessageAICProfilingJobCreated = "AIC profiling job created" MessageProfilingInProgress = "Profiling is in progress" MessageSpecGenerated = "DynamoGraphDeployment spec generated successfully" 
MessageSpecAvailable = "Generated spec is available in annotation nvidia.com/generated-dgd-spec" MessageDeploymentCreated = "DynamoGraphDeployment %s created successfully" MessageDeploymentReady = "DynamoGraphDeployment %s is ready" MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s" MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy." MessageInvalidState = "Invalid state" MessageSpecChangeRejected = "" /* 151-byte string literal not displayed */ MessageJobCreationFailed = "JobCreationFailed" MessageDeploymentCreationFailed = "DeploymentCreationFailed" MessageResultsRetrievalFailed = "ResultsRetrievalFailed" MessageGenerationFailed = "GenerationFailed" MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed" MessageProfilingCheckFailed = "ProfilingCheckFailed" MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s" MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s" MessageModelCachePVCNotFound = "model cache PVC %s not found in namespace %s" )
Variables ¶
This section is empty.
Functions ¶
func GetGPUDiscoveryFailureReason ¶
func GetGPUDiscoveryFailureReason(err error) string
GetGPUDiscoveryFailureReason classifies a GPU discovery error and returns a stable, actionable reason string suitable for structured logging.
The classification is based on known error message patterns produced during:
- DCGM exporter pod discovery
- Helm-based GPU operator and DCGM discovery
- Metrics scraping
- Prometheus parsing
If the error does not match any known category, "unknown" is returned.
func IsDeploymentReady ¶
func IsDeploymentReady(deployment *appsv1.Deployment) bool
IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available. It checks various status fields to ensure all replicas are available and the deployment configuration has been fully applied.
func IsLeaderWorkerSetReady ¶
func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool
IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available.
Types ¶
type CheckpointReconciler ¶
type CheckpointReconciler struct {
client.Client
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
Recorder record.EventRecorder
}
CheckpointReconciler reconciles a DynamoCheckpoint object
func (*CheckpointReconciler) GetRecorder ¶
func (r *CheckpointReconciler) GetRecorder() record.EventRecorder
GetRecorder returns the event recorder (implements controller_common.Reconciler interface)
func (*CheckpointReconciler) SetupWithManager ¶
func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type ComponentReconcileResult ¶
type ComponentReconcileResult struct {
// contains filtered or unexported fields
}
type DynamoComponentDeploymentReconciler ¶
type DynamoComponentDeploymentReconciler struct {
client.Client
Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
DockerSecretRetriever dockerSecretRetriever
}
DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object
func (*DynamoComponentDeploymentReconciler) FinalizeResource ¶
func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error
func (*DynamoComponentDeploymentReconciler) GetRecorder ¶
func (r *DynamoComponentDeploymentReconciler) GetRecorder() record.EventRecorder
func (*DynamoComponentDeploymentReconciler) Reconcile ¶
func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
Reconcile is part of the main Kubernetes reconciliation loop, which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoComponentDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.
For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile
func (*DynamoComponentDeploymentReconciler) SetupWithManager ¶
func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type DynamoGraphDeploymentReconciler ¶
type DynamoGraphDeploymentReconciler struct {
client.Client
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commoncontroller.RuntimeConfig
Recorder record.EventRecorder
DockerSecretRetriever dockerSecretRetriever
ScaleClient scale.ScalesGetter
SSHKeyManager *secret.SSHKeyManager
RBACManager rbacManager
}
DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
func (*DynamoGraphDeploymentReconciler) FinalizeResource ¶
func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error
func (*DynamoGraphDeploymentReconciler) GetRecorder ¶
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder
func (*DynamoGraphDeploymentReconciler) Reconcile ¶
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
Reconcile is part of the main Kubernetes reconciliation loop, which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoGraphDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.
For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
func (*DynamoGraphDeploymentReconciler) SetupWithManager ¶
func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type DynamoGraphDeploymentRequestReconciler ¶
type DynamoGraphDeploymentRequestReconciler struct {
client.Client
APIReader client.Reader
Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
GPUDiscoveryCache *gpu.GPUDiscoveryCache
GPUDiscovery *gpu.GPUDiscovery
// RBACManager handles RBAC setup for profiling jobs.
RBACManager RBACManager
}
DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
func (*DynamoGraphDeploymentRequestReconciler) FinalizeResource ¶
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error
FinalizeResource implements the commonController.Finalizer interface.
func (*DynamoGraphDeploymentRequestReconciler) GetRecorder ¶
func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder
GetRecorder implements commonController.Reconciler interface
func (*DynamoGraphDeploymentRequestReconciler) Reconcile ¶
func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile handles the reconciliation loop for DynamoGraphDeploymentRequest
func (*DynamoGraphDeploymentRequestReconciler) SetupWithManager ¶
func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager
type DynamoGraphDeploymentScalingAdapterReconciler ¶
type DynamoGraphDeploymentScalingAdapterReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
}
DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object
func (*DynamoGraphDeploymentScalingAdapterReconciler) Reconcile ¶
func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile implements the reconciliation loop for DynamoGraphDeploymentScalingAdapter
func (*DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager ¶
func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager
type DynamoModelReconciler ¶
type DynamoModelReconciler struct {
client.Client
Recorder record.EventRecorder
EndpointClient *modelendpoint.Client
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commoncontroller.RuntimeConfig
}
DynamoModelReconciler reconciles a DynamoModel object
func (*DynamoModelReconciler) FinalizeResource ¶
func (r *DynamoModelReconciler) FinalizeResource(ctx context.Context, model *v1alpha1.DynamoModel) error
FinalizeResource implements the Finalizer interface. It performs cleanup when a DynamoModel is being deleted.
func (*DynamoModelReconciler) Reconcile ¶
func (r *DynamoModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile handles the reconciliation loop for DynamoModel resources
func (*DynamoModelReconciler) SetupWithManager ¶
func (r *DynamoModelReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager
type FailoverCascadeReconciler ¶
type FailoverCascadeReconciler struct {
client.Client
Recorder record.EventRecorder
}
FailoverCascadeReconciler watches GMS failover pods (restartPolicy: Never) and cascade-deletes all pods in the same engine group when any member reaches a terminal phase (Failed or Succeeded). This ensures broken distributed inference groups are restarted cleanly by Grove.
Background: GMS (GPU Memory Service) pods run with restartPolicy: Never so that Kubernetes does not attempt to restart them in-place — a partial restart would leave the distributed inference group in an inconsistent state. Instead, this controller detects the terminal pod and deletes the entire group. Grove then sees the missing pods and recreates the whole group from scratch.
An engine group is identified by three Grove labels:
- grove.io/podcliquescalinggroup (PCSG name)
- grove.io/podcliquescalinggroup-replica-index (PCSG replica — which copy of the group)
- grove.io/podclique-pod-index (pod index within the clique)
Only pods carrying the dynamo failover engine-group-member label are considered; see failoverCascadePredicate().
func NewFailoverCascadeReconciler ¶
func NewFailoverCascadeReconciler(c client.Client, recorder record.EventRecorder) *FailoverCascadeReconciler
NewFailoverCascadeReconciler creates a new reconciler.
func (*FailoverCascadeReconciler) Reconcile ¶
func (r *FailoverCascadeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile is called whenever a failover-eligible pod transitions to a terminal phase (see failoverCascadePredicate).
DeleteAllOf is idempotent, so concurrent reconciles for multiple pods in the same engine group are harmless — the first deletes the group and subsequent calls are no-ops.
func (*FailoverCascadeReconciler) SetupWithManager ¶
func (r *FailoverCascadeReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager registers a controller that watches all Pods (not just owned ones) and uses failoverCascadePredicate to filter down to only the failover-eligible phase transitions. EnqueueRequestForObject means the reconcile key is the pod itself (namespace/name), not a parent resource.
type RBACManager ¶
type RBACManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}
RBACManager interface for managing RBAC resources
type ReconcileResult ¶
type ReconcileResult struct {
State nvidiacomv1alpha1.DGDState
Reason Reason
Message Message
ServiceStatus map[string]nvidiacomv1alpha1.ServiceReplicaStatus
RestartStatus *nvidiacomv1alpha1.RestartStatus
}
type Resource ¶
type Resource interface {
IsReady() (ready bool, reason string)
GetName() string
GetServiceStatuses() map[string]nvidiacomv1alpha1.ServiceReplicaStatus
}
Source Files
¶
- checkpoint_job.go
- common.go
- dynamo_model_controller.go
- dynamocheckpoint_controller.go
- dynamocomponentdeployment_controller.go
- dynamographdeployment_controller.go
- dynamographdeployment_rollingupdate.go
- dynamographdeploymentrequest_controller.go
- dynamographdeploymentscalingadapter_controller.go
- failover_cascade_controller.go
- profiling_job_overrides.go