Documentation
¶
Index ¶
- Constants
- func IsDeploymentReady(deployment *appsv1.Deployment) bool
- func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool
- type DynamoComponentDeploymentReconciler
- func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error
- func (r *DynamoComponentDeploymentReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
- func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
- type DynamoGraphDeploymentReconciler
- func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error
- func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
- type DynamoGraphDeploymentRequestReconciler
- func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error
- func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder
- func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error
- type IngressConfig
- type Message
- type RBACManager
- type Reason
- type Resource
- type State
- type TLSModeOpt
Constants ¶
const ( DefaultClusterName = "default" DefaultServiceAccountName = "default" KubeAnnotationDeploymentStrategy = "nvidia.com/deployment-strategy" KubeAnnotationEnableStealingTrafficDebugMode = "nvidia.com/enable-stealing-traffic-debug-mode" KubeAnnotationEnableDebugMode = "nvidia.com/enable-debug-mode" KubeAnnotationEnableDebugPodReceiveProductionTraffic = "nvidia.com/enable-debug-pod-receive-production-traffic" DeploymentTargetTypeProduction = "production" DeploymentTargetTypeDebug = "debug" HeaderNameDebug = "X-Nvidia-Debug" KubernetesDeploymentStrategy = "kubernetes" DeploymentTypeStandard = "standard" DeploymentTypeMultinodeGrove = "multinode-grove" ComponentTypePlanner = "Planner" )
const ( // State constants StateEmpty = "" StatePending = "Pending" StateProfiling = "Profiling" StateDeploying = "Deploying" StateReady = "Ready" StateDeploymentDeleted = "DeploymentDeleted" StateFailed = "Failed" // Condition types ConditionTypeValidation = "Validation" ConditionTypeProfiling = "Profiling" ConditionTypeSpecGenerated = "SpecGenerated" ConditionTypeDeploymentReady = "DeploymentReady" // Event reasons EventReasonInitialized = "Initialized" EventReasonValidationFailed = "ValidationFailed" EventReasonProfilingJobCreated = "ProfilingJobCreated" EventReasonProfilingJobFailed = "ProfilingJobFailed" EventReasonAIConfiguratorFailed = "AIConfiguratorFailed" EventReasonSpecGenerated = "SpecGenerated" EventReasonSpecChangeRejected = "SpecChangeRejected" EventReasonDeploymentCreated = "DeploymentCreated" EventReasonDeploymentReady = "DeploymentReady" EventReasonDeploymentDegraded = "DeploymentDegraded" EventReasonDeploymentDeleted = "DeploymentDeleted" // Label keys LabelApp = "app" LabelDGDR = "dgdr" LabelDGDRName = "dgdr.nvidia.com/name" LabelDGDRNamespace = "dgdr.nvidia.com/namespace" LabelManagedBy = "nvidia.com/managed-by" // Label values LabelValueDynamoProfiler = "dynamo-profiler" LabelValueAICProfiler = "aic-profiler" LabelValueDynamoOperator = "dynamo-operator" // Job naming JobNamePrefixOnline = "profile-online-" JobNamePrefixAIC = "profile-aic-" // Container names ContainerNameProfiler = "profiler" ContainerNameOutputCopier = "output-copier" // ServiceAccount ServiceAccountProfilingJob = "dgdr-profiling-job" // ConfigMap naming ConfigMapOutputPrefix = "dgdr-output-" // Sidecar image SidecarImage = "bitnami/kubectl:latest" // Volume names VolumeNameProfilingConfig = "profiling-config" VolumeNameProfilingOutput = "profiling-output" // Volume paths ProfilingOutputPath = "/data" ProfilingOutputFile = "config_with_planner.yaml" ProfilingConfigPath = "/config" ProfilingConfigFile = "disagg.yaml" // Command line arguments ArgModel = "--model" ArgBackend = 
"--backend" ArgTTFT = "--ttft" ArgITL = "--itl" ArgConfig = "--config" // Messages MessageInitialized = "DGDR initialized successfully" MessageProfilingJobCreated = "Profiling job created" MessageAICProfilingJobCreated = "AIC profiling job created" MessageProfilingInProgress = "Profiling is in progress" MessageSpecGenerated = "DynamoGraphDeployment spec generated successfully" MessageSpecAvailable = "Generated spec is available in status.generatedDeployment" MessageDeploymentCreated = "DynamoGraphDeployment %s created successfully" MessageDeploymentReady = "DynamoGraphDeployment %s is ready" MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s" MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy." MessageInvalidState = "Invalid state" MessageSpecChangeRejected = "" /* 151-byte string literal not displayed */ MessageJobCreationFailed = "JobCreationFailed" MessageDeploymentCreationFailed = "DeploymentCreationFailed" MessageResultsRetrievalFailed = "ResultsRetrievalFailed" MessageGenerationFailed = "GenerationFailed" MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed" MessageProfilingCheckFailed = "ProfilingCheckFailed" MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s" MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s" // Validation messages ValidationErrorModelNameRequired = "modelName is required" ValidationErrorITLPositive = "sla.itl must be positive" ValidationErrorTTFTPositive = "sla.ttft must be positive" ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)" // Valid backend values BackendVLLM = "vllm" BackendSGLang = "sglang" BackendTRTLLM = "trtllm" )
Variables ¶
This section is empty.
Functions ¶
func IsDeploymentReady ¶
func IsDeploymentReady(deployment *appsv1.Deployment) bool
IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available. It checks various status fields to ensure all replicas are available and the deployment configuration has been fully applied.
func IsLeaderWorkerSetReady ¶
func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool
IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available.
Types ¶
type DynamoComponentDeploymentReconciler ¶
type DynamoComponentDeploymentReconciler struct {
client.Client
Recorder record.EventRecorder
Config controller_common.Config
EtcdStorage etcdStorage
DockerSecretRetriever dockerSecretRetriever
}
DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object
func (*DynamoComponentDeploymentReconciler) FinalizeResource ¶
func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error
func (*DynamoComponentDeploymentReconciler) GetRecorder ¶
func (r *DynamoComponentDeploymentReconciler) GetRecorder() record.EventRecorder
func (*DynamoComponentDeploymentReconciler) Reconcile ¶
func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoComponentDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.
For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile
func (*DynamoComponentDeploymentReconciler) SetupWithManager ¶
func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type DynamoGraphDeploymentReconciler ¶
type DynamoGraphDeploymentReconciler struct {
client.Client
Config commonController.Config
Recorder record.EventRecorder
DockerSecretRetriever dockerSecretRetriever
ScaleClient scale.ScalesGetter
MPISecretReplicator *secret.SecretReplicator
RBACManager rbacManager
}
DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
func (*DynamoGraphDeploymentReconciler) FinalizeResource ¶
func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error
func (*DynamoGraphDeploymentReconciler) GetRecorder ¶
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder
func (*DynamoGraphDeploymentReconciler) Reconcile ¶
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoGraphDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.
For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
func (*DynamoGraphDeploymentReconciler) SetupWithManager ¶
func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type DynamoGraphDeploymentRequestReconciler ¶
type DynamoGraphDeploymentRequestReconciler struct {
client.Client
Recorder record.EventRecorder
Config commonController.Config
// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC)
ProfilerImage string
// RBACManager handles RBAC setup for profiling jobs
RBACManager RBACManager
}
DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
func (*DynamoGraphDeploymentRequestReconciler) FinalizeResource ¶
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error
FinalizeResource implements commonController.Finalizer interface
func (*DynamoGraphDeploymentRequestReconciler) GetRecorder ¶
func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder
GetRecorder implements commonController.Reconciler interface
func (*DynamoGraphDeploymentRequestReconciler) Reconcile ¶
func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile handles the reconciliation loop for DynamoGraphDeploymentRequest
func (*DynamoGraphDeploymentRequestReconciler) SetupWithManager ¶
func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type IngressConfig ¶
type IngressConfig struct {
ClassName *string
Annotations map[string]string
Path string
PathType networkingv1.PathType
TLSMode TLSModeOpt
StaticTLSSecretName string
}
type RBACManager ¶
type RBACManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}
RBACManager is the interface for managing RBAC resources.
type TLSModeOpt ¶
type TLSModeOpt string
const ( TLSModeNone TLSModeOpt = "none" TLSModeAuto TLSModeOpt = "auto" TLSModeStatic TLSModeOpt = "static" )