controller

package
v0.0.0-...-818d72a Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 24, 2025 License: Apache-2.0, Apache-2.0 Imports: 44 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// Package-wide string constants: default names, annotation keys (all
	// under the "nvidia.com/" prefix), deployment target types, the debug
	// HTTP header name, and the "kubernetes" deployment strategy value.
	// NOTE(review): KubernetesDeploymentStrategy is presumably a value for
	// the KubeAnnotationDeploymentStrategy annotation — confirm against callers.
	DefaultClusterName                                   = "default"
	DefaultServiceAccountName                            = "default"
	KubeAnnotationDeploymentStrategy                     = "nvidia.com/deployment-strategy"
	KubeAnnotationEnableStealingTrafficDebugMode         = "nvidia.com/enable-stealing-traffic-debug-mode"
	KubeAnnotationEnableDebugMode                        = "nvidia.com/enable-debug-mode"
	KubeAnnotationEnableDebugPodReceiveProductionTraffic = "nvidia.com/enable-debug-pod-receive-production-traffic"
	DeploymentTargetTypeProduction                       = "production"
	DeploymentTargetTypeDebug                            = "debug"
	HeaderNameDebug                                      = "X-Nvidia-Debug"
	KubernetesDeploymentStrategy                         = "kubernetes"

	// Deployment type and component type identifiers. Note that
	// ComponentTypePlanner is capitalized ("Planner") while the deployment
	// type values are lowercase.
	DeploymentTypeStandard       = "standard"
	DeploymentTypeMultinodeGrove = "multinode-grove"
	ComponentTypePlanner         = "Planner"
)
View Source
const (
	// State constants
	// These are untyped string constants (StateEmpty is the empty string).
	// They are distinct from the typed State constants (FailedState,
	// ReadyState, PendingState) declared elsewhere in this package, whose
	// values differ (for example "pending" versus "Pending" here).
	StateEmpty             = ""
	StatePending           = "Pending"
	StateProfiling         = "Profiling"
	StateDeploying         = "Deploying"
	StateReady             = "Ready"
	StateDeploymentDeleted = "DeploymentDeleted"
	StateFailed            = "Failed"

	// Condition types
	ConditionTypeValidation      = "Validation"
	ConditionTypeProfiling       = "Profiling"
	ConditionTypeSpecGenerated   = "SpecGenerated"
	ConditionTypeDeploymentReady = "DeploymentReady"

	// Event reasons
	EventReasonInitialized          = "Initialized"
	EventReasonValidationFailed     = "ValidationFailed"
	EventReasonProfilingJobCreated  = "ProfilingJobCreated"
	EventReasonProfilingJobFailed   = "ProfilingJobFailed"
	EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
	EventReasonSpecGenerated        = "SpecGenerated"
	EventReasonSpecChangeRejected   = "SpecChangeRejected"
	EventReasonDeploymentCreated    = "DeploymentCreated"
	EventReasonDeploymentReady      = "DeploymentReady"
	EventReasonDeploymentDegraded   = "DeploymentDegraded"
	EventReasonDeploymentDeleted    = "DeploymentDeleted"

	// Label keys
	LabelApp           = "app"
	LabelDGDR          = "dgdr"
	LabelDGDRName      = "dgdr.nvidia.com/name"
	LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
	LabelManagedBy     = "nvidia.com/managed-by"

	// Label values
	LabelValueDynamoProfiler = "dynamo-profiler"
	LabelValueAICProfiler    = "aic-profiler"
	LabelValueDynamoOperator = "dynamo-operator"

	// Job naming
	// Both values end in "-", so they are presumably prepended to another
	// name to form the job name — confirm against the job-creation code.
	JobNamePrefixOnline = "profile-online-"
	JobNamePrefixAIC    = "profile-aic-"

	// Container names
	ContainerNameProfiler     = "profiler"
	ContainerNameOutputCopier = "output-copier"

	// ServiceAccount
	ServiceAccountProfilingJob = "dgdr-profiling-job"

	// ConfigMap naming
	ConfigMapOutputPrefix = "dgdr-output-"

	// Sidecar image
	// NOTE(review): the "latest" tag is mutable, so the image actually
	// pulled can change between runs. Consider pinning a specific version
	// tag or digest for reproducible behavior.
	SidecarImage = "bitnami/kubectl:latest"

	// Volume names
	VolumeNameProfilingConfig = "profiling-config"
	VolumeNameProfilingOutput = "profiling-output"

	// Volume paths
	ProfilingOutputPath = "/data"
	ProfilingOutputFile = "config_with_planner.yaml"
	ProfilingConfigPath = "/config"
	ProfilingConfigFile = "disagg.yaml"

	// Command line arguments
	ArgModel   = "--model"
	ArgBackend = "--backend"
	ArgTTFT    = "--ttft"
	ArgITL     = "--itl"
	ArgConfig  = "--config"

	// Messages
	// Entries containing %s are format strings; callers presumably supply
	// the arguments through fmt-style formatting — confirm at the call sites.
	MessageInitialized               = "DGDR initialized successfully"
	MessageProfilingJobCreated       = "Profiling job created"
	MessageAICProfilingJobCreated    = "AIC profiling job created"
	MessageProfilingInProgress       = "Profiling is in progress"
	MessageSpecGenerated             = "DynamoGraphDeployment spec generated successfully"
	MessageSpecAvailable             = "Generated spec is available in status.generatedDeployment"
	MessageDeploymentCreated         = "DynamoGraphDeployment %s created successfully"
	MessageDeploymentReady           = "DynamoGraphDeployment %s is ready"
	MessageDeploymentDegraded        = "DynamoGraphDeployment %s degraded from Ready to %s"
	MessageDeploymentDeleted         = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
	MessageInvalidState              = "Invalid state"
	// The value of MessageSpecChangeRejected is elided in this rendering
	// (see the inline comment); the real literal is 151 bytes long, not empty.
	MessageSpecChangeRejected        = "" /* 151-byte string literal not displayed */
	// NOTE(review): the next six Message* values are CamelCase identifiers
	// rather than sentences, so they read like reason codes — confirm that
	// the Message prefix is intended.
	MessageJobCreationFailed         = "JobCreationFailed"
	MessageDeploymentCreationFailed  = "DeploymentCreationFailed"
	MessageResultsRetrievalFailed    = "ResultsRetrievalFailed"
	MessageGenerationFailed          = "GenerationFailed"
	MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed"
	MessageProfilingCheckFailed      = "ProfilingCheckFailed"
	MessageConfigMapNotFound         = "ConfigMap %s not found in namespace %s"
	MessageConfigMapKeyNotFound      = "key %s not found in ConfigMap %s"

	// Validation messages
	// ValidationErrorInvalidBackend is a format string; the backends it
	// lists match the Backend* constants below and must be kept in sync.
	ValidationErrorModelNameRequired = "modelName is required"
	ValidationErrorITLPositive       = "sla.itl must be positive"
	ValidationErrorTTFTPositive      = "sla.ttft must be positive"
	ValidationErrorInvalidBackend    = "invalid backend: %s (must be vllm, sglang, or trtllm)"

	// Valid backend values
	BackendVLLM   = "vllm"
	BackendSGLang = "sglang"
	BackendTRTLLM = "trtllm"
)

Variables

This section is empty.

Functions

func IsDeploymentReady

func IsDeploymentReady(deployment *appsv1.Deployment) bool

IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available. It checks various status fields to ensure all replicas are available and the deployment configuration has been fully applied.

func IsLeaderWorkerSetReady

func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool

IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available.

Types

type DynamoComponentDeploymentReconciler

type DynamoComponentDeploymentReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Recorder is the event recorder for this reconciler.
	Recorder              record.EventRecorder
	// NOTE(review): Config is qualified as controller_common here, while the
	// other reconcilers in this package use commonController — presumably two
	// aliases for the same import; confirm and unify.
	Config                controller_common.Config
	// EtcdStorage and DockerSecretRetriever are declared with types that are
	// unexported from this package (etcdStorage, dockerSecretRetriever);
	// their definitions are not shown here.
	EtcdStorage           etcdStorage
	DockerSecretRetriever dockerSecretRetriever
}

DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object

func (*DynamoComponentDeploymentReconciler) FinalizeResource

func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error

func (*DynamoComponentDeploymentReconciler) GetRecorder

func (*DynamoComponentDeploymentReconciler) Reconcile

func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoComponentDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile

func (*DynamoComponentDeploymentReconciler) SetupWithManager

func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentReconciler

type DynamoGraphDeploymentReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Config is the shared controller configuration.
	Config                commonController.Config
	// Recorder is the event recorder for this reconciler.
	Recorder              record.EventRecorder
	// DockerSecretRetriever has a type unexported from this package; its
	// definition is not shown here.
	DockerSecretRetriever dockerSecretRetriever
	// ScaleClient provides access to scale subresources — presumably used to
	// scale managed workloads; confirm against the reconcile logic.
	ScaleClient           scale.ScalesGetter
	// MPISecretReplicator is a pointer, so it may be nil; whether nil is
	// tolerated is not visible here — confirm before leaving it unset.
	MPISecretReplicator   *secret.SecretReplicator
	// NOTE(review): RBACManager uses the unexported rbacManager type, whereas
	// DynamoGraphDeploymentRequestReconciler uses the exported RBACManager
	// interface for the field of the same name — confirm whether both are needed.
	RBACManager           rbacManager
}

DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object

func (*DynamoGraphDeploymentReconciler) FinalizeResource

func (*DynamoGraphDeploymentReconciler) GetRecorder

func (*DynamoGraphDeploymentReconciler) Reconcile

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoGraphDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile

func (*DynamoGraphDeploymentReconciler) SetupWithManager

func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentRequestReconciler

type DynamoGraphDeploymentRequestReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Recorder is the event recorder for this reconciler.
	Recorder record.EventRecorder
	// Config is the shared controller configuration.
	Config   commonController.Config

	// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC).
	ProfilerImage string
	// RBACManager handles RBAC setup for profiling jobs.
	RBACManager RBACManager
}

DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object

func (*DynamoGraphDeploymentRequestReconciler) FinalizeResource

FinalizeResource implements commonController.Finalizer interface

func (*DynamoGraphDeploymentRequestReconciler) GetRecorder

GetRecorder implements commonController.Reconciler interface

func (*DynamoGraphDeploymentRequestReconciler) Reconcile

Reconcile handles the reconciliation loop for DynamoGraphDeploymentRequest

func (*DynamoGraphDeploymentRequestReconciler) SetupWithManager

func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type IngressConfig

// IngressConfig groups the settings used when building an ingress.
// NOTE(review): field semantics below are inferred from names and types
// only; confirm against the code that consumes this struct.
type IngressConfig struct {
	// ClassName is a pointer, so nil is distinguishable from an empty
	// string — presumably nil means "no ingress class set".
	ClassName           *string
	// Annotations are presumably copied onto the generated ingress object.
	Annotations         map[string]string
	// Path and PathType presumably populate the ingress rule's HTTP path.
	Path                string
	PathType            networkingv1.PathType
	// TLSMode takes one of the TLSModeOpt values declared in this package.
	TLSMode             TLSModeOpt
	// StaticTLSSecretName is presumably only consulted when TLSMode is
	// TLSModeStatic — confirm.
	StaticTLSSecretName string
}

type Message

// Message is a named string type. Its use sites are not visible here;
// presumably it carries human-readable status text — confirm.
type Message string

type RBACManager

type RBACManager interface {
	// EnsureServiceAccountWithRBAC takes a target namespace, a service
	// account name, and a cluster role name, and reports failure through
	// the returned error. Presumably it creates or updates the service
	// account and binds it to the cluster role — confirm against the
	// implementation, which is not shown here.
	EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

RBACManager interface for managing RBAC resources

type Reason

// Reason is a named string type. Its use sites are not visible here;
// presumably it carries a short machine-readable reason code — confirm.
type Reason string

type Resource

// Resource is implemented by objects that can report their own readiness
// and name.
type Resource interface {
	// IsReady reports whether the resource is ready, together with a
	// reason string. Whether reason is populated when ready is true is
	// not visible here — confirm against implementations.
	IsReady() (ready bool, reason string)
	// GetName returns the resource's name.
	GetName() string
}

type State

// State is a named string type with three declared values.
//
// These typed constants are distinct from the untyped State* string
// constants (StatePending, StateReady, StateFailed, ...) declared in this
// package: the values differ in case, and ReadyState is "successful"
// rather than "ready".
type State string
const (
	FailedState  State = "failed"
	// NOTE(review): the identifier says Ready but the value is "successful".
	ReadyState   State = "successful"
	PendingState State = "pending"
)

type TLSModeOpt

// TLSModeOpt is a named string type with three declared values; it is the
// type of IngressConfig.TLSMode.
type TLSModeOpt string
const (
	// NOTE(review): meanings below are inferred from the names — confirm.
	// TLSModeNone presumably disables TLS.
	TLSModeNone   TLSModeOpt = "none"
	// TLSModeAuto presumably has the TLS secret chosen automatically.
	TLSModeAuto   TLSModeOpt = "auto"
	// TLSModeStatic presumably uses IngressConfig.StaticTLSSecretName.
	TLSModeStatic TLSModeOpt = "static"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL