controller

package
v0.0.0-...-d2faf0e Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 19, 2025 License: Apache-2.0, Apache-2.0 Imports: 54 Imported by: 0

Documentation

Index

Constants

View Source
// Condition type identifiers.
const (
	ConditionTypeEndpointsReady = "EndpointsReady"
	ConditionTypeServicesFound  = "ServicesFound"
)

// Condition reason identifiers.
//
// ReasonServicesFound intentionally carries the same string as
// ConditionTypeServicesFound ("ServicesFound").
const (
	ReasonAllEndpointsReady   = "AllEndpointsReady"
	ReasonEndpointsDiscovered = "EndpointsDiscovered"
	ReasonNotReady            = "NotReady"
	ReasonNoEndpoints         = "NoEndpoints"
	ReasonServicesFound       = "ServicesFound"
	ReasonNoServicesFound     = "NoServicesFound"
)
View Source
// Default names; both resolve to the literal "default".
const (
	DefaultClusterName        = "default"
	DefaultServiceAccountName = "default"
)

// Annotation keys, all under the "nvidia.com/" prefix.
const (
	KubeAnnotationDeploymentStrategy                     = "nvidia.com/deployment-strategy"
	KubeAnnotationDeploymentRollingUpdateMaxSurge        = "nvidia.com/deployment-rolling-update-max-surge"
	KubeAnnotationDeploymentRollingUpdateMaxUnavailable  = "nvidia.com/deployment-rolling-update-max-unavailable"
	KubeAnnotationEnableStealingTrafficDebugMode         = "nvidia.com/enable-stealing-traffic-debug-mode"
	KubeAnnotationEnableDebugMode                        = "nvidia.com/enable-debug-mode"
	KubeAnnotationEnableDebugPodReceiveProductionTraffic = "nvidia.com/enable-debug-pod-receive-production-traffic"
)

// DeploymentTargetTypeDebug is the "debug" deployment target type value.
const DeploymentTargetTypeDebug = "debug"
View Source
// State values. StateEmpty is the empty string.
const (
	StateEmpty             = ""
	StatePending           = "Pending"
	StateProfiling         = "Profiling"
	StateDeploying         = "Deploying"
	StateReady             = "Ready"
	StateDeploymentDeleted = "DeploymentDeleted"
	StateFailed            = "Failed"
)

// Condition type identifiers.
const (
	ConditionTypeValidation      = "Validation"
	ConditionTypeProfiling       = "Profiling"
	ConditionTypeSpecGenerated   = "SpecGenerated"
	ConditionTypeDeploymentReady = "DeploymentReady"
)

// Event reason identifiers.
const (
	EventReasonInitialized          = "Initialized"
	EventReasonValidationFailed     = "ValidationFailed"
	EventReasonProfilingJobCreated  = "ProfilingJobCreated"
	EventReasonProfilingJobFailed   = "ProfilingJobFailed"
	EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
	EventReasonSpecGenerated        = "SpecGenerated"
	EventReasonSpecChangeRejected   = "SpecChangeRejected"
	EventReasonDeploymentCreated    = "DeploymentCreated"
	EventReasonDeploymentReady      = "DeploymentReady"
	EventReasonDeploymentDegraded   = "DeploymentDegraded"
	EventReasonDeploymentDeleted    = "DeploymentDeleted"
)

// Label keys.
const (
	LabelApp           = "app"
	LabelDGDR          = "dgdr"
	LabelDGDRName      = "dgdr.nvidia.com/name"
	LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
	LabelManagedBy     = "nvidia.com/managed-by"
)

// Label values.
const (
	LabelValueDynamoProfiler = "dynamo-profiler"
	LabelValueAICProfiler    = "aic-profiler"
	LabelValueDynamoOperator = "dynamo-operator"
)

// Job name prefixes; both end in "-" so a suffix can be appended directly.
const (
	JobNamePrefixOnline = "profile-online-"
	JobNamePrefixAIC    = "profile-aic-"
)

// Container names.
const (
	ContainerNameProfiler     = "profiler"
	ContainerNameOutputCopier = "output-copier"
)

// ServiceAccountProfilingJob is the service account name "dgdr-profiling-job".
const ServiceAccountProfilingJob = "dgdr-profiling-job"

// ConfigMapOutputPrefix is the name prefix "dgdr-output-" for output ConfigMaps.
const ConfigMapOutputPrefix = "dgdr-output-"

// AnnotationAdditionalResources is the annotation key
// "dgdr.nvidia.com/additional-resources".
const AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources"

// MaxAnnotationSize is the size limit applied to annotation payloads.
const MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit

// SidecarImage is the sidecar container image reference.
// NOTE(review): the reference uses the mutable ":latest" tag — confirm whether
// it should be pinned.
const SidecarImage = "bitnami/kubectl:latest"

// Volume names.
const (
	VolumeNameProfilingConfig = "profiling-config"
	VolumeNameProfilingOutput = "profiling-output"
)

// Volume paths and the file names used beneath them.
const (
	ProfilingOutputPath       = "/data"
	ProfilingOutputFile       = "config_with_planner.yaml"
	ProfilingOutputFileMocker = "mocker_config_with_planner.yaml"
	ProfilingConfigPath       = "/config"
	ProfilingConfigFile       = "disagg.yaml"
)

// Command line argument flags.
const (
	ArgModel   = "--model"
	ArgBackend = "--backend"
	ArgTTFT    = "--ttft"
	ArgITL     = "--itl"
	ArgConfig  = "--config"
)

// Messages. Entries containing %s are format strings and must be passed
// through a formatter with the matching number of arguments.
//
// NOTE(review): the entries from MessageJobCreationFailed through
// MessageProfilingCheckFailed are CamelCase identifiers rather than sentences;
// they look like reason codes — confirm how callers use them.
const (
	MessageInitialized               = "DGDR initialized successfully"
	MessageProfilingJobCreated       = "Profiling job created"
	MessageAICProfilingJobCreated    = "AIC profiling job created"
	MessageProfilingInProgress       = "Profiling is in progress"
	MessageSpecGenerated             = "DynamoGraphDeployment spec generated successfully"
	MessageSpecAvailable             = "Generated spec is available in status.generatedDeployment"
	MessageDeploymentCreated         = "DynamoGraphDeployment %s created successfully"
	MessageDeploymentReady           = "DynamoGraphDeployment %s is ready"
	MessageDeploymentDegraded        = "DynamoGraphDeployment %s degraded from Ready to %s"
	MessageDeploymentDeleted         = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
	MessageInvalidState              = "Invalid state"
	MessageSpecChangeRejected        = "" /* 151-byte string literal not displayed */
	MessageJobCreationFailed         = "JobCreationFailed"
	MessageDeploymentCreationFailed  = "DeploymentCreationFailed"
	MessageResultsRetrievalFailed    = "ResultsRetrievalFailed"
	MessageGenerationFailed          = "GenerationFailed"
	MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed"
	MessageProfilingCheckFailed      = "ProfilingCheckFailed"
	MessageConfigMapNotFound         = "ConfigMap %s not found in namespace %s"
	MessageConfigMapKeyNotFound      = "key %s not found in ConfigMap %s"
)

// Validation messages. ValidationErrorInvalidBackend is a format string that
// takes the rejected backend name.
const (
	ValidationErrorModelRequired  = "model is required"
	ValidationErrorITLPositive    = "sla.itl must be positive"
	ValidationErrorTTFTPositive   = "sla.ttft must be positive"
	ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
)

// Valid backend values; these are the names listed in
// ValidationErrorInvalidBackend.
const (
	BackendVLLM   = "vllm"
	BackendSGLang = "sglang"
	BackendTRTLLM = "trtllm"
)

Variables

This section is empty.

Functions

func IsDeploymentReady

func IsDeploymentReady(deployment *appsv1.Deployment) bool

IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available. It checks various status fields to ensure all replicas are available and the deployment configuration has been fully applied.

func IsLeaderWorkerSetReady

func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool

IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available.

Types

type ComponentReconcileResult

// ComponentReconcileResult exposes no exported fields in this view.
// NOTE(review): presumably carries the outcome of reconciling a single
// component — confirm against the implementation.
type ComponentReconcileResult struct {
	// contains filtered or unexported fields
}

type DynamoComponentDeploymentReconciler

// DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object.
type DynamoComponentDeploymentReconciler struct {
	// Client is embedded, so the client.Client methods are promoted onto the reconciler.
	client.Client
	// Recorder is the record.EventRecorder held by this reconciler.
	Recorder              record.EventRecorder
	// Config is the shared controller configuration (commonController.Config).
	Config                commonController.Config
	// EtcdStorage has the unexported type etcdStorage, so code outside this
	// package cannot name the type.
	EtcdStorage           etcdStorage
	// DockerSecretRetriever has the unexported type dockerSecretRetriever.
	// NOTE(review): presumably used to look up image pull secrets — confirm.
	DockerSecretRetriever dockerSecretRetriever
}

DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object

func (*DynamoComponentDeploymentReconciler) FinalizeResource

func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error

func (*DynamoComponentDeploymentReconciler) GetRecorder

func (*DynamoComponentDeploymentReconciler) Reconcile

func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoComponentDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, see Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile

func (*DynamoComponentDeploymentReconciler) SetupWithManager

func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentReconciler

// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object.
type DynamoGraphDeploymentReconciler struct {
	// Client is embedded, so the client.Client methods are promoted onto the reconciler.
	client.Client
	// Config is the shared controller configuration (commoncontroller.Config).
	Config                commoncontroller.Config
	// Recorder is the record.EventRecorder held by this reconciler.
	Recorder              record.EventRecorder
	// DockerSecretRetriever has the unexported type dockerSecretRetriever.
	// NOTE(review): presumably used to look up image pull secrets — confirm.
	DockerSecretRetriever dockerSecretRetriever
	// ScaleClient is a scale.ScalesGetter.
	// NOTE(review): presumably used to access scale subresources — confirm.
	ScaleClient           scale.ScalesGetter
	// MPISecretReplicator is a pointer and may therefore be nil.
	// NOTE(review): nil-handling is not visible here — confirm.
	MPISecretReplicator   *secret.SecretReplicator
	// RBACManager has the unexported type rbacManager, which is distinct from
	// the exported RBACManager interface declared in this package.
	RBACManager           rbacManager
}

DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object

func (*DynamoGraphDeploymentReconciler) FinalizeResource

func (*DynamoGraphDeploymentReconciler) GetRecorder

func (*DynamoGraphDeploymentReconciler) Reconcile

func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoGraphDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, see Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile

func (*DynamoGraphDeploymentReconciler) SetupWithManager

func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentRequestReconciler

// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object.
type DynamoGraphDeploymentRequestReconciler struct {
	// Client is embedded, so the client.Client methods are promoted onto the reconciler.
	client.Client
	// Recorder is the record.EventRecorder held by this reconciler.
	Recorder record.EventRecorder
	// Config is the shared controller configuration (commonController.Config).
	Config   commonController.Config

	// RBACManager handles RBAC setup for profiling jobs
	RBACManager RBACManager
}

DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object

func (*DynamoGraphDeploymentRequestReconciler) FinalizeResource

FinalizeResource implements commonController.Finalizer interface

func (*DynamoGraphDeploymentRequestReconciler) GetRecorder

GetRecorder implements commonController.Reconciler interface

func (*DynamoGraphDeploymentRequestReconciler) Reconcile

Reconcile handles the reconciliation loop for DynamoGraphDeploymentRequest

func (*DynamoGraphDeploymentRequestReconciler) SetupWithManager

func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager

type DynamoGraphDeploymentScalingAdapterReconciler

// DynamoGraphDeploymentScalingAdapterReconciler reconciles a
// DynamoGraphDeploymentScalingAdapter object.
type DynamoGraphDeploymentScalingAdapterReconciler struct {
	// Client is embedded, so the client.Client methods are promoted onto the reconciler.
	client.Client
	// Scheme is a pointer to the runtime.Scheme held by this reconciler.
	Scheme   *runtime.Scheme
	// Recorder is the record.EventRecorder held by this reconciler.
	Recorder record.EventRecorder
	// Config is the shared controller configuration (commonController.Config).
	Config   commonController.Config
}

DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object

func (*DynamoGraphDeploymentScalingAdapterReconciler) Reconcile

Reconcile implements the reconciliation loop for DynamoGraphDeploymentScalingAdapter

func (*DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager

SetupWithManager sets up the controller with the Manager

type DynamoModelReconciler

// DynamoModelReconciler reconciles a DynamoModel object.
type DynamoModelReconciler struct {
	// Client is embedded, so the client.Client methods are promoted onto the reconciler.
	client.Client
	// Recorder is the record.EventRecorder held by this reconciler.
	Recorder       record.EventRecorder
	// EndpointClient is a pointer to a modelendpoint.Client and may therefore be nil.
	// NOTE(review): presumably used to talk to model endpoints — confirm.
	EndpointClient *modelendpoint.Client
	// Config is the shared controller configuration (commoncontroller.Config).
	Config         commoncontroller.Config
}

DynamoModelReconciler reconciles a DynamoModel object

func (*DynamoModelReconciler) FinalizeResource

func (r *DynamoModelReconciler) FinalizeResource(ctx context.Context, model *v1alpha1.DynamoModel) error

FinalizeResource implements the Finalizer interface. It performs cleanup when a DynamoModel is being deleted.

func (*DynamoModelReconciler) Reconcile

func (r *DynamoModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile handles the reconciliation loop for DynamoModel resources

func (*DynamoModelReconciler) SetupWithManager

func (r *DynamoModelReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager

type IngressConfig

// IngressConfig groups ingress-related settings.
// NOTE(review): how each field maps onto the generated Ingress object is not
// visible here — confirm against the code that consumes this struct.
type IngressConfig struct {
	// ClassName is optional: it is a pointer, so nil means "not set".
	ClassName           *string
	// Annotations is a string-to-string map of annotations.
	Annotations         map[string]string
	// Path is the path string; PathType is its networkingv1.PathType.
	Path                string
	PathType            networkingv1.PathType
	// TLSMode is a TLSModeOpt (TLSModeNone, TLSModeAuto or TLSModeStatic).
	TLSMode             TLSModeOpt
	// StaticTLSSecretName names a TLS secret.
	// NOTE(review): presumably only consulted when TLSMode is TLSModeStatic — confirm.
	StaticTLSSecretName string
}

type Message

// Message is the string type used for the ReconcileResult.Message field.
type Message string

type RBACManager

// RBACManager is the interface for managing RBAC resources.
type RBACManager interface {
	// EnsureServiceAccountWithRBAC takes a target namespace, a service account
	// name and a cluster role name, and reports failure through its error result.
	// NOTE(review): presumably creates or updates the service account and binds
	// it to the cluster role — confirm against the implementation.
	EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

RBACManager interface for managing RBAC resources

type Reason

// Reason is the string type used for the ReconcileResult.Reason field.
type Reason string

type ReconcileResult

// ReconcileResult bundles a State with its Reason and Message, plus
// per-service replica status.
type ReconcileResult struct {
	// State is one of the State constants (FailedState, ReadyState, PendingState).
	State         State
	// Reason and Message are typed strings accompanying State.
	Reason        Reason
	Message       Message
	// ServiceStatus maps a string key to a ServiceReplicaStatus.
	// NOTE(review): the key is presumably the service name — confirm.
	ServiceStatus map[string]nvidiacomv1alpha1.ServiceReplicaStatus
}

type Resource

// Resource is the interface for objects that can report readiness, a name and
// per-service replica statuses.
type Resource interface {
	// IsReady returns whether the resource is ready, together with a reason string.
	// NOTE(review): whether reason is populated when ready is true is not
	// visible here — confirm.
	IsReady() (ready bool, reason string)
	// GetName returns the resource's name.
	GetName() string
	// GetServiceStatuses returns a map from a string key to ServiceReplicaStatus.
	// NOTE(review): the key is presumably the service name — confirm.
	GetServiceStatuses() map[string]v1alpha1.ServiceReplicaStatus
}

type State

// State is the string type used for the ReconcileResult.State field.
type State string

const (
	// PendingState has the value "pending".
	PendingState State = "pending"

	// ReadyState has the value "successful"; note that the string differs from
	// the constant's name.
	ReadyState State = "successful"

	// FailedState has the value "failed".
	FailedState State = "failed"
)

type TLSModeOpt

// TLSModeOpt is the string type used for the IngressConfig.TLSMode field.
type TLSModeOpt string

const (
	// TLSModeNone has the value "none".
	TLSModeNone TLSModeOpt = "none"

	// TLSModeAuto has the value "auto".
	TLSModeAuto TLSModeOpt = "auto"

	// TLSModeStatic has the value "static".
	TLSModeStatic TLSModeOpt = "static"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL