shared

package
v0.2.0-rc10 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 17, 2025 License: MIT Imports: 50 Imported by: 0

Documentation

Index

Constants

View Source
const (

	// DefaultRuntimeConfigName is the name of the default AIM runtime config.
	DefaultRuntimeConfigName = "default"

	// MaxConcurrentDiscoveryJobs is the global limit for concurrent discovery jobs across all namespaces.
	MaxConcurrentDiscoveryJobs = 10

	// DefaultPVCHeadroomPercent is the default percentage of extra space to add to PVCs
	// for model storage. This accounts for filesystem overhead and temporary files during
	// model loading. The value represents a percentage (e.g., 10 means 10% extra space).
	DefaultPVCHeadroomPercent = 10

	// AimLabelDomain is the base domain used for AIM-specific labels.
	// Every LabelKey* constant in this block is built by appending a "/suffix" to it.
	AimLabelDomain = "aim.silogen.ai"

	// AIM label keys. Each key is AimLabelDomain plus a path suffix, so all of them
	// share the "aim.silogen.ai/" prefix.
	// Note that some suffixes do not mirror the Go identifier: LabelKeyImageName
	// resolves to ".../aim-image", and LabelKeyModelCache to ".../modelcache" (no hyphen).
	LabelKeyTemplate        = AimLabelDomain + "/template"
	LabelKeyModelID         = AimLabelDomain + "/model-id"
	LabelKeyDerivedTemplate = AimLabelDomain + "/derived-template"
	LabelKeyAutoGenerated   = AimLabelDomain + "/auto-generated"
	LabelKeyImageName       = AimLabelDomain + "/aim-image"
	LabelKeyMetric          = AimLabelDomain + "/template.metric"
	LabelKeyPrecision       = AimLabelDomain + "/template.precision"
	LabelKeyServiceName     = AimLabelDomain + "/service-name"
	LabelKeyModelCache      = AimLabelDomain + "/modelcache"
	LabelKeyTemplateCache   = AimLabelDomain + "/template-cache"
	LabelKeyCacheType       = AimLabelDomain + "/cache-type"
	LabelKeySourceModel     = AimLabelDomain + "/source-model"

	// AIM label values.
	// LabelValueDerivedTemplate and LabelValueAutoGenerated are both the literal string "true".
	// NOTE(review): which label key each value is paired with is not visible in this
	// block — confirm against the builders that set them.
	LabelValueRuntimeName        = "aim-runtime"
	LabelValueRuntimeComponent   = "serving-runtime"
	LabelValueManagedBy          = "aim-controller"
	LabelValueDiscoveryName      = "aim-discovery"
	LabelValueDiscoveryComponent = "discovery-job"
	LabelValueServiceName        = "aim-service"
	LabelValueServiceComponent   = "inference-service"
	LabelValueDerivedTemplate    = "true"
	LabelValueAutoGenerated      = "true"

	// Cache type label values.
	// NOTE(review): presumably the values used with LabelKeyCacheType — confirm.
	LabelValueCacheTypeTemplateCache = "template-cache"
	LabelValueCacheTypeTempService   = "temporary-service-cache"

	// NodeLabelAMDGPUDeviceID is the primary node label for AMD GPU device IDs (e.g., "74a1" for MI300X).
	NodeLabelAMDGPUDeviceID = "amd.com/gpu.device-id"
)
View Source
const (

	// DiscoveryJobBackoffLimit is the number of retries before marking the discovery job as failed.
	DiscoveryJobBackoffLimit = 3

	// DiscoveryJobTTLSeconds defines how long completed discovery jobs persist
	// before automatic cleanup. This allows time for status inspection and log retrieval.
	// The value is expressed in seconds (60 = one minute).
	DiscoveryJobTTLSeconds = 60
)
View Source
const (
	// DefaultGPUResourceName is the default resource name for AMD GPUs in Kubernetes.
	DefaultGPUResourceName = "amd.com/gpu"

	// DefaultSharedMemorySize is the default size allocated for /dev/shm in inference containers.
	// This is required for efficient inter-process communication in model serving workloads.
	// The value is a quantity string with a binary suffix ("8Gi").
	DefaultSharedMemorySize = "8Gi"

	// KubernetesLabelValueMaxLength is the maximum length for a Kubernetes label value.
	// It matches the 63-character limit described in the SanitizeLabelValue documentation.
	KubernetesLabelValueMaxLength = 63
)
View Source
const (
	// LabelAutoCreated marks models that were automatically created from service image references.
	// The "aim.silogen.ai" prefix is spelled out here; it is the same value as AimLabelDomain.
	LabelAutoCreated    = "aim.silogen.ai/auto-created"
	// LabelKeyModelSource is the "aim.silogen.ai/model-source" label key.
	// NOTE(review): presumably records where a model originated — confirm against its users.
	LabelKeyModelSource = "aim.silogen.ai/model-source"
)
View Source
const (
	// MaxRoutePathLength is the maximum allowed length for a route path.
	// This prevents excessively long paths that could cause issues with gateways or proxies.
	// NOTE(review): whether the limit is counted in bytes or characters is not visible here — confirm.
	MaxRoutePathLength = 200
)

Variables

View Source
// ErrImageNotFound is the sentinel error for an image that is missing from the catalog.
// Match it with errors.Is rather than by comparing the message text.
var ErrImageNotFound = errors.New("image not found in catalog")

ErrImageNotFound is returned when an image is not found in the catalog

View Source
var (

	// ErrMultipleModelsFound is returned when multiple models exist with the same image URI.
	// It is a sentinel error; match it with errors.Is rather than by comparing the message text.
	ErrMultipleModelsFound = errors.New("multiple models found with the same image")
)
View Source
// ErrRuntimeConfigNotFound is the sentinel error reported when neither a namespace
// nor a cluster runtime config could be located. Match it with errors.Is.
var ErrRuntimeConfigNotFound = errors.New("runtime config not found")

ErrRuntimeConfigNotFound indicates that neither namespace nor cluster runtime config could be located.

View Source
// GPUPreferenceOrder lists GPU model names from most preferred (index 0) to
// least preferred; it is consulted when selecting templates.
//
// NOTE(review): A100 is ranked ahead of H100, and several models present in
// KnownAmdGpuDevices (e.g. MI355X, MI350X, MI308X) do not appear here at all.
// Confirm whether both are intentional; the existing TODO says the list is
// still incomplete.
var GPUPreferenceOrder = []string{
	"MI325X",
	"MI300X",
	"MI250X",
	"MI210",
	"A100",
	"H100",
}

GPUPreferenceOrder defines the preference order for GPU models when selecting templates. GPUs earlier in the list are preferred over later ones. TODO: Fill in the complete preference order based on performance characteristics.

View Source
// KnownAmdGpuDevices maps an AMD GPU device ID (a four-digit lowercase hex
// string such as "74a1") to its GPU model name (such as "MI300X").
// Several device IDs can map to the same model name.
var KnownAmdGpuDevices = map[string]string{
	// MI-series models.
	"738c": "MI100",
	"738e": "MI100",
	"7408": "MI250X",
	"740c": "MI250X",
	"740f": "MI210",
	"7410": "MI210",
	"74a0": "MI300A",
	"74a1": "MI300X",
	"74a2": "MI308X",
	"74a5": "MI325X",
	"74a8": "MI308X",
	"74a9": "MI300X",
	"74b5": "MI300X",
	"74b6": "MI308X",
	"74b9": "MI325X",
	"74bd": "MI300X",
	"75a0": "MI350X",
	"75a3": "MI355X",
	"75b0": "MI350X",
	"75b3": "MI355X",

	// V-series and W-series models.
	"7460": "V710",
	"7461": "V710",
	"7448": "W7900",
	"744a": "W7900",
	"7449": "W7800",
	"745e": "W7800",
	"73a2": "W6900X",
	"73a3": "W6800",
	"73ab": "W6800X",
	"73a1": "V620",
	"73ae": "V620",

	// RX-series models.
	"7550": "RX9070",
	"744c": "RX7900",
	"73af": "RX6900",
	"73bf": "RX6800",
}
View Source
// MetricPreferenceOrder lists optimization metrics from most preferred (index 0)
// to least preferred: "latency" ranks ahead of "throughput".
var MetricPreferenceOrder = []string{
	"latency",
	"throughput",
}

MetricPreferenceOrder defines preference for optimization metrics. "latency" is preferred over "throughput" by default.

View Source
// PrecisionPreferenceOrder lists precision levels from most preferred (index 0)
// to least preferred. Lower precision comes first; note that "fp16" is ranked
// ahead of "bf16".
var PrecisionPreferenceOrder = []string{
	"fp8",
	"fp16",
	"bf16",
	"fp32",
}

PrecisionPreferenceOrder defines preference for precision levels. Lower precision (more optimized) is preferred.

Functions

func ApplyHeadroomAndRound

func ApplyHeadroomAndRound(baseSizeBytes int64, headroomPercent int32) int64

ApplyHeadroomAndRound applies headroom percentage to a base size and rounds up to the nearest Gi. This ensures PVC sizes are clean, human-readable values (e.g., "421Gi" instead of "451936812032").

Parameters:

  • baseSizeBytes: The original size in bytes
  • headroomPercent: Percentage of extra space to add (0-100, e.g., 10 means 10% extra)

Returns:

  • The final size in bytes, rounded up to the nearest Gi boundary

Example:

  • Input: 9,094,593,249 bytes with 10% headroom
  • With headroom: 10,004,052,573 bytes (9.31 Gi)
  • Rounded: 10,737,418,240 bytes (10 Gi)

func BuildClusterServingRuntime

func BuildClusterServingRuntime(template aimstate.TemplateState, ownerRef metav1.OwnerReference) *servingv1alpha1.ClusterServingRuntime

BuildClusterServingRuntime creates a KServe ClusterServingRuntime for a cluster-scoped template.

func BuildDerivedTemplate

func BuildDerivedTemplate(
	service *aimv1alpha1.AIMService,
	templateName string,
	resolvedModelName string,
	baseSpec *aimv1alpha1.AIMServiceTemplateSpec,
) *aimv1alpha1.AIMServiceTemplate

BuildDerivedTemplate constructs an AIMServiceTemplate for a service with overrides. The template inherits from the base spec and applies service-specific customizations.

func BuildDiscoveryJob

func BuildDiscoveryJob(spec DiscoveryJobSpec) *batchv1.Job

BuildDiscoveryJob creates a Job that runs a model discovery dry run.

func BuildInferenceService

func BuildInferenceService(serviceState aimstate.ServiceState, ownerRef metav1.OwnerReference) *servingv1beta1.InferenceService

BuildInferenceService constructs a KServe InferenceService referencing a ServingRuntime or ClusterServingRuntime.

func BuildInferenceServiceHTTPRoute

func BuildInferenceServiceHTTPRoute(serviceState aimstate.ServiceState, ownerRef metav1.OwnerReference) *gatewayapiv1.HTTPRoute

BuildInferenceServiceHTTPRoute creates an HTTPRoute that exposes the predictor service via the provided gateway parent.

func BuildServingRuntime

func BuildServingRuntime(template aimstate.TemplateState, ownerRef metav1.OwnerReference) *servingv1alpha1.ServingRuntime

BuildServingRuntime creates a KServe ServingRuntime for a namespace-scoped template.

func BuildServingRuntimeFromState

func BuildServingRuntimeFromState(state aimstate.TemplateState, ownerRef metav1.OwnerReference) *servingv1alpha1.ServingRuntime

BuildServingRuntimeFromState constructs a namespaced ServingRuntime from a TemplateState snapshot. This is an adapter function that maintains compatibility with the original signature.

func BuildTemplateStateFromObservation

func BuildTemplateStateFromObservation(
	name, namespace string,
	specCommon aimv1alpha1.AIMServiceTemplateSpecCommon,
	observation *TemplateObservation,
	runtimeConfigSpec aimv1alpha1.AIMRuntimeConfigSpec,
	status *aimv1alpha1.AIMServiceTemplateStatus,
) aimstate.TemplateState

BuildTemplateStateFromObservation constructs a TemplateState from the template specification, observation, and status. This is an adapter function that combines template metadata with observed resources.

func CountActiveDiscoveryJobs

func CountActiveDiscoveryJobs(ctx context.Context, k8sClient client.Client) (int, error)

CountActiveDiscoveryJobs counts the number of active (non-complete) discovery jobs across all namespaces. A job is considered active if it exists and is not in a complete state (succeeded or failed).

func DefaultRoutePath

func DefaultRoutePath(service *aimv1alpha1.AIMService) string

DefaultRoutePath returns the default HTTP route prefix.

func DerivedTemplateName

func DerivedTemplateName(baseName, suffix string) string

DerivedTemplateName constructs a template name from a base name and suffix. Ensures the final name does not exceed Kubernetes name length limits.

func EvaluateHTTPRouteStatus

func EvaluateHTTPRouteStatus(route *gatewayapiv1.HTTPRoute) (bool, string, string)

EvaluateHTTPRouteStatus checks the HTTPRoute status and returns readiness state.

func EvaluateInferenceServiceStatus

func EvaluateInferenceServiceStatus(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	inferenceService *servingv1beta1.InferenceService,
	httpRoute *gatewayapiv1.HTTPRoute,
	routingEnabled bool,
	routingReady bool,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
)

EvaluateInferenceServiceStatus checks InferenceService and routing readiness. Updates status conditions based on the InferenceService and routing state.

func EvaluateRoutingStatus

func EvaluateRoutingStatus(
	service *aimv1alpha1.AIMService,
	obs *ServiceObservation,
	status *aimv1alpha1.AIMServiceStatus,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) (enabled bool, ready bool, hasFatalError bool)

EvaluateRoutingStatus checks routing configuration and updates status accordingly. Returns (enabled, ready, hasFatalError) to indicate if routing is enabled, if it's ready, and if there's a terminal error.

func FormatRuntimeConfigSources

func FormatRuntimeConfigSources(resolution *RuntimeConfigResolution, namespaceLabel string) []string

FormatRuntimeConfigSources renders a human-readable list of runtime config sources for logging/events.

func GenerateInferenceServiceName

func GenerateInferenceServiceName(serviceName, namespace string) string

GenerateInferenceServiceName creates a KServe InferenceService name that fits DNS label constraints. KServe creates hostnames in the format {isvc-name}-predictor-{namespace}. These hostnames must be ≤ 63 characters to comply with DNS label limits.

If the original name would exceed the limit, this function (1) truncates the base name, (2) appends an 8-character hash of the full original name, and (3) ensures the result is RFC1123 compliant.

The hash ensures uniqueness while keeping names deterministic and short.

func GetAMDDeviceIDsForModel

func GetAMDDeviceIDsForModel(modelName string) []string

GetAMDDeviceIDsForModel returns all AMD device IDs that map to a given GPU model name. This is the inverse of mapAMDDeviceIDToModel, allowing lookup of all device IDs for a model. Example: GetAMDDeviceIDsForModel("MI300X") returns ["74a1", "74a9", "74b5", "74bd"]. Returns an empty slice if the model is not found or is not an AMD GPU.

func GetClusterGPUResources

func GetClusterGPUResources(ctx context.Context, k8sClient client.Client) (map[string]GPUResourceInfo, error)

GetClusterGPUResources returns an aggregated view of all GPU resources in the cluster. It scans all nodes and aggregates resources that start with "amd.com/" or "nvidia.com/". Returns a map where keys are GPU models (e.g., "MI300X", "A100") extracted from node labels, and values contain the resource name.

func GetClusterServingRuntime

func GetClusterServingRuntime(ctx context.Context, k8sClient client.Client, name string) (*servingv1alpha1.ClusterServingRuntime, error)

GetClusterServingRuntime fetches a ClusterServingRuntime by name

func GetDiscoveryJob

func GetDiscoveryJob(ctx context.Context, k8sClient client.Client, namespace, templateName string) (*batchv1.Job, error)

GetDiscoveryJob fetches the discovery job for a template. Returns the newest job (by CreationTimestamp) if multiple exist.

func GetImageConfigLabels

func GetImageConfigLabels(ctx context.Context, imageURI string, keychain authn.Keychain) (map[string]string, error)

GetImageConfigLabels is a helper function that retrieves just the labels from an image without parsing them into structured metadata. Useful for debugging.

func GetOperatorNamespace

func GetOperatorNamespace() string

GetOperatorNamespace returns the namespace where the AIM operator runs. It reads the AIM_OPERATOR_NAMESPACE environment variable; if unset, it defaults to "kaiwo-system".

func GetPVCHeadroomPercent

func GetPVCHeadroomPercent(spec aimv1alpha1.AIMRuntimeConfigSpec) int32

GetPVCHeadroomPercent returns the PVC headroom percentage from the runtime config spec. If not set, returns the default value defined in DefaultPVCHeadroomPercent.

func GetServingRuntime

func GetServingRuntime(ctx context.Context, k8sClient client.Client, namespace, name string) (*servingv1alpha1.ServingRuntime, error)

GetServingRuntime fetches a ServingRuntime by namespace and name

func HandleImageMissing

func HandleImageMissing(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleImageMissing checks for missing image and updates status. Returns true if the image is missing.

func HandleImageNotReady

func HandleImageNotReady(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleImageNotReady checks if the resolved image is not yet ready and updates status. Returns true if the service should wait for the image to become ready.

func HandleInferenceServicePodImageError

func HandleInferenceServicePodImageError(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleInferenceServicePodImageError checks for image pull errors in InferenceService pods. Returns true if an image pull error was detected.

func HandleMissingModelSource

func HandleMissingModelSource(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleMissingModelSource checks if the template is available but has no model sources. Returns true if model sources are missing (discovery succeeded but produced no usable sources).

func HandleModelCacheReadiness

func HandleModelCacheReadiness(service *aimv1alpha1.AIMService, status *aimv1alpha1.AIMServiceStatus, obs *ServiceObservation, setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string)) bool

func HandleModelResolutionFailure

func HandleModelResolutionFailure(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleModelResolutionFailure checks for model resolution failures and updates status. Returns true if model resolution failed.

func HandlePathTemplateError

func HandlePathTemplateError(
	status *aimv1alpha1.AIMServiceStatus,
	service *aimv1alpha1.AIMService,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandlePathTemplateError checks for path template errors and updates status. Returns true if there is a path template error. This can occur when routing is enabled (via service spec or runtime config) but the path template is invalid.

func HandleReconcileErrors

func HandleReconcileErrors(
	status *aimv1alpha1.AIMServiceStatus,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
	errs controllerutils.ReconcileErrors,
) bool

HandleReconcileErrors processes reconciliation errors and updates service status. Returns true if errors were found and handled.

func HandleRuntimeConfigMissing

func HandleRuntimeConfigMissing(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleRuntimeConfigMissing checks for missing runtime config and updates status. Returns true if the runtime config is missing.

func HandleTemplateDegraded

func HandleTemplateDegraded(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleTemplateDegraded checks if the template is degraded, not available, or failed and updates status. Returns true if the template is degraded, not available, or failed.

func HandleTemplateNotAvailable

func HandleTemplateNotAvailable(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleTemplateNotAvailable checks if the template is not available and updates status. Returns true if the template is not yet available (Pending or Progressing). Sets the service to Pending state because it's waiting for a dependency (the template).

func HandleTemplateSelectionFailure

func HandleTemplateSelectionFailure(
	status *aimv1alpha1.AIMServiceStatus,
	obs *ServiceObservation,
	setCondition func(conditionType string, conditionStatus metav1.ConditionStatus, reason, message string),
) bool

HandleTemplateSelectionFailure reports failures during automatic template selection.

func HasOwnerReference

func HasOwnerReference(refs []metav1.OwnerReference, uid types.UID) bool

HasOwnerReference checks if the given UID exists in the owner references list.

func InferenceServiceRouteName

func InferenceServiceRouteName(serviceName string) string

InferenceServiceRouteName returns the canonical HTTPRoute name for an InferenceService.

func InspectImage

func InspectImage(
	ctx context.Context,
	imageURI string,
	imagePullSecrets []corev1.LocalObjectReference,
	clientset kubernetes.Interface,
	namespace string,
) (*aimv1alpha1.ImageMetadata, error)

InspectImage extracts metadata from a container image using the provided image pull secrets. It uses go-containerregistry to authenticate and fetch image labels, then parses them into the ImageMetadata structure.

Parameters:

  • ctx: Context for the operation
  • imageURI: Full container image reference (e.g., "registry.example.com/repo/image:tag")
  • imagePullSecrets: Kubernetes image pull secrets for authentication
  • clientset: Kubernetes clientset for accessing secrets
  • namespace: Namespace where the secrets are located

Returns:

  • *ImageMetadata: Extracted metadata if successful
  • error: Any error encountered during inspection (authentication, network, parsing, etc.) Registry access errors are wrapped in ImageRegistryError for categorization.

func IsDerivedTemplate

func IsDerivedTemplate(labels map[string]string) bool

IsDerivedTemplate returns true when the provided labels indicate a controller-managed derived template.

func IsGPUAvailable

func IsGPUAvailable(ctx context.Context, k8sClient client.Client, gpuModel string) (bool, error)

IsGPUAvailable checks if a specific GPU model is available in the cluster. The gpuModel parameter should be the GPU model name (e.g., "MI300X", "A100"), not the resource name. The input is normalized to handle variants like "MI300X (rev 2)" or "Instinct MI300X".

func IsJobComplete

func IsJobComplete(job *batchv1.Job) bool

IsJobComplete returns true if the job has completed (successfully or failed)

func IsJobFailed

func IsJobFailed(job *batchv1.Job) bool

IsJobFailed returns true if the job failed

func IsJobSucceeded

func IsJobSucceeded(job *batchv1.Job) bool

IsJobSucceeded returns true if the job completed successfully

func JoinRuntimeConfigSources

func JoinRuntimeConfigSources(resolution *RuntimeConfigResolution, namespaceLabel string) string

JoinRuntimeConfigSources joins runtime config sources for concise logging.

func ListAvailableGPUs

func ListAvailableGPUs(ctx context.Context, k8sClient client.Client) ([]string, error)

ListAvailableGPUs returns a list of all GPU resource types available in the cluster.

func NodeGPUChangePredicate

func NodeGPUChangePredicate() predicate.Predicate

NodeGPUChangePredicate returns a predicate that triggers reconciles when GPU-related node attributes change.

func NormalizeRuntimeConfigName

func NormalizeRuntimeConfigName(name string) string

NormalizeRuntimeConfigName returns the effective name to use for lookups when the user omits the field.

func ObserveDerivedTemplate

func ObserveDerivedTemplate(
	ctx context.Context,
	k8sClient client.Client,
	service *aimv1alpha1.AIMService,
	resolution TemplateResolution,
	obs *ServiceObservation,
) error

ObserveDerivedTemplate handles observation for services with derived templates. It fetches the derived template if it exists, or loads the base template spec for creation.

func ObserveNonDerivedTemplate

func ObserveNonDerivedTemplate(
	ctx context.Context,
	k8sClient client.Client,
	service *aimv1alpha1.AIMService,
	templateName string,
	preferredScope TemplateScope,
	obs *ServiceObservation,
) error

ObserveNonDerivedTemplate handles observation for services with non-derived templates. It searches for namespace-scoped templates first, then falls back to cluster-scoped templates. Does not set ShouldCreateTemplate - that decision is made in the controller based on whether an explicit templateRef was provided.

func OverridesSuffix

func OverridesSuffix(overrides *aimv1alpha1.AIMServiceOverrides) string

OverridesSuffix computes a hash suffix for service overrides.

func PlanImageResources

func PlanImageResources(ctx context.Context, input ImagePlanInput) ([]client.Object, *aimv1alpha1.ImageMetadata, error)

PlanImageResources plans the desired state for an image resource. It performs metadata extraction if needed and creates ServiceTemplates based on recommendedDeployments.

func PlanTemplateResources

func PlanTemplateResources(ctx TemplatePlanContext, builders TemplatePlanBuilders) ([]client.Object, bool)

PlanTemplateResources produces desired objects based on the observation and controller-provided builders. It respects the global limit on concurrent discovery jobs (MaxConcurrentDiscoveryJobs). Returns the desired objects and a boolean indicating if a requeue is needed (when job limit is reached).

func PopulateObservationFromClusterTemplate

func PopulateObservationFromClusterTemplate(
	ctx context.Context,
	k8sClient client.Client,
	service *aimv1alpha1.AIMService,
	template *aimv1alpha1.AIMClusterServiceTemplate,
	obs *ServiceObservation,
) error

PopulateObservationFromClusterTemplate extracts data from a cluster-scoped template into the observation.

func PopulateObservationFromNamespaceTemplate

func PopulateObservationFromNamespaceTemplate(
	ctx context.Context,
	k8sClient client.Client,
	service *aimv1alpha1.AIMService,
	template *aimv1alpha1.AIMServiceTemplate,
	obs *ServiceObservation,
) error

PopulateObservationFromNamespaceTemplate extracts data from a namespace-scoped template into the observation.

func ProjectImageStatus

func ProjectImageStatus(
	status *aimv1alpha1.AIMModelStatus,
	spec aimv1alpha1.AIMModelSpec,
	observation *ImageObservation,
	extractedMetadata *aimv1alpha1.ImageMetadata,
	extractionErr error,
	observedGeneration int64,
)

ProjectImageStatus updates the status of an image resource based on observation and errors.

func ProjectServiceStatus

func ProjectServiceStatus(
	service *aimv1alpha1.AIMService,
	obs *ServiceObservation,
	inferenceService *servingv1beta1.InferenceService,
	httpRoute *gatewayapiv1.HTTPRoute,
	errs controllerutils.ReconcileErrors,
)

ProjectServiceStatus computes and updates the service status based on observations and errors. This is a high-level orchestrator that calls the individual status handler functions.

func ProjectTemplateStatus

func ProjectTemplateStatus(
	ctx context.Context,
	k8sClient client.Client,
	clientset kubernetes.Interface,
	recorder record.EventRecorder,
	template TemplateWithStatus,
	obs *TemplateObservation,
	errs controllerutils.ReconcileErrors,
	imageNotFoundMessage string,
) error

ProjectTemplateStatus computes status from observation and errors. This is shared between cluster and namespace-scoped template controllers. Modifies the status of the given template directly and emits events for discovery phase changes.

func PropagateLabels

func PropagateLabels(parent, child client.Object, config *aimv1alpha1.AIMRuntimeConfigCommon)

PropagateLabels propagates labels from a parent resource to a child resource based on the runtime config's label propagation settings. Only labels whose keys match the patterns defined in the config are copied. The child's existing labels are preserved and only new labels are added.

Parameters:

  • parent: The source resource whose labels should be propagated
  • child: The target resource that will receive the propagated labels
  • config: The runtime config common spec containing label propagation settings

The function does nothing if:

  • Label propagation is not enabled in the config
  • The config is nil or has no label propagation settings
  • The parent has no labels

Special handling for Jobs: Labels are also propagated to the PodTemplateSpec.

func QuantityWithHeadroom

func QuantityWithHeadroom(baseSizeBytes int64, headroomPercent int32) resource.Quantity

QuantityWithHeadroom creates a resource.Quantity with headroom applied and rounded up to the nearest Gi. This is a convenience wrapper around ApplyHeadroomAndRound that returns a Kubernetes Quantity.

The returned Quantity uses BinarySI format (Ki, Mi, Gi, Ti suffixes) for compatibility with Kubernetes storage resources.

Parameters:

  • baseSizeBytes: The original size in bytes
  • headroomPercent: Percentage of extra space to add (0-100)

Returns:

  • A resource.Quantity representing the size with headroom, formatted cleanly

func RequestsForServices

func RequestsForServices(services []aimv1alpha1.AIMService) []reconcile.Request

RequestsForServices converts a list of AIMServices to reconcile requests.

func ResolveServiceRoutePath

func ResolveServiceRoutePath(service *aimv1alpha1.AIMService, runtimeConfig aimv1alpha1.AIMRuntimeConfigSpec) (string, error)

ResolveServiceRoutePath renders the HTTP route prefix using service and runtime config context. The precedence order is: (1) Service.Spec.Routing.PathTemplate (highest priority); (2) RuntimeConfig.Routing.PathTemplate (base layer).

func ResolveServiceRouteTimeout

func ResolveServiceRouteTimeout(service *aimv1alpha1.AIMService, runtimeConfig aimv1alpha1.AIMRuntimeConfigSpec) *string

ResolveServiceRouteTimeout resolves the HTTP route timeout using service and runtime config context. The precedence order is: (1) Service.Spec.Routing.RequestTimeout (highest priority); (2) RuntimeConfig.Routing.RequestTimeout (base layer). Returns nil if no timeout is configured at any level.

func ResolveStorageClass

func ResolveStorageClass(explicitStorageClass string, runtimeConfigSpec aimv1alpha1.AIMRuntimeConfigSpec) string

ResolveStorageClass determines the effective storage class using fallback logic:

  1. Use explicit storage class if provided (non-empty)
  2. Fall back to runtime config's defaultStorageClassName if explicit is empty
  3. Empty string means use the cluster's default StorageClass

This implements consistent storage class resolution across all PVC creation paths.

Parameters:

  • explicitStorageClass: Storage class explicitly specified in the resource spec
  • runtimeConfigSpec: The resolved runtime configuration spec

Returns:

  • The effective storage class name (may be empty to use cluster default)

func ResolveTemplateNameForService

func ResolveTemplateNameForService(
	ctx context.Context,
	k8sClient client.Client,
	service *aimv1alpha1.AIMService,
) (TemplateResolution, TemplateSelectionStatus, error)

ResolveTemplateNameForService determines the template name to use for a service. It handles default template lookup, base template resolution, and derived template naming. Returns an empty BaseName/FinalName if no template can be resolved, which indicates the service should enter a degraded state.

func RuntimeConfigNameForService

func RuntimeConfigNameForService(service *aimv1alpha1.AIMService, templateSpec aimv1alpha1.AIMServiceTemplateSpecCommon) string

RuntimeConfigNameForService determines the effective runtime config name for a service.

func SanitizeLabelValue

func SanitizeLabelValue(s string) string

SanitizeLabelValue converts a string to a valid Kubernetes label value. A valid label value must (a) be empty or consist of alphanumeric characters, '-', '_' or '.'; (b) start and end with an alphanumeric character; and (c) be at most 63 characters long. Returns "unknown" if the sanitized value is empty.

func SelectBestTemplate

func SelectBestTemplate(
	candidates []TemplateCandidate,
	overrides *aimv1alpha1.AIMServiceOverrides,
	availableGPUs []string,
	allowUnoptimized bool,
) (*TemplateCandidate, int, SelectionDiagnostics, []CandidateEvaluation)

SelectBestTemplate selects the best template candidate from the provided list. The heuristic is: (1) consider only templates that are Available; (2) filter by service overrides when provided; (3) filter by GPUs that exist in the cluster; (4) prefer namespace-scoped templates over cluster-scoped templates; (5) prefer higher-tier GPUs, then latency over throughput, then lower precision. Returns (selected template, count of templates with identical preference scores, diagnostics, per-candidate evaluations). If count > 1, the templates are ambiguous (identical in all preference dimensions).

func SetCondition

func SetCondition(conditions *[]metav1.Condition, newCondition metav1.Condition)

SetCondition adds or updates a condition in the conditions list.

func TemplateNameFromSpec

func TemplateNameFromSpec(service *aimv1alpha1.AIMService) string

TemplateNameFromSpec returns the template name from the service spec or status. Falls back to service name if no template reference is found.

func TemplateRequiresGPU

func TemplateRequiresGPU(spec aimv1alpha1.AIMServiceTemplateSpecCommon) bool

TemplateRequiresGPU returns true if the template spec declares a GPU selector with a model.

func UpdateTemplateGPUAvailability

func UpdateTemplateGPUAvailability(
	ctx context.Context,
	k8sClient client.Client,
	spec aimv1alpha1.AIMServiceTemplateSpecCommon,
	obs *TemplateObservation,
) error

UpdateTemplateGPUAvailability checks whether the GPU model declared by the template exists in the cluster. It updates the provided TemplateObservation with the result of the check. The GPU model is normalized to ensure consistent matching across different label formats.

Types

type CandidateEvaluation

type CandidateEvaluation struct {
	Candidate TemplateCandidate
	Status    string // "chosen" or "rejected"
	Reason    string // CamelCase reason
	Rank      int    // For candidates that passed all filters
}

CandidateEvaluation captures why a specific candidate was chosen or rejected.

type DiscoveryJobSpec

type DiscoveryJobSpec struct {
	TemplateName     string
	TemplateSpec     aimv1alpha1.AIMServiceTemplateSpecCommon
	Namespace        string
	ModelID          string
	Image            string
	Env              []corev1.EnvVar
	ImagePullSecrets []corev1.LocalObjectReference
	ServiceAccount   string
	OwnerRef         metav1.OwnerReference
}

DiscoveryJobSpec defines the parameters for creating a discovery job.

type GPUResourceInfo

type GPUResourceInfo struct {
	// ResourceName is the full Kubernetes resource name (e.g., "amd.com/gpu").
	ResourceName string
}

GPUResourceInfo contains GPU resource information for a specific GPU model.

type ImageLookupResult

type ImageLookupResult struct {
	Image     string
	Resources corev1.ResourceRequirements
}

ImageLookupResult captures the resolved image metadata from the catalog.

func LookupImageForClusterTemplate

func LookupImageForClusterTemplate(ctx context.Context, k8sClient client.Client, modelName string) (*ImageLookupResult, error)

LookupImageForClusterTemplate looks up the container image for a cluster-scoped template. It searches only in AIMClusterModel resources. Returns ErrImageNotFound if no image is found in the catalog.

func LookupImageForNamespaceTemplate

func LookupImageForNamespaceTemplate(ctx context.Context, k8sClient client.Client, namespace, modelName string) (*ImageLookupResult, error)

LookupImageForNamespaceTemplate looks up the container image for a namespace-scoped template. It searches AIMModel resources in the specified namespace first, then falls back to cluster-scoped AIMClusterModel resources. Returns ErrImageNotFound if no image is found in either location.

func (*ImageLookupResult) DeepCopy

func (r *ImageLookupResult) DeepCopy() *ImageLookupResult

DeepCopy returns a deep copy of the ImageLookupResult.

type ImageObservation

type ImageObservation struct {
	// MetadataAlreadyAttempted is true if we've already attempted metadata extraction.
	MetadataAlreadyAttempted bool

	// MetadataExtracted is true if metadata was successfully extracted.
	MetadataExtracted bool

	// ImageMetadata contains the extracted metadata (if extraction succeeded).
	ImageMetadata *aimv1alpha1.ImageMetadata

	// RuntimeConfigResolution contains the resolved runtime config (for image pull secrets).
	RuntimeConfigResolution *RuntimeConfigResolution

	// ExistingTemplates are the ServiceTemplates currently owned by this image.
	ExistingTemplates []client.Object

	// DiscoveryEnabled reflects whether discovery is enabled from runtime config.
	// Discovery is now always attempted unless disabled by runtime config.
	DiscoveryEnabled bool

	// MetadataError captures the latest metadata format issue encountered during extraction.
	MetadataError *MetadataFormatError

	// RegistryError captures categorized registry access errors (auth, not-found, etc.).
	RegistryError *ImageRegistryError

	// MetadataExtractionErr captures non-format extraction failures (e.g., registry or auth errors).
	MetadataExtractionErr error

	// TemplatesAutoGenerated tracks whether auto-generated templates were requested this cycle.
	TemplatesAutoGenerated bool
}

ImageObservation holds the observed state for an AIMModel or AIMClusterModel.

func ObserveImage

func ObserveImage(ctx context.Context, opts ImageObservationOptions) (*ImageObservation, error)

ObserveImage gathers the current state for an image resource.

type ImageObservationOptions

type ImageObservationOptions struct {
	// GetRuntimeConfig returns the runtime config for this scope (namespace or cluster).
	GetRuntimeConfig func(ctx context.Context) (*RuntimeConfigResolution, error)

	// ListOwnedTemplates returns templates owned by this image.
	ListOwnedTemplates func(ctx context.Context) ([]client.Object, error)

	// GetCurrentStatus returns the current status to check for existing conditions.
	GetCurrentStatus func() *aimv1alpha1.AIMModelStatus

	// GetImageSpec returns the image spec.
	GetImageSpec func() aimv1alpha1.AIMModelSpec
}

ImageObservationOptions provides callbacks for observing image state.

type ImagePlanInput

type ImagePlanInput struct {
	// ImageName is the name of the image resource.
	ImageName string

	// Namespace is the namespace (empty for cluster-scoped).
	Namespace string

	// ImageSpec is the image specification.
	ImageSpec aimv1alpha1.AIMModelSpec

	// Observation is the observed state.
	Observation *ImageObservation

	// OwnerReference for created templates.
	OwnerReference []metav1.OwnerReference

	// Clientset for image inspection.
	Clientset kubernetes.Interface

	// IsClusterScoped indicates if this is a cluster-scoped image.
	IsClusterScoped bool

	// ParentObject is the AIMModel or AIMClusterModel for label propagation.
	ParentObject client.Object
}

ImagePlanInput provides the input for planning image resources.

type ImagePullError

type ImagePullError struct {
	Type            ImagePullErrorType
	Container       string
	Reason          string // e.g., "ImagePullBackOff", "ErrImagePull"
	Message         string // Full error message from Kubernetes
	IsInitContainer bool
}

ImagePullError contains categorized information about an image pull failure.

func CheckInferenceServicePodImagePullStatus

func CheckInferenceServicePodImagePullStatus(ctx context.Context, k8sClient client.Client, inferenceServiceName, namespace string) *ImagePullError

CheckInferenceServicePodImagePullStatus checks if an InferenceService's pods are stuck in ImagePullBackOff or ErrImagePull state. It looks for pods with the isvc.serving.kserve.io/inferenceservice label matching the InferenceService name. Returns the image pull error details if found, or nil otherwise.

type ImagePullErrorType

type ImagePullErrorType string

ImagePullErrorType categorizes image pull errors as authentication failures ("auth"), missing images ("not-found"), or anything else ("generic").

const (
	ImagePullErrorAuth     ImagePullErrorType = "auth"
	ImagePullErrorNotFound ImagePullErrorType = "not-found"
	ImagePullErrorGeneric  ImagePullErrorType = "generic"
)

type ImageRegistryError

type ImageRegistryError struct {
	Type    ImagePullErrorType // From template.go
	Message string
	Cause   error
}

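ImageRegistryError wraps a registry access error together with its category (an ImagePullErrorType), a message, and the underlying cause, which is exposed through Unwrap.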

func (*ImageRegistryError) Error

func (e *ImageRegistryError) Error() string

func (*ImageRegistryError) Unwrap

func (e *ImageRegistryError) Unwrap() error

type MetadataFormatError

type MetadataFormatError struct {
	Reason  string
	Message string
}

MetadataFormatError indicates the image metadata is malformed and cannot be processed.

func (*MetadataFormatError) Error

func (e *MetadataFormatError) Error() string

type ModelReference

type ModelReference struct {
	Name  string
	Scope TemplateScope
}

ModelReference identifies a model that was found, by its name and the scope in which it was found.

type ParsedDiscovery

type ParsedDiscovery struct {
	ModelSources []aimv1alpha1.AIMModelSource
	Profile      *aimv1alpha1.AIMProfile
}

ParsedDiscovery holds the parsed discovery result: the model sources and the profile.

func ParseDiscoveryLogs

func ParseDiscoveryLogs(ctx context.Context, k8sClient client.Client, clientset kubernetes.Interface, job *batchv1.Job) (*ParsedDiscovery, error)

ParseDiscoveryLogs parses the discovery job output to extract model sources and profile. Reads pod logs from the completed job and parses the JSON output.

type RuntimeConfigResolution

type RuntimeConfigResolution struct {
	// Name is the runtime config name requested by the consumer.
	Name string

	// Namespace is the consumer namespace used when searching for AIMRuntimeConfig.
	Namespace string

	ClusterConfig           *aimv1alpha1.AIMClusterRuntimeConfig
	NamespaceConfig         *aimv1alpha1.AIMRuntimeConfig
	ClusterConfigNotFound   bool
	NamespaceConfigNotFound bool

	EffectiveSpec aimv1alpha1.AIMRuntimeConfigSpec
	ResolvedRef   *aimv1alpha1.AIMResolvedRuntimeConfig
}

RuntimeConfigResolution captures the resolved runtime configuration. When both namespace and cluster configs exist, they are merged with namespace config taking precedence.

func ResolveRuntimeConfig

func ResolveRuntimeConfig(ctx context.Context, k8sClient client.Client, namespace, configName string) (*RuntimeConfigResolution, error)

ResolveRuntimeConfig resolves runtime config with field-level merging. When both cluster and namespace configs exist, cluster config is used as base and namespace config fields override/merge on top. When configName is empty, the default runtime config name is used.

type RuntimeObservation

type RuntimeObservation[R client.Object] struct {
	Runtime R
	TemplateObservation
}

RuntimeObservation combines TemplateObservation with a controller-specific runtime object.

func ObserveTemplate

func ObserveTemplate[R client.Object](ctx context.Context, opts TemplateObservationOptions[R]) (*RuntimeObservation[R], error)

ObserveTemplate gathers runtime, discovery job, image, and runtime config information with common error handling.

type SelectionDiagnostics

type SelectionDiagnostics struct {
	TotalCandidates                  int
	AfterAvailabilityFilter          int
	AfterUnoptimizedFilter           int
	AfterOverridesFilter             int
	AfterGPUAvailabilityFilter       int
	UnoptimizedTemplatesWereFiltered bool
}

SelectionDiagnostics records how many candidates remained after each filtering stage of template selection, which helps explain why selection failed.

type ServiceObservation

type ServiceObservation struct {
	InferenceService              *servingv1beta1.InferenceService
	TemplateName                  string
	BaseTemplateName              string
	Scope                         TemplateScope
	AutoSelectedTemplate          bool
	TemplateAvailable             bool
	TemplateOwnedByService        bool
	ShouldCreateTemplate          bool
	RuntimeConfigSpec             aimv1alpha1.AIMRuntimeConfigSpec
	ResolvedRuntimeConfig         *aimv1alpha1.AIMResolvedRuntimeConfig
	ResolvedImage                 *aimv1alpha1.AIMResolvedReference
	RoutePath                     string
	RouteTimeout                  *string
	PathTemplateErr               error
	RuntimeConfigErr              error
	ImageErr                      error
	ModelResolutionErr            error
	TemplateStatus                *aimv1alpha1.AIMServiceTemplateStatus
	TemplateSpecCommon            aimv1alpha1.AIMServiceTemplateSpecCommon
	TemplateSpec                  *aimv1alpha1.AIMServiceTemplateSpec
	TemplateNamespace             string
	ImageResources                *corev1.ResourceRequirements
	TemplateSelectionReason       string
	TemplateSelectionMessage      string
	TemplateSelectionCount        int
	TemplatesExistButNotReady     bool // True when templates exist but aren't Available yet
	ImageReady                    bool
	ImageReadyReason              string
	ImageReadyMessage             string
	InferenceServicePodImageError *ImagePullError // Categorized image pull error from InferenceService pods
	TemplateMatchingResults       []aimv1alpha1.AIMTemplateCandidateResult
	TemplateCache                 *aimv1alpha1.AIMTemplateCache
	ModelCaches                   *aimv1alpha1.AIMModelCacheList
	KVCache                       *aimv1alpha1.AIMKVCache // Observed AIMKVCache resource
	KVCacheConfigMap              *corev1.ConfigMap       // ConfigMap with KV cache configuration
	KVCacheErr                    error                   // Error from observing KV cache resources
}

ServiceObservation holds observed state for an AIMService reconciliation.

func (*ServiceObservation) RuntimeName

func (o *ServiceObservation) RuntimeName() string

RuntimeName returns the effective runtime name for the service.

func (*ServiceObservation) TemplateFound

func (o *ServiceObservation) TemplateFound() bool

TemplateFound returns true if a template was resolved (namespace or cluster scope).

type TemplateCandidate

type TemplateCandidate struct {
	Name      string
	Namespace string
	Scope     TemplateScope
	Spec      aimv1alpha1.AIMServiceTemplateSpecCommon
	Status    aimv1alpha1.AIMServiceTemplateStatus
}

TemplateCandidate captures the information needed to evaluate a template during selection.

func (TemplateCandidate) QualifiedName

func (c TemplateCandidate) QualifiedName() string

QualifiedName returns a human-readable identifier for logging/debugging.

type TemplateObservation

type TemplateObservation struct {
	Job                *batchv1.Job
	Image              string
	ImageResources     *corev1.ResourceRequirements
	ImagePullSecrets   []corev1.LocalObjectReference
	ServiceAccountName string
	RuntimeConfig      *RuntimeConfigResolution
	TemplateCaches     *aimv1alpha1.AIMTemplateCacheList
	GPUModel           string
	GPUAvailable       bool
	GPUChecked         bool
	JobPodImageError   *ImagePullError // Categorized image pull error if job pod is stuck
}

TemplateObservation holds the common observed state for both template types (namespace-scoped and cluster-scoped).

type TemplateObservationOptions

type TemplateObservationOptions[R client.Object] struct {
	K8sClient               client.Client // Required for pod status checking
	GetRuntime              func(ctx context.Context) (R, error)
	ShouldCheckDiscoveryJob bool
	GetDiscoveryJob         func(ctx context.Context) (*batchv1.Job, error)
	GetJobNamespace         func() string // Namespace where the job runs (for pod lookup)
	LookupImage             func(ctx context.Context) (*ImageLookupResult, error)
	ResolveRuntimeConfig    func(ctx context.Context) (*RuntimeConfigResolution, error)
	OnRuntimeConfigResolved func(resolution *RuntimeConfigResolution)
	GetImagePullSecrets     func() []corev1.LocalObjectReference // Template's imagePullSecrets
	GetServiceAccountName   func() string                        // Template's serviceAccountName
	GetTemplateCaches       func(ctx context.Context) (*aimv1alpha1.AIMTemplateCacheList, error)
}

TemplateObservationOptions configures ObserveTemplate behaviour.

type TemplatePlanBuilders

type TemplatePlanBuilders struct {
	BuildRuntime      func(input TemplatePlanInput) client.Object
	BuildDiscoveryJob func(input TemplatePlanInput) client.Object
}

TemplatePlanBuilders specifies how to render runtime and discovery job objects.

type TemplatePlanContext

type TemplatePlanContext struct {
	Ctx         context.Context
	Client      client.Client
	Template    metav1.Object
	APIVersion  string
	Kind        string
	Status      aimv1alpha1.AIMTemplateStatusEnum
	Observation *TemplateObservation
}

TemplatePlanContext provides metadata needed during plan generation.

type TemplatePlanInput

type TemplatePlanInput struct {
	Observation       *TemplateObservation
	RuntimeConfigSpec aimv1alpha1.AIMRuntimeConfigSpec
	OwnerReference    metav1.OwnerReference
}

TemplatePlanInput supplies builders with convenient access to observation data.

type TemplateResolution

type TemplateResolution struct {
	BaseName  string
	FinalName string
	Derived   bool
	Scope     TemplateScope
}

TemplateResolution captures the result of resolving a template name for a service.

type TemplateScope

type TemplateScope string

TemplateScope indicates whether a template is namespace-scoped, cluster-scoped, or unresolved.

const (
	TemplateScopeNone      TemplateScope = ""
	TemplateScopeNamespace TemplateScope = "namespace"
	TemplateScopeCluster   TemplateScope = "cluster"
)

func LoadBaseTemplateSpec

func LoadBaseTemplateSpec(ctx context.Context, k8sClient client.Client, service *aimv1alpha1.AIMService, baseName string) (*aimv1alpha1.AIMServiceTemplateSpec, TemplateScope, error)

LoadBaseTemplateSpec fetches the base template spec for a derived template. Searches namespace-scoped templates first, then falls back to cluster-scoped templates.

func ResolveOrCreateModelFromImage

func ResolveOrCreateModelFromImage(
	ctx context.Context,
	k8sClient client.Client,
	serviceNamespace string,
	imageURI string,
	runtimeConfig *aimv1alpha1.AIMRuntimeConfigSpec,
	imagePullSecrets []corev1.LocalObjectReference,
	serviceAccountName string,
	parentService *aimv1alpha1.AIMService,
) (modelName string, scope TemplateScope, err error)

ResolveOrCreateModelFromImage searches for existing models matching the image URI, or creates a new one if none exists. Returns the model name and scope.

type TemplateSelectionStatus

type TemplateSelectionStatus struct {
	AutoSelected              bool
	CandidateCount            int
	SelectionReason           string
	SelectionMessage          string
	TemplatesExistButNotReady bool
	ImageReady                bool
	ImageReadyReason          string
	ImageReadyMessage         string
	ModelResolutionErr        error
	TemplateMatchingResults   []aimv1alpha1.AIMTemplateCandidateResult
}

TemplateSelectionStatus captures metadata about automatic template selection.

type TemplateSpec

type TemplateSpec interface {
	GetModelName() string
	GetSpecModelSources() []aimv1alpha1.AIMModelSource
}

TemplateSpec provides access to the common parts of a template specification: the model name and the model sources declared in the spec.

type TemplateWithStatus

type TemplateWithStatus interface {
	TemplateSpec
	client.Object
	GetStatus() *aimv1alpha1.AIMServiceTemplateStatus
}

TemplateWithStatus extends TemplateSpec and client.Object with access to the template status.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL