Documentation
¶
Index ¶
- Constants
- type AssumeRoleConfig
- type Config
- type DebugConfig
- type DriftedPodUpdaterConfig
- type EngineHeartbeatConfig
- type LeaderElectionConfig
- type ModelConfig
- type ModelConfigItem
- type NIMConfig
- type NIMModelConfig
- type ObjectStoreConfig
- type OllamaConfig
- type PersistentVolume
- type ProcessedModelConfig
- type Resources
- type RuntimeConfig
- type S3Config
- type TolerationConfig
- type VLLMConfig
- type WorkerConfig
- type WorkerTLSConfig
Constants ¶
const (
// RuntimeNameOllama is the Ollama runtime name.
RuntimeNameOllama string = "ollama"
// RuntimeNameVLLM is the VLLM runtime name.
RuntimeNameVLLM string = "vllm"
// RuntimeNameTriton is the runtime name for Nvidia Triton Inference Server.
RuntimeNameTriton string = "triton"
// RuntimeNameNIM is the NIM runtime name.
RuntimeNameNIM string = "nim"
// RuntimeNameSGLang is the SGLang runtime name.
RuntimeNameSGLang string = "sglang"
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AssumeRoleConfig ¶
type AssumeRoleConfig struct {
RoleARN string `yaml:"roleArn"`
ExternalID string `yaml:"externalId"`
}
AssumeRoleConfig is the assume role configuration.
type Config ¶
type Config struct {
Runtime RuntimeConfig `yaml:"runtime"`
Ollama OllamaConfig `yaml:"ollama"`
VLLM VLLMConfig `yaml:"vllm"`
NIM NIMConfig `yaml:"nim"`
Model ModelConfig `yaml:"model"`
HealthPort int `yaml:"healthPort"`
MetricsPort int `yaml:"metricsPort"`
// GracefulShutdownTimeout is the duration given to runnable to stop
// before the manager actually returns on stop. Default is 30 seconds.
GracefulShutdownTimeout time.Duration `yaml:"gracefulShutdownTimeout"`
LeaderElection LeaderElectionConfig `yaml:"leaderElection"`
Autoscaler autoscaler.Config `yaml:"autoscaler"`
ObjectStore ObjectStoreConfig `yaml:"objectStore"`
// PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally
// at the startup time.
// TODO(kenji): Remove once every env uses ModelConfig.
PreloadedModelIDs []string `yaml:"preloadedModelIds"`
// ModelContextLengths is a map of model ID to context length. If not specified, the default
// context length is used.
// TODO(kenji): Remove once every env uses ModelConfig.
ModelContextLengths map[string]int `yaml:"modelContextLengths"`
EngineHeartbeat EngineHeartbeatConfig `yaml:"engineHeartbeat"`
DriftedPodUpdater DriftedPodUpdaterConfig `yaml:"driftedPodUpdater"`
Debug DebugConfig `yaml:"debug"`
InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"`
ModelManagerServerWorkerServiceAddr string `yaml:"modelManagerServerWorkerServiceAddr"`
// GracefulShutdownDelay is the delay before shutting down the engine.
GracefulShutdownDelay time.Duration `yaml:"gracefulShutdownDelay"`
Worker WorkerConfig `yaml:"worker"`
// ComponentStatusSender is the configuration for the component status sender.
ComponentStatusSender status.Config `yaml:"componentStatusSender"`
}
Config is the configuration.
type DebugConfig ¶
type DebugConfig struct {
// Standalone is true if the service is running in standalone mode (except for the
// dependency on inference-manager-server).
Standalone bool `yaml:"standalone"`
}
DebugConfig is the debug configuration.
type DriftedPodUpdaterConfig ¶ added in v1.37.0
type DriftedPodUpdaterConfig struct {
Enable bool `yaml:"enable"`
}
DriftedPodUpdaterConfig is the configuration for the drifted pod updater.
type EngineHeartbeatConfig ¶ added in v1.35.0
type EngineHeartbeatConfig struct {
ReconnectOnNoHeartbeat bool `yaml:"reconnectOnNoHeartbeat"`
HeartbeatTimeout time.Duration `yaml:"heartbeatTimeout"`
}
EngineHeartbeatConfig is the configuration for the engine heartbeat.
type LeaderElectionConfig ¶
type LeaderElectionConfig struct {
ID string `yaml:"id"`
// LeaseDuration is the duration that non-leader candidates will
// wait to force acquire leadership. This is measured against time of
// last observed ack. Default is 15 seconds.
LeaseDuration *time.Duration `yaml:"leaseDuration"`
// RenewDeadline is the duration that the acting controlplane will retry
// refreshing leadership before giving up. Default is 10 seconds.
RenewDeadline *time.Duration `yaml:"renewDeadline"`
// RetryPeriod is the duration the LeaderElector clients should wait
// between tries of actions. Default is 2 seconds.
RetryPeriod *time.Duration `yaml:"retryPeriod"`
}
LeaderElectionConfig is the leader election configuration.
type ModelConfig ¶
type ModelConfig struct {
Default ModelConfigItem `yaml:"default"`
// Overrides is a map of model ID to the model configuration item to be overridden. Only
// fields that are set in the overrides are applied.
Overrides map[string]ModelConfigItem `yaml:"overrides"`
EnableOverrideWithModelConfig bool `yaml:"enableOverrideWithModelConfig"`
}
ModelConfig is the model configuration.
type ModelConfigItem ¶
type ModelConfigItem struct {
RuntimeName string `yaml:"runtimeName" json:"runtimeName"`
Resources Resources `yaml:"resources" json:"resources"`
Replicas int `yaml:"replicas" json:"replicas"`
// Preloaded is true if the model is preloaded.
// If this is set to true in the default model item, all models that are specified in override items
// are preloaded.
Preloaded bool `yaml:"preloaded" json:"preloaded"`
// ContextLength is the context length for the model. If the value is 0,
// the default context length is used.
ContextLength int `yaml:"contextLength" json:"contextLength"`
// VLLMExtraFlags is the extra flags for VLLM.
VLLMExtraFlags []string `yaml:"vllmExtraFlags" json:"vllmExtraFlags"`
// SchedulerName is the name of the scheduler to use.
// This is set when vLLM runs on Inferentia instances and
// requires Neuron scheduling extension.
// See https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html.
SchedulerName string `yaml:"schedulerName" json:"schedulerName"`
// ContainerRuntimeClassName is the name of a K8s Runtime Class
// (https://kubernetes.io/docs/concepts/containers/runtime-class/) used by model runtime.
// This is set to the Runtime Class of the Nvidia container runtime if it is not the cluster default.
ContainerRuntimeClassName string `yaml:"containerRuntimeClassName" json:"containerRuntimeClassName"`
// Image is the docker image to use for the model. If empty, use the default runtime image.
Image string `yaml:"image" json:"image"`
}
ModelConfigItem is the model configuration item.
type NIMConfig ¶ added in v1.30.0
type NIMConfig struct {
NGCAPIKey string `yaml:"ngcApiKey"`
Models map[string]NIMModelConfig `yaml:"models"`
}
NIMConfig is the NIM configuration.
type NIMModelConfig ¶ added in v1.30.0
type NIMModelConfig struct {
LogLevel string `yaml:"logLevel"`
Image string `yaml:"image"`
ImagePullPolicy string `yaml:"imagePullPolicy"`
ModelName string `yaml:"modelName"`
ModelVersion string `yaml:"modelVersion"`
OpenAIPort int `yaml:"openaiPort"`
Resources Resources `yaml:"resources"`
}
NIMModelConfig is the model configuration.
type ObjectStoreConfig ¶
type ObjectStoreConfig struct {
S3 S3Config `yaml:"s3"`
}
ObjectStoreConfig is the object store configuration.
type OllamaConfig ¶
type OllamaConfig struct {
// KeepAlive is the keep-alive duration for Ollama.
// This controls how long Ollama keeps models in GPU memory.
KeepAlive time.Duration `yaml:"keepAlive"`
// NumParallel is the maximum number of requests processed in parallel.
NumParallel int `yaml:"numParallel"`
// ForceSpreading is true if the models should be spread across all GPUs.
ForceSpreading bool `yaml:"forceSpreading"`
Debug bool `yaml:"debug"`
RunnersDir string `yaml:"runnersDir"`
// DynamicModelLoading is true if the model is loaded dynamically.
// If this is set to true, the puller is run in the daemon mode.
DynamicModelLoading bool `yaml:"dynamicModelLoading"`
}
OllamaConfig is the Ollama configuration.
type PersistentVolume ¶
type PersistentVolume struct {
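// StorageClassName is the storage class name for the volume.
// NOTE(review): this comment previously described ShareWithReplicas ("sets whether to share
// the volume among replicas"), which is not a field of this struct — confirm and remove if stale.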
StorageClassName string `yaml:"storageClassName" json:"storageClassName"`
Size string `yaml:"size" json:"size"`
AccessMode string `yaml:"accessMode" json:"accessMode"`
}
PersistentVolume is the persistent volume configuration.
type ProcessedModelConfig ¶
type ProcessedModelConfig struct {
// contains filtered or unexported fields
}
ProcessedModelConfig is the processed model configuration.
func NewProcessedModelConfig ¶
func NewProcessedModelConfig(c *Config) *ProcessedModelConfig
NewProcessedModelConfig returns a new ProcessedModelConfig.
func (*ProcessedModelConfig) ModelConfigItem ¶
func (c *ProcessedModelConfig) ModelConfigItem(modelID string) *ModelConfigItem
ModelConfigItem returns the model configuration item for the given model ID.
func (*ProcessedModelConfig) PreloadedModelIDs ¶
func (c *ProcessedModelConfig) PreloadedModelIDs() []string
PreloadedModelIDs returns the IDs of the models to be preloaded.
type Resources ¶
type Resources struct {
Requests map[string]string `yaml:"requests" json:"requests"`
Limits map[string]string `yaml:"limits" json:"limits"`
Volume *PersistentVolume `yaml:"volume" json:"volume"`
}
Resources is the resources configuration.
type RuntimeConfig ¶
type RuntimeConfig struct {
PullerImage string `yaml:"pullerImage"`
TritonProxyImage string `yaml:"tritonProxyImage"`
RuntimeImages map[string]string `yaml:"runtimeImages"`
PullerImagePullPolicy string `yaml:"pullerImagePullPolicy"`
TritonProxyImagePullPolicy string `yaml:"tritonProxyImagePullPolicy"`
RuntimeImagePullPolicy string `yaml:"runtimeImagePullPolicy"`
RuntimeImagePullSecrets []string `yaml:"runtimeImagePullSecrets"`
ConfigMapName string `yaml:"configMapName"`
AWSSecretName string `yaml:"awsSecretName"`
AWSKeyIDEnvKey string `yaml:"awsKeyIdEnvKey"`
AWSAccessKeyEnvKey string `yaml:"awsAccessKeyEnvKey"`
LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"`
LLMOKeyEnvKey string `yaml:"llmoKeyEnvKey"`
ServiceAccountName string `yaml:"serviceAccountName"`
PodAnnotations map[string]string `yaml:"podAnnotations"`
NodeSelector map[string]string `yaml:"nodeSelector"`
Tolerations []TolerationConfig `yaml:"tolerations"`
UnstructuredAffinity any `yaml:"affinity"`
Affinity *corev1.Affinity `yaml:"-"`
// PullerPort is the port for the puller. This is only used when
// Ollama's DynamicModelLoading or vLLM's DynamicLoRALoading is enabled.
PullerPort int `yaml:"pullerPort"`
// Env and EnvFrom are lists of environment variables or env sources to set in runtime containers.
UnstructuredEnv any `yaml:"env"`
Env []corev1.EnvVar `yaml:"-"`
UnstructuredEnvFrom any `yaml:"envFrom"`
EnvFrom []corev1.EnvFromSource `yaml:"-"`
UseMemoryMediumForModelVolume bool `yaml:"useMemoryMediumForModelVolume"`
TerminationGracePeriodSeconds *int `yaml:"terminationGracePeriodSeconds"`
}
RuntimeConfig is the runtime configuration.
type S3Config ¶
type S3Config struct {
EndpointURL string `yaml:"endpointUrl"`
Region string `yaml:"region"`
InsecureSkipVerify bool `yaml:"insecureSkipVerify"`
Bucket string `yaml:"bucket"`
AssumeRole *AssumeRoleConfig `yaml:"assumeRole"`
}
S3Config is the S3 configuration.
type TolerationConfig ¶
type TolerationConfig struct {
Key string `yaml:"key"`
Operator string `yaml:"operator"`
Value string `yaml:"value"`
Effect string `yaml:"effect"`
TolerationSeconds int64 `yaml:"tolerationSeconds"`
}
TolerationConfig is the toleration configuration.
type VLLMConfig ¶ added in v1.22.0
type VLLMConfig struct {
DynamicLoRALoading bool `yaml:"dynamicLoRALoading"`
LoggingLevel string `yaml:"loggingLevel"`
}
VLLMConfig is the VLLM configuration.
type WorkerConfig ¶
type WorkerConfig struct {
TLS WorkerTLSConfig `yaml:"tls"`
}
WorkerConfig is the worker configuration.
type WorkerTLSConfig ¶
type WorkerTLSConfig struct {
Enable bool `yaml:"enable"`
}
WorkerTLSConfig is the worker TLS configuration.