config

package

v1.46.0 (latest) · Published: Apr 27, 2026 · License: Apache-2.0 · Imports: 12 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/llmariner/inference-manager

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type AssumeRoleConfig
type Config
- func Parse(path string) (*Config, error)
- func (c *Config) Validate() error
type DebugConfig
type DriftedPodUpdaterConfig
type EngineHeartbeatConfig
type LeaderElectionConfig
type ModelConfig
type ModelConfigItem
type NIMConfig
type NIMModelConfig
type ObjectStoreConfig
type OllamaConfig
type PersistentVolume
type ProcessedModelConfig
- func NewProcessedModelConfig(c *Config) *ProcessedModelConfig
- func (c *ProcessedModelConfig) ModelConfigItem(modelID string) *ModelConfigItem
- func (c *ProcessedModelConfig) PreloadedModelIDs() []string
type Resources
type RuntimeConfig
type S3Config
type TolerationConfig
type VLLMConfig
type WorkerConfig
type WorkerTLSConfig

Constants ¶

View Source

// Runtime name constants. Each is a typed string constant.
// NOTE(review): presumably these are the accepted values of
// ModelConfigItem.RuntimeName — confirm against Validate.
const (
	// RuntimeNameOllama is the Ollama runtime name.
	RuntimeNameOllama string = "ollama"
	// RuntimeNameVLLM is the VLLM runtime name.
	RuntimeNameVLLM string = "vllm"
	// RuntimeNameTriton is the runtime name for Nvidia Triton Inference Server.
	RuntimeNameTriton string = "triton"
	// RuntimeNameNIM is the NIM runtime name.
	RuntimeNameNIM string = "nim"
	// RuntimeNameSGLang is the SGLang runtime name.
	RuntimeNameSGLang string = "sglang"
)

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type AssumeRoleConfig ¶

type AssumeRoleConfig struct {
	// RoleARN and ExternalID are populated from the "roleArn" and "externalId"
	// YAML keys.
	// NOTE(review): presumably the ARN of the role to assume and the external
	// ID supplied when assuming it — confirm against the S3 client setup.
	RoleARN    string `yaml:"roleArn"`
	ExternalID string `yaml:"externalId"`
}

AssumeRoleConfig is the assume role configuration.

type Config ¶

type Config struct {
	// Runtime is the runtime configuration. Ollama, VLLM, and NIM are the
	// configurations of the respective runtimes. Model is the model configuration.
	Runtime RuntimeConfig `yaml:"runtime"`
	Ollama  OllamaConfig  `yaml:"ollama"`
	VLLM    VLLMConfig    `yaml:"vllm"`
	NIM     NIMConfig     `yaml:"nim"`
	Model   ModelConfig   `yaml:"model"`

	// HealthPort and MetricsPort are read from the "healthPort" and
	// "metricsPort" YAML keys.
	// NOTE(review): presumably the listen ports of the health-check and
	// metrics endpoints — confirm.
	HealthPort  int `yaml:"healthPort"`
	MetricsPort int `yaml:"metricsPort"`

	// GracefulShutdownTimeout is the duration given to runnable to stop
	// before the manager actually returns on stop. Default is 30 seconds.
	GracefulShutdownTimeout time.Duration `yaml:"gracefulShutdownTimeout"`

	// LeaderElection is the leader election configuration.
	LeaderElection LeaderElectionConfig `yaml:"leaderElection"`

	// Autoscaler is the autoscaler configuration; its type is declared in the
	// autoscaler package.
	Autoscaler autoscaler.Config `yaml:"autoscaler"`

	// ObjectStore is the object store configuration.
	ObjectStore ObjectStoreConfig `yaml:"objectStore"`

	// PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally
	// at the startup time.
	// TODO(kenji): Remove once every env uses ModelConfig.
	PreloadedModelIDs []string `yaml:"preloadedModelIds"`

	// ModelContextLengths is a map of model ID to context length. If not specified, the default
	// context length is used.
	// TODO(kenji): Remove once every env uses ModelConfig.
	ModelContextLengths map[string]int `yaml:"modelContextLengths"`

	// EngineHeartbeat is the configuration for the engine heartbeat.
	EngineHeartbeat EngineHeartbeatConfig `yaml:"engineHeartbeat"`

	// DriftedPodUpdater is the configuration for the drifted pod updater.
	DriftedPodUpdater DriftedPodUpdaterConfig `yaml:"driftedPodUpdater"`

	// Debug is the debug configuration.
	Debug DebugConfig `yaml:"debug"`

	// NOTE(review): presumably the addresses of the worker services exposed by
	// inference-manager-server and model-manager-server — confirm format
	// (host:port?) against the client code.
	InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"`
	ModelManagerServerWorkerServiceAddr     string `yaml:"modelManagerServerWorkerServiceAddr"`

	// GracefulShutdownDelay is the delay before shutting down the engine.
	GracefulShutdownDelay time.Duration `yaml:"gracefulShutdownDelay"`

	// Worker is the worker configuration.
	Worker WorkerConfig `yaml:"worker"`

	// ComponentStatusSender is the configuration for the component status sender.
	ComponentStatusSender status.Config `yaml:"componentStatusSender"`
}

Config is the top-level configuration; Parse returns it and Validate checks it.

func Parse ¶

func Parse(path string) (*Config, error)

Parse parses the configuration file at the given path, returning a new Config struct.

func (*Config) Validate ¶

func (c *Config) Validate() error

Validate validates the configuration.

type DebugConfig ¶

type DebugConfig struct {
	// Standalone is true if the service is running in standalone mode (except for
	// the dependency on inference-manager-server).
	Standalone bool `yaml:"standalone"`
}

DebugConfig is the debug configuration.

type DriftedPodUpdaterConfig ¶ added in v1.37.0

type DriftedPodUpdaterConfig struct {
	// Enable is read from the "enable" YAML key.
	// NOTE(review): presumably turns the drifted pod updater on when true — confirm.
	Enable bool `yaml:"enable"`
}

DriftedPodUpdaterConfig is the configuration for the drifted pod updater.

type EngineHeartbeatConfig ¶ added in v1.35.0

type EngineHeartbeatConfig struct {
	// ReconnectOnNoHeartbeat and HeartbeatTimeout are read from the
	// "reconnectOnNoHeartbeat" and "heartbeatTimeout" YAML keys.
	// NOTE(review): presumably the engine reconnects when no heartbeat is seen
	// within HeartbeatTimeout and ReconnectOnNoHeartbeat is true — confirm.
	ReconnectOnNoHeartbeat bool          `yaml:"reconnectOnNoHeartbeat"`
	HeartbeatTimeout       time.Duration `yaml:"heartbeatTimeout"`
}

EngineHeartbeatConfig is the configuration for the engine heartbeat.

type LeaderElectionConfig ¶

type LeaderElectionConfig struct {
	// ID is read from the "id" YAML key.
	// NOTE(review): presumably the name of the leader election resource shared
	// by all candidates — confirm.
	ID string `yaml:"id"`
	// LeaseDuration is the duration that non-leader candidates will
	// wait to force acquire leadership. This is measured against time of
	// last observed ack. Default is 15 seconds.
	LeaseDuration *time.Duration `yaml:"leaseDuration"`
	// RenewDeadline is the duration that the acting controlplane will retry
	// refreshing leadership before giving up. Default is 10 seconds.
	RenewDeadline *time.Duration `yaml:"renewDeadline"`
	// RetryPeriod is the duration the LeaderElector clients should wait
	// between tries of actions. Default is 2 seconds.
	RetryPeriod *time.Duration `yaml:"retryPeriod"`
}

LeaderElectionConfig is the leader election configuration.

type ModelConfig ¶

type ModelConfig struct {
	// Default is the model configuration item read from the "default" YAML key.
	// NOTE(review): presumably applied to every model before Overrides — confirm.
	Default ModelConfigItem `yaml:"default"`
	// Overrides is a map of model ID to the model configuration item to be overridden. Only
	// fields that are set in the overrides are applied.
	Overrides map[string]ModelConfigItem `yaml:"overrides"`

	// EnableOverrideWithModelConfig is read from the
	// "enableOverrideWithModelConfig" YAML key.
	// NOTE(review): its effect is not visible here — document once confirmed.
	EnableOverrideWithModelConfig bool `yaml:"enableOverrideWithModelConfig"`
}

ModelConfig is the model configuration.

type ModelConfigItem ¶

type ModelConfigItem struct {
	// RuntimeName is the name of the runtime for the model.
	// NOTE(review): presumably one of the RuntimeName* constants — confirm.
	RuntimeName string `yaml:"runtimeName" json:"runtimeName"`

	// Resources is the resources configuration. Replicas is read from the
	// "replicas" key.
	// NOTE(review): presumably the number of runtime replicas for the model — confirm.
	Resources Resources `yaml:"resources" json:"resources"`
	Replicas  int       `yaml:"replicas" json:"replicas"`

	// Preloaded is true if the model is preloaded.
	// If this is set to true in the default model item, all models that are specified in override items
	// are preloaded.
	Preloaded bool `yaml:"preloaded" json:"preloaded"`

	// ContextLength is the context length for the model. If the value is 0,
	// the default context length is used.
	ContextLength int `yaml:"contextLength" json:"contextLength"`

	// VLLMExtraFlags is the extra flags for VLLM.
	VLLMExtraFlags []string `yaml:"vllmExtraFlags" json:"vllmExtraFlags"`

	// SchedulerName is the name of the scheduler to use.
	// This is set when a vLLM runs on Inferentia instances and
	// requires Neuron scheduling extension.
	// See https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html.
	SchedulerName string `yaml:"schedulerName" json:"schedulerName"`

	// ContainerRuntimeClassName is the name of a K8s Runtime Class
	// (https://kubernetes.io/docs/concepts/containers/runtime-class/) used by model runtime.
	// This is set to the Runtime Class of the Nvidia container runtime if it is not a cluster default.
	ContainerRuntimeClassName string `yaml:"containerRuntimeClassName" json:"containerRuntimeClassName"`

	// Image is the docker image to use for the model. If empty, use the default runtime image.
	Image string `yaml:"image" json:"image"`
}

ModelConfigItem is the model configuration item.

type NIMConfig ¶ added in v1.30.0

type NIMConfig struct {
	// NGCAPIKey is read from the "ngcApiKey" YAML key. Models maps a string key
	// to the NIM configuration of one model.
	// NOTE(review): presumably NGCAPIKey is the Nvidia NGC API key and the map
	// key is a model ID — confirm.
	NGCAPIKey string                    `yaml:"ngcApiKey"`
	Models    map[string]NIMModelConfig `yaml:"models"`
}

NIMConfig is the NIM configuration.

type NIMModelConfig ¶ added in v1.30.0

type NIMModelConfig struct {
	// Each field is populated from the YAML key named in its tag. Resources is
	// the resources configuration.
	// NOTE(review): presumably OpenAIPort is the port of the OpenAI-compatible
	// API served by the NIM container, and ImagePullPolicy takes a Kubernetes
	// image pull policy value — confirm.
	LogLevel        string    `yaml:"logLevel"`
	Image           string    `yaml:"image"`
	ImagePullPolicy string    `yaml:"imagePullPolicy"`
	ModelName       string    `yaml:"modelName"`
	ModelVersion    string    `yaml:"modelVersion"`
	OpenAIPort      int       `yaml:"openaiPort"`
	Resources       Resources `yaml:"resources"`
}

NIMModelConfig is the NIM model configuration.

type ObjectStoreConfig ¶

type ObjectStoreConfig struct {
	// S3 is the S3 configuration.
	S3 S3Config `yaml:"s3"`
}

ObjectStoreConfig is the object store configuration.

type OllamaConfig ¶

type OllamaConfig struct {
	// KeepAlive is the keep-alive duration for Ollama.
	// This controls how long Ollama keeps models in GPU memory.
	KeepAlive time.Duration `yaml:"keepAlive"`

	// NumParallel is the maximum number of requests processed in parallel.
	NumParallel int `yaml:"numParallel"`

	// ForceSpreading is true if the models should be spread across all GPUs.
	ForceSpreading bool `yaml:"forceSpreading"`

	// Debug is read from the "debug" YAML key.
	// NOTE(review): presumably enables Ollama debug output — confirm.
	Debug bool `yaml:"debug"`

	// RunnersDir is read from the "runnersDir" YAML key.
	// NOTE(review): presumably the directory holding Ollama runners — confirm.
	RunnersDir string `yaml:"runnersDir"`

	// DynamicModelLoading is true if the model is loaded dynamically.
	// If this is set to true, the puller is run in the daemon mode.
	DynamicModelLoading bool `yaml:"dynamicModelLoading"`
}

OllamaConfig is the Ollama configuration.

type PersistentVolume ¶

type PersistentVolume struct {
	// ShareWithReplicas sets whether to share the volume among replicas.
	// StorageClassName, Size, and AccessMode are read from the
	// "storageClassName", "size", and "accessMode" keys.
	// NOTE(review): presumably these map onto the storage class, requested
	// size, and access mode of a Kubernetes PersistentVolumeClaim — confirm.
	ShareWithReplicas bool   `yaml:"shareWithReplicas" json:"shareWithReplicas"`
	StorageClassName  string `yaml:"storageClassName" json:"storageClassName"`
	Size              string `yaml:"size" json:"size"`
	AccessMode        string `yaml:"accessMode" json:"accessMode"`
}

PersistentVolume is the persistent volume configuration.

type ProcessedModelConfig ¶

type ProcessedModelConfig struct {
	// Values are created with NewProcessedModelConfig and queried through the
	// ModelConfigItem and PreloadedModelIDs methods.
	// contains filtered or unexported fields
}

ProcessedModelConfig is the processed model configuration.

func NewProcessedModelConfig ¶

func NewProcessedModelConfig(c *Config) *ProcessedModelConfig

NewProcessedModelConfig returns a new ProcessedModelConfig.

func (*ProcessedModelConfig) ModelConfigItem ¶

func (c *ProcessedModelConfig) ModelConfigItem(modelID string) *ModelConfigItem

ModelConfigItem returns the model configuration item for the given model ID.

func (*ProcessedModelConfig) PreloadedModelIDs ¶

func (c *ProcessedModelConfig) PreloadedModelIDs() []string

PreloadedModelIDs returns the IDs of the models to be preloaded.

type Resources ¶

type Resources struct {
	// Requests and Limits are string-to-string maps read from the "requests"
	// and "limits" keys. Volume is a pointer to the persistent volume
	// configuration, read from "volume".
	// NOTE(review): presumably the maps go from resource name to quantity, as
	// in Kubernetes resource requirements, and Volume is nil when no volume is
	// configured — confirm.
	Requests map[string]string `yaml:"requests" json:"requests"`
	Limits   map[string]string `yaml:"limits" json:"limits"`
	Volume   *PersistentVolume `yaml:"volume" json:"volume"`
}

Resources is the resources configuration.

type RuntimeConfig ¶

type RuntimeConfig struct {
	// Images, image pull policies, and image pull secrets for the puller, the
	// Triton proxy, and the runtimes.
	// NOTE(review): presumably RuntimeImages is keyed by runtime name (see the
	// RuntimeName* constants) — confirm.
	PullerImage                string            `yaml:"pullerImage"`
	TritonProxyImage           string            `yaml:"tritonProxyImage"`
	RuntimeImages              map[string]string `yaml:"runtimeImages"`
	PullerImagePullPolicy      string            `yaml:"pullerImagePullPolicy"`
	TritonProxyImagePullPolicy string            `yaml:"tritonProxyImagePullPolicy"`
	RuntimeImagePullPolicy     string            `yaml:"runtimeImagePullPolicy"`
	RuntimeImagePullSecrets    []string          `yaml:"runtimeImagePullSecrets"`

	// NOTE(review): presumably the names of the Kubernetes ConfigMap and
	// Secrets used by runtime pods, and the keys within those Secrets that hold
	// the AWS and LLMO credentials — confirm.
	ConfigMapName        string `yaml:"configMapName"`
	AWSSecretName        string `yaml:"awsSecretName"`
	AWSKeyIDEnvKey       string `yaml:"awsKeyIdEnvKey"`
	AWSAccessKeyEnvKey   string `yaml:"awsAccessKeyEnvKey"`
	LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"`
	LLMOKeyEnvKey        string `yaml:"llmoKeyEnvKey"`

	// NOTE(review): presumably the service account that runtime pods run as — confirm.
	ServiceAccountName string `yaml:"serviceAccountName"`

	// NOTE(review): presumably annotations added to runtime pods — confirm.
	PodAnnotations map[string]string `yaml:"podAnnotations"`

	// UnstructuredAffinity receives the raw value of the "affinity" YAML key.
	// Affinity is excluded from YAML decoding (tag "-").
	// NOTE(review): presumably Affinity is derived from UnstructuredAffinity
	// after parsing, and these four fields control pod scheduling — confirm.
	NodeSelector         map[string]string  `yaml:"nodeSelector"`
	Tolerations          []TolerationConfig `yaml:"tolerations"`
	UnstructuredAffinity any                `yaml:"affinity"`
	Affinity             *corev1.Affinity   `yaml:"-"`

	// PullerPort is the port for the puller. This is only used when
	// Ollama's DynamicModelLoading or vLLM's DynamicLoRALoading is enabled.
	PullerPort int `yaml:"pullerPort"`

	// Env and EnvFrom are lists of environment variables or env sources to set in runtime containers.
	// UnstructuredEnv and UnstructuredEnvFrom receive the raw values of the
	// "env" and "envFrom" YAML keys; Env and EnvFrom are excluded from YAML
	// decoding (tag "-").
	// NOTE(review): presumably the typed fields are filled from the
	// unstructured ones after parsing — confirm.
	UnstructuredEnv     any                    `yaml:"env"`
	Env                 []corev1.EnvVar        `yaml:"-"`
	UnstructuredEnvFrom any                    `yaml:"envFrom"`
	EnvFrom             []corev1.EnvFromSource `yaml:"-"`

	// NOTE(review): presumably backs the model volume with memory instead of
	// disk when true — confirm.
	UseMemoryMediumForModelVolume bool `yaml:"useMemoryMediumForModelVolume"`

	// TerminationGracePeriodSeconds is a pointer read from the
	// "terminationGracePeriodSeconds" YAML key.
	// NOTE(review): presumably nil leaves the Kubernetes default in place — confirm.
	TerminationGracePeriodSeconds *int `yaml:"terminationGracePeriodSeconds"`
}

RuntimeConfig is the runtime configuration.

type S3Config ¶

type S3Config struct {
	// EndpointURL, Region, InsecureSkipVerify, and Bucket are read from the
	// "endpointUrl", "region", "insecureSkipVerify", and "bucket" YAML keys.
	// NOTE(review): presumably InsecureSkipVerify disables TLS certificate
	// verification for the S3 endpoint — confirm.
	EndpointURL        string `yaml:"endpointUrl"`
	Region             string `yaml:"region"`
	InsecureSkipVerify bool   `yaml:"insecureSkipVerify"`
	Bucket             string `yaml:"bucket"`

	// AssumeRole is a pointer to the assume role configuration.
	// NOTE(review): presumably nil means no role is assumed — confirm.
	AssumeRole *AssumeRoleConfig `yaml:"assumeRole"`
}

S3Config is the S3 configuration.

type TolerationConfig ¶

type TolerationConfig struct {
	// Each field is populated from the YAML key named in its tag.
	// NOTE(review): the field set mirrors a Kubernetes toleration (key,
	// operator, value, effect, tolerationSeconds); presumably values are copied
	// into the runtime pod spec — confirm, including how a zero
	// TolerationSeconds is treated.
	Key               string `yaml:"key"`
	Operator          string `yaml:"operator"`
	Value             string `yaml:"value"`
	Effect            string `yaml:"effect"`
	TolerationSeconds int64  `yaml:"tolerationSeconds"`
}

TolerationConfig is the toleration configuration.

type VLLMConfig ¶ added in v1.22.0

type VLLMConfig struct {
	// DynamicLoRALoading is read from the "dynamicLoRALoading" YAML key. Per
	// the RuntimeConfig.PullerPort documentation, the puller port is used when
	// this is enabled.
	DynamicLoRALoading bool `yaml:"dynamicLoRALoading"`

	// LoggingLevel is read from the "loggingLevel" YAML key.
	// NOTE(review): presumably the vLLM logging level — confirm accepted values.
	LoggingLevel string `yaml:"loggingLevel"`
}

VLLMConfig is the VLLM configuration.

type WorkerConfig ¶

type WorkerConfig struct {
	// TLS is the worker TLS configuration.
	TLS WorkerTLSConfig `yaml:"tls"`
}

WorkerConfig is the worker configuration.

type WorkerTLSConfig ¶

type WorkerTLSConfig struct {
	// Enable is read from the "enable" YAML key.
	// NOTE(review): presumably turns on TLS for worker connections when true — confirm.
	Enable bool `yaml:"enable"`
}

WorkerTLSConfig is the worker TLS configuration.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL