Documentation
¶
Overview ¶
Copyright 2024 The Aibrix Team.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Copyright 2024 The Aibrix Team.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Copyright 2024 The Aibrix Team.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- Variables
- func Init()
- func NewLeastBusyTimeRouter() (types.Router, error)
- func NewLeastExpectedLatencyRouter() (types.Router, error)
- func NewLeastGpuCacheRouter() (types.Router, error)
- func NewLeastKvCacheRouter() (types.Router, error)
- func NewLeastLoadPullingRouter(provider cache.CappedLoadProvider) (types.Router, error)
- func NewLeastLoadRouter(provider cache.LoadProvider) (types.Router, error)
- func NewLeastRequestRouter() (types.Router, error)
- func NewLeastUtilRouter() (types.Router, error)
- func NewPDRouter() (types.Router, error)
- func NewPackLoadRouter(provider cache.CappedLoadProvider) (types.Router, error)
- func NewPrefixCacheAndLoadRouter() (types.Router, error)
- func NewPrefixCacheRouter() (types.Router, error)
- func NewQueueRouter(backend types.Router, queue types.RouterQueue[*types.RoutingContext]) (types.QueueRouter, error)
- func NewRandomRouter() (types.Router, error)
- func NewSLORouter(modelName string) (types.QueueRouter, error)
- func NewThroughputRouter() (types.Router, error)
- func Register(algorithm types.RoutingAlgorithm, constructor types.RouterConstructor)
- func RegisterProvider(algorithm types.RoutingAlgorithm, provider types.RouterProviderFunc)
- func Select(ctx *types.RoutingContext) (types.Router, error)
- func SelectRandomPodAsFallback(ctx *types.RoutingContext, pods []*v1.Pod, randomFunc func(int) int) (*v1.Pod, error)
- func SetFallback(router types.Router, fallback types.RoutingAlgorithm) error
- func Validate(algorithms string) (types.RoutingAlgorithm, bool)
- type FallbackRouter
- type PrefillRequestTracker
- type PrefillTimeParams
- type PrefixCacheMetrics
- type RouterManager
- func (rm *RouterManager) Init()
- func (rm *RouterManager) Register(algorithm types.RoutingAlgorithm, constructor types.RouterConstructor)
- func (rm *RouterManager) RegisterProvider(algorithm types.RoutingAlgorithm, provider types.RouterProviderFunc)
- func (rm *RouterManager) Select(ctx *types.RoutingContext) (types.Router, error)
- func (rm *RouterManager) SetFallback(router types.Router, fallback types.RoutingAlgorithm) error
- func (rm *RouterManager) Validate(algorithms string) (types.RoutingAlgorithm, bool)
- type SLORouter
- type Scores
- type SlidingWindowHistogram
- type TokenizerPool
- type TokenizerPoolConfig
- type TokenizerPoolInterface
- type TokenizerPoolMetrics
Constants ¶
const ( RouterPD types.RoutingAlgorithm = "pd" VLLMEngine string = "vllm" SGLangEngine string = "sglang" SGLangBootstrapPort int64 = 8998 SGLangBootstrapPortIdentifier string = "model.aibrix.ai/sglang-bootstrap-port" LLMEngineIdentifier string = constants.ModelLabelEngine PDRoleSetIdentifier string = "roleset-name" PDRoleIdentifier string = "role-name" RoleReplicaIndex string = "stormservice.orchestration.aibrix.ai/role-replica-index" PodGroupIndex string = "stormservice.orchestration.aibrix.ai/pod-group-index" )
const ( PREBLE_TARGET_GPU = "AIBRIX_ROUTER_PREBLE_TARGET_GPU" PREBLE_DECODING_LENGTH = "AIBRIX_ROUTER_PREBLE_DECODING_LENGTH" PREBLE_SLIDING_WINDOW_PERIOD = "AIBRIX_ROUTER_PREBLE_SLIDING_WINDOW_PERIOD" PREBLE_EVICTION_LOOP_INTERVAL = "AIBRIX_ROUTER_PREBLE_EVICTION_LOOP_INTERVAL" )
const ( // Default routing algorithm for slo-family algorithms; set to RouterSLOLeastLoadPulling. RouterSLO types.RoutingAlgorithm = "slo" // SLO-aware routing algorithm that uses SLOQueue and packLoadRouter as the backend. RouterSLOPackLoad types.RoutingAlgorithm = "slo-pack-load" // SLO-aware routing algorithm that uses SLOQueue and leastLoadRouter (push mode) as the backend. RouterSLOLeastLoad types.RoutingAlgorithm = "slo-least-load" // SLO-aware routing algorithm that uses SLOQueue and leastLoadRouter (pull mode) as the backend. RouterSLOLeastLoadPulling types.RoutingAlgorithm = "slo-least-load-pulling" )
const DefaultFallbackAlgorithm types.RoutingAlgorithm = RouterRandom
const RouterLeastBusyTime types.RoutingAlgorithm = "least-busy-time"
const RouterLeastGpuCache types.RoutingAlgorithm = "least-gpu-cache"
const RouterLeastKvCache types.RoutingAlgorithm = "least-kv-cache"
const RouterLeastLatency types.RoutingAlgorithm = "least-latency"
const RouterLeastRequest types.RoutingAlgorithm = "least-request"
const (
RouterNotSet = ""
)
const RouterPrefixCachePreble types.RoutingAlgorithm = "prefix-cache-preble"
const RouterRandom types.RoutingAlgorithm = "random"
const RouterThroughput types.RoutingAlgorithm = "throughput"
const RouterUtil types.RoutingAlgorithm = "least-utilization"
Variables ¶
var ( RandomRouter = &randomRouter{} RandomRouterProviderFunc = func(_ *types.RoutingContext) (types.Router, error) { return RandomRouter, nil } )
var ( ErrInitTimeout = errors.New("router initialization timeout") ErrFallbackNotSupported = errors.New("router not support fallback") ErrFallbackNotRegistered = errors.New("fallback router not registered") )
var DefaultFallbackRouter types.RouterProviderFunc = RandomRouterProviderFunc
var (
ErrorNoAvailablePod = fmt.Errorf("no pod available")
)
var ModelRouterFactory = NewSLORouter
var (
RouterPrefixCache types.RoutingAlgorithm = "prefix-cache"
)
Functions ¶
func NewLeastBusyTimeRouter ¶
func NewLeastGpuCacheRouter ¶ added in v0.4.0
func NewLeastKvCacheRouter ¶
func NewLeastLoadPullingRouter ¶ added in v0.4.0
func NewLeastLoadPullingRouter(provider cache.CappedLoadProvider) (types.Router, error)
NewLeastLoadPullingRouter creates a leastLoadRouter instance in pull mode.
func NewLeastLoadRouter ¶ added in v0.4.0
func NewLeastLoadRouter(provider cache.LoadProvider) (types.Router, error)
NewLeastLoadRouter creates a leastLoadRouter instance in push mode.
func NewLeastRequestRouter ¶
func NewLeastUtilRouter ¶ added in v0.4.0
func NewPDRouter ¶ added in v0.4.0
func NewPackLoadRouter ¶ added in v0.4.0
func NewPackLoadRouter(provider cache.CappedLoadProvider) (types.Router, error)
NewPackLoadRouter creates a packLoadRouter instance.
func NewPrefixCacheRouter ¶
func NewQueueRouter ¶ added in v0.4.0
func NewQueueRouter(backend types.Router, queue types.RouterQueue[*types.RoutingContext]) (types.QueueRouter, error)
func NewRandomRouter ¶
func NewSLORouter ¶ added in v0.4.0
func NewSLORouter(modelName string) (types.QueueRouter, error)
func NewThroughputRouter ¶
func Register ¶
func Register(algorithm types.RoutingAlgorithm, constructor types.RouterConstructor)
func RegisterProvider ¶ added in v0.4.0
func RegisterProvider(algorithm types.RoutingAlgorithm, provider types.RouterProviderFunc)
func SelectRandomPodAsFallback ¶
func SelectRandomPodAsFallback(ctx *types.RoutingContext, pods []*v1.Pod, randomFunc func(int) int) (*v1.Pod, error)
SelectRandomPodAsFallback selects a pod randomly as a fallback. This method should only be used when all other selection mechanisms have failed. For example, if no pods meet the required criteria (e.g., valid metrics or specific conditions), this method can be called to randomly select a pod from the provided list.
func SetFallback ¶ added in v0.4.0
func SetFallback(router types.Router, fallback types.RoutingAlgorithm) error
Types ¶
type FallbackRouter ¶ added in v0.4.0
type FallbackRouter struct {
// contains filtered or unexported fields
}
func (*FallbackRouter) Route ¶ added in v0.4.0
func (r *FallbackRouter) Route(ctx *types.RoutingContext, pods types.PodList) (string, error)
func (*FallbackRouter) SetFallback ¶ added in v0.4.0
func (r *FallbackRouter) SetFallback(fallback types.RoutingAlgorithm, provider types.RouterProviderFunc)
type PrefillRequestTracker ¶
type PrefillRequestTracker struct {
// contains filtered or unexported fields
}
PrefillRequestTracker manages prefill-specific request counts
func NewPrefillRequestTracker ¶
func NewPrefillRequestTracker() *PrefillRequestTracker
NewPrefillRequestTracker creates a new prefill request tracker
func (*PrefillRequestTracker) AddPrefillRequest ¶
func (t *PrefillRequestTracker) AddPrefillRequest(requestID, podName string)
func (*PrefillRequestTracker) GetPrefillRequestCountsForPods ¶
func (t *PrefillRequestTracker) GetPrefillRequestCountsForPods(pods []*v1.Pod) map[string]int32
func (*PrefillRequestTracker) RemovePrefillRequest ¶
func (t *PrefillRequestTracker) RemovePrefillRequest(requestID string)
type PrefillTimeParams ¶
type PrefixCacheMetrics ¶ added in v0.4.0
type PrefixCacheMetrics struct {
// contains filtered or unexported fields
}
PrefixCacheMetrics holds all prefix cache metrics
type RouterManager ¶ added in v0.4.0
type RouterManager struct {
// contains filtered or unexported fields
}
func NewRouterManager ¶ added in v0.4.0
func NewRouterManager() *RouterManager
func (*RouterManager) Init ¶ added in v0.4.0
func (rm *RouterManager) Init()
func (*RouterManager) Register ¶ added in v0.4.0
func (rm *RouterManager) Register(algorithm types.RoutingAlgorithm, constructor types.RouterConstructor)
func (*RouterManager) RegisterProvider ¶ added in v0.4.0
func (rm *RouterManager) RegisterProvider(algorithm types.RoutingAlgorithm, provider types.RouterProviderFunc)
func (*RouterManager) Select ¶ added in v0.4.0
func (rm *RouterManager) Select(ctx *types.RoutingContext) (types.Router, error)
Select selects the user-provided router provider supported by the gateway; no error is reported, and it falls back to the random router. Call Validate before this function to ensure expected behavior.
func (*RouterManager) SetFallback ¶ added in v0.4.0
func (rm *RouterManager) SetFallback(router types.Router, fallback types.RoutingAlgorithm) error
func (*RouterManager) Validate ¶ added in v0.4.0
func (rm *RouterManager) Validate(algorithms string) (types.RoutingAlgorithm, bool)
Validate validates whether the user-provided routing algorithms are supported by the gateway.
type SLORouter ¶ added in v0.4.0
type SLORouter struct {
FallbackRouter
*queue.SLOQueue
}
SLORouter is a router that adds the FallbackRouter mechanism to the queue.
type SlidingWindowHistogram ¶
type SlidingWindowHistogram struct {
// contains filtered or unexported fields
}
type TokenizerPool ¶ added in v0.4.0
type TokenizerPool struct {
// contains filtered or unexported fields
}
TokenizerPool manages model-specific tokenizers with caching and health checking
func NewTokenizerPool ¶ added in v0.4.0
func NewTokenizerPool(config TokenizerPoolConfig, cache cache.Cache) *TokenizerPool
NewTokenizerPool creates a new TokenizerPool instance
func (*TokenizerPool) Close ¶ added in v0.4.0
func (p *TokenizerPool) Close() error
Close gracefully shuts down the TokenizerPool
func (*TokenizerPool) GetTokenizer ¶ added in v0.4.0
GetTokenizer returns a tokenizer for the specified model
type TokenizerPoolConfig ¶ added in v0.4.0
type TokenizerPoolConfig struct {
EnableVLLMRemote bool // Feature flag
EndpointTemplate string // "http://%s:8000"
HealthCheckPeriod time.Duration // Default: 30s
TokenizerTTL time.Duration // Default: 5m
MaxTokenizersPerPool int // Default: 100
DefaultTokenizer tokenizer.Tokenizer // Default when remote fails
ModelServiceMap map[string]string // Model -> Service endpoint mapping
Timeout time.Duration // Request timeout
}
TokenizerPoolConfig represents configuration for the TokenizerPool
type TokenizerPoolInterface ¶ added in v0.4.0
type TokenizerPoolInterface interface {
GetTokenizer(model string, pods []*v1.Pod) tokenizer.Tokenizer
Close() error
}
TokenizerPoolInterface defines the interface for tokenizer pools
type TokenizerPoolMetrics ¶ added in v0.4.0
type TokenizerPoolMetrics struct {
// contains filtered or unexported fields
}
TokenizerPoolMetrics contains Prometheus metrics for the pool