config

package
v0.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 23, 2025 License: Apache-2.0 Imports: 1 Imported by: 0

Documentation

Index

Constants

View Source
// DefaultHighPriority is the default priority of the highest service class.
const DefaultHighPriority int = 1

default priority of the highest service class

View Source
// DefaultLowPriority is the default priority of the lowest service class.
const DefaultLowPriority int = 100

default priority of the lowest service class

View Source
// DefaultServiceClassName is the default name of a service class.
const DefaultServiceClassName string = "Free"

default name of a service class

View Source
// DefaultServiceClassPriority is the default priority of a service class;
// it is set to DefaultLowPriority (the lowest priority).
const DefaultServiceClassPriority int = DefaultLowPriority

default priority of a service class (lowest)

Variables

View Source
// AccelPenaltyFactor is the accelerator transition penalty factor.
var AccelPenaltyFactor = float32(0.1)

accelerator transition penalty factor

View Source
// MaxQueueToBatchRatio is the maximum number of requests in the queueing
// system, expressed as a multiple of the maximum batch size.
var MaxQueueToBatchRatio = 10

maximum number of requests in the queueing system, as a multiple of the maximum batch size

View Source
// SLOMargin is the multiplier applied to the average of an exponential
// distribution to attain the SLOPercentile percentile. It is computed as
// -ln(1 - SLOPercentile) and converted to float32.
//
// The initializer runs once at package initialization, so assigning a new
// value to SLOPercentile afterwards does not update SLOMargin.
var SLOMargin = -float32(math.Log(1 - SLOPercentile))

Multiplier applied to the average of an exponential distribution to attain the SLO percentile, i.e. -ln(1 - SLOPercentile)

View Source
// SLOPercentile is the tolerated percentile for SLOs.
var SLOPercentile = 0.95

Tolerated percentile for SLOs

Functions

This section is empty.

Types

type AcceleratorCount

// AcceleratorCount is the count of one accelerator type in the system.
type AcceleratorCount struct {
	// Type is the name of the accelerator type.
	Type string `json:"type"`
	// Count is the number of available units.
	Count int `json:"count"`
}

Count of accelerator types in the system

type AcceleratorData

// AcceleratorData holds the data related to accelerators.
type AcceleratorData struct {
	// Spec lists the accelerator specs.
	Spec []AcceleratorSpec `json:"accelerators"`
}

Data related to an Accelerator

type AcceleratorSpec

// AcceleratorSpec holds the specifications of an accelerator.
type AcceleratorSpec struct {
	// Name is the name of the accelerator.
	Name string `json:"name"`
	// Type is the name of the accelerator type (e.g. A100).
	Type string `json:"type"`
	// Multiplicity is the number of cards of the type for this accelerator.
	Multiplicity int `json:"multiplicity"`
	// MemSize is given in GB.
	MemSize int `json:"memSize"`
	// MemBW is given in GB/sec.
	MemBW int `json:"memBW"`
	// Power holds the power consumption specs.
	Power PowerSpec `json:"power"`
	// Cost is given in cents/hr.
	Cost float32 `json:"cost"`
}

Specifications for accelerator data

type AllocationData

// AllocationData holds data about a server allocation.
type AllocationData struct {
	// Accelerator is the accelerator name.
	Accelerator string `json:"accelerator"`
	// NumReplicas is the number of replicas.
	NumReplicas int `json:"numReplicas"`
	// MaxBatch is the max batch size.
	MaxBatch int `json:"maxBatch"`
	// Cost is the cost of the allocation.
	Cost float32 `json:"cost"`
	// ITLAverage is the average ITL.
	ITLAverage float32 `json:"itlAverage"`
	// TTFTAverage is the average TTFT.
	TTFTAverage float32 `json:"ttftAverage"`
	// Load holds the server load statistics.
	Load ServerLoadSpec `json:"load"`
}

Data about a server allocation

type AllocationSolution

// AllocationSolution holds allocation data keyed by server name.
type AllocationSolution struct {
	// Spec maps server names to allocation data.
	Spec map[string]AllocationData `json:"allocations"`
}

type CapacityData

// CapacityData holds data about accelerator type availability.
type CapacityData struct {
	// Count lists the count of accelerator types.
	Count []AcceleratorCount `json:"count"`
}

Data about accelerator type availability

type DecodeParms

// DecodeParms holds the parameters for estimating decode time:
//
//	decode time = alpha + beta * batchSize (msec); batchSize > 0
type DecodeParms struct {
	// Alpha is the base term.
	Alpha float32 `json:"alpha"`
	// Beta is the slope term.
	Beta float32 `json:"beta"`
}

Parameters for estimating decode time = alpha + beta * batchSize (msec); batchSize > 0

type ModelAcceleratorPerfData

// ModelAcceleratorPerfData holds the specifications for a combination of a
// model and an accelerator.
type ModelAcceleratorPerfData struct {
	// Name is the model name.
	Name string `json:"name"`
	// Acc is the accelerator name.
	Acc string `json:"acc"`
	// AccCount is the number of accelerator units used by the model.
	AccCount int `json:"accCount"`
	// MaxBatchSize is the max batch size based on the average number of
	// tokens per request.
	MaxBatchSize int `json:"maxBatchSize"`
	// AtTokens is the average number of tokens per request assumed in the
	// max batch size calculation.
	AtTokens int `json:"atTokens"`
	// DecodeParms holds the parameters for estimating decode time.
	DecodeParms DecodeParms `json:"decodeParms"`
	// PrefillParms holds the parameters for estimating prefill time.
	PrefillParms PrefillParms `json:"prefillParms"`
}

Specifications for a combination of a model and accelerator data

type ModelData

// ModelData holds the data related to models.
type ModelData struct {
	// PerfData lists the performance data for models on accelerators.
	PerfData []ModelAcceleratorPerfData `json:"models"`
}

Data related to a Model

type ModelTarget

// ModelTarget is the specification of the SLO targets for a model.
type ModelTarget struct {
	// Model is the model name.
	Model string `json:"model"`
	// SLO_ITL is the inter-token latency target (msec).
	SLO_ITL float32 `json:"slo-itl"`
	// SLO_TTFT is the time-to-first-token target, including queueing (msec).
	SLO_TTFT float32 `json:"slo-ttft"`
	// SLO_TPS is the throughput target (tokens/sec).
	SLO_TPS float32 `json:"slo-tps"`
}

Specification of SLO targets for a model

type OptimizerData

// OptimizerData holds the data related to the optimizer.
type OptimizerData struct {
	// Spec holds the optimizer specifications.
	Spec OptimizerSpec `json:"optimizer"`
}

Data related to Optimizer

type OptimizerSpec

// OptimizerSpec holds the specifications for the optimizer.
type OptimizerSpec struct {
	// Unlimited selects an unlimited number of accelerator types (for
	// capacity planning and/or cloud).
	Unlimited bool `json:"unlimited"`
	// DelayedBestEffort delays best effort allocation until after
	// attempting allocation to all priority groups.
	DelayedBestEffort bool `json:"delayedBestEffort"`
	// SaturationPolicy is the allocation policy under saturated condition.
	SaturationPolicy string `json:"saturationPolicy"`
}

Specifications for optimizer data

type PowerSpec

// PowerSpec holds the specifications for accelerator power consumption
// data (Watts).
type PowerSpec struct {
	// Idle is the idle power.
	Idle int `json:"idle"`
	// Full is the full utilization power.
	Full int `json:"full"`
	// MidPower is the power at the inflection point.
	MidPower int `json:"midPower"`
	// MidUtil is the utilization at the inflection point.
	MidUtil float32 `json:"midUtil"`
}

Specifications for Accelerator power consumption data (Watts)

type PrefillParms

// PrefillParms holds the parameters for estimating prefill time:
//
//	prefill time = gamma + delta * inputTokens * batchSize (msec);
//	inputTokens, batchSize > 0
type PrefillParms struct {
	// Gamma is the base term.
	Gamma float32 `json:"gamma"`
	// Delta is the slope term.
	Delta float32 `json:"delta"`
}

Parameters for estimating prefill time = gamma + delta * inputTokens * batchSize (msec); inputTokens, batchSize > 0

type SaturatedAllocationPolicy

// SaturatedAllocationPolicy enumerates the options for allocation under
// saturated condition.
type SaturatedAllocationPolicy int

options for allocation under saturated condition

// Options for allocation under saturated condition.
const (
	// None (0): no additional allocation beyond satisfying SLOs.
	None SaturatedAllocationPolicy = iota
	// PriorityExhaustive (1): allocating exhaustively to servers in
	// priority ordering.
	PriorityExhaustive
	// PriorityRoundRobin (2): allocating in round-robin fashion within
	// priority groups.
	PriorityRoundRobin
	// RoundRobin (3): allocating in round-robin fashion across all servers.
	RoundRobin
)

// DefaultSaturatedAllocationPolicy is the default option for allocation
// under saturated condition.
var DefaultSaturatedAllocationPolicy SaturatedAllocationPolicy = None

default option for allocation under saturated condition

func SaturatedAllocationPolicyEnum

func SaturatedAllocationPolicyEnum(s string) SaturatedAllocationPolicy

func (SaturatedAllocationPolicy) String

func (p SaturatedAllocationPolicy) String() string

type ServerData

// ServerData holds the data related to servers.
type ServerData struct {
	// Spec lists the server specifications.
	Spec []ServerSpec `json:"servers"`
}

Data related to a Server

type ServerLoadSpec

// ServerLoadSpec holds the specifications of server load statistics.
type ServerLoadSpec struct {
	// ArrivalRate is given in req/min.
	ArrivalRate float32 `json:"arrivalRate"`
	// AvgInTokens is the average number of input tokens.
	AvgInTokens int `json:"avgInTokens"`
	// AvgOutTokens is the average number of output tokens.
	AvgOutTokens int `json:"avgOutTokens"`
}

Specifications of server load statistics

type ServerSpec

// ServerSpec holds the specifications of a server.
type ServerSpec struct {
	// Name is the server name.
	Name string `json:"name"`
	// Class is the service class name.
	Class string `json:"class"`
	// Model is the model name.
	Model string `json:"model"`
	// KeepAccelerator is the option to not change the accelerator.
	KeepAccelerator bool `json:"keepAccelerator"`
	// MinNumReplicas is the minimum number of replicas.
	MinNumReplicas int `json:"minNumReplicas"`
	// MaxBatchSize is the overriding value for the maximum batch size.
	MaxBatchSize int `json:"maxBatchSize"`
	// CurrentAlloc is the current allocation.
	CurrentAlloc AllocationData `json:"currentAlloc"`
	// DesiredAlloc is the desired allocation.
	DesiredAlloc AllocationData `json:"desiredAlloc"`
}

Specifications of a server

type ServiceClassData

// ServiceClassData holds the data related to service class SLOs.
type ServiceClassData struct {
	// Spec lists the service class specifications.
	Spec []ServiceClassSpec `json:"serviceClasses"`
}

Data related to service class SLOs

type ServiceClassSpec

// ServiceClassSpec is the specification of a service class.
type ServiceClassSpec struct {
	// Name is the service class name.
	Name string `json:"name"`
	// Priority is in [1,100]; a lower value is a higher priority.
	Priority int `json:"priority"`
	// ModelTargets lists the target SLOs for models.
	ModelTargets []ModelTarget `json:"modelTargets"`
}

Specification of a service class

type SystemData

// SystemData holds all data related to the system (accelerators, models,
// service classes, ...).
type SystemData struct {
	// Spec holds the system specifications.
	Spec SystemSpec `json:"system"`
}

All data related to the system (accelerators, models, service classes, ...)

type SystemSpec

// SystemSpec holds the specifications for system data, split into static
// and dynamic parts.
type SystemSpec struct {
	// Static data.

	// Accelerators holds the accelerator data.
	Accelerators AcceleratorData `json:"acceleratorData"`
	// Models holds the model data.
	Models ModelData `json:"modelData"`
	// ServiceClasses holds the service class data.
	ServiceClasses ServiceClassData `json:"serviceClassData"`
	// Servers holds the server data.
	Servers ServerData `json:"serverData"`
	// Optimizer holds the optimizer data.
	Optimizer OptimizerData `json:"optimizerData"`

	// Dynamic data.

	// Capacity holds the data about accelerator type availability.
	Capacity CapacityData `json:"capacityData"`
}

Specifications for system data

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL