config

package

Version: v0.0.2 (latest) | Published: Oct 23, 2025 | License: Apache-2.0 | Imports: 1 | Imported by: 0

Documentation ¶

Index ¶

Constants
Variables
type AcceleratorCount
type AcceleratorData
type AcceleratorSpec
type AllocationData
type AllocationSolution
type CapacityData
type DecodeParms
type ModelAcceleratorPerfData
type ModelData
type ModelTarget
type OptimizerData
type OptimizerSpec
type PowerSpec
type PrefillParms
type SaturatedAllocationPolicy
- func SaturatedAllocationPolicyEnum(s string) SaturatedAllocationPolicy
- func (p SaturatedAllocationPolicy) String() string
type ServerData
type ServerLoadSpec
type ServerSpec
type ServiceClassData
type ServiceClassSpec
type SystemData
type SystemSpec

Constants ¶

// DefaultHighPriority is the default priority value of the highest-priority service class.
const DefaultHighPriority int = 1

default priority of the highest-priority service class

View Source

// DefaultLowPriority is the default priority value of the lowest-priority service class.
const DefaultLowPriority int = 100

default priority of the lowest-priority service class

View Source

// DefaultServiceClassName is the default name of a service class.
const DefaultServiceClassName string = "Free"

default name of a service class

View Source

// DefaultServiceClassPriority is the default priority of a service class.
// It is set to DefaultLowPriority, i.e. the lowest priority.
const DefaultServiceClassPriority int = DefaultLowPriority

default priority of a service class (lowest)

Variables ¶

View Source

// AccelPenaltyFactor is the accelerator transition penalty factor.
var AccelPenaltyFactor = float32(0.1)

accelerator transition penalty factor

View Source

// MaxQueueToBatchRatio is the maximum number of requests in the queueing system,
// expressed as a multiple of the maximum batch size.
var MaxQueueToBatchRatio = 10

maximum number of requests in queueing system as multiples of maximum batch size

View Source

// SLOMargin is the multiplier applied to the average of an exponential
// distribution to attain the SLOPercentile percentile, computed as
// -ln(1 - SLOPercentile) and narrowed to float32.
//
// NOTE(review): the initializer is evaluated once at package initialization,
// so assigning a new value to SLOPercentile later does not update SLOMargin.
var SLOMargin = -float32(math.Log(1 - SLOPercentile))

Multiplier applied to the average of an exponential distribution to attain the SLOPercentile percentile, i.e. -ln(1 - SLOPercentile)

View Source

// SLOPercentile is the tolerated percentile for SLOs, expressed as a fraction (0.95).
var SLOPercentile = 0.95

Tolerated percentile for SLOs

Functions ¶

This section is empty.

Types ¶

type AcceleratorCount ¶

// AcceleratorCount holds the number of available units of one accelerator
// type in the system.
type AcceleratorCount struct {
	Type  string `json:"type"`  // name of accelerator type
	Count int    `json:"count"` // number of available units
}

Count of accelerator types in the system

type AcceleratorData ¶

// AcceleratorData is the data related to accelerators: a list of accelerator
// specs, serialized under the JSON key "accelerators".
type AcceleratorData struct {
	Spec []AcceleratorSpec `json:"accelerators"` // accelerator specs
}

Data related to an Accelerator

type AcceleratorSpec ¶

// AcceleratorSpec is the specification of an accelerator: its name, the
// accelerator type and how many cards of that type it comprises, its memory
// size and bandwidth, its power consumption specs, and its cost.
type AcceleratorSpec struct {
	Name         string    `json:"name"`         // name of accelerator
	Type         string    `json:"type"`         // name of accelerator type (e.g. A100)
	Multiplicity int       `json:"multiplicity"` // number of cards of type for this accelerator
	MemSize      int       `json:"memSize"`      // GB
	MemBW        int       `json:"memBW"`        // GB/sec
	Power        PowerSpec `json:"power"`        // power consumption specs
	Cost         float32   `json:"cost"`         // cents/hr
}

Specifications for accelerator data

type AllocationData ¶

// AllocationData is the data about a server allocation: the accelerator used,
// the number of replicas, the maximum batch size, the cost, the average
// inter-token latency (ITL) and time to first token (TTFT), and the server
// load statistics.
type AllocationData struct {
	Accelerator string         `json:"accelerator"` // accelerator name
	NumReplicas int            `json:"numReplicas"` // number of replicas
	MaxBatch    int            `json:"maxBatch"`    // max batch size
	Cost        float32        `json:"cost"`        // cost of allocation
	ITLAverage  float32        `json:"itlAverage"`  // average ITL
	TTFTAverage float32        `json:"ttftAverage"` // average TTFT
	Load        ServerLoadSpec `json:"load"`        // server load statistics
}

Data about a server allocation

type AllocationSolution ¶

// AllocationSolution holds a set of allocations, keyed by server name and
// serialized under the JSON key "allocations".
type AllocationSolution struct {
	Spec map[string]AllocationData `json:"allocations"` // map of server names to allocation data
}

type CapacityData ¶

// CapacityData is the data about accelerator type availability: one
// AcceleratorCount entry per accelerator type.
type CapacityData struct {
	Count []AcceleratorCount `json:"count"` // count of accelerator types
}

Data about accelerator type availability

type DecodeParms ¶

// DecodeParms holds the parameters of the linear model used to estimate
// decode time in msec:
//
//	decode time = Alpha + Beta * batchSize, with batchSize > 0
type DecodeParms struct {
	Alpha float32 `json:"alpha"` // base (constant term of the estimate)
	Beta  float32 `json:"beta"`  // slope (coefficient of batchSize)
}

Parameters for estimating decode time = alpha + beta * batchSize (msec); batchSize > 0

type ModelAcceleratorPerfData ¶

// ModelAcceleratorPerfData is the specification for a combination of a model
// and an accelerator: how many accelerator units the model uses, the maximum
// batch size (and the average tokens per request it was calculated for), and
// the parameters for estimating decode and prefill times.
type ModelAcceleratorPerfData struct {
	Name         string       `json:"name"`         // model name
	Acc          string       `json:"acc"`          // accelerator name
	AccCount     int          `json:"accCount"`     // number of accelerator units used by model
	MaxBatchSize int          `json:"maxBatchSize"` // max batch size based on average number of tokens per request
	AtTokens     int          `json:"atTokens"`     // average number of tokens per request assumed in max batch size calculation
	DecodeParms  DecodeParms  `json:"decodeParms"`  // parameters for estimating decode time
	PrefillParms PrefillParms `json:"prefillParms"` // parameters for estimating prefill time
}

Specifications for a combination of a model and accelerator data

type ModelData ¶

// ModelData is the data related to models: a list of per-accelerator
// performance data entries, serialized under the JSON key "models".
type ModelData struct {
	PerfData []ModelAcceleratorPerfData `json:"models"` // performance data for model on accelerators
}

Data related to a Model

type ModelTarget ¶

// ModelTarget is the specification of SLO targets for a model: inter-token
// latency, time to first token, and throughput.
type ModelTarget struct {
	Model    string  `json:"model"`    // model name
	SLO_ITL  float32 `json:"slo-itl"`  // inter-token latency (msec)
	SLO_TTFT float32 `json:"slo-ttft"` // time to first token, including queueing (msec)
	SLO_TPS  float32 `json:"slo-tps"`  // throughput (tokens/sec)
}

Specification of SLO targets for a model

type OptimizerData ¶

// OptimizerData is the data related to the optimizer, serialized under the
// JSON key "optimizer".
type OptimizerData struct {
	Spec OptimizerSpec `json:"optimizer"` // optimizer specs
}

Data related to Optimizer

type OptimizerSpec ¶

// OptimizerSpec is the specification of optimizer options.
//
// NOTE(review): SaturationPolicy is a string; presumably it is converted with
// SaturatedAllocationPolicyEnum — confirm against the callers.
type OptimizerSpec struct {
	Unlimited         bool   `json:"unlimited"`         // unlimited number of accelerator types (for capacity planning and/or cloud)
	DelayedBestEffort bool   `json:"delayedBestEffort"` // delay best effort allocation after attempting allocation to all priority groups
	SaturationPolicy  string `json:"saturationPolicy"`  // allocation policy under saturated condition
}

Specifications for optimizer data

type PowerSpec ¶

// PowerSpec is the specification of accelerator power consumption (Watts):
// the power when idle, at full utilization, and at an inflection point given
// by the (MidUtil, MidPower) pair.
type PowerSpec struct {
	Idle     int     `json:"idle"`     // idle power
	Full     int     `json:"full"`     // full utilization power
	MidPower int     `json:"midPower"` // power at inflection point
	MidUtil  float32 `json:"midUtil"`  // utilization at inflection point
}

Specifications for Accelerator power consumption data (Watts)

type PrefillParms ¶

// PrefillParms holds the parameters of the linear model used to estimate
// prefill time in msec:
//
//	prefill time = Gamma + Delta * inputTokens * batchSize, with inputTokens, batchSize > 0
type PrefillParms struct {
	Gamma float32 `json:"gamma"` // base (constant term of the estimate)
	Delta float32 `json:"delta"` // slope (coefficient of inputTokens * batchSize)
}

Parameters for estimating prefill time = gamma + delta * inputTokens * batchSize (msec); inputTokens, batchSize > 0

type SaturatedAllocationPolicy ¶

// SaturatedAllocationPolicy enumerates the options for allocation under
// saturated condition.
type SaturatedAllocationPolicy int

options for allocation under saturated condition

// Values of SaturatedAllocationPolicy, numbered from 0 via iota.
const (
	None               SaturatedAllocationPolicy = iota // 0 : no additional allocation beyond satisfying SLOs
	PriorityExhaustive                                  // 1 : allocating exhaustively to servers in priority ordering
	PriorityRoundRobin                                  // 2 : allocating in round-robin fashion within priority groups
	RoundRobin                                          // 3 : allocating in round-robin fashion across all servers
)

// DefaultSaturatedAllocationPolicy is the default option for allocation under
// saturated condition (None).
var DefaultSaturatedAllocationPolicy SaturatedAllocationPolicy = None

default option for allocation under saturated condition

func SaturatedAllocationPolicyEnum ¶

func SaturatedAllocationPolicyEnum(s string) SaturatedAllocationPolicy

func (SaturatedAllocationPolicy) String ¶

func (p SaturatedAllocationPolicy) String() string

type ServerData ¶

// ServerData is the data related to servers: a list of server specs,
// serialized under the JSON key "servers".
type ServerData struct {
	Spec []ServerSpec `json:"servers"` // server specs
}

Data related to a Server

type ServerLoadSpec ¶

// ServerLoadSpec is the specification of server load statistics: the request
// arrival rate and the average input and output token counts.
type ServerLoadSpec struct {
	ArrivalRate  float32 `json:"arrivalRate"`  // req/min
	AvgInTokens  int     `json:"avgInTokens"`  // average number of input tokens
	AvgOutTokens int     `json:"avgOutTokens"` // average number of output tokens
}

Specifications of server load statistics

type ServerSpec ¶

// ServerSpec is the specification of a server: the service class and model
// it serves, its allocation constraints (keep accelerator, minimum replicas,
// maximum batch size override), and its current and desired allocations.
type ServerSpec struct {
	Name            string         `json:"name"`            // server name
	Class           string         `json:"class"`           // service class name
	Model           string         `json:"model"`           // model name
	KeepAccelerator bool           `json:"keepAccelerator"` // option to not change accelerator
	MinNumReplicas  int            `json:"minNumReplicas"`  // minimum number of replicas
	MaxBatchSize    int            `json:"maxBatchSize"`    // overriding value for the maximum batch size
	CurrentAlloc    AllocationData `json:"currentAlloc"`    // current allocation
	DesiredAlloc    AllocationData `json:"desiredAlloc"`    // desired allocation
}

Specifications of a server

type ServiceClassData ¶

// ServiceClassData is the data related to service class SLOs: a list of
// service class specs, serialized under the JSON key "serviceClasses".
type ServiceClassData struct {
	Spec []ServiceClassSpec `json:"serviceClasses"` // service class specs
}

Data related to service class SLOs

type ServiceClassSpec ¶

// ServiceClassSpec is the specification of a service class: its name, its
// priority, and the SLO targets for the models it covers.
type ServiceClassSpec struct {
	Name         string        `json:"name"`         // service class name
	Priority     int           `json:"priority"`     // [1,100] priority (lower value is higher priority)
	ModelTargets []ModelTarget `json:"modelTargets"` // target SLOs for models
}

Specification of a service class

type SystemData ¶

// SystemData wraps all data related to the system (accelerators, models,
// service classes, ...), serialized under the JSON key "system".
type SystemData struct {
	Spec SystemSpec `json:"system"` // system specs
}

All data related to the system (accelerators, models, service classes, ...)

type SystemSpec ¶

// SystemSpec is the specification of system data. It groups the static data
// (accelerators, models, service classes, servers, optimizer) and the dynamic
// data (accelerator capacity).
type SystemSpec struct {
	// static data
	Accelerators   AcceleratorData  `json:"acceleratorData"`  // accelerator data
	Models         ModelData        `json:"modelData"`        // model data
	ServiceClasses ServiceClassData `json:"serviceClassData"` // service class data
	Servers        ServerData       `json:"serverData"`       // server data
	Optimizer      OptimizerData    `json:"optimizerData"`    // optimizer data

	// dynamic data
	Capacity CapacityData `json:"capacityData"` // data about accelerator type availability
}

Specifications for system data

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL