Documentation
¶
Index ¶
- Constants
- func BinarySearch(xMin float32, xMax float32, yTarget float32, ...) (float32, int, error)
- func EffectiveConcurrency(avgServiceTime float32, serviceParms *ServiceParms, requestSize *RequestSize, ...) float32
- func EvalITL(x float32) (float32, error)
- func EvalServTime(x float32) (float32, error)
- func EvalTTFT(x float32) (float32, error)
- func EvalWaitingTime(x float32) (float32, error)
- func WithinTolerance(x, value, tolerance float32) bool
- type AnalysisMetrics
- type Configuration
- type DecodeParms
- type MM1KModel
- type MM1ModelStateDependent
- type PrefillParms
- type QueueAnalyzer
- type QueueModel
- func (m *QueueModel) GetAvgNumInSystem() float32
- func (m *QueueModel) GetAvgQueueLength() float32
- func (m *QueueModel) GetAvgRespTime() float32
- func (m *QueueModel) GetAvgServTime() float32
- func (m *QueueModel) GetAvgWaitTime() float32
- func (m *QueueModel) GetLambda() float32
- func (m *QueueModel) GetMu() float32
- func (m *QueueModel) GetRho() float32
- func (m *QueueModel) IsValid() bool
- func (m *QueueModel) Solve(lambda float32, mu float32)
- func (m *QueueModel) String() string
- type RateRange
- type RequestSize
- type ServiceParms
- type TargetPerf
- type TargetRate
Constants ¶
const Epsilon = float32(0.001)
small disturbance around a value
const StabilitySafetyFraction = float32(0.1)
fraction of maximum server throughput to provide stability (running this fraction below the maximum)
Variables ¶
This section is empty.
Functions ¶
func BinarySearch ¶
func BinarySearch(xMin float32, xMax float32, yTarget float32, eval func(float32) (float32, error)) (float32, int, error)
Binary search: find xStar in a range [xMin, xMax] such that f(xStar)=yTarget. Function f() must be monotonically increasing or decreasing over the range. Returns an indicator of whether target is below (-1), within (0), or above (+1) the bounded region. Returns an error if the function cannot be evaluated or the target is not found.
func EffectiveConcurrency ¶
func EffectiveConcurrency(avgServiceTime float32, serviceParms *ServiceParms, requestSize *RequestSize, maxBatchSize int) float32
calculate effective average number of requests in service (n), given average request service time
- n has to satisfy: prefillTime(n) + totalDecodeTime(n) = avgServiceTime
- prefillTime(n) = gamma + delta * inTokens * n
- totalDecodeTime(n) = (alpha + beta * n) * (outTokens - 1)
func EvalServTime ¶
Function used in binary search (target service time)
func EvalWaitingTime ¶
Function used in binary search (target waiting time)
func WithinTolerance ¶
Checks whether a variable x is within a given relative tolerance of a value
Types ¶
type AnalysisMetrics ¶
type AnalysisMetrics struct {
Throughput float32 // effective throughput (requests/sec)
AvgRespTime float32 // average request response time (aka latency) (msec)
AvgWaitTime float32 // average request queueing time (msec)
AvgNumInServ float32 // average number of requests in service
AvgPrefillTime float32 // average request prefill time (msec)
AvgTokenTime float32 // average token decode time (msec)
MaxRate float32 // maximum throughput (requests/sec)
Rho float32 // utilization
}
analysis solution metrics data
func (*AnalysisMetrics) String ¶
func (am *AnalysisMetrics) String() string
type Configuration ¶
type Configuration struct {
MaxBatchSize int // maximum batch size (limit on the number of requests concurrently receiving service >0)
MaxQueueSize int // maximum queue size (limit on the number of requests queued for service >=0)
ServiceParms *ServiceParms // request processing parameters
}
queue configuration parameters
func (*Configuration) String ¶
func (c *Configuration) String() string
type DecodeParms ¶
decode time = alpha + beta * batchSize (msec); batchSize > 0
func (*DecodeParms) DecodeTime ¶
func (p *DecodeParms) DecodeTime(batchSize float32) float32
func (*DecodeParms) String ¶
func (p *DecodeParms) String() string
type MM1KModel ¶
type MM1KModel struct {
QueueModel // extends base class
K int // limit on number in system
// contains filtered or unexported fields
}
M/M/1/K Finite storage single server queue
func NewMM1KModel ¶
func (*MM1KModel) ComputeRho ¶
Compute utilization of queueing model
func (*MM1KModel) GetProbabilities ¶
func (*MM1KModel) GetThroughput ¶
type MM1ModelStateDependent ¶
type MM1ModelStateDependent struct {
MM1KModel // extends base class
// contains filtered or unexported fields
}
M/M/1 model with state dependent service rate
var Model *MM1ModelStateDependent
model as global variable, accessed by eval functions
func NewMM1ModelStateDependent ¶
func NewMM1ModelStateDependent(K int, servRate []float32) *MM1ModelStateDependent
func (*MM1ModelStateDependent) ComputeRho ¶
func (m *MM1ModelStateDependent) ComputeRho() float32
Compute utilization of queueing model
func (*MM1ModelStateDependent) GetAvgNumInServers ¶
func (m *MM1ModelStateDependent) GetAvgNumInServers() float32
func (*MM1ModelStateDependent) Solve ¶
func (m *MM1ModelStateDependent) Solve(lambda float32, mu float32)
Solve queueing model given arrival and service rates
func (*MM1ModelStateDependent) String ¶
func (m *MM1ModelStateDependent) String() string
type PrefillParms ¶
prefill time = gamma + delta * inputTokens * batchSize (msec); inputTokens > 0
func (*PrefillParms) PrefillTime ¶
func (p *PrefillParms) PrefillTime(avgInputTokens int, batchSize float32) float32
func (*PrefillParms) String ¶
func (p *PrefillParms) String() string
type QueueAnalyzer ¶
type QueueAnalyzer struct {
MaxBatchSize int // maximum batch size
MaxQueueSize int // maximum queue size
ServiceParms *ServiceParms // request processing parameters
RequestSize *RequestSize // number of input and output tokens per request
Model *MM1ModelStateDependent // queueing model
RateRange *RateRange // range of request rates for model stability
}
Analyzer of inference server queue
func BuildModel ¶
func BuildModel(qConfig *Configuration, requestSize *RequestSize) (modelData *QueueAnalyzer)
build queueing model using service rates, leaving arrival rate as parameter
func NewQueueAnalyzer ¶
func NewQueueAnalyzer(qConfig *Configuration, requestSize *RequestSize) (*QueueAnalyzer, error)
create a new queue analyzer from config
func (*QueueAnalyzer) Analyze ¶
func (qa *QueueAnalyzer) Analyze(requestRate float32) (metrics *AnalysisMetrics, err error)
evaluate performance metrics given request rate
func (*QueueAnalyzer) Size ¶
func (qa *QueueAnalyzer) Size(targetPerf *TargetPerf) (targetRate *TargetRate, metrics *AnalysisMetrics, achieved *TargetPerf, err error)
evaluate max request rates to achieve a given target performance, returns
- max request rates
- performance metrics at min of max request rates
- achieved values of targets
func (*QueueAnalyzer) String ¶
func (qa *QueueAnalyzer) String() string
type QueueModel ¶
type QueueModel struct {
ComputeRho func() float32 // compute utilization of queueing model
GetRhoMax func() float32 // compute the maximum utilization of queueing model
// contains filtered or unexported fields
}
Basic Queueing Model (Abstract Class)
func (*QueueModel) GetAvgNumInSystem ¶
func (m *QueueModel) GetAvgNumInSystem() float32
func (*QueueModel) GetAvgQueueLength ¶
func (m *QueueModel) GetAvgQueueLength() float32
func (*QueueModel) GetAvgRespTime ¶
func (m *QueueModel) GetAvgRespTime() float32
func (*QueueModel) GetAvgServTime ¶
func (m *QueueModel) GetAvgServTime() float32
func (*QueueModel) GetAvgWaitTime ¶
func (m *QueueModel) GetAvgWaitTime() float32
func (*QueueModel) GetLambda ¶
func (m *QueueModel) GetLambda() float32
func (*QueueModel) GetMu ¶
func (m *QueueModel) GetMu() float32
func (*QueueModel) GetRho ¶
func (m *QueueModel) GetRho() float32
func (*QueueModel) IsValid ¶
func (m *QueueModel) IsValid() bool
func (*QueueModel) Solve ¶
func (m *QueueModel) Solve(lambda float32, mu float32)
Solve queueing model given arrival and service rates
func (*QueueModel) String ¶
func (m *QueueModel) String() string
type RateRange ¶
type RateRange struct {
Min float32 // lowest rate (slightly larger than zero)
Max float32 // highest rate (slightly less than maximum service rate)
}
range of request rates (requests/sec)
type RequestSize ¶
type RequestSize struct {
AvgInputTokens int // average number of input tokens per request
AvgOutputTokens int // average number of output tokens per request
}
request tokens data
func (*RequestSize) String ¶
func (rq *RequestSize) String() string
type ServiceParms ¶
type ServiceParms struct {
Prefill *PrefillParms // parameters to calculate prefill time
Decode *DecodeParms // parameters to calculate decode time
}
request processing parameters
func (*ServiceParms) String ¶
func (sp *ServiceParms) String() string
type TargetPerf ¶
type TargetPerf struct {
TargetTTFT float32 // target time to first token (queueing + prefill) (msec)
TargetITL float32 // target inter-token latency (msec)
TargetTPS float32 // target token generation throughput (tokens/sec)
}
queue performance targets
func (*TargetPerf) String ¶
func (tp *TargetPerf) String() string
type TargetRate ¶
type TargetRate struct {
RateTargetTTFT float32 // max request rate for target TTFT (requests/sec)
RateTargetITL float32 // max request rate for target ITL (requests/sec)
RateTargetTPS float32 // max request rate for target TPS (requests/sec)
}
queue max request rates to achieve performance targets
func (*TargetRate) String ¶
func (tr *TargetRate) String() string