Documentation
¶
Index ¶
- Constants
- Variables
- func GetGPUOperatorNamespace() string
- func MockGpuInfo() *[]GpuInfo
- func SetGlobalConfig(config *GlobalConfig)
- type Alert
- type AlertRule
- type AutoMigrationConfig
- type AutoMigrationRules
- type AutoMigrationScope
- type FiringAlertCache
- type GPUFitConfig
- type GPUNetworkTopologyAwareConfig
- type GlobalConfig
- type GpuInfo
- type LabelSet
- type PartitionTemplateInfo
- type PostableAlert
Constants ¶
View Source
const ( // Default format for fast greptimedb ingestion // See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/ MetricsFormatInflux = "influx" // Json format with { "measure", "tag", "field", "ts"} MetricsFormatJson = "json" // Open telemetry format MetricsFormatOTel = "otel" // Default GPU operator namespace DefaultGPUOperatorNamespace = "gpu-operator" )
View Source
const ( IsolationGroupSharingExclusive = "exclusive" )
IsolationGroupSharing constants
Variables ¶
View Source
var MockDeployment = &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "pytorch-example", Namespace: "tensor-fusion", Labels: map[string]string{ "app": "pytorch-example", "tensor-fusion.ai/enabled": "true", }, }, Spec: appsv1.DeploymentSpec{ Replicas: ptr.To[int32](1), Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "app": "pytorch-example", }, }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "app": "pytorch-example", "tensor-fusion.ai/enabled": "true", }, Annotations: map[string]string{ "tensor-fusion.ai/generate-workload": "true", "tensor-fusion.ai/gpupool": "mock", "tensor-fusion.ai/inject-container": "python", "tensor-fusion.ai/replicas": "1", "tensor-fusion.ai/tflops-limit": "10", "tensor-fusion.ai/tflops-request": "10", "tensor-fusion.ai/vram-limit": "1Gi", "tensor-fusion.ai/vram-request": "1Gi", "tensor-fusion.ai/workload": "pytorch-example", }, }, Spec: corev1.PodSpec{ Containers: []corev1.Container{ { Name: "python", Image: "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime", Command: []string{"sh", "-c", "sleep", "1d"}, }, }, }, }, }, }
View Source
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{ CapacityConfig: &tfv1.CapacityConfig{ Oversubscription: &tfv1.Oversubscription{ TFlopsOversellRatio: 2000, }, MinResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("100"), VRAM: resource.MustParse("10Gi"), }, MaxResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("3000"), VRAM: resource.MustParse("3000Gi"), }, WarmResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("2200"), VRAM: resource.MustParse("2020Gi"), }, }, NodeManagerConfig: &tfv1.NodeManagerConfig{ DefaultVendor: constants.AcceleratorVendorNvidia, NodeSelector: &corev1.NodeSelector{ NodeSelectorTerms: []corev1.NodeSelectorTerm{ { MatchExpressions: []corev1.NodeSelectorRequirement{ { Key: "mock-label", Operator: "In", Values: []string{"true"}, }, }, }, }, }, NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{ BatchPercentage: 25, BatchInterval: "10m", MaxDuration: "10m", MaintenanceWindow: tfv1.MaintenanceWindow{}, }, ProvisioningMode: tfv1.ProvisioningModeAutoSelect, }, ComponentConfig: &tfv1.ComponentConfig{ Hypervisor: &tfv1.HypervisorConfig{ Image: "hypervisor", VectorImage: "vector", PodTemplate: &runtime.RawExtension{ Raw: lo.Must(json.Marshal( corev1.PodTemplate{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Volumes: []corev1.Volume{ { Name: "mock-volume", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, }, }, Containers: []corev1.Container{ { Name: "tensorfusion-hypervisor", Image: "busybox:stable-glibc", Command: []string{"sleep", "infinity"}, VolumeMounts: []corev1.VolumeMount{ { Name: "mock-volume", MountPath: "/mock", }, }, }, }, }, }, }, )), }, }, Worker: &tfv1.WorkerConfig{ Image: "worker", PodTemplate: &runtime.RawExtension{ Raw: lo.Must(json.Marshal( corev1.PodTemplate{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ TerminationGracePeriodSeconds: ptr.To[int64](0), Containers: []corev1.Container{ { Name: "tensorfusion-worker", Image: 
"busybox:stable-glibc", Command: []string{"sleep", "infinity"}, }, }, }, }, }, )), }, }, Client: &tfv1.ClientConfig{ Image: "client", OperatorEndpoint: "http://localhost:8080", PatchToPod: &runtime.RawExtension{ Raw: lo.Must(json.Marshal(map[string]any{ "spec": map[string]any{ "initContainers": []corev1.Container{ { Name: "inject-lib", Image: "busybox:stable-glibc", }, }, }, })), }, PatchToContainer: &runtime.RawExtension{ Raw: lo.Must(json.Marshal(map[string]any{ "env": []corev1.EnvVar{ { Name: "LD_PRELOAD", Value: "tensorfusion.so", }, }, })), }, }, }, QosConfig: &tfv1.QosConfig{ Definitions: []tfv1.QosDefinition{ { Name: constants.QoSLevelMedium, }, { Name: constants.QoSLevelHigh, }, }, DefaultQoS: constants.QoSLevelMedium, Pricing: []tfv1.QosPricing{ { Qos: constants.QoSLevelMedium, Requests: tfv1.GPUResourcePricingUnit{ PerFP16TFlopsPerHour: "2", PerGBOfVRAMPerHour: "1", }, LimitsOverRequestsChargingRatio: "0.5", }, { Qos: constants.QoSLevelHigh, Requests: tfv1.GPUResourcePricingUnit{ PerFP16TFlopsPerHour: "2", PerGBOfVRAMPerHour: "1", }, LimitsOverRequestsChargingRatio: "0.8", }, }, }, }
MockGPUPoolSpec is a mock GPUPool specification intended for unit testing only.
Functions ¶
func GetGPUOperatorNamespace ¶ added in v1.54.1
func GetGPUOperatorNamespace() string
GetGPUOperatorNamespace returns the configured GPU operator namespace or default value
func MockGpuInfo ¶ added in v1.24.0
func MockGpuInfo() *[]GpuInfo
func SetGlobalConfig ¶ added in v1.41.0
func SetGlobalConfig(config *GlobalConfig)
Types ¶
type AlertRule ¶ added in v1.37.0
type AlertRule struct {
Name string `yaml:"name"`
Query string `yaml:"query"`
Threshold float64 `yaml:"threshold"`
EvaluationInterval string `yaml:"evaluationInterval"`
ConsecutiveCount int `yaml:"consecutiveCount"`
Severity string `yaml:"severity"`
Summary string `yaml:"summary"`
Description string `yaml:"description"`
RunBookURL string `yaml:"runbookURL"`
AlertTargetInstance string `yaml:"alertTargetInstance"`
// when the rule is in test mode, it will not send alerts to alert manager
TestMode bool
FiringAlerts map[string]*FiringAlertCache
// contains filtered or unexported fields
}
func (*AlertRule) AddFiringAlertAndCheckResolved ¶ added in v1.37.0
func (*AlertRule) CheckAndRemoveFiringAlerts ¶ added in v1.37.0
func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert
func (*AlertRule) IsTestMode ¶ added in v1.37.0
type AutoMigrationConfig ¶ added in v1.49.14
type AutoMigrationConfig struct {
Enable bool `yaml:"enable"`
Scope *AutoMigrationScope `yaml:"scope"`
}
type AutoMigrationRules ¶ added in v1.49.14
type AutoMigrationRules struct {
NamespaceNames []string `yaml:"namespaceNames"`
NamespaceSelector *metav1.LabelSelector `yaml:"namespaceSelector"`
PodSelector *metav1.LabelSelector `yaml:"podSelector"`
}
type AutoMigrationScope ¶ added in v1.49.14
type AutoMigrationScope struct {
Includes *AutoMigrationRules `yaml:"includes"`
Excludes *AutoMigrationRules `yaml:"excludes"`
}
type FiringAlertCache ¶ added in v1.37.0
type FiringAlertCache struct {
Alert PostableAlert
Count int
}
type GPUFitConfig ¶ added in v1.35.0
type GPUNetworkTopologyAwareConfig ¶ added in v1.35.0
type GPUNetworkTopologyAwareConfig struct {
TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}
type GlobalConfig ¶ added in v1.34.0
type GlobalConfig struct {
MetricsTTL string `yaml:"metricsTTL"`
MetricsFormat string `yaml:"metricsFormat"`
MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`
AlertRules []AlertRule `yaml:"alertRules"`
AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`
AutoScalingInterval string `yaml:"autoScalingInterval"`
GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
}
func GetGlobalConfig ¶ added in v1.41.0
func GetGlobalConfig() *GlobalConfig
func MockGlobalConfig ¶ added in v1.34.0
func MockGlobalConfig() *GlobalConfig
type GpuInfo ¶
type GpuInfo struct {
Model string `json:"model"`
Vendor string `json:"vendor"`
CostPerHour float64 `json:"costPerHour"`
Fp16TFlops resource.Quantity `json:"fp16TFlops"`
FullModelName string `json:"fullModelName"`
// PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
// Only applicable for GPUs that support hardware partitioning
PartitionTemplates []PartitionTemplateInfo `json:"partitionTemplates,omitempty"`
// MaxPartitions is the maximum number of partitions this GPU can support (e.g., 7 for MIG)
MaxPartitions uint32 `json:"maxPartitions,omitempty"`
// MaxPlacementSlots is the maximum number of placement slots this GPU can support (e.g., 8 for NVIDIA MIG)
MaxPlacementSlots uint32 `json:"maxPlacementSlots,omitempty"`
// MaxIsolationGroups is the maximum number of isolation groups (e.g., 4 for Ascend vGroups)
// If not set, defaults to MaxPlacementSlots for backward compatibility
MaxIsolationGroups uint32 `json:"maxIsolationGroups,omitempty"`
// TotalExtendedResources defines the total capacity of extended resources for this GPU
// For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, "VENC": 3, "VDEC": 12, "JPEGD": 16, "JPEGE": 8}
TotalExtendedResources map[string]uint32 `json:"totalExtendedResources,omitempty"`
}
type PartitionTemplateInfo ¶ added in v1.55.0
type PartitionTemplateInfo struct {
// TemplateID is the unique identifier for this partition template (e.g., Profile `19` for 1g.10gb on an A100)
TemplateID string `json:"templateId"`
// Name is the unique identifier (e.g., "1g.24gb", "4g.94gb", "vir01", "vir04")
Name string `json:"name"`
// MemoryGigabytes is the memory allocated to this partition in gigabytes
MemoryGigabytes uint64 `json:"memoryGigabytes"`
// ComputePercent is the percent of sliced GPU (0-100)
ComputePercent float64 `json:"computePercent"`
// Description provides additional information about this template
Description string `json:"description,omitempty"`
// MaxPartition is the maximum number of partitions for this single template, e.g. 1g.10gb+me can only be allocated once
MaxPartition uint32 `json:"maxPartition"`
// PlacementLimit defines the placement limit for this template, represented as a bitmask
// e.g. sudo nvidia-smi mig -i 0 -lgipp
// GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
// GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
// GPU 0 Profile ID 15 Placements: {0,2,4,6}:2
// GPU 0 Profile ID 14 Placements: {0,2,4}:2
// GPU 0 Profile ID 9 Placements: {0,4}:4
// GPU 0 Profile ID 5 Placement : {0}:4
// GPU 0 Profile ID 0 Placement : {0}:8
PlacementLimit []uint32 `json:"placementLimit"`
PlacementOffSet uint32 `json:"placementOffSet"`
// ExtendedResources contains additional resource dimensions beyond compute and memory
// For Ascend NPU: {"AICORE": 1, "AICPU": 1, "VPC": 1, "VENC": 0, "VDEC": 1, "JPEGD": 2, "JPEGE": 1, "PNGD": 0}
// This enables multi-dimensional resource tracking for different accelerator types
ExtendedResources map[string]uint32 `json:"extendedResources,omitempty"`
// IsolationGroupSharing defines how isolation groups (vGroups for Ascend) are handled
// "exclusive" - each partition requires its own isolation group (default for most templates)
// "shared" - multiple partitions of this template can share an isolation group (time-sharing)
// For Ascend vir01: shared (can have 2 vNPUs per vGroup with time-sharing)
// For Ascend vir02/vir04: exclusive (each requires its own vGroup)
IsolationGroupSharing string `json:"isolationGroupSharing,omitempty"`
// MaxPartitionsPerIsolationGroup limits how many partitions of this template can share one group
// Only applicable when IsolationGroupSharing is "shared"
// For Ascend vir01: 2 (two vir01 can share one vGroup)
MaxPartitionsPerIsolationGroup uint32 `json:"maxPartitionsPerIsolationGroup,omitempty"`
// IsolationGroupSlots defines the minimum AI Core slots required by this template's isolation group
// For Ascend: vir01 uses 1 AICORE but vGroup requires min 2 AICORE
// vir02/vir04 use 2/4 AICORE respectively
IsolationGroupSlots uint32 `json:"isolationGroupSlots,omitempty"`
}
PartitionTemplateInfo contains detailed resource information for a partition template
type PostableAlert ¶ added in v1.37.0
type PostableAlert struct {
Alert
StartsAt time.Time `json:"startsAt,omitempty"`
EndsAt time.Time `json:"endsAt,omitempty"`
Annotations LabelSet `json:"annotations,omitempty"`
}
func CreateAlertData ¶ added in v1.37.0
Click to show internal directories.
Click to hide internal directories.