config

package
v1.56.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 23, 2026 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// Default format for fast greptimedb ingestion
	// See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/
	MetricsFormatInflux = "influx"

	// Json format with { "measure", "tag", "field", "ts"}
	MetricsFormatJson = "json"

	// Open telemetry format
	MetricsFormatOTel = "otel"

	// Default GPU operator namespace
	DefaultGPUOperatorNamespace = "gpu-operator"
)
View Source
const (
	IsolationGroupSharingExclusive = "exclusive"
	IsolationGroupSharingShared    = "shared"
)

IsolationGroupSharing constants

Variables

View Source
var MockDeployment = &appsv1.Deployment{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "pytorch-example",
		Namespace: "tensor-fusion",
		Labels: map[string]string{
			"app":                      "pytorch-example",
			"tensor-fusion.ai/enabled": "true",
		},
	},
	Spec: appsv1.DeploymentSpec{
		Replicas: ptr.To[int32](1),
		Selector: &metav1.LabelSelector{
			MatchLabels: map[string]string{
				"app": "pytorch-example",
			},
		},
		Template: corev1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"app":                      "pytorch-example",
					"tensor-fusion.ai/enabled": "true",
				},
				Annotations: map[string]string{
					"tensor-fusion.ai/generate-workload": "true",
					"tensor-fusion.ai/gpupool":           "mock",
					"tensor-fusion.ai/inject-container":  "python",
					"tensor-fusion.ai/replicas":          "1",
					"tensor-fusion.ai/tflops-limit":      "10",
					"tensor-fusion.ai/tflops-request":    "10",
					"tensor-fusion.ai/vram-limit":        "1Gi",
					"tensor-fusion.ai/vram-request":      "1Gi",
					"tensor-fusion.ai/workload":          "pytorch-example",
				},
			},
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{
					{
						Name:    "python",
						Image:   "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime",
						Command: []string{"sh", "-c", "sleep", "1d"},
					},
				},
			},
		},
	},
}
View Source
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{
	CapacityConfig: &tfv1.CapacityConfig{
		Oversubscription: &tfv1.Oversubscription{
			TFlopsOversellRatio: 2000,
		},
		MinResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("100"),
			VRAM:   resource.MustParse("10Gi"),
		},
		MaxResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("3000"),
			VRAM:   resource.MustParse("3000Gi"),
		},
		WarmResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("2200"),
			VRAM:   resource.MustParse("2020Gi"),
		},
	},
	NodeManagerConfig: &tfv1.NodeManagerConfig{
		DefaultVendor: constants.AcceleratorVendorNvidia,
		NodeSelector: &corev1.NodeSelector{
			NodeSelectorTerms: []corev1.NodeSelectorTerm{
				{
					MatchExpressions: []corev1.NodeSelectorRequirement{
						{
							Key:      "mock-label",
							Operator: "In",
							Values:   []string{"true"},
						},
					},
				},
			},
		},
		NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{
			BatchPercentage:   25,
			BatchInterval:     "10m",
			MaxDuration:       "10m",
			MaintenanceWindow: tfv1.MaintenanceWindow{},
		},
		ProvisioningMode: tfv1.ProvisioningModeAutoSelect,
	},
	ComponentConfig: &tfv1.ComponentConfig{
		Hypervisor: &tfv1.HypervisorConfig{
			Image:       "hypervisor",
			VectorImage: "vector",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								Volumes: []corev1.Volume{
									{
										Name: "mock-volume",
										VolumeSource: corev1.VolumeSource{
											EmptyDir: &corev1.EmptyDirVolumeSource{},
										},
									},
								},
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-hypervisor",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
										VolumeMounts: []corev1.VolumeMount{
											{
												Name:      "mock-volume",
												MountPath: "/mock",
											},
										},
									},
								},
							},
						},
					},
				)),
			},
		},
		Worker: &tfv1.WorkerConfig{
			Image: "worker",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								TerminationGracePeriodSeconds: ptr.To[int64](0),
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-worker",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
									},
								},
							},
						},
					},
				)),
			},
		},
		Client: &tfv1.ClientConfig{
			Image:            "client",
			OperatorEndpoint: "http://localhost:8080",
			PatchToPod: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"spec": map[string]any{
						"initContainers": []corev1.Container{
							{
								Name:  "inject-lib",
								Image: "busybox:stable-glibc",
							},
						},
					},
				})),
			},
			PatchToContainer: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"env": []corev1.EnvVar{
						{
							Name:  "LD_PRELOAD",
							Value: "tensorfusion.so",
						},
					},
				})),
			},
		},
	},
	QosConfig: &tfv1.QosConfig{
		Definitions: []tfv1.QosDefinition{
			{
				Name: constants.QoSLevelMedium,
			},
			{
				Name: constants.QoSLevelHigh,
			},
		},
		DefaultQoS: constants.QoSLevelMedium,
		Pricing: []tfv1.QosPricing{
			{
				Qos: constants.QoSLevelMedium,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.5",
			},
			{
				Qos: constants.QoSLevelHigh,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.8",
			},
		},
	},
}

MockGPUPoolSpec is a mock GPUPool spec provided for unit testing.

Functions

func GetGPUOperatorNamespace added in v1.54.1

func GetGPUOperatorNamespace() string

GetGPUOperatorNamespace returns the configured GPU operator namespace or default value

func MockGpuInfo added in v1.24.0

func MockGpuInfo() *[]GpuInfo

func SetGlobalConfig added in v1.41.0

func SetGlobalConfig(config *GlobalConfig)

Types

type Alert added in v1.37.0

type Alert struct {
	Labels       LabelSet `json:"labels"`
	GeneratorURL string   `json:"generatorURL,omitempty"`
}

type AlertRule added in v1.37.0

type AlertRule struct {
	Name                string  `yaml:"name"`
	Query               string  `yaml:"query"`
	Threshold           float64 `yaml:"threshold"`
	EvaluationInterval  string  `yaml:"evaluationInterval"`
	ConsecutiveCount    int     `yaml:"consecutiveCount"`
	Severity            string  `yaml:"severity"`
	Summary             string  `yaml:"summary"`
	Description         string  `yaml:"description"`
	RunBookURL          string  `yaml:"runbookURL"`
	AlertTargetInstance string  `yaml:"alertTargetInstance"`

	// when the rule is in test mode, it will not send alerts to alert manager
	TestMode bool

	FiringAlerts map[string]*FiringAlertCache
	// contains filtered or unexported fields
}

func (*AlertRule) AddFiringAlertAndCheckResolved added in v1.37.0

func (r *AlertRule) AddFiringAlertAndCheckResolved(alertQueryResult map[string]any) (*PostableAlert, bool, string)

func (*AlertRule) CheckAndRemoveFiringAlerts added in v1.37.0

func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert

func (*AlertRule) IsTestMode added in v1.37.0

func (r *AlertRule) IsTestMode() bool

func (*AlertRule) String added in v1.37.0

func (r *AlertRule) String() string

type AutoMigrationConfig added in v1.49.14

type AutoMigrationConfig struct {
	Enable bool                `yaml:"enable"`
	Scope  *AutoMigrationScope `yaml:"scope"`
}

type AutoMigrationRules added in v1.49.14

type AutoMigrationRules struct {
	NamespaceNames    []string              `yaml:"namespaceNames"`
	NamespaceSelector *metav1.LabelSelector `yaml:"namespaceSelector"`
	PodSelector       *metav1.LabelSelector `yaml:"podSelector"`
}

type AutoMigrationScope added in v1.49.14

type AutoMigrationScope struct {
	Includes *AutoMigrationRules `yaml:"includes"`
	Excludes *AutoMigrationRules `yaml:"excludes"`
}

type FiringAlertCache added in v1.37.0

type FiringAlertCache struct {
	Alert PostableAlert
	Count int
}

type GPUFitConfig added in v1.35.0

type GPUFitConfig struct {
	MaxWorkerPerNode int `json:"maxWorkerPerNode"`

	VramWeight   float64 `json:"vramWeight"`
	TflopsWeight float64 `json:"tflopsWeight"`
}

type GPUNetworkTopologyAwareConfig added in v1.35.0

type GPUNetworkTopologyAwareConfig struct {
	TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}

type GlobalConfig added in v1.34.0

type GlobalConfig struct {
	MetricsTTL            string            `yaml:"metricsTTL"`
	MetricsFormat         string            `yaml:"metricsFormat"`
	MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`

	AlertRules    []AlertRule          `yaml:"alertRules"`
	AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`

	AutoScalingInterval  string `yaml:"autoScalingInterval"`
	GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
}

func GetGlobalConfig added in v1.41.0

func GetGlobalConfig() *GlobalConfig

func MockGlobalConfig added in v1.34.0

func MockGlobalConfig() *GlobalConfig

type GpuInfo

type GpuInfo struct {
	Model         string            `json:"model"`
	Vendor        string            `json:"vendor"`
	CostPerHour   float64           `json:"costPerHour"`
	Fp16TFlops    resource.Quantity `json:"fp16TFlops"`
	FullModelName string            `json:"fullModelName"`

	// PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
	// Only applicable for GPUs that support hardware partitioning
	PartitionTemplates []PartitionTemplateInfo `json:"partitionTemplates,omitempty"`

	// MaxPartitions is the maximum number of partitions this GPU can support (e.g., 7 for MIG)
	MaxPartitions uint32 `json:"maxPartitions,omitempty"`

	// MaxPlacementSlots is the maximum number of placement slots this GPU can support (e.g., 8 for NVIDIA MIG)
	MaxPlacementSlots uint32 `json:"maxPlacementSlots,omitempty"`

	// MaxIsolationGroups is the maximum number of isolation groups (e.g., 4 for Ascend vGroups)
	// If not set, defaults to MaxPlacementSlots for backward compatibility
	MaxIsolationGroups uint32 `json:"maxIsolationGroups,omitempty"`

	// TotalExtendedResources defines the total capacity of extended resources for this GPU
	// For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, "VENC": 3, "VDEC": 12, "JPEGD": 16, "JPEGE": 8}
	TotalExtendedResources map[string]uint32 `json:"totalExtendedResources,omitempty"`
}

type LabelSet added in v1.37.0

type LabelSet map[string]string

type PartitionTemplateInfo added in v1.55.0

type PartitionTemplateInfo struct {
	// TemplateID is the unique identifier for this partition template, e.g. Profile `19` for 1g.10gb on A100
	TemplateID string `json:"templateId"`

	// Name is the unique identifier (e.g., "1g.24gb", "4g.94gb", "vir01", "vir04")
	Name string `json:"name"`

	// MemoryGigabytes is the memory allocated to this partition in gigabytes
	MemoryGigabytes uint64 `json:"memoryGigabytes"`

	// ComputePercent is the percent of sliced GPU (0-100)
	ComputePercent float64 `json:"computePercent"`

	// Description provides additional information about this template
	Description string `json:"description,omitempty"`

	// MaxPartition for this single template, e.g. 1g.10gb+me can only be allocated once
	MaxPartition uint32 `json:"maxPartition"`

	// The placement limit for this template, use a bitmask to represent the placement limit
	// e.g. sudo nvidia-smi mig -i 0 -lgipp
	// GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
	// GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
	// GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
	// GPU  0 Profile ID 14 Placements: {0,2,4}:2
	// GPU  0 Profile ID  9 Placements: {0,4}:4
	// GPU  0 Profile ID  5 Placement : {0}:4
	// GPU  0 Profile ID  0 Placement : {0}:8
	PlacementLimit  []uint32 `json:"placementLimit"`
	PlacementOffSet uint32   `json:"placementOffSet"`

	// ExtendedResources contains additional resource dimensions beyond compute and memory
	// For Ascend NPU: {"AICORE": 1, "AICPU": 1, "VPC": 1, "VENC": 0, "VDEC": 1, "JPEGD": 2, "JPEGE": 1, "PNGD": 0}
	// This enables multi-dimensional resource tracking for different accelerator types
	ExtendedResources map[string]uint32 `json:"extendedResources,omitempty"`

	// IsolationGroupSharing defines how isolation groups (vGroups for Ascend) are handled
	// "exclusive" - each partition requires its own isolation group (default for most templates)
	// "shared" - multiple partitions of this template can share an isolation group (time-sharing)
	// For Ascend vir01: shared (can have 2 vNPUs per vGroup with time-sharing)
	// For Ascend vir02/vir04: exclusive (each requires its own vGroup)
	IsolationGroupSharing string `json:"isolationGroupSharing,omitempty"`

	// MaxPartitionsPerIsolationGroup limits how many partitions of this template can share one group
	// Only applicable when IsolationGroupSharing is "shared"
	// For Ascend vir01: 2 (two vir01 can share one vGroup)
	MaxPartitionsPerIsolationGroup uint32 `json:"maxPartitionsPerIsolationGroup,omitempty"`

	// IsolationGroupSlots defines the minimum AI Core slots required by this template's isolation group
	// For Ascend: vir01 uses 1 AICORE but vGroup requires min 2 AICORE
	// vir02/vir04 use 2/4 AICORE respectively
	IsolationGroupSlots uint32 `json:"isolationGroupSlots,omitempty"`
}

PartitionTemplateInfo contains detailed resource information for a partition template

type PostableAlert added in v1.37.0

type PostableAlert struct {
	Alert
	StartsAt    time.Time `json:"startsAt,omitempty"`
	EndsAt      time.Time `json:"endsAt,omitempty"`
	Annotations LabelSet  `json:"annotations,omitempty"`
}

func CreateAlertData added in v1.37.0

func CreateAlertData(name, summary, description string, labels LabelSet, annotations LabelSet, startsAt time.Time) PostableAlert

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL