Documentation
¶
Index ¶
- Constants
- Variables
- func GetGPUOperatorNamespace() string
- func MockGpuInfo() *[]GpuInfo
- func SetGlobalConfig(config *GlobalConfig)
- type Alert
- type AlertRule
- type AutoMigrationConfig
- type AutoMigrationRules
- type AutoMigrationScope
- type FiringAlertCache
- type GPUFitConfig
- type GPUNetworkTopologyAwareConfig
- type GlobalConfig
- type GpuInfo
- type LabelSet
- type PartitionTemplateInfo
- type PostableAlert
Constants ¶
View Source
const ( // Default format for fast greptimedb ingestion // See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/ MetricsFormatInflux = "influx" // Json format with { "measure", "tag", "field", "ts"} MetricsFormatJson = "json" // Open telemetry format MetricsFormatOTel = "otel" // Default GPU operator namespace DefaultGPUOperatorNamespace = "gpu-operator" )
View Source
const ( IsolationGroupSharingExclusive = "exclusive" )
IsolationGroupSharing constants
Variables ¶
View Source
var MockDeployment = &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "pytorch-example", Namespace: "tensor-fusion", Labels: map[string]string{ "app": "pytorch-example", "tensor-fusion.ai/enabled": "true", }, }, Spec: appsv1.DeploymentSpec{ Replicas: ptr.To[int32](1), Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "app": "pytorch-example", }, }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "app": "pytorch-example", "tensor-fusion.ai/enabled": "true", }, Annotations: map[string]string{ "tensor-fusion.ai/generate-workload": "true", "tensor-fusion.ai/gpupool": "mock", "tensor-fusion.ai/inject-container": "python", "tensor-fusion.ai/replicas": "1", "tensor-fusion.ai/tflops-limit": "10", "tensor-fusion.ai/tflops-request": "10", "tensor-fusion.ai/vram-limit": "1Gi", "tensor-fusion.ai/vram-request": "1Gi", "tensor-fusion.ai/workload": "pytorch-example", }, }, Spec: corev1.PodSpec{ Containers: []corev1.Container{ { Name: "python", Image: "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime", Command: []string{"sh", "-c", "sleep", "1d"}, }, }, }, }, }, }
View Source
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{ CapacityConfig: &tfv1.CapacityConfig{ Oversubscription: &tfv1.Oversubscription{ TFlopsOversellRatio: 2000, }, MinResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("100"), VRAM: resource.MustParse("10Gi"), }, MaxResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("3000"), VRAM: resource.MustParse("3000Gi"), }, WarmResources: &tfv1.GPUOrCPUResourceUnit{ TFlops: resource.MustParse("2200"), VRAM: resource.MustParse("2020Gi"), }, }, NodeManagerConfig: &tfv1.NodeManagerConfig{ DefaultVendor: constants.AcceleratorVendorNvidia, NodeSelector: &corev1.NodeSelector{ NodeSelectorTerms: []corev1.NodeSelectorTerm{ { MatchExpressions: []corev1.NodeSelectorRequirement{ { Key: "mock-label", Operator: "In", Values: []string{"true"}, }, }, }, }, }, NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{ BatchPercentage: 25, BatchInterval: "10m", MaxDuration: "10m", MaintenanceWindow: tfv1.MaintenanceWindow{}, }, ProvisioningMode: tfv1.ProvisioningModeAutoSelect, }, ComponentConfig: &tfv1.ComponentConfig{ Hypervisor: &tfv1.HypervisorConfig{ Image: "hypervisor", VectorImage: "vector", PodTemplate: &runtime.RawExtension{ Raw: lo.Must(json.Marshal( corev1.PodTemplate{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Volumes: []corev1.Volume{ { Name: "mock-volume", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, }, }, Containers: []corev1.Container{ { Name: "tensorfusion-hypervisor", Image: "busybox:stable-glibc", Command: []string{"sleep", "infinity"}, VolumeMounts: []corev1.VolumeMount{ { Name: "mock-volume", MountPath: "/mock", }, }, }, }, }, }, }, )), }, }, Worker: &tfv1.WorkerConfig{ Image: "worker", PodTemplate: &runtime.RawExtension{ Raw: lo.Must(json.Marshal( corev1.PodTemplate{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ TerminationGracePeriodSeconds: ptr.To[int64](0), Containers: []corev1.Container{ { Name: "tensorfusion-worker", Image: 
"busybox:stable-glibc", Command: []string{"sleep", "infinity"}, }, }, }, }, }, )), }, }, Client: &tfv1.ClientConfig{ Image: "client", OperatorEndpoint: "http://localhost:8080", PatchToPod: &runtime.RawExtension{ Raw: lo.Must(json.Marshal(map[string]any{ "spec": map[string]any{ "initContainers": []corev1.Container{ { Name: "inject-lib", Image: "busybox:stable-glibc", }, }, }, })), }, PatchToContainer: &runtime.RawExtension{ Raw: lo.Must(json.Marshal(map[string]any{ "env": []corev1.EnvVar{ { Name: "LD_PRELOAD", Value: "tensorfusion.so", }, }, })), }, }, }, QosConfig: &tfv1.QosConfig{ Definitions: []tfv1.QosDefinition{ { Name: constants.QoSLevelMedium, }, { Name: constants.QoSLevelHigh, }, }, DefaultQoS: constants.QoSLevelMedium, Pricing: []tfv1.QosPricing{ { Qos: constants.QoSLevelMedium, Requests: tfv1.GPUResourcePricingUnit{ PerFP16TFlopsPerHour: "2", PerGBOfVRAMPerHour: "1", }, LimitsOverRequestsChargingRatio: "0.5", }, { Qos: constants.QoSLevelHigh, Requests: tfv1.GPUResourcePricingUnit{ PerFP16TFlopsPerHour: "2", PerGBOfVRAMPerHour: "1", }, LimitsOverRequestsChargingRatio: "0.8", }, }, }, }
MockGPUPoolSpec is a mock GPUPool specification intended for unit testing only.
Functions ¶
func GetGPUOperatorNamespace ¶ added in v1.54.1
func GetGPUOperatorNamespace() string
GetGPUOperatorNamespace returns the configured GPU operator namespace or default value
func MockGpuInfo ¶ added in v1.24.0
func MockGpuInfo() *[]GpuInfo
func SetGlobalConfig ¶ added in v1.41.0
func SetGlobalConfig(config *GlobalConfig)
Types ¶
type AlertRule ¶ added in v1.37.0
type AlertRule struct {
Name string `yaml:"name"`
Query string `yaml:"query"`
Threshold float64 `yaml:"threshold"`
EvaluationInterval string `yaml:"evaluationInterval"`
ConsecutiveCount int `yaml:"consecutiveCount"`
Severity string `yaml:"severity"`
Summary string `yaml:"summary"`
Description string `yaml:"description"`
RunBookURL string `yaml:"runbookURL"`
AlertTargetInstance string `yaml:"alertTargetInstance"`
// when the rule is in test mode, it will not send alerts to alert manager
TestMode bool
FiringAlerts map[string]*FiringAlertCache
// contains filtered or unexported fields
}
func (*AlertRule) AddFiringAlertAndCheckResolved ¶ added in v1.37.0
func (*AlertRule) CheckAndRemoveFiringAlerts ¶ added in v1.37.0
func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert
func (*AlertRule) IsTestMode ¶ added in v1.37.0
type AutoMigrationConfig ¶ added in v1.49.14
type AutoMigrationConfig struct {
Enable bool `yaml:"enable"`
Scope *AutoMigrationScope `yaml:"scope"`
}
type AutoMigrationRules ¶ added in v1.49.14
type AutoMigrationRules struct {
NamespaceNames []string `yaml:"namespaceNames"`
NamespaceSelector *metav1.LabelSelector `yaml:"namespaceSelector"`
PodSelector *metav1.LabelSelector `yaml:"podSelector"`
}
type AutoMigrationScope ¶ added in v1.49.14
type AutoMigrationScope struct {
Includes *AutoMigrationRules `yaml:"includes"`
Excludes *AutoMigrationRules `yaml:"excludes"`
}
type FiringAlertCache ¶ added in v1.37.0
type FiringAlertCache struct {
Alert PostableAlert
Count int
}
type GPUFitConfig ¶ added in v1.35.0
type GPUNetworkTopologyAwareConfig ¶ added in v1.35.0
type GPUNetworkTopologyAwareConfig struct {
TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}
type GlobalConfig ¶ added in v1.34.0
type GlobalConfig struct {
MetricsTTL string `yaml:"metricsTTL"`
MetricsFormat string `yaml:"metricsFormat"`
MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`
AlertRules []AlertRule `yaml:"alertRules"`
AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`
AutoScalingInterval string `yaml:"autoScalingInterval"`
GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
}
func GetGlobalConfig ¶ added in v1.41.0
func GetGlobalConfig() *GlobalConfig
func MockGlobalConfig ¶ added in v1.34.0
func MockGlobalConfig() *GlobalConfig
type GpuInfo ¶
type GpuInfo struct {
Model string `json:"model"`
Vendor string `json:"vendor"`
CostPerHour float64 `json:"costPerHour"`
Fp16TFlops resource.Quantity `json:"fp16TFlops"`
FullModelName string `json:"fullModelName"`
// PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
// Only applicable for GPUs that support hardware partitioning
PartitionTemplates []PartitionTemplateInfo `json:"partitionTemplates,omitempty"`
// MaxPartitions is the maximum number of partitions this GPU can support (e.g., 7 for MIG)
MaxPartitions uint32 `json:"maxPartitions,omitempty"`
// MaxPlacementSlots is the maximum number of placement slots this GPU can support (e.g., 8 for NVIDIA MIG)
MaxPlacementSlots uint32 `json:"maxPlacementSlots,omitempty"`
// MaxIsolationGroups is the maximum number of isolation groups (e.g., 4 for Ascend vGroups)
// If not set, defaults to MaxPlacementSlots for backward compatibility
MaxIsolationGroups uint32 `json:"maxIsolationGroups,omitempty"`
// TotalExtendedResources defines the total capacity of extended resources for this GPU
// For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, "VENC": 3, "VDEC": 12, "JPEGD": 16, "JPEGE": 8}
TotalExtendedResources map[string]uint32 `json:"totalExtendedResources,omitempty"`
}
type PartitionTemplateInfo ¶ added in v1.55.0
type PartitionTemplateInfo struct {
// TemplateID is the unique identifier for this partition template (e.g., Profile `19` for 1g.10gb on an A100)
TemplateID string `json:"templateId"`
// Name is the unique identifier (e.g., "1g.24gb", "4g.94gb", "vir01", "vir04")
Name string `json:"name"`
// MemoryGigabytes is the memory allocated to this partition in gigabytes
MemoryGigabytes uint64 `json:"memoryGigabytes"`
// ComputePercent is the percent of sliced GPU (0-100)
ComputePercent float64 `json:"computePercent"`
// Description provides additional information about this template
Description string `json:"description,omitempty"`
// MaxPartition is the maximum number of partitions for this single template, e.g. 1g.10gb+me can only be allocated once
MaxPartition uint32 `json:"maxPartition"`
// PlacementLimit defines the placement limit for this template, represented as a bitmask
// e.g. sudo nvidia-smi mig -i 0 -lgipp
// GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
// GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
// GPU 0 Profile ID 15 Placements: {0,2,4,6}:2
// GPU 0 Profile ID 14 Placements: {0,2,4}:2
// GPU 0 Profile ID 9 Placements: {0,4}:4
// GPU 0 Profile ID 5 Placement : {0}:4
// GPU 0 Profile ID 0 Placement : {0}:8
PlacementLimit []uint32 `json:"placementLimit"`
PlacementOffSet uint32 `json:"placementOffSet"`
// ExtendedResources contains additional resource dimensions beyond compute and memory
// For Ascend NPU: {"AICORE": 1, "AICPU": 1, "VPC": 1, "VENC": 0, "VDEC": 1, "JPEGD": 2, "JPEGE": 1, "PNGD": 0}
// This enables multi-dimensional resource tracking for different accelerator types
ExtendedResources map[string]uint32 `json:"extendedResources,omitempty"`
// IsolationGroupSharing defines how isolation groups (vGroups for Ascend) are handled
// "exclusive" - each partition requires its own isolation group (default for most templates)
// "shared" - multiple partitions of this template can share an isolation group (time-sharing)
// For Ascend vir01: shared (can have 2 vNPUs per vGroup with time-sharing)
// For Ascend vir02/vir04: exclusive (each requires its own vGroup)
IsolationGroupSharing string `json:"isolationGroupSharing,omitempty"`
// MaxPartitionsPerIsolationGroup limits how many partitions of this template can share one group
// Only applicable when IsolationGroupSharing is "shared"
// For Ascend vir01: 2 (two vir01 can share one vGroup)
MaxPartitionsPerIsolationGroup uint32 `json:"maxPartitionsPerIsolationGroup,omitempty"`
// IsolationGroupSlots defines the minimum AI Core slots required by this template's isolation group
// For Ascend: vir01 uses 1 AICORE but vGroup requires min 2 AICORE
// vir02/vir04 use 2/4 AICORE respectively
IsolationGroupSlots uint32 `json:"isolationGroupSlots,omitempty"`
}
PartitionTemplateInfo contains detailed resource information for a partition template
type PostableAlert ¶ added in v1.37.0
type PostableAlert struct {
Alert
StartsAt time.Time `json:"startsAt,omitempty"`
EndsAt time.Time `json:"endsAt,omitempty"`
Annotations LabelSet `json:"annotations,omitempty"`
}
func CreateAlertData ¶ added in v1.37.0
Click to show internal directories.
Click to hide internal directories.