config

package

Version: v1.57.2 (latest) | Published: Jan 29, 2026 | License: Apache-2.0 | Imports: 16 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/NexusGPU/tensor-fusion

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func GetGPUOperatorNamespace() string
func MockGpuInfo() *[]GpuInfo
func SetGlobalConfig(config *GlobalConfig)
type Alert
type AlertRule
- func (r *AlertRule) AddFiringAlertAndCheckResolved(alertQueryResult map[string]interface{}) (*PostableAlert, bool, string)
- func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert
- func (r *AlertRule) IsTestMode() bool
- func (r *AlertRule) String() string
type AutoMigrationConfig
type AutoMigrationRules
type AutoMigrationScope
type FiringAlertCache
type GPUFitConfig
type GPUNetworkTopologyAwareConfig
type GlobalConfig
- func GetGlobalConfig() *GlobalConfig
- func MockGlobalConfig() *GlobalConfig
type GpuInfo
type LabelSet
type PostableAlert
- func CreateAlertData(name, summary, description string, labels LabelSet, annotations LabelSet, ...) PostableAlert

Constants ¶

View Source

// Metrics format identifiers.
const (
	// MetricsFormatInflux is the default format, used for fast greptimedb
	// ingestion.
	// See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/
	MetricsFormatInflux = "influx"

	// MetricsFormatJson is the JSON format with { "measure", "tag", "field", "ts"}.
	MetricsFormatJson = "json"

	// MetricsFormatOTel is the OpenTelemetry format.
	MetricsFormatOTel = "otel"
)

// DefaultGPUOperatorNamespace is the default GPU operator namespace.
const DefaultGPUOperatorNamespace = "gpu-operator"

Variables ¶

View Source

// MockDeployment is a sample Deployment named "pytorch-example" in the
// "tensor-fusion" namespace. It has one replica, and its pod template carries
// tensor-fusion.ai/* labels and annotations plus a single "python" container
// (the same name referenced by the inject-container annotation).
// NOTE(review): presumably a fixture for tests, like the other Mock* values in
// this package — confirm against callers.
var MockDeployment = &appsv1.Deployment{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "pytorch-example",
		Namespace: "tensor-fusion",
		Labels: map[string]string{
			"app":                      "pytorch-example",
			"tensor-fusion.ai/enabled": "true",
		},
	},
	Spec: appsv1.DeploymentSpec{
		Replicas: ptr.To[int32](1),
		Selector: &metav1.LabelSelector{
			MatchLabels: map[string]string{
				"app": "pytorch-example",
			},
		},
		Template: corev1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"app":                      "pytorch-example",
					"tensor-fusion.ai/enabled": "true",
				},
				Annotations: map[string]string{
					"tensor-fusion.ai/generate-workload": "true",
					"tensor-fusion.ai/gpupool":           "mock",
					"tensor-fusion.ai/inject-container":  "python",
					"tensor-fusion.ai/replicas":          "1",
					"tensor-fusion.ai/tflops-limit":      "10",
					"tensor-fusion.ai/tflops-request":    "10",
					"tensor-fusion.ai/vram-limit":        "1Gi",
					"tensor-fusion.ai/vram-request":      "1Gi",
					"tensor-fusion.ai/workload":          "pytorch-example",
				},
			},
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{
					// "sh -c" takes the whole script as ONE argument; any extra
					// arguments become $0, $1, ... of the script rather than
					// operands of the command, so "sleep 1d" must be a single
					// string (otherwise a bare `sleep` runs and fails).
					{
						Name:    "python",
						Image:   "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime",
						Command: []string{"sh", "-c", "sleep 1d"},
					},
				},
			},
		},
	},
}

View Source

// MockGPUPoolSpec is a GPUPoolSpec populated with placeholder capacity, node
// manager, component and QoS settings. It is intended for unit testing.
//
// The embedded pod templates and patches are JSON-encoded while this
// package-level variable is initialized; lo.Must panics if encoding fails.
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{
	// Capacity: oversell ratio plus min / max / warm resource amounts.
	CapacityConfig: &tfv1.CapacityConfig{
		Oversubscription: &tfv1.Oversubscription{
			TFlopsOversellRatio: 2000,
		},
		MinResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("100"),
			VRAM:   resource.MustParse("10Gi"),
		},
		MaxResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("3000"),
			VRAM:   resource.MustParse("3000Gi"),
		},
		WarmResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("2200"),
			VRAM:   resource.MustParse("2020Gi"),
		},
	},
	// Node management: selects nodes whose "mock-label" is "true" and sets a
	// rolling update policy with an empty maintenance window.
	NodeManagerConfig: &tfv1.NodeManagerConfig{
		NodeSelector: &corev1.NodeSelector{
			NodeSelectorTerms: []corev1.NodeSelectorTerm{
				{
					MatchExpressions: []corev1.NodeSelectorRequirement{
						{
							Key:      "mock-label",
							Operator: "In",
							Values:   []string{"true"},
						},
					},
				},
			},
		},
		NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{
			BatchPercentage:   25,
			BatchInterval:     "10m",
			MaxDuration:       "10m",
			MaintenanceWindow: tfv1.MaintenanceWindow{},
		},
		ProvisioningMode: tfv1.ProvisioningModeAutoSelect,
	},
	// Components: every container below uses busybox and just sleeps.
	ComponentConfig: &tfv1.ComponentConfig{
		// Hypervisor pod template: one emptyDir volume mounted at /mock.
		Hypervisor: &tfv1.HypervisorConfig{
			Image:       "hypervisor",
			VectorImage: "vector",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								Volumes: []corev1.Volume{
									{
										Name: "mock-volume",
										VolumeSource: corev1.VolumeSource{
											EmptyDir: &corev1.EmptyDirVolumeSource{},
										},
									},
								},
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-hypervisor",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
										VolumeMounts: []corev1.VolumeMount{
											{
												Name:      "mock-volume",
												MountPath: "/mock",
											},
										},
									},
								},
							},
						},
					},
				)),
			},
		},
		// Node discovery pod template: restart on failure, zero grace period.
		NodeDiscovery: &tfv1.NodeDiscoveryConfig{
			Image: "node-discovery",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								RestartPolicy:                 corev1.RestartPolicyOnFailure,
								TerminationGracePeriodSeconds: ptr.To[int64](0),
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-node-discovery",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
									},
								},
							},
						},
					},
				)),
			},
		},
		// Worker pod template: zero grace period.
		Worker: &tfv1.WorkerConfig{
			Image: "worker",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								TerminationGracePeriodSeconds: ptr.To[int64](0),
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-worker",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
									},
								},
							},
						},
					},
				)),
			},
		},
		// Client: images, operator endpoint, and raw JSON patches. The pod
		// patch adds an "inject-lib" init container; the container patch sets
		// the LD_PRELOAD environment variable.
		Client: &tfv1.ClientConfig{
			RemoteModeImage:   "client",
			EmbeddedModeImage: "ngpu",
			OperatorEndpoint:  "http://localhost:8080",
			PatchToPod: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"spec": map[string]any{
						"initContainers": []corev1.Container{
							{
								Name:  "inject-lib",
								Image: "busybox:stable-glibc",
							},
						},
					},
				})),
			},
			PatchToContainer: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"env": []corev1.EnvVar{
						{
							Name:  "LD_PRELOAD",
							Value: "tensorfusion.so",
						},
					},
				})),
			},
		},
	},
	// QoS: medium and high levels are defined, medium is the default, and
	// each level has its own pricing entry.
	QosConfig: &tfv1.QosConfig{
		Definitions: []tfv1.QosDefinition{
			{
				Name: constants.QoSLevelMedium,
			},
			{
				Name: constants.QoSLevelHigh,
			},
		},
		DefaultQoS: constants.QoSLevelMedium,
		Pricing: []tfv1.QosPricing{
			{
				Qos: constants.QoSLevelMedium,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.5",
			},
			{
				Qos: constants.QoSLevelHigh,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.8",
			},
		},
	},
}

MockGPUPoolSpec is a sample GPUPoolSpec intended for unit testing.

Functions ¶

func GetGPUOperatorNamespace ¶ added in v1.54.1

func GetGPUOperatorNamespace() string

GetGPUOperatorNamespace returns the configured GPU operator namespace or default value

func MockGpuInfo ¶ added in v1.24.0

func MockGpuInfo() *[]GpuInfo

func SetGlobalConfig ¶ added in v1.41.0

func SetGlobalConfig(config *GlobalConfig)

Types ¶

type Alert ¶ added in v1.37.0

// Alert holds an alert's label set and an optional generator URL.
// GeneratorURL is left out of the JSON encoding when it is empty.
type Alert struct {
	Labels       LabelSet `json:"labels"`
	GeneratorURL string   `json:"generatorURL,omitempty"`
}

type AlertRule ¶ added in v1.37.0

// AlertRule describes a single alerting rule. The tagged fields map to the
// YAML keys given in their struct tags; TestMode and FiringAlerts carry no
// yaml tag.
// NOTE(review): EvaluationInterval is a plain string — presumably a duration
// such as "1m"; confirm the expected format against the code that parses it.
type AlertRule struct {
	Name                string  `yaml:"name"`
	Query               string  `yaml:"query"`
	Threshold           float64 `yaml:"threshold"`
	EvaluationInterval  string  `yaml:"evaluationInterval"`
	ConsecutiveCount    int     `yaml:"consecutiveCount"`
	Severity            string  `yaml:"severity"`
	Summary             string  `yaml:"summary"`
	Description         string  `yaml:"description"`
	RunBookURL          string  `yaml:"runbookURL"`
	AlertTargetInstance string  `yaml:"alertTargetInstance"`

	// TestMode, when set, means the rule does not send alerts to the alert
	// manager.
	TestMode bool

	// FiringAlerts maps a string key to the cached state of a firing alert.
	// NOTE(review): the key is presumably an alert identity/hash — verify
	// against AddFiringAlertAndCheckResolved.
	FiringAlerts map[string]*FiringAlertCache
	// contains filtered or unexported fields
}

func (*AlertRule) AddFiringAlertAndCheckResolved ¶ added in v1.37.0

func (r *AlertRule) AddFiringAlertAndCheckResolved(alertQueryResult map[string]interface{}) (*PostableAlert, bool, string)

func (*AlertRule) CheckAndRemoveFiringAlerts ¶ added in v1.37.0

func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert

func (*AlertRule) IsTestMode ¶ added in v1.37.0

func (r *AlertRule) IsTestMode() bool

func (*AlertRule) String ¶ added in v1.37.0

func (r *AlertRule) String() string

type AutoMigrationConfig ¶ added in v1.49.14

// AutoMigrationConfig is the auto-migration section of the configuration: an
// Enable switch (YAML key "enable") and a Scope pointer (YAML key "scope").
type AutoMigrationConfig struct {
	Enable bool                `yaml:"enable"`
	Scope  *AutoMigrationScope `yaml:"scope"`
}

type AutoMigrationRules ¶ added in v1.49.14

// AutoMigrationRules is a set of selection criteria: explicit namespace
// names, a namespace label selector and a pod label selector.
// NOTE(review): how the three criteria combine (AND vs. OR) is not visible
// here — confirm against the code that evaluates them.
type AutoMigrationRules struct {
	NamespaceNames    []string              `yaml:"namespaceNames"`
	NamespaceSelector *metav1.LabelSelector `yaml:"namespaceSelector"`
	PodSelector       *metav1.LabelSelector `yaml:"podSelector"`
}

type AutoMigrationScope ¶ added in v1.49.14

// AutoMigrationScope pairs the rules to include (YAML key "includes") with
// the rules to exclude (YAML key "excludes").
type AutoMigrationScope struct {
	Includes *AutoMigrationRules `yaml:"includes"`
	Excludes *AutoMigrationRules `yaml:"excludes"`
}

type FiringAlertCache ¶ added in v1.37.0

// FiringAlertCache stores a PostableAlert together with a counter.
// NOTE(review): Count presumably tracks consecutive evaluations (compare
// AlertRule.ConsecutiveCount) — confirm against the AlertRule methods.
type FiringAlertCache struct {
	Alert PostableAlert
	Count int
}

type GPUFitConfig ¶ added in v1.35.0

// GPUFitConfig holds GPU-fit settings: a maximum worker count per node and
// weights for VRAM and TFLOPS, encoded as JSON under the tagged keys.
// NOTE(review): how the two weights are combined is not visible here.
type GPUFitConfig struct {
	MaxWorkerPerNode int `json:"maxWorkerPerNode"`

	VramWeight   float64 `json:"vramWeight"`
	TflopsWeight float64 `json:"tflopsWeight"`
}

type GPUNetworkTopologyAwareConfig ¶ added in v1.35.0

// GPUNetworkTopologyAwareConfig holds the total intranet bandwidth setting.
// NOTE(review): the unit is GBps according to the field name — confirm
// whether that means gigabytes or gigabits per second.
type GPUNetworkTopologyAwareConfig struct {
	TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}

type GlobalConfig ¶ added in v1.34.0

// GlobalConfig is the global configuration, with YAML keys given by the
// struct tags: metrics settings, alert rules, auto-migration settings, the
// auto-scaling interval and the GPU operator namespace.
type GlobalConfig struct {
	// Metrics settings.
	// NOTE(review): MetricsFormat presumably takes one of the MetricsFormat*
	// constants ("influx", "json", "otel") — confirm where it is consumed.
	MetricsTTL            string            `yaml:"metricsTTL"`
	MetricsFormat         string            `yaml:"metricsFormat"`
	MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`

	// Alerting rules and auto-migration settings.
	AlertRules    []AlertRule          `yaml:"alertRules"`
	AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`

	// NOTE(review): AutoScalingInterval is a plain string — presumably a
	// duration; confirm the expected format. GPUOperatorNamespace is
	// presumably what GetGPUOperatorNamespace reads — confirm.
	AutoScalingInterval  string `yaml:"autoScalingInterval"`
	GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
}

func GetGlobalConfig ¶ added in v1.41.0

func GetGlobalConfig() *GlobalConfig

func MockGlobalConfig ¶ added in v1.34.0

func MockGlobalConfig() *GlobalConfig

type GpuInfo ¶

// GpuInfo describes a GPU model: its model and vendor names, cost per hour,
// FP16 TFLOPS as a resource.Quantity, and full model name, encoded as JSON
// under the tagged keys.
// NOTE(review): the currency of CostPerHour is not visible here.
type GpuInfo struct {
	Model         string            `json:"model"`
	Vendor        string            `json:"vendor"`
	CostPerHour   float64           `json:"costPerHour"`
	Fp16TFlops    resource.Quantity `json:"fp16TFlops"`
	FullModelName string            `json:"fullModelName"`
}

type LabelSet ¶ added in v1.37.0

// LabelSet is a set of labels stored as a string-to-string map.
type LabelSet map[string]string

type PostableAlert ¶ added in v1.37.0

// PostableAlert embeds Alert and adds start/end timestamps and annotations.
// Annotations is left out of the JSON encoding when empty.
// NOTE(review): encoding/json's omitempty never omits struct values, so a
// zero StartsAt/EndsAt is still emitted (as "0001-01-01T00:00:00Z"). Use
// omitzero (Go 1.24+) or pointer fields if omission is actually intended.
type PostableAlert struct {
	Alert
	StartsAt    time.Time `json:"startsAt,omitempty"`
	EndsAt      time.Time `json:"endsAt,omitempty"`
	Annotations LabelSet  `json:"annotations,omitempty"`
}

func CreateAlertData ¶ added in v1.37.0

func CreateAlertData(name, summary, description string, labels LabelSet, annotations LabelSet, startsAt time.Time) PostableAlert

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL