config

package
v1.57.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 29, 2026 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// MetricsFormatInflux selects the InfluxDB line protocol format. It is the
	// default format, chosen for fast GreptimeDB ingestion.
	// See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/
	MetricsFormatInflux = "influx"

	// MetricsFormatJson selects a JSON format with the keys
	// { "measure", "tag", "field", "ts" }.
	MetricsFormatJson = "json"

	// MetricsFormatOTel selects the OpenTelemetry format.
	MetricsFormatOTel = "otel"

	// DefaultGPUOperatorNamespace is the default GPU operator namespace.
	DefaultGPUOperatorNamespace = "gpu-operator"
)

Variables

View Source
// MockDeployment is a sample single-replica Deployment named "pytorch-example"
// in the "tensor-fusion" namespace. Both the Deployment and its pod template
// carry the "tensor-fusion.ai/enabled" label, and the pod template carries the
// tensor-fusion.ai annotations (pool, injected container, TFlops/VRAM
// requests and limits, workload name).
var MockDeployment = &appsv1.Deployment{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "pytorch-example",
		Namespace: "tensor-fusion",
		Labels: map[string]string{
			"app":                      "pytorch-example",
			"tensor-fusion.ai/enabled": "true",
		},
	},
	Spec: appsv1.DeploymentSpec{
		Replicas: ptr.To[int32](1),
		Selector: &metav1.LabelSelector{
			MatchLabels: map[string]string{
				"app": "pytorch-example",
			},
		},
		Template: corev1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"app":                      "pytorch-example",
					"tensor-fusion.ai/enabled": "true",
				},
				Annotations: map[string]string{
					"tensor-fusion.ai/generate-workload": "true",
					"tensor-fusion.ai/gpupool":           "mock",
					"tensor-fusion.ai/inject-container":  "python",
					"tensor-fusion.ai/replicas":          "1",
					"tensor-fusion.ai/tflops-limit":      "10",
					"tensor-fusion.ai/tflops-request":    "10",
					"tensor-fusion.ai/vram-limit":        "1Gi",
					"tensor-fusion.ai/vram-request":      "1Gi",
					"tensor-fusion.ai/workload":          "pytorch-example",
				},
			},
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{
					{
						Name:  "python",
						Image: "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime",
						// The whole command must be a single string after "-c":
						// with separate elements, "1d" would become $0 of the
						// shell and sleep would run without an operand and fail.
						Command: []string{"sh", "-c", "sleep 1d"},
					},
				},
			},
		},
	},
}
View Source
// MockGPUPoolSpec is a sample GPU pool spec with capacity, node manager,
// component and QoS configuration filled in with mock values.
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{
	// Capacity bounds of the pool: minimum, maximum and warm resources.
	CapacityConfig: &tfv1.CapacityConfig{
		Oversubscription: &tfv1.Oversubscription{
			TFlopsOversellRatio: 2000,
		},
		MinResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("100"),
			VRAM:   resource.MustParse("10Gi"),
		},
		MaxResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("3000"),
			VRAM:   resource.MustParse("3000Gi"),
		},
		WarmResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("2200"),
			VRAM:   resource.MustParse("2020Gi"),
		},
	},
	NodeManagerConfig: &tfv1.NodeManagerConfig{
		// Selects nodes whose "mock-label" label is "true".
		NodeSelector: &corev1.NodeSelector{
			NodeSelectorTerms: []corev1.NodeSelectorTerm{
				{
					MatchExpressions: []corev1.NodeSelectorRequirement{
						{
							Key:      "mock-label",
							Operator: "In",
							Values:   []string{"true"},
						},
					},
				},
			},
		},
		NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{
			BatchPercentage:   25,
			BatchInterval:     "10m",
			MaxDuration:       "10m",
			MaintenanceWindow: tfv1.MaintenanceWindow{},
		},
		ProvisioningMode: tfv1.ProvisioningModeAutoSelect,
	},
	// Each component's pod template is stored as raw JSON produced by
	// json.Marshal at package initialization.
	// NOTE(review): lo.Must presumably panics if marshaling fails — confirm.
	ComponentConfig: &tfv1.ComponentConfig{
		Hypervisor: &tfv1.HypervisorConfig{
			Image:       "hypervisor",
			VectorImage: "vector",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								Volumes: []corev1.Volume{
									{
										Name: "mock-volume",
										VolumeSource: corev1.VolumeSource{
											EmptyDir: &corev1.EmptyDirVolumeSource{},
										},
									},
								},
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-hypervisor",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
										VolumeMounts: []corev1.VolumeMount{
											{
												Name:      "mock-volume",
												MountPath: "/mock",
											},
										},
									},
								},
							},
						},
					},
				)),
			},
		},
		NodeDiscovery: &tfv1.NodeDiscoveryConfig{
			Image: "node-discovery",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								RestartPolicy:                 corev1.RestartPolicyOnFailure,
								TerminationGracePeriodSeconds: ptr.To[int64](0),
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-node-discovery",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
									},
								},
							},
						},
					},
				)),
			},
		},
		Worker: &tfv1.WorkerConfig{
			Image: "worker",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(
					corev1.PodTemplate{
						Template: corev1.PodTemplateSpec{
							Spec: corev1.PodSpec{
								TerminationGracePeriodSeconds: ptr.To[int64](0),
								Containers: []corev1.Container{
									{
										Name:    "tensorfusion-worker",
										Image:   "busybox:stable-glibc",
										Command: []string{"sleep", "infinity"},
									},
								},
							},
						},
					},
				)),
			},
		},
		Client: &tfv1.ClientConfig{
			RemoteModeImage:   "client",
			EmbeddedModeImage: "ngpu",
			OperatorEndpoint:  "http://localhost:8080",
			// Pod-level patch: adds an "inject-lib" init container.
			PatchToPod: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"spec": map[string]any{
						"initContainers": []corev1.Container{
							{
								Name:  "inject-lib",
								Image: "busybox:stable-glibc",
							},
						},
					},
				})),
			},
			// Container-level patch: sets the LD_PRELOAD environment variable.
			PatchToContainer: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"env": []corev1.EnvVar{
						{
							Name:  "LD_PRELOAD",
							Value: "tensorfusion.so",
						},
					},
				})),
			},
		},
	},
	// Two QoS levels (medium and high) with medium as the default; the pricing
	// entries differ only in LimitsOverRequestsChargingRatio.
	QosConfig: &tfv1.QosConfig{
		Definitions: []tfv1.QosDefinition{
			{
				Name: constants.QoSLevelMedium,
			},
			{
				Name: constants.QoSLevelHigh,
			},
		},
		DefaultQoS: constants.QoSLevelMedium,
		Pricing: []tfv1.QosPricing{
			{
				Qos: constants.QoSLevelMedium,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.5",
			},
			{
				Qos: constants.QoSLevelHigh,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.8",
			},
		},
	},
}

MockGPUPoolSpec is a mock GPU pool spec intended for unit testing.

Functions

func GetGPUOperatorNamespace added in v1.54.1

func GetGPUOperatorNamespace() string

GetGPUOperatorNamespace returns the configured GPU operator namespace, or the default value.

func MockGpuInfo added in v1.24.0

func MockGpuInfo() *[]GpuInfo

func SetGlobalConfig added in v1.41.0

func SetGlobalConfig(config *GlobalConfig)

Types

type Alert added in v1.37.0

// Alert is the JSON-serializable core of an alert: its labels and an optional
// generator URL.
type Alert struct {
	// Labels is the alert's label set.
	Labels LabelSet `json:"labels"`
	// GeneratorURL is omitted from the JSON output when empty.
	GeneratorURL string `json:"generatorURL,omitempty"`
}

type AlertRule added in v1.37.0

// AlertRule describes a single alert rule. The tagged fields are loaded from
// YAML; TestMode and FiringAlerts carry no YAML tag.
type AlertRule struct {
	Name                string  `yaml:"name"`
	Query               string  `yaml:"query"`
	Threshold           float64 `yaml:"threshold"`
	EvaluationInterval  string  `yaml:"evaluationInterval"`
	ConsecutiveCount    int     `yaml:"consecutiveCount"`
	Severity            string  `yaml:"severity"`
	Summary             string  `yaml:"summary"`
	Description         string  `yaml:"description"`
	RunBookURL          string  `yaml:"runbookURL"`
	AlertTargetInstance string  `yaml:"alertTargetInstance"`

	// when the rule is in test mode, it will not send alerts to alert manager
	TestMode bool

	// FiringAlerts maps a string key to the cached firing alert and its count.
	// NOTE(review): the key format is not visible here — confirm against
	// AddFiringAlertAndCheckResolved.
	FiringAlerts map[string]*FiringAlertCache
	// contains filtered or unexported fields
}

func (*AlertRule) AddFiringAlertAndCheckResolved added in v1.37.0

func (r *AlertRule) AddFiringAlertAndCheckResolved(alertQueryResult map[string]interface{}) (*PostableAlert, bool, string)

func (*AlertRule) CheckAndRemoveFiringAlerts added in v1.37.0

func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert

func (*AlertRule) IsTestMode added in v1.37.0

func (r *AlertRule) IsTestMode() bool

func (*AlertRule) String added in v1.37.0

func (r *AlertRule) String() string

type AutoMigrationConfig added in v1.49.14

// AutoMigrationConfig is the YAML configuration for auto migration: an enable
// switch plus an optional scope.
type AutoMigrationConfig struct {
	Enable bool                `yaml:"enable"`
	Scope  *AutoMigrationScope `yaml:"scope"`
}

type AutoMigrationRules added in v1.49.14

// AutoMigrationRules is a set of selection rules expressed as explicit
// namespace names, a namespace label selector and a pod label selector.
// NOTE(review): how the three criteria combine (AND vs OR) is not visible
// here — confirm against the code that evaluates them.
type AutoMigrationRules struct {
	NamespaceNames    []string              `yaml:"namespaceNames"`
	NamespaceSelector *metav1.LabelSelector `yaml:"namespaceSelector"`
	PodSelector       *metav1.LabelSelector `yaml:"podSelector"`
}

type AutoMigrationScope added in v1.49.14

// AutoMigrationScope pairs an include rule set with an exclude rule set.
// NOTE(review): precedence between Includes and Excludes is not visible here.
type AutoMigrationScope struct {
	Includes *AutoMigrationRules `yaml:"includes"`
	Excludes *AutoMigrationRules `yaml:"excludes"`
}

type FiringAlertCache added in v1.37.0

// FiringAlertCache holds a PostableAlert together with a counter; it is the
// value type of AlertRule.FiringAlerts.
// NOTE(review): Count presumably tracks consecutive firings — confirm.
type FiringAlertCache struct {
	Alert PostableAlert
	Count int
}

type GPUFitConfig added in v1.35.0

// GPUFitConfig is the JSON configuration holding a per-node worker cap and
// the VRAM and TFlops weights.
// NOTE(review): how the weights are applied is not visible here — confirm
// against the consumer.
type GPUFitConfig struct {
	MaxWorkerPerNode int `json:"maxWorkerPerNode"`

	VramWeight   float64 `json:"vramWeight"`
	TflopsWeight float64 `json:"tflopsWeight"`
}

type GPUNetworkTopologyAwareConfig added in v1.35.0

// GPUNetworkTopologyAwareConfig is the JSON configuration carrying the total
// intranet bandwidth (GBps, per the field name).
type GPUNetworkTopologyAwareConfig struct {
	TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}

type GlobalConfig added in v1.34.0

// GlobalConfig is the YAML-loaded global configuration: metrics settings,
// alert rules, auto-migration settings, the auto-scaling interval and the GPU
// operator namespace.
type GlobalConfig struct {
	MetricsTTL string `yaml:"metricsTTL"`
	// MetricsFormat is presumably one of the MetricsFormat* constants —
	// TODO confirm against the consumer.
	MetricsFormat         string            `yaml:"metricsFormat"`
	MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`

	AlertRules    []AlertRule          `yaml:"alertRules"`
	AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`

	AutoScalingInterval  string `yaml:"autoScalingInterval"`
	GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
}

func GetGlobalConfig added in v1.41.0

func GetGlobalConfig() *GlobalConfig

func MockGlobalConfig added in v1.34.0

func MockGlobalConfig() *GlobalConfig

type GpuInfo

// GpuInfo describes a GPU model as JSON: model and vendor names, hourly cost,
// FP16 TFlops as a resource.Quantity, and the full model name.
// NOTE(review): the currency/unit of CostPerHour is not visible here.
type GpuInfo struct {
	Model         string            `json:"model"`
	Vendor        string            `json:"vendor"`
	CostPerHour   float64           `json:"costPerHour"`
	Fp16TFlops    resource.Quantity `json:"fp16TFlops"`
	FullModelName string            `json:"fullModelName"`
}

type LabelSet added in v1.37.0

// LabelSet is a set of string key/value pairs, used for alert labels and
// annotations.
type LabelSet map[string]string

type PostableAlert added in v1.37.0

// PostableAlert is an Alert extended with its start/end times and
// annotations, in the JSON shape used when posting alerts.
type PostableAlert struct {
	Alert
	// omitempty never omits a struct value such as time.Time, so a zero time
	// was always serialized as "0001-01-01T00:00:00Z". omitzero (Go 1.24+)
	// omits the zero time; older toolchains ignore the unknown option and
	// keep emitting the field as before.
	StartsAt    time.Time `json:"startsAt,omitzero"`
	EndsAt      time.Time `json:"endsAt,omitzero"`
	Annotations LabelSet  `json:"annotations,omitempty"`
}

func CreateAlertData added in v1.37.0

func CreateAlertData(name, summary, description string, labels LabelSet, annotations LabelSet, startsAt time.Time) PostableAlert

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL