Documentation
¶
Index ¶
Constants ¶
View Source
// Identifiers for the supported metrics formats.
const (
	// MetricsFormatInflux is the default format for fast GreptimeDB ingestion.
	// See https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/
	MetricsFormatInflux = "influx"

	// MetricsFormatJson is the JSON format with { "measure", "tag", "field", "ts" }.
	MetricsFormatJson = "json"

	// MetricsFormatOTel is the OpenTelemetry format.
	MetricsFormatOTel = "otel"
)
Variables ¶
View Source
// MockDeployment is a Deployment fixture named "pytorch-example" in the
// "tensor-fusion" namespace. It runs a single replica with one container
// ("python") and carries the tensor-fusion.ai/* labels and annotations
// listed below on its pod template.
var MockDeployment = &appsv1.Deployment{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "pytorch-example",
		Namespace: "tensor-fusion",
		Labels: map[string]string{
			"app":                      "pytorch-example",
			"tensor-fusion.ai/enabled": "true",
		},
	},
	Spec: appsv1.DeploymentSpec{
		Replicas: ptr.To[int32](1),
		// The selector matches only on "app"; the pod template below
		// carries that label.
		Selector: &metav1.LabelSelector{
			MatchLabels: map[string]string{
				"app": "pytorch-example",
			},
		},
		Template: corev1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"app":                      "pytorch-example",
					"tensor-fusion.ai/enabled": "true",
				},
				Annotations: map[string]string{
					"tensor-fusion.ai/generate-workload": "true",
					"tensor-fusion.ai/gpupool":           "mock",
					"tensor-fusion.ai/inject-container":  "python",
					"tensor-fusion.ai/replicas":          "1",
					"tensor-fusion.ai/tflops-limit":      "10",
					"tensor-fusion.ai/tflops-request":    "10",
					"tensor-fusion.ai/vram-limit":        "1Gi",
					"tensor-fusion.ai/vram-request":      "1Gi",
					"tensor-fusion.ai/workload":          "pytorch-example",
				},
			},
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{
					{
						Name:  "python",
						Image: "pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime",
						// `sh -c` takes the command as ONE string; any
						// further arguments become $0, $1, ... of that
						// command. The duration therefore has to be part
						// of the command string, otherwise `sleep` is run
						// without an operand and exits immediately.
						Command: []string{"sh", "-c", "sleep 1d"},
					},
				},
			},
		},
	},
}
View Source
// MockGPUPoolSpec is a populated GPUPoolSpec fixture. This is for unit testing.
//
// The pod templates and patches are stored as raw JSON produced by
// json.Marshal at package initialization; each result is wrapped in lo.Must.
var MockGPUPoolSpec = &tfv1.GPUPoolSpec{
	CapacityConfig: &tfv1.CapacityConfig{
		Oversubscription: &tfv1.Oversubscription{TFlopsOversellRatio: 2000},
		MinResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("100"),
			VRAM:   resource.MustParse("10Gi"),
		},
		MaxResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("3000"),
			VRAM:   resource.MustParse("3000Gi"),
		},
		WarmResources: &tfv1.GPUOrCPUResourceUnit{
			TFlops: resource.MustParse("2200"),
			VRAM:   resource.MustParse("2020Gi"),
		},
	},

	NodeManagerConfig: &tfv1.NodeManagerConfig{
		// Selects nodes whose "mock-label" is "true".
		NodeSelector: &corev1.NodeSelector{
			NodeSelectorTerms: []corev1.NodeSelectorTerm{{
				MatchExpressions: []corev1.NodeSelectorRequirement{{
					Key:      "mock-label",
					Operator: "In",
					Values:   []string{"true"},
				}},
			}},
		},
		NodePoolRollingUpdatePolicy: &tfv1.NodeRollingUpdatePolicy{
			BatchPercentage:   25,
			BatchInterval:     "10m",
			MaxDuration:       "10m",
			MaintenanceWindow: tfv1.MaintenanceWindow{},
		},
		ProvisioningMode: tfv1.ProvisioningModeAutoSelect,
	},

	ComponentConfig: &tfv1.ComponentConfig{
		// Hypervisor: one busybox container that mounts an emptyDir volume.
		Hypervisor: &tfv1.HypervisorConfig{
			Image:       "hypervisor",
			VectorImage: "vector",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(corev1.PodTemplate{
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Volumes: []corev1.Volume{{
								Name: "mock-volume",
								VolumeSource: corev1.VolumeSource{
									EmptyDir: &corev1.EmptyDirVolumeSource{},
								},
							}},
							Containers: []corev1.Container{{
								Name:    "tensorfusion-hypervisor",
								Image:   "busybox:stable-glibc",
								Command: []string{"sleep", "infinity"},
								VolumeMounts: []corev1.VolumeMount{{
									Name:      "mock-volume",
									MountPath: "/mock",
								}},
							}},
						},
					},
				})),
			},
		},

		// Node discovery: restart on failure, zero termination grace period.
		NodeDiscovery: &tfv1.NodeDiscoveryConfig{
			Image: "node-discovery",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(corev1.PodTemplate{
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							RestartPolicy:                 corev1.RestartPolicyOnFailure,
							TerminationGracePeriodSeconds: ptr.To(int64(0)),
							Containers: []corev1.Container{{
								Name:    "tensorfusion-node-discovery",
								Image:   "busybox:stable-glibc",
								Command: []string{"sleep", "infinity"},
							}},
						},
					},
				})),
			},
		},

		// Worker: zero termination grace period.
		Worker: &tfv1.WorkerConfig{
			Image: "worker",
			PodTemplate: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(corev1.PodTemplate{
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							TerminationGracePeriodSeconds: ptr.To(int64(0)),
							Containers: []corev1.Container{{
								Name:    "tensorfusion-worker",
								Image:   "busybox:stable-glibc",
								Command: []string{"sleep", "infinity"},
							}},
						},
					},
				})),
			},
		},

		// Client: the pod patch adds an init container, the container patch
		// adds an LD_PRELOAD environment variable.
		Client: &tfv1.ClientConfig{
			RemoteModeImage:   "client",
			EmbeddedModeImage: "ngpu",
			OperatorEndpoint:  "http://localhost:8080",
			PatchToPod: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"spec": map[string]any{
						"initContainers": []corev1.Container{{
							Name:  "inject-lib",
							Image: "busybox:stable-glibc",
						}},
					},
				})),
			},
			PatchToContainer: &runtime.RawExtension{
				Raw: lo.Must(json.Marshal(map[string]any{
					"env": []corev1.EnvVar{{
						Name:  "LD_PRELOAD",
						Value: "tensorfusion.so",
					}},
				})),
			},
		},
	},

	QosConfig: &tfv1.QosConfig{
		Definitions: []tfv1.QosDefinition{
			{Name: constants.QoSLevelMedium},
			{Name: constants.QoSLevelHigh},
		},
		DefaultQoS: constants.QoSLevelMedium,
		// Both levels share the same request prices and differ only in
		// LimitsOverRequestsChargingRatio.
		Pricing: []tfv1.QosPricing{
			{
				Qos: constants.QoSLevelMedium,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.5",
			},
			{
				Qos: constants.QoSLevelHigh,
				Requests: tfv1.GPUResourcePricingUnit{
					PerFP16TFlopsPerHour: "2",
					PerGBOfVRAMPerHour:   "1",
				},
				LimitsOverRequestsChargingRatio: "0.8",
			},
		},
	},
}
MockGPUPoolSpec is a mock GPU pool spec intended for unit testing.
Functions ¶
func MockGpuInfo ¶ added in v1.24.0
func MockGpuInfo() *[]GpuInfo
func SetGlobalConfig ¶ added in v1.41.0
func SetGlobalConfig(config *GlobalConfig)
Types ¶
type AlertRule ¶ added in v1.37.0
// AlertRule is a single alert rule entry. The tagged fields map to the YAML
// keys given in their struct tags; GlobalConfig.AlertRules holds a list of
// these. TestMode and FiringAlerts carry no YAML tag.
type AlertRule struct {
	Name                string  `yaml:"name"`
	Query               string  `yaml:"query"`
	Threshold           float64 `yaml:"threshold"`
	EvaluationInterval  string  `yaml:"evaluationInterval"`
	ConsecutiveCount    int     `yaml:"consecutiveCount"`
	Severity            string  `yaml:"severity"`
	Summary             string  `yaml:"summary"`
	Description         string  `yaml:"description"`
	RunBookURL          string  `yaml:"runbookURL"`
	AlertTargetInstance string  `yaml:"alertTargetInstance"`

	// when the rule is in test mode, it will not send alerts to alert manager
	TestMode bool

	// FiringAlerts is a string-keyed map of FiringAlertCache pointers.
	// NOTE(review): the key's meaning is not visible in this listing —
	// confirm against AddFiringAlertAndCheckResolved.
	FiringAlerts map[string]*FiringAlertCache
	// contains filtered or unexported fields
}
func (*AlertRule) AddFiringAlertAndCheckResolved ¶ added in v1.37.0
func (r *AlertRule) AddFiringAlertAndCheckResolved(alertQueryResult map[string]interface{}) (*PostableAlert, bool, string)
func (*AlertRule) CheckAndRemoveFiringAlerts ¶ added in v1.37.0
func (r *AlertRule) CheckAndRemoveFiringAlerts(firingAlertSet map[string]struct{}) []PostableAlert
func (*AlertRule) IsTestMode ¶ added in v1.37.0
type FiringAlertCache ¶ added in v1.37.0
// FiringAlertCache pairs a PostableAlert with an integer counter. It is the
// value type stored (by pointer) in AlertRule.FiringAlerts.
type FiringAlertCache struct {
	// Alert is the cached alert payload.
	Alert PostableAlert
	// Count is a counter kept alongside the alert.
	// NOTE(review): presumably the number of consecutive evaluations in
	// which the alert fired — confirm against the AlertRule methods.
	Count int
}
type GPUFitConfig ¶ added in v1.35.0
type GPUNetworkTopologyAwareConfig ¶ added in v1.35.0
// GPUNetworkTopologyAwareConfig holds a single setting, serialized under the
// JSON key "totalIntranetBandWidthGBps".
type GPUNetworkTopologyAwareConfig struct {
	// TotalIntranetBandWidthGBps is the total intranet bandwidth.
	// NOTE(review): unit presumably GB/s, judging by the name — confirm.
	TotalIntranetBandWidthGBps int64 `json:"totalIntranetBandWidthGBps"`
}
type GlobalConfig ¶ added in v1.34.0
// GlobalConfig groups metrics settings and the list of alert rules. Every
// field maps to the YAML key given in its struct tag.
type GlobalConfig struct {
	// MetricsTTL is kept as a string.
	// NOTE(review): the expected syntax is not visible here — confirm.
	MetricsTTL string `yaml:"metricsTTL"`
	// MetricsFormat names the metrics format.
	// NOTE(review): presumably one of the MetricsFormat* constants — confirm.
	MetricsFormat string `yaml:"metricsFormat"`
	// MetricsExtraPodLabels is a string-to-string map.
	// NOTE(review): which side is the pod label and which the metric
	// tag is not visible here — confirm.
	MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`
	// AlertRules is the list of alert rules.
	AlertRules []AlertRule `yaml:"alertRules"`
}
func GetGlobalConfig ¶ added in v1.41.0
func GetGlobalConfig() *GlobalConfig
func MockGlobalConfig ¶ added in v1.34.0
func MockGlobalConfig() *GlobalConfig
type PostableAlert ¶ added in v1.37.0
// PostableAlert embeds Alert (declared outside this listing) and adds start
// and end timestamps plus an annotation set, each with a JSON tag.
//
// NOTE(review): encoding/json's "omitempty" never omits a struct value such
// as time.Time, so zero StartsAt/EndsAt are still serialized — confirm that
// consumers accept the zero time.
type PostableAlert struct {
	Alert
	StartsAt    time.Time `json:"startsAt,omitempty"`
	EndsAt      time.Time `json:"endsAt,omitempty"`
	Annotations LabelSet  `json:"annotations,omitempty"`
}
func CreateAlertData ¶ added in v1.37.0
Click to show internal directories.
Click to hide internal directories.