Documentation
¶
Index ¶
- Variables
- func Node(options NodeOptions) *corev1.Node
- func SetTestModeCompactionPeriod()
- func UpdateK8SNodeSelectorHash(ctx context.Context, k8sClient client.Client, node *corev1.Node, hash string) error
- type FakeNodeClaimReconciler
- type GPUNodeClaimReconciler
- type GPUNodeClassReconciler
- type GPUNodeReconciler
- type GPUPoolCompactionReconciler
- type GPUPoolReconciler
- type GPUReconciler
- type GPUResourceQuotaReconciler
- type NodeOptions
- type NodeReconciler
- type PodReconciler
- type SchedulingConfigTemplateReconciler
- type TensorFusionClusterReconciler
- type TensorFusionConnectionReconciler
- type TensorFusionWorkloadReconciler
- type WorkloadProfileReconciler
Constants ¶
This section is empty.
Variables ¶
var (
	// Kill switch to avoid creating too many cloud vendor nodes.
	// Controlled by /api/provision?enable=true/false
	ProvisioningToggle = true

	// Nodes being created; the next round of capacity checks should consider the assumed resources.
	// Map key is pool name, second level is GPUClaim name.
	PendingGPUNodeClaim map[string]map[string]tfv1.Resource

	// Nodes being deleted; deletion must be serialized, one round at a time.
	// Map key is pool name, value is GPUNode name list.
	PendingDeletionGPUNodes map[string][]string
)
Functions ¶
func SetTestModeCompactionPeriod ¶ added in v1.41.0
func SetTestModeCompactionPeriod()
Types ¶
type FakeNodeClaimReconciler ¶ added in v1.40.0
type FakeNodeClaimReconciler struct {
Scheme *runtime.Scheme
// contains filtered or unexported fields
}
func (*FakeNodeClaimReconciler) SetupWithManager ¶ added in v1.40.0
func (r *FakeNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error
type GPUNodeClaimReconciler ¶ added in v1.41.0
type GPUNodeClaimReconciler struct {
client.Client
Expander *expander.NodeExpander
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
GPUNodeClaimReconciler reconciles a GPUNodeClaim object
func (*GPUNodeClaimReconciler) Reconcile ¶ added in v1.41.0
func (r *GPUNodeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
GPUNodeClaim is responsible for creating cloud vendor GPU nodes
func (*GPUNodeClaimReconciler) SetupWithManager ¶ added in v1.41.0
func (r *GPUNodeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUNodeClassReconciler ¶
GPUNodeClassReconciler reconciles a GPUNodeClass object
func (*GPUNodeClassReconciler) Reconcile ¶
func (r *GPUNodeClassReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile GPU node classes
func (*GPUNodeClassReconciler) SetupWithManager ¶
func (r *GPUNodeClassReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUNodeReconciler ¶
type GPUNodeReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Allocator *gpuallocator.GpuAllocator
Expander *expander.NodeExpander
}
GPUNodeReconciler reconciles a GPUNode object
func (*GPUNodeReconciler) SetupWithManager ¶
func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUPoolCompactionReconciler ¶
type GPUPoolCompactionReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Allocator *gpuallocator.GpuAllocator
// contains filtered or unexported fields
}
GPUPoolCompactionReconciler reconciles a GPUPool object (pool compaction)
func (*GPUPoolCompactionReconciler) SetupWithManager ¶
func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type GPUPoolReconciler ¶
type GPUPoolReconciler struct {
client.Client
LastProcessedItems sync.Map
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
GPUPoolReconciler reconciles a GPUPool object
func (*GPUPoolReconciler) SetupWithManager ¶
func (r *GPUPoolReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error
SetupWithManager sets up the controller with the Manager.
type GPUReconciler ¶
GPUReconciler reconciles a GPU object
func (*GPUReconciler) Reconcile ¶
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.
func (*GPUReconciler) SetupWithManager ¶
SetupWithManager sets up the controller with the Manager.
type GPUResourceQuotaReconciler ¶ added in v1.34.6
type GPUResourceQuotaReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
QuotaStore *quota.QuotaStore
}
GPUResourceQuotaReconciler reconciles a GPUResourceQuota object
func (*GPUResourceQuotaReconciler) Reconcile ¶ added in v1.34.6
func (r *GPUResourceQuotaReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.
func (*GPUResourceQuotaReconciler) SetupWithManager ¶ added in v1.34.6
func (r *GPUResourceQuotaReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type NodeOptions ¶ added in v1.40.0
type NodeOptions struct {
metav1.ObjectMeta
ReadyStatus corev1.ConditionStatus
ReadyReason string
Conditions []corev1.NodeCondition
Unschedulable bool
ProviderID string
Taints []corev1.Taint
Allocatable corev1.ResourceList
Capacity corev1.ResourceList
OwnerReference []metav1.OwnerReference
}
type NodeReconciler ¶
NodeReconciler reconciles a Node object
func (*NodeReconciler) SetupWithManager ¶
func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type PodReconciler ¶
type PodReconciler struct {
client.Client
Scheme *runtime.Scheme
Allocator *gpuallocator.GpuAllocator
PortAllocator *portallocator.PortAllocator
Expander *expander.NodeExpander
}
PodReconciler reconciles a Pod object
func (*PodReconciler) Reconcile ¶
Add a GPU connection for Pods using GPU. The TensorFusion connection has to be created here because the pod UID is not available in the MutatingWebhook.
func (*PodReconciler) SetupWithManager ¶
func (r *PodReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type SchedulingConfigTemplateReconciler ¶
SchedulingConfigTemplateReconciler reconciles a SchedulingConfigTemplate object
func (*SchedulingConfigTemplateReconciler) Reconcile ¶
func (r *SchedulingConfigTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
When deleted, check whether any GPU pool is using this template; if so, add a warning event and requeue. When updated, trigger re-scheduling.
func (*SchedulingConfigTemplateReconciler) SetupWithManager ¶
func (r *SchedulingConfigTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionClusterReconciler ¶
type TensorFusionClusterReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
MetricsRecorder *metrics.MetricsRecorder
LastProcessedItems sync.Map
}
TensorFusionClusterReconciler reconciles a TensorFusionCluster object
func (*TensorFusionClusterReconciler) Reconcile ¶
func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile a TensorFusionCluster object, create and monitor GPU Pool, managing cluster level component versions
func (*TensorFusionClusterReconciler) SetupWithManager ¶
func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager, addLimiter bool) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionConnectionReconciler ¶
type TensorFusionConnectionReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
TensorFusionConnectionReconciler reconciles a TensorFusionConnection object
func (*TensorFusionConnectionReconciler) Reconcile ¶
func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Add and monitor GPU worker Pod for a TensorFusionConnection
func (*TensorFusionConnectionReconciler) SetupWithManager ¶
func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type TensorFusionWorkloadReconciler ¶
type TensorFusionWorkloadReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
PortAllocator *portallocator.PortAllocator
}
TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object
func (*TensorFusionWorkloadReconciler) Reconcile ¶
func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
TensorFusionWorkload Reconciler
func (*TensorFusionWorkloadReconciler) SetupWithManager ¶
func (r *TensorFusionWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
type WorkloadProfileReconciler ¶ added in v1.23.7
WorkloadProfileReconciler reconciles a WorkloadProfile object
func (*WorkloadProfileReconciler) Reconcile ¶ added in v1.23.7
func (r *WorkloadProfileReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
WorkloadProfile is a template referenced by TensorFusionWorkload; there is no reconcile logic.
func (*WorkloadProfileReconciler) SetupWithManager ¶ added in v1.23.7
func (r *WorkloadProfileReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the controller with the Manager.
Source Files
¶
- fake_node_claim_contoller.go
- gpu_controller.go
- gpunode_controller.go
- gpunodeclaim_controller.go
- gpunodeclass_controller.go
- gpupool_compaction_controller.go
- gpupool_controller.go
- gpupool_node_provision.go
- gpuresourcequota_controller.go
- node_controller.go
- pod_controller.go
- schedulingconfigtemplate_controller.go
- tensorfusioncluster_controller.go
- tensorfusionconnection_controller.go
- tensorfusionworkload_controller.go
- workloadprofile_controller.go