Documentation
¶
Index ¶
- Constants
- Variables
- func ClearArmRestartTracker(dpu *provisioningv1.DPU)
- func CompleteRebooting(ctx context.Context, dpu *provisioningv1.DPU, state *provisioningv1.DPUStatus, ...) provisioningv1.DPUStatus
- func InitializeDPURebootStatus(ctx context.Context, dpu *provisioningv1.DPU, state *provisioningv1.DPUStatus, ...) error
- func MkIso(isoname, volumelabel string, files []IsoRootFile) (string, error)
- func SaveArmRestartTracker(dpu *provisioningv1.DPU, tracker *ArmRestartTracker) error
- func StartRebooting(ctx context.Context, dpu *provisioningv1.DPU, ctrlCtx *ControllerContext) (*provisioningv1.DPUStatus, *provisioningv1.DPUNode, bool, error)
- func UpdateRebootStatus(state *provisioningv1.DPUStatus, phase provisioningv1.RebootStatusPhase, ...)
- type ArmRestartTracker
- type BF4Artifact
- type BlueFieldSpecs
- type CanProceedError
- type CapacityResult
- type ControllerContext
- type DPUArtifactGenerator
- type DPUArtifactRequest
- type DPUID
- type DPUInProvisioningMap
- type DPUOptions
- type IsoRootFile
- type KubeadmBootstrapTokenGenerator
- type NodeJoinCommandGenerator
- type TaskWithRetry
Constants ¶
const (
ErrReasonMaxDPUParallelInstallationsLimitReached = "MaxDPUParallelInstallationsLimitReached"
)
Error reasons for CanProceed
const (
MaxRetryCount = 10
)
const (
// StaleTrackerTimeout defines when a tracker is considered stale (controller restart recovery).
// Must be longer than (MaxSafetyLimit * MinRestartInterval = 10 * 90s = 15min) to allow
// the full restart cycle to complete before declaring the tracker stale.
StaleTrackerTimeout = 20 * time.Minute
)
Variables ¶
var ( PartNumbers = map[string]BlueFieldSpecs{ "06CMW1": {8, "16"}, "0HFWRM": {16, "32"}, "0KK4NR": {8, "16"}, "0NDP41": {16, "32"}, "0WJ9T5": {16, "32"}, "0WN6RF": {16, "32"}, "0X5DXX": {16, "32"}, "0XNWR4": {16, "32"}, "8217991": {16, "32"}, "8225672": {16, "32"}, "900-9D3B4-00CC-EA0": {8, "16"}, "900-9D3B4-00CV-EA0": {8, "16"}, "900-9D3B4-00EN-EA0": {8, "16"}, "900-9D3B4-00PN-EA0": {8, "16"}, "900-9D3B4-00SC-EA0": {8, "16"}, "900-9D3B4-00SV-EA0": {8, "16"}, "900-9D3B6-00CC-EA0": {16, "32"}, "900-9D3B6-00CN-AB0": {16, "32"}, "900-9D3B6-00CN-PA0": {16, "32"}, "900-9D3B6-00CN-PN0": {16, "32"}, "900-9D3B6-00CV-AA0": {16, "32"}, "900-9D3B6-00SC-EA0": {16, "32"}, "900-9D3B6-00SN-AB0": {16, "32"}, "900-9D3B6-00SV-AA0": {16, "32"}, "900-9D3B6-F2SC-EA0": {16, "32"}, "900-9D3B6-F2SV-PA0": {16, "32"}, "900-9D3B6-H1CN-AB0": {16, "32"}, "900-9D3B6-H1CN-AB1": {16, "32"}, "900-9D3C6-00CV-DA0": {16, "48"}, "900-9D3C6-00SV-DA0": {16, "48"}, "900-9D3C6-B9SV-DA0": {16, "48"}, "900-9D3D4-00EN-HA0": {8, "16"}, "900-9D3D4-00NN-HA0": {8, "16"}, "900-9D3D4-00NN-HAS": {8, "16"}, "900-9D3D4-00NN-LA0": {8, "16"}, "900-9D3L6-00CN-AA0": {16, "32"}, "P66102-001": {16, "32"}, "P66584-001": {8, "16"}, "S3K99-63001": {8, "16"}, "SN37B36732": {16, "32"}, "SN37B82788": {8, "16"}, } Models = map[string]BlueFieldSpecs{ "B3240": {16, "32"}, "B3220": {16, "32"}, "B3210E": {16, "32"}, "B3210": {16, "32"}, "B3210L": {8, "16"}, "B3220L": {8, "16"}, "B3140L": {8, "16"}, "B3140H": {8, "16"}, "B3220SH": {16, "48"}, } )
var BmcFwUpdateTaskMap sync.Map
var HostNetworkTaskMap sync.Map
var RebootTaskMap sync.Map
Functions ¶
func ClearArmRestartTracker ¶
func ClearArmRestartTracker(dpu *provisioningv1.DPU)
ClearArmRestartTracker removes the tracker annotation from the DPU.
func CompleteRebooting ¶
func CompleteRebooting(ctx context.Context, dpu *provisioningv1.DPU, state *provisioningv1.DPUStatus, zeroTrustMode bool) provisioningv1.DPUStatus
CompleteRebooting runs when the DPU is in DPURebooting and reboot is complete.
func InitializeDPURebootStatus ¶
func InitializeDPURebootStatus(ctx context.Context, dpu *provisioningv1.DPU, state *provisioningv1.DPUStatus, ctrlCtx *ControllerContext, sourcePhase provisioningv1.DPUPhase) error
InitializeDPURebootStatus initializes status.rebootStatus when entering DPURebooting, always refreshing it to avoid carrying stale state between reboot cycles. The DPUConfig branch consumes the agent-reported method as-is; the host-power-cycle-required annotation is a Trusted Host execution-time escalation (see internal/provisioning/hostagent/phase/reboot/sync.go) and intentionally does not propagate into RebootStatus.Method.
func MkIso ¶
func MkIso(isoname, volumelabel string, files []IsoRootFile) (string, error)
MkIso writes an ISO 9660 image equivalent to OpenBSD mkhybrid mkisofs with:
mkisofs -output <isoname>.iso -volid <volumelabel> -joliet -rock <files[0].Name> <files[1].Name> ...
That is -J (Joliet), -R (Rock Ridge, long option -rock; not rationalized -r), and -V (volume ID), per gnu/usr.sbin/mkhybrid/src/mkisofs.c. The caller must pass files in the same order as mkisofs source arguments; extent / directory order follows that order.
func SaveArmRestartTracker ¶
func SaveArmRestartTracker(dpu *provisioningv1.DPU, tracker *ArmRestartTracker) error
SaveArmRestartTracker serializes the tracker to a DPU annotation.
func StartRebooting ¶
func StartRebooting(ctx context.Context, dpu *provisioningv1.DPU, ctrlCtx *ControllerContext) (*provisioningv1.DPUStatus, *provisioningv1.DPUNode, bool, error)
StartRebooting performs checks common to all DPURebooting implementations. If done is true, the returned status is complete and callers should return it immediately.
func UpdateRebootStatus ¶
func UpdateRebootStatus(state *provisioningv1.DPUStatus, phase provisioningv1.RebootStatusPhase, reason, message string)
UpdateRebootStatus updates the in-memory DPU reboot status and transition time.
Types ¶
type ArmRestartTracker ¶
type ArmRestartTracker struct {
// Attempt is the current attempt count (1-based, 0 = not started)
Attempt int `json:"attempt"`
// MaxAttempts is set by caller (e.g., 2 for Secure Boot)
MaxAttempts int `json:"maxAttempts"`
// LastRestartTime is used for timeout/interval checks
LastRestartTime time.Time `json:"lastRestartTime"`
// InitialGeneration detects spec changes during flow
InitialGeneration int64 `json:"initialGeneration"`
}
ArmRestartTracker tracks ARM restart operations across reconcile loops. Stored as JSON annotation on the DPU object.
func LoadArmRestartTracker ¶
func LoadArmRestartTracker(dpu *provisioningv1.DPU) (*ArmRestartTracker, error)
LoadArmRestartTracker deserializes the tracker from the DPU annotation. It returns a nil tracker and a nil error if the annotation doesn't exist (a missing annotation is not an error). It validates the loaded data to guard against corrupted or tampered annotations.
func (*ArmRestartTracker) AllRestartsDone ¶
func (t *ArmRestartTracker) AllRestartsDone() bool
AllRestartsDone returns true if all required restarts have been triggered
func (*ArmRestartTracker) IncrementAttempt ¶
func (t *ArmRestartTracker) IncrementAttempt()
IncrementAttempt increments attempt counter and updates timestamp
func (*ArmRestartTracker) IsStale ¶
func (t *ArmRestartTracker) IsStale() bool
IsStale returns true if the tracker is older than StaleTrackerTimeout
type BF4Artifact ¶
type BlueFieldSpecs ¶
type BlueFieldSpecs struct {
// CPU is the number of cores
CPU int
// Mem is the memory size in GB.
// Note: GB should be converted to "Gi" or "G" before comparison.
Mem string
}
func LookUpModel ¶
func LookUpModel(desc string) *BlueFieldSpecs
LookUpModel tries to find the model name in the product description and returns the corresponding BlueFieldSpecs.
func LookUpPartNumber ¶
func LookUpPartNumber(partNumber string) *BlueFieldSpecs
LookUpPartNumber returns the BlueFieldSpecs for the given part number.
func LookUpResource ¶
func LookUpResource(description string) *BlueFieldSpecs
LookUpResource tries to extract the number of cores and memory size from the product description.
func ParseDescription ¶
func ParseDescription(description string) *BlueFieldSpecs
ParseDescription parses the product description and returns the corresponding BlueFieldSpecs.
func (*BlueFieldSpecs) CanSatisfy ¶
func (spec *BlueFieldSpecs) CanSatisfy(req corev1.ResourceList) CapacityResult
type CanProceedError ¶
CanProceedError represents errors returned by CanProceed
func (*CanProceedError) Error ¶
func (e *CanProceedError) Error() string
type CapacityResult ¶
type CapacityResult int
const (
CapacityUnknown CapacityResult = iota
CapacitySatisfied
CapacityInsufficient
)
type ControllerContext ¶
type ControllerContext struct {
client.Client
Scheme *runtime.Scheme
Options DPUOptions
Recorder record.EventRecorder
ClusterAllocator allocator.Allocator
JoinCommandGenerator NodeJoinCommandGenerator
DPUArtifactGenerator DPUArtifactGenerator
HostUptimeChecker reboot.HostUptimeChecker
DPUInProvisioningMap *DPUInProvisioningMap
}
type DPUArtifactGenerator ¶
type DPUArtifactGenerator interface {
GenerateBF3(ctx context.Context, req DPUArtifactRequest) ([]byte, error)
GenerateBF4(ctx context.Context, req DPUArtifactRequest) (BF4Artifact, error)
}
type DPUArtifactRequest ¶
type DPUArtifactRequest struct {
ControllerContext *ControllerContext
DPU *provisioningv1.DPU
Flavor *provisioningv1.DPUFlavor
BootstrapToken string
}
type DPUID ¶
type DPUID string
DPUID is a defined string type (not a type alias) that represents a DPU's unique identifier.
type DPUInProvisioningMap ¶
type DPUInProvisioningMap struct {
// contains filtered or unexported fields
}
DPUInProvisioningMap tracks the number of DPUs in provisioning states. This map is used to limit the number of DPUs that can be in provisioning at once.
func NewDPUInProvisioningMap ¶
func NewDPUInProvisioningMap(max int32) *DPUInProvisioningMap
NewDPUInProvisioningMap creates a new map with the specified maximum value
func (*DPUInProvisioningMap) CanProceed ¶
func (c *DPUInProvisioningMap) CanProceed(dpuUID DPUID) error
CanProceed checks if a new DPU can enter provisioning and inserts if possible. Returns nil if can proceed, or a CanProceedError with the specific reason.
func (*DPUInProvisioningMap) GetMax ¶
func (c *DPUInProvisioningMap) GetMax() int32
GetMax returns the maximum number of DPUs allowed to be in provisioning at once.
func (*DPUInProvisioningMap) Initialize ¶
Initialize counts the DPUs currently in a provisioning state. The reader argument can be mgr.GetAPIReader() to avoid waiting for cache sync.
func (*DPUInProvisioningMap) Remove ¶
func (c *DPUInProvisioningMap) Remove(dpuUID DPUID)
Remove removes a DPU from the map.
type DPUOptions ¶
type DPUOptions struct {
PrarprouterdImageWithTag string
ImagePullSecrets []corev1.LocalObjectReference
DPUInstallInterface string
DeploymentMode string
BFCFGTemplateFile string
BFBRegistry string
BFBPVC string
BFBRegistryLoadBalancer string
CustomCASecretName string
MaxDPUParallelInstallations int32
// OSInstallTimeout is the maximum time allowed for OS installation in zero-trust mode.
// Default: 45 minutes
OSInstallTimeout time.Duration
// FirmwareUpdateTimeout is the maximum time allowed for firmware update in zero-trust mode.
// Default: 45 minutes
FirmwareUpdateTimeout time.Duration
// NodeEffectRemovalTimeout is the maximum time allowed for the Node Effect Removal phase.
// Default: 30 minutes
NodeEffectRemovalTimeout time.Duration
}
func (DPUOptions) ZeroTrustProvisioningFlow ¶
func (o DPUOptions) ZeroTrustProvisioningFlow() bool
ZeroTrustProvisioningFlow reports whether the cluster policy is zero-trust for provisioning phases that branch on ZT vs host-trusted (e.g. reboot completion, RebootMethodNoAction). When DeploymentMode is unset (legacy), Redfish install interface implies zero-trust flow.
type IsoRootFile ¶
IsoRootFile is one root-level file in the image, in the same sense as each positional pathname argument to mkisofs (all names are written under the ISO root directory).
type KubeadmBootstrapTokenGenerator ¶
KubeadmBootstrapTokenGenerator is a NodeJoinCommandGenerator that generates join commands following the kubeadm bootstrap token authentication method. It creates a bootstrap token secret and returns the join command. This join process is based on the kubeadm implementation. More details can be found in the kubeadm documentation: https://kubernetes.io/docs/reference/setup-tools/kubeadm/kubeadm-init/#bootstrap-token-authentication
func (*KubeadmBootstrapTokenGenerator) GenerateJoinCommand ¶
func (s *KubeadmBootstrapTokenGenerator) GenerateJoinCommand(ctx context.Context, dc *provisioningv1.DPUCluster) (string, error)
GenerateJoinCommand generates a join command for a DPU cluster node.
type NodeJoinCommandGenerator ¶
type NodeJoinCommandGenerator interface {
GenerateJoinCommand(ctx context.Context, dc *provisioningv1.DPUCluster) (string, error)
}
NodeJoinCommandGenerator is an interface for generating join commands for DPU cluster nodes.