Documentation
¶
Overview ¶
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.
Index ¶
- Constants
- func GetTaskRestartReason(reasonList []FaultReasonList) string
- type AllocNodeRankOccurrence
- type DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() []FaultNode
- func (reCache *DealReSchedulerCache) SetFaultJobsFromCM(jobType string) error
- func (reCache *DealReSchedulerCache) SetFaultNodesFromCM() error
- func (reCache *DealReSchedulerCache) SetNodeHeartbeatFromCM() error
- func (reCache *DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM() error
- func (reCache *DealReSchedulerCache) SetRetryTimesFromCM() error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
- type DealReSchedulerConfigmap
- type FaultCard
- type FaultDeviceList
- type FaultJob
- func (fJob *FaultJob) CheckJobExistsInKubernetes(ssn *framework.Session) bool
- func (fJob *FaultJob) ForceDeleteJob(ssn *framework.Session, schedulerJob *plugin.SchedulerJob, ...) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultNPUTaskNum() int
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobHasPreSeparateNPUKey() bool
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- type FaultNode
- type FaultNodeInfoToCm
- type FaultRankIdsJobCMData
- type FaultReasonList
- type FaultTask
- type NodeHeartbeat
- type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode, npuName string) error
- func (reScheduler *ReScheduler) GenerateNodeRankIndexTaskMap()
- func (reScheduler ReScheduler) GetFaultJobOfGivenTaskInfoFromCache(task *api.TaskInfo) *FaultJob
- func (reScheduler *ReScheduler) GetGraceDeleteTime(Conf []config.Configuration) (int64, error)
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) (map[api.JobID]*api.JobInfo, error)
- func (reScheduler *ReScheduler) InitFaultNodeMap()
- func (reScheduler *ReScheduler) New910ReScheduler()
- func (reScheduler *ReScheduler) NewCommonReScheduler(jobType string)
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64) error
- func (reScheduler *ReScheduler) SynCacheFaultJobWithSession(ssn *framework.Session)
- func (reScheduler *ReScheduler) SynCacheFaultNodeWithSession()
- func (reScheduler *ReScheduler) SynCacheNodeRankOccMapWithSession(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
- func (reScheduler *ReScheduler) ValidJobByReschedule(curSchedulerJob util.SchedulerJobAttr) *api.ValidateResult
- type RemainRetryTimes
- type SimpleFNodeInfo
Constants ¶
const ( // RePropertyName name specifying re-scheduler cm RePropertyName = "re-scheduling" // CmName Name of ReSchedulerConfigmap CmName = "vcjob-fault-npu-cm" // CmNameSpace Namespace of ReSchedulerConfigmap CmNameSpace = "volcano-system" // JobRescheduleLabelKey key word of re-scheduling configuration JobRescheduleLabelKey = "fault-scheduling" // JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration JobGraceRescheduleLabelValue = "grace" // JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration JobForceRescheduleLabelValue = "force" // JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration JobOffRescheduleLabelValue = "off" // GraceOverTimeKey for GraceOverTime config by user GraceOverTimeKey = "grace-over-time" // ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling ElasticSchedulingKey = "elastic-scheduling" // JobOnElasticScheduling job enabled with elastic scheduling JobOnElasticScheduling = "on" // JobOffElasticScheduling job not enabled with elastic scheduling JobOffElasticScheduling = "off" // CmFaultNodeKind key in configmap which saves the FaultNode cache CmFaultNodeKind = "fault-node" // CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache CmFaultJob910bx2Kind = "fault-job-910bx2" // CmFaultJob910bx2InferKind key in configmap which saves the 910bx2-infer FaultJob cache CmFaultJob910bx2InferKind = "fault-job-910bx2-infer" // CmFaultJob910bx8Kind key in configmap which saves the 910bx8 FaultJob cache CmFaultJob910bx8Kind = "fault-job-910bx8" // CmFaultJob910bx16Kind key in configmap which saves the 910bx16 FaultJob cache CmFaultJob910bx16Kind = "fault-job-910bx16" // CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x8Kind = "fault-job-910x8" // CmFaultJob910x4Kind key in configmap which saves the 910x4 FaultJob cache 
CmFaultJob910x4Kind = "fault-job-910x4" // CmFaultJob910x2Kind key in configmap which saves the 910x2 FaultJob cache CmFaultJob910x2Kind = "fault-job-910x2" // CmFaultJob310x4Kind key in configmap which saves the 310x4 FaultJob cache CmFaultJob310x4Kind = "fault-job-310x4" // CmFaultJob310PKind key in configmap which saves the 310P FaultJob cache CmFaultJob310PKind = "fault-job-310P" // CmNodeHeartbeatKind judging node fault needs heartbeat info from former session, so should be recorded CmNodeHeartbeatKind = "node-heartbeat" // CmJobRemainRetryTimes key in configmap which saves the remaining retry times of jobs CmJobRemainRetryTimes = "remain-retry-times" // CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence CmNodeRankTimeMapKind = "node-rankIndex-Occurrence" // CmCheckCode Check code key CmCheckCode = "checkCode" // CmFaultJob key in configmap which saves the FaultJob cache CmFaultJob = "fault-job" // DefaultGraceOverTime time interval for grace delete DefaultGraceOverTime = 900 // CardHealthy represents a healthy card CardHealthy = "Healthy" // CardUnhealthy represents an unhealthy card CardUnhealthy = "Unhealthy" // CardNetworkUnhealthy represents a network unhealthy card CardNetworkUnhealthy = "NetworkUnhealthy" // NodeHealthy represents node is available for scheduling NodeHealthy = "Healthy" // NodeUnhealthy represents node is unhealthy by judging heartbeat NodeUnhealthy = "NodeUnhealthy" // NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy NodeCardUnhealthy = "CardUnhealthy" // NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy NodeCardNetworkUnhealthy = "CardNetworkUnhealthy" // NoFaultJobsErr none fault jobs NoFaultJobsErr = "none fault jobs to be restarted in cache" // JobFaultRankIDCMPre the job cm name prefix, for retraining JobFaultRankIDCMPre = "fault-config-" // JobFaultRankIDCMDataKey the job cm value key. 
JobFaultRankIDCMDataKey = "fault-npus" // JobRecovery Name of cm for recovery JobRecovery = "job-recovery" // DeviceFaultCmKey the key of DeviceFault info DeviceFaultCmKey = "huawei.com/Ascend910-Fault" // PodFailed the state of failed pod PodFailed = "pod-failed" // PodHealthy the state of healthy pod PodHealthy = "pod-healthy" // FaultRetryTimesKey key of fault-retry-times label FaultRetryTimesKey = "fault-retry-times" )
const ( // PreSeparateNPU fault type waiting user check PreSeparateNPU = "PreSeparateNPU" // NotHandleFault fault type not handle NotHandleFault = "NotHandleFault" // NodeFaultCode fault type nodeUnhealthy NodeFaultCode = "heartbeatTimeOut" // AcJobTag the tag of AcJob AcJobTag = "group-name" // AcJobVersion the api version of AcJob AcJobVersion = "mindxdl.gitee.com" // SubHealthFault subHealth code SubHealthFault = "SubHealthFault" )
const (
// SuperPodAnnoKey annotation key of super pod
SuperPodAnnoKey = "sp-block"
)
Variables ¶
This section is empty.
Functions ¶
func GetTaskRestartReason ¶
func GetTaskRestartReason(reasonList []FaultReasonList) string
GetTaskRestartReason convert to json str
Types ¶
type AllocNodeRankOccurrence ¶
AllocNodeRankOccurrence object recording node rankIndex and whether index re-allocated to new node
type DealReSchedulerCache ¶
type DealReSchedulerCache struct { *DealReSchedulerConfigmap FaultNodes []FaultNode RealFaultNodes []FaultNode `json:"-"` FaultNodeMaps map[string]SimpleFNodeInfo `json:"-"` FaultJobs []FaultJob RealFaultJobs []FaultJob `json:"-"` NodeHeartbeats []NodeHeartbeat AllocNodeRankOccurrenceMap map[api.JobID][]*AllocNodeRankOccurrence JobRemainRetryTimes map[api.JobID]*RemainRetryTimes }
DealReSchedulerCache object with method for re-scheduler cache
func (DealReSchedulerCache) GetRealFaultNodes ¶
func (reCache DealReSchedulerCache) GetRealFaultNodes() []FaultNode
GetRealFaultNodes get the nodes whose isFaultNode property takes true value
func (*DealReSchedulerCache) SetFaultJobsFromCM ¶
func (reCache *DealReSchedulerCache) SetFaultJobsFromCM(jobType string) error
SetFaultJobsFromCM unmarshal FaultJobs from string into struct and set the value
func (*DealReSchedulerCache) SetFaultNodesFromCM ¶
func (reCache *DealReSchedulerCache) SetFaultNodesFromCM() error
SetFaultNodesFromCM unmarshal FaultNodes from string into struct and set the value
func (*DealReSchedulerCache) SetNodeHeartbeatFromCM ¶
func (reCache *DealReSchedulerCache) SetNodeHeartbeatFromCM() error
SetNodeHeartbeatFromCM unmarshal NodeHeartbeat from string into struct and set the value
func (*DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM ¶
func (reCache *DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM() error
SetNodeRankOccurrenceMapFromCM unmarshal NodeRankOccurrenceMap from string into struct and set the value
func (*DealReSchedulerCache) SetRetryTimesFromCM ¶
func (reCache *DealReSchedulerCache) SetRetryTimesFromCM() error
SetRetryTimesFromCM unmarshal the job remaining retry times from string into struct and set the value
func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶
func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap
type DealReSchedulerConfigmap ¶
DealReSchedulerConfigmap object with method for re-scheduler configmap
type FaultCard ¶
FaultCard card object for re-scheduling
type FaultDeviceList ¶
type FaultDeviceList struct { FaultType string `json:"fault_type"` NPUName string `json:"npu_name"` FaultLevel string `json:"fault_level"` FaultHandling string `json:"fault_handling"` LargeModelFaultLevel string `json:"large_model_fault_level"` FaultCode string `json:"fault_code"` }
FaultDeviceList is the fault reason of card
func GetNodeDeviceFaultFromDeviceInfo ¶
func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)
GetNodeDeviceFaultFromDeviceInfo get device fault from device info
type FaultJob ¶
type FaultJob struct { ReScheduleKey string // values taken off/grace/force SubHealthyStrategy string IsSubHealthFault bool PendingSessionNum int IsFaultJob bool IsInSession bool JobName string JobUID api.JobID JobNamespace string JobRankIds []string // useCardIndex + 8*NodeRankIndex NodeNames []string SuperPods map[string][]plugin.SuperNode NodeNameMaps map[string]struct{} FaultTasks []FaultTask UpdateTime int64 JobRankIdCreateTime int64 // stop updating when job becomes a real fault one FaultTypes []string DeleteExecutedFlag bool ElasticScheduling string ReferenceName string FaultRetryTimes int UUID types.UID // contains filtered or unexported fields }
FaultJob job object for re-scheduling
func (*FaultJob) CheckJobExistsInKubernetes ¶
CheckJobExistsInKubernetes check whether job recorded in cache can be traced in kubernetes
func (*FaultJob) ForceDeleteJob ¶
func (fJob *FaultJob) ForceDeleteJob(ssn *framework.Session, schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
ForceDeleteJob force deletes jobs, including those labelled for force deletion and those whose grace deletion failed
func (*FaultJob) GetJobElasticSchedulingLabel ¶
func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
GetJobElasticSchedulingLabel get job's elastic scheduling label
func (*FaultJob) GetJobFaultNPUTaskNum ¶
GetJobFaultNPUTaskNum get job fault NPU task num
func (*FaultJob) GetJobFaultRescheduleLabel ¶
func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
GetJobFaultRescheduleLabel Get job's fault reschedule label.
func (*FaultJob) GraceDeleteJob ¶
func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
GraceDeleteJob grace delete jobs labelled to be deleted gracefully
func (*FaultJob) IsJobHasPreSeparateNPUKey ¶
IsJobHasPreSeparateNPUKey reports whether the job has the key of PreSeparateNPU
func (*FaultJob) IsJobSingleRescheduling ¶
func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
IsJobSingleRescheduling valid job.
type FaultNode ¶
type FaultNode struct { SuperPodID int32 NodeName string NPUName string FaultDeviceList []FaultDeviceList UpdateTime int64 UnhealthyNPU []string NetworkUnhealthyNPU []string IsFaultNode bool NodeDEnable bool NodeHealthState string AllCards []string FaultCards []FaultCard HeartbeatInterval int OldHeartbeatTime int64 NewHeartbeatTime int64 UpdateHeartbeatTime int64 HasSwitchSubHealthFault bool HasCardSubHealthFault bool }
FaultNode node object for re-scheduling
type FaultNodeInfoToCm ¶
type FaultNodeInfoToCm struct { FaultDeviceList []FaultDeviceList NodeName string UnhealthyNPU []string NetworkUnhealthyNPU []string NodeDEnable bool NodeHealthState string UpdateTime int64 OldHeartbeatTime int64 NewHeartbeatTime int64 UpdateHeartbeatTime int64 }
FaultNodeInfoToCm fault node info to cm
type FaultRankIdsJobCMData ¶
FaultRankIdsJobCMData used by RestoreManager for every job.
type FaultReasonList ¶
type FaultReasonList struct { NodeName string `json:"node_name"` FaultDeviceList }
FaultReasonList node Fault Device List
type FaultTask ¶
type FaultTask struct { Reason []FaultReasonList RelationFault string IsFaultTask bool IsFaultRetryEnable bool HasSubHealthFault bool TaskUID api.TaskID TaskName string TaskNamespace string NodeName string JobName string NodeRankIndex string UseCardName []string PodCreateTime int64 PodUID types.UID // contains filtered or unexported fields }
FaultTask object dealing with node for rescheduling
type NodeHeartbeat ¶
NodeHeartbeat object recording nodes and their heartbeats
type ReScheduler ¶
type ReScheduler struct { *DealReSchedulerCache GraceDeleteTime int64 Level string Jobs map[api.JobID]plugin.SchedulerJob Nodes map[string]plugin.NPUNode DeviceInfoNotInSession map[string]plugin.NodeDeviceInfoWithTime `json:"-"` IsFirstSession *bool // contains filtered or unexported fields }
ReScheduler object for re-scheduling
func New ¶
func New(env *plugin.ScheduleEnv, jobType string) *ReScheduler
New Initialisation of ReScheduler
func (*ReScheduler) AddFaultJobWithSession ¶
func (reScheduler *ReScheduler) AddFaultJobWithSession( jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects
func (*ReScheduler) AddFaultNodeWithSession ¶
func (reScheduler *ReScheduler) AddFaultNodeWithSession()
AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache
func (*ReScheduler) CheckNodeNPUByTask ¶
func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode, npuName string) error
CheckNodeNPUByTask used in the predicate process of task and node
func (*ReScheduler) GenerateNodeRankIndexTaskMap ¶
func (reScheduler *ReScheduler) GenerateNodeRankIndexTaskMap()
GenerateNodeRankIndexTaskMap get the nodeName, rankIndex, and Occurrence of nodes in a job
func (ReScheduler) GetFaultJobOfGivenTaskInfoFromCache ¶
func (reScheduler ReScheduler) GetFaultJobOfGivenTaskInfoFromCache(task *api.TaskInfo) *FaultJob
GetFaultJobOfGivenTaskInfoFromCache get fault job from task info
func (*ReScheduler) GetGraceDeleteTime ¶
func (reScheduler *ReScheduler) GetGraceDeleteTime(Conf []config.Configuration) (int64, error)
GetGraceDeleteTime Get the graceful delete time from configuration
func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶
func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs( schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
GetNeedForceDeleteDelayingNPUJobs get fault jobs with the grace label that haven't been evicted successfully
func (*ReScheduler) GetRunningJobs ¶
func (reScheduler *ReScheduler) GetRunningJobs( ssn *framework.Session) (map[api.JobID]*api.JobInfo, error)
GetRunningJobs get all the running jobs of <UseCardName> type
func (*ReScheduler) InitFaultNodeMap ¶
func (reScheduler *ReScheduler) InitFaultNodeMap()
InitFaultNodeMap init the node map of fault node
func (*ReScheduler) New910ReScheduler ¶
func (reScheduler *ReScheduler) New910ReScheduler()
New910ReScheduler initialise ReScheduler.FaultJobs for 910x8
func (*ReScheduler) NewCommonReScheduler ¶
func (reScheduler *ReScheduler) NewCommonReScheduler(jobType string)
NewCommonReScheduler initialise ReScheduler.FaultJobs for non 910x8
func (*ReScheduler) RestartFaultJobs ¶
func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartFaultJobs Restart fault jobs by their corresponding strategy: grace, force or off
func (*ReScheduler) RestartNeedForceDeleteJobs ¶
func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartNeedForceDeleteJobs Restart jobs that need to be force deleted
func (*ReScheduler) ScoreBestNPUNodes ¶
func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64) error
ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks
func (*ReScheduler) SynCacheFaultJobWithSession ¶
func (reScheduler *ReScheduler) SynCacheFaultJobWithSession(ssn *framework.Session)
SynCacheFaultJobWithSession Synchronise FaultJobs in cache by updating the information using current session
func (*ReScheduler) SynCacheFaultNodeWithSession ¶
func (reScheduler *ReScheduler) SynCacheFaultNodeWithSession()
SynCacheFaultNodeWithSession Synchronise FaultNodes in cache by updating the information using current session
func (*ReScheduler) SynCacheNodeRankOccMapWithSession ¶
func (reScheduler *ReScheduler) SynCacheNodeRankOccMapWithSession(ssn *framework.Session)
SynCacheNodeRankOccMapWithSession Synchronise the node rank occurrence map in cache by updating the information using current session
func (*ReScheduler) SyncJobRemainRetryTimes ¶
func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session
func (*ReScheduler) ValidJobByReschedule ¶
func (reScheduler *ReScheduler) ValidJobByReschedule(curSchedulerJob util.SchedulerJobAttr) *api.ValidateResult
ValidJobByReschedule validates job by reschedule
type RemainRetryTimes ¶
RemainRetryTimes remained retry times
Source Files
¶
- cache.go
- configmap.go
- job.go
- node.go
- reschedule.go
- task.go
- type.go