Documentation
¶
Overview ¶
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.
Index ¶
- Constants
- func GetTaskRestartReason(reasonList []FaultReasonList) string
- func NewHandler() plugin.FaultHandler
- type DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
- func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
- type FaultCard
- type FaultDeviceList
- type FaultJob
- func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- func (fJob *FaultJob) IsNormalJobNeedRestart() bool
- func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
- type FaultNode
- type FaultNodeInfoToCm
- type FaultReasonList
- type FaultTask
- type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode *plugin.NPUNode) error
- func (reScheduler *ReScheduler) Execute(env *plugin.ScheduleEnv, ssn *framework.Session) error
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo
- func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
- func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
- func (reScheduler *ReScheduler) UseAnnotation(task *api.TaskInfo)
- type RemainRetryTimes
- type RescheduleReason
- type RescheduleRecord
- type RescheduleTaskReason
- type SimpleFNodeInfo
Constants ¶
const ( // RePropertyName name specifying re-scheduler cm RePropertyName = "re-scheduling" // ReschedulingReasonKey is used to record the reason of rescheduling ReschedulingReasonKey = "rescheduling-reason" // CmName Name of ReSchedulerConfigmap CmName = "vcjob-fault-npu-cm" // CmNameSpace Namespace of ReSchedulerConfigmap CmNameSpace = "volcano-system" // RescheduleReasonCmName Name of RescheduleReasonConfigmap RescheduleReasonCmName = "job-reschedule-reason" // RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap RescheduleReasonCmNamespace = "mindx-dl" // JobRescheduleLabelKey key word of re-scheduling configuration JobRescheduleLabelKey = "fault-scheduling" // JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration JobGraceRescheduleLabelValue = "grace" // JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration JobForceRescheduleLabelValue = "force" // JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration JobOffRescheduleLabelValue = "off" // GraceOverTimeKey for GraceOverTime config by user GraceOverTimeKey = "grace-over-time" // ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling ElasticSchedulingKey = "elastic-scheduling" // JobOnElasticScheduling job enabled with elastic scheduling JobOnElasticScheduling = "on" // JobOffElasticScheduling job not enabled with elastic scheduling JobOffElasticScheduling = "off" // CmFaultNodeKind key in configmap which saves the FaultNode cache CmFaultNodeKind = "fault-node" // CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache CmFaultJob910bx2Kind = "fault-job-910bx2" // CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x8Kind = "fault-job-910x8" // CmJobRemainRetryTimes key in configmap which saves remain retry times of job CmJobRemainRetryTimes = "remain-retry-times" // 
MaxRescheduleRecordsNum the upper limit of the cm kept reschedule records, oldest record will be deleted // if record more than MaxRescheduleRecordsNum records MaxRescheduleRecordsNum = 10 // MaxKbOfRescheduleRecords the upper limit words of the cm kept reschedule records MaxKbOfRescheduleRecords = 950 * 1024 // CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling CmJobRescheduleReasonsKey = "recent-reschedule-records" // CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence CmNodeRankTimeMapKind = "node-rankIndex-Occurrence" // CmCheckCode Check code key CmCheckCode = "checkCode" // CmFaultJob key in configmap which saves the FaultJob cache CmFaultJob = "fault-job" // DefaultGraceOverTime time interval for grace delete DefaultGraceOverTime = 900 // PublicFaultType represents a PublicFault fault type PublicFaultType = "PublicFault" // CardHealthy represents a healthy card CardHealthy = "Healthy" // CardUnhealthy represents an unhealthy card CardUnhealthy = "Unhealthy" // CardNetworkUnhealthy represents a network unhealthy card CardNetworkUnhealthy = "NetworkUnhealthy" // NodeHealthy represents node is available for scheduling NodeHealthy = "Healthy" // NodeUnhealthy represents node is unhealthy NodeUnhealthy = "NodeUnhealthy" // NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy NodeCardUnhealthy = "CardUnhealthy" // NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy NodeCardNetworkUnhealthy = "CardNetworkUnhealthy" // NoFaultJobsErr none fault jobs NoFaultJobsErr = "none fault jobs to be restarted in cache" // JobRecovery Name of cm for recovery JobRecovery = "job-recovery" // DeviceFaultCmKeySuffix the key of DeviceFault info DeviceFaultCmKeySuffix = "-Fault" // PodFailed the state of failed pod PodFailed = "pod-failed" // PodHealthy the state of healthy pod PodHealthy = "pod-healthy" // FaultRetryTimesKey key of fault-retry-times 
label FaultRetryTimesKey = "fault-retry-times" )
const ( // PreSeparateNPU fault type waiting user check PreSeparateNPU = "PreSeparateNPU" // NotHandleFault fault type not handle NotHandleFault = "NotHandleFault" // NodeFaultCode fault type nodeUnhealthy NodeFaultCode = "heartbeatTimeOut" // SubHealthFault subHealth code SubHealthFault = "SubHealthFault" )
const (
// SuperPodAnnoKey annotation key of super pod
SuperPodAnnoKey = "sp-block"
)
Variables ¶
This section is empty.
Functions ¶
func GetTaskRestartReason ¶
func GetTaskRestartReason(reasonList []FaultReasonList) string
GetTaskRestartReason converts the given fault reason list to a JSON string.
Types ¶
type DealReSchedulerCache ¶
type DealReSchedulerCache struct { FaultNodes map[string]*FaultNode FaultJobs map[api.JobID]*FaultJob JobRemainRetryTimes map[api.JobID]*RemainRetryTimes JobRecentRescheduleRecords map[api.JobID]*RescheduleReason }
DealReSchedulerCache object with method for re-scheduler cache
func GetReSchedulerCache ¶
func GetReSchedulerCache() *DealReSchedulerCache
GetReSchedulerCache return reschedule cache
func (DealReSchedulerCache) GetRealFaultNodes ¶
func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
GetRealFaultNodes gets the nodes whose IsFaultNode property is true.
func (*DealReSchedulerCache) SetJobRecentRescheduleRecords ¶
func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
SetJobRecentRescheduleRecords gets the already recorded rescheduling records from the cm and caches them.
func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶
func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap
type FaultCard ¶
FaultCard card object for re-scheduling
type FaultDeviceList ¶
type FaultDeviceList struct { FaultType string `json:"fault_type"` NPUName string `json:"npu_name"` FaultLevel string `json:"fault_level"` FaultHandling string `json:"fault_handling"` LargeModelFaultLevel string `json:"large_model_fault_level"` FaultCode string `json:"fault_code"` }
FaultDeviceList is the fault reason of card
type FaultJob ¶
type FaultJob struct { ReScheduleKey string // values taken off/grace/force RescheduleTime int64 SubHealthyStrategy string IsSubHealthFault bool PendingSessionNum int IsFaultJob bool JobName string JobUID api.JobID JobNamespace string SuperPods map[string][]plugin.SuperNode FaultTasks []FaultTask UpdateTime int64 FaultTypes []string DeleteExecutedFlag bool ElasticScheduling string ReferenceName string FaultRetryTimes int UUID types.UID // contains filtered or unexported fields }
FaultJob job object for re-scheduling
func (*FaultJob) ForceDeleteJob ¶
func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
ForceDeleteJob force delete jobs includes labelled force delete ones and grace delete failed ones
func (*FaultJob) GetJobElasticSchedulingLabel ¶
func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
GetJobElasticSchedulingLabel get job's elastic scheduling label
func (*FaultJob) GetJobFaultRescheduleLabel ¶
func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
GetJobFaultRescheduleLabel Get job's fault reschedule label.
func (*FaultJob) GraceDeleteJob ¶
func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
GraceDeleteJob grace delete jobs labelled to be deleted gracefully
func (*FaultJob) IsJobSingleRescheduling ¶
func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
IsJobSingleRescheduling valid job.
func (*FaultJob) IsNormalJobNeedRestart ¶
IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or the job has a software fault.
func (*FaultJob) IsProcessReschedulingJob ¶
func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
IsProcessReschedulingJob valid job.
type FaultNode ¶
type FaultNode struct { SuperPodID int32 NodeName string NPUName string FaultDeviceList []FaultDeviceList UpdateTime int64 UnhealthyNPU []string NetworkUnhealthyNPU []string IsFaultNode bool NodeDEnable bool NodeHealthState string FaultCards []FaultCard HasSwitchSubHealthFault bool HasCardSubHealthFault bool LinkDownTime int64 IsNpuNode bool }
FaultNode node object for re-scheduling
type FaultNodeInfoToCm ¶
type FaultNodeInfoToCm struct { FaultDeviceList []FaultDeviceList NodeName string UnhealthyNPU []string NetworkUnhealthyNPU []string NodeDEnable bool NodeHealthState string UpdateTime int64 }
FaultNodeInfoToCm fault node info to cm
type FaultReasonList ¶
type FaultReasonList struct { NodeName string `json:"node_name"` TaskName string `json:"task_name"` FaultRankList []string FaultDeviceList }
FaultReasonList node Fault Device List
type FaultTask ¶
type FaultTask struct { Reason []FaultReasonList FaultTime int64 RelationFault string IsFaultTask bool IsFaultRetryEnable bool HasSubHealthFault bool IsSoftwareFault bool TaskUID api.TaskID TaskName string TaskNamespace string NodeName string NodeRankIndex string UseCardName []string PodCreateTime int64 IsNpuTask bool Annotations map[string]string // contains filtered or unexported fields }
FaultTask object dealing with node for rescheduling
type ReScheduler ¶
type ReScheduler struct { *DealReSchedulerCache GraceDeleteTime int64 Jobs map[api.JobID]plugin.SchedulerJob Nodes map[string]plugin.NPUNode // contains filtered or unexported fields }
ReScheduler object for re-scheduling
func (*ReScheduler) AddFaultJobWithSession ¶
func (reScheduler *ReScheduler) AddFaultJobWithSession( jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects
func (*ReScheduler) AddFaultNodeWithSession ¶
func (reScheduler *ReScheduler) AddFaultNodeWithSession()
AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache
func (*ReScheduler) CheckNodeNPUByTask ¶
func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode *plugin.NPUNode) error
CheckNodeNPUByTask used in the predicate process of task and node
func (*ReScheduler) Execute ¶
func (reScheduler *ReScheduler) Execute(env *plugin.ScheduleEnv, ssn *framework.Session) error
Execute pre-processing actions for rescheduler handler
func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶
func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs( schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
GetNeedForceDeleteDelayingNPUJobs get fault jobs with grace label but haven't been evicted successfully
func (*ReScheduler) GetRunningJobs ¶
GetRunningJobs get all the running jobs of <UseCardName> type
func (*ReScheduler) PreStopAction ¶
func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
PreStopAction post-processing actions for re-scheduling
func (*ReScheduler) RestartFaultJobs ¶
func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartFaultJobs restarts fault jobs according to their corresponding strategy: grace, force, or off.
func (*ReScheduler) RestartNeedForceDeleteJobs ¶
func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartNeedForceDeleteJobs Restart jobs that need to be force deleted
func (*ReScheduler) ScoreBestNPUNodes ¶
func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks
func (*ReScheduler) SyncJobRecentRescheduleReason ¶
func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
SyncJobRecentRescheduleReason syncs recent reschedule records with ssn, to ensure the cache is up to date and in sync.
func (*ReScheduler) SyncJobRemainRetryTimes ¶
func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session
func (*ReScheduler) UseAnnotation ¶
func (reScheduler *ReScheduler) UseAnnotation(task *api.TaskInfo)
UseAnnotation add task annotation is reschedule in place
type RemainRetryTimes ¶
RemainRetryTimes remained retry times
type RescheduleReason ¶
type RescheduleReason struct { // JobID the job id of this record JobID api.JobID // TotalRescheduleTimes to show how many times reschedule has happened since job created TotalRescheduleTimes int // RescheduleRecords keep recent MaxRescheduleRecordsNum records of rescheduling RescheduleRecords []RescheduleRecord // AdditionalInfo is used to provide additional information, such as for length concern reduce some records AdditionalInfo string `json:",omitempty"` }
RescheduleReason shows the reason of this job rescheduling
type RescheduleRecord ¶
type RescheduleRecord struct { // LogFileFormatTime is the formatted time, to make it convenient to read and locate log LogFileFormatTime string // RescheduleTimeStamp time.now.unix() indicates when the rescheduling happened RescheduleTimeStamp int64 // ReasonOfTask record the reason of this rescheduling of task ReasonOfTask []RescheduleTaskReason }
RescheduleRecord records a job's rescheduling records.
type RescheduleTaskReason ¶
type RescheduleTaskReason struct { // RescheduleReason the fault type of this rescheduling RescheduleReason string // PodName the fault task caused this rescheduling PodName string // NodeName the fault node caused this rescheduling NodeName string // NodeRankIndex the rank index of the fault task NodeRankIndex string }
RescheduleTaskReason record the reason of this rescheduling of task
Source Files
¶
- cache.go
- frame.go
- job.go
- node.go
- reschedule.go
- task.go
- type.go