rescheduling

package
v6.0.0-RC2.0...-a5e2871 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 30, 2025 License: Apache-2.0, Apache-2.0 Imports: 19 Imported by: 0

Documentation

Overview

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin affinity scheduling utilities.

Index

Constants

View Source
const (
	// RePropertyName is the name specifying the re-scheduler ConfigMap (cm).
	RePropertyName = "re-scheduling"
	// ReschedulingReasonKey is used to record the reason of rescheduling.
	ReschedulingReasonKey = "rescheduling-reason"
	// CmName is the name of the re-scheduler ConfigMap.
	CmName = "vcjob-fault-npu-cm"
	// CmNameSpace is the namespace of the re-scheduler ConfigMap.
	CmNameSpace = "volcano-system"
	// RescheduleReasonCmName is the name of the reschedule-reason ConfigMap.
	RescheduleReasonCmName = "job-reschedule-reason"
	// RescheduleReasonCmNamespace is the namespace of the reschedule-reason ConfigMap.
	RescheduleReasonCmNamespace = "mindx-dl"

	// JobRescheduleLabelKey is the key word of the re-scheduling configuration.
	// Its possible values are JobGraceRescheduleLabelValue, JobForceRescheduleLabelValue
	// and JobOffRescheduleLabelValue.
	JobRescheduleLabelKey = "fault-scheduling"
	// JobGraceRescheduleLabelValue means grace delete the rescheduled job; a possible value of the
	// re-scheduling configuration.
	JobGraceRescheduleLabelValue = "grace"
	// JobForceRescheduleLabelValue means force delete the rescheduled job; a possible value of the
	// re-scheduling configuration.
	JobForceRescheduleLabelValue = "force"
	// JobOffRescheduleLabelValue means do not delete the rescheduled job; a possible value of the
	// re-scheduling configuration.
	JobOffRescheduleLabelValue = "off"
	// GraceOverTimeKey is the key of the GraceOverTime value configured by the user.
	GraceOverTimeKey = "grace-over-time"
	// ElasticSchedulingKey is the key for distinguishing whether a job is enabled with elastic scheduling.
	ElasticSchedulingKey = "elastic-scheduling"
	// JobOnElasticScheduling marks a job as enabled with elastic scheduling.
	JobOnElasticScheduling = "on"
	// JobOffElasticScheduling marks a job as not enabled with elastic scheduling.
	JobOffElasticScheduling = "off"

	// CmFaultNodeKind is the key in the ConfigMap which saves the FaultNode cache.
	CmFaultNodeKind = "fault-node"
	// CmFaultJob910bx2Kind is the key in the ConfigMap which saves the 910bx2 FaultJob cache.
	CmFaultJob910bx2Kind = "fault-job-910bx2"
	// CmFaultJob910x8Kind is the key in the ConfigMap which saves the 910x8 FaultJob cache.
	CmFaultJob910x8Kind = "fault-job-910x8"
	// CmJobRemainRetryTimes is the key in the ConfigMap which saves the remaining retry times of a job.
	CmJobRemainRetryTimes = "remain-retry-times"
	// MaxRescheduleRecordsNum is the upper limit of reschedule records kept in the ConfigMap; the oldest
	// record is deleted when there are more than MaxRescheduleRecordsNum records.
	MaxRescheduleRecordsNum = 10
	// MaxKbOfRescheduleRecords is the upper size limit of the reschedule records kept in the ConfigMap.
	// NOTE(review): the value 950 * 1024 looks like a byte count (950 KiB) despite the "Kb" in the
	// name — confirm against the code that compares against it.
	MaxKbOfRescheduleRecords = 950 * 1024
	// CmJobRescheduleReasonsKey is the key keeping the most recent MaxRescheduleRecordsNum records of
	// rescheduling.
	CmJobRescheduleReasonsKey = "recent-reschedule-records"
	// CmNodeRankTimeMapKind is the key recording the map of jobUID, rankIndex, node and times of occurrence.
	CmNodeRankTimeMapKind = "node-rankIndex-Occurrence"
	// CmCheckCode is the check code key.
	CmCheckCode = "checkCode"

	// CmFaultJob is the key in the ConfigMap which saves the FaultJob cache.
	CmFaultJob = "fault-job"

	// DefaultGraceOverTime is the default time interval for grace delete.
	// NOTE(review): the unit is presumably seconds — confirm.
	DefaultGraceOverTime = 900

	// PublicFaultType represents a PublicFault fault type.
	PublicFaultType = "PublicFault"
	// CardHealthy represents a healthy card.
	CardHealthy = "Healthy"
	// CardUnhealthy represents an unhealthy card.
	CardUnhealthy = "Unhealthy"
	// CardNetworkUnhealthy represents a network-unhealthy card.
	CardNetworkUnhealthy = "NetworkUnhealthy"
	// NodeHealthy represents a node that is available for scheduling.
	// Note that it shares the string value "Healthy" with CardHealthy.
	NodeHealthy = "Healthy"
	// NodeUnhealthy represents a node that is unhealthy.
	NodeUnhealthy = "NodeUnhealthy"
	// NodeCardUnhealthy represents a node that is unhealthy because one of its cards is unhealthy.
	NodeCardUnhealthy = "CardUnhealthy"
	// NodeCardNetworkUnhealthy represents a node that is unhealthy because one of its cards is
	// network unhealthy.
	NodeCardNetworkUnhealthy = "CardNetworkUnhealthy"
	// NoFaultJobsErr is the message used when there are no fault jobs.
	NoFaultJobsErr = "none fault jobs to be restarted in cache"
	// JobRecovery is the name of the ConfigMap for recovery.
	JobRecovery = "job-recovery"
	// DeviceFaultCmKeySuffix is the key suffix of the DeviceFault info.
	DeviceFaultCmKeySuffix = "-Fault"
	// PodFailed is the state of a failed pod.
	PodFailed = "pod-failed"
	// PodHealthy is the state of a healthy pod.
	PodHealthy = "pod-healthy"

	// FaultRetryTimesKey is the key of the fault-retry-times label.
	FaultRetryTimesKey = "fault-retry-times"
)
View Source
const (
	// PreSeparateNPU is the fault type that waits for a user check.
	PreSeparateNPU = "PreSeparateNPU"
	// NotHandleFault is the fault type that is not handled.
	NotHandleFault = "NotHandleFault"
	// NodeFaultCode is the fault type for nodeUnhealthy.
	NodeFaultCode = "heartbeatTimeOut"
	// SubHealthFault is the subHealth fault code.
	SubHealthFault = "SubHealthFault"
)
View Source
const (

	// SuperPodAnnoKey is the annotation key of a super pod.
	SuperPodAnnoKey = "sp-block"
)

Variables

This section is empty.

Functions

func GetTaskRestartReason

func GetTaskRestartReason(reasonList []FaultReasonList) string

GetTaskRestartReason converts the given fault reason list to a JSON string.

func NewHandler

func NewHandler() plugin.FaultHandler

NewHandler creates a new fault policy handler.

Types

type DealReSchedulerCache

// DealReSchedulerCache is the re-scheduler cache object. It holds the cached fault nodes,
// fault jobs, remaining retry times and recent reschedule records; the latter three maps
// are keyed by api.JobID.
type DealReSchedulerCache struct {
	FaultNodes                 map[string]*FaultNode
	FaultJobs                  map[api.JobID]*FaultJob
	JobRemainRetryTimes        map[api.JobID]*RemainRetryTimes
	JobRecentRescheduleRecords map[api.JobID]*RescheduleReason
}

DealReSchedulerCache object with method for re-scheduler cache

func GetReSchedulerCache

func GetReSchedulerCache() *DealReSchedulerCache

GetReSchedulerCache returns the reschedule cache.

func (DealReSchedulerCache) GetRealFaultNodes

func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode

GetRealFaultNodes returns the nodes whose IsFaultNode property is true.

func (*DealReSchedulerCache) SetJobRecentRescheduleRecords

func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool,
	client kubernetes.Interface) error

SetJobRecentRescheduleRecords gets the already recorded rescheduling records from the ConfigMap and caches them.

func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache

func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error

WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap

type FaultCard

// FaultCard is the card object for re-scheduling.
// NOTE(review): FaultType presumably takes one of the Card* constants
// (CardHealthy, CardUnhealthy, CardNetworkUnhealthy) — confirm against the code that sets it.
type FaultCard struct {
	IsFaultCard bool
	NPUName     string
	FaultType   string
}

FaultCard card object for re-scheduling

type FaultDeviceList

// FaultDeviceList is the fault reason of a card.
// Every field carries a snake_case json tag, which is the key used when the
// struct is encoded to or decoded from JSON.
type FaultDeviceList struct {
	FaultType            string `json:"fault_type"`
	NPUName              string `json:"npu_name"`
	FaultLevel           string `json:"fault_level"`
	FaultHandling        string `json:"fault_handling"`
	LargeModelFaultLevel string `json:"large_model_fault_level"`
	FaultCode            string `json:"fault_code"`
}

FaultDeviceList is the fault reason of card

type FaultJob

// FaultJob is the job object for re-scheduling.
// NOTE(review): ElasticScheduling presumably holds "on" or "off"
// (JobOnElasticScheduling / JobOffElasticScheduling) — confirm against the code that sets it.
type FaultJob struct {
	ReScheduleKey      string // one of off/grace/force
	RescheduleTime     int64
	SubHealthyStrategy string
	IsSubHealthFault   bool
	PendingSessionNum  int
	IsFaultJob         bool
	JobName            string
	JobUID             api.JobID
	JobNamespace       string
	SuperPods          map[string][]plugin.SuperNode
	FaultTasks         []FaultTask
	UpdateTime         int64
	FaultTypes         []string
	DeleteExecutedFlag bool
	ElasticScheduling  string
	ReferenceName      string
	FaultRetryTimes    int

	UUID types.UID
	// contains filtered or unexported fields
}

FaultJob job object for re-scheduling

func (*FaultJob) ForceDeleteJob

func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

ForceDeleteJob force-deletes jobs, including those labelled for force deletion and those whose grace deletion failed.

func (*FaultJob) GetJobElasticSchedulingLabel

func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string

GetJobElasticSchedulingLabel get job's elastic scheduling label

func (*FaultJob) GetJobFaultRescheduleLabel

func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string

GetJobFaultRescheduleLabel Get job's fault reschedule label.

func (*FaultJob) GraceDeleteJob

func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

GraceDeleteJob grace delete jobs labelled to be deleted gracefully

func (*FaultJob) IsJobSingleRescheduling

func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool

IsJobSingleRescheduling valid job.

func (*FaultJob) IsNormalJobNeedRestart

func (fJob *FaultJob) IsNormalJobNeedRestart() bool

IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or the job has a software fault.

func (*FaultJob) IsProcessReschedulingJob

func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool

IsProcessReschedulingJob valid job.

type FaultNode

// FaultNode is the node object for re-scheduling.
// NOTE(review): NodeHealthState presumably takes one of the Node* health constants
// (NodeHealthy, NodeUnhealthy, NodeCardUnhealthy, NodeCardNetworkUnhealthy) — confirm.
type FaultNode struct {
	SuperPodID              int32
	NodeName                string
	NPUName                 string
	FaultDeviceList         []FaultDeviceList
	UpdateTime              int64
	UnhealthyNPU            []string
	NetworkUnhealthyNPU     []string
	IsFaultNode             bool
	NodeDEnable             bool
	NodeHealthState         string
	FaultCards              []FaultCard
	HasSwitchSubHealthFault bool
	HasCardSubHealthFault   bool
	LinkDownTime            int64
	IsNpuNode               bool
}

FaultNode node object for re-scheduling

type FaultNodeInfoToCm

// FaultNodeInfoToCm is the fault node info written to the ConfigMap (cm).
// Each of its fields has a same-named, same-typed counterpart in FaultNode.
type FaultNodeInfoToCm struct {
	FaultDeviceList     []FaultDeviceList
	NodeName            string
	UnhealthyNPU        []string
	NetworkUnhealthyNPU []string
	NodeDEnable         bool
	NodeHealthState     string
	UpdateTime          int64
}

FaultNodeInfoToCm fault node info to cm

type FaultReasonList

// FaultReasonList is the fault device list of a node, extended with the node and task names.
// FaultDeviceList is embedded, so its fields are promoted onto FaultReasonList.
// FaultRankList has no json tag, unlike the other named fields; with encoding/json
// it is therefore keyed by its Go field name.
type FaultReasonList struct {
	NodeName      string `json:"node_name"`
	TaskName      string `json:"task_name"`
	FaultRankList []string
	FaultDeviceList
}

FaultReasonList node Fault Device List

type FaultTask

// FaultTask is the task object dealing with a node for rescheduling.
type FaultTask struct {
	Reason             []FaultReasonList
	FaultTime          int64
	RelationFault      string
	IsFaultTask        bool
	IsFaultRetryEnable bool
	HasSubHealthFault  bool
	IsSoftwareFault    bool
	TaskUID            api.TaskID
	TaskName           string
	TaskNamespace      string
	NodeName           string
	NodeRankIndex      string
	UseCardName        []string
	PodCreateTime      int64

	IsNpuTask   bool
	Annotations map[string]string
	// contains filtered or unexported fields
}

FaultTask object dealing with node for rescheduling

func (*FaultTask) DeleteRealPodByTask

func (fTask *FaultTask) DeleteRealPodByTask(kubeClient kubernetes.Interface, waitTime int64) error

DeleteRealPodByTask delete pod from kubernetes of tasks

type ReScheduler

// ReScheduler is the object for re-scheduling.
// It embeds *DealReSchedulerCache, so the cache's fields and methods are promoted onto it.
// Jobs is keyed by api.JobID.
type ReScheduler struct {
	*DealReSchedulerCache
	GraceDeleteTime int64
	Jobs            map[api.JobID]plugin.SchedulerJob
	Nodes           map[string]plugin.NPUNode
	// contains filtered or unexported fields
}

ReScheduler object for re-scheduling

func (*ReScheduler) AddFaultJobWithSession

func (reScheduler *ReScheduler) AddFaultJobWithSession(
	jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error

AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects

func (*ReScheduler) AddFaultNodeWithSession

func (reScheduler *ReScheduler) AddFaultNodeWithSession()

AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache

func (*ReScheduler) CheckNodeNPUByTask

func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode *plugin.NPUNode) error

CheckNodeNPUByTask used in the predicate process of task and node

func (*ReScheduler) Execute

func (reScheduler *ReScheduler) Execute(env *plugin.ScheduleEnv, ssn *framework.Session) error

Execute pre-processing actions for rescheduler handler

func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs

func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(
	schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)

GetNeedForceDeleteDelayingNPUJobs get fault jobs with grace label but haven't been evicted successfully

func (*ReScheduler) GetRunningJobs

func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo

GetRunningJobs get all the running jobs of <UseCardName> type

func (*ReScheduler) PreStopAction

func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error

PreStopAction post-processing actions for re-scheduling

func (*ReScheduler) RestartFaultJobs

func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartFaultJobs Restart fault jobs by its corresponding strategy grace,force,off

func (*ReScheduler) RestartNeedForceDeleteJobs

func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartNeedForceDeleteJobs Restart jobs that need to be force deleted

func (*ReScheduler) ScoreBestNPUNodes

func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)

ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks

func (*ReScheduler) SyncJobRecentRescheduleReason

func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)

SyncJobRecentRescheduleReason syncs the recent reschedule records with ssn, to ensure the cache is up to date and in sync.

func (*ReScheduler) SyncJobRemainRetryTimes

func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)

SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session

func (*ReScheduler) UseAnnotation

func (reScheduler *ReScheduler) UseAnnotation(task *api.TaskInfo)

UseAnnotation add task annotation is reschedule in place

type RemainRetryTimes

// RemainRetryTimes holds the remaining retry times together with a UID.
// NOTE(review): UUID presumably identifies the job the count belongs to — confirm.
type RemainRetryTimes struct {
	UUID  types.UID
	Times int
}

RemainRetryTimes remained retry times

type RescheduleReason

// RescheduleReason shows the reasons for the rescheduling of a job.
type RescheduleReason struct {
	// JobID is the job id of this record.
	JobID api.JobID
	// TotalRescheduleTimes shows how many times rescheduling has happened since the job was created.
	TotalRescheduleTimes int
	// RescheduleRecords keeps the most recent MaxRescheduleRecordsNum records of rescheduling.
	RescheduleRecords []RescheduleRecord
	// AdditionalInfo provides additional information, for example that some records were dropped
	// because of length concerns. It is omitted from the JSON output when empty.
	AdditionalInfo string `json:",omitempty"`
}

RescheduleReason shows the reason of this job rescheduling

type RescheduleRecord

// RescheduleRecord is one rescheduling record of a job.
type RescheduleRecord struct {
	// LogFileFormatTime is the formatted time, making it convenient to read and to locate logs.
	LogFileFormatTime string
	// RescheduleTimeStamp is the Unix time (time.Now().Unix()) at which the rescheduling happened.
	RescheduleTimeStamp int64
	// ReasonOfTask records the per-task reasons of this rescheduling.
	ReasonOfTask []RescheduleTaskReason
}

RescheduleRecord holds a single rescheduling record of a job.

type RescheduleTaskReason

// RescheduleTaskReason records the reason of a rescheduling for one task.
type RescheduleTaskReason struct {
	// RescheduleReason is the fault type of this rescheduling.
	RescheduleReason string
	// PodName is the fault task that caused this rescheduling.
	PodName string
	// NodeName is the fault node that caused this rescheduling.
	NodeName string
	// NodeRankIndex is the rank index of the fault task.
	NodeRankIndex string
}

RescheduleTaskReason record the reason of this rescheduling of task

type SimpleFNodeInfo

// SimpleFNodeInfo is a simple fault node info.
// Each of its fields has a same-named, same-typed counterpart in FaultNode.
type SimpleFNodeInfo struct {
	NodeName                string
	IsFaultNode             bool
	HasCardSubHealthFault   bool
	HasSwitchSubHealthFault bool
	NodeHealthState         string
}

SimpleFNodeInfo simple fault node info

Source Files

  • cache.go
  • frame.go
  • job.go
  • node.go
  • reschedule.go
  • task.go
  • type.go

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL