Documentation
¶
Index ¶
- Constants
- Variables
- func BuildKubeConfig(sConfig *Config) *rest.Config
- type AffinityGroup
- type AffinityGroupList
- type AffinityGroupMemberBindInfo
- type AffinityGroupMemberSpec
- type AffinityGroupSpec
- type AffinityGroupState
- type AffinityGroupStatus
- type CellAddress
- type CellHealthiness
- type CellState
- type CellStatus
- type CellType
- type CellTypeSpec
- type ClusterStatus
- type Config
- type LazyPreemptionStatus
- type ObjectMeta
- type PhysicalCellSpec
- type PhysicalCellStatus
- type PhysicalClusterSpec
- type PhysicalClusterStatus
- type PodBindInfo
- type PodPlacementInfo
- type PodSchedulingSpec
- type ReservationId
- type ReservedCellSpec
- type VirtualCellSpec
- type VirtualCellStatus
- type VirtualClusterName
- type VirtualClusterSpec
- type VirtualClusterStatus
- type WebServerError
- type WebServerPaths
Constants ¶
// General constants of the scheduler: component identity, the default config
// file path, the resource/annotation/env keys through which a Pod opts in to
// this scheduler, and the Pod priority bounds.
const (
	ComponentName         = "hivedscheduler"
	GroupName             = "hivedscheduler.microsoft.com"
	DefaultConfigFilePath = "./hivedscheduler.yaml"
	UnlimitedValue        = -1

	// To leverage this scheduler, at least one container in the Pod should contain
	// the resource limit below with any positive int16 value.
	ResourceNamePodSchedulingEnable = GroupName + "/pod-scheduling-enable"

	// To leverage this scheduler, the Pod should contain the annotation below in
	// PodSchedulingSpec YAML format.
	AnnotationKeyPodSchedulingSpec = GroupName + "/pod-scheduling-spec"

	// To leverage this scheduler, if one container in the Pod wants to use the
	// allocated GPUs for the whole Pod, it should contain the env below.
	// env:
	// - name: NVIDIA_VISIBLE_DEVICES
	//   valueFrom:
	//     fieldRef:
	//       fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
	// The annotation referred to by the env will be populated by the scheduler
	// when binding the pod.
	//
	// Notes:
	// 1. The scheduler directly delivers the GPU isolation decision to
	//    nvidia-container-runtime through Pod Env: NVIDIA_VISIBLE_DEVICES.
	// 2. If multiple containers in the Pod contain the env, the allocated GPUs are
	//    all visible to them, so it is these containers' freedom to control how
	//    to share these GPUs.
	EnvNameNvidiaVisibleDevices  = "NVIDIA_VISIBLE_DEVICES"
	AnnotationKeyPodGpuIsolation = GroupName + "/pod-gpu-isolation"

	// Populated by this scheduler, used to track and recover allocated placement.
	// It is in PodBindInfo YAML format.
	AnnotationKeyPodBindInfo = GroupName + "/pod-bind-info"

	// Priority Range of Guaranteed Pod.
	MaxGuaranteedPriority = int32(1000)
	MinGuaranteedPriority = int32(0)

	// Priority of Opportunistic Pod.
	OpportunisticPriority = int32(-1)
)
///////////////////////////////////////////////////////////////////////////////////// General Constants /////////////////////////////////////////////////////////////////////////////////////
// WebServer URL paths. Every path is built by concatenation from RootPath and
// VersionPath, so all endpoints live under "/v1".
const (
	RootPath    = "/"
	VersionPath = RootPath + "v1"

	// Scheduler Extender API: API with K8S Default Scheduler
	ExtenderPath = VersionPath + "/extender"
	FilterPath   = ExtenderPath + "/filter"
	BindPath     = ExtenderPath + "/bind"
	PreemptPath  = ExtenderPath + "/preempt"

	// Scheduler Inspect API: API to inspect current scheduling status
	// Notes:
	// 1. Both Binding and Bound AffinityGroups/Pods are considered as Allocated.
	InspectPath = VersionPath + "/inspect"
	// Inspect current allocated AffinityGroup(s)
	AffinityGroupsPath = InspectPath + "/affinitygroups/"
	// Inspect current cluster status
	ClusterStatusPath = InspectPath + "/clusterstatus"
	// Inspect current physical cluster status
	PhysicalClusterPath = ClusterStatusPath + "/physicalcluster"
	// Inspect current virtual cluster(s)' status
	VirtualClustersPath = ClusterStatusPath + "/virtualclusters/"
)
///////////////////////////////////////////////////////////////////////////////////// WebServer Constants /////////////////////////////////////////////////////////////////////////////////////
Variables ¶
// Values derived from the process environment. Being package-level variable
// initializers, they are evaluated once, at package initialization.
var (
	// DefaultKubeConfigFilePath is "/.kube/config" appended to ${HOME}.
	DefaultKubeConfigFilePath = os.Getenv("HOME") + "/.kube/config"

	// EnvValueKubeApiServerAddress is the value of ${KUBE_APISERVER_ADDRESS}.
	EnvValueKubeApiServerAddress = os.Getenv("KUBE_APISERVER_ADDRESS")

	// EnvValueKubeConfigFilePath is the value of ${KUBECONFIG}.
	EnvValueKubeConfigFilePath = os.Getenv("KUBECONFIG")
)
Functions ¶
func BuildKubeConfig ¶
Types ¶
type AffinityGroup ¶
// AffinityGroup is the JSON object for one affinity group: an embedded
// ObjectMeta serialized under "metadata" plus its AffinityGroupStatus
// serialized under "status".
type AffinityGroup struct {
	// Embedded, but tagged, so encoding/json emits it as a nested "metadata" object.
	ObjectMeta `json:"metadata"`
	Status AffinityGroupStatus `json:"status"`
}
type AffinityGroupList ¶ added in v0.2.4
// AffinityGroupList is a list wrapper around AffinityGroup values, serialized
// under the JSON key "items".
type AffinityGroupList struct {
	Items []AffinityGroup `json:"items"`
}
type AffinityGroupMemberBindInfo ¶ added in v0.2.0
// AffinityGroupMemberBindInfo holds the pod placements of one affinity group
// member, serialized under the YAML key "podPlacements".
type AffinityGroupMemberBindInfo struct {
	PodPlacements []PodPlacementInfo `yaml:"podPlacements"`
}
type AffinityGroupMemberSpec ¶ added in v0.2.0
type AffinityGroupSpec ¶ added in v0.2.0
// AffinityGroupSpec is the YAML specification of an affinity group: its name
// and the list of its member specs.
type AffinityGroupSpec struct {
	Name string `yaml:"name"`
	Members []AffinityGroupMemberSpec `yaml:"members"`
}
type AffinityGroupState ¶ added in v0.3.1
// AffinityGroupState is a string-typed state of an affinity group; it is the
// type of AffinityGroupStatus.State.
// NOTE(review): the set of valid state values is not declared in this view.
type AffinityGroupState string
type AffinityGroupStatus ¶ added in v0.2.4
// AffinityGroupStatus is the JSON status of an affinity group. The placement,
// pod-list and lazy-preemption fields are omitted from the output when empty.
type AffinityGroupStatus struct {
	VC VirtualClusterName `json:"vc"`
	Priority int32 `json:"priority"`
	State AffinityGroupState `json:"state"`
	// node -> GPU indices
	PhysicalPlacement map[string][]int32 `json:"physicalPlacement,omitempty"`
	// preassigned cell -> leaf cells
	VirtualPlacement map[CellAddress][]CellAddress `json:"virtualPlacement,omitempty"`
	// UIDs of pods, keyed in JSON as "allocatedPods" and "preemptingPods".
	AllocatedPods []types.UID `json:"allocatedPods,omitempty"`
	PreemptingPods []types.UID `json:"preemptingPods,omitempty"`
	// Pointer so that a nil value is dropped from the JSON by omitempty.
	LazyPreemptionStatus *LazyPreemptionStatus `json:"lazyPreemptionStatus,omitempty"`
}
type CellAddress ¶
// CellAddress is the string-typed address of a cell; it is the type of
// CellStatus.CellAddress and of the keys/values of
// AffinityGroupStatus.VirtualPlacement.
type CellAddress string
///////////////////////////////////////////////////////////////////////////////////// General Types /////////////////////////////////////////////////////////////////////////////////////
type CellHealthiness ¶ added in v0.3.0
// CellHealthiness is the string-typed health of a cell.
type CellHealthiness string

// The two declared CellHealthiness values.
const (
	CellHealthy CellHealthiness = "Healthy"
	CellBad     CellHealthiness = "Bad"
)
type CellStatus ¶ added in v0.3.0
// CellStatus is the JSON status shared by physical and virtual cells; it is
// embedded by both PhysicalCellStatus and VirtualCellStatus.
type CellStatus struct {
	GpuType string `json:"gpuType,omitempty"`
	CellType CellType `json:"cellType"`
	IsNodeLevel bool `json:"isNodeLevel,omitempty"`
	// Address of a physical cell consists of its address (or index) in each level
	// (e.g., node0/0/0/0 may represent node0, CPU socket 0, PCIe switch 0, GPU 0).
	// Address of a virtual cell consists of its VC name, index of the preassigned cell,
	// and the relative index in each level inside the preassigned cell
	// (e.g., VC1/0/0 may represent VC1, preassigned cell 0, index 0 among its children)
	CellAddress CellAddress `json:"cellAddress"`
	// CellState and CellHealthiness are two orthogonal fields.
	// That means, there are four possible combinations of them: a cell may be in
	// (1) used and healthy, (2) used and bad, (3) free and healthy, and (4) free and bad.
	CellState CellState `json:"cellState"`
	CellHealthiness CellHealthiness `json:"cellHealthiness"`
	CellPriority int32 `json:"cellPriority"`
}
type CellType ¶
// CellType is the string-typed name of a cell type; it keys
// PhysicalClusterSpec.CellTypes and labels cells in the spec and status types.
type CellType string
///////////////////////////////////////////////////////////////////////////////////// General Types /////////////////////////////////////////////////////////////////////////////////////
type CellTypeSpec ¶
type ClusterStatus ¶ added in v0.3.0
// ClusterStatus is the JSON status of the whole cluster: the physical cluster
// plus every virtual cluster, keyed by virtual cluster name.
type ClusterStatus struct {
	// Status of cells in the physical cluster
	PhysicalCluster PhysicalClusterStatus `json:"physicalCluster"`
	// Status of cells in each VC
	VirtualClusters map[VirtualClusterName]VirtualClusterStatus `json:"virtualClusters"`
}
type Config ¶
// Config is the scheduler configuration in YAML form. Every field is a
// pointer, so an absent YAML key decodes to nil and can be told apart from an
// explicitly set zero value.
type Config struct {
	// KubeApiServerAddress defaults to ${KUBE_APISERVER_ADDRESS}.
	// KubeConfigFilePath defaults to ${KUBECONFIG} then falls back to ${HOME}/.kube/config.
	//
	// If both KubeApiServerAddress and KubeConfigFilePath after defaulting are still empty, falls back to the
	// [k8s inClusterConfig](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod).
	//
	// If both KubeApiServerAddress and KubeConfigFilePath after defaulting are not empty,
	// KubeApiServerAddress overrides the server address specified in the file referred to by KubeConfigFilePath.
	//
	// If only KubeApiServerAddress after defaulting is not empty, it should be an insecure ApiServer address (can be got from
	// [Insecure ApiServer](https://kubernetes.io/docs/reference/access-authn-authz/controlling-access/#api-server-ports-and-ips) or
	// [kubectl proxy](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#using-kubectl-proxy))
	// which does not enforce authentication.
	//
	// If only KubeConfigFilePath after defaulting is not empty, it should be a valid
	// [KubeConfig File](https://kubernetes.io/docs/tasks/access-application-cluster/configure-access-multiple-clusters/#explore-the-home-kube-directory)
	// which inlines or refers to the valid
	// [ApiServer Credential Files](https://kubernetes.io/docs/reference/access-authn-authz/controlling-access/#transport-security).
	//
	// Address should be in format http[s]://host:port
	KubeApiServerAddress *string `yaml:"kubeApiServerAddress"`
	KubeConfigFilePath *string `yaml:"kubeConfigFilePath"`
	// WebServer
	// Defaults to :9096
	WebServerAddress *string `yaml:"webServerAddress"`
	// Specify a threshold for PodBindAttempts, that after it is exceeded, an extra
	// Pod binding will be executed forcefully.
	ForcePodBindThreshold *int32 `yaml:"forcePodBindThreshold"`
	// If a Pod is decided to be PodWaiting, it will block the whole scheduling by
	// WaitingPodSchedulingBlockMilliSec.
	// Large value can be used to achieve stronger FIFO scheduling by sacrificing
	// the scheduling throughput.
	// This is a workaround until PodMaxBackoffSeconds can be configured for
	// K8S Default Scheduler.
	WaitingPodSchedulingBlockMilliSec *int64 `yaml:"waitingPodSchedulingBlockMilliSec"`
	// Specify the whole physical cluster
	// TODO: Automatically construct it based on node info from GPU and Network Device Plugins
	PhysicalCluster *PhysicalClusterSpec `yaml:"physicalCluster"`
	// Specify all the virtual clusters belonging to the physical cluster
	VirtualClusters *map[VirtualClusterName]VirtualClusterSpec `yaml:"virtualClusters"`
}
func InitRawConfig ¶ added in v0.2.4
type LazyPreemptionStatus ¶ added in v0.2.4
type ObjectMeta ¶ added in v0.2.4
// ObjectMeta is the metadata of a WebServer-exposed object; it carries only
// the object's name, serialized under the JSON key "name".
type ObjectMeta struct {
	Name string `json:"name"`
}
WebServer Exposed Objects: Align with K8S Objects
type PhysicalCellSpec ¶
// PhysicalCellSpec is the YAML specification of one physical cell instance.
// It is recursive: CellChildren holds the specs of the child cells and is
// omitted from the YAML when empty.
type PhysicalCellSpec struct {
	CellType CellType `yaml:"cellType"`
	CellAddress CellAddress `yaml:"cellAddress"`
	ReservationId ReservationId `yaml:"reservationId"`
	CellChildren []PhysicalCellSpec `yaml:"cellChildren,omitempty"`
}
Specify physical Cell instances.
type PhysicalCellStatus ¶ added in v0.3.0
// PhysicalCellStatus is the JSON status of a physical cell: the embedded
// common CellStatus plus the statuses of its child cells. All three extra
// fields are omitted from the JSON when empty.
type PhysicalCellStatus struct {
	// Embedded without a tag, so its fields are flattened into this JSON object.
	CellStatus
	CellChildren []*PhysicalCellStatus `json:"cellChildren,omitempty"`
	VC VirtualClusterName `json:"vc,omitempty"`
	// NOTE(review): presumably the virtual cell this physical cell is bound to — confirm.
	VirtualCell *VirtualCellStatus `json:"virtualCell,omitempty"`
}
type PhysicalClusterSpec ¶
// PhysicalClusterSpec is the YAML definition of the physical cluster: the
// cell type specs keyed by cell type, and the list of physical cell specs.
type PhysicalClusterSpec struct {
	CellTypes map[CellType]CellTypeSpec `yaml:"cellTypes"`
	PhysicalCells []PhysicalCellSpec `yaml:"physicalCells"`
}
Physical cluster definition
type PhysicalClusterStatus ¶ added in v0.3.0
// PhysicalClusterStatus is the status of the physical cluster, expressed as a
// slice of pointers to PhysicalCellStatus.
type PhysicalClusterStatus []*PhysicalCellStatus
func (PhysicalClusterStatus) DeepCopy ¶ added in v0.3.0
func (pcs PhysicalClusterStatus) DeepCopy() PhysicalClusterStatus
type PodBindInfo ¶
// PodBindInfo is the YAML record of a pod's binding decision: the node, the
// GPUs, the selected cell chain, and the bind info of the affinity group
// members.
type PodBindInfo struct {
	Node string `yaml:"node"` // node to bind
	GpuIsolation []int32 `yaml:"gpuIsolation"` // GPUs to bind
	CellChain string `yaml:"cellChain"` // cell chain selected
	AffinityGroupBindInfo []AffinityGroupMemberBindInfo `yaml:"affinityGroupBindInfo"`
}
Used to recover the resources allocated by the scheduler.
type PodPlacementInfo ¶ added in v0.2.0
// PodPlacementInfo is the YAML record of where one pod is placed: the
// physical node, the GPU indices on it, and the preassigned cell types.
type PodPlacementInfo struct {
	PhysicalNode string `yaml:"physicalNode"`
	PhysicalGpuIndices []int32 `yaml:"physicalGpuIndices"`
	// preassigned cell types used by the pods. used to locate the virtual cells
	// when adding an allocated pod
	PreassignedCellTypes []CellType `yaml:"preassignedCellTypes"`
}
type PodSchedulingSpec ¶
// PodSchedulingSpec is the YAML scheduling request of a pod: target virtual
// cluster, priority, optional reservation, GPU type and count, the gang
// release and lazy preemption switches, and the pod's affinity group.
type PodSchedulingSpec struct {
	VirtualCluster VirtualClusterName `yaml:"virtualCluster"`
	Priority int32 `yaml:"priority"`
	ReservationId ReservationId `yaml:"reservationId"`
	GpuType string `yaml:"gpuType"`
	GpuNumber int32 `yaml:"gpuNumber"`
	GangReleaseEnable bool `yaml:"gangReleaseEnable"`
	LazyPreemptionEnable bool `yaml:"lazyPreemptionEnable"`
	// Pointer, so it is nil when the YAML carries no "affinityGroup" key.
	AffinityGroup *AffinityGroupSpec `yaml:"affinityGroup"`
}
type ReservationId ¶
// ReservationId is the string-typed identifier of a reservation; it appears
// in PhysicalCellSpec, PodSchedulingSpec and ReservedCellSpec.
type ReservationId string
///////////////////////////////////////////////////////////////////////////////////// General Types /////////////////////////////////////////////////////////////////////////////////////
type ReservedCellSpec ¶
// ReservedCellSpec is the YAML specification of a reserved cell; it carries
// only the reservation's identifier.
type ReservedCellSpec struct {
	ReservationId ReservationId `yaml:"reservationId"`
}
type VirtualCellSpec ¶
type VirtualCellStatus ¶ added in v0.3.0
// VirtualCellStatus is the JSON status of a virtual cell: the embedded common
// CellStatus plus the statuses of its child cells. Both extra fields are
// omitted from the JSON when empty.
type VirtualCellStatus struct {
	// Embedded without a tag, so its fields are flattened into this JSON object.
	CellStatus
	CellChildren []*VirtualCellStatus `json:"cellChildren,omitempty"`
	// NOTE(review): presumably the physical cell this virtual cell is bound to — confirm.
	PhysicalCell *PhysicalCellStatus `json:"physicalCell,omitempty"`
}
type VirtualClusterSpec ¶
// VirtualClusterSpec is the YAML definition of one virtual cluster: its
// virtual cells and its reserved cells. ReservedCells is omitted from the
// YAML when empty.
type VirtualClusterSpec struct {
	VirtualCells []VirtualCellSpec `yaml:"virtualCells"`
	ReservedCells []ReservedCellSpec `yaml:"reservedCells,omitempty"`
}
type VirtualClusterStatus ¶ added in v0.3.0
// VirtualClusterStatus is the status of one virtual cluster, expressed as a
// slice of pointers to VirtualCellStatus.
type VirtualClusterStatus []*VirtualCellStatus
func (VirtualClusterStatus) DeepCopy ¶ added in v0.3.0
func (vcs VirtualClusterStatus) DeepCopy() VirtualClusterStatus
type WebServerError ¶
func NewWebServerError ¶
func NewWebServerError(code int, message string) *WebServerError
func (*WebServerError) Error ¶
func (err *WebServerError) Error() string
type WebServerPaths ¶
// WebServerPaths is a JSON object holding a list of path strings under the
// key "paths".
type WebServerPaths struct {
	Paths []string `json:"paths"`
}