types

package

v0.8.2 Latest Latest Go to latest Published: Apr 27, 2021 License: Apache-2.0, Apache-2.0 Imports: 6 Imported by: 40

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/kubeflow/arena

Links

Open Source Insights

Documentation ¶

Overview ¶

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Copyright 2018 The Kubeflow Authors ¶

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Index ¶

Constants
Variables
type AdvancedGpuMetric
type AllNodeInfo
type ArenaClientArgs
type CommonCronArgs
type CommonGPUNodeInfo
type CommonNodeInfo
type CommonServingArgs
type CommonSubmitArgs
type ConcurrencyPolicy
type ConfigFileInfo
type CronHistoryInfo
type CronInfo
type CronTFJobArgs
type CronType
type CustomServingArgs
type DataDirVolume
type Destination
type DestinationRuleCRD
type DestinationWeight
type Driver
type Endpoint
type Executor
type FormatStyle
type GPUDeviceInfo
type GPUExclusiveNodeInfo
type GPUExclusivePodInfo
type GPUShareNodeDevice
type GPUShareNodeInfo
type GPUSharePodInfo
type GPUTopology
type GPUTopologyNodeDevice
type GPUTopologyNodeInfo
type GPUTopologyPodInfo
type GpuMetric
type GpuMetricInfo
type HTTPMatchRequest
type HTTPRoute
type JobConditionType
type JobGpuMetric
type KFServingArgs
type LimitedPodSecurityContext
type LogArgs
type LogLevel
type NodeGpuMetric
type NodeType
type NodeTypeInfo
type NormalNodeInfo
type PodGpuMetric
type PortSelector
type PreprocesObject
type PrometheusMetric
type PrometheusMetricData
type PrometheusMetricResult
type PrometheusMetricValue
type PrometheusServer
type Runtime
type ScaleETJobArgs
type ScaleInETJobArgs
type ScaleOutETJobArgs
type SeldonServingArgs
type ServingInstance
type ServingJobInfo
type ServingJobType
type ServingTypeInfo
type ServingVersionWeight
type StringMatchPrefix
type SubmitETJobArgs
type SubmitHorovodJobArgs
type SubmitMPIJobArgs
type SubmitPyTorchJobArgs
type SubmitSparkJobArgs
type SubmitSyncCodeArgs
type SubmitTFJobArgs
type SubmitTensorboardArgs
type SubmitVolcanoJobArgs
type TFRuntime
type TensorFlowServingArgs
type TensorRTServingArgs
type TrafficRouterSplitArgs
type TrainingJobInfo
type TrainingJobInstance
type TrainingJobStatus
type TrainingJobType
type TrainingJobTypeInfo
type VirtualService
type VirtualServiceCRD

Constants ¶

View Source

// Resource names, environment variable and labels related to GPU sharing.
const (
	// GPUShareResourceName is the resource name "aliyun.com/gpu-mem".
	GPUShareResourceName = "aliyun.com/gpu-mem"

	// GPUShareCountName is the resource name "aliyun.com/gpu-count".
	GPUShareCountName = "aliyun.com/gpu-count"

	// GPUShareEnvGPUID is the environment variable name "ALIYUN_COM_GPU_MEM_IDX".
	// NOTE(review): presumably it carries the index of the assigned GPU — confirm with the consumer.
	GPUShareEnvGPUID = "ALIYUN_COM_GPU_MEM_IDX"

	// GPUShareAllocationLabel is the key "scheduler.framework.gpushare.allocation".
	GPUShareAllocationLabel = "scheduler.framework.gpushare.allocation"

	// GPUShareNodeLabels is a comma-separated list of key=value node labels.
	// NOTE(review): whether a node has to match any or all of them is decided
	// by the code that parses this string.
	GPUShareNodeLabels = "gpushare=true,cgpu=true,ack.node.gpu.schedule=share,ack.node.gpu.schedule=cgpu"
)

View Source

// Resource name and labels related to GPU-topology scheduling.
const (
	// AliyunGPUResourceName is the resource name "aliyun.com/gpu".
	AliyunGPUResourceName = "aliyun.com/gpu"

	// GPUTopologyAllocationLabel is the key "topology.kubernetes.io/gpu-group".
	GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group"

	// GPUTopologyVisibleGPULabel is the key "topology.kubernetes.io/gpu-visible".
	GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible"

	// GPUTopologyNodeLabels is the key=value node label "ack.node.gpu.schedule=topology".
	GPUTopologyNodeLabels = "ack.node.gpu.schedule=topology"
)

View Source

const (
	// KUBEFLOW_NAMESPACE is the namespace name "kubeflow".
	KUBEFLOW_NAMESPACE = "kubeflow"
)

View Source

const (
	// KUBE_SYSTEM_NAMESPACE is the namespace name "kube-system".
	KUBE_SYSTEM_NAMESPACE = "kube-system"
)

View Source

const (
	// NODE_METRIC_TMP is a format template for a metric selector. It takes two
	// %s arguments: a pattern matched against __name__ and a pattern matched
	// against the node_name label.
	NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
)

View Source

// NvidiaGPUResourceName defines the nvidia resource name, "nvidia.com/gpu".
const NvidiaGPUResourceName = "nvidia.com/gpu"

View Source

const (
	// POD_METRIC_TMP is a format template for a metric selector. It takes two
	// %s arguments: a pattern matched against __name__ and a pattern matched
	// against the pod_name label.
	POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
)

View Source

const (
	// PROMETHEUS_INSTALL_DOC_URL is the URL of the arena user-guide page
	// "9-top-job-gpu-metric.md".
	PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
)

View Source

const (
	// PROMETHEUS_SCHEME is the URL scheme "http".
	PROMETHEUS_SCHEME = "http"
)

View Source

const (
	// PROMETHEUS_SVC_LABEL is the key=value label "kubernetes.io/name=Prometheus".
	PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
)

View Source

// RequestGPUsOfJobAnnoKey is the key "requestGPUsOfJobOwner".
// NOTE(review): judging by the name it is used as an annotation key — confirm with the consumer.
const RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"

Variables ¶

View Source

var (
	// ErrTrainingJobNotFound is the sentinel error whose message says that the
	// training job was not found and points the user at 'arena list'.
	ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")
)

View Source

// GPU_METRIC_LIST holds three GPU metric names, in this order:
// duty cycle, used memory bytes, total memory bytes.
var GPU_METRIC_LIST = []string{
	"nvidia_gpu_duty_cycle",
	"nvidia_gpu_memory_used_bytes",
	"nvidia_gpu_memory_total_bytes",
}

View Source

// NodeTypeSlice lists four node types with the alias and one-letter shorthand
// of each, in the order: normal, GPU exclusive, GPU topology, GPU share.
var NodeTypeSlice = []NodeTypeInfo{
	{Name: NormalNode, Alias: "none", Shorthand: "n"},
	{Name: GPUExclusiveNode, Alias: "exclusive", Shorthand: "e"},
	{Name: GPUTopologyNode, Alias: "topology", Shorthand: "t"},
	{Name: GPUShareNode, Alias: "share", Shorthand: "s"},
}

View Source

// SUPPORT_PROMETHEUS_SERVERS is the table of known Prometheus server
// descriptions, in the order "arms-prometheus-admin", "default", "default-old".
// Every entry uses protocol "http", path "api/v1/query" and its own copy of the
// same three GPU metric names; the entries differ in service label and port.
// The Service field is left unset (nil) in every entry.
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{
	{
		Name:          "arms-prometheus-admin",
		ServiceLabels: "kubernetes.io/service-name=prometheus-admin",
		Protocol:      "http",
		Port:          "9335",
		Path:          "api/v1/query",
		MetricList:    []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"},
	},
	{
		Name:          "default",
		ServiceLabels: "kubernetes.io/service-name=prometheus-server",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList:    []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"},
	},
	{
		Name:          "default-old",
		ServiceLabels: "kubernetes.io/name=Prometheus",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList:    []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"},
	},
}

View Source

// ServingTypeMap maps every serving job type to its descriptor
// (the type itself, a display alias and a shorthand).
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{
	CustomServingJob: {Name: CustomServingJob, Alias: "Custom", Shorthand: "custom"},
	KFServingJob:     {Name: KFServingJob, Alias: "KFServing", Shorthand: "kf"},
	TFServingJob:     {Name: TFServingJob, Alias: "Tensorflow", Shorthand: "tf"},
	TRTServingJob:    {Name: TRTServingJob, Alias: "Tensorrt", Shorthand: "trt"},
	SeldonServingJob: {Name: SeldonServingJob, Alias: "Seldon", Shorthand: "seldon"},
}

ServingTypeMap collects the serving job types and their aliases.

View Source

// TrainingTypeMap maps every training job type to its descriptor
// (the type itself, a display alias and a shorthand).
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{
	TFTrainingJob:      {Name: TFTrainingJob, Alias: "Tensorflow", Shorthand: "tf"},
	MPITrainingJob:     {Name: MPITrainingJob, Alias: "MPI", Shorthand: "mpi"},
	PytorchTrainingJob: {Name: PytorchTrainingJob, Alias: "Pytorch", Shorthand: "py"},
	HorovodTrainingJob: {Name: HorovodTrainingJob, Alias: "Horovod", Shorthand: "horovod"},
	VolcanoTrainingJob: {Name: VolcanoTrainingJob, Alias: "Volcano", Shorthand: "volcano"},
	ETTrainingJob:      {Name: ETTrainingJob, Alias: "ElasticTraining", Shorthand: "et"},
	SparkTrainingJob:   {Name: SparkTrainingJob, Alias: "Spark", Shorthand: "spark"},
}

TrainingTypeMap collects the training job types and their aliases.

Functions ¶

This section is empty.

Types ¶

type AdvancedGpuMetric ¶

// AdvancedGpuMetric describes one GPU: its id and UUID, three gauge values
// (duty cycle, used memory, total memory) and a list of pod names.
// Note that GpuMemoryUsed and GpuMemoryTotal are serialized under the keys
// "usedGPUMemory" and "totalGPUMemory". Units are not stated in this file.
type AdvancedGpuMetric struct {
	Id             string  `json:"id" yaml:"id"`
	UUID           string  `json:"uuid" yaml:"uuid"`
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	// PodNames lists pods; each entry combines namespace and pod name, like 'namespace/pod_name'.
	PodNames []string `json:"podNames" yaml:"podNames"`
}

type AllNodeInfo ¶

// AllNodeInfo maps a string key to a list of untyped values.
// NOTE(review): presumably the key names a node type and the values are the
// matching *NodeInfo structs of this package — confirm against the producer.
type AllNodeInfo map[string][]interface{}

type ArenaClientArgs ¶

// ArenaClientArgs groups five client settings: Kubeconfig, Namespace,
// ArenaNamespace, IsDaemonMode and LogLevel. The fields carry no struct tags.
// NOTE(review): Kubeconfig is presumably a path to a kubeconfig file and
// LogLevel presumably one of the LogLevel constant values — confirm with the
// code that consumes this struct.
type ArenaClientArgs struct {
	Kubeconfig     string
	Namespace      string
	ArenaNamespace string
	IsDaemonMode   bool
	LogLevel       string
}

type CommonCronArgs ¶ added in v0.8.2

// CommonCronArgs holds the cron-related options of a job submission. The
// trailing comment on each field names the command line flag it matches.
type CommonCronArgs struct {
	// Schedule is the schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
	Schedule string `yaml:"schedule"` // --schedule

	// ConcurrencyPolicy specifies how to treat concurrent executions of a Job.
	// Valid values are:
	// - "Allow" (default): allows CronJobs to run concurrently;
	// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
	// - "Replace": cancels currently running job and replaces it with a new one
	// +optional
	ConcurrencyPolicy string `yaml:"concurrencyPolicy"` // --concurrency-policy

	// Suspend tells the controller to suspend subsequent executions; it does
	// not apply to already started executions.  Defaults to false.
	// +optional
	Suspend bool `yaml:"suspend"` // --suspend

	// Deadline is the timestamp until which the cron job can keep scheduling.
	// The expected string format is not stated here.
	Deadline string `yaml:"deadline"` // --deadline

	// HistoryLimit is the number of finished jobs to retain in the history.
	// NOTE: the field is a plain int (not a pointer), so an explicit zero
	// cannot be told apart from "not specified".
	// +optional
	HistoryLimit int `yaml:"historyLimit"` // --history-limit
}

type CommonGPUNodeInfo ¶

// CommonGPUNodeInfo holds the GPU counters of a node (total, allocated,
// unhealthy) and a list of per-GPU metrics. It is embedded inline by
// GPUExclusiveNodeInfo, GPUShareNodeInfo and GPUTopologyNodeInfo.
type CommonGPUNodeInfo struct {
	TotalGPUs     int                  `json:"totalGPUs" yaml:"totalGPUs"`
	AllocatedGPUs int                  `json:"allocatedGPUs" yaml:"allocatedGPUs"`
	UnhealthyGPUs int                  `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
	GPUMetrics    []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

type CommonNodeInfo ¶

// CommonNodeInfo holds the attributes shared by every node description:
// name, description, IP, status, role and node type. It is embedded inline by
// NormalNodeInfo, GPUExclusiveNodeInfo, GPUShareNodeInfo and GPUTopologyNodeInfo.
type CommonNodeInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Description string   `json:"description" yaml:"description"`
	IP          string   `json:"ip" yaml:"ip"`
	Status      string   `json:"status" yaml:"status"`
	Role        string   `json:"role" yaml:"role"`
	Type        NodeType `json:"type" yaml:"type"`
}

type CommonServingArgs ¶

// CommonServingArgs holds the options shared by the serving argument structs;
// it is embedded inline by CustomServingArgs, KFServingArgs and
// SeldonServingArgs. Trailing comments name the matching command line flag.
// Name and Version are written under the yaml keys "servingName" and
// "servingVersion"; Namespace and Type are tagged yaml:"-".
// NOTE(review): the unit of GPUMemory and the format of Tolerations entries
// are not stated here — confirm with the flag parsing code.
type CommonServingArgs struct {
	Name            string            `yaml:"servingName"`
	Version         string            `yaml:"servingVersion"`
	Namespace       string            `yaml:"-"`
	Type            ServingJobType    `yaml:"-"`
	Image           string            `yaml:"image"`
	ImagePullPolicy string            `yaml:"imagePullPolicy"` // --imagePullPolicy
	GPUCount        int               `yaml:"gpuCount"`        // --gpus
	GPUMemory       int               `yaml:"gpuMemory"`       // --gpumemory
	Cpu             string            `yaml:"cpu"`             // --cpu
	Memory          string            `yaml:"memory"`          // --memory
	Envs            map[string]string `yaml:"envs"`            // --envs
	Command         string            `yaml:"command"`         // --command
	Replicas        int               `yaml:"replicas"`        // --replicas
	EnableIstio     bool              `yaml:"enableIstio"`     // --enableIstio
	ExposeService   bool              `yaml:"exposeService"`   // --exposeService
	ModelDirs       map[string]string `yaml:"modelDirs"`
	HostVolumes     []DataDirVolume   `yaml:"hostVolumes"`   // --data-dir
	NodeSelectors   map[string]string `yaml:"nodeSelectors"` // --selector
	Tolerations     []string          `yaml:"tolerations"`   // --toleration
	Annotations     map[string]string `yaml:"annotations"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}

type CommonSubmitArgs ¶

// CommonSubmitArgs holds the options shared by the job submission argument
// structs. Fields tagged yaml:"-" (Name, Namespace, TrainingType, HelmOptions)
// are kept out of the YAML form.
type CommonSubmitArgs struct {

	// Name stores the job name, matches option --name
	Name string `yaml:"-"`

	// Namespace stores the namespace of the job, matches option --namespace
	Namespace string `yaml:"-"`

	// TrainingType stores the training job type
	TrainingType TrainingJobType `yaml:"-"`

	// NodeSelectors defines the node selectors, matches option --selector
	NodeSelectors map[string]string `yaml:"nodeSelectors"`

	// ConfigFiles stores the config files which exist on the client host
	// and maps them into the container, matches option --config-file.
	// NOTE(review): the meaning of the two map key levels is not stated here.
	ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"`

	// Tolerations defines the tolerations which tolerate node taints,
	// matches option --toleration
	Tolerations []string `yaml:"tolerations"`

	// Image stores the docker image of the job, matches option --image
	Image string `yaml:"image"`

	// GPUCount stores the number of GPUs the job needs, matches option --gpus
	GPUCount int `yaml:"gpuCount"`

	// Envs stores the environment variables of the job's container, matches option --env
	Envs map[string]string `yaml:"envs"`

	// WorkingDir stores the working directory of the job's container, matches option --working-dir
	WorkingDir string `yaml:"workingDir"`

	// Command stores the command of the job
	Command string `yaml:"command"`

	// Mode is used for horovod, matches option --sync-mode
	Mode string `yaml:"mode"`

	// WorkerCount stores the number of job workers, matches option --workers
	// (yaml key is "workers")
	WorkerCount int `yaml:"workers"`

	// Retry defines the retry times
	Retry int `yaml:"retry"`

	// DataSet stores the kubernetes pvc names
	DataSet map[string]string `yaml:"dataset"`

	// DataDirs stores the files (or directories) on the k8s node which will be mapped into containers,
	// matches option --data-dir
	DataDirs []DataDirVolume `yaml:"dataDirs"`

	// EnableRDMA enables rdma or not, matches option --rdma
	EnableRDMA bool `yaml:"enableRDMA"`

	// UseENI defines whether eni is used or not
	UseENI bool `yaml:"useENI"`

	// Annotations defines the pod annotations of the job, matches option --annotation
	Annotations map[string]string `yaml:"annotations"`

	// IsNonRoot tells whether the job runs as a non-root user
	// NOTE(review): inferred from the field name — confirm with the consumer.
	IsNonRoot bool `yaml:"isNonRoot"`

	// PodSecurityContext defines the pod security context
	PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"`

	// PriorityClassName defines the priority class
	PriorityClassName string `yaml:"priorityClassName"`

	// Conscheduling defines whether Conscheduling is used.
	// NOTE(review): unlike its neighbours this field has no yaml tag —
	// confirm which key the consumer expects.
	Conscheduling bool

	// PodGroupName stores the pod group name
	PodGroupName string `yaml:"podGroupName"`

	// PodGroupMinAvailable stores the pod group min available (kept as a string)
	PodGroupMinAvailable string `yaml:"podGroupMinAvailable"`

	// ImagePullSecrets stores the image pull secrets, matches option --image-pull-secrets
	ImagePullSecrets []string `yaml:"imagePullSecrets"`

	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`
}

CommonSubmitArgs defines the common parts of the submit arguments.

type ConcurrencyPolicy ¶ added in v0.8.2

// ConcurrencyPolicy is a string-typed enumeration; its values are the
// ConcurrencyAllow, ConcurrencyForbid and ConcurrencyReplace constants.
type ConcurrencyPolicy string

ConcurrencyPolicy describes how the job will be handled. Only one of the following concurrency policies may be specified. If none of the following policies is specified, the default one is ConcurrencyAllow ("Allow").

// Values of ConcurrencyPolicy:
//   - ConcurrencyAllow ("Allow"): concurrent runs are allowed;
//   - ConcurrencyForbid ("Forbid"): concurrent runs are forbidden, the next run
//     is skipped if the previous one has not finished yet;
//   - ConcurrencyReplace ("Replace"): the currently running job is cancelled
//     and replaced with a new one.
const (
	ConcurrencyAllow   ConcurrencyPolicy = "Allow"
	ConcurrencyForbid  ConcurrencyPolicy = "Forbid"
	ConcurrencyReplace ConcurrencyPolicy = "Replace"
)

type ConfigFileInfo ¶

// ConfigFileInfo describes one config file that is mounted into containers:
// the file on the host (HostFile), its name and path inside the container
// (ContainerFileName, ContainerFilePath) and a Key.
// NOTE(review): what Key identifies is not stated here — confirm with the
// code that fills CommonSubmitArgs.ConfigFiles.
type ConfigFileInfo struct {
	ContainerFileName string `yaml:"containerFileName"`
	HostFile          string `yaml:"hostFile"`
	Key               string `yaml:"key"`
	ContainerFilePath string `yaml:"containerFilePath"`
}

ConfigFileInfo defines the config files which will be mounted to containers

type CronHistoryInfo ¶ added in v0.8.2

// CronHistoryInfo is one entry of CronInfo.History. It identifies an object by
// name, namespace, group and kind and records its status, create time and
// finish time. All fields are strings; the time format is not stated here.
type CronHistoryInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Group      string `json:"group" yaml:"group"`
	Kind       string `json:"kind" yaml:"kind"`
	Status     string `json:"status" yaml:"status"`
	CreateTime string `json:"createTime" yaml:"createTime"`
	FinishTime string `json:"finishTime" yaml:"finishTime"`
}

type CronInfo ¶ added in v0.8.2

// CronInfo is the JSON/YAML description of a cron job: its identity, its
// schedule settings and the history of the jobs recorded for it.
type CronInfo struct {
	// Name is the name of the cron job
	Name string `json:"name" yaml:"name"`

	// Namespace is the namespace of the cron job
	Namespace string `json:"namespace" yaml:"namespace"`

	// Type is the job type, like TFJob, PyTorchJob
	Type string `json:"type" yaml:"type"`

	// Schedule is the schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
	Schedule string `json:"schedule" yaml:"schedule"`

	// ConcurrencyPolicy specifies how to treat concurrent executions of a Job.
	// Valid values are:
	// - "Allow" (default): allows CronJobs to run concurrently;
	// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
	// - "Replace": cancels currently running job and replaces it with a new one
	// +optional
	ConcurrencyPolicy string `json:"concurrencyPolicy" yaml:"concurrencyPolicy"` // --concurrency-policy

	// Suspend tells the controller to suspend subsequent executions; it does
	// not apply to already started executions.  Defaults to false.
	// +optional
	Suspend bool `json:"suspend" yaml:"suspend"` // --suspend

	// Deadline is the timestamp until which the cron job can keep scheduling.
	Deadline string `json:"deadline" yaml:"deadline"` // --deadline

	// HistoryLimit is the number of finished jobs to retain in the history.
	// NOTE: the field is a plain int64 (not a pointer), so an explicit zero
	// cannot be told apart from "not specified".
	// +optional
	HistoryLimit int64 `json:"historyLimit" yaml:"historyLimit"` // --history-limit

	// LastScheduleTime tells when the job was last successfully scheduled.
	// +optional
	LastScheduleTime string `json:"lastScheduleTime" yaml:"lastScheduleTime"`

	// CreationTimestamp stores the creation timestamp of the job
	CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"`

	// History lists the recorded history entries; serialized as "cronHistory"
	History []CronHistoryInfo `json:"cronHistory" yaml:"cronHistory"`
}

type CronTFJobArgs ¶ added in v0.8.2

// CronTFJobArgs combines the cron options with the TFJob submit options by
// embedding both structs. The embedded fields are tagged with the yaml names
// "cron" and "tfjob" rather than ",inline".
type CronTFJobArgs struct {
	CommonCronArgs  `yaml:"cron"`
	SubmitTFJobArgs `yaml:"tfjob"`
}

type CronType ¶ added in v0.8.2

// CronType is a string-typed enumeration of the job types supported by cron;
// CronTFTrainingJob is its only value declared here.
type CronType string

CronType defines the supporting job type

// Values of CronType.
const (
	// CronTFTrainingJob defines the cron tfjob type, "tfjob"
	CronTFTrainingJob CronType = "tfjob"
)

type CustomServingArgs ¶

// CustomServingArgs holds the options of a custom serving job: two ports plus
// the inlined CommonServingArgs. RestfulPort is written under the yaml key
// "restApiPort". Trailing comments name the matching command line flag.
type CustomServingArgs struct {
	Port              int `yaml:"port"`        // --port
	RestfulPort       int `yaml:"restApiPort"` // --restfulPort
	CommonServingArgs `yaml:",inline"`
}

type DataDirVolume ¶

// DataDirVolume describes one volume by its path on the host, its path inside
// the container and its name. It is used by CommonSubmitArgs.DataDirs and
// CommonServingArgs.HostVolumes.
type DataDirVolume struct {
	// HostPath defines the path on the host
	HostPath string `yaml:"hostPath"`
	// ContainerPath defines the path inside the container
	ContainerPath string `yaml:"containerPath"`
	// Name defines the volume name
	Name string `yaml:"name"`
}

DataDirVolume defines the volume of kubernetes

type Destination ¶

// Destination embeds *istiov1alpha3.Destination and declares its own Port
// field of the package-local *PortSelector type, serialized as "port".
// NOTE(review): presumably the local Port field exists to replace the embedded
// type's port representation in JSON — confirm against the istio type.
type Destination struct {
	*istiov1alpha3.Destination
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD ¶

// DestinationRuleCRD is the object form of an istio destination rule: Kind and
// APIVersion, the embedded object metadata (serialized as "metadata") and an
// istiov1alpha3.DestinationRule as "spec".
// NOTE(review): the protobuf tags reuse field numbers 1 and 2 for both the
// Kind/APIVersion pair and the metadata/spec pair — confirm this is harmless
// for the way the struct is serialized.
type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight ¶

// DestinationWeight pairs a Destination with an int32 weight. It is the
// element type of HTTPRoute.Route. Weight has no omitempty option, so a zero
// weight is still written to JSON.
type DestinationWeight struct {
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type Driver ¶

// Driver holds a CPU request (int) and a memory request (string). Note that
// the yaml keys are capitalized ("CPURequest", "MemoryRequest").
// NOTE(review): presumably this describes the driver of a Spark job — the
// struct that uses it is not visible here; confirm.
type Driver struct {
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type Endpoint ¶

// Endpoint describes one endpoint by name, port and node port. It is the
// element type of ServingJobInfo.Endpoints.
type Endpoint struct {
	// Name specifies the endpoint name
	Name string `json:"name" yaml:"name"`
	// Port specifies the endpoint port
	Port int `json:"port" yaml:"port"`
	// NodePort specifies the node port
	NodePort int `json:"nodePort" yaml:"nodePort"`
}

type Executor ¶

// Executor holds a replica count, a CPU request (int) and a memory request
// (string). Note that the yaml keys are capitalized ("Replicas", "CPURequest",
// "MemoryRequest").
// NOTE(review): presumably this describes the executors of a Spark job — the
// struct that uses it is not visible here; confirm.
type Executor struct {
	Replicas      int    `yaml:"Replicas"`
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type FormatStyle ¶

// FormatStyle is a string-typed enumeration of output formats; its values are
// WideFormat, JsonFormat, YamlFormat and UnknownFormat.
type FormatStyle string

FormatStyle defines the format of the output; it is only used in cmd.

// Values of FormatStyle.
const (
	// WideFormat defines the wide format
	WideFormat FormatStyle = "wide"
	// JsonFormat defines the json format
	JsonFormat FormatStyle = "json"
	// YamlFormat defines the yaml format
	YamlFormat FormatStyle = "yaml"
	// UnknownFormat defines the unknown format
	UnknownFormat FormatStyle = "unknown"
)

type GPUDeviceInfo ¶

// GPUDeviceInfo describes one GPU device: its ID, its total, allocated and
// used GPU memory, and its duty cycle. Units are not stated in this file.
type GPUDeviceInfo struct {
	ID                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	UsedGPUMemory      float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	DutyCycle          float64 `json:"dutyCycle" yaml:"dutyCycle"`
}

type GPUExclusiveNodeInfo ¶

// GPUExclusiveNodeInfo describes a GPU-exclusive node: the pods on it
// (serialized as "instances") plus the embedded CommonNodeInfo and
// CommonGPUNodeInfo, both tagged ",inline".
type GPUExclusiveNodeInfo struct {
	PodInfos          []GPUExclusivePodInfo `json:"instances" yaml:"instances"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
}

type GPUExclusivePodInfo ¶

// GPUExclusivePodInfo describes one pod listed in GPUExclusiveNodeInfo: name,
// namespace, status and the requested GPU count (serialized as "requestGPUs").
type GPUExclusivePodInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Status     string `json:"status" yaml:"status"`
	RequestGPU int    `json:"requestGPUs" yaml:"requestGPUs"`
}

type GPUShareNodeDevice ¶

// GPUShareNodeDevice describes one device listed in GPUShareNodeInfo.Devices:
// its id and its total and allocated GPU memory. Units are not stated here.
type GPUShareNodeDevice struct {
	Id                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
}

type GPUShareNodeInfo ¶

// GPUShareNodeInfo describes a GPU-share node: the pods on it (serialized as
// "instances"), the node-level total and allocated GPU memory, the per-device
// list, plus the embedded CommonGPUNodeInfo and CommonNodeInfo, both tagged
// ",inline". Units of the memory values are not stated here.
type GPUShareNodeInfo struct {
	PodInfos           []GPUSharePodInfo    `json:"instances" yaml:"instances"`
	TotalGPUMemory     float64              `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64              `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	Devices            []GPUShareNodeDevice `json:"devices" yaml:"devices"`
	CommonGPUNodeInfo  `yaml:",inline" json:",inline"`
	CommonNodeInfo     `yaml:",inline" json:",inline"`
}

type GPUSharePodInfo ¶

// GPUSharePodInfo describes one pod listed in GPUShareNodeInfo: name,
// namespace, status, the requested GPU memory (serialized as
// "requestGPUMemory") and an allocation map from string to int.
// NOTE(review): presumably Allocation maps a GPU device id to the memory
// allocated on it — confirm with the code that fills it.
type GPUSharePodInfo struct {
	Name          string         `json:"name" yaml:"name"`
	Namespace     string         `json:"namespace" yaml:"namespace"`
	Status        string         `json:"status" yaml:"status"`
	RequestMemory int            `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	Allocation    map[string]int `json:"allocation" yaml:"allocation"`
}

type GPUTopology ¶

// GPUTopology holds two two-dimensional matrices: a string link matrix and a
// float32 bandwidth matrix.
// NOTE(review): presumably both are indexed [gpu][gpu]; the link naming and
// the bandwidth unit are not stated here — confirm with the producer.
type GPUTopology struct {
	LinkMatrix      [][]string  `json:"linkMatrix" yaml:"linkMatrix"`
	BandwidthMatrix [][]float32 `json:"bandwidthMatrix" yaml:"bandwidthMatrix"`
}

type GPUTopologyNodeDevice ¶

// GPUTopologyNodeDevice describes one device listed in
// GPUTopologyNodeInfo.Devices: its id, a healthy flag and a status string.
type GPUTopologyNodeDevice struct {
	Id      string `json:"id" yaml:"id"`
	Healthy bool   `json:"healthy" yaml:"healthy"`
	Status  string `json:"status" yaml:"status"`
}

type GPUTopologyNodeInfo ¶

// GPUTopologyNodeInfo describes a GPU-topology node: the pods on it
// (serialized as "instances"), the GPU topology matrices, the per-device list,
// plus the embedded CommonGPUNodeInfo and CommonNodeInfo, both tagged
// ",inline".
//
// Devices is serialized as "devices" in both JSON and YAML. The tag used to
// repeat the yaml key and omit the json key, which made JSON fall back to the
// Go field name "Devices", unlike every sibling field and unlike
// GPUShareNodeInfo.Devices.
type GPUTopologyNodeInfo struct {
	PodInfos          []GPUTopologyPodInfo `json:"instances" yaml:"instances"`
	GPUTopology       GPUTopology          `json:"gpuTopology" yaml:"gpuTopology"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	Devices           []GPUTopologyNodeDevice `json:"devices" yaml:"devices"`
}

type GPUTopologyPodInfo ¶

// GPUTopologyPodInfo describes one pod listed in GPUTopologyNodeInfo: name,
// namespace, status, the requested GPU count (serialized as "requestGPUs") and
// two string lists, Allocation and VisibleGPUs.
// NOTE(review): presumably both lists hold GPU ids — confirm with the producer.
type GPUTopologyPodInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Namespace   string   `json:"namespace" yaml:"namespace"`
	Status      string   `json:"status" yaml:"status"`
	RequestGPU  int      `json:"requestGPUs" yaml:"requestGPUs"`
	Allocation  []string `json:"allocation" yaml:"allocation"`
	VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"`
}

type GpuMetric ¶

// GpuMetric holds three gauge values of a GPU: duty cycle, used memory and
// total memory. It is the value type of PodGpuMetric. GpuMemoryUsed and
// GpuMemoryTotal are serialized under the keys "usedGPUMemory" and
// "totalGPUMemory". Units are not stated in this file.
type GpuMetric struct {
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
}

type GpuMetricInfo ¶

// GpuMetricInfo is a flat record of one metric sample: the metric name, its
// value (kept as a string), a time, and the pod, namespace, container, node
// and GPU identifiers it belongs to. The fields carry no struct tags.
// NOTE(review): Time is presumably a Unix timestamp in seconds as returned by
// Prometheus — confirm with the code that fills it.
type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

type HTTPMatchRequest ¶

// HTTPMatchRequest embeds *istiov1alpha3.HTTPMatchRequest and declares its own
// Uri field of the package-local *StringMatchPrefix type, serialized as "uri".
// NOTE(review): presumably the local Uri field exists to replace the embedded
// type's uri representation in JSON — confirm against the istio type.
type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute ¶

// HTTPRoute embeds *istiov1alpha3.HTTPRoute and declares its own Match and
// Route fields using the package-local HTTPMatchRequest and DestinationWeight
// types, serialized as "match" and "route".
// NOTE(review): presumably the local fields exist to replace the embedded
// type's representation in JSON — confirm against the istio type.
type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type JobConditionType ¶ added in v0.8.2

// JobConditionType is a string-typed enumeration of job conditions; its values
// are JobCreated, JobRunning, JobRestarting, JobSucceeded and JobFailed.
type JobConditionType string

JobConditionType defines all kinds of types of JobStatus.

// Values of JobConditionType.
const (
	// JobCreated means the job has been accepted by the system,
	// but one or more of the pods/services has not been started.
	// This includes the time before pods are scheduled and launched.
	JobCreated JobConditionType = "Created"

	// JobRunning means all sub-resources (e.g. services/pods) of this job
	// have been successfully scheduled and launched.
	// The training is running without error.
	JobRunning JobConditionType = "Running"

	// JobRestarting means one or more sub-resources (e.g. services/pods) of this job
	// reached phase failed but may be restarted according to its restart policy,
	// which is specified by the user in v1.PodTemplateSpec.
	// The training is freezing/pending.
	JobRestarting JobConditionType = "Restarting"

	// JobSucceeded means all sub-resources (e.g. services/pods) of this job
	// have terminated in success.
	// The training is complete without error.
	JobSucceeded JobConditionType = "Succeeded"

	// JobFailed means one or more sub-resources (e.g. services/pods) of this job
	// reached phase failed with no restarting.
	// The training has failed its execution.
	JobFailed JobConditionType = "Failed"
)

type JobGpuMetric ¶

// JobGpuMetric maps a string key to a PodGpuMetric.
// NOTE(review): presumably the key is a pod name — confirm with the producer.
type JobGpuMetric map[string]PodGpuMetric

type KFServingArgs ¶

// KFServingArgs holds the options of a KFServing job: port, model type,
// canary percent and storage URI, plus the inlined CommonServingArgs.
// Trailing comments name the matching command line flag; note that
// CanaryPercent matches --canaryTrafficPercent.
type KFServingArgs struct {
	Port              int    `yaml:"port"`          // --port
	ModelType         string `yaml:"modelType"`     // --modelType
	CanaryPercent     int    `yaml:"canaryPercent"` // --canaryTrafficPercent
	StorageUri        string `yaml:"storageUri"`    // --storageUri
	CommonServingArgs `yaml:",inline"`
}

type LimitedPodSecurityContext ¶

// LimitedPodSecurityContext holds four pod security context settings:
// RunAsUser, RunAsNonRoot, RunAsGroup and SupplementalGroups. The fields are
// plain values rather than pointers, so unset and zero are not distinguished.
// It is used by CommonSubmitArgs.PodSecurityContext.
type LimitedPodSecurityContext struct {
	RunAsUser          int64   `yaml:"runAsUser"`
	RunAsNonRoot       bool    `yaml:"runAsNonRoot"`
	RunAsGroup         int64   `yaml:"runAsGroup"`
	SupplementalGroups []int64 `yaml:"supplementalGroups"`
}

LimitedPodSecurityContext defines the Kubernetes pod security context.

type LogArgs ¶

// LogArgs groups the options of a log request: which job, instance and
// container to read from, whether to follow, optional since/tail limits,
// whether to include timestamps, retry settings and the destination writer.
// SinceSeconds, SinceTime and Tail are pointers, so nil can stand for "not set".
// NOTE(review): the field names mirror the Kubernetes pod log options;
// the exact semantics of RetryCnt and RetryTimeout and who closes
// WriterCloser are decided by the consumer — confirm there.
type LogArgs struct {
	Namespace     string
	JobName       string
	InstanceName  string
	ContainerName string
	Follow        bool
	SinceSeconds  *int64
	SinceTime     *metav1.Time
	Tail          *int64
	Timestamps    bool
	RetryCnt      int
	RetryTimeout  time.Duration
	WriterCloser  io.WriteCloser
}

type LogLevel ¶

// LogLevel is a string-typed enumeration of log levels; its values are
// LogDebug, LogInfo, LogWarning, LogError and LogUnknown.
type LogLevel string

// Values of LogLevel. Note that LogWarning has the value "warn", not "warning".
const (
	LogDebug   LogLevel = "debug"
	LogInfo    LogLevel = "info"
	LogWarning LogLevel = "warn"
	LogError   LogLevel = "error"
	LogUnknown LogLevel = "unknown"
)

type NodeGpuMetric ¶

// NodeGpuMetric maps a device id to the AdvancedGpuMetric of that device.
type NodeGpuMetric map[string]*AdvancedGpuMetric

The map key is the device ID.

type NodeType ¶

// NodeType is a string-typed enumeration of node types; its values are
// GPUShareNode, GPUExclusiveNode, GPUTopologyNode, NormalNode, UnknownNode
// and AllKnownNode.
type NodeType string

// Values of NodeType. AllKnownNode is the empty string.
// NOTE(review): presumably AllKnownNode acts as a "no filter" value that
// selects every known node type — confirm with the code that consumes it.
const (
	GPUShareNode     NodeType = "GPUShare"
	GPUExclusiveNode NodeType = "GPUExclusive"
	GPUTopologyNode  NodeType = "GPUTopology"
	NormalNode       NodeType = "Normal"
	UnknownNode      NodeType = "unknown"
	AllKnownNode     NodeType = ""
)

type NodeTypeInfo ¶

// NodeTypeInfo pairs a NodeType with an alias and a shorthand string. It is
// the element type of NodeTypeSlice, where the aliases are words such as
// "exclusive" and the shorthands are single letters such as "e".
type NodeTypeInfo struct {
	Name      NodeType
	Alias     string
	Shorthand string
}

type NormalNodeInfo ¶

// NormalNodeInfo describes a node using only the attributes of the embedded
// CommonNodeInfo, which is tagged ",inline"; it adds no fields of its own.
type NormalNodeInfo struct {
	CommonNodeInfo `yaml:",inline" json:",inline"`
}

type PodGpuMetric ¶

// PodGpuMetric maps a string key to a *GpuMetric. It is the value type of
// JobGpuMetric.
// NOTE(review): presumably the key is a GPU id — confirm with the producer.
type PodGpuMetric map[string]*GpuMetric

type PortSelector ¶

// PortSelector embeds *istiov1alpha3.PortSelector and declares its own uint32
// Number field, serialized as "number" and omitted from JSON when zero.
// NOTE(review): presumably the local field flattens the embedded type's oneof
// port into a plain number for JSON — confirm against the istio type.
type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject ¶

// PreprocesObject (the spelling is part of the exported name) groups a service
// name and a namespace with a DestinationRuleCRD and a VirtualServiceCRD.
// The fields carry no struct tags.
type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrometheusMetric ¶

// PrometheusMetric is the top-level shape of a decoded Prometheus response: a
// "status" string and a "data" object.
// NOTE(review): encoding/json defines no "inline" option, so the ",inline" in
// the Status tag appears to have no effect — confirm before removing it.
type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData ¶

// PrometheusMetricData is the "data" part of PrometheusMetric: a list of
// results ("result") and the result type string ("resultType").
type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult ¶

// PrometheusMetricResult is one element of PrometheusMetricData.Result: the
// label set of the series ("metric") and a list of untyped values ("value").
// NOTE(review): presumably Value holds the [timestamp, value] pair of a
// Prometheus sample — confirm against the query API response format.
type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue ¶

// PrometheusMetricValue is an untyped value (empty interface); it is the
// element type of PrometheusMetricResult.Value.
type PrometheusMetricValue interface{}

type PrometheusServer ¶

// PrometheusServer describes how to reach one Prometheus server: a name, the
// service label in key=value form, protocol, port (kept as a string), query
// path and the list of metric names. It is the element type of
// SUPPORT_PROMETHEUS_SERVERS, whose entries leave Service unset.
// NOTE(review): presumably Service is filled in once the matching Kubernetes
// service has been found — confirm with the discovery code.
type PrometheusServer struct {
	Name          string
	ServiceLabels string
	Protocol      string
	Port          string
	Path          string
	MetricList    []string
	Service       *v1.Service
}

PrometheusServer is used to define prometheus server

type Runtime ¶

// Runtime is a two-method interface: it reports a chart name and whether the
// runtime is the default one. Implementations are not visible in this file.
type Runtime interface {
	// GetChartName returns the chart name
	GetChartName() string
	// IsDefault reports whether the runtime is the default one
	IsDefault() bool
}

type ScaleETJobArgs ¶

// ScaleETJobArgs holds the options shared by the ET job scale-in and
// scale-out argument structs; it is embedded inline by ScaleInETJobArgs and
// ScaleOutETJobArgs. JobType and Namespace are tagged yaml:"-".
type ScaleETJobArgs struct {
	//--name string     required, et job name (yaml key is "etName")
	Name string `yaml:"etName"`
	// JobType stores the training job type
	JobType TrainingJobType `yaml:"-"`
	// Namespace stores the namespace of the job, matches option --namespace
	Namespace string `yaml:"-"`
	//--timeout int     timeout of the callback scaler script (unit not stated here).
	Timeout int `yaml:"timeout"`
	//--retry int       retry times.
	Retry int `yaml:"retry"`
	//--count int       the number of workers you want to add or delete.
	Count int `yaml:"count"`
	//--script string        script of scaling.
	Script string `yaml:"script"`
	//-e, --env stringArray      the environment variables
	Envs map[string]string `yaml:"envs"`
}

type ScaleInETJobArgs ¶

// ScaleInETJobArgs holds the options of an ET job scale-in; it adds nothing
// to the inlined ScaleETJobArgs.
type ScaleInETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type ScaleOutETJobArgs ¶

// ScaleOutETJobArgs holds the options of an ET job scale-out; it adds nothing
// to the inlined ScaleETJobArgs.
type ScaleOutETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type SeldonServingArgs ¶ added in v0.8.0

// SeldonServingArgs holds the options of a Seldon serving job: implementation
// and model URI, plus the inlined CommonServingArgs. Trailing comments name
// the matching command line flag.
type SeldonServingArgs struct {
	Implementation    string `yaml:"implementation"` // --implementation
	ModelUri          string `yaml:"modelUri"`       // --modelUri
	CommonServingArgs `yaml:",inline"`
}

type ServingInstance ¶

// ServingInstance describes one instance of a serving job. It is the element
// type of ServingJobInfo.Instances.
type ServingInstance struct {
	// Name gives the instance name
	Name string `json:"name" yaml:"name"`
	// Status gives the instance status
	Status string `json:"status" yaml:"status"`
	// Age gives the instance age (as a string; the format is not stated here)
	Age string `json:"age" yaml:"age"`
	// ReadyContainer represents the count of ready containers
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer represents the count of total containers
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount represents the count of instance restarts
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP specifies the IP of the node hosting the instance
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName gives the node name
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP gives the instance ip
	IP string `json:"ip" yaml:"ip"`
	// RequestGPU gives the requested gpus
	RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory gives the requested gpu memory
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
}

type ServingJobInfo ¶

// ServingJobInfo holds the information displayed for a serving job. Each
// field uses the same key for its JSON and YAML tags.
type ServingJobInfo struct {
	// Name specifies serving job name
	Name string `json:"name" yaml:"name"`
	// Namespace specifies serving job namespace
	Namespace string `json:"namespace" yaml:"namespace"`
	// Type specifies serving job type
	Type string `json:"type" yaml:"type"`
	// Version specifies serving job version
	Version string `json:"version" yaml:"version"`
	// Age specifies the serving job age
	Age string `json:"age" yaml:"age"`
	// Desired specifies the desired instances
	Desired int `json:"desiredInstances" yaml:"desiredInstances"`
	// Available specifies the available instances
	Available int `json:"availableInstances" yaml:"availableInstances"`
	// Endpoints specifies the endpoints
	Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"`
	// IPAddress specifies the ip address (serialized under the key "ip")
	IPAddress string `json:"ip" yaml:"ip"`
	// Instances gives the information of each instance
	Instances []ServingInstance `json:"instances" yaml:"instances"`
	// RequestGPU specifies the requested gpus
	RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory specifies the requested gpu memory, only for gpushare
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// CreationTimestamp stores the creation timestamp of job
	// NOTE(review): the unit of this int64 is not visible here — confirm against the producer.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

ServingJobInfo displays serving job information

type ServingJobType ¶

// ServingJobType is the string-based name of a serving job type.
type ServingJobType string

ServingJobType defines the serving job type; the name must have the form shorthand + "-serving"

// Serving job type values. The concrete job types all end in "-serving";
// AllServingJob is the empty string and UnknownServingJob is "unknown".
const (
	// TFServingJob defines the tensorflow serving job
	TFServingJob ServingJobType = "tf-serving"
	// TRTServingJob defines the tensorrt serving job
	TRTServingJob ServingJobType = "trt-serving"
	// KFServingJob defines the kfserving job
	KFServingJob ServingJobType = "kf-serving"
	// SeldonServingJob defines the seldon core job
	SeldonServingJob ServingJobType = "seldon-serving"
	// CustomServingJob defines the custom serving job
	CustomServingJob ServingJobType = "custom-serving"
	// AllServingJob represents all serving job types (the empty string)
	AllServingJob ServingJobType = ""
	// UnknownServingJob defines the unknown serving job
	UnknownServingJob ServingJobType = "unknown"
)

type ServingTypeInfo ¶

// ServingTypeInfo groups a serving job type with two string labels for it.
type ServingTypeInfo struct {
	// Name is the serving job type value
	Name      ServingJobType
	// Alias is an alternative label for the type
	// NOTE(review): where Alias is shown or accepted is not visible here — confirm.
	Alias     string
	// Shorthand is the short form of the type name
	// NOTE(review): presumably the prefix before "-serving" in Name — confirm.
	Shorthand string
}

type ServingVersionWeight ¶

// ServingVersionWeight pairs a version string with an integer weight.
// TrafficRouterSplitArgs holds a slice of these in its VersionWeights field.
type ServingVersionWeight struct {
	// Version is the version identifier
	Version string
	// Weight is the weight assigned to Version
	// NOTE(review): valid range and unit (e.g. percent) are not visible here — confirm.
	Weight  int
}

type StringMatchPrefix ¶

// StringMatchPrefix carries a single prefix string. Its tags mark the field
// as protobuf field 2 (part of a oneof) and as the optional JSON key "prefix".
type StringMatchPrefix struct {
	// Prefix is the prefix value; omitted from JSON when empty
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type SubmitETJobArgs ¶

// SubmitETJobArgs holds the arguments for submitting an ET job: cpu and
// memory values, the maxWorkers/minWorkers bounds, and three embedded
// argument groups that carry the YAML "inline" option.
type SubmitETJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`
	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`
	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`
	MaxWorkers         int `yaml:"maxWorkers"`
	MinWorkers         int `yaml:"minWorkers"`
}

type SubmitHorovodJobArgs ¶

// SubmitHorovodJobArgs holds the arguments for submitting a Horovod job: an
// SSH port, cpu and memory values, and three embedded argument groups that
// carry the YAML "inline" option.
type SubmitHorovodJobArgs struct {
	SSHPort int    `yaml:"sshPort"`
	Cpu     string `yaml:"cpu"`    // --cpu
	Memory  string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`
}

type SubmitMPIJobArgs ¶

// SubmitMPIJobArgs holds the arguments for submitting an MPI job: cpu and
// memory values, the GPU topology settings, and three embedded argument
// groups that carry the YAML "inline" option.
type SubmitMPIJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// enable gpu topology scheduling
	// NOTE(review): the expected format of GPUTopologyReplica (a string) is not visible here — confirm.
	GPUTopology        bool   `yaml:"gputopology"`
	GPUTopologyReplica string `yaml:"gputopologyreplica"`
}

type SubmitPyTorchJobArgs ¶

// SubmitPyTorchJobArgs holds the arguments for submitting a PyTorch job: cpu
// and memory values, a clean-pod policy, and three embedded argument groups
// that carry the YAML "inline" option.
type SubmitPyTorchJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// CleanPodPolicy stores the cleaning pod policy (clean-task-policy)
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
}

type SubmitSparkJobArgs ¶

// SubmitSparkJobArgs holds the arguments for submitting a Spark job.
//
// Name, Namespace and TrainingType are tagged yaml:"-". The remaining fields
// use capitalized YAML keys ("Image", "MainClass", ...), unlike the camelCase
// keys of the other Submit*Args types in this file.
type SubmitSparkJobArgs struct {
	Name         string          `yaml:"-"`
	Namespace    string          `yaml:"-"`
	TrainingType TrainingJobType `yaml:"-"`
	Image        string          `yaml:"Image"`
	MainClass    string          `yaml:"MainClass"`
	Jar          string          `yaml:"Jar"`
	Executor     *Executor       `yaml:"Executor"`
	Driver       *Driver         `yaml:"Driver"`
}

type SubmitSyncCodeArgs ¶

// SubmitSyncCodeArgs stores the options for syncing source code. SyncImage
// and SyncGitProjectName are tagged omitempty; SyncMode and SyncSource are not.
type SubmitSyncCodeArgs struct {
	SyncMode   string `yaml:"syncMode"`            // --syncMode: rsync, hdfs, git
	SyncSource string `yaml:"syncSource"`          // --syncSource
	SyncImage  string `yaml:"syncImage,omitempty"` // --syncImage
	// syncGitProjectName
	SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"` // NOTE(review): originally annotated "--syncImage", likely a copy-paste; confirm the real flag name
}

type SubmitTFJobArgs ¶

// SubmitTFJobArgs holds the arguments for submitting a TFJob: per-role
// (worker, ps, chief, evaluator) counts, ports, images and resources, plus
// three embedded argument groups that carry the YAML "inline" option and an
// embedded TFRuntime that is tagged yaml:"-".
//
// Port has no struct tag, and UseChief/UseEvaluator set only omitempty
// without a key name, so their YAML keys are whatever the YAML library
// derives from the field names.
type SubmitTFJobArgs struct {
	// TFNodeSelectors assigns tfjob node selectors
	// NOTE(review): presumably keyed by role name, then selector key/value — confirm.
	TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"`
	// Port defines the default port if WorkerPort and PSPort are not set
	Port int
	// WorkerImage assigns worker image,match option --worker-image
	WorkerImage string `yaml:"workerImage"`
	// WorkerPort stores worker port,match option --work-port
	WorkerPort int `yaml:"workerPort"`
	// PSPort stores the ps port,match option --ps-port
	PSPort int `yaml:"psPort"`
	// PSCount stores the ps count,--ps-count
	PSCount int `yaml:"ps"`
	// PSImage stores the ps image,--ps-image
	PSImage string `yaml:"psImage"`
	// WorkerCpu stores the cpu of job worker,match option --worker-cpu
	WorkerCpu string `yaml:"workerCPU"`
	//WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector
	// WorkerMemory stores worker memory,match option --worker-memory
	WorkerMemory string `yaml:"workerMemory"`
	// PSCpu stores ps cpu,match option --ps-cpu
	PSCpu string `yaml:"psCPU"`
	// PSGpu stores ps gpu,match option --ps-gpus
	PSGpu int `yaml:"psGPU"` // --ps-gpus
	// PSMemory stores the ps memory,match option --ps-memory
	PSMemory string `yaml:"psMemory"`
	// CleanPodPolicy stores the cleaning pod policy,match option --clean-task-policy
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
	// UseChief stores whether a chief is used,match option --chief
	UseChief bool `yaml:",omitempty"` // --chief
	// ChiefCount stores the chief count of job,match option --chief-count
	ChiefCount int `yaml:"chief"`
	// UseEvaluator is used to enable evaluator or not,match option --evaluator
	UseEvaluator bool `yaml:",omitempty"`
	// ChiefPort stores the chief port,match option --chief-port
	ChiefPort int `yaml:"chiefPort"`
	//ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector
	// ChiefCpu stores the chief pod cpu,match option --chief-cpu
	ChiefCpu string `yaml:"chiefCPU"`
	// ChiefMemory stores the chief pod memory,match option --chief-memory
	ChiefMemory string `yaml:"chiefMemory"`
	// EvaluatorCpu stores the evaluator pod cpu,match option --evaluator-cpu
	EvaluatorCpu string `yaml:"evaluatorCPU"`
	//EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector
	// EvaluatorMemory stores the evaluator pod memory,match option --evaluator-memory
	EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory
	// EvaluatorCount stores the evaluator pod count,match option --evaluator-count
	EvaluatorCount int `yaml:"evaluator"`
	// HasGangScheduler determines if it has gang scheduler
	HasGangScheduler bool `yaml:"hasGangScheduler"`
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`

	// TFRuntime stores the runtime; tagged "-" for YAML
	TFRuntime `yaml:"-"`
}

type SubmitTensorboardArgs ¶

// SubmitTensorboardArgs is used to store tensorboard information. The
// trailing comments name the matching command-line flag where one is given.
type SubmitTensorboardArgs struct {
	UseTensorboard   bool   `yaml:"useTensorboard"`   // --tensorboard
	TensorboardImage string `yaml:"tensorboardImage"` // --tensorboardImage
	TrainingLogdir   string `yaml:"trainingLogdir"`   // --logdir
	HostLogPath      string `yaml:"hostLogPath"`
	IsLocalLogging   bool   `yaml:"isLocalLogging"`
}

SubmitTensorboardArgs is used to store tensorboard information

type SubmitVolcanoJobArgs ¶

// SubmitVolcanoJobArgs holds the arguments for submitting a Volcano job.
// Name, Namespace, TrainingType and Command carry no struct tags; the
// remaining fields have explicit camelCase YAML keys.
type SubmitVolcanoJobArgs struct {
	// Name stores the job name
	Name string
	// Namespace stores the namespace of job
	Namespace string
	// TrainingType is used to accept job type
	TrainingType TrainingJobType
	// Command defines the job command
	Command string
	// MinAvailable is the minimum number of available pods to run for this Job
	MinAvailable int `yaml:"minAvailable"`
	// Queue specifies the queue that will be used in the scheduler; the
	// "default" queue is used if this is left empty.
	Queue string `yaml:"queue"`
	// SchedulerName is the default value of `tasks.template.spec.schedulerName`.
	SchedulerName string `yaml:"schedulerName"`
	// TaskName specifies the name of task
	TaskName string `yaml:"taskName"`
	// TaskImages specifies the task images
	TaskImages []string `yaml:"taskImages"`
	// TaskReplicas specifies the replicas of this Task in Job
	TaskReplicas int `yaml:"taskReplicas"`
	// TaskCPU specifies the cpu resource required for each replica of Task in Job. default is 250m
	TaskCPU string `yaml:"taskCPU"`
	// TaskMemory specifies the memory resource required for each replica of Task in Job. default is 128Mi
	TaskMemory string `yaml:"taskMemory"`
	// TaskPort specifies the task port
	TaskPort int `yaml:"taskPort"`
}

type TFRuntime ¶

// TFRuntime is a customized runtime for tf training. It extends the embedded
// Runtime interface with two methods that receive the TFJob submit arguments.
type TFRuntime interface {
	// Check checks the tfjob args and returns a non-nil error on failure
	Check(tf *SubmitTFJobArgs) (err error)
	// Transform transforms the tfjob; it receives the args by pointer
	// NOTE(review): presumably mutates tf in place — confirm against implementations.
	Transform(tf *SubmitTFJobArgs) (err error)
	Runtime
}

Customized runtime for tf training

type TensorFlowServingArgs ¶

// TensorFlowServingArgs adds the TensorFlow-serving options to the embedded
// CommonServingArgs, which carries the YAML "inline" option. The trailing
// comments name the matching command-line flag. RestfulPort is serialized
// under the YAML key "restApiPort", which differs from its flag name.
type TensorFlowServingArgs struct {
	VersionPolicy          string `yaml:"versionPolicy"`   // --versionPolicy
	ModelConfigFile        string `yaml:"modelConfigFile"` // --modelConfigFile
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`
	ModelName              string `yaml:"modelName"`   // --modelName
	ModelPath              string `yaml:"modelPath"`   // --modelPath
	Port                   int    `yaml:"port"`        // --port
	RestfulPort            int    `yaml:"restApiPort"` // --restfulPort
	CommonServingArgs      `yaml:",inline"`
}

type TensorRTServingArgs ¶

// TensorRTServingArgs adds the TensorRT-serving options (model store, the
// metrics/http/grpc ports and a metrics switch) to the embedded
// CommonServingArgs, which carries the YAML "inline" option. The trailing
// comments name the matching command-line flag.
type TensorRTServingArgs struct {
	ModelStore        string `yaml:"modelStore"`   // --modelStore
	MetricsPort       int    `yaml:"metricsPort"`  // --metricsPort
	HttpPort          int    `yaml:"httpPort"`     // --httpPort
	GrpcPort          int    `yaml:"grpcPort"`     // --grpcPort
	AllowMetrics      bool   `yaml:"allowMetrics"` // --allowMetrics
	CommonServingArgs `yaml:",inline"`
}

type TrafficRouterSplitArgs ¶

// TrafficRouterSplitArgs holds the arguments for splitting traffic between
// serving versions. The four string fields are tagged omitempty and annotated
// with their command-line flags; VersionWeights has no struct tag.
// NOTE(review): VersionWeights is presumably derived from Versions and
// Weights — confirm against the code that fills it.
type TrafficRouterSplitArgs struct {
	ServingName    string `yaml:"servingName,omitempty"` //--name
	Namespace      string `yaml:"namespace,omitempty"`   //--namespace
	Versions       string `yaml:"versions,omitempty"`    //--versions
	Weights        string `yaml:"weights,omitempty"`     //--weights
	VersionWeights []ServingVersionWeight
}

type TrainingJobInfo ¶

// TrainingJobInfo stores training job information. Each field uses the same
// key for its JSON and YAML tags.
type TrainingJobInfo struct {
	// Name is the name of the training job
	Name string `json:"name" yaml:"name"`
	// Namespace is the namespace of the training job
	Namespace string `json:"namespace" yaml:"namespace"`
	// Duration is the time of the training job, as a string
	Duration string `json:"duration" yaml:"duration"`
	// Status is the status of the training job
	Status TrainingJobStatus `json:"status" yaml:"status"`

	// Trainer is the training type of the training job
	Trainer TrainingJobType `json:"trainer" yaml:"trainer"`
	// Tensorboard is the tensorboard of the training job
	Tensorboard string `json:"tensorboard" yaml:"tensorboard"`

	// ChiefName is the name of the chief instance
	ChiefName string `json:"chiefName" yaml:"chiefName"`

	// Instances are the instances under the training job
	Instances []TrainingJobInstance `json:"instances" yaml:"instances"`

	// Priority is the priority of the training job
	Priority string `json:"priority" yaml:"priority"`

	// RequestGPU stores the requested gpus
	RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"`

	// AllocatedGPU stores the allocated gpus
	AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"`

	// CreationTimestamp stores the creation timestamp of job
	// NOTE(review): the unit of this int64 is not visible here — confirm against the producer.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

TrainingJobInfo stores training job information

type TrainingJobInstance ¶

// TrainingJobInstance defines the instance of a training job.
//
// NOTE(review): Status, Name, Age and Node carry only a json tag, while the
// other fields also declare a yaml tag — confirm whether the omission is
// intentional.
type TrainingJobInstance struct {
	// IP defines the instance ip
	IP string `json:"ip" yaml:"ip"`
	// Status is the status of the instance
	Status string `json:"status"`
	// Name is the name of the instance
	Name string `json:"name"`
	// Age is the age of the instance
	Age string `json:"age"`
	// Node is the node the instance runs on
	Node string `json:"node"`
	// NodeIP stores the node ip
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// IsChief tells whether the instance is chief or not (serialized as "chief")
	IsChief bool `json:"chief" yaml:"chief"`
	// RequestGPUs is used to store request gpu count
	RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"`
	// GPUMetrics stores the gpu metrics
	// NOTE(review): the meaning of the map key (e.g. gpu id) is not visible here — confirm.
	GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

TrainingJobInstance defines the instance of training job

type TrainingJobStatus ¶

// TrainingJobStatus is the string-based status of a training job.
type TrainingJobStatus string

TrainingJobStatus defines all the kinds of JobStatus

// Training job status values; each is an upper-case string.
const (
	// TrainingJobPending means the job is pending
	TrainingJobPending TrainingJobStatus = "PENDING"
	// TrainingJobRunning means the job is running
	TrainingJobRunning TrainingJobStatus = "RUNNING"
	// TrainingJobSucceeded means the job has succeeded
	TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED"
	// TrainingJobFailed means the job has failed
	TrainingJobFailed TrainingJobStatus = "FAILED"
)

type TrainingJobType ¶

// TrainingJobType is the string-based name of a supported training job type.
type TrainingJobType string

TrainingJobType defines the supporting training job type

// Training job type values. The concrete job types all end in "job";
// AllTrainingJob is the empty string and UnknownTrainingJob is "unknown".
const (
	// TFTrainingJob defines the tfjob
	TFTrainingJob TrainingJobType = "tfjob"
	// MPITrainingJob defines the mpijob
	MPITrainingJob TrainingJobType = "mpijob"
	// PytorchTrainingJob defines the pytorchjob
	PytorchTrainingJob TrainingJobType = "pytorchjob"
	// HorovodTrainingJob defines the horovod job
	HorovodTrainingJob TrainingJobType = "horovodjob"
	// VolcanoTrainingJob defines the volcano job
	VolcanoTrainingJob TrainingJobType = "volcanojob"
	// ETTrainingJob defines the etjob
	ETTrainingJob TrainingJobType = "etjob"
	// SparkTrainingJob defines the spark job
	SparkTrainingJob TrainingJobType = "sparkjob"
	// AllTrainingJob represents all job types (the empty string)
	AllTrainingJob TrainingJobType = ""
	// UnknownTrainingJob defines the unknown training job
	UnknownTrainingJob TrainingJobType = "unknown"
)

type TrainingJobTypeInfo ¶

// TrainingJobTypeInfo groups a training job type with two string labels for it.
type TrainingJobTypeInfo struct {
	// Name is the training job type value
	Name      TrainingJobType
	// Alias is an alternative label for the type
	// NOTE(review): where Alias is shown or accepted is not visible here — confirm.
	Alias     string
	// Shorthand is the short form of the type name
	// NOTE(review): how Shorthand relates to Name is not visible here — confirm.
	Shorthand string
}

type VirtualService ¶

// VirtualService embeds a pointer to the Istio v1alpha3 VirtualService and
// declares its own Http route list next to it.
// NOTE(review): presumably Http replaces the embedded type's http routes with
// this package's HTTPRoute when encoding under the "http" key — confirm.
type VirtualService struct {
	*istiov1alpha3.VirtualService
	// Http lists the HTTP routes; omitted from JSON when empty
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD ¶

// VirtualServiceCRD is the resource wrapper around VirtualService: it carries
// kind, apiVersion, the embedded object metadata and the spec.
//
// NOTE(review): the protobuf tags reuse field numbers (Kind and ObjectMeta
// both use 1; APIVersion and Spec both use 2) — confirm these tags are not
// relied on for protobuf encoding.
type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL