Documentation
¶
Overview ¶
Copyright 2018 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Copyright 2018 The Kubeflow Authors ¶
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Index ¶
- Constants
- Variables
- type AdvancedGpuMetric
- type AllNodeInfo
- type ArenaClientArgs
- type CommonCronArgs
- type CommonGPUNodeInfo
- type CommonNodeInfo
- type CommonServingArgs
- type CommonSubmitArgs
- type ConcurrencyPolicy
- type ConfigFileInfo
- type CronHistoryInfo
- type CronInfo
- type CronTFJobArgs
- type CronType
- type CustomServingArgs
- type DataDirVolume
- type Destination
- type DestinationRuleCRD
- type DestinationWeight
- type Driver
- type Endpoint
- type Executor
- type FormatStyle
- type GPUDeviceInfo
- type GPUExclusiveNodeInfo
- type GPUExclusivePodInfo
- type GPUShareNodeDevice
- type GPUShareNodeInfo
- type GPUSharePodInfo
- type GPUTopology
- type GPUTopologyNodeDevice
- type GPUTopologyNodeInfo
- type GPUTopologyPodInfo
- type GpuMetric
- type GpuMetricInfo
- type HTTPMatchRequest
- type HTTPRoute
- type JobConditionType
- type JobGpuMetric
- type KFServingArgs
- type LimitedPodSecurityContext
- type LogArgs
- type LogLevel
- type NodeGpuMetric
- type NodeType
- type NodeTypeInfo
- type NormalNodeInfo
- type PodGpuMetric
- type PortSelector
- type PreprocesObject
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PrometheusServer
- type Runtime
- type ScaleETJobArgs
- type ScaleInETJobArgs
- type ScaleOutETJobArgs
- type SeldonServingArgs
- type ServingInstance
- type ServingJobInfo
- type ServingJobType
- type ServingTypeInfo
- type ServingVersionWeight
- type StringMatchPrefix
- type SubmitETJobArgs
- type SubmitHorovodJobArgs
- type SubmitMPIJobArgs
- type SubmitPyTorchJobArgs
- type SubmitSparkJobArgs
- type SubmitSyncCodeArgs
- type SubmitTFJobArgs
- type SubmitTensorboardArgs
- type SubmitVolcanoJobArgs
- type TFRuntime
- type TensorFlowServingArgs
- type TensorRTServingArgs
- type TrafficRouterSplitArgs
- type TrainingJobInfo
- type TrainingJobInstance
- type TrainingJobStatus
- type TrainingJobType
- type TrainingJobTypeInfo
- type VirtualService
- type VirtualServiceCRD
Constants ¶
const ( )
const ( AliyunGPUResourceName = "aliyun.com/gpu" GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group" GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible" GPUTopologyNodeLabels = "ack.node.gpu.schedule=topology" )
const KUBEFLOW_NAMESPACE = "kubeflow"
const KUBE_SYSTEM_NAMESPACE = "kube-system"
const NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
const (
// defines the nvidia resource name
NvidiaGPUResourceName = "nvidia.com/gpu"
)
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
const PROMETHEUS_SCHEME = "http"
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
const (
RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"
)
Variables ¶
var ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
var NodeTypeSlice = []NodeTypeInfo{ { Name: NormalNode, Alias: "none", Shorthand: "n", }, { Name: GPUExclusiveNode, Alias: "exclusive", Shorthand: "e", }, { Name: GPUTopologyNode, Alias: "topology", Shorthand: "t", }, { Name: GPUShareNode, Alias: "share", Shorthand: "s", }, }
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{ { Name: "arms-prometheus-admin", ServiceLabels: "kubernetes.io/service-name=prometheus-admin", Protocol: "http", Port: "9335", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, { Name: "default", ServiceLabels: "kubernetes.io/service-name=prometheus-server", Protocol: "http", Port: "9090", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, { Name: "default-old", ServiceLabels: "kubernetes.io/name=Prometheus", Protocol: "http", Port: "9090", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, }
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{ CustomServingJob: { Name: CustomServingJob, Alias: "Custom", Shorthand: "custom", }, KFServingJob: { Name: KFServingJob, Alias: "KFServing", Shorthand: "kf", }, TFServingJob: { Name: TFServingJob, Alias: "Tensorflow", Shorthand: "tf", }, TRTServingJob: { Name: TRTServingJob, Alias: "Tensorrt", Shorthand: "trt", }, SeldonServingJob: { Name: SeldonServingJob, Alias: "Seldon", Shorthand: "seldon", }, }
ServingTypeMap collects the serving job types and their aliases.
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{ TFTrainingJob: { Name: TFTrainingJob, Alias: "Tensorflow", Shorthand: "tf", }, MPITrainingJob: { Name: MPITrainingJob, Alias: "MPI", Shorthand: "mpi", }, PytorchTrainingJob: { Name: PytorchTrainingJob, Alias: "Pytorch", Shorthand: "py", }, HorovodTrainingJob: { Name: HorovodTrainingJob, Alias: "Horovod", Shorthand: "horovod", }, VolcanoTrainingJob: { Name: VolcanoTrainingJob, Alias: "Volcano", Shorthand: "volcano", }, ETTrainingJob: { Name: ETTrainingJob, Alias: "ElasticTraining", Shorthand: "et", }, SparkTrainingJob: { Name: SparkTrainingJob, Alias: "Spark", Shorthand: "spark", }, }
TrainingTypeMap collects the training job types and their aliases.
Functions ¶
This section is empty.
Types ¶
type AdvancedGpuMetric ¶
// AdvancedGpuMetric carries the metric values reported for a single GPU
// together with the names of the pods listed for it.
type AdvancedGpuMetric struct {
// Id identifies the GPU. NOTE(review): presumably the device index — confirm.
Id string `json:"id" yaml:"id"`
// UUID is the UUID string of the GPU.
UUID string `json:"uuid" yaml:"uuid"`
// GpuDutyCycle is the GPU duty cycle value.
GpuDutyCycle float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
// GpuMemoryUsed is the used GPU memory, serialized as "usedGPUMemory".
GpuMemoryUsed float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
// GpuMemoryTotal is the total GPU memory, serialized as "totalGPUMemory".
GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
// PodNames holds one entry per pod; each entry combines the namespace and
// the pod name, like 'namespace/pod_name'.
PodNames []string `json:"podNames" yaml:"podNames"`
}
type AllNodeInfo ¶
type AllNodeInfo map[string][]interface{}
type ArenaClientArgs ¶
type CommonCronArgs ¶ added in v0.8.2
// CommonCronArgs holds the cron scheduling options; each field notes the
// command-line flag it matches in its trailing comment.
type CommonCronArgs struct {
// The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
Schedule string `yaml:"schedule"` // --schedule
// Specifies how to treat concurrent executions of a Job.
// Valid values are:
// - "Allow" (default): allows CronJobs to run concurrently;
// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
// - "Replace": cancels currently running job and replaces it with a new one
// +optional
ConcurrencyPolicy string `yaml:"concurrencyPolicy"` // --concurrency-policy
// This flag tells the controller to suspend subsequent executions, it does
// not apply to already started executions. Defaults to false.
// +optional
Suspend bool `yaml:"suspend"` // --suspend
// Deadline is the timestamp until which a cron job can keep being scheduled.
Deadline string `yaml:"deadline"` // --deadline
// The number of finished job history to retain.
// NOTE(review): the field is a plain int, not a pointer, so an explicit
// zero cannot be distinguished from "not specified" here.
// +optional
HistoryLimit int `yaml:"historyLimit"` // --history-limit
}
type CommonGPUNodeInfo ¶
// CommonGPUNodeInfo groups the GPU counters and per-GPU metrics that the GPU
// node info types of this package (GPUExclusiveNodeInfo, GPUTopologyNodeInfo)
// embed inline.
type CommonGPUNodeInfo struct {
// TotalGPUs is the total GPU count.
TotalGPUs int `json:"totalGPUs" yaml:"totalGPUs"`
// AllocatedGPUs is the allocated GPU count.
AllocatedGPUs int `json:"allocatedGPUs" yaml:"allocatedGPUs"`
// UnhealthyGPUs is the unhealthy GPU count.
UnhealthyGPUs int `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
// GPUMetrics holds one AdvancedGpuMetric pointer per reported GPU.
GPUMetrics []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}
type CommonNodeInfo ¶
type CommonServingArgs ¶
// CommonServingArgs holds the options shared by the serving argument types
// that embed it inline (CustomServingArgs, KFServingArgs, SeldonServingArgs).
// Trailing comments name the command-line flag a field matches.
type CommonServingArgs struct {
// Name is the serving job name, serialized as "servingName".
Name string `yaml:"servingName"`
// Version is the serving job version, serialized as "servingVersion".
Version string `yaml:"servingVersion"`
// Namespace is excluded from YAML output (tag "-").
Namespace string `yaml:"-"`
// Type is the serving job type; excluded from YAML output (tag "-").
Type ServingJobType `yaml:"-"`
Image string `yaml:"image"`
ImagePullPolicy string `yaml:"imagePullPolicy"` // --imagePullPolicy
GPUCount int `yaml:"gpuCount"` // --gpus
GPUMemory int `yaml:"gpuMemory"` // --gpumemory
Cpu string `yaml:"cpu"` // --cpu
Memory string `yaml:"memory"` // --memory
Envs map[string]string `yaml:"envs"` // --envs
Command string `yaml:"command"` // --command
Replicas int `yaml:"replicas"` // --replicas
EnableIstio bool `yaml:"enableIstio"` // --enableIstio
ExposeService bool `yaml:"exposeService"` // --exposeService
// ModelDirs is a string-to-string map. NOTE(review): key/value meaning is
// not visible here — confirm against the code that fills it.
ModelDirs map[string]string `yaml:"modelDirs"`
HostVolumes []DataDirVolume `yaml:"hostVolumes"` // --data-dir
NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector
Tolerations []string `yaml:"tolerations"` // --toleration
Annotations map[string]string `yaml:"annotations"`
ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}
type CommonSubmitArgs ¶
// CommonSubmitArgs holds the options shared by the training job submit
// argument types that embed it inline (for example SubmitTFJobArgs,
// SubmitMPIJobArgs and SubmitPyTorchJobArgs).
type CommonSubmitArgs struct {
// Name stores the job name,match option --name
Name string `yaml:"-"`
// Namespace stores the namespace of job,match option --namespace
Namespace string `yaml:"-"`
// TrainingType stores the training job type; excluded from YAML output.
TrainingType TrainingJobType `yaml:"-"`
// NodeSelectors defines the node selectors,match option --selector
NodeSelectors map[string]string `yaml:"nodeSelectors"`
// ConfigFiles stores the config files which exist on the client host node
// and maps them to the container,match option --config-file
ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"`
// Tolerations defines the tolerations which tolerate node taints,
// match option --toleration
Tolerations []string `yaml:"tolerations"`
// Image stores the docker image of job,match option --image
Image string `yaml:"image"`
// GPUCount stores the gpu count the job needs,match option --gpus
GPUCount int `yaml:"gpuCount"`
// Envs stores the envs of container in job, match option --env
Envs map[string]string `yaml:"envs"`
// WorkingDir stores the working directory of container in job,match option --working-dir
WorkingDir string `yaml:"workingDir"`
// Command stores the command of job
Command string `yaml:"command"`
// Mode is used for horovod,match option --sync-mode
Mode string `yaml:"mode"`
// WorkerCount stores the count of job worker,match option --workers
WorkerCount int `yaml:"workers"`
// Retry defines the retry times
Retry int `yaml:"retry"`
// DataSet stores the kubernetes pvc names
DataSet map[string]string `yaml:"dataset"`
// DataDirs stores the files(or directories) in k8s node which will map to containers
// match option --data-dir
DataDirs []DataDirVolume `yaml:"dataDirs"`
// EnableRDMA enable rdma or not,match option --rdma
EnableRDMA bool `yaml:"enableRDMA"`
// UseENI defines using eni or not
UseENI bool `yaml:"useENI"`
// Annotations defines pod annotations of job,match option --annotation
Annotations map[string]string `yaml:"annotations"`
// IsNonRoot is serialized as "isNonRoot".
// NOTE(review): presumably true means the job runs as a non-root user — confirm.
IsNonRoot bool `yaml:"isNonRoot"`
// PodSecurityContext defines the pod security context
PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"`
// PriorityClassName defines the priority class
PriorityClassName string `yaml:"priorityClassName"`
// Conscheduling defines using Conscheduling. It is the only field here
// without a yaml tag, so its YAML key is whatever the encoder derives from
// the field name.
// NOTE(review): the name looks like a misspelling of "Coscheduling" — confirm.
Conscheduling bool
// PodGroupName stores pod group name
PodGroupName string `yaml:"podGroupName"`
// PodGroupMinAvailable stores pod group min available
PodGroupMinAvailable string `yaml:"podGroupMinAvailable"`
// ImagePullSecrets stores image pull secrets,match option --image-pull-secrets
ImagePullSecrets []string `yaml:"imagePullSecrets"`
// HelmOptions stores the helm options; excluded from YAML output.
HelmOptions []string `yaml:"-"`
}
CommonSubmitArgs defines the arguments common to the job submit argument types.
type ConcurrencyPolicy ¶ added in v0.8.2
type ConcurrencyPolicy string
ConcurrencyPolicy describes how the job will be handled. Only one of the following concurrency policies may be specified. If none of the following policies is specified, the default one is ConcurrencyAllow ("Allow").
const ( ConcurrencyAllow ConcurrencyPolicy = "Allow" ConcurrencyForbid ConcurrencyPolicy = "Forbid" ConcurrencyReplace ConcurrencyPolicy = "Replace" )
type ConfigFileInfo ¶
// ConfigFileInfo describes one config file that will be mounted to
// containers; CommonSubmitArgs.ConfigFiles stores values of this type.
type ConfigFileInfo struct {
// ContainerFileName is serialized as "containerFileName".
// NOTE(review): presumably the file name inside the container — confirm.
ContainerFileName string `yaml:"containerFileName"`
// HostFile is serialized as "hostFile".
// NOTE(review): presumably the file path on the client host — confirm.
HostFile string `yaml:"hostFile"`
// Key is serialized as "key". NOTE(review): meaning not visible here — confirm.
Key string `yaml:"key"`
// ContainerFilePath is serialized as "containerFilePath".
// NOTE(review): presumably the directory inside the container — confirm.
ContainerFilePath string `yaml:"containerFilePath"`
}
ConfigFileInfo defines the config files which will be mounted to containers
type CronHistoryInfo ¶ added in v0.8.2
// CronHistoryInfo is one history entry of a cron job; CronInfo.History is a
// slice of these. All fields are plain strings.
type CronHistoryInfo struct {
// Name and Namespace identify the history entry.
Name string `json:"name" yaml:"name"`
Namespace string `json:"namespace" yaml:"namespace"`
// Group and Kind are serialized as "group" and "kind".
// NOTE(review): presumably the API group and kind of the created job — confirm.
Group string `json:"group" yaml:"group"`
Kind string `json:"kind" yaml:"kind"`
// Status is the status string of the entry.
Status string `json:"status" yaml:"status"`
// CreateTime and FinishTime are timestamps kept as strings.
// NOTE(review): the time format is not visible here — confirm.
CreateTime string `json:"createTime" yaml:"createTime"`
FinishTime string `json:"finishTime" yaml:"finishTime"`
}
type CronInfo ¶ added in v0.8.2
// CronInfo describes a cron job: its identity, schedule settings, timestamps
// and history entries.
type CronInfo struct {
// Name and Namespace identify the cron job.
Name string `json:"name" yaml:"name"`
Namespace string `json:"namespace" yaml:"namespace"`
// Type is the job type, like TFJob or PyTorchJob
Type string `json:"type" yaml:"type"`
// The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
Schedule string `json:"schedule" yaml:"schedule"`
// Specifies how to treat concurrent executions of a Job.
// Valid values are:
// - "Allow" (default): allows CronJobs to run concurrently;
// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
// - "Replace": cancels currently running job and replaces it with a new one
// +optional
ConcurrencyPolicy string `json:"concurrencyPolicy" yaml:"concurrencyPolicy"` // --concurrency-policy
// This flag tells the controller to suspend subsequent executions, it does
// not apply to already started executions. Defaults to false.
// +optional
Suspend bool `json:"suspend" yaml:"suspend"` // --suspend
// Deadline is the timestamp until which a cron job can keep being scheduled.
Deadline string `json:"deadline" yaml:"deadline"` // --deadline
// The number of finished job history to retain.
// NOTE(review): the field is a plain int64, not a pointer, so an explicit
// zero cannot be distinguished from "not specified" here.
// +optional
HistoryLimit int64 `json:"historyLimit" yaml:"historyLimit"` // --history-limit
// Information when was the last time the job was successfully scheduled.
// +optional
LastScheduleTime string `json:"lastScheduleTime" yaml:"lastScheduleTime"`
// CreationTimestamp stores the creation timestamp of job
CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"`
// History holds the history entries, serialized as "cronHistory".
History []CronHistoryInfo `json:"cronHistory" yaml:"cronHistory"`
}
type CronTFJobArgs ¶ added in v0.8.2
// CronTFJobArgs combines the cron scheduling options with the TFJob submit
// options by embedding both argument structs.
type CronTFJobArgs struct {
// CommonCronArgs is tagged with the yaml key "cron".
CommonCronArgs `yaml:"cron"`
// SubmitTFJobArgs is tagged with the yaml key "tfjob".
SubmitTFJobArgs `yaml:"tfjob"`
}
type CronType ¶ added in v0.8.2
type CronType string
CronType defines the supported cron job types.
const ( // CronTFTrainingJob defines the cron tfjob CronTFTrainingJob CronType = "tfjob" )
type CustomServingArgs ¶
// CustomServingArgs holds the options of a custom serving job: two port
// settings plus the shared CommonServingArgs, embedded inline.
type CustomServingArgs struct {
Port int `yaml:"port"` // --port
// RestfulPort is serialized as "restApiPort".
RestfulPort int `yaml:"restApiPort"` // --restfulPort
CommonServingArgs `yaml:",inline"`
}
type DataDirVolume ¶
// DataDirVolume describes one volume by host path, container path and name.
// It is the element type of CommonSubmitArgs.DataDirs and
// CommonServingArgs.HostVolumes.
type DataDirVolume struct {
// HostPath defines the host path
HostPath string `yaml:"hostPath"`
// ContainerPath defines container path
ContainerPath string `yaml:"containerPath"`
// Name defines the volume name
Name string `yaml:"name"`
}
DataDirVolume defines the volume of kubernetes
type Destination ¶
// Destination embeds a pointer to the istio v1alpha3 Destination and
// declares its own Port field typed with this package's PortSelector.
// NOTE(review): if the embedded type also has a Port field, this outer field
// shadows it — confirm that is the intent.
type Destination struct {
*istiov1alpha3.Destination
// Port is emitted as JSON "port" and omitted when nil.
Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}
type DestinationRuleCRD ¶
// DestinationRuleCRD is the object form of an istio DestinationRule: kind,
// apiVersion, embedded object metadata and the DestinationRule spec.
// NOTE(review): the protobuf tags reuse field numbers (kind and metadata are
// both 1, apiVersion and spec are both 2) — confirm whether that matters.
type DestinationRuleCRD struct {
// Kind is a string value representing the REST resource this object represents.
// Servers may infer this from the endpoint the client submits requests to.
// Cannot be updated.
// In CamelCase.
// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
// +optional
Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`
// APIVersion defines the versioned schema of this representation of an object.
// Servers should convert recognized schemas to the latest internal value, and
// may reject unrecognized values.
// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
// +optional
APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
// ObjectMeta is embedded and serialized under the "metadata" key.
metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
// Spec is the istio DestinationRule, serialized under the "spec" key.
Spec istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}
type DestinationWeight ¶
// DestinationWeight pairs a Destination with a weight; HTTPRoute.Route is a
// slice of pointers to this type.
type DestinationWeight struct {
// Destination is omitted from JSON when nil.
Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
// Weight has no omitempty option, so a zero weight is still emitted in JSON.
Weight int32 `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}
type FormatStyle ¶
type FormatStyle string
FormatStyle defines the format of the output; it is only used in cmd.
const ( // WideFormat defines the wide format WideFormat FormatStyle = "wide" // JsonFormat defines the json format JsonFormat FormatStyle = "json" // YamlFormat defines the yaml format YamlFormat FormatStyle = "yaml" // UnknownFormat defines the unknown format UnknownFormat FormatStyle = "unknown" )
type GPUDeviceInfo ¶
// GPUDeviceInfo reports memory and duty-cycle figures for one GPU device.
// NOTE(review): the memory unit is not visible here — confirm.
type GPUDeviceInfo struct {
// ID identifies the device.
ID string `json:"id" yaml:"id"`
// TotalGPUMemory is the total GPU memory of the device.
TotalGPUMemory float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
// AllocatedGPUMemory is the allocated GPU memory of the device.
AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
// UsedGPUMemory is the used GPU memory of the device.
UsedGPUMemory float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
// DutyCycle is the duty cycle value of the device.
DutyCycle float64 `json:"dutyCycle" yaml:"dutyCycle"`
}
type GPUExclusiveNodeInfo ¶
// GPUExclusiveNodeInfo describes a GPU-exclusive node: its pod entries plus
// the embedded common node and common GPU node information.
type GPUExclusiveNodeInfo struct {
// PodInfos is serialized under the key "instances".
PodInfos []GPUExclusivePodInfo `json:"instances" yaml:"instances"`
CommonNodeInfo `yaml:",inline" json:",inline"`
CommonGPUNodeInfo `yaml:",inline" json:",inline"`
}
type GPUExclusivePodInfo ¶
type GPUShareNodeDevice ¶
// GPUShareNodeDevice shows no fields in this listing.
// NOTE(review): confirm against the package source whether fields were omitted.
type GPUShareNodeDevice struct {
}
type GPUShareNodeInfo ¶
// GPUShareNodeInfo shows no fields in this listing.
// NOTE(review): confirm against the package source whether fields were omitted.
type GPUShareNodeInfo struct {
}
type GPUSharePodInfo ¶
// GPUSharePodInfo shows no fields in this listing.
// NOTE(review): confirm against the package source whether fields were omitted.
type GPUSharePodInfo struct {
}
type GPUTopology ¶
type GPUTopologyNodeDevice ¶
type GPUTopologyNodeInfo ¶
// GPUTopologyNodeInfo describes a GPU-topology node: its pod entries, the GPU
// topology, the embedded common node / common GPU node information, and the
// per-device entries.
type GPUTopologyNodeInfo struct {
// PodInfos is serialized under the key "instances".
PodInfos []GPUTopologyPodInfo `json:"instances" yaml:"instances"`
// GPUTopology is serialized under the key "gpuTopology".
GPUTopology GPUTopology `json:"gpuTopology" yaml:"gpuTopology"`
CommonGPUNodeInfo `yaml:",inline" json:",inline"`
CommonNodeInfo `yaml:",inline" json:",inline"`
// Devices is serialized under the key "devices" in both JSON and YAML.
// The tag previously repeated the yaml key and had no json key, so JSON
// output used the Go field name "Devices", unlike every sibling field.
Devices []GPUTopologyNodeDevice `json:"devices" yaml:"devices"`
}
type GPUTopologyPodInfo ¶
// GPUTopologyPodInfo describes one pod entry of a GPU-topology node;
// GPUTopologyNodeInfo.PodInfos is a slice of these.
type GPUTopologyPodInfo struct {
// Name and Namespace identify the pod.
Name string `json:"name" yaml:"name"`
Namespace string `json:"namespace" yaml:"namespace"`
// Status is the pod status string.
Status string `json:"status" yaml:"status"`
// RequestGPU is serialized as "requestGPUs".
RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
// Allocation and VisibleGPUs are string lists.
// NOTE(review): presumably GPU identifiers allocated to / visible in the
// pod — confirm against the code that fills them.
Allocation []string `json:"allocation" yaml:"allocation"`
VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"`
}
type GpuMetricInfo ¶
type HTTPMatchRequest ¶
// HTTPMatchRequest embeds a pointer to the istio v1alpha3 HTTPMatchRequest
// and declares its own Uri field typed with this package's StringMatchPrefix.
type HTTPMatchRequest struct {
*istiov1alpha3.HTTPMatchRequest
// Uri is emitted as JSON "uri" and omitted when nil.
Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}
type HTTPRoute ¶
// HTTPRoute embeds a pointer to the istio v1alpha3 HTTPRoute and declares its
// own Match and Route fields using this package's wrapper types.
type HTTPRoute struct {
*istiov1alpha3.HTTPRoute
// Match is emitted as JSON "match" and omitted when empty.
Match []*HTTPMatchRequest `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
// Route is emitted as JSON "route" and omitted when empty.
Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}
type JobConditionType ¶ added in v0.8.2
type JobConditionType string
JobConditionType defines all kinds of types of JobStatus.
const ( // JobCreated means the job has been accepted by the system, // but one or more of the pods/services has not been started. // This includes time before pods being scheduled and launched. JobCreated JobConditionType = "Created" // JobRunning means all sub-resources (e.g. services/pods) of this job // have been successfully scheduled and launched. // The training is running without error. JobRunning JobConditionType = "Running" // JobRestarting means one or more sub-resources (e.g. services/pods) of this job // reached phase failed but maybe restarted according to it's restart policy // which specified by user in v1.PodTemplateSpec. // The training is freezing/pending. JobRestarting JobConditionType = "Restarting" // JobSucceeded means all sub-resources (e.g. services/pods) of this job // reached phase have terminated in success. // The training is complete without error. JobSucceeded JobConditionType = "Succeeded" // JobFailed means one or more sub-resources (e.g. services/pods) of this job // reached phase failed with no restarting. // The training has failed its execution. JobFailed JobConditionType = "Failed" )
type JobGpuMetric ¶
type JobGpuMetric map[string]PodGpuMetric
type KFServingArgs ¶
// KFServingArgs holds the options of a KFServing job plus the shared
// CommonServingArgs, embedded inline. Trailing comments name the matching
// command-line flag.
type KFServingArgs struct {
Port int `yaml:"port"` // --port
ModelType string `yaml:"modelType"` // --modelType
CanaryPercent int `yaml:"canaryPercent"` // --canaryTrafficPercent
StorageUri string `yaml:"storageUri"` // --storageUri
CommonServingArgs `yaml:",inline"`
}
type LimitedPodSecurityContext ¶
// LimitedPodSecurityContext holds a subset of pod security context settings;
// it is the type of CommonSubmitArgs.PodSecurityContext.
// NOTE(review): field names mirror the Kubernetes PodSecurityContext fields of
// the same name — presumably the same semantics; confirm.
type LimitedPodSecurityContext struct {
RunAsUser int64 `yaml:"runAsUser"`
RunAsNonRoot bool `yaml:"runAsNonRoot"`
RunAsGroup int64 `yaml:"runAsGroup"`
SupplementalGroups []int64 `yaml:"supplementalGroups"`
}
LimitedPodSecurityContext defines the Kubernetes pod security context.
type NodeTypeInfo ¶
type NormalNodeInfo ¶
// NormalNodeInfo describes a normal node; it carries only the embedded
// CommonNodeInfo, inlined into the JSON/YAML output.
type NormalNodeInfo struct {
CommonNodeInfo `yaml:",inline" json:",inline"`
}
type PodGpuMetric ¶
type PortSelector ¶
// PortSelector embeds a pointer to the istio v1alpha3 PortSelector and
// declares its own numeric Number field.
type PortSelector struct {
*istiov1alpha3.PortSelector
// Number is emitted as JSON "number" and omitted when zero.
Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}
type PreprocesObject ¶
// PreprocesObject bundles a service name and namespace with a
// DestinationRuleCRD and a VirtualServiceCRD. No field carries a struct tag.
// NOTE(review): presumably the objects prepared for traffic routing — confirm.
type PreprocesObject struct {
ServiceName string
Namespace string
DestinationRule DestinationRuleCRD
VirtualService VirtualServiceCRD
}
type PrometheusMetric ¶
// PrometheusMetric is the top-level JSON shape of a Prometheus query
// response: a status string and the data payload.
type PrometheusMetric struct {
// Status is read from / written to the JSON key "status". The former
// ",inline" option is not an encoding/json option and was ignored, so
// dropping it does not change encoding or decoding.
Status string `json:"status"`
// Data is the payload under the JSON key "data".
// NOTE: omitempty never omits a struct value in encoding/json, so "data"
// is always emitted.
Data PrometheusMetricData `json:"data,omitempty"`
}
type PrometheusMetricData ¶
// PrometheusMetricData is the "data" payload of PrometheusMetric: the list of
// results and the result type string.
type PrometheusMetricData struct {
// Result is decoded from the JSON key "result".
Result []PrometheusMetricResult `json:"result"`
// ResultType is decoded from the JSON key "resultType".
ResultType string `json:"resultType"`
}
type PrometheusMetricResult ¶
// PrometheusMetricResult is one entry of PrometheusMetricData.Result: a label
// map and a list of untyped values.
type PrometheusMetricResult struct {
// Metric maps label names to label values.
Metric map[string]string `json:"metric"`
// Value holds untyped elements (PrometheusMetricValue is interface{}).
// NOTE(review): presumably a [timestamp, value] pair — confirm.
Value []PrometheusMetricValue `json:"value"`
}
type PrometheusMetricValue ¶
type PrometheusMetricValue interface{}
type PrometheusServer ¶
// PrometheusServer describes how to locate and query one Prometheus
// deployment; SUPPORT_PROMETHEUS_SERVERS lists the predefined entries.
type PrometheusServer struct {
// Name is the entry name, e.g. "default".
Name string
// ServiceLabels is a label selector string,
// e.g. "kubernetes.io/service-name=prometheus-server".
ServiceLabels string
// Protocol is the URL scheme, e.g. "http".
Protocol string
// Port is the port kept as a string, e.g. "9090".
Port string
// Path is the query path, e.g. "api/v1/query".
Path string
// MetricList lists the metric names, e.g. "nvidia_gpu_duty_cycle".
MetricList []string
// Service is a Kubernetes Service pointer; the predefined entries leave it nil.
// NOTE(review): presumably filled in once the service is discovered — confirm.
Service *v1.Service
}
PrometheusServer is used to define prometheus server
type ScaleETJobArgs ¶
// ScaleETJobArgs holds the options shared by the ET job scale-in and
// scale-out argument types, which embed it inline.
type ScaleETJobArgs struct {
//--name string required, et job name
Name string `yaml:"etName"`
// JobType stores the training job type; excluded from YAML output.
JobType TrainingJobType `yaml:"-"`
// Namespace stores the namespace of job,match option --namespace
Namespace string `yaml:"-"`
//--timeout int timeout of callback scaler script.
Timeout int `yaml:"timeout"`
//--retry int retry times.
Retry int `yaml:"retry"`
//--count int the number of workers you want to add or delete.
Count int `yaml:"count"`
//--script string script of scaling.
Script string `yaml:"script"`
//-e, --env stringArray the environment variables
Envs map[string]string `yaml:"envs"`
}
type ScaleInETJobArgs ¶
// ScaleInETJobArgs holds the options for scaling in an ET job; it currently
// adds nothing beyond the embedded ScaleETJobArgs.
type ScaleInETJobArgs struct {
// common args
ScaleETJobArgs `yaml:",inline"`
}
type ScaleOutETJobArgs ¶
// ScaleOutETJobArgs holds the options for scaling out an ET job; it currently
// adds nothing beyond the embedded ScaleETJobArgs.
type ScaleOutETJobArgs struct {
// common args
ScaleETJobArgs `yaml:",inline"`
}
type SeldonServingArgs ¶ added in v0.8.0
// SeldonServingArgs holds the options of a Seldon serving job plus the shared
// CommonServingArgs, embedded inline.
type SeldonServingArgs struct {
Implementation string `yaml:"implementation"` // --implementation
ModelUri string `yaml:"modelUri"` // --modelUri
CommonServingArgs `yaml:",inline"`
}
type ServingInstance ¶
// ServingInstance describes one instance of a serving job;
// ServingJobInfo.Instances is a slice of these.
type ServingInstance struct {
// Name gives the instance name
Name string `json:"name" yaml:"name"`
// Status gives the instance status
Status string `json:"status" yaml:"status"`
// Age gives the instance age
Age string `json:"age" yaml:"age"`
// ReadyContainer represents the count of ready containers
ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
// TotalContainer represents the count of total containers
TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
// RestartCount represents the count of instance restarts
RestartCount int `json:"restartCount" yaml:"restartCount"`
// NodeIP specifies host ip of instance
NodeIP string `json:"nodeIP" yaml:"nodeIP"`
// NodeName returns the node name
NodeName string `json:"nodeName" yaml:"nodeName"`
// IP returns the instance ip
IP string `json:"ip" yaml:"ip"`
// RequestGPU returns the request gpus
RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
// RequestGPUMemory returns the request gpu memory
RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
}
type ServingJobInfo ¶
// ServingJobInfo is the summary record of one serving job, including its
// endpoints and instances.
type ServingJobInfo struct {
// Name specifies serving job name
Name string `json:"name" yaml:"name"`
// Namespace specifies serving job namespace
Namespace string `json:"namespace" yaml:"namespace"`
// Type specifies serving job type
Type string `json:"type" yaml:"type"`
// Version specifies serving job version
Version string `json:"version" yaml:"version"`
// Age specifies the serving job age
Age string `json:"age" yaml:"age"`
// Desired specifies the desired instances
Desired int `json:"desiredInstances" yaml:"desiredInstances"`
// Available specifies the available instances
Available int `json:"availableInstances" yaml:"availableInstances"`
// Endpoints specifies the endpoints
Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"`
// IPAddress specifies the ip address
IPAddress string `json:"ip" yaml:"ip"`
// Instances gives the instance information
Instances []ServingInstance `json:"instances" yaml:"instances"`
// RequestGPU specifies the request gpus
RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"`
// RequestGPUMemory specifies the request gpu memory,only for gpushare
RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
// CreationTimestamp stores the creation timestamp of job as an int64.
// NOTE(review): the unit (seconds vs. milliseconds) is not visible here — confirm.
CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}
ServingJobInfo displays serving job information.
type ServingJobType ¶
type ServingJobType string
ServingJobType defines the serving job type; the name must have the form shorthand + "-serving".
const ( // TFServingJob defines the tensorflow serving job TFServingJob ServingJobType = "tf-serving" // TRTServingJob defines the tensorrt serving job TRTServingJob ServingJobType = "trt-serving" // KFServingJob defines the kfserving job KFServingJob ServingJobType = "kf-serving" // SeldonServingJob defines the seldon core job SeldonServingJob ServingJobType = "seldon-serving" // CustomServingJob defines the custom serving job CustomServingJob ServingJobType = "custom-serving" // AllServingJob represents all serving job type AllServingJob ServingJobType = "" // UnknownServingJob defines the unknown serving job UnknownServingJob ServingJobType = "unknown" )
type ServingTypeInfo ¶
// ServingTypeInfo is the value type of ServingTypeMap: the serving job type
// with its alias and shorthand.
type ServingTypeInfo struct {
// Name is the serving job type, e.g. TFServingJob.
Name ServingJobType
// Alias is the display alias, e.g. "Tensorflow".
Alias string
// Shorthand is the short name, e.g. "tf".
Shorthand string
}
type ServingVersionWeight ¶
type StringMatchPrefix ¶
// StringMatchPrefix holds a single prefix string; HTTPMatchRequest.Uri points
// to a value of this type.
type StringMatchPrefix struct {
// Prefix is emitted as JSON "prefix" and omitted when empty.
Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}
type SubmitETJobArgs ¶
// SubmitETJobArgs holds the options for submitting an ET job: resource
// settings, worker bounds, and the inline-embedded common, tensorboard and
// sync-code argument structs.
type SubmitETJobArgs struct {
Cpu string `yaml:"cpu"` // --cpu
Memory string `yaml:"memory"` // --memory
// for common args
CommonSubmitArgs `yaml:",inline"`
// SubmitTensorboardArgs stores tensorboard information
SubmitTensorboardArgs `yaml:",inline"`
// SubmitSyncCodeArgs stores syncing code information
SubmitSyncCodeArgs `yaml:",inline"`
// MaxWorkers and MinWorkers are serialized as "maxWorkers" and "minWorkers".
// NOTE(review): presumably the upper and lower worker-count bounds — confirm.
MaxWorkers int `yaml:"maxWorkers"`
MinWorkers int `yaml:"minWorkers"`
}
type SubmitHorovodJobArgs ¶
// SubmitHorovodJobArgs holds the options for submitting a Horovod job: an SSH
// port, resource settings, and the inline-embedded common, tensorboard and
// sync-code argument structs.
type SubmitHorovodJobArgs struct {
// SSHPort is serialized as "sshPort".
SSHPort int `yaml:"sshPort"`
Cpu string `yaml:"cpu"` // --cpu
Memory string `yaml:"memory"` // --memory
// for common args
CommonSubmitArgs `yaml:",inline"`
// for tensorboard
SubmitTensorboardArgs `yaml:",inline"`
// for sync up source code
SubmitSyncCodeArgs `yaml:",inline"`
}
type SubmitMPIJobArgs ¶
// SubmitMPIJobArgs holds the options for submitting an MPI job: resource
// settings, GPU topology settings, and the inline-embedded common,
// tensorboard and sync-code argument structs.
type SubmitMPIJobArgs struct {
Cpu string `yaml:"cpu"` // --cpu
Memory string `yaml:"memory"` // --memory
// for common args
CommonSubmitArgs `yaml:",inline"`
// for tensorboard
SubmitTensorboardArgs `yaml:",inline"`
// for sync up source code
SubmitSyncCodeArgs `yaml:",inline"`
// enable gpu topology scheduling
GPUTopology bool `yaml:"gputopology"`
// GPUTopologyReplica is kept as a string, serialized as "gputopologyreplica".
// NOTE(review): its meaning is not visible here — confirm.
GPUTopologyReplica string `yaml:"gputopologyreplica"`
}
type SubmitPyTorchJobArgs ¶
// SubmitPyTorchJobArgs holds the options for submitting a PyTorch job:
// resource settings, the clean-pod policy, and the inline-embedded common,
// tensorboard and sync-code argument structs.
type SubmitPyTorchJobArgs struct {
Cpu string `yaml:"cpu"` // --cpu
Memory string `yaml:"memory"` // --memory
// for common args
CommonSubmitArgs `yaml:",inline"`
// for tensorboard
SubmitTensorboardArgs `yaml:",inline"`
// for sync up source code
SubmitSyncCodeArgs `yaml:",inline"`
// CleanPodPolicy stores the cleaning pod policy (clean-task-policy)
CleanPodPolicy string `yaml:"cleanPodPolicy"`
}
type SubmitSparkJobArgs ¶
type SubmitSyncCodeArgs ¶
// SubmitSyncCodeArgs holds the source-code syncing options that the submit
// argument types embed inline.
type SubmitSyncCodeArgs struct {
SyncMode string `yaml:"syncMode"` // --syncMode: rsync, hdfs, git
SyncSource string `yaml:"syncSource"` // --syncSource
// SyncImage is omitted from YAML when empty.
SyncImage string `yaml:"syncImage,omitempty"` // --syncImage
// SyncGitProjectName is omitted from YAML when empty.
// NOTE(review): the original trailing comment said --syncImage, which looks
// like a copy/paste slip — confirm the real flag name.
SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"`
}
type SubmitTFJobArgs ¶
// SubmitTFJobArgs holds the options for submitting a TFJob: per-role (worker,
// ps, chief, evaluator) settings plus the inline-embedded common, tensorboard
// and sync-code argument structs.
type SubmitTFJobArgs struct {
// TFNodeSelectors assigns tfjob node selectors
TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"`
// Port defines the default port if workerPort and PSPort are not set.
// It carries no yaml tag.
Port int
// WorkerImage assigns worker image,match option --worker-image
WorkerImage string `yaml:"workerImage"`
// WorkerPort stores worker port,match option --work-port
// NOTE(review): the flag is possibly --worker-port — confirm.
WorkerPort int `yaml:"workerPort"`
// PSPort stores the ps port,match option --ps-port
PSPort int `yaml:"psPort"`
// PSCount stores the ps count,--ps-count
PSCount int `yaml:"ps"`
// PSImage stores the ps image,--ps-image
PSImage string `yaml:"psImage"`
// WorkerCpu stores the cpu of job worker,match option --worker-cpu
WorkerCpu string `yaml:"workerCPU"`
//WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector
// WorkerMemory stores worker memory,match option --worker-memory
WorkerMemory string `yaml:"workerMemory"`
// PSCpu stores ps cpu,match option --ps-cpu
PSCpu string `yaml:"psCPU"`
// PSGpu stores ps gpu,match option --ps-gpus
PSGpu int `yaml:"psGPU"` // --ps-gpus
// PSMemory stores the ps memory,match option --ps-memory
PSMemory string `yaml:"psMemory"`
// CleanPodPolicy stores the cleaning pod policy,match option --clean-task-policy
CleanPodPolicy string `yaml:"cleanPodPolicy"`
// UseChief stores the using chief or not,match option --chief
UseChief bool `yaml:",omitempty"` // --chief
// ChiefCount stores the chief count of job,match option --chief-count
ChiefCount int `yaml:"chief"`
// UseEvaluator is used to enable evaluator or not,match option --evaluator
UseEvaluator bool `yaml:",omitempty"`
// ChiefPort stores the chief port,match option --chief-port
ChiefPort int `yaml:"chiefPort"`
//ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector
// ChiefCpu stores the chief pod cpu,match option --chief-cpu
ChiefCpu string `yaml:"chiefCPU"`
// ChiefMemory stores the chief pod memory,match option --chief-memory
ChiefMemory string `yaml:"chiefMemory"`
// EvaluatorCpu stores the evaluator pod cpu,match option --evaluator-cpu
EvaluatorCpu string `yaml:"evaluatorCPU"`
//EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector
// EvaluatorMemory stores the evaluator pod memory,match option --evaluator-memory
EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory
// EvaluatorCount stores the evaluator pod count,match option --evaluator-count
EvaluatorCount int `yaml:"evaluator"`
// HasGangScheduler determines if it has gang scheduler
HasGangScheduler bool `yaml:"hasGangScheduler"`
// for common args
CommonSubmitArgs `yaml:",inline"`
// SubmitTensorboardArgs stores tensorboard information
SubmitTensorboardArgs `yaml:",inline"`
// SubmitSyncCodeArgs stores syncing code information
SubmitSyncCodeArgs `yaml:",inline"`
// TFRuntime stores the runtime; excluded from YAML output.
TFRuntime `yaml:"-"`
}
type SubmitTensorboardArgs ¶
// SubmitTensorboardArgs is used to store tensorboard information.
type SubmitTensorboardArgs struct {
	// UseTensorboard matches option --tensorboard.
	UseTensorboard bool `yaml:"useTensorboard"`
	// TensorboardImage matches option --tensorboardImage.
	TensorboardImage string `yaml:"tensorboardImage"`
	// TrainingLogdir matches option --logdir.
	TrainingLogdir string `yaml:"trainingLogdir"`
	// HostLogPath is serialized under the key "hostLogPath".
	HostLogPath string `yaml:"hostLogPath"`
	// IsLocalLogging is serialized under the key "isLocalLogging".
	IsLocalLogging bool `yaml:"isLocalLogging"`
}
SubmitTensorboardArgs is used to store tensorboard information.
type SubmitVolcanoJobArgs ¶
// SubmitVolcanoJobArgs groups the arguments of a volcano job submission:
// job identity, command, scheduling settings and per-task settings.
type SubmitVolcanoJobArgs struct {
	// Name is the job name.
	Name string
	// Namespace is the namespace of the job.
	Namespace string
	// TrainingType accepts the job type.
	TrainingType TrainingJobType
	// Command is the job command.
	Command string
	// MinAvailable is the minimal number of available pods to run for this Job.
	MinAvailable int `yaml:"minAvailable"`
	// Queue is the queue that will be used in the scheduler; the "default"
	// queue is used when this is left empty.
	Queue string `yaml:"queue"`
	// SchedulerName is the default value of `tasks.template.spec.schedulerName`.
	SchedulerName string `yaml:"schedulerName"`
	// TaskName is the name of the task.
	TaskName string `yaml:"taskName"`
	// TaskImages lists the task images.
	TaskImages []string `yaml:"taskImages"`
	// TaskReplicas is the number of replicas of this Task in the Job.
	TaskReplicas int `yaml:"taskReplicas"`
	// TaskCPU is the cpu resource required for each replica of the Task in
	// the Job. Default is 250m.
	TaskCPU string `yaml:"taskCPU"`
	// TaskMemory is the memory resource required for each replica of the
	// Task in the Job. Default is 128Mi.
	TaskMemory string `yaml:"taskMemory"`
	// TaskPort is the task port.
	TaskPort int `yaml:"taskPort"`
}
type TFRuntime ¶
// TFRuntime is the customized runtime for tf training. On top of the
// embedded Runtime it adds two hooks that receive the tfjob arguments.
type TFRuntime interface {
	// Check checks the tfjob args and returns a non-nil error on failure.
	Check(tf *SubmitTFJobArgs) (err error)
	// Transform transforms the tfjob and returns a non-nil error on failure.
	Transform(tf *SubmitTFJobArgs) (err error)
	// Runtime contributes the remaining methods of the method set.
	Runtime
}
TFRuntime is the customized runtime for tf training.
type TensorFlowServingArgs ¶
// TensorFlowServingArgs holds the TensorFlow-serving specific options; the
// embedded CommonServingArgs is inlined into the same YAML document.
type TensorFlowServingArgs struct {
	// VersionPolicy matches option --versionPolicy.
	VersionPolicy string `yaml:"versionPolicy"`
	// ModelConfigFile matches option --modelConfigFile.
	ModelConfigFile string `yaml:"modelConfigFile"`
	// ModelConfigFileContent is serialized under the key "modelConfigFileContent".
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`
	// ModelName matches option --modelName.
	ModelName string `yaml:"modelName"`
	// ModelPath matches option --modelPath.
	ModelPath string `yaml:"modelPath"`
	// Port matches option --port.
	Port int `yaml:"port"`
	// RestfulPort matches option --restfulPort; its YAML key is "restApiPort".
	RestfulPort int `yaml:"restApiPort"`
	// CommonServingArgs carries the options common to serving jobs.
	CommonServingArgs `yaml:",inline"`
}
type TensorRTServingArgs ¶
// TensorRTServingArgs holds the TensorRT-serving specific options; the
// embedded CommonServingArgs is inlined into the same YAML document.
type TensorRTServingArgs struct {
	// ModelStore matches option --modelStore.
	ModelStore string `yaml:"modelStore"`
	// MetricsPort matches option --metricsPort.
	MetricsPort int `yaml:"metricsPort"`
	// HttpPort matches option --httpPort.
	HttpPort int `yaml:"httpPort"`
	// GrpcPort matches option --grpcPort.
	GrpcPort int `yaml:"grpcPort"`
	// AllowMetrics matches option --allowMetrics.
	AllowMetrics bool `yaml:"allowMetrics"`
	// CommonServingArgs carries the options common to serving jobs.
	CommonServingArgs `yaml:",inline"`
}
type TrafficRouterSplitArgs ¶
// TrafficRouterSplitArgs holds the options of a traffic-router split
// together with a list of per-version weights.
type TrafficRouterSplitArgs struct {
	// ServingName matches option --name.
	ServingName string `yaml:"servingName,omitempty"`
	// Namespace matches option --namespace.
	Namespace string `yaml:"namespace,omitempty"`
	// Versions matches option --versions.
	Versions string `yaml:"versions,omitempty"`
	// Weights matches option --weights.
	Weights string `yaml:"weights,omitempty"`
	// VersionWeights carries no tag of its own.
	// NOTE(review): presumably derived from Versions and Weights — confirm.
	VersionWeights []ServingVersionWeight
}
type TrainingJobInfo ¶
// TrainingJobInfo stores training job information.
type TrainingJobInfo struct {
	// Name is the name of the training job.
	Name string `json:"name" yaml:"name"`
	// Namespace is the namespace of the training job.
	Namespace string `json:"namespace" yaml:"namespace"`
	// Duration is the time of the training job.
	Duration string `json:"duration" yaml:"duration"`
	// Status is the status of the training job.
	Status TrainingJobStatus `json:"status" yaml:"status"`
	// Trainer is the training type of the training job.
	Trainer TrainingJobType `json:"trainer" yaml:"trainer"`
	// Tensorboard is the tensorboard of the training job.
	Tensorboard string `json:"tensorboard" yaml:"tensorboard"`
	// ChiefName is the name of the chief instance.
	ChiefName string `json:"chiefName" yaml:"chiefName"`
	// Instances lists the instances under the training job.
	Instances []TrainingJobInstance `json:"instances" yaml:"instances"`
	// Priority is the priority of the training job.
	Priority string `json:"priority" yaml:"priority"`
	// RequestGPU stores the requested gpus.
	RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"`
	// AllocatedGPU stores the allocated gpus.
	AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"`
	// CreationTimestamp stores the creation timestamp of the job.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}
TrainingJobInfo stores training job information
type TrainingJobInstance ¶
// TrainingJobInstance defines the instance of training job.
//
// Every field carries both a json and a yaml tag with the same key, so the
// two encodings stay in agreement regardless of a YAML library's default
// key-naming rule.
type TrainingJobInstance struct {
	// IP defines the instance ip.
	IP string `json:"ip" yaml:"ip"`
	// Status is the status of the instance.
	Status string `json:"status" yaml:"status"`
	// Name is the name of the instance.
	Name string `json:"name" yaml:"name"`
	// Age is the age of the instance.
	Age string `json:"age" yaml:"age"`
	// Node is the node the instance runs on.
	Node string `json:"node" yaml:"node"`
	// NodeIP stores the node ip.
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// IsChief tells whether the instance is chief or not.
	IsChief bool `json:"chief" yaml:"chief"`
	// RequestGPUs is used to store the request gpu count.
	RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"`
	// GPUMetrics stores the gpu metrics.
	// NOTE(review): the meaning of the map key is not visible here — confirm.
	GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}
TrainingJobInstance defines the instance of training job
type TrainingJobStatus ¶
// TrainingJobStatus defines all the kinds of JobStatus.
type TrainingJobStatus string
TrainingJobStatus defines all the kinds of JobStatus
const ( // TrainingJobPending means the job is pending TrainingJobPending TrainingJobStatus = "PENDING" // TrainingJobRunning means the job is running TrainingJobRunning TrainingJobStatus = "RUNNING" // TrainingJobSucceeded means the job is Succeeded TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED" // TrainingJobFailed means the job is failed TrainingJobFailed TrainingJobStatus = "FAILED" )
type TrainingJobType ¶
// TrainingJobType defines the supporting training job type.
type TrainingJobType string
TrainingJobType defines the supporting training job type
const ( // TFTrainingJob defines the tfjob TFTrainingJob TrainingJobType = "tfjob" // MPITrainingJob defines the mpijob MPITrainingJob TrainingJobType = "mpijob" // PytorchTrainingJob defines the pytorchjob PytorchTrainingJob TrainingJobType = "pytorchjob" // HorovodTrainingJob defines the horovod job HorovodTrainingJob TrainingJobType = "horovodjob" // VolcanoTrainingJob defines the volcano job VolcanoTrainingJob TrainingJobType = "volcanojob" // ETTrainingJob defines the etjob ETTrainingJob TrainingJobType = "etjob" // SparkTrainingJob defines the spark job SparkTrainingJob TrainingJobType = "sparkjob" // AllTrainingJob represents all job types AllTrainingJob TrainingJobType = "" // UnknownTrainingJob defines the unknown training UnknownTrainingJob TrainingJobType = "unknown" )
type TrainingJobTypeInfo ¶
// TrainingJobTypeInfo pairs a training job type with two extra string
// labels for it.
type TrainingJobTypeInfo struct {
	// Name is the training job type itself.
	Name TrainingJobType
	// Alias is a string label for the type.
	// NOTE(review): presumably an alternative user-facing name — confirm.
	Alias string
	// Shorthand is a string label for the type.
	// NOTE(review): presumably a short abbreviation — confirm.
	Shorthand string
}
type VirtualService ¶
// VirtualService wraps the istio v1alpha3 VirtualService by embedding a
// pointer to it and declares its own Http route list next to it.
type VirtualService struct {
	// The embedded istio type supplies the remaining fields.
	*istiov1alpha3.VirtualService
	// Http is the list of HTTP routes, serialized as "http" and omitted
	// when empty.
	// NOTE(review): presumably shadows the embedded type's field of the
	// same name — confirm.
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}
type VirtualServiceCRD ¶
// VirtualServiceCRD is the resource form of a VirtualService: kind and
// apiVersion, object metadata, and the VirtualService itself as the spec.
type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object
	// represents. Servers may infer this from the endpoint the client
	// submits requests to. Cannot be updated. In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`
	// APIVersion defines the versioned schema of this representation of an
	// object. Servers should convert recognized schemas to the latest
	// internal value, and may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	// ObjectMeta is embedded and serialized under the "metadata" key.
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	// Spec is the VirtualService carried by this resource.
	Spec VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}