Documentation
¶
Index ¶
- Constants
- Variables
- func CalculateAvailableReplicas(pods corev1.PodList) int32
- func CalculateDesiredReplicas(ctx context.Context, cluster *rayv1.RayCluster) int32
- func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList
- func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32
- func CalculateMinReplicas(cluster *rayv1.RayCluster) int32
- func CalculateMinResources(cluster *rayv1.RayCluster) corev1.ResourceList
- func CalculatePodResource(podSpec corev1.PodSpec) corev1.ResourceList
- func CalculateReadyReplicas(pods corev1.PodList) int32
- func CheckAllPodsRunning(ctx context.Context, runningPods corev1.PodList) bool
- func CheckLabel(s string) string
- func CheckName(s string) string
- func CheckRouteName(ctx context.Context, s string, n string) string
- func CompareJsonStruct(objA interface{}, objB interface{}) bool
- func Contains(elems []string, searchTerm string) bool
- func ConvertResourceListToMapString(resourceList corev1.ResourceList) map[string]resource.Quantity
- func EnvVarByName(envName string, envVars []corev1.EnvVar) (corev1.EnvVar, bool)
- func EnvVarExists(envName string, envVars []corev1.EnvVar) bool
- func ExtractRayIPFromFQDN(fqdnRayIP string) string
- func FetchHeadServiceURL(ctx context.Context, cli client.Client, rayCluster *rayv1.RayCluster, ...) (string, error)
- func FindContainerPort(container *corev1.Container, portName string, defaultPort int32) int32
- func FindHeadPodReadyCondition(headPod *corev1.Pod) metav1.Condition
- func FindRayClusterSuspendStatus(instance *rayv1.RayCluster) rayv1.RayClusterConditionType
- func FormatInt32(n int32) string
- func GenerateFQDNServiceName(ctx context.Context, cluster rayv1.RayCluster, namespace string) string
- func GenerateHeadServiceName(crdType CRDType, clusterSpec rayv1.RayClusterSpec, ownerName string) (string, error)
- func GenerateIdentifier(clusterName string, nodeType rayv1.RayNodeType) string
- func GenerateIngressName(clusterName string) string
- func GenerateJsonHash(obj interface{}) (string, error)
- func GenerateRayClusterName(serviceName string) string
- func GenerateRayJobId(rayjob string) string
- func GenerateRayWorkerReplicaGroupName(workerGroupName string) string
- func GenerateRouteName(clusterName string) string
- func GenerateServeServiceLabel(serviceName string) string
- func GenerateServeServiceName(serviceName string) string
- func GetClusterDomainName() string
- func GetClusterType() bool
- func GetContainerCommand(additionalOptions []string) []string
- func GetHeadGroupServiceAccountName(cluster *rayv1.RayCluster) string
- func GetNamespace(metaData metav1.ObjectMeta) string
- func GetRayClusterNameFromService(svc *corev1.Service) string
- func GetRayDashboardClientFunc(mgr manager.Manager, useKubernetesProxy bool) ...
- func GetRayHttpProxyClientFunc(mgr manager.Manager, useKubernetesProxy bool) ...
- func GetRayServiceClusterUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.ClusterUpgradeOptions
- func GetWeightsFromHTTPRoute(httpRoute *gwv1.HTTPRoute, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32)
- func GetWorkerGroupDesiredReplicas(ctx context.Context, workerGroupSpec rayv1.WorkerGroupSpec) int32
- func HasSubmitter(rayJobInstance *rayv1.RayJob) bool
- func InconsistentRayClusterStatus(oldStatus rayv1.RayClusterStatus, newStatus rayv1.RayClusterStatus) bool
- func InconsistentRayServiceStatuses(oldStatus rayv1.RayServiceStatuses, newStatus rayv1.RayServiceStatuses) bool
- func IsAuthEnabled(spec *rayv1.RayClusterSpec) bool
- func IsAutoscalingEnabled(spec *rayv1.RayClusterSpec) bool
- func IsAutoscalingV2Enabled(spec *rayv1.RayClusterSpec) bool
- func IsCreated(pod *corev1.Pod) bool
- func IsDeterministicHeadPodNameEnabled() bool
- func IsGCSFaultToleranceEnabled(spec *rayv1.RayClusterSpec, annotations map[string]string) bool
- func IsGPUResourceKey(key string) bool
- func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool
- func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool
- func IsIncrementalUpgradeComplete(rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) bool
- func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool
- func IsJobFinished(j *batchv1.Job) (batchv1.JobConditionType, bool)
- func IsRunningAndReady(pod *corev1.Pod) bool
- func ManagedByExternalController(controllerName *string) *string
- func PodName(prefix string, nodeType rayv1.RayNodeType, isGenerateName bool) string
- func RayClusterReplicaFailureReason(err error) string
- func RayOriginatedFromCRDLabelValue(crdType CRDType) string
- func SafeInt64ToInt32(n int64) int32
- func SafeUint64ToInt64(n uint64) int64
- func SumResourceList(list []corev1.ResourceList) corev1.ResourceList
- func TrimJobName(jobName string) string
- func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error
- func ValidateRayClusterMetadata(metadata metav1.ObjectMeta) error
- func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]string) error
- func ValidateRayClusterStatus(instance *rayv1.RayCluster) error
- func ValidateRayJobMetadata(metadata metav1.ObjectMeta) error
- func ValidateRayJobSpec(rayJob *rayv1.RayJob) error
- func ValidateRayJobStatus(rayJob *rayv1.RayJob) error
- func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error
- func ValidateRayServiceSpec(rayService *rayv1.RayService) error
- type CRDType
- type ClientProvider
- type FakeRayDashboardClient
- func (r *FakeRayDashboardClient) DeleteJob(_ context.Context, _ string) error
- func (r *FakeRayDashboardClient) GetJobInfo(ctx context.Context, jobId string) (*utiltypes.RayJobInfo, error)
- func (r *FakeRayDashboardClient) GetJobLog(_ context.Context, _ string) (*string, error)
- func (r *FakeRayDashboardClient) GetMultiApplicationStatus(_ context.Context) (map[string]*utiltypes.ServeApplicationStatus, error)
- func (r *FakeRayDashboardClient) GetServeDetails(_ context.Context) (*utiltypes.ServeDetails, error)
- func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string, _ string)
- func (r *FakeRayDashboardClient) ListJobs(ctx context.Context) (*[]utiltypes.RayJobInfo, error)
- func (r *FakeRayDashboardClient) SetMultiApplicationStatuses(statuses map[string]*utiltypes.ServeApplicationStatus)
- func (r *FakeRayDashboardClient) StopJob(_ context.Context, _ string) (err error)
- func (r *FakeRayDashboardClient) SubmitJob(_ context.Context, _ *rayv1.RayJob) (jobId string, err error)
- func (r *FakeRayDashboardClient) SubmitJobReq(_ context.Context, _ *utiltypes.RayJobRequest) (string, error)
- func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error
- type FakeRayHttpProxyClient
- type K8sEventType
- type RayHttpProxyClient
- type RayHttpProxyClientInterface
- type ServiceType
Constants ¶
const ( // Default application name DefaultServeAppName = "default" // RayOriginatedFromCRNameLabelKey and RayOriginatedFromCRDLabelKey are the labels used to associate the root KubeRay Custom Resource. // [Example 1] If we create a RayJob named `myjob`, then (1) the RayCluster and (2) the submitter K8s Job will have a // `ray.io/originated-from-cr-name=myjob` and a `ray.io/originated-from-crd=RayJob` label. // // [Example 2] If we create a RayService named `mysvc`, then (1) the RayCluster and (2) the Kubernetes services managed by the RayService // will have a `ray.io/originated-from-cr-name=mysvc` and a `ray.io/originated-from-crd=RayService` label. RayOriginatedFromCRNameLabelKey = "ray.io/originated-from-cr-name" RayOriginatedFromCRDLabelKey = "ray.io/originated-from-crd" RayClusterLabelKey = "ray.io/cluster" RayNodeTypeLabelKey = "ray.io/node-type" RayNodeGroupLabelKey = "ray.io/group" RayNodeLabelKey = "ray.io/is-ray-node" RayIDLabelKey = "ray.io/identifier" RayClusterServingServiceLabelKey = "ray.io/serve" RayClusterHeadlessServiceLabelKey = "ray.io/headless-worker-svc" HashWithoutReplicasAndWorkersToDeleteKey = "ray.io/hash-without-replicas-and-workers-to-delete" NumWorkerGroupsKey = "ray.io/num-worker-groups" KubeRayVersion = "ray.io/kuberay-version" // Labels for feature RayMultihostIndexing // // RayWorkerReplicaNameKey label is the unique name for the replica in a specific worker group. It is made up // of the worker group name and a unique identifier (e.g. multi-host-worker-group-xh3hf). This label is unique // across RayClusters. RayWorkerReplicaNameKey = "ray.io/worker-group-replica-name" // RayWorkerReplicaIndexKey label is the integer index for the replica in its worker group (0 to replicas-1). // The value for this label is unique within its worker group, but not across worker groups or RayClusters. RayWorkerReplicaIndexKey = "ray.io/worker-group-replica-index" // RayHostIndexKey label represents the index of the host within the replica group. 
RayHostIndexKey = "ray.io/replica-host-index" // In KubeRay, the Ray container must be the first application container in a head or worker Pod. RayContainerIndex = 0 // Batch scheduling labels // TODO(tgaddair): consider making these part of the CRD RaySchedulerName = "ray.io/scheduler-name" RayPriorityClassName = "ray.io/priority-class-name" RayGangSchedulingEnabled = "ray.io/gang-scheduling-enabled" // Ray GCS FT related annotations RayFTEnabledAnnotationKey = "ray.io/ft-enabled" RayExternalStorageNSAnnotationKey = "ray.io/external-storage-namespace" // If this annotation is set to "true", the KubeRay operator will not modify the container's command. // However, the generated `ray start` command will still be stored in the container's environment variable // `KUBERAY_GEN_RAY_START_CMD`. RayOverwriteContainerCmdAnnotationKey = "ray.io/overwrite-container-cmd" // RayServiceInitializingTimeoutAnnotation specifies the timeout for RayService initialization. // Accepts Go duration format (e.g., "30m", "1h") or integer seconds. // // Behavior when timeout is exceeded: // - RayServiceReady condition is set to False with reason InitializingTimeout // - RayService enters a terminal failure state (cannot be recovered by spec updates) // - Cluster names are cleared, triggering cleanup of RayCluster resources // - A Warning event is emitted with timeout details // // Recovery after timeout: // The RayService must be deleted and recreated. Updating the spec will NOT retry initialization. RayServiceInitializingTimeoutAnnotation = "ray.io/initializing-timeout" // RayJob default cluster selector key RayJobClusterSelectorKey = "ray.io/cluster" // Finalizers for GCS fault tolerance GCSFaultToleranceRedisCleanupFinalizer = "ray.io/gcs-ft-redis-cleanup-finalizer" // EnableServeServiceKey is exclusively utilized to indicate if a RayCluster is directly used for serving. // See https://github.com/ray-project/kuberay/pull/1672 for more details. 
EnableServeServiceKey = "ray.io/enable-serve-service" EnableServeServiceTrue = "true" EnableRayClusterServingServiceTrue = "true" EnableRayClusterServingServiceFalse = "false" KubernetesApplicationNameLabelKey = "app.kubernetes.io/name" KubernetesCreatedByLabelKey = "app.kubernetes.io/created-by" // Use as separator for pod name, for example, raycluster-small-size-worker-0 DashSymbol = "-" // Use as default port DefaultClientPort = 10001 DefaultGcsServerPort = 6379 DefaultDashboardPort = 8265 DefaultMetricsPort = 8080 DefaultDashboardAgentListenPort = 52365 DefaultServingPort = 8000 ClientPortName = "client" GcsServerPortName = "gcs-server" DashboardPortName = "dashboard" MetricsPortName = "metrics" ServingPortName = "serve" // Gateway defaults for HTTP protocol GatewayListenerPortName = "http" DefaultGatewayListenerPort = 80 // The default AppProtocol for Kubernetes service DefaultServiceAppProtocol = "tcp" // The default application name ApplicationName = "kuberay" // The default name for kuberay operator ComponentName = "kuberay-operator" // The default suffix for Headless Service for multi-host worker groups. // The full name will be of the form "${RayCluster_Name}-headless". 
HeadlessServiceSuffix = "headless" // Use as container env variable RAY_CLUSTER_NAME = "RAY_CLUSTER_NAME" RAY_IP = "RAY_IP" FQ_RAY_IP = "FQ_RAY_IP" RAY_PORT = "RAY_PORT" RAY_ADDRESS = "RAY_ADDRESS" RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" REDIS_PASSWORD = "REDIS_PASSWORD" REDIS_USERNAME = "REDIS_USERNAME" RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE = "RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE" RAY_EXTERNAL_STORAGE_NS = "RAY_external_storage_namespace" RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S = "RAY_gcs_rpc_server_reconnect_timeout_s" RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO = "RAY_timeout_ms_task_wait_for_death_info" RAY_GCS_SERVER_REQUEST_TIMEOUT_SECONDS = "RAY_gcs_server_request_timeout_seconds" RAY_SERVE_KV_TIMEOUT_S = "RAY_SERVE_KV_TIMEOUT_S" RAY_USAGE_STATS_KUBERAY_IN_USE = "RAY_USAGE_STATS_KUBERAY_IN_USE" RAY_USAGE_STATS_EXTRA_TAGS = "RAY_USAGE_STATS_EXTRA_TAGS" RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV = "RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV" RAYCLUSTER_DEFAULT_REQUEUE_SECONDS = 300 KUBERAY_GEN_RAY_START_CMD = "KUBERAY_GEN_RAY_START_CMD" // Environment variables for RayJob submitter Kubernetes Job. // Example: ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID ... RAY_DASHBOARD_ADDRESS = "RAY_DASHBOARD_ADDRESS" RAY_JOB_SUBMISSION_ID = "RAY_JOB_SUBMISSION_ID" // Environment variables for Ray Autoscaler V2. // The value of RAY_CLOUD_INSTANCE_ID is the Pod name for Autoscaler V2 alpha. This may change in the future. RAY_CLOUD_INSTANCE_ID = "RAY_CLOUD_INSTANCE_ID" // The value of RAY_NODE_TYPE_NAME is the name of the node group (i.e., the value of the "ray.io/group" label). RAY_NODE_TYPE_NAME = "RAY_NODE_TYPE_NAME" RAY_ENABLE_AUTOSCALER_V2 = "RAY_enable_autoscaler_v2" // RAY_AUTH_MODE_ENV_VAR is the Ray environment variable for configuring the authentication mode RAY_AUTH_MODE_ENV_VAR = "RAY_AUTH_MODE" // RAY_AUTH_TOKEN_ENV_VAR is the Ray environment variable containing the authentication token. 
RAY_AUTH_TOKEN_ENV_VAR = "RAY_AUTH_TOKEN" // #nosec G101 // RAY_AUTH_TOKEN_SECRET_KEY is the key used in the Secret containing Ray auth token RAY_AUTH_TOKEN_SECRET_KEY = "auth_token" // This KubeRay operator environment variable is used to determine if random Pod // deletion should be enabled. Note that this only takes effect when autoscaling // is enabled for the RayCluster. This is a feature flag for v0.6.0, and will be // removed if the default behavior is stable enough. ENABLE_RANDOM_POD_DELETE = "ENABLE_RANDOM_POD_DELETE" // This KubeRay operator environment variable is used to determine if the Redis // cleanup Job should be enabled. This is a feature flag for v1.0.0. ENABLE_GCS_FT_REDIS_CLEANUP = "ENABLE_GCS_FT_REDIS_CLEANUP" // This environment variable for the KubeRay operator is used to determine whether to enable // the injection of readiness and liveness probes into Ray head and worker containers. // Enabling this feature contributes to the robustness of Ray clusters. It is currently a feature // flag for v1.1.0 and will be removed if the behavior proves to be stable enough. ENABLE_PROBES_INJECTION = "ENABLE_PROBES_INJECTION" // This KubeRay operator environment variable is used to determine // if operator should treat OpenShift cluster as Vanilla Kubernetes. USE_INGRESS_ON_OPENSHIFT = "USE_INGRESS_ON_OPENSHIFT" // If set to true, kuberay creates a normal ClusterIP service for a Ray Head instead of a Headless service. ENABLE_RAY_HEAD_CLUSTER_IP_SERVICE = "ENABLE_RAY_HEAD_CLUSTER_IP_SERVICE" // If set to true, the RayJob CR itself will be deleted if shutdownAfterJobFinishes is set to true. Note that all resources created by the RayJob CR will be deleted, including the K8s Job. DELETE_RAYJOB_CR_AFTER_JOB_FINISHES = "DELETE_RAYJOB_CR_AFTER_JOB_FINISHES" // If this occurs, it is likely due to a system-level issue (e.g., a Ray bug) that prevents the // `ray job submit` process in the Kubernetes Job submitter from exiting. 
RAYJOB_DEPLOYMENT_STATUS_TRANSITION_GRACE_PERIOD_SECONDS = "RAYJOB_DEPLOYMENT_STATUS_TRANSITION_GRACE_PERIOD_SECONDS" DEFAULT_RAYJOB_DEPLOYMENT_STATUS_TRANSITION_GRACE_PERIOD_SECONDS = 300 // This environment variable for the KubeRay operator determines whether to enable // a login shell by passing the -l option to the container command /bin/bash. // The -l flag was added by default before KubeRay v1.4.0, but it is no longer added // by default starting with v1.4.0. ENABLE_LOGIN_SHELL = "ENABLE_LOGIN_SHELL" // If set to true, we will use deterministic name for head pod. Otherwise, the non-deterministic name is used. ENABLE_DETERMINISTIC_HEAD_POD_NAME = "ENABLE_DETERMINISTIC_HEAD_POD_NAME" // Ray core default configurations DefaultWorkerRayGcsReconnectTimeoutS = "600" LOCAL_HOST = "127.0.0.1" // Ray FT default readiness probe values DefaultReadinessProbeInitialDelaySeconds = 10 DefaultReadinessProbeTimeoutSeconds = 2 // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz) DefaultHeadReadinessProbeTimeoutSeconds = 5 DefaultReadinessProbePeriodSeconds = 5 DefaultReadinessProbeSuccessThreshold = 1 DefaultReadinessProbeFailureThreshold = 10 ServeReadinessProbeFailureThreshold = 1 // Ray FT default liveness probe values DefaultLivenessProbeInitialDelaySeconds = 30 DefaultLivenessProbeTimeoutSeconds = 2 // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz) DefaultHeadLivenessProbeTimeoutSeconds = 5 DefaultLivenessProbePeriodSeconds = 5 DefaultLivenessProbeSuccessThreshold = 1 DefaultLivenessProbeFailureThreshold = 120 // Ray health check related configurations // Note: Since the Raylet process and the dashboard agent process are fate-sharing, // only one of them needs to be checked. So, RayAgentRayletHealthPath accesses the dashboard agent's API endpoint // to check the health of the Raylet process. 
// TODO (kevin85421): Should we take the dashboard process into account? RayAgentRayletHealthPath = "api/local_raylet_healthz" RayDashboardGCSHealthPath = "api/gcs_healthz" RayServeProxyHealthPath = "-/healthz" BaseWgetHealthCommand = "wget --tries 1 -T %d -q -O- http://localhost:%d/%s | grep success" // Finalizers for RayJob RayJobStopJobFinalizer = "ray.io/rayjob-finalizer" // RayNodeHeadGroupLabelValue is the value for the RayNodeGroupLabelKey label on a head node RayNodeHeadGroupLabelValue = "headgroup" RayNodeSubmitterGroupLabelValue = "submittergroup" // SubmitterContainerName is the default name of the job submit container injected into the head Pod in SidecarMode. SubmitterContainerName = "ray-job-submitter" // KUBERAY_VERSION is the build version of KubeRay. // The version is included in the RAY_USAGE_STATS_EXTRA_TAGS environment variable // as well as the user-agent. This constant is updated before release. // TODO: Update KUBERAY_VERSION to be a build-time variable. KUBERAY_VERSION = "v1.5.1" // KubeRayController represents the value of the default job controller KubeRayController = "ray.io/kuberay-operator" ServeConfigLRUSize = 1000 // MaxRayClusterNameLength is the maximum RayCluster name to make sure we don't truncate // their k8s service names. Currently, "-serve-svc" is the longest service suffix: // 63 - len("-serve-svc") == 53, so the name should not be longer than 53 characters. MaxRayClusterNameLength = 53 // MaxRayServiceNameLength is the maximum RayService name to make sure it pass the RayCluster validation. // Minus 6 since we append 6 characters to the RayService name to create the cluster (GenerateRayClusterName). MaxRayServiceNameLength = MaxRayClusterNameLength - 6 // MaxRayJobNameLength is the maximum RayJob name to make sure it pass the RayCluster validation // Minus 6 since we append 6 characters to the RayJob name to create the cluster (GenerateRayClusterName). MaxRayJobNameLength = MaxRayClusterNameLength - 6 )
const ( ServeName = "serve" ClusterDomainEnvKey = "CLUSTER_DOMAIN" DefaultDomainName = "cluster.local" ContainersNotReady = "ContainersNotReady" )
Variables ¶
var ( ErrFailedDeleteAllPods = &errRayClusterReplicaFailure{reason: "FailedDeleteAllPods"} ErrFailedDeleteHeadPod = &errRayClusterReplicaFailure{reason: "FailedDeleteHeadPod"} ErrFailedCreateHeadPod = &errRayClusterReplicaFailure{reason: "FailedCreateHeadPod"} ErrFailedDeleteWorkerPod = &errRayClusterReplicaFailure{reason: "FailedDeleteWorkerPod"} ErrFailedCreateWorkerPod = &errRayClusterReplicaFailure{reason: "FailedCreateWorkerPod"} )
These are markers used by the calculateStatus() for setting the RayClusterReplicaFailure condition.
Functions ¶
func CalculateAvailableReplicas ¶
CalculateAvailableReplicas calculates available worker replicas at the cluster level. A worker is available if its Pod is running.
func CalculateDesiredReplicas ¶
func CalculateDesiredReplicas(ctx context.Context, cluster *rayv1.RayCluster) int32
CalculateDesiredReplicas calculates desired worker replicas at the cluster level
func CalculateDesiredResources ¶
func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList
func CalculateMaxReplicas ¶
func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32
CalculateMaxReplicas calculates max worker replicas at the cluster level
func CalculateMinReplicas ¶
func CalculateMinReplicas(cluster *rayv1.RayCluster) int32
CalculateMinReplicas calculates min worker replicas at the cluster level
func CalculateMinResources ¶
func CalculateMinResources(cluster *rayv1.RayCluster) corev1.ResourceList
func CalculatePodResource ¶ added in v1.2.2
func CalculatePodResource(podSpec corev1.PodSpec) corev1.ResourceList
CalculatePodResource returns the total resources of a pod. Request values take precedence over limit values.
func CalculateReadyReplicas ¶ added in v1.2.0
CalculateReadyReplicas calculates ready worker replicas at the cluster level. A worker is ready if its Pod has a PodCondition with type == Ready and status == True.
func CheckAllPodsRunning ¶ added in v0.6.0
CheckAllPodsRunning returns true if all the RayCluster's Pods are running, false otherwise
func CheckLabel ¶
CheckLabel makes sure the label value does not start with a punctuation character and the total length is < 63 characters
func CheckName ¶
CheckName makes sure the name does not start with a numeric value and the total length is < 63 char
func CheckRouteName ¶ added in v1.1.0
func CompareJsonStruct ¶
func CompareJsonStruct(objA interface{}, objB interface{}) bool
CompareJsonStruct is a way to better compare whether two objects are the same when they are JSON/YAML structs. reflect.DeepEqual will fail in some cases.
func ConvertResourceListToMapString ¶ added in v1.2.2
func ConvertResourceListToMapString(resourceList corev1.ResourceList) map[string]resource.Quantity
func EnvVarByName ¶ added in v1.1.0
EnvVarByName returns an entry in []corev1.EnvVar that matches a name. Also returns a bool for whether the env var exists.
func ExtractRayIPFromFQDN ¶
ExtractRayIPFromFQDN extracts the head service name (i.e., RAY_IP, deprecated) from a fully qualified domain name (FQDN). This function is provided for backward compatibility purposes only.
func FetchHeadServiceURL ¶ added in v0.6.0
func FetchHeadServiceURL(ctx context.Context, cli client.Client, rayCluster *rayv1.RayCluster, defaultPortName string) (string, error)
FetchHeadServiceURL fetches the URL that consists of the FQDN for the RayCluster's head service and the port with the given port name (defaultPortName).
func FindContainerPort ¶ added in v0.6.0
FindContainerPort searches for a specific port $portName in the container. If the port is found in the container, the corresponding port is returned. If the port is not found, the $defaultPort is returned instead.
func FindHeadPodReadyCondition ¶ added in v1.2.0
func FindRayClusterSuspendStatus ¶ added in v1.3.0
func FindRayClusterSuspendStatus(instance *rayv1.RayCluster) rayv1.RayClusterConditionType
FindRayClusterSuspendStatus returns the current suspend status from two conditions:
- rayv1.RayClusterSuspending
- rayv1.RayClusterSuspended
The two conditions should not be both True at the same time. The transition logic should be the following:
rayv1.RayClusterSuspending:
  False by default
  False -> True: when `spec.Suspend` is true.
  True -> False: when all Pods are deleted, set rayv1.RayClusterSuspended from False to True.
rayv1.RayClusterSuspended:
  False by default
  False -> True: when suspending transitions from True to False
  True -> False: when `spec.Suspend` is false.
If both rayv1.RayClusterSuspending and rayv1.RayClusterSuspended are False, FindRayClusterSuspendStatus returns "".
func FormatInt32 ¶
FormatInt32 returns the string representation of n. Unlike strconv.FormatInt, whose documentation the previous wording was copied from, FormatInt32 takes no base parameter.
func GenerateFQDNServiceName ¶
func GenerateFQDNServiceName(ctx context.Context, cluster rayv1.RayCluster, namespace string) string
GenerateFQDNServiceName generates a Fully Qualified Domain Name.
func GenerateHeadServiceName ¶ added in v1.0.0
func GenerateHeadServiceName(crdType CRDType, clusterSpec rayv1.RayClusterSpec, ownerName string) (string, error)
GenerateHeadServiceName generates a Ray head service name. Note that there are two types of head services:
(1) For RayCluster: If `HeadService.Name` in the cluster spec is not empty, it will be used as the head service name. Otherwise, the name is generated based on the RayCluster CR's name. (2) For RayService: It's important to note that the RayService CR not only possesses a head service owned by its RayCluster CR but also maintains a separate head service for itself to facilitate zero-downtime upgrades. The name of the head service owned by the RayService CR is generated based on the RayService CR's name.
@param crdType: The type of the CRD that owns the head service. @param clusterSpec: `RayClusterSpec` @param ownerName: The name of the CR that owns the head service.
func GenerateIdentifier ¶
func GenerateIdentifier(clusterName string, nodeType rayv1.RayNodeType) string
GenerateIdentifier generates identifier of same group pods
func GenerateIngressName ¶
GenerateIngressName generates an ingress name from cluster name
func GenerateJsonHash ¶
Json-serializes obj and returns its hash string
func GenerateRayClusterName ¶
GenerateRayClusterName generates a ray cluster name from ray service name
func GenerateRayJobId ¶
GenerateRayJobId generates a ray job id for submission
func GenerateRayWorkerReplicaGroupName ¶ added in v1.5.0
GenerateRayWorkerReplicaGroupName generates a name for the replica group currently used for RayMultiHostIndexing
func GenerateRouteName ¶ added in v1.0.0
GenerateRouteName generates a route name from the cluster name
func GenerateServeServiceLabel ¶
GenerateServeServiceLabel generates label value for serve service selector.
func GenerateServeServiceName ¶
GenerateServeServiceName generates name for serve service.
func GetClusterDomainName ¶
func GetClusterDomainName() string
GetClusterDomainName returns cluster's domain name
func GetClusterType ¶ added in v1.4.0
func GetClusterType() bool
Check where we are running. We are trying to distinguish here whether this is a vanilla Kubernetes cluster or OpenShift.
func GetContainerCommand ¶ added in v1.4.0
func GetHeadGroupServiceAccountName ¶
func GetHeadGroupServiceAccountName(cluster *rayv1.RayCluster) string
GetHeadGroupServiceAccountName returns the head group service account if it exists. Otherwise, it returns the name of the cluster itself.
func GetNamespace ¶
func GetNamespace(metaData metav1.ObjectMeta) string
GetNamespace returns the namespace.
func GetRayClusterNameFromService ¶ added in v1.3.0
GetRayClusterNameFromService returns the name of the RayCluster that the service points to
func GetRayDashboardClientFunc ¶
func GetRayDashboardClientFunc(mgr manager.Manager, useKubernetesProxy bool) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
func GetRayServiceClusterUpgradeOptions ¶ added in v1.5.0
func GetRayServiceClusterUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.ClusterUpgradeOptions
func GetWeightsFromHTTPRoute ¶ added in v1.5.0
func GetWeightsFromHTTPRoute(httpRoute *gwv1.HTTPRoute, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32)
GetWeightsFromHTTPRoute parses a given HTTPRoute object and extracts the traffic weights for the active and pending clusters (if present) of a RayService.
func GetWorkerGroupDesiredReplicas ¶ added in v1.0.0
func GetWorkerGroupDesiredReplicas(ctx context.Context, workerGroupSpec rayv1.WorkerGroupSpec) int32
func HasSubmitter ¶ added in v1.5.0
func InconsistentRayClusterStatus ¶ added in v1.5.0
func InconsistentRayClusterStatus(oldStatus rayv1.RayClusterStatus, newStatus rayv1.RayClusterStatus) bool
Checks whether the old and new RayClusterStatus are inconsistent by comparing different fields. If the only differences between the old and new status are the `LastUpdateTime` and `ObservedGeneration` fields, the status update will not be triggered.
TODO (kevin85421): The field `ObservedGeneration` is not being well-maintained at the moment. In the future, this field should be used to determine whether to update this CR or not.
func InconsistentRayServiceStatuses ¶ added in v1.5.0
func InconsistentRayServiceStatuses(oldStatus rayv1.RayServiceStatuses, newStatus rayv1.RayServiceStatuses) bool
Determine whether to update the status of the RayService instance.
func IsAuthEnabled ¶ added in v1.5.1
func IsAuthEnabled(spec *rayv1.RayClusterSpec) bool
IsAuthEnabled returns whether Ray auth is enabled.
func IsAutoscalingEnabled ¶ added in v1.3.0
func IsAutoscalingEnabled(spec *rayv1.RayClusterSpec) bool
func IsAutoscalingV2Enabled ¶ added in v1.4.0
func IsAutoscalingV2Enabled(spec *rayv1.RayClusterSpec) bool
func IsDeterministicHeadPodNameEnabled ¶ added in v1.5.0
func IsDeterministicHeadPodNameEnabled() bool
IsDeterministicHeadPodNameEnabled returns true if deterministic head pod name is enabled.
func IsGCSFaultToleranceEnabled ¶ added in v1.3.0
func IsGCSFaultToleranceEnabled(spec *rayv1.RayClusterSpec, annotations map[string]string) bool
Check if the RayCluster has GCS fault tolerance enabled.
func IsGPUResourceKey ¶ added in v1.5.0
func IsGatewayReady ¶ added in v1.5.0
IsGatewayReady checks if a Gateway is considered "ready".
A Gateway is "ready" only if both the `Accepted` and `Programmed` conditions are set to 'True'.
'Accepted': Signifies that the Gateway controller understands and accepts the Gateway resource. If 'False', it often indicates a conflict or an invalid specification.
'Programmed': Signifies that the underlying network infrastructure for the Gateway (e.g. load balancer) has been successfully provisioned and configured.
func IsHTTPRouteReady ¶ added in v1.5.0
IsHTTPRouteReady checks if an HTTPRoute is considered ready for a given Gateway.
It returns true only if the route's parent status entry matching the Gateway has both the 'Accepted' and 'ResolvedRefs' conditions set to 'True'.
'Accepted': Signifies that the Gateway controller has validated the HTTPRoute's configuration (e.g. syntax, filters, matching rules). An 'Accepted' status of 'False' means the route's specification is invalid.
'ResolvedRefs': Signifies that all references within the route are valid, exist, and are resolvable by the Gateway.
func IsIncrementalUpgradeComplete ¶ added in v1.5.0
func IsIncrementalUpgradeComplete(rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) bool
IsIncrementalUpgradeComplete checks if the conditions for completing an incremental upgrade are met.
func IsIncrementalUpgradeEnabled ¶ added in v1.5.0
func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool
func IsJobFinished ¶ added in v1.1.0
func IsJobFinished(j *batchv1.Job) (batchv1.JobConditionType, bool)
IsJobFinished checks whether the given Job has finished execution. It does not discriminate between successful and failed terminations. src: https://github.com/kubernetes/kubernetes/blob/a8a1abc25cad87333840cd7d54be2efaf31a3177/pkg/controller/job/utils.go#L26
func IsRunningAndReady ¶
IsRunningAndReady returns true if the pod is in the PodRunning phase and has a condition of PodReady.
func ManagedByExternalController ¶ added in v1.3.0
func PodName ¶ added in v1.4.0
func PodName(prefix string, nodeType rayv1.RayNodeType, isGenerateName bool) string
PodName returns the value that should be used for a Pod's Name or GenerateName based on the RayCluster name and node type (head or worker).
func RayClusterReplicaFailureReason ¶ added in v1.2.0
func RayOriginatedFromCRDLabelValue ¶ added in v1.1.0
RayOriginatedFromCRDLabelValue generates a value for the label RayOriginatedFromCRDLabelKey. This is also the only function used to construct the label filter for resources that originated from a given CRDType.
func SafeInt64ToInt32 ¶ added in v1.5.1
SafeInt64ToInt32 converts int64 to int32, preventing overflow/underflow by clamping the value to the range [math.MinInt32, math.MaxInt32].
func SafeUint64ToInt64 ¶ added in v1.4.0
SafeUint64ToInt64 safely converts a uint64 to int64. If the uint64 value exceeds the maximum int64 value, the function will panic.
func SumResourceList ¶ added in v1.5.0
func SumResourceList(list []corev1.ResourceList) corev1.ResourceList
func TrimJobName ¶ added in v1.3.0
TrimJobName uses CheckLabel to trim the Kubernetes Job name so that it meets its constraints.
func ValidateClusterUpgradeOptions ¶ added in v1.5.0
func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error
func ValidateRayClusterMetadata ¶ added in v1.4.0
func ValidateRayClusterMetadata(metadata metav1.ObjectMeta) error
func ValidateRayClusterSpec ¶ added in v1.3.0
func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]string) error
Validation for invalid Ray Cluster configurations.
func ValidateRayClusterStatus ¶ added in v1.3.0
func ValidateRayClusterStatus(instance *rayv1.RayCluster) error
func ValidateRayJobMetadata ¶ added in v1.4.0
func ValidateRayJobMetadata(metadata metav1.ObjectMeta) error
func ValidateRayJobSpec ¶ added in v1.3.0
func ValidateRayJobStatus ¶ added in v1.3.0
func ValidateRayServiceMetadata ¶ added in v1.4.0
func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error
func ValidateRayServiceSpec ¶ added in v1.3.0
func ValidateRayServiceSpec(rayService *rayv1.RayService) error
Types ¶
type CRDType ¶ added in v1.0.0
type CRDType string
TODO (kevin85421): Define CRDType here rather than constant.go to avoid circular dependency.
func GetCRDType ¶ added in v1.1.0
type ClientProvider ¶ added in v1.2.0
type ClientProvider interface {
GetDashboardClient(mgr manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
GetHttpProxyClient(mgr manager.Manager) func(hostIp, podNamespace, podName string, port int) RayHttpProxyClientInterface
}
type FakeRayDashboardClient ¶
type FakeRayDashboardClient struct {
GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)]
LastUpdatedConfig []byte
// contains filtered or unexported fields
}
func (*FakeRayDashboardClient) DeleteJob ¶ added in v1.1.0
func (r *FakeRayDashboardClient) DeleteJob(_ context.Context, _ string) error
func (*FakeRayDashboardClient) GetJobInfo ¶
func (r *FakeRayDashboardClient) GetJobInfo(ctx context.Context, jobId string) (*utiltypes.RayJobInfo, error)
func (*FakeRayDashboardClient) GetMultiApplicationStatus ¶ added in v0.6.0
func (r *FakeRayDashboardClient) GetMultiApplicationStatus(_ context.Context) (map[string]*utiltypes.ServeApplicationStatus, error)
func (*FakeRayDashboardClient) GetServeDetails ¶ added in v0.6.0
func (r *FakeRayDashboardClient) GetServeDetails(_ context.Context) (*utiltypes.ServeDetails, error)
func (*FakeRayDashboardClient) InitClient ¶
func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string, _ string)
func (*FakeRayDashboardClient) ListJobs ¶ added in v1.1.0
func (r *FakeRayDashboardClient) ListJobs(ctx context.Context) (*[]utiltypes.RayJobInfo, error)
func (*FakeRayDashboardClient) SetMultiApplicationStatuses ¶ added in v0.6.0
func (r *FakeRayDashboardClient) SetMultiApplicationStatuses(statuses map[string]*utiltypes.ServeApplicationStatus)
func (*FakeRayDashboardClient) StopJob ¶
func (r *FakeRayDashboardClient) StopJob(_ context.Context, _ string) (err error)
func (*FakeRayDashboardClient) SubmitJobReq ¶ added in v1.1.0
func (r *FakeRayDashboardClient) SubmitJobReq(_ context.Context, _ *utiltypes.RayJobRequest) (string, error)
func (*FakeRayDashboardClient) UpdateDeployments ¶
func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error
type FakeRayHttpProxyClient ¶
type FakeRayHttpProxyClient struct {
IsHealthy bool
}
func (*FakeRayHttpProxyClient) CheckProxyActorHealth ¶ added in v1.2.0
func (fc *FakeRayHttpProxyClient) CheckProxyActorHealth(_ context.Context) error
type K8sEventType ¶ added in v1.2.2
type K8sEventType string
Currently, KubeRay fires events when failures occur during the creation or deletion of resources.
const ( // RayCluster event list InvalidRayClusterStatus K8sEventType = "InvalidRayClusterStatus" InvalidRayClusterSpec K8sEventType = "InvalidRayClusterSpec" InvalidRayClusterMetadata K8sEventType = "InvalidRayClusterMetadata" // Head Pod event list CreatedHeadPod K8sEventType = "CreatedHeadPod" FailedToCreateHeadPod K8sEventType = "FailedToCreateHeadPod" DeletedHeadPod K8sEventType = "DeletedHeadPod" FailedToDeleteHeadPod K8sEventType = "FailedToDeleteHeadPod" // Worker Pod event list CreatedWorkerPod K8sEventType = "CreatedWorkerPod" FailedToCreateWorkerPod K8sEventType = "FailedToCreateWorkerPod" DeletedWorkerPod K8sEventType = "DeletedWorkerPod" FailedToDeleteWorkerPod K8sEventType = "FailedToDeleteWorkerPod" FailedToDeleteWorkerPodCollection K8sEventType = "FailedToDeleteWorkerPodCollection" // Redis Cleanup Job event list CreatedRedisCleanupJob K8sEventType = "CreatedRedisCleanupJob" FailedToCreateRedisCleanupJob K8sEventType = "FailedToCreateRedisCleanupJob" // RayJob event list InvalidRayJobSpec K8sEventType = "InvalidRayJobSpec" InvalidRayJobMetadata K8sEventType = "InvalidRayJobMetadata" InvalidRayJobStatus K8sEventType = "InvalidRayJobStatus" CreatedRayJobSubmitter K8sEventType = "CreatedRayJobSubmitter" DeletedRayJobSubmitter K8sEventType = "DeletedRayJobSubmitter" FailedToCreateRayJobSubmitter K8sEventType = "FailedToCreateRayJobSubmitter" FailedToDeleteRayJobSubmitter K8sEventType = "FailedToDeleteRayJobSubmitter" CreatedRayCluster K8sEventType = "CreatedRayCluster" UpdatedRayCluster K8sEventType = "UpdatedRayCluster" DeletedRayCluster K8sEventType = "DeletedRayCluster" FailedToCreateRayCluster K8sEventType = "FailedToCreateRayCluster" FailedToDeleteRayCluster K8sEventType = "FailedToDeleteRayCluster" FailedToUpdateRayCluster K8sEventType = "FailedToUpdateRayCluster" RayClusterNotFound K8sEventType = "RayClusterNotFound" // RayService event list CreatedGateway K8sEventType = "CreatedGateway" CreatedHTTPRoute K8sEventType = "CreatedHTTPRoute" 
InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec" InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata" RayServiceInitializingTimeout K8sEventType = "RayServiceInitializingTimeout" UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel" UpdatedGateway K8sEventType = "UpdatedGateway" UpdatedHTTPRoute K8sEventType = "UpdatedHTTPRoute" UpdatedServeApplications K8sEventType = "UpdatedServeApplications" UpdatedServeTargetCapacity K8sEventType = "UpdatedServeTargetCapacity" FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel" FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications" FailedToUpdateTargetCapacity K8sEventType = "FailedToUpdateTargetCapacity" FailedToCreateGateway K8sEventType = "FailedToCreateGateway" FailedToUpdateGateway K8sEventType = "FailedToUpdateGateway" FailedToCreateHTTPRoute K8sEventType = "FailedToCreateHTTPRoute" FailedToUpdateHTTPRoute K8sEventType = "FailedToUpdateHTTPRoute" // Generic Pod event list DeletedPod K8sEventType = "DeletedPod" FailedToDeletePod K8sEventType = "FailedToDeletePod" FailedToDeletePodCollection K8sEventType = "FailedToDeletePodCollection" // Ingress event list CreatedIngress K8sEventType = "CreatedIngress" FailedToCreateIngress K8sEventType = "FailedToCreateIngress" // Route event list CreatedRoute K8sEventType = "CreatedRoute" FailedToCreateRoute K8sEventType = "FailedToCreateRoute" // Service event list CreatedService K8sEventType = "CreatedService" UpdatedService K8sEventType = "UpdatedService" FailedToCreateService K8sEventType = "FailedToCreateService" FailedToUpdateService K8sEventType = "FailedToUpdateService" // ServiceAccount event list CreatedServiceAccount K8sEventType = "CreatedServiceAccount" FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount" AutoscalerServiceAccountNotFound K8sEventType = "AutoscalerServiceAccountNotFound" // Role event list CreatedRole K8sEventType = "CreatedRole" 
FailedToCreateRole K8sEventType = "FailedToCreateRole" // RoleBinding list CreatedRoleBinding K8sEventType = "CreatedRoleBinding" FailedToCreateRoleBinding K8sEventType = "FailedToCreateRoleBinding" )
type RayHttpProxyClient ¶
type RayHttpProxyClient struct {
// contains filtered or unexported fields
}
func (*RayHttpProxyClient) CheckProxyActorHealth ¶ added in v1.2.0
func (r *RayHttpProxyClient) CheckProxyActorHealth(ctx context.Context) error
CheckProxyActorHealth checks the health status of the Ray Serve proxy actor.
func (*RayHttpProxyClient) InitClient ¶
func (r *RayHttpProxyClient) InitClient()
type ServiceType ¶ added in v1.1.0
type ServiceType string
const ( HeadService ServiceType = "headService" ServingService ServiceType = "serveService" )