Documentation
¶
Index ¶
Constants ¶
View Source
// Annotation keys recognised on jobs.
const (
	// GangIdAnnotation identifies a gang: jobs with an equal value for this annotation make up a gang.
	// All jobs in a gang are guaranteed to be scheduled onto the same cluster at the same time.
	GangIdAnnotation = "armadaproject.io/gangId"
	// GangCardinalityAnnotation gives the total number of jobs in the gang.
	// All jobs in a gang must specify it.
	// The cardinality should be expressed as a positive integer, e.g., "3".
	GangCardinalityAnnotation = "armadaproject.io/gangCardinality"
	// GangMinimumCardinalityAnnotation gives the minimum size for the gang to be schedulable.
	// All jobs in a gang must specify it.
	// The cardinality should be expressed as a positive integer, e.g., "3".
	GangMinimumCardinalityAnnotation = "armadaproject.io/gangMinimumCardinality"
	// GangNodeUniformityLabelAnnotation optionally constrains the jobs that make up a gang
	// to be scheduled across a set of uniform nodes.
	// Specifically, if provided, all gang jobs are scheduled onto nodes for which the value
	// of the provided label is equal.
	// Used to ensure, e.g., that all gang jobs are scheduled onto the same cluster or rack.
	GangNodeUniformityLabelAnnotation = "armadaproject.io/gangNodeUniformityLabel"
	// FailFastAnnotation disables retries for a pod that fails to start.
	// Armada normally tries to re-schedule jobs for which a pod fails to start.
	// Pods for which this annotation has value "true" are not retried.
	// Instead, the job the pod is part of fails immediately.
	FailFastAnnotation = "armadaproject.io/failFast"
)
View Source
// Error messages describing invalid well-known node type and priority class configuration.
// NOTE(review): presumably reported by SchedulingConfigValidation; confirm against its implementation.
const (
	// DuplicateWellKnownNodeTypeErrorMessage is the message for a repeated well-known node type name.
	DuplicateWellKnownNodeTypeErrorMessage = "duplicate well-known node type name"
	// AwayNodeTypesWithoutPreemptionErrorMessage is the message for a priority class that
	// has away node types but is not preemptible.
	AwayNodeTypesWithoutPreemptionErrorMessage = "priority class has away node types but is not preemptible"
	// UnknownWellKnownNodeTypeErrorMessage is the message for a priority class that refers
	// to a well-known node type that is not defined.
	UnknownWellKnownNodeTypeErrorMessage = "priority class refers to unknown well-known node type"
)
View Source
// RuntimeGangCardinality is the string key "runtime_gang_cardinality".
// NOTE(review): presumably names the gang cardinality observed at runtime; confirm where it is consumed.
const RuntimeGangCardinality = "runtime_gang_cardinality"
Variables ¶
View Source
// ReturnLeaseRequestTrackedAnnotations is a set of annotation keys, stored as a map
// with empty-struct values. It contains exactly one key, FailFastAnnotation.
// NOTE(review): presumably the annotations tracked on return-lease requests; confirm against callers.
var ReturnLeaseRequestTrackedAnnotations = map[string]struct{}{
	FailFastAnnotation: struct{}{},
}
Functions ¶
func SchedulingConfigValidation ¶ added in v0.4.8
func SchedulingConfigValidation(sl validator.StructLevel)
Types ¶
type ArmadaConfig ¶
// ArmadaConfig is a top-level configuration struct. It bundles authentication,
// listening ports, gRPC, Redis, Pulsar, Postgres, scheduling, and query API settings.
type ArmadaConfig struct {
	// Authentication settings.
	Auth authconfig.AuthConfig

	// Port numbers.
	// NOTE(review): presumably the gRPC, HTTP and metrics listeners respectively; confirm.
	GrpcPort, HttpPort, MetricsPort uint16
	// If non-nil, net/http/pprof endpoints are exposed on localhost on this port.
	PprofPort *uint16

	CorsAllowedOrigins []string
	GrpcGatewayPath    string
	Grpc               grpcconfig.GrpcConfig

	SchedulerApiConnection client.ApiConnectionDetails
	CancelJobsBatchSize    int

	// Redis connection options; EventsApiRedis is a second, separately configured set.
	Redis, EventsApiRedis redis.UniversalOptions

	Scheduling SchedulingConfig
	Pulsar     PulsarConfig
	// Used for Pulsar submit API deduplication.
	Postgres PostgresConfig
	QueryApi QueryApiConfig
}
type FailureEstimatorConfig ¶ added in v0.4.20
// FailureEstimatorConfig contains config controlling node and queue success probability estimation.
// See the internal/scheduler/failureestimator package for details.
//
// Every setting other than Disabled comes as a node/queue pair; within each pair
// the node field is declared first.
type FailureEstimatorConfig struct {
	// NOTE(review): presumably switches the estimator off when true; confirm.
	Disabled bool

	// Success-probability cordon thresholds for nodes and queues.
	NodeSuccessProbabilityCordonThreshold, QueueSuccessProbabilityCordonThreshold float64

	// Cordon timeouts for nodes and queues.
	NodeCordonTimeout, QueueCordonTimeout time.Duration

	// Equilibrium failure rates for nodes and queues.
	NodeEquilibriumFailureRate, QueueEquilibriumFailureRate float64
}
FailureEstimatorConfig contains config controlling node and queue success probability estimation. See the internal/scheduler/failureestimator package for details.
type IndexedResource ¶ added in v0.3.71
type PostgresConfig ¶
TODO: PostgresConfig can probably just be defined as map[string]string.
type PreemptionConfig ¶
// PreemptionConfig holds eviction probabilities, node-selection settings,
// and the priority class definitions.
type PreemptionConfig struct {
// If using PreemptToFairShare,
// the probability of evicting jobs on a node to balance resource usage.
NodeEvictionProbability float64
// If using PreemptToFairShare,
// the probability of evicting jobs on oversubscribed nodes, i.e.,
// nodes on which the total resource requests are greater than the available resources.
NodeOversubscriptionEvictionProbability float64
// NOTE(review): undocumented in the original. Presumably the fraction of a queue's
// fair share that is protected from preemption; confirm.
ProtectedFractionOfFairShare float64
// If true, the Armada scheduler will add to scheduled pods a node selector
// NodeIdLabel: <value of label on node selected by scheduler>.
// If true, NodeIdLabel must be non-empty.
SetNodeIdSelector bool
// Label used with SetNodeIdSelector. Must be non-empty if SetNodeIdSelector is true.
// NOTE(review): the `validate:"required"` tag is unconditional, so it appears to reject
// an empty label even when SetNodeIdSelector is false; confirm this is intended.
NodeIdLabel string `validate:"required"`
// If true, the Armada scheduler will set the node name of the selected node directly on scheduled pods,
// thus bypassing kube-scheduler entirely.
SetNodeName bool
// Map from priority class names to priority classes.
// Must be consistent with Kubernetes priority classes.
// I.e., priority classes defined here must be defined in all executor clusters and should map to the same priority.
PriorityClasses map[string]types.PriorityClass `validate:"dive"`
// Priority class assigned to pods that do not specify one.
// Must be an entry in PriorityClasses above.
DefaultPriorityClass string
// If set, override the priority class name of pods with this value when sending to an executor.
PriorityClassNameOverride *string
}
TODO: Remove PreemptionConfig. Move PriorityClasses and DefaultPriorityClass into SchedulingConfig.
type PulsarConfig ¶
// PulsarConfig holds the Pulsar client settings: connection URL, TLS, authentication,
// topic and subscription names, compression, deduplication, and consumer tuning.
type PulsarConfig struct {
// Pulsar URL
URL string `validate:"required"`
// Path to the trusted TLS certificate file (must exist)
TLSTrustCertsFilePath string
// Whether the Pulsar client accepts untrusted TLS certificates from the broker
TLSAllowInsecureConnection bool
// Whether the Pulsar client validates that the hostname in the broker's TLS certificate matches the actual hostname.
TLSValidateHostname bool
// Max number of connections to a single broker that will be kept in the pool. (Default: 1 connection)
MaxConnectionsPerBroker int
// Whether Pulsar authentication is enabled
AuthenticationEnabled bool
// Authentication type. For now only "JWT" auth is valid
AuthenticationType string
// Path to the JWT token (must exist). This must be set if AuthenticationType is "JWT"
JwtTokenPath string
// NOTE(review): undocumented in the original. Presumably the Pulsar topic carrying jobset events; confirm.
JobsetEventsTopic string
// NOTE(review): undocumented in the original. Presumably the subscription name used when
// copying events from Pulsar to Redis; confirm.
RedisFromPulsarSubscription string
// Compression to use. Valid values are "None", "LZ4", "Zlib", "Zstd". Default is "None"
CompressionType pulsar.CompressionType
// Compression Level to use. Valid values are "Default", "Better", "Faster". Default is "Default"
CompressionLevel pulsar.CompressionLevel
// Settings for deduplication, which relies on a postgres server.
DedupTable string
// Log all pulsar events
// NOTE(review): this comment sits above the subscription name, but presumably describes the
// EventsPrinter switch below, with EventsPrinterSubscription naming the subscription it uses; confirm.
EventsPrinterSubscription string
EventsPrinter bool
// Maximum allowed message size in bytes
MaxAllowedMessageSize uint
// Timeout when polling pulsar for messages
ReceiveTimeout time.Duration
// Backoff from polling when Pulsar returns an error
BackoffTime time.Duration
// Number of pulsar messages that will be queued by the pulsar consumer.
ReceiverQueueSize int
}
type QueryApiConfig ¶ added in v0.4.20
// QueryApiConfig holds the query API settings: an enable flag and a Postgres configuration.
type QueryApiConfig struct {
// NOTE(review): presumably turns the query API on when true; confirm.
Enabled bool
// Postgres settings for the query API.
Postgres PostgresConfig
}
type SchedulingConfig ¶
// SchedulingConfig holds the scheduler settings: rate limits, per-round resource limits,
// job defaults (tolerations, deadlines, grace periods), node indexing, well-known node types,
// executor handling, and failure estimation.
type SchedulingConfig struct {
// Set to true to disable scheduling
DisableScheduling bool
// Set to true to enable scheduler assertions. This results in some performance loss.
EnableAssertions bool
// Preemption settings and priority class definitions.
Preemption PreemptionConfig
// Number of jobs to load from the database at a time.
MaxQueueLookback uint
// In each invocation of the scheduler, no more jobs are scheduled once this limit has been exceeded.
// Note that the total scheduled resources may be greater than this limit.
MaximumResourceFractionToSchedule map[string]float64
// Overrides MaximumResourceFractionToSchedule if set for the current pool.
// NOTE(review): the original comment named "MaximalClusterFractionToSchedule", which is not a
// field of this struct; the outer map key is presumably the pool name. Confirm.
MaximumResourceFractionToScheduleByPool map[string]map[string]float64
// The rate at which Armada schedules jobs is rate-limited using a token bucket approach.
// Specifically, there is a token bucket that persists between scheduling rounds.
// The bucket fills up at a rate of MaximumSchedulingRate tokens per second and has capacity MaximumSchedulingBurst.
// A token is removed from the bucket when scheduling a job, and scheduling stops while the bucket is empty.
//
// Hence, MaximumSchedulingRate controls the maximum number of jobs scheduled per second in steady-state,
// i.e., once the burst capacity has been exhausted.
//
// Rate-limiting is based on the number of tokens available at the start of each scheduling round,
// i.e., tokens accumulated while scheduling become available at the start of the next scheduling round.
//
// For more information about the rate-limiter, see:
// https://pkg.go.dev/golang.org/x/time/rate#Limiter
MaximumSchedulingRate float64 `validate:"gt=0"`
// MaximumSchedulingBurst controls the burst capacity of the rate-limiter.
//
// There are two important implications:
// - Armada will never schedule more than MaximumSchedulingBurst jobs per scheduling round.
// - Gang jobs with cardinality greater than MaximumSchedulingBurst can never be scheduled.
MaximumSchedulingBurst int `validate:"gt=0"`
// In addition to the global rate-limiter, there is a separate rate-limiter for each queue.
// These work the same as the global rate-limiter, except they apply only to jobs scheduled from a specific queue.
//
// Per-queue version of MaximumSchedulingRate.
MaximumPerQueueSchedulingRate float64 `validate:"gt=0"`
// Per-queue version of MaximumSchedulingBurst.
MaximumPerQueueSchedulingBurst int `validate:"gt=0"`
// Armada stores contexts associated with recent job scheduling attempts.
// This setting limits the number of such contexts to store.
// Contexts associated with the most recent scheduling attempt for each queue and cluster are always stored.
MaxJobSchedulingContextsPerExecutor uint
// NOTE(review): undocumented in the original. Presumably default resource limits applied to jobs; confirm.
DefaultJobLimits armadaresource.ComputeResources
// Set of tolerations added to all submitted pods.
DefaultJobTolerations []v1.Toleration
// Set of tolerations added to all submitted pods of a given priority class.
DefaultJobTolerationsByPriorityClass map[string][]v1.Toleration
// Set of tolerations added to all submitted pods with a given resource request.
DefaultJobTolerationsByResourceRequest map[string][]v1.Toleration
// Maximum number of times a job is retried before considered failed.
MaxRetries uint
// List of resource names, e.g., []string{"cpu", "memory"}, to consider when computing DominantResourceFairness.
DominantResourceFairnessResourcesToConsider []string
// NOTE(review): undocumented in the original. Presumably the maximum accepted pod spec size in bytes; confirm.
MaxPodSpecSizeBytes uint
// NOTE(review): undocumented in the original. Presumably the minimum resources a job must request; confirm.
MinJobResources v1.ResourceList
// Once a node has been found on which a pod can be scheduled,
// the scheduler will consider up to the next MaxExtraNodesToConsider nodes.
// The scheduler selects the node with the best score out of the considered nodes.
// In particular, the score expresses whether preemption is necessary to schedule a pod.
// Hence, a larger MaxExtraNodesToConsider would reduce the expected number of preemptions.
MaxExtraNodesToConsider uint
// Resources, e.g., "cpu", "memory", and "nvidia.com/gpu",
// for which the scheduler creates indexes for efficient lookup.
// Applies only to the new scheduler.
IndexedResources []IndexedResource
// Node labels for which the scheduler creates indexes for efficient lookup.
// Should include node labels frequently used for scheduling,
// since the scheduler can efficiently rule out nodes for which these labels
// are not set correctly when looking for a node a pod can be scheduled on.
//
// If not set, no labels are indexed.
//
// Applies only to the new scheduler.
IndexedNodeLabels []string
// Taint keys for which the scheduler creates indexes for efficient lookup.
// Should include taints frequently used for scheduling,
// since the scheduler can efficiently rule out nodes for which these taints
// are not set correctly when looking for a node a pod can be scheduled on.
//
// If not set, all taints are indexed.
//
// Applies only to the new scheduler.
IndexedTaints []string
// WellKnownNodeTypes defines a set of well-known node types; these are used
// to define "home" and "away" nodes for a given priority class.
WellKnownNodeTypes []WellKnownNodeType `validate:"dive"`
// Default value of GangNodeUniformityLabelAnnotation if none is provided.
DefaultGangNodeUniformityLabel string
// Kubernetes pods may specify a termination grace period.
// When Pods are cancelled/preempted etc., they are first sent a SIGTERM.
// If a pod has not exited within its termination grace period,
// it is killed forcefully by Kubernetes sending it a SIGKILL.
//
// This is the minimum allowed termination grace period.
// It should normally be set to a positive value, e.g., 1 second,
// since a zero grace period causes Kubernetes to force delete pods,
// which may cause issues where resources associated with the pod, e.g.,
// containers, are not cleaned up correctly.
//
// The grace period of pods that either
// - do not set a grace period, or
// - explicitly set a grace period of 0 seconds,
// is automatically set to MinTerminationGracePeriod.
MinTerminationGracePeriod time.Duration
// Max allowed grace period.
// Should normally not be set greater than single-digit minutes,
// since cancellation and preemption may need to wait for this amount of time.
MaxTerminationGracePeriod time.Duration
// If an executor hasn't heartbeated in this time period, it will be considered stale
ExecutorTimeout time.Duration
// Default activeDeadline for all pods that don't explicitly set activeDeadlineSeconds.
// Is trumped by DefaultActiveDeadlineByResourceRequest.
DefaultActiveDeadline time.Duration
// Default activeDeadline for pods with at least one container requesting a given resource.
// For example, if
// DefaultActiveDeadlineByResourceRequest: map[string]time.Duration{"gpu": time.Second},
// then all pods that request a non-zero amount of gpu and don't explicitly set activeDeadlineSeconds
// will have activeDeadlineSeconds set to 1. Trumps DefaultActiveDeadline.
DefaultActiveDeadlineByResourceRequest map[string]time.Duration
// Maximum number of jobs that can be assigned to an executor but not yet acknowledged, before
// the executor is excluded from consideration by the scheduler.
MaxUnacknowledgedJobsPerExecutor uint
// If true, jobs with requirements known to be impossible to meet are not skipped during scheduling.
AlwaysAttemptScheduling bool
// The frequency at which the scheduler updates the cluster state.
ExecutorUpdateFrequency time.Duration
// Controls node and queue success probability estimation.
FailureEstimatorConfig FailureEstimatorConfig
}
type WellKnownNodeType ¶ added in v0.4.8
// A WellKnownNodeType defines a set of nodes, identified by name and characterized by taints.
type WellKnownNodeType struct {
// Name is the unique identifier for this node type.
Name string `validate:"required"`
// Taints is the set of taints that characterizes this node type; a node is
// part of this node type if and only if it has all of these taints.
Taints []v1.Taint
}
A WellKnownNodeType defines a set of nodes; see AwayNodeType.
Click to show internal directories.
Click to hide internal directories.