Documentation
¶
Index ¶
- Constants
- Variables
- func AllResCompressible(res []string) bool
- func GetDeviceNameFromMetric(metric string) (dev, devMetric, originalMetric string)
- func InitHealthCheckConfigFunc(nodeMetrics *MetricsNodeConfig, predictReserved *Resource) func(string) (*HealthCheckConfig, error)
- func InitPredictConfig(config *PredictConfig)
- func OfflineOnYarn(config *TaskTypeConfig) bool
- type ActionConfig
- type AggregationsConfig
- type AlarmChannel
- type AlarmConfig
- type CPIManagerConfig
- type CaelusConfig
- type CheckPointConfig
- type ComponentConfig
- type CpuIsolateConfig
- type CpuQuotaConfig
- type CpuSetConfig
- type CustomMetric
- type DetectActionConfig
- type DetectConfig
- type Devices
- type DiskQuotaConfig
- type DiskQuotaSize
- type EWMAArgs
- type ExpressionArgs
- type HealthCheckConfig
- type K8sConfig
- type LocalAlarm
- type LocalPredictConfig
- type MemoryNotifyConfig
- type MemoryPressureNotifyConfig
- type MemoryUsageNotifyConfig
- type MetricKind
- type MetricsCollectConfig
- type MetricsContainerConfig
- type MetricsNodeConfig
- type MetricsPerfConfig
- type MetricsPrometheus
- type MetricsRdtConfig
- type MetricsSource
- type NodeResourceConfig
- type NotifyConfig
- type OfflineJobs
- type OnlineConfig
- type OnlineJobConfig
- type OnlineMetrics
- type OverCommit
- type PathInfo
- type PidToCgroup
- type PredictConfig
- type PrometheusData
- type RangeResource
- type RangeState
- type RemoteAlarm
- type Resource
- type ResourceIsolateConfig
- type ResourceUpdateEvent
- type RoundOffResource
- type RuleCheck
- type RuleCheckConfig
- type SharedInfo
- type SilenceConfig
- type TaskTypeConfig
- type TimeRangeOverCommit
- type VolumeType
- type YarnDisksConfig
- type YarnNodeResourceConfig
Constants ¶
// Detection kinds and expression modes used by the detector configuration.
const (
	// ExpresstionAutoDetect is the "auto" expression mode. The identifier is
	// misspelled; it is kept unchanged so existing callers keep compiling.
	ExpresstionAutoDetect = "auto"
	// ExpressionAutoDetect is the correctly spelled alias of ExpresstionAutoDetect.
	ExpressionAutoDetect = ExpresstionAutoDetect

	// DetectionExpression names the expression based detector.
	DetectionExpression = "expression"
	// DetectionEWMA names the EWMA based detector.
	DetectionEWMA = "ewma"
	// DetectionUnion names the union detector.
	DetectionUnion = "union"
)
// Predictor types and the node resource type label used for online prediction.
const (
	// LocalPredictorType is the local predictor
	LocalPredictorType = "local"
	// VPAPredictorType is the remote VPA predictor
	VPAPredictorType = "vpa"

	// NodeResourceTypeOnlinePredict is the "online_predict" node resource type.
	NodeResourceTypeOnlinePredict = "online_predict"
)
// Task types, alarm types, cpu manage policies, unit conversion factors and
// well-known cgroup paths shared across the package.
const (
	// Task types for online and offline jobs.
	OnlineTypeOnK8s      = "k8s"
	OnlineTypeOnLocal    = "local"
	OfflineTypeOnk8s     = "k8s"
	OfflineTypeYarnOnk8s = "yarn_on_k8s"

	// Alarm types.
	AlarmTypeLocal  = "local"
	AlarmTypeRemote = "remote"

	// CpuManagePolicyBT is just for tencent OS
	CpuManagePolicyBT       = "bt"
	CpuManagePolicySet      = "cpuset"
	CpuManagePolicyQuota    = "quota"
	CpuManagePolicyAdaptive = "adaptive"

	// MemUnit translates Mb to byte
	MemUnit = int64(1024 * 1024)
	// MemGbUnit translates Gb to byte
	MemGbUnit = int64(1024 * 1024 * 1024)
	// CpuUnit translates milli core
	CpuUnit = int64(1000)
	// DiskUnit translates Gi to byte. The factor must be 1024^3; the previous
	// value 1024 * 1020 * 1024 under-counted every Gi by roughly 0.4%.
	DiskUnit = int64(1024 * 1024 * 1024)

	// PodAnnotationPrefix is the fixed prefix of pod annotations.
	PodAnnotationPrefix = "mixer.kubernetes.io/"

	// RootFS is the root directory in container.
	RootFS = "/rootfs"

	// Cgroup paths.
	CgroupKubePods      = "/kubepods"
	CgroupOffline       = "/kubepods/offline"
	CgroupOfflineSystem = CgroupOffline + "/system"

	// SystemComponentOomScoreAdj is the oom score adj value "500", kept as a string.
	SystemComponentOomScoreAdj = "500"

	CgroupYarn = "hadoop-yarn"
	// CgroupNonK8sOnline is the cgroup for online jobs, which are not running on k8s, we need to create the cgroup
	// and children cgroup manually.
	CgroupNonK8sOnline = "/onlinejobs"
)
// ContainerRuntimeDocker is the container runtime name for docker.
const ContainerRuntimeDocker = "docker"
Variables ¶
// AvailablePredictType is the set {LocalPredictorType, VPAPredictorType};
// AvailableLocalPredictType is the set {LocalPredictorType}. Both are built with sets.NewString.
var ( AvailablePredictType = sets.NewString(LocalPredictorType, VPAPredictorType) AvailableLocalPredictType = sets.NewString(LocalPredictorType) )
// String sets of the accepted values for online/offline task types, alarm types and
// cpu manage policies, plus CompressibleRes, which contains only v1.ResourceCPU.
var ( // AvailableOnlineTaskType describe available online tasks, which may be pod or local process AvailableOnlineTaskType = sets.NewString(OnlineTypeOnK8s, OnlineTypeOnLocal) // AvailableOfflineTaskType describe available offline tasks, which may be pod or yarn job AvailableOfflineTaskType = sets.NewString(OfflineTypeOnk8s, OfflineTypeYarnOnk8s) // AvailableAlarmType shows available alarm type AvailableAlarmType = sets.NewString(AlarmTypeLocal, AlarmTypeRemote) // AvailableCpuManagePolicy shows available cpu manage policy AvailableCpuManagePolicy = sets.NewString(CpuManagePolicyBT, CpuManagePolicySet, CpuManagePolicyQuota, CpuManagePolicyAdaptive) CompressibleRes = sets.NewString(string(v1.ResourceCPU)) )
Functions ¶
func AllResCompressible ¶
AllResCompressible checks whether all of the given resources are compressible.
func GetDeviceNameFromMetric ¶
GetDeviceNameFromMetric parses the metric name and returns the device name, the device metric name and the original metric.
func InitHealthCheckConfigFunc ¶
func InitHealthCheckConfigFunc(nodeMetrics *MetricsNodeConfig, predictReserved *Resource) func(string) (*HealthCheckConfig, error)
InitHealthCheckConfigFunc returns a function that, given a string, returns the health check config or an error.
func InitPredictConfig ¶
func InitPredictConfig(config *PredictConfig)
InitPredictConfig validates and formats the predict config.
func OfflineOnYarn ¶
func OfflineOnYarn(config *TaskTypeConfig) bool
OfflineOnYarn checks whether offline jobs are running on YARN.
Types ¶
type ActionConfig ¶
// ActionConfig defines the configuration of a single action.
type ActionConfig struct {
// Name is read from the JSON key "name".
Name string `json:"name"`
// ArgsStr keeps the raw, still-encoded JSON found under "args".
ArgsStr json.RawMessage `json:"args"`
// Args is excluded from JSON (tag "-"); presumably the decoded form of ArgsStr — confirm against the config parser.
Args interface{} `json:"-"`
}
ActionConfig define action config
type AggregationsConfig ¶
// AggregationsConfig is used to configure aggregation behaviour of usage samples.
type AggregationsConfig struct {
// MemoryAggregationInterval is the length of a single interval, for
// which the peak memory usage is computed.
// Memory usage peaks are aggregated in multiples of this interval. In other words
// there is one memory usage sample per interval (the maximum usage over that
// interval).
MemoryAggregationInterval times.Duration `json:"memory_aggregation_interval"`
// MemoryAggregationIntervalCount is the number of consecutive MemoryAggregationIntervals
// which make up the memory aggregation window, which in turn is the period for memory
// usage aggregation by VPA.
MemoryAggregationIntervalCount int64 `json:"memory_aggregation_interval_count"`
// MemoryHistogramDecayHalfLife is the amount of time it takes a historical
// memory usage sample to lose half of its weight. In other words, a fresh
// usage sample is twice as 'important' as one with age equal to the half
// life period.
MemoryHistogramDecayHalfLife times.Duration `json:"memory_histogram_decay_half_life"`
// CPUHistogramDecayHalfLife is the amount of time it takes a historical
// CPU usage sample to lose half of its weight.
CPUHistogramDecayHalfLife times.Duration `json:"cpu_histogram_decay_half_life"`
}
AggregationsConfig is used to configure aggregation behaviour.
type AlarmChannel ¶
// AlarmChannel holds the configured alarm channels. Both fields are pointers,
// so a channel missing from the JSON stays nil.
type AlarmChannel struct {
// LocalAlarm is read from the JSON key "local".
LocalAlarm *LocalAlarm `json:"local"`
// RemoteAlarm is read from the JSON key "remote".
RemoteAlarm *RemoteAlarm `json:"remote"`
}
AlarmChannel struct is used to show alarm channel
type AlarmConfig ¶
// AlarmConfig groups the options used to send alarm messages.
type AlarmConfig struct {
Enable bool `json:"enable"`
Cluster string `json:"cluster"`
// MessageBatch and MessageDelay presumably control how messages are batched
// before being sent — confirm against the alarm sender.
MessageBatch int `json:"message_batch"`
MessageDelay times.Duration `json:"message_delay"`
ChannelName string `json:"channel_name"`
IgnoreAlarmWhenSilence bool `json:"ignore_alarm_when_silence"`
// AlarmChannel is embedded but carries an explicit JSON name, so its fields are
// nested under the "alarm_channel" key instead of being flattened.
AlarmChannel `json:"alarm_channel"`
}
AlarmConfig group options to send alarm message
type CPIManagerConfig ¶
// CPIManagerConfig is the configuration for cpi detecting.
type CPIManagerConfig struct {
// Enable turns the feature on; the zero value keeps it disabled by default.
Enable bool `json:"enable"`
WindowDuration times.Duration `json:"window_duration"`
// PrometheusAddrStr is the textual address read from "prometheus_addr".
PrometheusAddrStr string `json:"prometheus_addr"`
// PrometheusAddr is excluded from JSON; presumably parsed from PrometheusAddrStr — confirm against the initializer.
PrometheusAddr url.URL `json:"-"`
MaxJobSpecRange times.Duration `json:"max_job_spec_range"`
}
CPIManagerConfig show the configuration for cpi detecting
type CaelusConfig ¶
// CaelusConfig is the top-level configuration for Caelus; each field maps to
// one section of the JSON config.
type CaelusConfig struct {
K8sConfig K8sConfig `json:"k8s_config"`
CheckPoint CheckPointConfig `json:"check_point"`
TaskType TaskTypeConfig `json:"task_type"`
NodeResource NodeResourceConfig `json:"node_resource"`
// If multiple predicts are given, the first one is used for real prediction. The rest are experiment
// predicts: caelus only feeds samples to them and exposes predict metrics for them.
Predicts []PredictConfig `json:"predicts"`
Metrics MetricsCollectConfig `json:"metrics"`
ResourceIsolate ResourceIsolateConfig `json:"resource_isolate"`
CpiManager CPIManagerConfig `json:"cpi_manager"`
Alarm AlarmConfig `json:"alarm"`
Online OnlineConfig `json:"online"`
DiskQuota DiskQuotaConfig `json:"disk_quota"`
}
CaelusConfig is the configuration for Caelus
func ParseJsonConfig ¶
func ParseJsonConfig(configFile string) (*CaelusConfig, error)
ParseJsonConfig parse json config
type CheckPointConfig ¶
// CheckPointConfig groups the settings related to checkpoints, which save state to a local file.
type CheckPointConfig struct {
// CheckPointDir is read from the JSON key "check_point_dir".
CheckPointDir string `json:"check_point_dir"`
// NodeResourceKey is read from the JSON key "node_resource_key".
NodeResourceKey string `json:"node_resource_key"`
}
CheckPointConfig groups the settings related to checkpoints, which save state to a local file
type ComponentConfig ¶
ComponentConfig is the config to specify a non-containerized component
type CpuIsolateConfig ¶
// CpuIsolateConfig is the configuration for cpu isolation.
type CpuIsolateConfig struct {
// AutoDetect will enable bt feature if supported, and quota as the second choice.
AutoDetect bool `json:"auto_detect"`
// ManagePolicy assigns the cpu manage policy; presumably one of the
// CpuManagePolicy* constants — confirm against the validator.
ManagePolicy string `json:"manage_policy"`
CpuSetConfig CpuSetConfig `json:"cpuset_config"`
CpuQuotaConfig CpuQuotaConfig `json:"cpu_quota_config"`
// KubeletStatic records whether the kubelet cpu manager policy is static.
// It is excluded from JSON, so it has to be set by code.
KubeletStatic bool `json:"-"`
}
CpuIsolateConfig is the configuration for cpu isolation
type CpuQuotaConfig ¶
// CpuQuotaConfig describes configs for the cpu quota isolation policy.
type CpuQuotaConfig struct {
// OfflineShare is a pointer, so a missing "offline_share" key (nil) is
// distinguishable from an explicit zero.
OfflineShare *uint64 `json:"offline_share"`
}
CpuQuotaConfig describe configs for cpu quota isolation policy
type CpuSetConfig ¶
// CpuSetConfig describes configs for the cpuset isolation policy.
type CpuSetConfig struct {
// EnableOnlineIsolate isolates online jobs from offline jobs.
EnableOnlineIsolate bool `json:"enable_online_isolate"`
// ReservedCpus is the cpu list to which offline jobs will not be assigned.
ReservedCpus string `json:"reserved_cpus"`
}
CpuSetConfig describe configs for cpuset isolation policy
type CustomMetric ¶
// CustomMetric defines the custom metric config.
type CustomMetric struct {
// MetricServerAddr is read from the JSON key "metric_server_addr".
MetricServerAddr string `json:"metric_server_addr"`
// CollectInterval is read from the JSON key "collect_interval".
CollectInterval times.Duration `json:"collect_interval"`
}
CustomMetric define custom metric config
type DetectActionConfig ¶
// DetectActionConfig pairs a list of detectors with a list of actions.
type DetectActionConfig struct {
// Detects is read from the JSON key "detects".
Detects []*DetectConfig `json:"detects"`
// Actions is read from the JSON key "actions".
Actions []*ActionConfig `json:"actions"`
}
DetectActionConfig define detectors and actions
type DetectConfig ¶
// DetectConfig defines the configuration of a single detector.
type DetectConfig struct {
// Name is read from the JSON key "name"; presumably one of the Detection*
// constants — confirm against the detector factory.
Name string `json:"name"`
// ArgsStr keeps the raw, still-encoded JSON found under "args".
ArgsStr json.RawMessage `json:"args"`
// Args is excluded from JSON (tag "-"); presumably the decoded form of ArgsStr — confirm against the config parser.
Args interface{} `json:"-"`
}
DetectConfig define detector config
type Devices ¶
// Devices groups network and disk devices.
type Devices struct {
// IfacesWithProperty are the network interfaces, e.g. eth0; those that do not exist or are down will be filtered out.
// These ifaces will be assigned to metrics.node.ifaces.
// NOTE(review): the JSON key is "ifaces_xxx", which looks like a placeholder — confirm the intended key.
IfacesWithProperty []string `json:"ifaces_xxx"`
// Ifaces is excluded from JSON; presumably the filtered interface names — confirm against the initializer.
Ifaces []string `json:"-"`
// DiskNames are the disk names, e.g. sda, vda; those that do not exist will be filtered out.
// These disks will be assigned to metrics.node.deviceNames.
DiskNames []string `json:"disk_names"`
}
Devices group network and disk devices
type DiskQuotaConfig ¶
// DiskQuotaConfig groups disk quota configurations.
type DiskQuotaConfig struct {
Enabled bool `json:"enabled"`
CheckPeriod times.Duration `json:"check_period"`
// ContainerRuntime is the container runtime name, such as docker or containerd.
ContainerRuntime string `json:"container_runtime"`
// VolumeSizes holds the quota size per volume type. It applies to offline jobs only;
// online jobs need to announce their quota in annotations.
VolumeSizes map[VolumeType]*DiskQuotaSize `json:"volume_sizes"`
}
DiskQuotaConfig group disk quota configurations
type DiskQuotaSize ¶
// DiskQuotaSize groups a quota value and an inode value with their used counterparts.
// Units are not visible here — TODO confirm.
type DiskQuotaSize struct {
// Quota and Inodes are read from JSON.
Quota uint64 `json:"quota"`
Inodes uint64 `json:"inodes"`
// QuotaUsed and InodesUsed are excluded from JSON, so they have to be filled in by code.
QuotaUsed uint64 `json:"-"`
InodesUsed uint64 `json:"-"`
}
DiskQuotaSize groups the quota and inode settings of a volume. Open question: shall we support the soft limit feature?
type ExpressionArgs ¶
// ExpressionArgs groups the args used for expression detection.
type ExpressionArgs struct {
// Expression is read from the JSON key "expression".
Expression string `json:"expression"`
// WarningCount and WarningDuration presumably bound how many warnings within
// which duration count as abnormal — confirm against the expression detector.
WarningCount int `json:"warning_count"`
WarningDuration times.Duration `json:"warning_duration"`
}
ExpressionArgs group args used for expression detection
type HealthCheckConfig ¶
// HealthCheckConfig is the config for checking health, such as node load or online job interference.
type HealthCheckConfig struct {
Disable bool `json:"disable"`
RuleNodes []string `json:"rule_nodes"`
RuleCheck RuleCheck `json:"rule_check"`
CgroupNotify NotifyConfig `json:"cgroup_notify"`
// PredictReserved is excluded from JSON; its value is assigned during initialization.
PredictReserved *Resource `json:"-"`
}
HealthCheckConfig is the config for checking health, such as node load or online job interference
type K8sConfig ¶
// K8sConfig holds the kubernetes related config.
type K8sConfig struct {
// KubeletRootDir is read from the JSON key "kubelet_root_dir".
KubeletRootDir string `json:"kubelet_root_dir"`
}
K8sConfig show kubernetes config
type LocalAlarm ¶
// LocalAlarm describes the local alarm body.
type LocalAlarm struct {
// Executor is read from the JSON key "executor"; presumably the program run
// to deliver the alarm — confirm against the alarm sender.
Executor string `json:"executor"`
}
LocalAlarm struct is used to describe local alarm body
type LocalPredictConfig ¶
// LocalPredictConfig groups options for the local predictor.
type LocalPredictConfig struct {
// Minimum CPU recommendation for a pod
PodMinCPUMillicores float64 `json:"pod_min_cpu_millicores"`
// Minimum memory recommendation for a pod
PodMinMemoryMb float64 `json:"pod_min_memory_mb"`
// Fraction of usage added as the safety margin to the recommended request
SafetyMarginFraction float64 `json:"safety_margin_fraction"`
// cpu usage percentile to recommend cpu resource
CPUPercentile float64 `json:"cpu_percentile"`
// memory usage peaks percentile to recommend memory resource
MemoryPeaksPercentile float64 `json:"memory_peaks_percentile"`
// AggregationsConfig is used to configure aggregation behaviour.
// It is embedded without a JSON name, so (assuming encoding/json) its fields
// are read from the same JSON object as the fields of this struct.
AggregationsConfig `json:",inline"`
// Enable tuning of the cpu weight if the cpu usage is anomalous
EnableTuneCPUWeight bool `json:"enable_tune_cpu_weight"`
// AnomalyDetectorMovingWindow defines how long the moving window of anomaly detector should keep
AnomalyDetectorMovingWindow times.Duration `json:"anomaly_detector_moving_window"`
// Weight of the anomaly sample when a cpu usage increasing anomaly is detected.
// Base weight is 100
IncreasingAnomalyWeightFactor int64 `json:"increasing_anomaly_weight_factor"`
// Weight of the anomaly sample when a cpu usage decreasing anomaly is detected.
// Base weight is 100
DecreasingAnomalyWeightFactor int64 `json:"decreasing_anomaly_weight_factor"`
}
LocalPredictConfig group options for local predictor
type MemoryNotifyConfig ¶
// MemoryNotifyConfig describes the memory cgroup notifications to watch.
type MemoryNotifyConfig struct {
// Pressures lists the memory.pressure_level notifications.
Pressures []MemoryPressureNotifyConfig `json:"pressures"`
// Usages lists the memory.usage_in_bytes notifications.
Usages []MemoryUsageNotifyConfig `json:"usages"`
}
MemoryNotifyConfig describe memory cgroup notify
type MemoryPressureNotifyConfig ¶
// MemoryPressureNotifyConfig describes memory.pressure_level notify data.
type MemoryPressureNotifyConfig struct {
Cgroups []string `json:"cgroups"`
PressureLevel string `json:"pressure_level"`
// Duration is the time duration the pressure has kept
Duration times.Duration `json:"duration"`
// Count is the event number within the duration time
Count int `json:"count"`
}
MemoryPressureNotifyConfig describe memory.pressure_level notify data
type MemoryUsageNotifyConfig ¶
// MemoryUsageNotifyConfig describes memory.usage_in_bytes notify data.
type MemoryUsageNotifyConfig struct {
Cgroups []string `json:"cgroups"`
// MarginMb is the distance between limit and threshold
MarginMb int `json:"margin_mb"`
// Duration is when to handle the event after receiving it
Duration times.Duration `json:"duration"`
}
MemoryUsageNotifyConfig describe memory.usage_in_bytes notify data
type MetricKind ¶
// MetricKind represents the kind of metrics that cAdvisor exposes.
type MetricKind string
MetricKind represent the kind of metrics that cAdvisor exposes.
type MetricsCollectConfig ¶
// MetricsCollectConfig is the configuration for metrics collection, with one
// section per metrics source.
type MetricsCollectConfig struct {
Node MetricsNodeConfig `json:"node"`
Container MetricsContainerConfig `json:"container"`
Perf MetricsPerfConfig `json:"perf"`
Rdt MetricsRdtConfig `json:"rdt"`
Prometheus MetricsPrometheus `json:"prometheus"`
}
MetricsCollectConfig is the configuration for metrics collection
type MetricsContainerConfig ¶
// MetricsContainerConfig is the configuration for container metrics collection.
type MetricsContainerConfig struct {
// Resources and Cgroups presumably select which resources and cgroups are
// collected — confirm against the collector.
Resources []string `json:"resources"`
Cgroups []string `json:"cgroups"`
CollectInterval times.Duration `json:"collect_interval"`
MaxHousekeepingInterval times.Duration `json:"max_housekeeping_interval"`
}
MetricsContainerConfig is the configuration for container metrics collection
type MetricsNodeConfig ¶
// MetricsNodeConfig is the configuration for node metrics collection.
type MetricsNodeConfig struct {
CollectInterval times.Duration `json:"collect_interval"`
SystemProcesses []string `json:"system_processes"`
// OfflineType is excluded from JSON, so it has to be set by code.
OfflineType string `json:"-"`
// Devices is embedded without a JSON name, so (assuming encoding/json) its
// fields are read from the same JSON object as the fields of this struct.
Devices `json:",inline"`
}
MetricsNodeConfig is the configuration for node metrics collection
type MetricsPerfConfig ¶
// MetricsPerfConfig is the configuration for perf metrics collection.
type MetricsPerfConfig struct {
// Disable is the off switch; the zero value leaves perf collection enabled.
Disable bool `json:"disable"`
CollectInterval times.Duration `json:"collect_interval"`
CollectDuration times.Duration `json:"collect_duration"`
IgnoredCgroups []string `json:"ignored_cgroups"`
}
MetricsPerfConfig is the configuration for perf metrics collection
type MetricsPrometheus ¶
// MetricsPrometheus describes how to collect prometheus metrics.
type MetricsPrometheus struct {
CollectInterval times.Duration `json:"collect_interval"`
// DisableShow controls whether these metrics are shown with the prefix "caelus_";
// presumably true hides them — confirm against the exporter.
DisableShow bool `json:"disable_show"`
Items []*PrometheusData `json:"items"`
}
MetricsPrometheus describe how to collect prometheus metrics
type MetricsRdtConfig ¶
// MetricsRdtConfig is the configuration for RDT metrics collection.
type MetricsRdtConfig struct {
// Disable is the off switch; the zero value leaves RDT collection enabled.
Disable bool `json:"disable"`
RdtCommand string `json:"rdt_command"`
CollectInterval times.Duration `json:"collect_interval"`
CollectDuration times.Duration `json:"collect_duration"`
ExecuteInterval times.Duration `json:"execute_interval"`
}
MetricsRdtConfig is the configuration for RDT metrics collection
type MetricsSource ¶
// MetricsSource defines the metrics source of online services, either a
// command or a URL.
type MetricsSource struct {
CheckInterval times.Duration `json:"check_interval"`
// MetricsCommand is a command to get the job's current metrics value. It must return data in this format:
// {"code":0,"msg":"success","data":[{"job_name":"","metric_name":"","key1":xx,"key2":xx,...}]}
MetricsCommand []string `json:"metrics_command"`
// CmdNeedChroot tells whether chroot is needed when executing the metrics command.
// It is a pointer, so a missing key (nil) is distinguishable from false.
CmdNeedChroot *bool `json:"cmd_need_chroot"`
// MetricsURL is a url to get the job's metrics value. It must return data in this format:
// <slo>,<metrics>.
MetricsURL string `json:"metrics_url"`
}
MetricsSource define metrics source of online services
type NodeResourceConfig ¶
// NodeResourceConfig groups the node resource configuration.
type NodeResourceConfig struct {
Disable bool `json:"disable"`
UpdateInterval times.Duration `json:"update_interval"`
// OfflineType is excluded from JSON, so it has to be set by code.
OfflineType string `json:"-"`
// DisableKillIfNormal does not kill pod when no resource in conflicting status
DisableKillIfNormal bool `json:"disable_kill_if_normal"`
// OnlyKillIfIncompressibleRes presumably restricts killing to conflicts on
// incompressible resources — confirm against the conflict handler.
OnlyKillIfIncompressibleRes bool `json:"only_kill_if_incompressible_res"`
YarnConfig YarnNodeResourceConfig `json:"yarn_config"`
Silence SilenceConfig `json:"silence"`
}
NodeResourceConfig group configuration for node
type NotifyConfig ¶
// NotifyConfig configures resource monitoring by kernel notify.
type NotifyConfig struct {
// MemoryCgroup is a pointer, so it stays nil when "memory_cgroup" is absent.
MemoryCgroup *MemoryNotifyConfig `json:"memory_cgroup"`
}
NotifyConfig monitor resource by kernel notify
type OfflineJobs ¶
// OfflineJobs describes offline job features, such as resource and state.
// The fields carry no JSON tags.
type OfflineJobs struct {
// Metadata is untyped; its concrete type is not visible here — TODO confirm.
Metadata interface{}
// Request and Used are resource lists.
Request v1.ResourceList
Used v1.ResourceList
State string
}
OfflineJobs describe offline job features, such as resource and state
type OnlineConfig ¶
// OnlineConfig is the online job configuration.
type OnlineConfig struct {
Enable bool `json:"enable"`
PidToCgroup PidToCgroup `json:"pid_to_cgroup"`
Jobs []OnlineJobConfig `json:"jobs"`
CustomMetric CustomMetric `json:"custom_metric"`
}
OnlineConfig show online job configuration
type OnlineJobConfig ¶
// OnlineJobConfig is the configuration of an online job.
type OnlineJobConfig struct {
Name string `json:"name"`
// Command is the job's command expression
Command string `json:"command"`
Metrics []OnlineMetrics `json:"metrics"`
}
OnlineJobConfig is the configuration of an online job
type OnlineMetrics ¶
// OnlineMetrics defines the metric config of online services: a metric name
// and the source it is fetched from.
type OnlineMetrics struct {
Name string `json:"name"`
Source MetricsSource `json:"source"`
}
OnlineMetrics define metric config of online services
type OverCommit ¶
// OverCommit sets the overcommit percent for a resource.
type OverCommit struct {
Enable bool `json:"enable"`
// OverCommitPercent is the general percent; Periods carries per-time-range percents.
// Presumably a matching period overrides the general value — confirm against the consumer.
OverCommitPercent float64 `json:"over_commit_percent"`
Periods []TimeRangeOverCommit `json:"periods"`
}
OverCommit set overcommit percent for resource
type PathInfo ¶
// PathInfo groups a path with its quota options. The fields carry no JSON tags.
type PathInfo struct {
Path string
Size *DiskQuotaSize
// SharedInfo is nil when the path is not shared (presumably; the original note only says "if not, SharedInfo is nil").
SharedInfo *SharedInfo
}
PathInfo group path and quota options
type PidToCgroup ¶
// PidToCgroup defines the online config of the pid check.
type PidToCgroup struct {
// PidCheckInterval could be zero. Note that its JSON key is "pids_check_interval" (plural).
PidCheckInterval times.Duration `json:"pids_check_interval"`
CgroupCheckInterval times.Duration `json:"cgroup_check_interval"`
BatchNum int `json:"batch_num"`
}
PidToCgroup define online config of pid check
type PredictConfig ¶
// PredictConfig groups options for a predictor.
type PredictConfig struct {
Disable bool `json:"disable"`
CheckInterval times.Duration `json:"check_interval"`
// PredictType selects the predictor.
// NOTE(review): the original comment listed [local, localv2, vpa], but AvailablePredictType
// only contains "local" and "vpa" — confirm whether "localv2" is still accepted.
PredictType string `json:"predict_type"`
PredictServerAddr string `json:"predict_server_addr"`
ReserveResource Resource `json:"reserve_resource"`
// PrintInterval is the time interval to print the detailed predict log for debugging
PrintInterval times.Duration `json:"print_interval"`
// LocalPredictConfig is the configuration for local predictor.
// It is embedded without a JSON name, so (assuming encoding/json) its fields
// are read from the same JSON object as the fields of this struct.
LocalPredictConfig `json:",inline"`
// The type value of online predict metrics caelus_node_resource{type=""}
// It's used by experiment predict
PredictMetricsType string `json:"predict_metrics_type"`
}
PredictConfig group options for predictor
type PrometheusData ¶
// PrometheusData describes which metrics to collect or not collect from one address.
type PrometheusData struct {
Address string `json:"address"`
Collect []string `json:"collect"`
NoCollect []string `json:"no_collect"`
// CollectMap and NoCollectMap are excluded from JSON; presumably set forms of
// Collect and NoCollect built at initialization — confirm against the initializer.
CollectMap sets.String `json:"-"`
NoCollectMap sets.String `json:"-"`
}
PrometheusData describe which metrics to collect or not collect
type RangeResource ¶
// RangeResource holds one RangeState per resource, for cpu (milli cores) and memory (MB),
// used to skip node resource updates when the changed quantity is small.
type RangeResource struct {
CPUMilli RangeState `json:"cpu_milli"`
MemMB RangeState `json:"mem_mb"`
}
RangeResource is used to check whether a resource change is worth applying; there is no need to update the node resource when the changed quantity is small.
type RangeState ¶
// RangeState describes the range used to drop small resource changes.
type RangeState struct {
// Min is the minimum range quantity
Min float64 `json:"min"`
// Max is the maximum range quantity
Max float64 `json:"max"`
// Ratio is used to calculate the change range quantity
Ratio float64 `json:"ratio"`
}
RangeState describe range resource to drop little changing
type RemoteAlarm ¶
// RemoteAlarm describes the remote alarm body.
// Note that its JSON keys are camelCase, unlike the snake_case keys used by
// the other config types in this package.
type RemoteAlarm struct {
RemoteWebhook string `json:"remoteWebhook"`
WeWorkWebhook string `json:"weWorkWebhook"`
}
RemoteAlarm struct is used to describe remote alarm body
type Resource ¶
// Resource is the cpu and memory configuration, given as absolute values
// and/or as percent strings.
type Resource struct {
// CpuMilli and MemMB are pointers, so a missing key (nil) is distinguishable from zero.
CpuMilli *float64 `json:"cpu_milli"`
MemMB *float64 `json:"mem_mb"`
// CpuPercentStr is the textual value read from "cpu_percent"; CpuPercent is excluded
// from JSON and presumably holds the parsed value — confirm against the initializer.
CpuPercentStr string `json:"cpu_percent"`
CpuPercent *float64 `json:"-"`
// MemPercentStr / MemPercent follow the same pattern for "mem_percent".
MemPercentStr string `json:"mem_percent"`
MemPercent *float64 `json:"-"`
}
Resource is the cpu and memory configuration
type ResourceIsolateConfig ¶
// ResourceIsolateConfig is the offline job quota limit configuration for resources.
// Fields tagged "-" are excluded from JSON and have to be set by code.
type ResourceIsolateConfig struct {
Disable bool `json:"disable"`
// ResourceDisable maps a name to a disable flag; presumably keyed by resource name — confirm against the consumer.
ResourceDisable map[string]bool `json:"resource_disable"`
UpdatePeriod times.Duration `json:"update_period"`
// DiskNames are the disks that need an io weight set
DiskNames []string `json:"-"`
// EniIface is the eni iface for eni network pods
EniIface string `json:"-"`
// Iface is the normal iface for host network and global route network pods
Iface string `json:"-"`
CpuConfig CpuIsolateConfig `json:"cpu_config"`
OnlineType string `json:"-"`
OfflineType string `json:"-"`
ExternalComponents []ComponentConfig `json:"external_components"`
}
ResourceIsolateConfig is the offline job quota limit configuration for resources
type ResourceUpdateEvent ¶
ResourceUpdateEvent defines the event used when offline resources need to be updated
type RoundOffResource ¶
RoundOffResource is used to round off resource quantities. For example, if the original memory is 1027Mi, rounding off yields 1024Mi, which is a multiple (2 times) of 512Mi.
type RuleCheck ¶
// RuleCheck groups all rules: container rules, node rules and app rules.
type RuleCheck struct {
ContainerRules []*RuleCheckConfig `json:"container_rules"`
NodeRules []*RuleCheckConfig `json:"node_rules"`
AppRules []*RuleCheckConfig `json:"app_rules"`
}
RuleCheck group all rules
type RuleCheckConfig ¶
// RuleCheckConfig defines the config of one rule: the metrics it looks at,
// its timing, and the detect/action pairs used for detection and recovery.
type RuleCheckConfig struct {
Name string `json:"name"`
Metrics []string `json:"metrics"`
// CheckInterval describes the interval to trigger detection
CheckInterval times.Duration `json:"check_interval"`
// HandleInterval describes the interval to handle conflicts after detecting abnormal result
HandleInterval times.Duration `json:"handle_interval"`
// RecoverInterval describes the interval to recover conflicts after detecting normal result
RecoverInterval times.Duration `json:"recover_interval"`
// Rules is read from "rules" and RecoverRules from "recover_rules".
Rules []*DetectActionConfig `json:"rules"`
RecoverRules []*DetectActionConfig `json:"recover_rules"`
}
RuleCheckConfig define the rule config
type SilenceConfig ¶
// SilenceConfig describes the time periods during which offline jobs are not allowed to run.
type SilenceConfig struct {
// Periods is a list of [start, end] pairs of time-of-day values, e.g. [0:00:00, 5:00:00]
Periods [][2]times.SecondsInDay `json:"periods"`
// AheadOfUnSchedule disables scheduling this long before the silence starts
AheadOfUnSchedule times.Duration `json:"ahead_of_unSchedule"`
}
SilenceConfig describes the time periods during which offline jobs are not allowed to run
type TaskTypeConfig ¶
// TaskTypeConfig holds the online and offline task types, such as offline being yarn on k8s.
type TaskTypeConfig struct {
// OnlineType is presumably validated against AvailableOnlineTaskType — confirm against the validator.
OnlineType string `json:"online_type"`
// OfflineType is presumably validated against AvailableOfflineTaskType — confirm against the validator.
OfflineType string `json:"offline_type"`
}
TaskTypeConfig show the online and offline task type, such as offline is yarn on k8s.
type TimeRangeOverCommit ¶
// TimeRangeOverCommit sets the overcommit percent for a resource in a specific time range.
type TimeRangeOverCommit struct {
// Range is a [start, end] pair of time-of-day values.
Range [2]times.SecondsInDay `json:"range"`
OverCommitPercent float64 `json:"over_commit_percent"`
}
TimeRangeOverCommit set overcommit percent for resource in specific time range
type VolumeType ¶
// VolumeType is the string name of a volume kind; it is the key type of DiskQuotaConfig.VolumeSizes.
type VolumeType string
// The known volume types ("rootFs", "emptyDir", "hostPath") and AvailableVolumeTypes,
// the string set built from them with sets.NewString.
var ( VolumeTypeRootFs VolumeType = "rootFs" VolumeTypeEmptyDir VolumeType = "emptyDir" VolumeTypeHostPath VolumeType = "hostPath" AvailableVolumeTypes = sets.NewString( VolumeTypeRootFs.String(), VolumeTypeEmptyDir.String(), VolumeTypeHostPath.String()) )
type YarnDisksConfig ¶
// YarnDisksConfig groups the disks config for yarn.
type YarnDisksConfig struct {
// RatioToCore translates disk space to core numbers
RatioToCore int64 `json:"ratio_to_core"`
MultiDiskDisable bool `json:"multi_disk_disable"`
// DiskMinCapacityGb drops disks with little disk space
DiskMinCapacityGb int64 `json:"disk_min_capacity_gb"`
SpaceCheckEnabled bool `json:"space_check_enabled"`
SpaceCheckPeriod times.Duration `json:"space_check_period"`
// SpaceCheckReservedGb is used for checking disk space, it will start cleaning space if free disk space is less
// than SpaceCheckReservedGb
SpaceCheckReservedGb int64 `json:"space_check_reserved_gb"`
// SpaceCheckReservedPercent is presumably the percent counterpart of
// SpaceCheckReservedGb — confirm against the disk checker.
SpaceCheckReservedPercent float64 `json:"space_check_reserved_percent"`
SpaceCleanDisable bool `json:"space_clean_disable"`
// If SpaceCleanJustData is enabled, it will just restart the nodemanager pod to release /data space, and
// does not care about other disk partitions
SpaceCleanJustData bool `json:"space_clean_just_data"`
// OfflineExitedCleanDelay is used to clean nodemanager local or log path when offline pod exited for long time
OfflineExitedCleanDelay times.Duration `json:"offline_exited_clean_delay"`
}
YarnDisksConfig group disks config
type YarnNodeResourceConfig ¶
// YarnNodeResourceConfig holds the yarn related node resource configuration.
type YarnNodeResourceConfig struct {
// CapacityIncInterval is used to make nodemanager capacity increase not very frequently
CapacityIncInterval times.Duration `json:"capacity_inc_interval"`
// NMServer and NMReserve presumably refer to the yarn nodemanager ("nm") — confirm against the yarn client.
NMServer string `json:"nm_server"`
NMReserve Resource `json:"nm_reserve"`
ResourceRoundOff RoundOffResource `json:"resource_roundoff"`
ResourceRange RangeResource `json:"resource_range"`
// ScheduleServerPort is kept as a string, not a number.
ScheduleServerPort string `json:"schedule_server_port"`
PortAutoDetect bool `json:"port_auto_detect"`
Properties map[string]string `json:"properties"`
Disks YarnDisksConfig `json:"disks"`
ShimServer string `json:"shim_server"`
CpuOverCommit OverCommit `json:"cpu_over_commit"`
}
YarnNodeResourceConfig is used to show yarn related configuration