metrics

package
v0.80.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 10, 2026 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Overview

Package metrics provides telemetry metrics for the logs agent

Package metrics provides telemetry metrics for the logs agent

Index

Constants

View Source
// Telemetry names identifying each kind of logs pipeline component.
const (
	// ProcessorTlmName names processor components in telemetry.
	ProcessorTlmName = "processor"
	// StrategyTlmName names strategy components in telemetry.
	StrategyTlmName = "strategy"
	// SenderTlmName names sender components in telemetry.
	SenderTlmName = "sender"
	// WorkerTlmName names worker components in telemetry.
	WorkerTlmName = "worker"
)

// SenderTlmInstanceID is the instance ID used by default for sender components.
const SenderTlmInstanceID = "0"

Variables

View Source
// Package-level metrics for the logs agent. Many measurements are declared twice:
// once as an expvar value and once as a telemetry metric (Tlm* prefix) created through
// telemetryimpl.GetCompatComponent().
// NOTE(review): the telemetry constructor arguments appear to be
// (subsystem, name, tag keys, help text[, histogram buckets]) — confirm against the telemetry API.
var (
	// LogsExpvars contains metrics for the logs agent.
	// It has no initializer in this block, so it is nil until something assigns it (not visible here).
	LogsExpvars *expvar.Map
	// LogsDecoded is the total number of decoded logs (expvar).
	LogsDecoded = expvar.Int{}
	// TlmLogsDecoded is the total number of decoded logs (telemetry counter, no tags).
	TlmLogsDecoded = telemetryimpl.GetCompatComponent().NewCounter("logs", "decoded",
		nil, "Total number of decoded logs")
	// LogsProcessed is the total number of processed logs (expvar).
	LogsProcessed = expvar.Int{}
	// TlmLogsProcessed is the total number of processed logs (telemetry counter, no tags).
	TlmLogsProcessed = telemetryimpl.GetCompatComponent().NewCounter("logs", "processed",
		nil, "Total number of processed logs")

	// LogsSent is the total number of sent logs (expvar).
	LogsSent = expvar.Int{}
	// TlmLogsSent is the total number of sent logs (telemetry counter, no tags).
	TlmLogsSent = telemetryimpl.GetCompatComponent().NewCounter("logs", "sent",
		nil, "Total number of sent logs")
	// DestinationErrors is the total number of network errors (expvar).
	DestinationErrors = expvar.Int{}
	// TlmDestinationErrors is the total number of network errors (telemetry counter, no tags).
	// Note that the metric name is "network_errors", not "destination_errors".
	TlmDestinationErrors = telemetryimpl.GetCompatComponent().NewCounter("logs", "network_errors",
		nil, "Total number of network errors")
	// DestinationLogsDropped is the total number of logs dropped per Destination (expvar map).
	DestinationLogsDropped = expvar.Map{}
	// TlmLogsDropped is the total number of logs dropped per Destination.
	// Tags: destination
	TlmLogsDropped = telemetryimpl.GetCompatComponent().NewCounter("logs", "dropped",
		[]string{"destination"}, "Total number of logs dropped per Destination")
	// BytesSent is the total number of sent bytes before encoding, if any (expvar).
	BytesSent = expvar.Int{}
	// TlmBytesSent is the total number of sent bytes before encoding, if any.
	// The remote_agent tag identifies which agent sent the logs. Use GetAgentIdentityTag()
	// to get the correct value for the current agent. This tag is used by COAT to partition
	// log bytes by agent type.
	// Tags: remote_agent, source
	TlmBytesSent = telemetryimpl.GetCompatComponent().NewCounter("logs", "bytes_sent",
		[]string{"remote_agent", "source"}, "Total number of bytes sent before encoding if any")
	// RetryCount is the total number of times we have retried payloads that failed to send (expvar).
	RetryCount = expvar.Int{}
	// TlmRetryCount is the total number of times we have retried payloads that failed to send.
	TlmRetryCount = telemetryimpl.GetCompatComponent().NewCounter("logs", "retry_count",
		nil, "Total number of retried payloads")
	// RetryTimeSpent is the total time spent retrying payloads that failed to send.
	// NOTE(review): the time unit is not visible in this block, and there is no Tlm counterpart here — confirm.
	RetryTimeSpent = expvar.Int{}
	// EncodedBytesSent is the total number of sent bytes after encoding, if any (expvar).
	EncodedBytesSent = expvar.Int{}
	// TlmEncodedBytesSent is the total number of sent bytes after encoding, if any.
	// The remote_agent tag identifies which agent sent the logs. Use GetAgentIdentityTag()
	// to get the correct value for the current agent. This tag is used by COAT to partition
	// encoded log bytes by agent type.
	// Tags: remote_agent, source, compression_kind
	TlmEncodedBytesSent = telemetryimpl.GetCompatComponent().NewCounter("logs", "encoded_bytes_sent",
		[]string{"remote_agent", "source", "compression_kind"}, "Total number of sent bytes after encoding if any")
	// BytesMissed is the number of bytes lost before they could be consumed by the agent, such as after a log rotation.
	BytesMissed = expvar.Int{}
	// TlmBytesMissed is the number of bytes lost before they could be consumed by the agent, such as after a log rotation.
	TlmBytesMissed = telemetryimpl.GetCompatComponent().NewCounter("logs", "bytes_missed",
		nil, "Total number of bytes lost before they could be consumed by the agent, such as after log rotation")
	// SenderLatency is the last reported latency value from the HTTP sender (ms).
	SenderLatency = expvar.Int{}
	// TlmSenderLatency is a histogram of HTTP sender latency (ms).
	// The bucket bounds run from 10 to 10000, in milliseconds per the help string.
	TlmSenderLatency = telemetryimpl.GetCompatComponent().NewHistogram("logs", "sender_latency",
		nil, "Histogram of http sender latency in ms", []float64{10, 25, 50, 75, 100, 250, 500, 1000, 10000})
	// DestinationExpVars is a map of sender utilization metrics for each HTTP destination.
	DestinationExpVars = expvar.Map{}
	// DestinationHTTPRespByStatusAndURL tracks HTTP responses by status code and destination URL (expvar map).
	DestinationHTTPRespByStatusAndURL = expvar.Map{}
	// TlmDestinationHTTPRespByStatusAndURL tracks HTTP responses by status code and destination URL.
	// Tags: status_code, url
	TlmDestinationHTTPRespByStatusAndURL = telemetryimpl.GetCompatComponent().NewCounter("logs", "destination_http_resp", []string{"status_code", "url"}, "Count of http responses by status code and destination url")

	// TlmAutoMultilineAggregatorFlush counts each line flushed from the auto multiline aggregator.
	// Tags: truncated, line_type
	TlmAutoMultilineAggregatorFlush = telemetryimpl.GetCompatComponent().NewCounter("logs", "auto_multi_line_aggregator_flush", []string{"truncated", "line_type"}, "Count of each line flushed from the auto multiline aggregator")

	// TlmAutoMultilineJSONAggregatorFlush counts each line flushed from the auto multiline JSON aggregator.
	// Tags: is_valid
	TlmAutoMultilineJSONAggregatorFlush = telemetryimpl.GetCompatComponent().NewCounter("logs", "auto_multi_line_json_aggregator_flush", []string{"is_valid"}, "Count of each line flushed from the auto multiline JSON aggregator")

	// TlmUtilizationRatio is the utilization ratio of a component.
	// Utilization ratio is calculated as the ratio of time spent in use to the total time.
	// This metric is internally sampled and exposed as an EWMA in order to produce a usable value.
	// Tags: name, instance
	TlmUtilizationRatio = telemetryimpl.GetCompatComponent().NewGauge("logs_component_utilization", "ratio", []string{"name", "instance"}, "Gauge of the utilization ratio of a component")
	// TlmUtilizationItems is the capacity of a component by number of elements.
	// Both the number of items and the number of bytes are aggregated and exposed as an EWMA.
	// Tags: name, instance
	TlmUtilizationItems = telemetryimpl.GetCompatComponent().NewGauge("logs_component_utilization", "items", []string{"name", "instance"}, "Gauge of the number of items currently held in a component and its buffers")
	// TlmUtilizationBytes is the capacity of a component by number of bytes.
	// Tags: name, instance
	TlmUtilizationBytes = telemetryimpl.GetCompatComponent().NewGauge("logs_component_utilization", "bytes", []string{"name", "instance"}, "Gauge of the number of bytes currently held in a component and its buffers")
	// TlmDestNumWorkers is the number of destination workers in use.
	// Tags: instance
	TlmDestNumWorkers = telemetryimpl.GetCompatComponent().NewGauge("logs_destination", "destination_workers", []string{"instance"}, "Gauge of the number of destination workers in use")
	// TlmDestVirtualLatency is a moving average of the destination's latency.
	// Tags: instance
	TlmDestVirtualLatency = telemetryimpl.GetCompatComponent().NewGauge("logs_destination", "virtual_latency", []string{"instance"}, "Gauge of the destination's average latency")
	// TlmDestWorkerResets tracks the count of times the destination worker pool resets the worker count after encountering a retryable error.
	// Tags: instance
	TlmDestWorkerResets = telemetryimpl.GetCompatComponent().NewCounter("logs_destination", "destination_worker_resets", []string{"instance"}, "Count of times the destination worker pool resets the worker count")
	// LogsTruncated is the number of logs truncated by the Agent (expvar).
	LogsTruncated = expvar.Int{}
	// TlmTruncatedCount tracks the count of times a log is truncated.
	// Tags: service, source
	TlmTruncatedCount = telemetryimpl.GetCompatComponent().NewCounter("logs", "truncated", []string{"service", "source"}, "Count the number of times a log is truncated")

	// TlmLogLineSizes is a distribution of post-framer log line sizes.
	// Bucket bounds grow by a factor of 4, from 32 up to 2097152 (presumably bytes — confirm).
	TlmLogLineSizes = telemetryimpl.GetCompatComponent().NewHistogram("logs", "log_line_sizes",
		nil, "Distribution of post-framer log line sizes before line parsers/handlers are applied", []float64{32, 128, 512, 2048, 8192, 32768, 131072, 524288, 2097152})

	// TlmRotationsNix tracks file rotations detected on *nix platforms by rotation type (new_file vs truncated).
	// Tags: rotation_type
	TlmRotationsNix = telemetryimpl.GetCompatComponent().NewCounter("logs", "rotations_nix",
		[]string{"rotation_type"}, "Count of file rotations detected on *nix platforms, tagged by rotation_type (new_file or truncated)")

	// TlmRotationSizeMismatch counts disagreements between cache-growth and offset-unread rotation detectors.
	// The `detector` tag indicates which heuristic detected a potential rotation (not which claimed all was fine):
	// - detector:cache = cache observed growth but offset indicates all data was read (likely missed rotation)
	// - detector:offset = offset indicates unread data but cache saw no growth (likely false-positive rotation)
	TlmRotationSizeMismatch = telemetryimpl.GetCompatComponent().NewCounter("logs", "rotation_size_mismatch",
		[]string{"detector"}, "Count of disagreements between cache-growth and offset-unread rotation detectors")

	// TlmRotationSizeDifferences records the absolute file size difference whenever the file size changes between checks.
	// Bucket bounds grow by a factor of 4 from 256 to 1048576, then by a factor of 10 up to 104857600
	// (presumably bytes — confirm).
	TlmRotationSizeDifferences = telemetryimpl.GetCompatComponent().NewHistogram("logs", "rotation_size_differences",
		nil, "Distribution of absolute file size differences observed between consecutive file rotation checks", []float64{256, 1024, 4096, 16384, 65536, 262144, 1048576, 10485760, 104857600})

	// TlmPayloadFlushed is the total number of payloads flushed by the batch strategy.
	// Tags: pipeline, flush_reason (max_count, max_bytes, timer, flush, shutdown)
	TlmPayloadFlushed = telemetryimpl.GetCompatComponent().NewCounter("logs", "batch_payload_flushed",
		[]string{"pipeline", "flush_reason"}, "Total number of payloads flushed, tagged by the reason the flush was triggered")

	// TlmHTTPConnectivityCheck tracks HTTP connectivity check results.
	// Tags: status (success/failure)
	TlmHTTPConnectivityCheck = telemetryimpl.GetCompatComponent().NewCounter("logs", "http_connectivity_check",
		[]string{"status"}, "Count of HTTP connectivity checks with status")

	// TlmHTTPConnectivityRetryAttempt tracks HTTP connectivity retry attempts.
	// Tags: status (success/failure)
	TlmHTTPConnectivityRetryAttempt = telemetryimpl.GetCompatComponent().NewCounter("logs", "http_connectivity_retry_attempt",
		[]string{"status"}, "Count of HTTP connectivity retry attempts with success/failure status")

	// TlmRestartAttempt tracks logs agent restart attempts.
	// Tags: status (success/failure/timeout), transport (tcp/http)
	TlmRestartAttempt = telemetryimpl.GetCompatComponent().NewCounter("logs", "restart_attempt",
		[]string{"status", "transport"}, "Count of logs agent restart attempts with status and target transport")

	// TlmAutoMultilineTotalLines counts all lines processed by the detecting aggregator
	// for sources on the default path. Used as the denominator for both X% and Y% metrics.
	// NOTE(review): "X%" and "Y%" presumably refer to the would-combine and would-truncate
	// ratios derived from the two counters declared below — confirm.
	TlmAutoMultilineTotalLines = telemetryimpl.GetCompatComponent().NewCounter("logs", "auto_multi_line_default_total_lines",
		nil, "Total lines processed by the detecting aggregator for default-path sources")

	// TlmAutoMultilineWouldCombine counts lines that would be merged into a preceding
	// startGroup message if auto multiline were enabled by default.
	TlmAutoMultilineWouldCombine = telemetryimpl.GetCompatComponent().NewCounter("logs", "auto_multi_line_default_would_combine",
		nil, "Lines that would be combined if auto multiline were the default")

	// TlmAutoMultilineWouldTruncate counts raw input lines belonging to multiline
	// groups that would exceed maxContentSize due to combining. Single lines that are
	// individually oversized are excluded (they'd be truncated regardless).
	TlmAutoMultilineWouldTruncate = telemetryimpl.GetCompatComponent().NewCounter("logs", "auto_multi_line_default_would_truncate",
		nil, "Lines belonging to groups that would be truncated if auto multiline were the default")

	// TlmListenerIPDenied counts connections or datagrams rejected by IP allow/deny filters.
	// Tags: listener_type (tcp, udp)
	TlmListenerIPDenied = telemetryimpl.GetCompatComponent().NewCounter("logs", "listener_ip_denied",
		[]string{"listener_type"}, "Count of connections or datagrams rejected by IP allow/deny filters")
)

Functions

func GetAgentIdentityTag

func GetAgentIdentityTag() string

GetAgentIdentityTag returns the remote_agent tag value for the current agent process. The value is set at startup via SetAgentIdentity and defaults to "agent".

func SetAgentIdentity

func SetAgentIdentity(tag string)

SetAgentIdentity sets the remote_agent tag value for the current agent process. This must be called once during agent startup, before any logs are sent. Example values: "agent", "system-probe", "trace-agent", etc.

Types

type CapacityMonitor

// CapacityMonitor samples the average capacity of a component over a given interval.
// Capacity is calculated as the difference between the ingress and egress of a payload.
type CapacityMonitor struct {
	// Embedded mutex: Lock and Unlock are promoted onto CapacityMonitor, so values must not be
	// copied after first use. Presumably it guards the unexported sampling state — confirm.
	sync.Mutex
	// contains filtered or unexported fields
}

CapacityMonitor samples the average capacity of a component over a given interval. Capacity is calculated as the difference between the ingress and egress of a payload. Because data moves very quickly through components, we need to sample and aggregate this value over time.

func NewCapacityMonitor

func NewCapacityMonitor(name, instance string) *CapacityMonitor

NewCapacityMonitor creates a new CapacityMonitor

func (*CapacityMonitor) AddEgress

func (i *CapacityMonitor) AddEgress(pl MeasurablePayload)

AddEgress records the egress of a payload

func (*CapacityMonitor) AddIngress

func (i *CapacityMonitor) AddIngress(pl MeasurablePayload)

AddIngress records the ingress of a payload

type MeasurablePayload

// MeasurablePayload is implemented by payloads that can be measured both in
// bytes and in number of elements.
type MeasurablePayload interface {
	// Count returns the payload's count as an int64.
	Count() int64
	// Size returns the payload's size as an int64 (presumably in bytes — confirm).
	Size() int64
}

MeasurablePayload represents a payload that can be measured in bytes and count

type NoopPipelineMonitor

// NoopPipelineMonitor is a no-op implementation of PipelineMonitor, for logs components
// that do not need to report capacity metrics.
type NoopPipelineMonitor struct {
	// contains filtered or unexported fields
}

NoopPipelineMonitor is a no-op implementation of PipelineMonitor. Some instances of logs components do not need to report capacity metrics and should use this implementation.

func NewNoopPipelineMonitor

func NewNoopPipelineMonitor(instanceID string) *NoopPipelineMonitor

NewNoopPipelineMonitor creates a new no-op pipeline monitor

func (*NoopPipelineMonitor) GetCapacityMonitor

func (n *NoopPipelineMonitor) GetCapacityMonitor(_ string, _ string) *CapacityMonitor

GetCapacityMonitor returns the capacity monitor for the given name and instance ID. Returns nil for NoopPipelineMonitor as it doesn't track capacity.

func (*NoopPipelineMonitor) MakeUtilizationMonitor

func (n *NoopPipelineMonitor) MakeUtilizationMonitor(_ string, _ string) UtilizationMonitor

MakeUtilizationMonitor returns a no-op utilization monitor.

func (*NoopPipelineMonitor) ReportComponentEgress

func (n *NoopPipelineMonitor) ReportComponentEgress(_ MeasurablePayload, _ string, _ string)

ReportComponentEgress does nothing.

func (*NoopPipelineMonitor) ReportComponentIngress

func (n *NoopPipelineMonitor) ReportComponentIngress(_ MeasurablePayload, _ string, _ string)

ReportComponentIngress does nothing.

type NoopUtilizationMonitor

// NoopUtilizationMonitor is a no-op implementation of UtilizationMonitor; it carries no state.
type NoopUtilizationMonitor struct{}

NoopUtilizationMonitor is a no-op implementation of UtilizationMonitor.

func (*NoopUtilizationMonitor) Start

func (n *NoopUtilizationMonitor) Start()

Start does nothing.

func (*NoopUtilizationMonitor) Stop

func (n *NoopUtilizationMonitor) Stop()

Stop does nothing.

type PipelineMonitor

// PipelineMonitor is an interface for monitoring the capacity of a pipeline.
// Pipeline monitors are used to measure both capacity and utilization of components.
// Every method identifies the target component by its name and instance ID.
type PipelineMonitor interface {
	// GetCapacityMonitor returns the capacity monitor for the given name and instance ID.
	// Implementations that do not track capacity may return nil (NoopPipelineMonitor does).
	GetCapacityMonitor(name string, instanceID string) *CapacityMonitor
	// ReportComponentIngress reports the ingress of the payload pl to a component.
	ReportComponentIngress(pl MeasurablePayload, name string, instanceID string)
	// ReportComponentEgress reports the egress of the payload pl from a component.
	ReportComponentEgress(pl MeasurablePayload, name string, instanceID string)
	// MakeUtilizationMonitor creates a utilization monitor for a component.
	MakeUtilizationMonitor(name string, instanceID string) UtilizationMonitor
}

PipelineMonitor is an interface for monitoring the capacity of a pipeline. Pipeline monitors are used to measure both capacity and utilization of components.

type TelemetryPipelineMonitor

// TelemetryPipelineMonitor is a PipelineMonitor that reports capacity metrics to telemetry.
type TelemetryPipelineMonitor struct {
	// contains filtered or unexported fields
}

TelemetryPipelineMonitor is a PipelineMonitor that reports capacity metrics to telemetry

func NewTelemetryPipelineMonitor

func NewTelemetryPipelineMonitor() *TelemetryPipelineMonitor

NewTelemetryPipelineMonitor creates a new pipeline monitor that reports capacity and utilization metrics as telemetry.

func (*TelemetryPipelineMonitor) GetCapacityMonitor

func (c *TelemetryPipelineMonitor) GetCapacityMonitor(name string, instanceID string) *CapacityMonitor

GetCapacityMonitor returns the capacity monitor for the given name and instance ID.

func (*TelemetryPipelineMonitor) MakeUtilizationMonitor

func (c *TelemetryPipelineMonitor) MakeUtilizationMonitor(name string, instanceID string) UtilizationMonitor

MakeUtilizationMonitor creates a new utilization monitor for a component.

func (*TelemetryPipelineMonitor) ReportComponentEgress

func (c *TelemetryPipelineMonitor) ReportComponentEgress(pl MeasurablePayload, name string, instanceID string)

ReportComponentEgress reports the egress of a payload from a component.

func (*TelemetryPipelineMonitor) ReportComponentIngress

func (c *TelemetryPipelineMonitor) ReportComponentIngress(pl MeasurablePayload, name string, instanceID string)

ReportComponentIngress reports the ingress of a payload to a component.

type TelemetryUtilizationMonitor

// TelemetryUtilizationMonitor is a UtilizationMonitor that reports utilization metrics as telemetry.
type TelemetryUtilizationMonitor struct {
	// contains filtered or unexported fields
}

TelemetryUtilizationMonitor is a UtilizationMonitor that reports utilization metrics as telemetry.

func NewTelemetryUtilizationMonitor

func NewTelemetryUtilizationMonitor(name, instance string) *TelemetryUtilizationMonitor

NewTelemetryUtilizationMonitor creates a new TelemetryUtilizationMonitor.

func (*TelemetryUtilizationMonitor) Start

func (u *TelemetryUtilizationMonitor) Start()

Start tracks a start event in the utilization tracker.

func (*TelemetryUtilizationMonitor) Stop

func (u *TelemetryUtilizationMonitor) Stop()

Stop tracks a finish event in the utilization tracker.

type UtilizationMonitor

// UtilizationMonitor is the contract for anything that monitors how heavily a
// component is used. Callers bracket a unit of work with Start and Stop.
type UtilizationMonitor interface {
	// Start marks the beginning of a period of use;
	// Stop marks its end.
	Start()
	Stop()
}

UtilizationMonitor is an interface for monitoring the utilization of a component.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL