Documentation
¶
Overview ¶
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
Package stats provides methods and functionality to register, track, log, and export metrics that, for the most part, include "counter" and "latency" kinds.
- Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
Index ¶
- Constants
- Variables
- func IsErrMetric(name string) bool
- func IsIOErrMetric(name string) bool
- func LatencyToCounter(latName string) string
- func SizeToThroughputCount(name, kind string) (string, string)
- type Cluster
- type ClusterRaw
- type Extra
- type Node
- type NodeStatus
- type Prunner
- func (r *Prunner) Add(name string, val int64)
- func (r *Prunner) AddWith(nvs ...cos.NamedVal64)
- func (r *Prunner) ClrFlag(name string, clr cos.NodeStateFlags)
- func (r *Prunner) Get(name string) (val int64)
- func (r *Prunner) GetMetricNames() cos.StrKVs
- func (r *Prunner) GetStats() *Node
- func (r *Prunner) Inc(name string)
- func (r *Prunner) IncBck(name string, bck *cmn.Bck)
- func (r *Prunner) IncWith(name string, vlabs map[string]string)
- func (r *Prunner) Init(p core.Node) *atomic.Bool
- func (r *Prunner) Name() string
- func (*Prunner) PromHandler() http.Handler
- func (r *Prunner) RegExtMetric(snode *meta.Snode, name, kind string, extra *Extra)
- func (r *Prunner) ResetStats(errorsOnly bool)
- func (r *Prunner) Run() error
- func (r *Prunner) SetClrFlag(name string, set, clr cos.NodeStateFlags)
- func (r *Prunner) SetFlag(name string, set cos.NodeStateFlags)
- func (r *Prunner) StartedUp() bool
- func (r *Prunner) Stop(err error)
- type Tracker
- type Trunner
- func (r *Trunner) Add(name string, val int64)
- func (r *Trunner) AddWith(nvs ...cos.NamedVal64)
- func (r *Trunner) ClrFlag(name string, clr cos.NodeStateFlags)
- func (r *Trunner) Get(name string) (val int64)
- func (r *Trunner) GetMetricNames() cos.StrKVs
- func (r *Trunner) GetStats() (ds *Node)
- func (r *Trunner) Inc(name string)
- func (r *Trunner) IncBck(name string, bck *cmn.Bck)
- func (r *Trunner) IncWith(name string, vlabs map[string]string)
- func (r *Trunner) Init() *atomic.Bool
- func (r *Trunner) InitCDF(config *cmn.Config) error
- func (r *Trunner) Name() string
- func (*Trunner) PromHandler() http.Handler
- func (r *Trunner) RegDiskMetrics(snode *meta.Snode, disk string)
- func (r *Trunner) RegExtMetric(snode *meta.Snode, name, kind string, extra *Extra)
- func (r *Trunner) RegMetrics(snode *meta.Snode)
- func (r *Trunner) ResetStats(errorsOnly bool)
- func (r *Trunner) Run() error
- func (r *Trunner) SetClrFlag(name string, set, clr cos.NodeStateFlags)
- func (r *Trunner) SetFlag(name string, set cos.NodeStateFlags)
- func (r *Trunner) Standby(v bool)
- func (r *Trunner) StartedUp() bool
- func (r *Trunner) Stop(err error)
Constants ¶
// Metric kinds (the `statsValue` kind enum).
// Each metric is registered under a name together with one of these kinds.
const (
	KindCounter            = "counter"
	KindTotal              = "total"
	KindSize               = "size"
	KindSpecial            = "special" // uptime
	KindGauge              = "gauge"   // disk I/O
	KindComputedThroughput = "compbw"  // disk read/write throughput
	KindLatency            = "latency" // computed internally over 'periodic.stats_time' (milliseconds)
	KindThroughput         = "bw"      // ditto (MB/s)
)
enum: `statsValue` kinds
// Names of the variable (per-sample) metric labels.
const (
	VlabBucket    = "bucket"
	VlabXkind     = "xkind"
	VlabMountpath = "mountpath"
)
variable labels
const ( // NOTE semantics: // - counts all warm GETs // - counts all cold GETs (when remote GET is followed by storing new object (or, new object version) locally) // - does NOT count internal GetObjReader calls (e.g., by copy or transform jobs) // - see also: // - ais/backend/common // - rgetstats GetCount = "get.n" PutCount = "put.n" // ditto PUT(object) count = (all PUTs including remote) HeadCount = "head.n" AppendCount = "append.n" DeleteCount = "del.n" RenameCount = "ren.n" ListCount = "lst.n" // list-objects GetBlobCount = "getblob.n" // error counters // see also: `Inc`, `regCommon`, `ioErrNames` ErrGetCount = errPrefix + GetCount ErrPutCount = errPrefix + PutCount ErrHeadCount = errPrefix + HeadCount ErrAppendCount = errPrefix + AppendCount ErrDeleteCount = errPrefix + DeleteCount ErrRenameCount = errPrefix + RenameCount ErrListCount = errPrefix + ListCount ErrGetBlobCount = errPrefix + GetBlobCount ErrKaliveCount = errPrefix + "kalive.n" ErrHTTPWriteCount = errPrefix + "http.write.n" )
KindCounter: all basic counters are accompanied by a corresponding error counter named (errPrefix + name), e.g., "get.n" => "err.get.n", "put.n" => "err.put.n", etc.
// Latency metric names (nanosecond-suffixed, ".ns").
const (
	ListLatency      = "lst.ns"
	KeepAliveLatency = "kalive.ns"
)
KindLatency (most latency metrics are target-only - see target_stats); latency stats have numSamples, which is used to compute the average latency.
const ( // KindSpecial Uptime = "up.ns.time" // KindGauge, cos.NodeStateFlags enum NodeAlerts = cos.NodeAlerts // "state.flags" )
const ( // KindThroughput GetThroughput = "get.bps" // bytes per second PutThroughput = "put.bps" // ditto // same as above via `.cumulative` GetSize = "get.size" PutSize = "put.size" // common latencies AppendLatency = "append.ns" GetRedirLatency = "get.redir.ns" PutRedirLatency = "put.redir.ns" HeadLatencyTotal = "head.ns.total" // out-of-band VerChangeCount = "ver.change.n" VerChangeSize = "ver.change.size" // errors (note common prefix convention) ErrPutCksumCount = errPrefix + "put.cksum.n" ErrFSHCCount = errPrefix + "fshc.n" // IO errors (must have ioErrPrefix) IOErrGetCount = ioErrPrefix + "get.n" IOErrPutCount = ioErrPrefix + "put.n" IOErrDeleteCount = ioErrPrefix + "del.n" // KindLatency GetLatency = "get.ns" GetLatencyTotal = "get.ns.total" PutLatency = "put.ns" PutLatencyTotal = "put.ns.total" // "pure" remote PUT latency PutE2ELatencyTotal = "e2e.put.ns.total" // end to end (e2e) PUT latency // rate limit (409, 503) RatelimGetRetryCount = "ratelim.retry.get.n" RatelimGetRetryLatencyTotal = "ratelim.retry.get.ns.total" RatelimPutRetryCount = "ratelim.retry.put.n" RatelimPutRetryLatencyTotal = "ratelim.retry.put.ns.total" // compare w/ common `DeleteCount` RemoteDeletedDelCount = core.RemoteDeletedDelCount )
1. datapath (counters, sizes, latencies) and common errors
const ( LcacheCollisionCount = core.LcacheCollisionCount LcacheEvictedCount = core.LcacheEvictedCount LcacheErrCount = core.LcacheErrCount LcacheFlushColdCount = core.LcacheFlushColdCount )
2. object metadata in memory
const ( // blob downloader GetBlobSize = "getblob.size" // LRU eviction LruEvictCount = "lru.evict.n" LruEvictSize = "lru.evict.size" // space cleanup CleanupStoreCount = "cleanup.store.n" CleanupStoreSize = "cleanup.store.size" // distributed sort (ext/dsort) DsortCreationReqCount = "dsort.creation.req.n" DsortCreationRespCount = "dsort.creation.resp.n" DsortCreationRespLatency = "dsort.creation.resp.ns" DsortExtractShardDskCnt = "dsort.extract.shard.dsk.n" DsortExtractShardMemCnt = "dsort.extract.shard.mem.n" DsortExtractShardSize = "dsort.extract.shard.size" // uncompressed // ETL (ext/etl) ETLInlineCount = "etl.inline.n" ETLInlineLatencyTotal = "etl.inline.ns.total" ETLInlineSize = "etl.inline.size" ETLOfflineCount = "etl.offline.n" ETLOfflineLatencyTotal = "etl.offline.ns.total" ETLOfflineSize = "etl.offline.size" // downloader (ext/dload) // (not to confuse with blob downloader) DloadSize = "dl.size" DloadLatencyTotal = "dl.ns.total" ErrDloadCount = errPrefix + "dl.n" // get-batch (x-moss) GetBatchCount = "getbatch.n" GetBatchObjCount = "getbatch.obj.n" GetBatchFileCount = "getbatch.file.n" GetBatchObjSize = "getbatch.obj.size" GetBatchFileSize = "getbatch.file.size" GetBatchRxWaitTotal = "getbatch.rxwait.ns" GetBatchThrottleTotal = "getbatch.throttle.ns" ErrGetBatchCount = errPrefix + "getbatch.n" GetBatchSoftErrCount = errPrefix + "soft.getbatch.n" )
3. xactions (jobs)
// ConstlabNode is the name of the static (constant) metric label "node_id".
const ConstlabNode = "node_id"
static labels
// NgrPrompt is the caption text for the goroutine count.
// NOTE(review): where it is displayed is not visible from here.
const NgrPrompt = "Number of goroutines"
Variables ¶
var ( BckVlabs = []string{VlabBucket} EmptyBckVlabs = map[string]string{VlabBucket: ""} BckXlabs = []string{VlabBucket, VlabXkind} EmptyBckXlabs = map[string]string{VlabBucket: "", VlabXkind: ""} )
Functions ¶
func IsErrMetric ¶ added in v1.3.16
func IsIOErrMetric ¶ added in v1.3.24
func LatencyToCounter ¶ added in v1.3.24
see also base.init() in ais/backend/common
func SizeToThroughputCount ¶ added in v1.3.26
Types ¶
type ClusterRaw ¶ added in v1.3.16
// ClusterRaw pairs one proxy's Node stats with target stats held as
// cos.JSONRawMsgs (presumably still-encoded per-target JSON — confirm in cos).
type ClusterRaw struct {
// serialized as "proxy"
Proxy *Node `json:"proxy"`
// serialized as "target"
Target cos.JSONRawMsgs `json:"target"`
}
type Node ¶ added in v1.3.16
// Node is the per-node stats record; it is what Tracker.GetStats returns
// and what NodeStatus embeds.
type Node struct {
// presumably identifies the reporting node — confirm against meta.Snode
Snode *meta.Snode `json:"snode"`
// value of the unexported copyTracker type; serialized as "tracker"
Tracker copyTracker `json:"tracker"`
// serialized as "capacity" (type declared in package fs)
Tcdf fs.Tcdf `json:"capacity"`
}
REST API
type NodeStatus ¶ added in v1.3.16
// NodeStatus is a node's status report: it embeds Node and adds a rebalance
// snapshot, version/deployment properties, state info, and memory/CPU info.
type NodeStatus struct {
// omitted from JSON when nil (omitempty)
RebSnap *core.Snap `json:"rebalance_snap,omitempty"`
// assorted props
Status string `json:"status"`
DeploymentType string `json:"deployment"`
Version string `json:"ais_version"` // major.minor.build
BuildTime string `json:"build_time"` // YYYY-MM-DD HH:MM:SS-TZ
K8sPodName string `json:"k8s_pod_name"` // (via ais-k8s/operator `MY_POD` env var)
// reserved string fields; omitted from JSON when empty
Reserved1 string `json:"reserved1,omitempty"`
Reserved2 string `json:"reserved2,omitempty"`
// embedded without a tag: under encoding/json rules its fields are promoted to this level
Node
// no JSON tag: under encoding/json rules this is keyed by the Go field name "Cluster"
Cluster cos.NodeStateInfo
MemCPUInfo apc.MemCPUInfo `json:"sys_info"`
// the ",string" option encodes this integer as a JSON string
SmapVersion int64 `json:"smap_version,string"`
// reserved integer fields; omitted from JSON when zero
Reserved3 int64 `json:"reserved3,omitempty"`
Reserved4 int64 `json:"reserved4,omitempty"`
}
(includes stats.Node and more; NOTE: direct API call w/ no proxying)
type Prunner ¶
// Prunner is a stats runner: its exported methods (Init, Run, Stop, Name, plus
// Inc, IncWith, IncBck, GetStats, ResetStats, GetMetricNames, RegExtMetric,
// and others) are listed in the package index.
// NOTE(review): the Init doc text indicates this is the proxy-side runner — confirm.
type Prunner struct {
// contains filtered or unexported fields
}
func (*Prunner) AddWith ¶ added in v1.3.26
func (r *Prunner) AddWith(nvs ...cos.NamedVal64)
(prometheus with variable labels)
func (*Prunner) ClrFlag ¶ added in v1.3.24
func (r *Prunner) ClrFlag(name string, clr cos.NodeStateFlags)
func (*Prunner) GetMetricNames ¶ added in v1.3.16
func (*Prunner) Init ¶
All stats that the proxy currently has are CoreStats, which are registered at startup.
func (*Prunner) PromHandler ¶ added in v1.3.26
func (*Prunner) RegExtMetric ¶ added in v1.3.24
func (*Prunner) ResetStats ¶ added in v1.3.16
func (r *Prunner) ResetStats(errorsOnly bool)
TODO: reset prometheus as well (assuming there's an API)
func (*Prunner) SetClrFlag ¶ added in v1.3.24
func (r *Prunner) SetClrFlag(name string, set, clr cos.NodeStateFlags)
func (*Prunner) SetFlag ¶ added in v1.3.24
func (r *Prunner) SetFlag(name string, set cos.NodeStateFlags)
type Tracker ¶
// Tracker is the stats API: it embeds cos.StatsUpdater and adds increment,
// snapshot, reset, and registration methods. Prunner and Trunner both declare
// every method that is listed explicitly below.
type Tracker interface {
cos.StatsUpdater
StartedUp() bool
// returns the http.Handler for prometheus
PromHandler() http.Handler
// Inc, IncWith, and IncBck take a metric name; IncWith adds variable labels,
// IncBck takes a bucket
Inc(metric string)
IncWith(metric string, vlabs map[string]string)
IncBck(name string, bck *cmn.Bck)
GetStats() *Node
// errorsOnly presumably limits the reset to error metrics — confirm in the implementations
ResetStats(errorsOnly bool)
GetMetricNames() cos.StrKVs // (name, kind) pairs
// for aistore modules, to add their respective metrics
RegExtMetric(node *meta.Snode, name, kind string, extra *Extra)
}
type Trunner ¶
func NewTrunner ¶ added in v1.3.16
func (*Trunner) AddWith ¶ added in v1.3.26
func (r *Trunner) AddWith(nvs ...cos.NamedVal64)
(prometheus with variable labels)
func (*Trunner) ClrFlag ¶ added in v1.3.24
func (r *Trunner) ClrFlag(name string, clr cos.NodeStateFlags)
func (*Trunner) GetMetricNames ¶ added in v1.3.16
func (*Trunner) PromHandler ¶ added in v1.3.26
func (*Trunner) RegExtMetric ¶ added in v1.3.24
func (*Trunner) RegMetrics ¶
target-specific metrics, in addition to the common ones already added via regCommon()
func (*Trunner) ResetStats ¶ added in v1.3.16
func (r *Trunner) ResetStats(errorsOnly bool)
TODO: reset prometheus as well (assuming there's an API)
func (*Trunner) SetClrFlag ¶ added in v1.3.24
func (r *Trunner) SetClrFlag(name string, set, clr cos.NodeStateFlags)
func (*Trunner) SetFlag ¶ added in v1.3.24
func (r *Trunner) SetFlag(name string, set cos.NodeStateFlags)