dbqcore

package module
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 28, 2025 License: Apache-2.0 Imports: 7 Imported by: 0

README

dbqcore

The DataBridge Quality Core library is part of dbqctl.

Documentation

Index

Constants

View Source
// Data quality check type identifiers.
const (
	// CheckTypeRawQuery identifies a data quality check that uses a raw SQL
	// query. It is declared without an explicit type, so it is an untyped
	// string constant with the value "raw_query".
	CheckTypeRawQuery = "raw_query"
)

Variables

This section is empty.

Functions

This section is empty.

Types

type ChecksFileConfig added in v0.1.0

// ChecksFileConfig is the YAML-mapped top level of a checks file.
//
// Version is read from the "version" key and Validations from the
// "validations" key; each entry of Validations is a ValidationRule.
type ChecksFileConfig struct {
	Version     string           `yaml:"version"`
	Validations []ValidationRule `yaml:"validations"`
}

func LoadChecksFileConfig added in v0.1.0

func LoadChecksFileConfig(fileName string) (*ChecksFileConfig, error)

type ColumnInfo

// ColumnInfo represents the basic information of a column: its name, its
// data type (as a string), its comment, and its position.
//
// NOTE(review): whether Position is zero- or one-based is not visible
// here — confirm against the profiler implementations.
type ColumnInfo struct {
	Name     string
	Type     string
	Comment  string
	Position uint
}

ColumnInfo represents the basic information of a column.

type ColumnMetrics

// ColumnMetrics represents the metrics of a column and is tagged for JSON
// encoding.
//
// The column name, comment, position, data type and null count are always
// emitted. The pointer-typed fields carry `omitempty`, so they are left out
// of the JSON output when nil; per the field comments they apply only to
// string columns (BlankCount) or numeric columns (Min/Max/Avg/Stddev).
//
// NOTE(review): ProfilingDurationMs is presumably the time spent profiling
// this column, in milliseconds (per the name and JSON key) — confirm.
type ColumnMetrics struct {
	ColumnName          string   `json:"col_name"`
	ColumnComment       string   `json:"col_comment"`
	ColumnPosition      uint     `json:"col_position"`
	DataType            string   `json:"data_type"`
	NullCount           uint64   `json:"null_count"`
	BlankCount          *int64   `json:"blank_count,omitempty"`         // string columns only; nil otherwise
	MinValue            *float64 `json:"min_value,omitempty"`           // numeric columns only
	MaxValue            *float64 `json:"max_value,omitempty"`           // numeric columns only
	AvgValue            *float64 `json:"avg_value,omitempty"`           // numeric columns only
	StddevValue         *float64 `json:"stddev_value,omitempty"`        // numeric columns only (population standard deviation)
	MostFrequentValue   *string  `json:"most_frequent_value,omitempty"` // pointer so that NULL can be represented as the most frequent value
	ProfilingDurationMs int64    `json:"profiling_duration_ms"`
}

ColumnMetrics represents the metrics of a column.

type ConnectionConfig added in v0.1.0

// ConnectionConfig holds the YAML-mapped connection settings of a data
// source: host, port, username, password and an optional database name.
//
// Database carries `omitempty`, so it is left out of YAML output when empty.
//
// NOTE(review): Password is a plain string field mapped directly to the
// "password" key — confirm how configuration files containing it are
// expected to be protected.
type ConnectionConfig struct {
	Host     string `yaml:"host"`
	Port     int    `yaml:"port"`
	Username string `yaml:"username"`
	Password string `yaml:"password"`
	Database string `yaml:"database,omitempty"`
}

type DataQualityCheck added in v0.1.0

// DataQualityCheck is the YAML-mapped definition of a single data quality
// check. Only ID is always emitted; the remaining fields carry `omitempty`.
type DataQualityCheck struct {
	ID          string       `yaml:"id"`
	Description string       `yaml:"description,omitempty"` // optional free-text description
	OnFail      OnFailAction `yaml:"on_fail,omitempty"`     // optional; one of "error" or "warn"
	Query       string       `yaml:"query,omitempty"`       // optional raw query
}

type DataQualityCheckType added in v0.1.0

// DataQualityCheckType represents the type of data quality check. Its
// underlying type is string.
//
// NOTE(review): the CheckTypeRawQuery constant in this package is declared
// untyped rather than as a DataQualityCheckType — confirm whether that is
// intentional.
type DataQualityCheckType string

DataQualityCheckType represents the type of data quality check.

type DataSource

// DataSource is the YAML-mapped description of one data source: an ID, a
// DataSourceType, the ConnectionConfig used to reach it, and a list of
// dataset names.
type DataSource struct {
	ID            string           `yaml:"id"`
	Type          DataSourceType   `yaml:"type"`
	Configuration ConnectionConfig `yaml:"configuration"`
	Datasets      []string         `yaml:"datasets"`
}

type DataSourceType added in v0.0.8

// DataSourceType is a string-based type that names the kind of a data
// source. It is the type of DataSource.Type.
type DataSourceType string
// DataSourceType values declared by this package: "clickhouse",
// "postgresql" and "mysql".
const (
	DataSourceTypeClickhouse DataSourceType = "clickhouse"
	DataSourceTypePostgresql DataSourceType = "postgresql"
	DataSourceTypeMysql      DataSourceType = "mysql"
)

type DbqConfig

// DbqConfig is the YAML-mapped top-level configuration: a version string
// under "version" and the list of data sources under "datasources".
type DbqConfig struct {
	Version     string       `yaml:"version"`
	DataSources []DataSource `yaml:"datasources"`
}

type DbqConnector

// DbqConnector is the interface that wraps the basic connector methods.
type DbqConnector interface {
	// Ping checks if the connection to the data source is alive.
	// NOTE(review): the meaning of the returned string is not visible
	// here (presumably server or connection information) — confirm
	// against an implementation.
	Ping(ctx context.Context) (string, error)

	// ImportDatasets imports datasets from the data source, with an
	// optional filter, and returns them as a slice of strings.
	// NOTE(review): the filter syntax and what an empty filter means are
	// not visible here — confirm.
	ImportDatasets(ctx context.Context, filter string) ([]string, error)
}

DbqConnector is the interface that wraps the basic connector methods.

type DbqDataProfiler added in v0.1.0

// DbqDataProfiler is the interface that wraps the basic data profiling
// methods.
//
// ProfileDataset is the top-level entry point; the remaining methods expose
// individual profiling steps and data-type classification helpers.
type DbqDataProfiler interface {
	// ProfileDataset is the entry point that runs the profiling process by
	// tying all of the specific profiling calls together. It returns a
	// *TableMetrics or an error.
	// NOTE(review): sample presumably controls whether sample rows are
	// collected, and maxConcurrent presumably bounds concurrent profiling
	// work — confirm against an implementation.
	// TODO: consider extracting it into a separate entity.
	ProfileDataset(ctx context.Context, dataset string, sample bool, maxConcurrent int) (*TableMetrics, error)

	// GetColumns returns the ColumnInfo entries for the given database and table.
	GetColumns(ctx context.Context, databaseName string, tableName string) ([]*ColumnInfo, error)
	// GetTotalRows returns a row count for dataset.
	GetTotalRows(ctx context.Context, dataset string) (uint64, error)
	// GetNullCount returns a count for the given column of dataset
	// (the number of NULL values, per the method name).
	GetNullCount(ctx context.Context, dataset string, column *ColumnInfo) (uint64, error)
	// GetBlankCount returns a count for the given column of dataset
	// (the number of blank values, per the method name).
	// NOTE(review): returns int64 whereas GetNullCount returns uint64 — confirm
	// whether the signedness difference is intentional.
	GetBlankCount(ctx context.Context, dataset string, column *ColumnInfo) (int64, error)
	// GetNumericStats returns the NumericStats for the given column of dataset.
	GetNumericStats(ctx context.Context, dataset string, column *ColumnInfo) (*NumericStats, error)
	// GetMostFrequentValue returns a *string for the given column of dataset.
	// NOTE(review): a nil result presumably represents NULL being the most
	// frequent value, mirroring ColumnMetrics.MostFrequentValue — confirm.
	GetMostFrequentValue(ctx context.Context, dataset string, column *ColumnInfo) (*string, error)
	// GetSampleData returns rows of dataset as maps keyed by string.
	// NOTE(review): keys are presumably column names; the sample size is not
	// visible here — confirm.
	GetSampleData(ctx context.Context, dataset string) ([]map[string]interface{}, error)
	// IsNumericType reports a boolean classification of dataType as numeric.
	IsNumericType(dataType string) bool
	// IsStringType reports a boolean classification of dataType as string.
	IsStringType(dataType string) bool
}

DbqDataProfiler is the interface that wraps the basic data profiling methods.

type DbqDataValidator added in v0.1.0

// DbqDataValidator is the interface that wraps the basic data validation
// methods.
type DbqDataValidator interface {
	// RunCheck runs a data quality check against dataset and returns the
	// result.
	// NOTE(review): the bool presumably reports whether the check passed
	// and the string presumably carries the actual result (compare
	// ValidationResult.Pass and ValidationResult.ActualResult);
	// defaultWhere presumably corresponds to ValidationRule.Where —
	// confirm against an implementation.
	RunCheck(ctx context.Context, check *DataQualityCheck, dataset string, defaultWhere string) (bool, string, error)
}

DbqDataValidator is the interface that wraps the basic data validation methods.

type NumericStats added in v0.2.0

// NumericStats represents the numeric statistics of a column: minimum,
// maximum, average and standard deviation, each held as a *float64.
//
// NOTE(review): a nil pointer presumably means the statistic is not
// available (for example, no non-NULL values) — confirm.
type NumericStats struct {
	MinValue    *float64
	MaxValue    *float64
	AvgValue    *float64
	StddevValue *float64
}

NumericStats represents the numeric statistics of a column.

type OnFailAction

// OnFailAction is a string-based type used by DataQualityCheck.OnFail.
type OnFailAction string
// OnFailAction values declared by this package. Note that
// OnFailActionWarning has the value "warn", not "warning".
const (
	OnFailActionError   OnFailAction = "error"
	OnFailActionWarning OnFailAction = "warn"
)

type TableMetrics

// TableMetrics represents the metrics of a table and is tagged for JSON
// encoding. It carries the table and database names, the total row count,
// per-column metrics in a map of *ColumnMetrics keyed by string, a sample of
// rows, the profiling duration, and a list of errors under the
// "__dbq_errors" key.
//
// NOTE(review): the unit of ProfiledAt (presumably a Unix timestamp) and the
// key of ColumnsMetrics (presumably the column name) are not visible here —
// confirm.
//
// NOTE(review): DbqErrors is []error. encoding/json encodes interface values
// by their dynamic type, so errors created with errors.New or fmt.Errorf are
// emitted as empty objects rather than their message text, and a []error
// cannot be decoded back from JSON. Confirm how this field is serialized.
type TableMetrics struct {
	ProfiledAt          int64                     `json:"profiled_at"`
	TableName           string                    `json:"table_name"`
	DatabaseName        string                    `json:"database_name"`
	TotalRows           uint64                    `json:"total_rows"`
	ColumnsMetrics      map[string]*ColumnMetrics `json:"columns_metrics"`
	RowsSample          []map[string]interface{}  `json:"rows_sample"`
	ProfilingDurationMs int64                     `json:"profiling_duration_ms"`
	DbqErrors           []error                   `json:"__dbq_errors"`
}

TableMetrics represents the metrics of a table.

type TaskPool added in v0.0.5

type TaskPool struct {
	// contains filtered or unexported fields
}

func NewTaskPool added in v0.0.5

func NewTaskPool(poolSize int, logger *slog.Logger) *TaskPool

func (*TaskPool) Enqueue added in v0.0.5

func (tp *TaskPool) Enqueue(id string, task func() error)

func (*TaskPool) Errors added in v0.0.6

func (tp *TaskPool) Errors() []error

func (*TaskPool) Join added in v0.0.5

func (tp *TaskPool) Join()

type ValidationResult added in v0.0.7

// ValidationResult represents the result of a data quality check.
//
// CheckID and Pass are always emitted in JSON; ActualResult and Message
// carry `omitempty` and are left out when empty.
type ValidationResult struct {
	CheckID      string `json:"check_id"`
	Pass         bool   `json:"pass"`
	ActualResult string `json:"actual_result,omitempty"`
	Message      string `json:"message,omitempty"`
}

ValidationResult represents the result of a data quality check.

type ValidationRule added in v0.1.0

// ValidationRule is the YAML-mapped set of data quality checks for a single
// dataset, with an optional Where value shared by all of the checks.
type ValidationRule struct {
	Dataset string             `yaml:"dataset"`
	Where   string             `yaml:"where,omitempty"` // optional; applies to all checks in this rule
	Checks  []DataQualityCheck `yaml:"checks"`
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL