Documentation
¶
Index ¶
- Constants
- type ChecksFileConfig
- type ColumnInfo
- type ColumnMetrics
- type ConnectionConfig
- type DataQualityCheck
- type DataQualityCheckType
- type DataSource
- type DataSourceType
- type DbqConfig
- type DbqConnector
- type DbqDataProfiler
- type DbqDataValidator
- type NumericStats
- type OnFailAction
- type TableMetrics
- type TaskPool
- type ValidationResult
- type ValidationRule
Constants ¶
View Source
const (
// CheckTypeRawQuery is the check-type identifier ("raw_query") for a data
// quality check that uses a raw SQL query.
// NOTE(review): this is an untyped string constant rather than a
// DataQualityCheckType value — confirm that is intentional.
CheckTypeRawQuery = "raw_query"
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ChecksFileConfig ¶ added in v0.1.0
// ChecksFileConfig is the YAML-mapped root of a checks file: a version string
// plus the list of validation rules the file declares.
type ChecksFileConfig struct {
// Version is read from the "version" key.
// NOTE(review): accepted values/format are not visible here.
Version string `yaml:"version"`
// Validations is read from the "validations" key; each entry is a ValidationRule.
Validations []ValidationRule `yaml:"validations"`
}
func LoadChecksFileConfig ¶ added in v0.1.0
func LoadChecksFileConfig(fileName string) (*ChecksFileConfig, error)
type ColumnInfo ¶
ColumnInfo represents the basic information of a column.
type ColumnMetrics ¶
// ColumnMetrics holds the profiling results for a single column.
// Every pointer-typed field carries `omitempty`, so it is left out of the JSON
// output when nil.
type ColumnMetrics struct {
ColumnName string `json:"col_name"`
ColumnComment string `json:"col_comment"`
ColumnPosition uint `json:"col_position"`
DataType string `json:"data_type"`
// NOTE(review): NullCount is unsigned (uint64) while BlankCount below is
// signed (*int64) — confirm the difference in signedness is intended.
NullCount uint64 `json:"null_count"`
BlankCount *int64 `json:"blank_count,omitempty"` // string only
MinValue *float64 `json:"min_value,omitempty"` // numeric only
MaxValue *float64 `json:"max_value,omitempty"` // numeric only
AvgValue *float64 `json:"avg_value,omitempty"` // numeric only
StddevValue *float64 `json:"stddev_value,omitempty"` // numeric only (Population StdDev)
MostFrequentValue *string `json:"most_frequent_value,omitempty"` // pointer to handle NULL as most frequent
// NOTE(review): presumably elapsed milliseconds, going by the "Ms" suffix — confirm.
ProfilingDurationMs int64 `json:"profiling_duration_ms"`
}
ColumnMetrics represents the metrics of a column.
type ConnectionConfig ¶ added in v0.1.0
type DataQualityCheck ¶ added in v0.1.0
// DataQualityCheck is a single check entry as mapped from YAML. ID is the only
// field without `omitempty`; the remaining fields are marked optional.
type DataQualityCheck struct {
ID string `yaml:"id"`
Description string `yaml:"description,omitempty"` // optional
// OnFail takes an OnFailAction; the declared values are "error" and "warn".
OnFail OnFailAction `yaml:"on_fail,omitempty"` // optional (error, warn)
Query string `yaml:"query,omitempty"` // optional raw query
}
type DataQualityCheckType ¶ added in v0.1.0
// DataQualityCheckType is a string-backed type naming a kind of data quality check.
type DataQualityCheckType string
DataQualityCheckType represents the type of data quality check.
type DataSource ¶
// DataSource is one data source entry as mapped from YAML: an identifier, the
// source type, its connection configuration, and a list of datasets.
type DataSource struct {
ID string `yaml:"id"`
// Type is a DataSourceType (declared values: "clickhouse", "postgresql", "mysql").
Type DataSourceType `yaml:"type"`
Configuration ConnectionConfig `yaml:"configuration"`
// NOTE(review): presumably dataset names/identifiers; the expected format is
// not visible here — confirm.
Datasets []string `yaml:"datasets"`
}
type DataSourceType ¶ added in v0.0.8
// DataSourceType is a string-backed type used for the Type field of a DataSource.
type DataSourceType string

// Declared DataSourceType values.
const (
	// DataSourceTypeClickhouse is the "clickhouse" data source type.
	DataSourceTypeClickhouse DataSourceType = "clickhouse"
	// DataSourceTypePostgresql is the "postgresql" data source type.
	DataSourceTypePostgresql DataSourceType = "postgresql"
	// DataSourceTypeMysql is the "mysql" data source type.
	DataSourceTypeMysql DataSourceType = "mysql"
)
type DbqConfig ¶
// DbqConfig is the YAML-mapped root configuration: a version string plus the
// list of configured data sources.
type DbqConfig struct {
// Version is read from the "version" key.
Version string `yaml:"version"`
// DataSources is read from the "datasources" key; each entry is a DataSource.
DataSources []DataSource `yaml:"datasources"`
}
type DbqConnector ¶
type DbqConnector interface {
// Ping checks if the connection to the data source is alive.
// NOTE(review): the meaning of the returned string is not visible here
// (presumably server or version information) — confirm against implementations.
Ping(ctx context.Context) (string, error)
// ImportDatasets imports datasets from the data source, with an optional filter.
// NOTE(review): the filter syntax and the format of the returned strings are
// not visible here — confirm against implementations.
ImportDatasets(ctx context.Context, filter string) ([]string, error)
}
DbqConnector is the interface that wraps the basic connector methods.
type DbqDataProfiler ¶ added in v0.1.0
type DbqDataProfiler interface {
// ProfileDataset is the entry point for profiling: it ties the individual
// profiling calls together and returns the resulting TableMetrics.
// NOTE(review): sample presumably toggles collection of a rows sample and
// maxConcurrent presumably bounds parallel work — confirm against implementations.
// TODO: consider extracting it into a separate entity.
ProfileDataset(ctx context.Context, dataset string, sample bool, maxConcurrent int) (*TableMetrics, error)
// GetColumns returns ColumnInfo values for the given database and table.
GetColumns(ctx context.Context, databaseName string, tableName string) ([]*ColumnInfo, error)
// The methods below each take a dataset (and, for column-level metrics, a
// ColumnInfo). Their result types line up with fields of the metrics structs:
// GetTotalRows with TableMetrics.TotalRows, GetNullCount/GetBlankCount with
// ColumnMetrics.NullCount/BlankCount, GetNumericStats with the min/max/avg/
// stddev fields, GetMostFrequentValue with ColumnMetrics.MostFrequentValue,
// and GetSampleData with TableMetrics.RowsSample.
// NOTE(review): presumably these feed those fields — confirm against implementations.
GetTotalRows(ctx context.Context, dataset string) (uint64, error)
GetNullCount(ctx context.Context, dataset string, column *ColumnInfo) (uint64, error)
GetBlankCount(ctx context.Context, dataset string, column *ColumnInfo) (int64, error)
GetNumericStats(ctx context.Context, dataset string, column *ColumnInfo) (*NumericStats, error)
GetMostFrequentValue(ctx context.Context, dataset string, column *ColumnInfo) (*string, error)
GetSampleData(ctx context.Context, dataset string) ([]map[string]interface{}, error)
// IsNumericType and IsStringType classify a data type name.
// NOTE(review): presumably these decide which columns get the "numeric only"
// and "string only" metrics of ColumnMetrics — confirm.
IsNumericType(dataType string) bool
IsStringType(dataType string) bool
}
DbqDataProfiler is the interface that wraps the basic data profiling methods.
type DbqDataValidator ¶ added in v0.1.0
type DbqDataValidator interface {
// RunCheck runs a data quality check and returns the result.
// NOTE(review): the bool and string results presumably correspond to
// ValidationResult.Pass and ValidationResult.ActualResult, and defaultWhere to
// ValidationRule.Where — confirm against implementations.
RunCheck(ctx context.Context, check *DataQualityCheck, dataset string, defaultWhere string) (bool, string, error)
}
DbqDataValidator is the interface that wraps the basic data validation methods.
type NumericStats ¶ added in v0.2.0
// NumericStats groups the min, max, average and standard deviation of a column.
// The fields have the same names and types as the numeric-only fields of
// ColumnMetrics.
// NOTE(review): every field is a pointer; nil presumably means "not available" — confirm.
type NumericStats struct {
MinValue *float64
MaxValue *float64
AvgValue *float64
StddevValue *float64
}
NumericStats represents the numeric statistics of a column.
type OnFailAction ¶
// OnFailAction is a string-backed type used for the OnFail field of a
// DataQualityCheck.
type OnFailAction string

// Declared OnFailAction values.
const (
	// OnFailActionError is the "error" on-fail action.
	OnFailActionError OnFailAction = "error"
	// OnFailActionWarning is the "warn" on-fail action.
	OnFailActionWarning OnFailAction = "warn"
)
type TableMetrics ¶
// TableMetrics holds the profiling results for one table: table-level values
// plus a map of per-column ColumnMetrics and a sample of rows.
type TableMetrics struct {
// NOTE(review): presumably a Unix timestamp; the unit (s/ms) is not visible here — confirm.
ProfiledAt int64 `json:"profiled_at"`
TableName string `json:"table_name"`
DatabaseName string `json:"database_name"`
TotalRows uint64 `json:"total_rows"`
// NOTE(review): presumably keyed by column name — confirm.
ColumnsMetrics map[string]*ColumnMetrics `json:"columns_metrics"`
RowsSample []map[string]interface{} `json:"rows_sample"`
// NOTE(review): presumably elapsed milliseconds, going by the "Ms" suffix — confirm.
ProfilingDurationMs int64 `json:"profiling_duration_ms"`
// NOTE(review): encoding/json encodes an error through its dynamic type;
// values from errors.New / fmt.Errorf have no exported fields and come out
// as {} — confirm the messages are not lost when this struct is serialized.
DbqErrors []error `json:"__dbq_errors"`
}
TableMetrics represents the metrics of a table.
type TaskPool ¶ added in v0.0.5
// TaskPool is an opaque type: it exposes no exported fields.
type TaskPool struct {
// contains filtered or unexported fields
}
type ValidationResult ¶ added in v0.0.7
// ValidationResult is the JSON-serializable outcome of one data quality check.
// ActualResult and Message carry `omitempty` and are left out when empty.
type ValidationResult struct {
// NOTE(review): presumably the ID of the DataQualityCheck that was run — confirm.
CheckID string `json:"check_id"`
Pass bool `json:"pass"`
ActualResult string `json:"actual_result,omitempty"`
Message string `json:"message,omitempty"`
}
ValidationResult represents the result of a data quality check.
type ValidationRule ¶ added in v0.1.0
// ValidationRule is one YAML-mapped rule: a dataset, an optional where clause,
// and the list of checks declared for that dataset.
type ValidationRule struct {
Dataset string `yaml:"dataset"`
Where string `yaml:"where,omitempty"` // optional, applies for all checks
// Checks is read from the "checks" key; each entry is a DataQualityCheck.
Checks []DataQualityCheck `yaml:"checks"`
}
Source Files
¶
Click to show internal directories.
Click to hide internal directories.