semantic

package
v0.26.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 22, 2026 License: Apache-2.0 Imports: 9 Imported by: 0

Documentation

Overview

Package semantic provides semantic layer abstractions, including abstractions for semantic metadata providers.

Index

Constants

View Source
// MaxStringLength is the maximum length for sanitized strings.
// SanitizeConfig.MaxLength documents the same value (2000) as its default.
// NOTE(review): whether the limit counts bytes or runes is not visible here — confirm.
const MaxStringLength = 2000

MaxStringLength is the maximum length for sanitized strings.

Variables

View Source
// DefaultInjectionLogger is the default logger for injection attempts.
// It is initialized with the standard library's log.Printf as its log
// function; InjectionLogger.SetLogFunc sets a different one.
var DefaultInjectionLogger = &InjectionLogger{
	logFunc: log.Printf,
}

DefaultInjectionLogger is the default logger for injection attempts.

Functions

This section is empty.

Types

type CacheConfig

// CacheConfig configures the cache. It is passed to NewCachedProvider.
//
// TTL is presumably how long a cached entry stays valid; the expiry logic
// is not visible here — TODO confirm, including how a zero TTL is treated.
type CacheConfig struct {
	TTL time.Duration
}

CacheConfig configures the cache.

type CachedProvider

// CachedProvider wraps a Provider with caching. Create one with
// NewCachedProvider. SearchTables is documented as bypassing the cache
// because queries vary too much; Invalidate clears the cache.
type CachedProvider struct {
	// contains filtered or unexported fields
}

CachedProvider wraps a Provider with caching.

func NewCachedProvider

func NewCachedProvider(provider Provider, cfg CacheConfig) *CachedProvider

NewCachedProvider creates a caching wrapper around a provider.

func (*CachedProvider) Close

func (c *CachedProvider) Close() error

Close closes the underlying provider.

func (*CachedProvider) GetColumnContext

func (c *CachedProvider) GetColumnContext(ctx context.Context, column ColumnIdentifier) (*ColumnContext, error)

GetColumnContext retrieves column context with caching.

func (*CachedProvider) GetColumnsContext

func (c *CachedProvider) GetColumnsContext(ctx context.Context, table TableIdentifier) (map[string]*ColumnContext, error)

GetColumnsContext retrieves columns context with caching.

func (*CachedProvider) GetCuratedQueryCount added in v0.25.0

func (c *CachedProvider) GetCuratedQueryCount(ctx context.Context, urn string) (int, error)

GetCuratedQueryCount retrieves curated query count with caching.

func (*CachedProvider) GetGlossaryTerm

func (c *CachedProvider) GetGlossaryTerm(ctx context.Context, urn string) (*GlossaryTerm, error)

GetGlossaryTerm retrieves a glossary term with caching.

func (*CachedProvider) GetLineage

func (c *CachedProvider) GetLineage(ctx context.Context, table TableIdentifier, direction LineageDirection, maxDepth int) (*LineageInfo, error)

GetLineage retrieves lineage with caching.

func (*CachedProvider) GetTableContext

func (c *CachedProvider) GetTableContext(ctx context.Context, table TableIdentifier) (*TableContext, error)

GetTableContext retrieves table context with caching.

func (*CachedProvider) Invalidate

func (c *CachedProvider) Invalidate()

Invalidate clears the cache.

func (*CachedProvider) Name

func (c *CachedProvider) Name() string

Name returns the underlying provider name.

func (*CachedProvider) SearchTables

func (c *CachedProvider) SearchTables(ctx context.Context, filter SearchFilter) ([]TableSearchResult, error)

SearchTables searches without caching (queries vary too much).

type ColumnContext

// ColumnContext provides semantic context for a column.
//
// Every field except Name is tagged omitempty, so empty strings, empty
// slices, false flags, and a nil InheritedFrom are left out of the JSON
// encoding. HasContent reports whether any of those optional fields carry
// metadata.
type ColumnContext struct {
	// Basic info
	Name        string `json:"name"`
	Description string `json:"description,omitempty"`

	// Classification
	Tags          []string       `json:"tags,omitempty"`
	GlossaryTerms []GlossaryTerm `json:"glossary_terms,omitempty"`

	// Sensitivity
	IsPII       bool `json:"is_pii,omitempty"`
	IsSensitive bool `json:"is_sensitive,omitempty"`

	// Business metadata
	BusinessName string `json:"business_name,omitempty"`

	// InheritedFrom is set when metadata was inherited from upstream lineage.
	InheritedFrom *InheritedMetadata `json:"inherited_from,omitempty"`
}

ColumnContext provides semantic context for a column.

func (*ColumnContext) HasContent added in v0.24.0

func (c *ColumnContext) HasContent() bool

HasContent reports whether the column has any meaningful metadata worth including in enrichment responses. Columns with no description, tags, glossary terms, sensitivity flags, business name, or inherited metadata are considered empty and can be omitted to save tokens.

type ColumnIdentifier

// ColumnIdentifier uniquely identifies a column.
//
// TableIdentifier is embedded without a JSON tag, so its catalog, schema,
// and table keys are encoded at the same level as "column". String returns
// a dot-separated representation including the column.
type ColumnIdentifier struct {
	TableIdentifier
	Column string `json:"column"`
}

ColumnIdentifier uniquely identifies a column.

func (ColumnIdentifier) String

func (c ColumnIdentifier) String() string

String returns a dot-separated representation including the column.

type Deprecation

// Deprecation indicates if an entity is deprecated.
//
// Deprecated has no omitempty and is therefore always present in the JSON
// encoding. Note, Actor, and DecommDate (encoded as "decommission_date")
// are omitted when empty or nil.
type Deprecation struct {
	Deprecated bool       `json:"deprecated"`
	Note       string     `json:"note,omitempty"`
	Actor      string     `json:"actor,omitempty"`
	DecommDate *time.Time `json:"decommission_date,omitempty"`
}

Deprecation indicates if an entity is deprecated.

type Domain

// Domain represents a data domain.
//
// URN and Name are always encoded; Description is omitted from JSON when
// empty.
type Domain struct {
	URN         string `json:"urn"`
	Name        string `json:"name"`
	Description string `json:"description,omitempty"`
}

Domain represents a data domain.

type GlossaryTerm

// GlossaryTerm represents a business glossary term.
//
// URN and Name are always encoded; Description is omitted from JSON when
// empty. Provider.GetGlossaryTerm retrieves a term by URN.
type GlossaryTerm struct {
	URN         string `json:"urn"`
	Name        string `json:"name"`
	Description string `json:"description,omitempty"`
}

GlossaryTerm represents a business glossary term.

type InheritedMetadata added in v0.8.0

// InheritedMetadata tracks the provenance of inherited column metadata.
// It is referenced from ColumnContext.InheritedFrom. None of its fields use
// omitempty, so all four keys are always present in the JSON encoding.
type InheritedMetadata struct {
	// SourceURN is the DataHub URN of the upstream dataset.
	SourceURN string `json:"source_urn"`

	// SourceColumn is the column name in the upstream dataset.
	SourceColumn string `json:"source_column"`

	// Hops is the distance from the target dataset (1 = direct upstream).
	Hops int `json:"hops"`

	// MatchMethod indicates how the column was matched.
	// Values: "column_lineage", "name_exact", "name_transformed", "alias"
	MatchMethod string `json:"match_method"`
}

InheritedMetadata tracks the provenance of inherited column metadata.

type InjectionLogger added in v0.2.0

// InjectionLogger logs detected prompt injection attempts.
// Logging is toggled with Enable and Disable, and the output function is
// set with SetLogFunc.
// NOTE(review): the initial enabled/disabled state of a zero value is not
// visible here — confirm.
type InjectionLogger struct {
	// contains filtered or unexported fields
}

InjectionLogger logs detected prompt injection attempts.

func (*InjectionLogger) DetectAndLog added in v0.2.0

func (l *InjectionLogger) DetectAndLog(sanitizer *Sanitizer, source, field, input string) bool

DetectAndLog detects injection patterns in the input and logs if found. Returns true if injection was detected.

func (*InjectionLogger) Disable added in v0.2.0

func (l *InjectionLogger) Disable()

Disable disables injection logging.

func (*InjectionLogger) Enable added in v0.2.0

func (l *InjectionLogger) Enable()

Enable enables injection logging.

func (*InjectionLogger) LogInjectionAttempt added in v0.2.0

func (l *InjectionLogger) LogInjectionAttempt(source, field string, patterns []string)

LogInjectionAttempt logs a detected injection attempt.

func (*InjectionLogger) SetLogFunc added in v0.2.0

func (l *InjectionLogger) SetLogFunc(f func(format string, args ...any))

SetLogFunc sets the logging function for injection attempts.

type LineageDirection

// LineageDirection indicates the direction of lineage traversal.
// The declared values are LineageUpstream and LineageDownstream.
type LineageDirection string

LineageDirection indicates the direction of lineage traversal.

// Lineage direction constants. The string values "upstream" and
// "downstream" are what a LineageDirection carries when encoded.
const (
	LineageUpstream   LineageDirection = "upstream"
	LineageDownstream LineageDirection = "downstream"
)

Lineage direction constants.

type LineageEdge

// LineageEdge represents an edge in the lineage graph.
// LineageEntity uses it for both its Parents and Children lists.
//
// URN is always encoded; Type and TransformLogic are omitted from JSON when
// empty.
type LineageEdge struct {
	URN            string `json:"urn"`
	Type           string `json:"type,omitempty"`
	TransformLogic string `json:"transform_logic,omitempty"`
}

LineageEdge represents an edge in the lineage graph.

type LineageEntity

// LineageEntity represents an entity in a lineage graph.
//
// URN, Type, Name, and Depth are always encoded (a Depth of 0 is emitted,
// not dropped). Platform, Parents, Children, and Context are omitted from
// JSON when empty or nil.
// NOTE(review): what Depth is measured from is not visible here — confirm
// against the provider implementation.
type LineageEntity struct {
	URN      string        `json:"urn"`
	Type     string        `json:"type"`
	Name     string        `json:"name"`
	Platform string        `json:"platform,omitempty"`
	Depth    int           `json:"depth"`
	Parents  []LineageEdge `json:"parents,omitempty"`
	Children []LineageEdge `json:"children,omitempty"`
	Context  *TableContext `json:"context,omitempty"`
}

LineageEntity represents an entity in a lineage graph.

type LineageInfo

// LineageInfo contains lineage information for an entity.
// It is the result type of Provider.GetLineage.
//
// No field uses omitempty, so a nil Entities slice is encoded as JSON null
// rather than being dropped or written as [].
type LineageInfo struct {
	Direction LineageDirection `json:"direction"`
	Entities  []LineageEntity  `json:"entities"`
	MaxDepth  int              `json:"max_depth"`
}

LineageInfo contains lineage information for an entity.

type NoopProvider

// NoopProvider is a no-op implementation for testing.
// It has no fields; its methods are documented as returning empty results.
type NoopProvider struct{}

NoopProvider is a no-op implementation for testing.

func NewNoopProvider

func NewNoopProvider() *NoopProvider

NewNoopProvider creates a new no-op provider.

func (*NoopProvider) Close

func (*NoopProvider) Close() error

Close does nothing.

func (*NoopProvider) GetColumnContext

func (*NoopProvider) GetColumnContext(_ context.Context, _ ColumnIdentifier) (*ColumnContext, error)

GetColumnContext returns empty context.

func (*NoopProvider) GetColumnsContext

func (*NoopProvider) GetColumnsContext(_ context.Context, _ TableIdentifier) (map[string]*ColumnContext, error)

GetColumnsContext returns empty map.

func (*NoopProvider) GetCuratedQueryCount added in v0.25.0

func (*NoopProvider) GetCuratedQueryCount(_ context.Context, _ string) (int, error)

GetCuratedQueryCount returns zero for the noop provider.

func (*NoopProvider) GetGlossaryTerm

func (*NoopProvider) GetGlossaryTerm(_ context.Context, _ string) (*GlossaryTerm, error)

GetGlossaryTerm returns an empty term.

func (*NoopProvider) GetLineage

func (*NoopProvider) GetLineage(_ context.Context, _ TableIdentifier, dir LineageDirection, maxDepth int) (*LineageInfo, error)

GetLineage returns empty lineage.

func (*NoopProvider) GetTableContext

func (*NoopProvider) GetTableContext(_ context.Context, _ TableIdentifier) (*TableContext, error)

GetTableContext returns empty context.

func (*NoopProvider) Name

func (*NoopProvider) Name() string

Name returns the provider name.

func (*NoopProvider) SearchTables

func (*NoopProvider) SearchTables(_ context.Context, _ SearchFilter) ([]TableSearchResult, error)

SearchTables returns empty results.

type Owner

// Owner represents a data owner.
//
// URN and Type are always encoded; Name and Email are omitted from JSON
// when empty. Type is one of the OwnerType constants.
type Owner struct {
	URN   string    `json:"urn"`
	Type  OwnerType `json:"type"`
	Name  string    `json:"name,omitempty"`
	Email string    `json:"email,omitempty"`
}

Owner represents a data owner.

type OwnerType

// OwnerType indicates the type of owner.
// The declared values are OwnerTypeUser and OwnerTypeGroup.
type OwnerType string

OwnerType indicates the type of owner.

// Owner type constants. The string values "user" and "group" are what an
// OwnerType carries when encoded.
const (
	OwnerTypeUser  OwnerType = "user"
	OwnerTypeGroup OwnerType = "group"
)

Owner type constants.

type Provider

// Provider retrieves semantic metadata from catalog systems.
// DataHub implements this. Future alternatives (Atlas, Unity Catalog) can too.
// Within this package, CachedProvider and NoopProvider expose the same
// method set.
type Provider interface {
	// Name returns the provider name.
	Name() string

	// GetTableContext retrieves semantic context for a table.
	GetTableContext(ctx context.Context, table TableIdentifier) (*TableContext, error)

	// GetColumnContext retrieves semantic context for a single column.
	GetColumnContext(ctx context.Context, column ColumnIdentifier) (*ColumnContext, error)

	// GetColumnsContext retrieves semantic context for all columns of a table.
	GetColumnsContext(ctx context.Context, table TableIdentifier) (map[string]*ColumnContext, error)

	// GetLineage retrieves lineage information for a table.
	GetLineage(ctx context.Context, table TableIdentifier, direction LineageDirection, maxDepth int) (*LineageInfo, error)

	// GetGlossaryTerm retrieves a glossary term by URN.
	GetGlossaryTerm(ctx context.Context, urn string) (*GlossaryTerm, error)

	// SearchTables searches for tables matching the filter.
	SearchTables(ctx context.Context, filter SearchFilter) ([]TableSearchResult, error)

	// GetCuratedQueryCount returns the number of curated/saved queries for a dataset.
	GetCuratedQueryCount(ctx context.Context, urn string) (int, error)

	// Close releases resources.
	Close() error
}

Provider retrieves semantic metadata from catalog systems. DataHub implements this. Future alternatives (Atlas, Unity Catalog) can too.

type SanitizeConfig added in v0.2.0

// SanitizeConfig configures sanitization behavior. It is passed to
// NewSanitizer; DefaultSanitizeConfig returns a safe default configuration.
type SanitizeConfig struct {
	// MaxLength is the maximum length for strings (default: 2000).
	MaxLength int

	// StripInjectionPatterns removes detected injection patterns instead of flagging.
	StripInjectionPatterns bool

	// LogInjectionAttempts enables logging of detected injection attempts.
	LogInjectionAttempts bool
}

SanitizeConfig configures sanitization behavior.

func DefaultSanitizeConfig added in v0.2.0

func DefaultSanitizeConfig() SanitizeConfig

DefaultSanitizeConfig returns a safe default configuration.

type Sanitizer added in v0.2.0

// Sanitizer sanitizes metadata strings to prevent prompt injection and
// other attacks. Create one with NewSanitizer and a SanitizeConfig.
type Sanitizer struct {
	// contains filtered or unexported fields
}

Sanitizer sanitizes metadata strings to prevent prompt injection and other attacks.

func NewSanitizer added in v0.2.0

func NewSanitizer(cfg SanitizeConfig) *Sanitizer

NewSanitizer creates a new sanitizer with the given configuration.

func (*Sanitizer) DetectInjection added in v0.2.0

func (*Sanitizer) DetectInjection(input string) (detected bool, patterns []string)

DetectInjection checks if the input contains potential prompt injection patterns. Returns true if injection is detected along with matched patterns.

func (*Sanitizer) SanitizeColumnContext added in v0.2.0

func (s *Sanitizer) SanitizeColumnContext(cc *ColumnContext) *ColumnContext

SanitizeColumnContext sanitizes all string fields in a ColumnContext.

func (*Sanitizer) SanitizeDescription added in v0.2.0

func (s *Sanitizer) SanitizeDescription(desc string) string

SanitizeDescription sanitizes a description field.

func (*Sanitizer) SanitizeString added in v0.2.0

func (s *Sanitizer) SanitizeString(input string) string

SanitizeString sanitizes a string by removing control characters, truncating to max length, and optionally stripping injection patterns.

func (*Sanitizer) SanitizeTableContext added in v0.2.0

func (s *Sanitizer) SanitizeTableContext(tc *TableContext) *TableContext

SanitizeTableContext sanitizes all string fields in a TableContext.

func (*Sanitizer) SanitizeTag added in v0.2.0

func (*Sanitizer) SanitizeTag(tag string) string

SanitizeTag validates and sanitizes a tag name. Returns empty string if the tag is invalid.

func (*Sanitizer) SanitizeTags added in v0.2.0

func (s *Sanitizer) SanitizeTags(tags []string) []string

SanitizeTags sanitizes a slice of tags, removing invalid ones.

type SearchFilter

// SearchFilter defines criteria for searching tables.
// It is the filter argument of Provider.SearchTables.
//
// Query is always encoded; every other field is omitted from JSON when it
// holds its zero value.
// NOTE(review): Limit and Offset look like pagination controls, but their
// defaults and bounds are not visible here — confirm against the provider.
type SearchFilter struct {
	Query    string   `json:"query"`
	Platform string   `json:"platform,omitempty"`
	Tags     []string `json:"tags,omitempty"`
	Domain   string   `json:"domain,omitempty"`
	Owner    string   `json:"owner,omitempty"`
	Limit    int      `json:"limit,omitempty"`
	Offset   int      `json:"offset,omitempty"`
}

SearchFilter defines criteria for searching tables.

type TableContext

// TableContext provides semantic context for a table.
//
// Every field is tagged omitempty, so a zero-valued TableContext encodes to
// an empty JSON object. Domain, Deprecation, QualityScore, and LastModified
// are pointers and are omitted when nil.
// NOTE(review): the range and scale of QualityScore are not visible here —
// confirm against the provider that populates it.
type TableContext struct {
	// Basic info
	URN         string `json:"urn,omitempty"`
	Description string `json:"description,omitempty"`

	// Ownership
	Owners []Owner `json:"owners,omitempty"`

	// Classification
	Tags          []string       `json:"tags,omitempty"`
	GlossaryTerms []GlossaryTerm `json:"glossary_terms,omitempty"`
	Domain        *Domain        `json:"domain,omitempty"`

	// Status
	Deprecation *Deprecation `json:"deprecation,omitempty"`

	// Quality
	QualityScore *float64 `json:"quality_score,omitempty"`

	// Metadata
	CustomProperties map[string]string `json:"custom_properties,omitempty"`
	LastModified     *time.Time        `json:"last_modified,omitempty"`
}

TableContext provides semantic context for a table.

type TableIdentifier

// TableIdentifier uniquely identifies a table.
//
// Catalog is omitted from JSON when empty; Schema and Table are always
// encoded. String returns a dot-separated representation.
type TableIdentifier struct {
	Catalog string `json:"catalog,omitempty"`
	Schema  string `json:"schema"`
	Table   string `json:"table"`
}

TableIdentifier uniquely identifies a table.

func (TableIdentifier) String

func (t TableIdentifier) String() string

String returns a dot-separated representation.

type TableSearchResult

// TableSearchResult represents a search result.
// It is the element type returned by Provider.SearchTables.
//
// URN and Name are always encoded; the remaining fields are omitted from
// JSON when empty.
// NOTE(review): the set of values MatchedField can take is not visible
// here — confirm against the provider implementation.
type TableSearchResult struct {
	URN          string   `json:"urn"`
	Name         string   `json:"name"`
	Platform     string   `json:"platform,omitempty"`
	Description  string   `json:"description,omitempty"`
	Tags         []string `json:"tags,omitempty"`
	Domain       string   `json:"domain,omitempty"`
	MatchedField string   `json:"matched_field,omitempty"`
}

TableSearchResult represents a search result.

type URNResolver

// URNResolver can resolve URNs to table identifiers.
// ResolveURN and BuildURN convert in opposite directions.
// NOTE(review): whether the two are exact inverses is not visible here —
// confirm against the implementation.
type URNResolver interface {
	// ResolveURN converts a URN to a table identifier.
	ResolveURN(ctx context.Context, urn string) (*TableIdentifier, error)

	// BuildURN creates a URN from a table identifier.
	BuildURN(ctx context.Context, table TableIdentifier) (string, error)
}

URNResolver can resolve URNs to table identifiers.

Directories

Path Synopsis
datahub: Package datahub provides a DataHub implementation of the semantic provider.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL