corpus

package
v0.23.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 21, 2026 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

View Source
// Labels for the two sides of the train/test partition.
const (
	// SplitTrain is the label for the training partition.
	SplitTrain = "train"
	// SplitTest is the label for the test partition.
	SplitTest  = "test"
)

SplitTrain and SplitTest are the labels for the two sides of the deterministic train/test partition.

Variables

This section is empty.

Functions

func ResolveSource

func ResolveSource(src SourceConfig, cacheDir string) (string, error)

ResolveSource ensures a source is available locally and returns the local root directory.

func Split

func Split(records []Record, testFraction float64) (train []Record, test []Record)

Split deterministically partitions records into train and test sets.

func WriteJSON

func WriteJSON(path string, value any) error

WriteJSON writes an indented JSON document.

func WriteManifest

func WriteManifest(path string, records []Record) error

WriteManifest writes manifest records as JSONL.

func WriteQASample

func WriteQASample(path string, records []QASampleRecord) error

WriteQASample writes qa-sample JSONL records.

Types

type BuildReport

// BuildReport summarizes corpus build output.
// Every field is serialized to JSON under the snake_case key given in its tag.
type BuildReport struct {
	// DatasetVersion and CollectedAt use the same JSON keys as the
	// identically named Config fields; presumably copied from the
	// Config used for the build — TODO confirm.
	DatasetVersion string           `json:"dataset_version"`
	CollectedAt    string           `json:"collected_at"`
	// File counts. NOTE(review): presumably collected = before filtering,
	// kept = retained after the pipeline, deduped = removed as duplicates;
	// confirm against Build.
	FilesCollected int              `json:"files_collected"`
	FilesKept      int              `json:"files_kept"`
	FilesDeduped   int              `json:"files_deduped"`
	// Taxonomy maps each Category to an integer count.
	Taxonomy       map[Category]int `json:"taxonomy"`
	// Split holds the train/test counts.
	Split          SplitSummary     `json:"split"`
	// Metrics holds the build-level aggregate metrics.
	Metrics        MetricSummary    `json:"metrics"`
}

BuildReport summarizes corpus build output.

func ReadBuildReport

func ReadBuildReport(path string) (BuildReport, error)

ReadBuildReport reads a build report JSON document.

type BuildResult

// BuildResult contains build artifacts.
// The fields carry no struct tags.
type BuildResult struct {
	// Manifest is a slice of Record, the same element type WriteManifest accepts.
	Manifest []Record
	// Report is the build summary.
	Report   BuildReport
	// QASample is a slice of QASampleRecord, the same element type WriteQASample accepts.
	QASample []QASampleRecord
}

BuildResult contains build artifacts.

func Build

func Build(cfg *Config, cacheDir string) (*BuildResult, error)

Build runs collect -> dedup -> classify -> split and produces reports.

type Category

// Category is the taxonomy label for a corpus record.
// Its underlying type is string.
type Category string

Category is the taxonomy label for a corpus record.

// Category constants for corpus taxonomy labels.
const (
	// CategoryReference is the Category with underlying value "reference".
	CategoryReference Category = "reference"
	// CategoryOther is the Category with underlying value "other".
	CategoryOther     Category = "other"
)

Category constants are the taxonomy labels for corpus records.

type Config

// Config defines corpus collection settings.
// Fields carry matching yaml and json tags; the last two fields are
// tagged "-" for both formats and are therefore never (de)serialized.
type Config struct {
	DatasetVersion    string         `yaml:"dataset_version" json:"dataset_version"`
	CollectedAt       string         `yaml:"collected_at" json:"collected_at"`
	// NOTE(review): MinWords and MinChars are presumably minimum size
	// thresholds for keeping a record — confirm against Collect.
	MinWords          int            `yaml:"min_words" json:"min_words"`
	MinChars          int            `yaml:"min_chars" json:"min_chars"`
	// TestFraction has the same name and type as Split's testFraction
	// parameter; presumably forwarded to it — TODO confirm.
	TestFraction      float64        `yaml:"test_fraction" json:"test_fraction"`
	// NOTE(review): presumably the set of licenses a source may declare — confirm.
	LicenseAllowlist  []string       `yaml:"license_allowlist" json:"license_allowlist"`
	// NOTE(review): presumably an upper bound on QA sample size — confirm.
	QASampleLimit     int            `yaml:"qa_sample_limit" json:"qa_sample_limit"`
	// Sources lists the configured source repositories.
	Sources           []SourceConfig `yaml:"sources" json:"sources"`
	// ResolvedFromLocal is runtime-only (excluded from YAML and JSON).
	// NOTE(review): presumably set when config.local.yml overrides were
	// merged by LoadConfig — confirm.
	ResolvedFromLocal bool           `yaml:"-" json:"-"`
	// Progress is a runtime-only callback taking a string (excluded from
	// YAML and JSON). NOTE(review): presumably receives progress messages
	// and may be nil — confirm.
	Progress          func(string)   `yaml:"-" json:"-"`
}

Config defines corpus collection settings.

func LoadConfig

func LoadConfig(path string) (*Config, error)

LoadConfig loads config from YAML, applies defaults, and merges config.local.yml overrides.

type DriftReport

// DriftReport summarizes changes between two build reports.
// NOTE(review): deltas are presumably candidate minus baseline — confirm
// the sign convention against CompareReports.
type DriftReport struct {
	// BaselineVersion and CandidateVersion identify the two reports compared.
	BaselineVersion  string           `json:"baseline_version"`
	CandidateVersion string           `json:"candidate_version"`
	// FilesKeptDelta is an integer difference; it shares its stem with
	// BuildReport.FilesKept.
	FilesKeptDelta   int              `json:"files_kept_delta"`
	// TaxonomyDeltas maps each Category to an integer difference.
	TaxonomyDeltas   map[Category]int `json:"taxonomy_deltas"`
	// MetricDeltas reuses MetricSummary to hold differences rather than averages.
	MetricDeltas     MetricSummary    `json:"metric_deltas"`
}

DriftReport summarizes changes between two build reports.

func CompareReports

func CompareReports(baseline BuildReport, candidate BuildReport) *DriftReport

CompareReports compares two build reports and returns a drift report.

type GitRunner

// GitRunner executes git commands.
type GitRunner interface {
	// Run takes a slice of string arguments and returns bytes plus an error.
	// NOTE(review): presumably args are the git subcommand and its flags and
	// the bytes are the command output — confirm against implementations.
	Run(args []string) ([]byte, error)
}

GitRunner executes git commands.

type MeasureCategoryStats

// MeasureCategoryStats holds measurement summary by category.
// It is the value type of MeasureReport.Categories.
type MeasureCategoryStats struct {
	// Count is an integer count. NOTE(review): presumably the number of
	// records in the category — confirm.
	Count    int     `json:"count"`
	// AvgWords and AvgChars are floating-point averages.
	AvgWords float64 `json:"avg_words"`
	AvgChars float64 `json:"avg_chars"`
}

MeasureCategoryStats holds the measurement summary for a single category.

type MeasureReport

// MeasureReport is written by the measure subcommand.
type MeasureReport struct {
	// CorpusPath is a string path. NOTE(review): presumably the corpus
	// location that was measured — confirm.
	CorpusPath string                            `json:"corpus_path"`
	// Total is an integer total. NOTE(review): presumably the record count
	// across all categories — confirm.
	Total      int                               `json:"total"`
	// Categories maps each Category to its MeasureCategoryStats.
	Categories map[Category]MeasureCategoryStats `json:"categories"`
}

MeasureReport is written by the measure subcommand.

type MetricSummary

// MetricSummary provides build-level aggregate metrics.
// It is used both as BuildReport.Metrics and as DriftReport.MetricDeltas.
type MetricSummary struct {
	// AvgWords and AvgChars are floating-point values serialized as
	// "avg_words" and "avg_chars".
	AvgWords float64 `json:"avg_words"`
	AvgChars float64 `json:"avg_chars"`
}

MetricSummary provides build-level aggregate metrics.

type QAAnnotation

// QAAnnotation stores one manual annotation.
// The JSON keys match the CSV column names record_id,actual_category
// documented for ReadQAAnnotationsCSV.
type QAAnnotation struct {
	// RecordID is the record identifier as a string; Record and
	// QASampleRecord have a field with the same name and JSON key.
	RecordID       string   `json:"record_id"`
	// ActualCategory is the manually assigned Category.
	ActualCategory Category `json:"actual_category"`
}

QAAnnotation stores one manual annotation.

func ReadQAAnnotationsCSV

func ReadQAAnnotationsCSV(path string) ([]QAAnnotation, error)

ReadQAAnnotationsCSV reads CSV annotations in the form record_id,actual_category.

type QAAnnotationTemplateStats

// QAAnnotationTemplateStats reports template generation outcomes.
// It is the non-error result of WriteQAAnnotationTemplateCSV.
type QAAnnotationTemplateStats struct {
	// Total is an integer count. NOTE(review): presumably the number of
	// rows written to the template — confirm.
	Total     int `json:"total"`
	// Preserved is an integer count. NOTE(review): presumably the number of
	// rows whose existing annotation was carried over — confirm.
	Preserved int `json:"preserved"`
}

QAAnnotationTemplateStats reports template generation outcomes.

func WriteQAAnnotationTemplateCSV

func WriteQAAnnotationTemplateCSV(
	path string,
	sample []QASampleRecord,
	existing []QAAnnotation,
) (QAAnnotationTemplateStats, error)

WriteQAAnnotationTemplateCSV writes an annotation CSV aligned to a QA sample.

type QACategoryMetrics

// QACategoryMetrics provides one-vs-rest metrics.
// It is the value type of QAReport.Categories, keyed by Category.
type QACategoryMetrics struct {
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
	// Support is an integer count. NOTE(review): presumably the number of
	// annotated records whose actual category is this one — confirm.
	Support   int     `json:"support"`
}

QACategoryMetrics provides one-vs-rest metrics.

type QAReport

// QAReport summarizes annotation quality.
// It is the non-error result of EvaluateQA.
type QAReport struct {
	// Total and Annotated are integer counts. NOTE(review): presumably the
	// sample size and the number of sample rows with an annotation — confirm.
	Total      int                            `json:"total"`
	Annotated  int                            `json:"annotated"`
	// NOTE(review): Coverage is presumably Annotated/Total — confirm.
	Coverage   float64                        `json:"coverage"`
	Accuracy   float64                        `json:"accuracy"`
	// Kappa is a pointer with omitempty, so a nil value is left out of the
	// JSON output entirely rather than written as 0.
	Kappa      *float64                       `json:"kappa,omitempty"`
	// Categories maps each Category to its one-vs-rest metrics.
	Categories map[Category]QACategoryMetrics `json:"categories"`
}

QAReport summarizes how well predicted categories agree with manual annotations.

func EvaluateQA

func EvaluateQA(sample []QASampleRecord, annotations []QAAnnotation) (*QAReport, error)

EvaluateQA compares predicted labels with manual annotations.

type QASampleRecord

// QASampleRecord is a row in the annotation sample.
type QASampleRecord struct {
	// RecordID uses the same JSON key as Record.RecordID and
	// QAAnnotation.RecordID.
	RecordID          string   `json:"record_id"`
	// PredictedCategory is the Category stored for this row; it is the
	// counterpart of QAAnnotation.ActualCategory.
	PredictedCategory Category `json:"predicted_category"`
	// Source and Path use the same names and JSON keys as the
	// corresponding Record fields.
	Source            string   `json:"source"`
	Path              string   `json:"path"`
}

QASampleRecord is a row in the annotation sample.

func ReadQASample

func ReadQASample(path string) ([]QASampleRecord, error)

ReadQASample reads qa-sample JSONL records.

type Record

// Record contains one collected markdown record.
// Fields tagged omitempty are left out of the JSON output when empty;
// fields tagged "-" are never serialized.
type Record struct {
	RecordID       string   `json:"record_id"`
	// Source, Repository, CommitSHA and License share their names with
	// SourceConfig fields. NOTE(review): presumably copied from the
	// originating SourceConfig (Source from its Name) — confirm.
	Source         string   `json:"source"`
	Repository     string   `json:"repository,omitempty"`
	CommitSHA      string   `json:"commit_sha,omitempty"`
	License        string   `json:"license,omitempty"`
	Path           string   `json:"path"`
	// Category is the taxonomy label for this record.
	Category       Category `json:"category"`
	// NOTE(review): Split is presumably SplitTrain or SplitTest — confirm.
	Split          string   `json:"split,omitempty"`
	Words          int      `json:"words"`
	Chars          int      `json:"chars"`
	// ContentSHA256 is the content hash Dedup keys on. Per Dedup's
	// documentation, a value already set here is used as-is, so callers
	// that set it directly must supply a normalized hash.
	ContentSHA256  string   `json:"content_sha256,omitempty"`
	CollectedAt    string   `json:"collected_at,omitempty"`
	// SourceResolved and RawContent are in-memory only (excluded from JSON).
	SourceResolved string   `json:"-"`
	RawContent     string   `json:"-"`
}

Record contains one collected markdown record.

func Classify

func Classify(records []Record) []Record

Classify assigns taxonomy categories to records.

func Collect

func Collect(cfg *Config, cacheDir string) ([]Record, error)

Collect gathers markdown records from configured sources.

func Dedup

func Dedup(records []Record) []Record

Dedup removes exact duplicates by content hash while keeping first occurrence. When ContentSHA256 is already set on a record it is used as-is; callers must provide a normalized hash if setting ContentSHA256 directly.

type SourceConfig

// SourceConfig defines one configured source repository.
// Fields carry matching yaml and json tags.
type SourceConfig struct {
	Name        string            `yaml:"name" json:"name"`
	Repository  string            `yaml:"repository" json:"repository"`
	// NOTE(review): Root is presumably a directory within the source to
	// collect from — confirm against ResolveSource/Collect.
	Root        string            `yaml:"root" json:"root"`
	CommitSHA   string            `yaml:"commit_sha" json:"commit_sha"`
	License     string            `yaml:"license" json:"license"`
	// Annotations is an optional string-to-string map; omitempty drops it
	// from both YAML and JSON output when empty.
	Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"`
}

SourceConfig defines one configured source repository.

type SplitSummary

// SplitSummary holds train/test counts.
// Its JSON keys "train" and "test" equal the SplitTrain and SplitTest values.
type SplitSummary struct {
	// Train is the count for the train partition.
	Train int `json:"train"`
	// Test is the count for the test partition.
	Test  int `json:"test"`
}

SplitSummary holds train/test counts.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL