corpus

package

Version: v0.23.0 (latest) | Published: May 21, 2026 | License: MIT | Imports: 17 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/jeduden/mdsmith

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
func ResolveSource(src SourceConfig, cacheDir string) (string, error)
func Split(records []Record, testFraction float64) (train []Record, test []Record)
func WriteJSON(path string, value any) error
func WriteManifest(path string, records []Record) error
func WriteQASample(path string, records []QASampleRecord) error
type BuildReport
- func ReadBuildReport(path string) (BuildReport, error)
type BuildResult
- func Build(cfg *Config, cacheDir string) (*BuildResult, error)
type Category
type Config
- func LoadConfig(path string) (*Config, error)
type DriftReport
- func CompareReports(baseline BuildReport, candidate BuildReport) *DriftReport
type GitRunner
type MeasureCategoryStats
type MeasureReport
type MetricSummary
type QAAnnotation
- func ReadQAAnnotationsCSV(path string) ([]QAAnnotation, error)
type QAAnnotationTemplateStats
- func WriteQAAnnotationTemplateCSV(path string, sample []QASampleRecord, existing []QAAnnotation) (QAAnnotationTemplateStats, error)
type QACategoryMetrics
type QAReport
- func EvaluateQA(sample []QASampleRecord, annotations []QAAnnotation) (*QAReport, error)
type QASampleRecord
- func ReadQASample(path string) ([]QASampleRecord, error)
type Record
- func Classify(records []Record) []Record
- func Collect(cfg *Config, cacheDir string) ([]Record, error)
- func Dedup(records []Record) []Record
type SourceConfig
type SplitSummary

Constants ¶

View Source

const (
	SplitTrain = "train"
	SplitTest  = "test"
)

SplitTrain and SplitTest are the partition labels used by the deterministic train/test split.

Variables ¶

This section is empty.

Functions ¶

func ResolveSource ¶

func ResolveSource(src SourceConfig, cacheDir string) (string, error)

ResolveSource ensures a source is available locally and returns the local root directory.

func Split ¶

func Split(records []Record, testFraction float64) (train []Record, test []Record)

Split deterministically partitions records into train and test sets.

func WriteJSON ¶

func WriteJSON(path string, value any) error

WriteJSON writes an indented JSON document.

func WriteManifest ¶

func WriteManifest(path string, records []Record) error

WriteManifest writes manifest records as JSONL.

func WriteQASample ¶

func WriteQASample(path string, records []QASampleRecord) error

WriteQASample writes qa-sample JSONL records.

Types ¶

type BuildReport ¶

type BuildReport struct {
	DatasetVersion string           `json:"dataset_version"`
	CollectedAt    string           `json:"collected_at"`
	FilesCollected int              `json:"files_collected"`
	FilesKept      int              `json:"files_kept"`
	FilesDeduped   int              `json:"files_deduped"`
	Taxonomy       map[Category]int `json:"taxonomy"`
	Split          SplitSummary     `json:"split"`
	Metrics        MetricSummary    `json:"metrics"`
}

BuildReport summarizes corpus build output.

func ReadBuildReport ¶

func ReadBuildReport(path string) (BuildReport, error)

ReadBuildReport reads a build report JSON document.

func Build ¶

func Build(cfg *Config, cacheDir string) (*BuildResult, error)

Build runs collect -> dedup -> classify -> split and produces reports.

type Category ¶

type Category string

Category is the taxonomy label for a corpus record.

const (
	CategoryReference Category = "reference"
	CategoryOther     Category = "other"
)

Category constants for corpus taxonomy labels.

type Config ¶

type Config struct {
	DatasetVersion    string         `yaml:"dataset_version" json:"dataset_version"`
	CollectedAt       string         `yaml:"collected_at" json:"collected_at"`
	MinWords          int            `yaml:"min_words" json:"min_words"`
	MinChars          int            `yaml:"min_chars" json:"min_chars"`
	TestFraction      float64        `yaml:"test_fraction" json:"test_fraction"`
	LicenseAllowlist  []string       `yaml:"license_allowlist" json:"license_allowlist"`
	QASampleLimit     int            `yaml:"qa_sample_limit" json:"qa_sample_limit"`
	Sources           []SourceConfig `yaml:"sources" json:"sources"`
	ResolvedFromLocal bool           `yaml:"-" json:"-"`
	Progress          func(string)   `yaml:"-" json:"-"`
}

Config defines corpus collection settings.

func LoadConfig ¶

func LoadConfig(path string) (*Config, error)

LoadConfig loads config from YAML, applies defaults, and merges config.local.yml overrides.

type DriftReport ¶

type DriftReport struct {
	BaselineVersion  string           `json:"baseline_version"`
	CandidateVersion string           `json:"candidate_version"`
	FilesKeptDelta   int              `json:"files_kept_delta"`
	TaxonomyDeltas   map[Category]int `json:"taxonomy_deltas"`
	MetricDeltas     MetricSummary    `json:"metric_deltas"`
}

DriftReport summarizes changes between two build reports.

func CompareReports ¶

func CompareReports(baseline BuildReport, candidate BuildReport) *DriftReport

CompareReports compares two build reports and returns a drift report.

type MeasureCategoryStats ¶

type MeasureCategoryStats struct {
	Count    int     `json:"count"`
	AvgWords float64 `json:"avg_words"`
	AvgChars float64 `json:"avg_chars"`
}

MeasureCategoryStats holds the measurement summary for a single category: the record count and the average word and character counts.

type MeasureReport ¶

type MeasureReport struct {
	CorpusPath string                            `json:"corpus_path"`
	Total      int                               `json:"total"`
	Categories map[Category]MeasureCategoryStats `json:"categories"`
}

MeasureReport is written by the measure subcommand.

type MetricSummary ¶

type MetricSummary struct {
	AvgWords float64 `json:"avg_words"`
	AvgChars float64 `json:"avg_chars"`
}

MetricSummary provides build-level aggregate metrics.

type QAAnnotation ¶

type QAAnnotation struct {
	RecordID       string   `json:"record_id"`
	ActualCategory Category `json:"actual_category"`
}

QAAnnotation stores one manual annotation.

func ReadQAAnnotationsCSV ¶

func ReadQAAnnotationsCSV(path string) ([]QAAnnotation, error)

ReadQAAnnotationsCSV reads CSV annotations in the form record_id,actual_category.

type QAAnnotationTemplateStats ¶

type QAAnnotationTemplateStats struct {
	Total     int `json:"total"`
	Preserved int `json:"preserved"`
}

QAAnnotationTemplateStats reports template generation outcomes.

func WriteQAAnnotationTemplateCSV ¶

func WriteQAAnnotationTemplateCSV(
	path string,
	sample []QASampleRecord,
	existing []QAAnnotation,
) (QAAnnotationTemplateStats, error)

WriteQAAnnotationTemplateCSV writes an annotation CSV aligned to a QA sample.

type QACategoryMetrics ¶

type QACategoryMetrics struct {
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
	Support   int     `json:"support"`
}

QACategoryMetrics provides one-vs-rest precision, recall, F1, and support for a single category.

func EvaluateQA ¶

func EvaluateQA(sample []QASampleRecord, annotations []QAAnnotation) (*QAReport, error)

EvaluateQA compares predicted labels with manual annotations.

type QASampleRecord ¶

type QASampleRecord struct {
	RecordID          string   `json:"record_id"`
	PredictedCategory Category `json:"predicted_category"`
	Source            string   `json:"source"`
	Path              string   `json:"path"`
}

QASampleRecord is a row in the annotation sample.

func ReadQASample ¶

func ReadQASample(path string) ([]QASampleRecord, error)

ReadQASample reads qa-sample JSONL records.

type Record ¶

type Record struct {
	RecordID       string   `json:"record_id"`
	Source         string   `json:"source"`
	Repository     string   `json:"repository,omitempty"`
	CommitSHA      string   `json:"commit_sha,omitempty"`
	License        string   `json:"license,omitempty"`
	Path           string   `json:"path"`
	Category       Category `json:"category"`
	Split          string   `json:"split,omitempty"`
	Words          int      `json:"words"`
	Chars          int      `json:"chars"`
	ContentSHA256  string   `json:"content_sha256,omitempty"`
	CollectedAt    string   `json:"collected_at,omitempty"`
	SourceResolved string   `json:"-"`
	RawContent     string   `json:"-"`
}

Record describes one collected Markdown document and its metadata.

func Classify ¶

func Classify(records []Record) []Record

Classify assigns taxonomy categories to records.

func Collect ¶

func Collect(cfg *Config, cacheDir string) ([]Record, error)

Collect gathers markdown records from configured sources.

func Dedup ¶

func Dedup(records []Record) []Record

Dedup removes exact duplicates by content hash, keeping the first occurrence of each. If a record already has ContentSHA256 set, that value is used as-is; callers that set ContentSHA256 themselves must therefore supply a normalized hash.

type SourceConfig ¶

type SourceConfig struct {
	Name        string            `yaml:"name" json:"name"`
	Repository  string            `yaml:"repository" json:"repository"`
	Root        string            `yaml:"root" json:"root"`
	CommitSHA   string            `yaml:"commit_sha" json:"commit_sha"`
	License     string            `yaml:"license" json:"license"`
	Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"`
}

SourceConfig defines one configured source repository.

type SplitSummary ¶

type SplitSummary struct {
	Train int `json:"train"`
	Test  int `json:"test"`
}

SplitSummary holds train/test counts.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL