Documentation
¶
Index ¶
- Constants
- func ResolveSource(src SourceConfig, cacheDir string) (string, error)
- func Split(records []Record, testFraction float64) (train []Record, test []Record)
- func WriteJSON(path string, value any) error
- func WriteManifest(path string, records []Record) error
- func WriteQASample(path string, records []QASampleRecord) error
- type BuildReport
- type BuildResult
- type Category
- type Config
- type DriftReport
- type GitRunner
- type MeasureCategoryStats
- type MeasureReport
- type MetricSummary
- type QAAnnotation
- type QAAnnotationTemplateStats
- type QACategoryMetrics
- type QAReport
- type QASampleRecord
- type Record
- type SourceConfig
- type SplitSummary
Constants ¶
const (
	SplitTrain = "train"
	SplitTest  = "test"
)
Split constants for deterministic train/test partition labels.
Variables ¶
This section is empty.
Functions ¶
func ResolveSource ¶
func ResolveSource(src SourceConfig, cacheDir string) (string, error)
ResolveSource ensures a source is available locally and returns the local root directory.
func WriteManifest ¶
func WriteManifest(path string, records []Record) error
WriteManifest writes manifest records as JSONL.
func WriteQASample ¶
func WriteQASample(path string, records []QASampleRecord) error
WriteQASample writes qa-sample JSONL records.
Types ¶
type BuildReport ¶
type BuildReport struct {
DatasetVersion string `json:"dataset_version"`
CollectedAt string `json:"collected_at"`
FilesCollected int `json:"files_collected"`
FilesKept int `json:"files_kept"`
FilesDeduped int `json:"files_deduped"`
Taxonomy map[Category]int `json:"taxonomy"`
Split SplitSummary `json:"split"`
Metrics MetricSummary `json:"metrics"`
}
BuildReport summarizes corpus build output.
func ReadBuildReport ¶
func ReadBuildReport(path string) (BuildReport, error)
ReadBuildReport reads a build report JSON document.
type BuildResult ¶
type BuildResult struct {
Manifest []Record
Report BuildReport
QASample []QASampleRecord
}
BuildResult contains build artifacts.
type Config ¶
type Config struct {
DatasetVersion string `yaml:"dataset_version" json:"dataset_version"`
CollectedAt string `yaml:"collected_at" json:"collected_at"`
MinWords int `yaml:"min_words" json:"min_words"`
MinChars int `yaml:"min_chars" json:"min_chars"`
TestFraction float64 `yaml:"test_fraction" json:"test_fraction"`
LicenseAllowlist []string `yaml:"license_allowlist" json:"license_allowlist"`
QASampleLimit int `yaml:"qa_sample_limit" json:"qa_sample_limit"`
Sources []SourceConfig `yaml:"sources" json:"sources"`
ResolvedFromLocal bool `yaml:"-" json:"-"`
Progress func(string) `yaml:"-" json:"-"`
}
Config defines corpus collection settings.
func LoadConfig ¶
LoadConfig loads config from YAML, applies defaults, and merges config.local.yml overrides.
type DriftReport ¶
type DriftReport struct {
BaselineVersion string `json:"baseline_version"`
CandidateVersion string `json:"candidate_version"`
FilesKeptDelta int `json:"files_kept_delta"`
TaxonomyDeltas map[Category]int `json:"taxonomy_deltas"`
MetricDeltas MetricSummary `json:"metric_deltas"`
}
DriftReport summarizes changes between two build reports.
func CompareReports ¶
func CompareReports(baseline BuildReport, candidate BuildReport) *DriftReport
CompareReports compares two build reports and returns a drift report.
type MeasureCategoryStats ¶
type MeasureCategoryStats struct {
Count int `json:"count"`
AvgWords float64 `json:"avg_words"`
AvgChars float64 `json:"avg_chars"`
}
MeasureCategoryStats holds the measurement summary for a single category.
type MeasureReport ¶
type MeasureReport struct {
CorpusPath string `json:"corpus_path"`
Total int `json:"total"`
Categories map[Category]MeasureCategoryStats `json:"categories"`
}
MeasureReport is written by the measure subcommand.
type MetricSummary ¶
type MetricSummary struct {
AvgWords float64 `json:"avg_words"`
AvgChars float64 `json:"avg_chars"`
}
MetricSummary provides build-level aggregate metrics.
type QAAnnotation ¶
type QAAnnotation struct {
RecordID string `json:"record_id"`
ActualCategory Category `json:"actual_category"`
}
QAAnnotation stores one manual annotation.
func ReadQAAnnotationsCSV ¶
func ReadQAAnnotationsCSV(path string) ([]QAAnnotation, error)
ReadQAAnnotationsCSV reads CSV annotations in the form record_id,actual_category.
type QAAnnotationTemplateStats ¶
QAAnnotationTemplateStats reports template generation outcomes.
func WriteQAAnnotationTemplateCSV ¶
func WriteQAAnnotationTemplateCSV(path string, sample []QASampleRecord, existing []QAAnnotation) (QAAnnotationTemplateStats, error)
WriteQAAnnotationTemplateCSV writes an annotation CSV aligned to a QA sample.
type QACategoryMetrics ¶
type QACategoryMetrics struct {
Precision float64 `json:"precision"`
Recall float64 `json:"recall"`
F1 float64 `json:"f1"`
Support int `json:"support"`
}
QACategoryMetrics provides one-vs-rest precision, recall, F1, and support for a single category.
type QAReport ¶
type QAReport struct {
Total int `json:"total"`
Annotated int `json:"annotated"`
Coverage float64 `json:"coverage"`
Accuracy float64 `json:"accuracy"`
Kappa *float64 `json:"kappa,omitempty"`
Categories map[Category]QACategoryMetrics `json:"categories"`
}
QAReport summarizes annotation quality.
func EvaluateQA ¶
func EvaluateQA(sample []QASampleRecord, annotations []QAAnnotation) (*QAReport, error)
EvaluateQA compares predicted labels with manual annotations.
type QASampleRecord ¶
type QASampleRecord struct {
RecordID string `json:"record_id"`
PredictedCategory Category `json:"predicted_category"`
Source string `json:"source"`
Path string `json:"path"`
}
QASampleRecord is a row in the annotation sample.
func ReadQASample ¶
func ReadQASample(path string) ([]QASampleRecord, error)
ReadQASample reads qa-sample JSONL records.
type Record ¶
type Record struct {
RecordID string `json:"record_id"`
Source string `json:"source"`
Repository string `json:"repository,omitempty"`
CommitSHA string `json:"commit_sha,omitempty"`
License string `json:"license,omitempty"`
Path string `json:"path"`
Category Category `json:"category"`
Split string `json:"split,omitempty"`
Words int `json:"words"`
Chars int `json:"chars"`
ContentSHA256 string `json:"content_sha256,omitempty"`
CollectedAt string `json:"collected_at,omitempty"`
SourceResolved string `json:"-"`
RawContent string `json:"-"`
}
Record describes one collected markdown file and its metadata.
type SourceConfig ¶
type SourceConfig struct {
Name string `yaml:"name" json:"name"`
Repository string `yaml:"repository" json:"repository"`
Root string `yaml:"root" json:"root"`
CommitSHA string `yaml:"commit_sha" json:"commit_sha"`
License string `yaml:"license" json:"license"`
Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"`
}
SourceConfig defines one configured source repository.
type SplitSummary ¶
SplitSummary holds train/test counts.