classifier

package
v0.0.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 6, 2026 | License: MIT | Imports: 12 | Imported by: 0

Documentation

Overview

Package classifier implements form and field type classification.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ElemFeatures

func ElemFeatures(elem *goquery.Selection, form *goquery.Selection) map[string]any

ElemFeatures extracts per-field features for CRF classification.

func GetFormFeatures

func GetFormFeatures(form *goquery.Selection, formType string, fieldElems []*goquery.Selection) []map[string]any

GetFormFeatures extracts CRF feature sequences for a form.

Types

type ClassifyProbaResult

type ClassifyProbaResult struct {
	Form   map[string]float64            `json:"form"`
	Fields map[string]map[string]float64 `json:"fields,omitempty"`
}

ClassifyProbaResult holds probability-based classification results.

type ClassifyResult

type ClassifyResult struct {
	Form   string            `json:"form"`
	Fields map[string]string `json:"fields,omitempty"`
}

ClassifyResult holds the classification result for a form.

type FeaturePipeline

type FeaturePipeline struct {
	Name           string
	Extractor      FormFeatureExtractor
	VecType        string // "dict", "count", "tfidf"
	NgramRange     [2]int
	MinDF          int
	Binary         bool
	Analyzer       string
	StopWords      map[string]bool
	UseEnglishStop bool
}

FeaturePipeline describes a feature extraction + vectorization pipeline.

func DefaultFeaturePipelines

func DefaultFeaturePipelines() []FeaturePipeline

DefaultFeaturePipelines returns the 9 feature extraction pipelines matching Formasaurus's FEATURES list.

type FieldTypeModel

type FieldTypeModel struct {
	CRF *crf.Model
}

FieldTypeModel wraps a CRF model for field type classification.

func TrainFieldType

func TrainFieldType(sequences []crf.TrainingSequence, config crf.TrainerConfig) *FieldTypeModel

TrainFieldType trains a CRF model for field type classification.

func (*FieldTypeModel) Classify

func (m *FieldTypeModel) Classify(form *goquery.Selection, formType string) map[string]string

Classify returns field types for a form given the form type.

func (*FieldTypeModel) ClassifyProba

func (m *FieldTypeModel) ClassifyProba(form *goquery.Selection, formType string) map[string]map[string]float64

ClassifyProba returns field type probabilities for a form.

type FormCSS

type FormCSS struct{}

FormCSS extracts form CSS class and ID.

func (FormCSS) ExtractDict

func (f FormCSS) ExtractDict(form *goquery.Selection) map[string]any

func (FormCSS) ExtractString

func (f FormCSS) ExtractString(form *goquery.Selection) string

func (FormCSS) IsDict

func (f FormCSS) IsDict() bool

type FormElements

type FormElements struct{}

FormElements extracts structural boolean features from a form.

func (FormElements) ExtractDict

func (f FormElements) ExtractDict(form *goquery.Selection) map[string]any

func (FormElements) ExtractString

func (f FormElements) ExtractString(_ *goquery.Selection) string

func (FormElements) IsDict

func (f FormElements) IsDict() bool

type FormFeatureExtractor

type FormFeatureExtractor interface {
	ExtractString(form *goquery.Selection) string
	ExtractDict(form *goquery.Selection) map[string]any
	IsDict() bool
}

FormFeatureExtractor extracts features from a form element.

type FormFieldClassifier

type FormFieldClassifier struct {
	FormModel  *FormTypeModel
	FieldModel *FieldTypeModel
}

FormFieldClassifier detects HTML form and field types.

func LoadClassifier

func LoadClassifier(path string) (*FormFieldClassifier, error)

LoadClassifier loads a FormFieldClassifier from disk.

func (*FormFieldClassifier) Classify

func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult

Classify returns the form type and field types.

func (*FormFieldClassifier) ClassifyProba

func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult

ClassifyProba returns probabilities for form and field types.

func (*FormFieldClassifier) ExtractForms

func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)

ExtractForms extracts and classifies all forms from HTML.

func (*FormFieldClassifier) ExtractFormsFromReader

func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)

ExtractFormsFromReader extracts and classifies forms from a *strings.Reader.

func (*FormFieldClassifier) SaveModel

func (c *FormFieldClassifier) SaveModel(path string) error

SaveModel saves the classifier to disk.

type FormInputCSS

type FormInputCSS struct{}

FormInputCSS extracts CSS of non-hidden inputs.

func (FormInputCSS) ExtractDict

func (f FormInputCSS) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputCSS) ExtractString

func (f FormInputCSS) ExtractString(form *goquery.Selection) string

func (FormInputCSS) IsDict

func (f FormInputCSS) IsDict() bool

type FormInputNames

type FormInputNames struct{}

FormInputNames extracts names of non-hidden inputs.

func (FormInputNames) ExtractDict

func (f FormInputNames) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputNames) ExtractString

func (f FormInputNames) ExtractString(form *goquery.Selection) string

func (FormInputNames) IsDict

func (f FormInputNames) IsDict() bool

type FormInputTitle

type FormInputTitle struct{}

FormInputTitle extracts title attributes of non-hidden inputs.

func (FormInputTitle) ExtractDict

func (f FormInputTitle) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputTitle) ExtractString

func (f FormInputTitle) ExtractString(form *goquery.Selection) string

func (FormInputTitle) IsDict

func (f FormInputTitle) IsDict() bool

type FormLabelText

type FormLabelText struct{}

FormLabelText extracts label text inside the form.

func (FormLabelText) ExtractDict

func (f FormLabelText) ExtractDict(form *goquery.Selection) map[string]any

func (FormLabelText) ExtractString

func (f FormLabelText) ExtractString(form *goquery.Selection) string

func (FormLabelText) IsDict

func (f FormLabelText) IsDict() bool

type FormLinksText

type FormLinksText struct{}

FormLinksText extracts link text inside the form.

func (FormLinksText) ExtractDict

func (f FormLinksText) ExtractDict(_ *goquery.Selection) map[string]any

func (FormLinksText) ExtractString

func (f FormLinksText) ExtractString(form *goquery.Selection) string

func (FormLinksText) IsDict

func (f FormLinksText) IsDict() bool

type FormResult

type FormResult struct {
	FormHTML string              `json:"form_html"`
	Result   ClassifyResult      `json:"result,omitempty"`
	Proba    ClassifyProbaResult `json:"proba,omitempty"`
}

FormResult holds the result for a single form.

type FormTypeModel

type FormTypeModel struct {
	Classes   []string             `json:"classes"`
	Coef      [][]float64          `json:"coef"`      // [numClasses][numFeatures]
	Intercept []float64            `json:"intercept"` // [numClasses]
	Pipelines []SerializedPipeline `json:"pipelines"`
	// contains filtered or unexported fields
}

FormTypeModel holds a trained form type classifier.

func TrainFormType

func TrainFormType(forms []*goquery.Selection, labels []string, config FormTypeTrainConfig) *FormTypeModel

TrainFormType trains a form type classifier.

func (*FormTypeModel) Classify

func (m *FormTypeModel) Classify(form *goquery.Selection) string

Classify returns the predicted form type.

func (*FormTypeModel) ClassifyProba

func (m *FormTypeModel) ClassifyProba(form *goquery.Selection) map[string]float64

ClassifyProba returns probabilities for each form type.

func (*FormTypeModel) InitRuntime

func (m *FormTypeModel) InitRuntime()

InitRuntime initializes runtime state from serialized pipelines.

type FormTypeTrainConfig

type FormTypeTrainConfig struct {
	C       float64
	MaxIter int
	Verbose bool
}

FormTypeTrainConfig holds training configuration.

func DefaultFormTypeTrainConfig

func DefaultFormTypeTrainConfig() FormTypeTrainConfig

DefaultFormTypeTrainConfig returns default training config.

type FormURL

type FormURL struct{}

FormURL extracts the form action URL (normalized).

func (FormURL) ExtractDict

func (f FormURL) ExtractDict(form *goquery.Selection) map[string]any

func (FormURL) ExtractString

func (f FormURL) ExtractString(form *goquery.Selection) string

func (FormURL) IsDict

func (f FormURL) IsDict() bool

type SerializedPipeline

type SerializedPipeline struct {
	Name          string                      `json:"name"`
	ExtractorType string                      `json:"extractor_type"`
	VecType       string                      `json:"vec_type"`
	DictVec       *vectorizer.DictVectorizer  `json:"dict_vec,omitempty"`
	CountVec      *vectorizer.CountVectorizer `json:"count_vec,omitempty"`
	TfidfVec      *vectorizer.TfidfVectorizer `json:"tfidf_vec,omitempty"`
}

SerializedPipeline holds the serialized state of a feature pipeline.

type SubmitText

type SubmitText struct{}

SubmitText extracts submit button text.

func (SubmitText) ExtractDict

func (f SubmitText) ExtractDict(_ *goquery.Selection) map[string]any

func (SubmitText) ExtractString

func (f SubmitText) ExtractString(form *goquery.Selection) string

func (SubmitText) IsDict

func (f SubmitText) IsDict() bool

type UnifiedModel

type UnifiedModel struct {
	FormModel  *FormTypeModel `json:"form_model"`
	FieldModel *crf.Model     `json:"field_model"`
}

UnifiedModel holds both form and field models for serialization.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL