Documentation
¶
Overview ¶
Package classifier implements form and field type classification.
Index ¶
- func ElemFeatures(elem *goquery.Selection, form *goquery.Selection) map[string]any
- func GetFormFeatures(form *goquery.Selection, formType string, fieldElems []*goquery.Selection) []map[string]any
- type ClassifyProbaResult
- type ClassifyResult
- type FeaturePipeline
- type FieldTypeModel
- type FormCSS
- type FormElements
- type FormFeatureExtractor
- type FormFieldClassifier
- func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult
- func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult
- func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
- func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
- func (c *FormFieldClassifier) SaveModel(path string) error
- type FormInputCSS
- type FormInputNames
- type FormInputTitle
- type FormLabelText
- type FormLinksText
- type FormResult
- type FormTypeModel
- type FormTypeTrainConfig
- type FormURL
- type SerializedPipeline
- type SubmitText
- type UnifiedModel
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ElemFeatures ¶
func ElemFeatures(elem *goquery.Selection, form *goquery.Selection) map[string]any
ElemFeatures extracts per-field features for CRF classification.
Types ¶
type ClassifyProbaResult ¶
type ClassifyProbaResult struct {
Form map[string]float64 `json:"form"`
Fields map[string]map[string]float64 `json:"fields,omitempty"`
}
ClassifyProbaResult holds probability-based classification results.
type ClassifyResult ¶
type ClassifyResult struct {
Form string `json:"form"`
Fields map[string]string `json:"fields,omitempty"`
}
ClassifyResult holds the classification result for a form.
type FeaturePipeline ¶
type FeaturePipeline struct {
Name string
Extractor FormFeatureExtractor
VecType string // "dict", "count", "tfidf"
NgramRange [2]int
MinDF int
Binary bool
Analyzer string
StopWords map[string]bool
UseEnglishStop bool
}
FeaturePipeline describes a feature extraction + vectorization pipeline.
func DefaultFeaturePipelines ¶
func DefaultFeaturePipelines() []FeaturePipeline
DefaultFeaturePipelines returns the 9 feature extraction pipelines matching Formasaurus's FEATURES list.
type FieldTypeModel ¶
FieldTypeModel wraps a CRF model for field type classification.
func TrainFieldType ¶
func TrainFieldType(sequences []crf.TrainingSequence, config crf.TrainerConfig) *FieldTypeModel
TrainFieldType trains a CRF model for field type classification.
func (*FieldTypeModel) ClassifyProba ¶
func (m *FieldTypeModel) ClassifyProba(form *goquery.Selection, formType string) map[string]map[string]float64
ClassifyProba returns field type probabilities for a form.
type FormElements ¶
type FormElements struct{}
FormElements extracts structural boolean features from a form.
func (FormElements) ExtractDict ¶
func (f FormElements) ExtractDict(form *goquery.Selection) map[string]any
func (FormElements) ExtractString ¶
func (f FormElements) ExtractString(_ *goquery.Selection) string
func (FormElements) IsDict ¶
func (f FormElements) IsDict() bool
type FormFeatureExtractor ¶
type FormFeatureExtractor interface {
ExtractString(form *goquery.Selection) string
ExtractDict(form *goquery.Selection) map[string]any
IsDict() bool
}
FormFeatureExtractor extracts features from a form element.
type FormFieldClassifier ¶
type FormFieldClassifier struct {
FormModel *FormTypeModel
FieldModel *FieldTypeModel
}
FormFieldClassifier detects HTML form and field types.
func LoadClassifier ¶
func LoadClassifier(path string) (*FormFieldClassifier, error)
LoadClassifier loads a FormFieldClassifier from disk.
func (*FormFieldClassifier) Classify ¶
func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult
Classify returns the form type and field types.
func (*FormFieldClassifier) ClassifyProba ¶
func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult
ClassifyProba returns probabilities for form and field types.
func (*FormFieldClassifier) ExtractForms ¶
func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
ExtractForms extracts and classifies all forms from HTML.
func (*FormFieldClassifier) ExtractFormsFromReader ¶
func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)
ExtractFormsFromReader extracts and classifies forms from HTML read from a *strings.Reader.
func (*FormFieldClassifier) SaveModel ¶
func (c *FormFieldClassifier) SaveModel(path string) error
SaveModel saves the classifier to disk.
type FormInputCSS ¶
type FormInputCSS struct{}
FormInputCSS extracts CSS of non-hidden inputs.
func (FormInputCSS) ExtractDict ¶
func (f FormInputCSS) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputCSS) ExtractString ¶
func (f FormInputCSS) ExtractString(form *goquery.Selection) string
func (FormInputCSS) IsDict ¶
func (f FormInputCSS) IsDict() bool
type FormInputNames ¶
type FormInputNames struct{}
FormInputNames extracts names of non-hidden inputs.
func (FormInputNames) ExtractDict ¶
func (f FormInputNames) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputNames) ExtractString ¶
func (f FormInputNames) ExtractString(form *goquery.Selection) string
func (FormInputNames) IsDict ¶
func (f FormInputNames) IsDict() bool
type FormInputTitle ¶
type FormInputTitle struct{}
FormInputTitle extracts title attributes of non-hidden inputs.
func (FormInputTitle) ExtractDict ¶
func (f FormInputTitle) ExtractDict(form *goquery.Selection) map[string]any
func (FormInputTitle) ExtractString ¶
func (f FormInputTitle) ExtractString(form *goquery.Selection) string
func (FormInputTitle) IsDict ¶
func (f FormInputTitle) IsDict() bool
type FormLabelText ¶
type FormLabelText struct{}
FormLabelText extracts label text inside the form.
func (FormLabelText) ExtractDict ¶
func (f FormLabelText) ExtractDict(form *goquery.Selection) map[string]any
func (FormLabelText) ExtractString ¶
func (f FormLabelText) ExtractString(form *goquery.Selection) string
func (FormLabelText) IsDict ¶
func (f FormLabelText) IsDict() bool
type FormLinksText ¶
type FormLinksText struct{}
FormLinksText extracts link text inside the form.
func (FormLinksText) ExtractDict ¶
func (f FormLinksText) ExtractDict(_ *goquery.Selection) map[string]any
func (FormLinksText) ExtractString ¶
func (f FormLinksText) ExtractString(form *goquery.Selection) string
func (FormLinksText) IsDict ¶
func (f FormLinksText) IsDict() bool
type FormResult ¶
type FormResult struct {
FormHTML string `json:"form_html"`
Result ClassifyResult `json:"result,omitempty"`
Proba ClassifyProbaResult `json:"proba,omitempty"`
}
FormResult holds the result for a single form.
type FormTypeModel ¶
type FormTypeModel struct {
Classes []string `json:"classes"`
Coef [][]float64 `json:"coef"` // [numClasses][numFeatures]
Intercept []float64 `json:"intercept"` // [numClasses]
Pipelines []SerializedPipeline `json:"pipelines"`
// contains filtered or unexported fields
}
FormTypeModel holds a trained form type classifier.
func TrainFormType ¶
func TrainFormType(forms []*goquery.Selection, labels []string, config FormTypeTrainConfig) *FormTypeModel
TrainFormType trains a form type classifier.
func (*FormTypeModel) Classify ¶
func (m *FormTypeModel) Classify(form *goquery.Selection) string
Classify returns the predicted form type.
func (*FormTypeModel) ClassifyProba ¶
func (m *FormTypeModel) ClassifyProba(form *goquery.Selection) map[string]float64
ClassifyProba returns probabilities for each form type.
func (*FormTypeModel) InitRuntime ¶
func (m *FormTypeModel) InitRuntime()
InitRuntime initializes runtime state from serialized pipelines.
type FormTypeTrainConfig ¶
FormTypeTrainConfig holds training configuration.
func DefaultFormTypeTrainConfig ¶
func DefaultFormTypeTrainConfig() FormTypeTrainConfig
DefaultFormTypeTrainConfig returns default training config.
type SerializedPipeline ¶
type SerializedPipeline struct {
Name string `json:"name"`
ExtractorType string `json:"extractor_type"`
VecType string `json:"vec_type"`
DictVec *vectorizer.DictVectorizer `json:"dict_vec,omitempty"`
CountVec *vectorizer.CountVectorizer `json:"count_vec,omitempty"`
TfidfVec *vectorizer.TfidfVectorizer `json:"tfidf_vec,omitempty"`
}
SerializedPipeline holds the serialized state of a feature pipeline.
type SubmitText ¶
type SubmitText struct{}
SubmitText extracts submit button text.
func (SubmitText) ExtractDict ¶
func (f SubmitText) ExtractDict(_ *goquery.Selection) map[string]any
func (SubmitText) ExtractString ¶
func (f SubmitText) ExtractString(form *goquery.Selection) string
func (SubmitText) IsDict ¶
func (f SubmitText) IsDict() bool
type UnifiedModel ¶
type UnifiedModel struct {
FormModel *FormTypeModel `json:"form_model"`
FieldModel *crf.Model `json:"field_model"`
}
UnifiedModel holds both form and field models for serialization.