parser

package
v0.6.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 14, 2026 License: MIT Imports: 20 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DocumentExts matches, case-insensitively, any string that ends in one of the
// Office document extensions .docx, .xlsx, .pptx, .dotx, .potx or .ppsx.
var DocumentExts = regexp.MustCompile(`(?i)\.(docx|xlsx|pptx|dotx|potx|ppsx)$`)

// PdfExt matches, case-insensitively, any string that ends in ".pdf".
var PdfExt = regexp.MustCompile(`(?i)\.pdf$`)
View Source
var (
	// EmailRe matches e-mail-like substrings, case-insensitively: a local part
	// of letters, digits and "._%+-", an "@", a dotted host name, and a final
	// alphabetic top-level domain.
	//
	// The top-level domain may be 2 to 63 letters long (63 is the DNS label
	// limit), so addresses under long TLDs such as ".museum" or ".technology"
	// are matched in full rather than being cut off after four letters.
	EmailRe = regexp.MustCompile(`(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,63}`)
)
View Source
// Parsers is the table that associates content-type keywords and file
// extensions with the extractor used for that kind of content.
// NOTE(review): entries are presumably tried in order via MatchType — confirm
// against the caller before reordering.
var Parsers = []ContentParser{
	{
		Types: []string{"javascript", "ecmascript"},
		Exts:  []string{".js", ".mjs"},
		// ExtractFromJS returns a []string, so it cannot be used as a Parse
		// value directly; the wrapper drops that result.
		Parse: func(data []byte, words map[string]struct{}) {
			_ = ExtractFromJS(data, words)
		},
	},
	{
		Types: []string{"xml", "svg"},
		Exts:  []string{".xml", ".svg", ".rss", ".atom", ".sitemap"},
		Parse: ExtractFromXML,
	},
	{
		Types: []string{"json"},
		Exts:  []string{".json", ".webmanifest"},
		Parse: ExtractFromJSON,
	},
	{
		Types: []string{"css"},
		Exts:  []string{".css"},
		Parse: ExtractFromCSS,
	},
	{
		Types: []string{"text/vtt", "subrip"},
		Exts:  []string{".vtt", ".srt"},
		Parse: ExtractSubtitles,
	},
	{
		Types: []string{"audio", "video"},
		Exts:  []string{".mp3", ".mp4", ".ogg", ".flac", ".wav", ".m4a", ".webm"},
		Parse: ExtractMediaMetadata,
	},
}
View Source
// Resources lists the element selectors, and the attribute on each matched
// element, that carry the URL of an embedded resource.
// NOTE(review): presumably consumed by FollowResources — confirm.
var Resources = []Resource{
	{Query: "script[src]", Attr: "src"},
	{Query: "link[href]", Attr: "href"},
	{Query: "img[src]", Attr: "src"},
	{Query: "iframe[src]", Attr: "src"},
	{Query: "source[src]", Attr: "src"},
	{Query: "video[src]", Attr: "src"},
	{Query: "audio[src]", Attr: "src"},
	{Query: "track[src]", Attr: "src"},
}
View Source
// WordAttrs lists attribute names whose values are treated as word sources.
// NOTE(review): presumably read by ExtractAttrs — confirm.
var WordAttrs = []string{
	// Descriptive and accessibility attributes.
	"alt",
	"title",
	"placeholder",
	"aria-label",
	"aria-description",

	// Custom data-* attributes.
	"data-title",
	"data-name",
	"data-label",
	"data-value",

	// Generic value-bearing attributes.
	"content",
	"value",
	"label",
	"summary",
}

Functions

func CaptureURLComponents

func CaptureURLComponents(reqURL, baseURL *url.URL, wordSet map[string]struct{}, capturePaths, captureSubdomains, captureDomain bool)

func DeobfuscateEmail

func DeobfuscateEmail(raw string) string

func ExtractAttrs

func ExtractAttrs(doc *goquery.Document, addWords func(string), addContext func(string))

func ExtractBodyText

func ExtractBodyText(doc *goquery.Document, addWords func(string), addContext func(string))

func ExtractComments

func ExtractComments(doc *goquery.Document, addWords func(string))

func ExtractEmails

func ExtractEmails(doc *goquery.Document, addEmail func(string))

func ExtractEmailsFromText

func ExtractEmailsFromText(text string) []string

func ExtractFromCSS

func ExtractFromCSS(body []byte, wordSet map[string]struct{})

func ExtractFromJS

func ExtractFromJS(body []byte, wordSet map[string]struct{}) []string

func ExtractFromJSON

func ExtractFromJSON(body []byte, wordSet map[string]struct{})

func ExtractFromXML

func ExtractFromXML(body []byte, wordSet map[string]struct{})

func ExtractMediaMetadata

func ExtractMediaMetadata(body []byte, wordSet map[string]struct{})

func ExtractOfficeMetadata

func ExtractOfficeMetadata(body []byte, mu *sync.Mutex, metaSet map[string]struct{}, verbose bool, reqURL string)

func ExtractPDFMetadata

func ExtractPDFMetadata(body []byte, mu *sync.Mutex, metaSet map[string]struct{}, verbose bool, reqURL string)

func ExtractSubtitles

func ExtractSubtitles(body []byte, wordSet map[string]struct{})

func ExtractTextContent

func ExtractTextContent(body []byte, wordSet map[string]struct{}, pageContexts *[]string)

func ExtractTitle

func ExtractTitle(doc *goquery.Document) string

func FollowLinks

func FollowLinks(doc *goquery.Document, visit func(string))

func FollowResources

func FollowResources(doc *goquery.Document, visit func(string))

func MatchType

func MatchType(contentType, reqURL string, types []string, exts []string) bool

func ParseByExtension

func ParseByExtension(ext string, body []byte, wordSet map[string]struct{}, pageContexts *[]string)

Types

type ContentParser

// ContentParser groups a set of content-type keywords and file extensions
// with the function that handles bodies of that kind.
type ContentParser struct {
	// Types holds content-type keywords such as "javascript" or "text/vtt".
	// NOTE(review): presumably matched against the response Content-Type by
	// MatchType — confirm.
	Types []string
	// Exts holds file extensions including the leading dot, such as ".js".
	Exts  []string
	// Parse is called with the raw body and a word set.
	// NOTE(review): presumably it adds the extracted words to wordSet — confirm.
	Parse func(body []byte, wordSet map[string]struct{})
}

type Resource

// Resource names a kind of element and the attribute on it that holds a URL.
type Resource struct {
	// Query is a selector string such as "script[src]".
	Query string
	// Attr is the name of the attribute to read from matched elements, such as "src".
	Attr  string
}

type SecretFinding added in v0.5.0

// SecretFinding is one result returned by SecretScanner.Scan. It serializes to
// JSON with the keys given in the field tags.
type SecretFinding struct {
	// DetectorName is emitted under the JSON key "detector".
	// NOTE(review): presumably the name of the detector that matched — confirm.
	DetectorName string `json:"detector"`
	// Raw is emitted under the JSON key "raw".
	// NOTE(review): presumably the unredacted matched value; treat as sensitive — confirm.
	Raw          string `json:"raw"`
	// Redacted is emitted under "redacted" and omitted from JSON when empty.
	Redacted     string `json:"redacted,omitempty"`
	// Source is emitted under "source" and omitted from JSON when empty.
	// NOTE(review): presumably the source argument passed to Scan — confirm.
	Source       string `json:"source,omitempty"`
}

type SecretScanner added in v0.5.0

// SecretScanner scans byte slices for secrets through its Scan method.
// Obtain one with NewSecretScanner; its fields are not exported.
type SecretScanner struct {
	// contains filtered or unexported fields
}

func NewSecretScanner added in v0.5.0

func NewSecretScanner() *SecretScanner

func (*SecretScanner) Scan added in v0.5.0

func (s *SecretScanner) Scan(data []byte, source string) []SecretFinding

Scan checks data for secrets using trufflehog detectors. It accepts a []byte directly to avoid unnecessary string/byte conversions.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL