scope

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 1, 2026 License: MIT Imports: 4 Imported by: 0

Documentation

Overview

Package scope provides URL scope checking for the crawler.

Index

Constants

This section is empty.

Variables

View Source
// CommonAPIPatterns lists path fragments that commonly appear in API
// endpoint URLs.
//
// NOTE(review): the `/v[0-9]+/` entry uses regular-expression syntax, so
// these are presumably compiled as regular expressions rather than compared
// as literal substrings — confirm against the code that consumes this list.
var CommonAPIPatterns = []string{
	`/api/`,
	// Versioned path segment such as /v1/ or /v12/ (if matched as a regex).
	`/v[0-9]+/`,
	// No trailing slash here, unlike the other entries.
	`/graphql`,
	`/rest/`,
	`/rpc/`,
	`/ajax/`,
	`/json/`,
	`/xml/`,
}

CommonAPIPatterns contains common API path patterns.

View Source
// DefaultExcludePatterns lists URL patterns that are excluded by default.
//
// NOTE(review): the entries are written in regular-expression syntax
// (`.*`, `[?&]`, `$`); how they are compiled and whether matching is
// case-sensitive is not visible here — confirm against the consumer.
var DefaultExcludePatterns = []string{
	// Query strings in which "logout", "signout", or "exit" directly follows
	// a '?' or '&', i.e. a parameter name starting with that word.
	`.*[?&]logout.*`,
	`.*[?&]signout.*`,
	`.*[?&]exit.*`,
	// Paths containing a segment that starts with one of these
	// session-ending or account-changing action names.
	`.*\/logout.*`,
	`.*\/signout.*`,
	`.*\/delete-account.*`,
	`.*\/unsubscribe.*`,
	`.*\/reset-password.*`,
	// URLs that end in a download file extension. The `$` anchor means a
	// URL with anything after the extension (e.g. a query string) would not
	// match, assuming standard regex semantics.
	`.*\.pdf$`,
	`.*\.zip$`,
	`.*\.exe$`,
	`.*\.dmg$`,
}

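DefaultExcludePatterns contains common URL patterns to exclude from crawling: logout/sign-out links, account-changing actions (delete-account, unsubscribe, reset-password), and binary downloads (.pdf, .zip, .exe, .dmg).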

Functions

func ClassifyURL

func ClassifyURL(urlStr string) string

ClassifyURL classifies a URL by its likely type.

func ExtractDomain

func ExtractDomain(urlStr string) (string, error)

ExtractDomain extracts the domain from a URL.

func IsAPIPath

func IsAPIPath(path string) bool

IsAPIPath checks if a path looks like an API endpoint.

func IsValidURL

func IsValidURL(urlStr string) bool

IsValidURL checks if a URL is valid for crawling.

func MatchPattern

func MatchPattern(url, pattern string) bool

MatchPattern checks if a URL matches a pattern.

func NormalizeURL

func NormalizeURL(rawURL string) (string, error)

NormalizeURL normalizes a URL for deduplication.

func ResolveURL

func ResolveURL(baseURL, relativeURL string) (string, error)

ResolveURL resolves a relative URL against a base URL.

Types

type Checker

type Checker struct {
	// contains filtered or unexported fields
}

Checker validates URLs against scope rules.

func NewChecker

func NewChecker(targetURL string, rules ScopeRules) (*Checker, error)

NewChecker creates a new scope checker.

func (*Checker) AddAllowedDomain

func (c *Checker) AddAllowedDomain(domain string)

AddAllowedDomain adds a domain to the allowed list.

func (*Checker) AddExcludePattern

func (c *Checker) AddExcludePattern(pattern string) error

AddExcludePattern adds an exclude pattern.

func (*Checker) AddIncludePattern

func (c *Checker) AddIncludePattern(pattern string) error

AddIncludePattern adds an include pattern.

func (*Checker) IsInScope

func (c *Checker) IsInScope(urlStr string, depth int) bool

IsInScope checks if a URL is within the crawling scope.

func (*Checker) SetMaxDepth

func (c *Checker) SetMaxDepth(depth int)

SetMaxDepth sets the maximum crawl depth.

type RuleBuilder

type RuleBuilder struct {
	// contains filtered or unexported fields
}

RuleBuilder helps build scope rules.

func NewRuleBuilder

func NewRuleBuilder() *RuleBuilder

NewRuleBuilder creates a new rule builder.

func (*RuleBuilder) Build

func (b *RuleBuilder) Build() ScopeRules

Build returns the configured rules.

func (*RuleBuilder) WithAllowedDomains

func (b *RuleBuilder) WithAllowedDomains(domains ...string) *RuleBuilder

WithAllowedDomains sets allowed domains.

func (*RuleBuilder) WithDefaultExcludes

func (b *RuleBuilder) WithDefaultExcludes() *RuleBuilder

WithDefaultExcludes adds default exclude patterns.

func (*RuleBuilder) WithExcludePatterns

func (b *RuleBuilder) WithExcludePatterns(patterns ...string) *RuleBuilder

WithExcludePatterns adds exclude patterns.

func (*RuleBuilder) WithFollowExternal

func (b *RuleBuilder) WithFollowExternal(follow bool) *RuleBuilder

WithFollowExternal sets whether external links are followed.

func (*RuleBuilder) WithIncludePatterns

func (b *RuleBuilder) WithIncludePatterns(patterns ...string) *RuleBuilder

WithIncludePatterns adds include patterns.

func (*RuleBuilder) WithMaxDepth

func (b *RuleBuilder) WithMaxDepth(depth int) *RuleBuilder

WithMaxDepth sets the maximum crawl depth.

type ScopeRules

// ScopeRules holds the settings that define the crawling scope: URL
// include/exclude patterns, allowed domains, a depth limit, and whether
// external links are followed.
type ScopeRules struct {
	// IncludePatterns are the include patterns.
	// NOTE(review): presumably regular expressions — confirm.
	IncludePatterns []string
	// ExcludePatterns are the exclude patterns.
	// NOTE(review): presumably regular expressions — confirm.
	ExcludePatterns []string
	// AllowedDomains is the list of allowed domains.
	AllowedDomains []string
	// MaxDepth is the maximum crawl depth.
	// NOTE(review): the meaning of zero or negative values is not shown
	// here — confirm.
	MaxDepth int
	// FollowExternal controls whether external links are followed.
	FollowExternal bool
}

ScopeRules defines crawling scope rules.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL