crawler

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 1, 2026 License: MIT Imports: 29 Imported by: 0

Documentation

Overview

Package crawler provides the main web application crawler used for dynamic application security testing (DAST).

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type AuthCredentials

type AuthCredentials struct {
	// Type holds one of the AuthType values and is always written to JSON.
	Type        AuthType          `json:"type"`
	// The remaining JSON-visible fields are omitted from JSON when empty.
	// NOTE(review): which fields are required for which Type is not visible
	// here — confirm against Config.Validate.
	// NOTE(review): Password and Token are serialized in plain text when set;
	// confirm that saved configuration files are stored securely.
	Username    string            `json:"username,omitempty"`
	Password    string            `json:"password,omitempty"`
	Token       string            `json:"token,omitempty"`
	Headers     map[string]string `json:"headers,omitempty"`
	// Cookies is excluded from JSON (tag "-"), so it is neither written to nor
	// read from a JSON document.
	Cookies     []*http.Cookie    `json:"-"`
	LoginURL    string            `json:"login_url,omitempty"`
	FormFields  map[string]string `json:"form_fields,omitempty"`
	// OAuthConfig is a pointer, so a nil value is omitted from JSON.
	OAuthConfig *OAuthConfig      `json:"oauth_config,omitempty"`
	// NOTE(review): unlike the fields of Config, these fields carry no yaml
	// tags; the YAML key names therefore depend on the YAML library's default
	// field mapping — confirm against LoadFromFile.
}

AuthCredentials holds authentication credentials.

type AuthType

// The package declares the values of this string type as the AuthType*
// constants; it is the type of AuthCredentials.Type.
type AuthType string

AuthType represents the type of authentication.

// Declared AuthType values. The quoted string is the value that appears in
// the "type" field when AuthCredentials is encoded as JSON.
const (
	AuthTypeNone      AuthType = "none"
	AuthTypeSession   AuthType = "session"
	AuthTypeJWT       AuthType = "jwt"
	AuthTypeOAuth     AuthType = "oauth"
	// The wire value is "form", which differs from the constant's name.
	AuthTypeFormLogin AuthType = "form"
	AuthTypeAPIKey    AuthType = "apikey"
	AuthTypeBasic     AuthType = "basic"
)

type Config

type Config struct {
	// Target URL to crawl
	Target string `json:"target" yaml:"target"`

	// Number of concurrent workers
	Workers int `json:"workers" yaml:"workers"`

	// Maximum crawl depth
	// NOTE(review): ScopeRules also declares a MaxDepth field; which of the two
	// takes effect is not visible here — confirm.
	MaxDepth int `json:"max_depth" yaml:"max_depth"`

	// Request timeout
	// NOTE(review): time.Duration has no custom JSON encoding, so encoding/json
	// reads and writes this as an integer number of nanoseconds unless the
	// loader handles it specially — confirm against LoadFromFile.
	Timeout time.Duration `json:"timeout" yaml:"timeout"`

	// Scope rules
	Scope ScopeRules `json:"scope" yaml:"scope"`

	// Rate limiting
	RateLimit RateLimitConfig `json:"rate_limit" yaml:"rate_limit"`

	// Browser configuration
	Browser browser.Config `json:"browser" yaml:"browser"`

	// Authentication
	Auth AuthCredentials `json:"auth" yaml:"auth"`

	// Output configuration
	Output OutputConfig `json:"output" yaml:"output"`

	// State persistence
	State StateConfig `json:"state" yaml:"state"`

	// Enable passive API discovery
	PassiveAPIDiscovery bool `json:"passive_api_discovery" yaml:"passive_api_discovery"`

	// Enable active API probing
	ActiveAPIDiscovery bool `json:"active_api_discovery" yaml:"active_api_discovery"`

	// Enable WebSocket discovery
	WebSocketDiscovery bool `json:"websocket_discovery" yaml:"websocket_discovery"`

	// Enable form analysis
	FormAnalysis bool `json:"form_analysis" yaml:"form_analysis"`

	// Enable JavaScript analysis
	JSAnalysis bool `json:"js_analysis" yaml:"js_analysis"`

	// Enable AJAX discovery (intercept XHR/Fetch, trigger events, find AJAX forms)
	AJAXDiscovery bool `json:"ajax_discovery" yaml:"ajax_discovery"`

	// Fast mode skips heavy analysis (SPA framework detection, AJAX triggering) for speed
	FastMode bool `json:"fast_mode" yaml:"fast_mode"`

	// Custom headers to include in all requests
	CustomHeaders map[string]string `json:"custom_headers" yaml:"custom_headers"`

	// Cookies to include in all requests
	// NOTE(review): cookies are also representable as AuthCredentials.Cookies
	// and via WithCookies (both []*http.Cookie); how the three interact is not
	// visible here — confirm.
	Cookies map[string]string `json:"cookies" yaml:"cookies"`

	// User agents to rotate (if empty, uses default)
	UserAgents []string `json:"user_agents" yaml:"user_agents"`

	// Proxy URL
	Proxy string `json:"proxy" yaml:"proxy"`

	// Verbose logging
	Verbose bool `json:"verbose" yaml:"verbose"`

	// Debug mode
	Debug bool `json:"debug" yaml:"debug"`

	// Enhanced discovery configuration
	EnhancedDiscovery EnhancedDiscoveryConfig `json:"enhanced_discovery" yaml:"enhanced_discovery"`
}

Config holds all crawler configuration.

func BalancedConfig

func BalancedConfig() *Config

BalancedConfig returns a configuration that balances speed with thoroughness.

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a configuration with sensible defaults.

func LoadFromFile

func LoadFromFile(path string) (*Config, error)

LoadFromFile loads configuration from a file (JSON or YAML).

func TurboConfig

func TurboConfig() *Config

TurboConfig returns a configuration optimized for MAXIMUM SPEED. Use this when you need to crawl as fast as possible. Warning: This may trigger rate limiting or WAF blocks on some sites.

func (*Config) Clone

func (c *Config) Clone() *Config

Clone creates a deep copy of the configuration.

func (*Config) SaveToFile

func (c *Config) SaveToFile(path string) error

SaveToFile saves configuration to a file.

func (*Config) Validate

func (c *Config) Validate() error

Validate validates the configuration.

type CrawlError

type CrawlError struct {
	// URL is presumably the address being fetched when the error occurred —
	// TODO confirm.
	URL       string    `json:"url"`
	// Error holds the error as text. Because the type has a field named
	// Error it cannot also declare an Error() method, so CrawlError does not
	// satisfy the built-in error interface despite its name.
	Error     string    `json:"error"`
	Timestamp time.Time `json:"timestamp"`
}

CrawlError represents an error encountered during crawling.

type CrawlResult

type CrawlResult struct {
	Target       string              `json:"target"`
	StartedAt    time.Time           `json:"started_at"`
	// NOTE(review): encoding/json's omitempty never omits a struct value such
	// as time.Time, so a zero CompletedAt is still emitted; if omission was
	// intended, the omitzero option (Go 1.24+) would be needed — confirm.
	CompletedAt  time.Time           `json:"completed_at,omitempty"`
	Stats        CrawlStats          `json:"stats"`
	// Endpoints, Forms and WebSockets have no omitempty, so a nil slice is
	// encoded as JSON null rather than being dropped.
	Endpoints    []Endpoint          `json:"endpoints"`
	Forms        []Form              `json:"forms"`
	WebSockets   []WebSocketEndpoint `json:"websockets"`
	// The remaining slices are omitted from JSON when empty.
	Technologies []Technology        `json:"technologies,omitempty"`
	Secrets      []SecretFinding     `json:"secrets,omitempty"`
	Errors       []CrawlError        `json:"errors,omitempty"`
}

CrawlResult represents the complete result of a crawl session.

type CrawlStats

type CrawlStats struct {
	// Counters. What exactly increments each one is not visible here.
	URLsDiscovered     int           `json:"urls_discovered"`
	PagesCrawled       int           `json:"pages_crawled"`
	FormsFound         int           `json:"forms_found"`
	APIEndpoints       int           `json:"api_endpoints"`
	WebSocketEndpoints int           `json:"websocket_endpoints"`
	ErrorCount         int           `json:"error_count"`
	// Duration has no custom JSON encoding, so encoding/json writes it as an
	// integer count of nanoseconds.
	Duration           time.Duration `json:"duration"`
	// BytesTransferred is presumably a byte count — TODO confirm which
	// traffic is included.
	BytesTransferred   int64         `json:"bytes_transferred"`
}

CrawlStats contains statistics about the crawl.

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler is the main crawler orchestrator.

func New

func New(opts ...Option) (*Crawler, error)

New creates a new crawler with the given options.

func (*Crawler) IsRunning

func (c *Crawler) IsRunning() bool

IsRunning returns true if the crawler is running.

func (*Crawler) LoadState

func (c *Crawler) LoadState(path string) error

LoadState loads a saved crawl state.

func (*Crawler) Metrics

func (c *Crawler) Metrics() *metrics.Collector

Metrics returns the metrics collector for external access.

func (*Crawler) MetricsSnapshot

func (c *Crawler) MetricsSnapshot() *metrics.Snapshot

MetricsSnapshot returns a point-in-time snapshot of all metrics.

func (*Crawler) Results

func (c *Crawler) Results() <-chan interface{}

Results returns a channel for streaming results.

func (*Crawler) SaveState

func (c *Crawler) SaveState(path string) error

SaveState saves the current crawl state.

func (*Crawler) ShutdownContext

func (c *Crawler) ShutdownContext() context.Context

ShutdownContext returns the shutdown context for monitoring.

func (*Crawler) Start

func (c *Crawler) Start(ctx context.Context) (*CrawlResult, error)

Start begins the crawling process.

func (*Crawler) Stats

func (c *Crawler) Stats() CrawlStats

Stats returns current crawl statistics.

func (*Crawler) Stop

func (c *Crawler) Stop() error

Stop stops the crawler gracefully.

func (*Crawler) StopNow

func (c *Crawler) StopNow() error

StopNow stops the crawler immediately without waiting for cleanup.

type Endpoint

type Endpoint struct {
	URL            string            `json:"url"`
	Method         string            `json:"method"`
	Source         string            `json:"source"` // passive, active, html, javascript
	Depth          int               `json:"depth"`
	// The omitempty fields below are dropped from JSON when empty or zero; a
	// StatusCode of 0 and a ResponseSize of 0 are therefore not emitted.
	Parameters     []Parameter       `json:"parameters,omitempty"`
	Headers        map[string]string `json:"headers,omitempty"`
	// DiscoveredFrom is presumably the URL of the page on which this endpoint
	// was found — TODO confirm.
	DiscoveredFrom string            `json:"discovered_from,omitempty"`
	StatusCode     int               `json:"status_code,omitempty"`
	ContentType    string            `json:"content_type,omitempty"`
	ResponseSize   int64             `json:"response_size,omitempty"`
	Timestamp      time.Time         `json:"timestamp"`
}

Endpoint represents a discovered endpoint during crawling.

type EnhancedDiscoveryConfig

type EnhancedDiscoveryConfig struct {
	// Enable all enhanced discovery modules
	// NOTE(review): whether this is a master switch that gates the per-module
	// flags below, or a shortcut that turns all of them on, is not visible
	// here — confirm.
	Enabled bool `json:"enabled" yaml:"enabled"`

	// Enable robots.txt parsing
	EnableRobots bool `json:"enable_robots" yaml:"enable_robots"`

	// Enable sitemap.xml discovery
	EnableSitemap bool `json:"enable_sitemap" yaml:"enable_sitemap"`

	// Enable JavaScript source map parsing
	EnableSourceMaps bool `json:"enable_source_maps" yaml:"enable_source_maps"`

	// Enable common path bruteforcing
	EnablePathBrute bool `json:"enable_path_brute" yaml:"enable_path_brute"`

	// Enable technology fingerprinting
	EnableFingerprint bool `json:"enable_fingerprint" yaml:"enable_fingerprint"`

	// Enable parameter discovery
	EnableParamDiscovery bool `json:"enable_param_discovery" yaml:"enable_param_discovery"`

	// Enable JavaScript extraction
	EnableJSExtract bool `json:"enable_js_extract" yaml:"enable_js_extract"`

	// Concurrency for enhanced discovery operations
	// NOTE(review): the behavior for zero or negative values is not visible
	// here — confirm against Config.Validate.
	Concurrency int `json:"concurrency" yaml:"concurrency"`
}

EnhancedDiscoveryConfig holds configuration for enhanced discovery modules.

type Form

type Form struct {
	// URL is presumably the page on which the form was found, and Action the
	// form's submission target — TODO confirm.
	URL       string      `json:"url"`
	Action    string      `json:"action"`
	Method    string      `json:"method"`
	Enctype   string      `json:"enctype"`
	// Inputs has no omitempty, so a nil slice is encoded as JSON null.
	Inputs    []FormInput `json:"inputs"`
	// HasCSRF presumably reports that a CSRF token field was detected; the
	// detection rule is not visible here — TODO confirm.
	HasCSRF   bool        `json:"has_csrf"`
	Depth     int         `json:"depth"`
	Timestamp time.Time   `json:"timestamp"`
}

Form represents an HTML form discovered during crawling.

type FormAuth

// NOTE(review): this type has no json/yaml tags, so it encodes under its Go
// field names; it is the argument type of WithFormAuth and appears to be an
// option input rather than a persisted type — confirm.
type FormAuth struct {
	LoginURL      string
	Username      string
	Password      string
	// UsernameField and PasswordField are presumably the names of the form
	// inputs that receive Username and Password — TODO confirm.
	UsernameField string
	PasswordField string
	// ExtraFields presumably carries additional name/value pairs submitted
	// with the login form — TODO confirm.
	ExtraFields   map[string]string
}

FormAuth holds form-based authentication configuration.

type FormInput

type FormInput struct {
	Name        string `json:"name"`
	Type        string `json:"type"`
	// Value, Placeholder, Pattern, MaxLength and MinLength are omitted from
	// JSON when empty or zero, so a length of 0 is indistinguishable from
	// "not specified" in the encoded output.
	Value       string `json:"value,omitempty"`
	Required    bool   `json:"required"`
	Placeholder string `json:"placeholder,omitempty"`
	Pattern     string `json:"pattern,omitempty"`
	MaxLength   int    `json:"max_length,omitempty"`
	MinLength   int    `json:"min_length,omitempty"`
}

FormInput represents an input field in a form.

type OAuthConfig

type OAuthConfig struct {
	ClientID     string   `json:"client_id"`
	// NOTE(review): ClientSecret is serialized in plain text and has no
	// omitempty; confirm that saved configuration files are stored securely.
	ClientSecret string   `json:"client_secret"`
	AuthURL      string   `json:"auth_url"`
	TokenURL     string   `json:"token_url"`
	RedirectURL  string   `json:"redirect_url"`
	// Scopes has no omitempty, so a nil slice is encoded as JSON null.
	Scopes       []string `json:"scopes"`
}

OAuthConfig holds OAuth 2.0 configuration.

type Option

// An Option receives the Crawler being configured and may return an error.
// New accepts a variadic list of Options; the With* functions in this
// package each return one.
type Option func(*Crawler) error

Option is a functional option for configuring the Crawler.

func WithAPIKeyAuth

func WithAPIKeyAuth(headerName, apiKey string) Option

WithAPIKeyAuth configures API key authentication.

func WithActiveDiscovery

func WithActiveDiscovery(enabled bool) Option

WithActiveDiscovery enables/disables active API probing.

func WithAllowedDomains

func WithAllowedDomains(domains ...string) Option

WithAllowedDomains sets the allowed domains.

func WithAuth

func WithAuth(auth AuthCredentials) Option

WithAuth sets authentication credentials.

func WithAutoSave

func WithAutoSave(enabled bool, intervalSeconds int) Option

WithAutoSave enables/disables automatic state saving and sets the save interval in seconds.

func WithBasicAuth

func WithBasicAuth(username, password string) Option

WithBasicAuth configures basic authentication.

func WithBrowserPool

func WithBrowserPool(size int) Option

WithBrowserPool sets the browser pool size.

func WithConfig

func WithConfig(config *Config) Option

WithConfig sets the entire configuration.

func WithCookies

func WithCookies(cookies []*http.Cookie) Option

WithCookies sets cookies to include in requests.

func WithCustomHeaders

func WithCustomHeaders(headers map[string]string) Option

WithCustomHeaders sets custom headers for all requests.

func WithDebug

func WithDebug(debug bool) Option

WithDebug enables/disables debug mode.

func WithExcludePatterns

func WithExcludePatterns(patterns ...string) Option

WithExcludePatterns adds URL patterns to exclude.

func WithFollowExternal

func WithFollowExternal(follow bool) Option

WithFollowExternal enables/disables following external links.

func WithFormAnalysis

func WithFormAnalysis(enabled bool) Option

WithFormAnalysis enables/disables form analysis.

func WithFormAuth

func WithFormAuth(auth FormAuth) Option

WithFormAuth configures form-based authentication.

func WithHeadless

func WithHeadless(headless bool) Option

WithHeadless enables/disables headless mode.

func WithIncludePatterns

func WithIncludePatterns(patterns ...string) Option

WithIncludePatterns adds URL patterns to include.

func WithJSAnalysis

func WithJSAnalysis(enabled bool) Option

WithJSAnalysis enables/disables JavaScript analysis.

func WithJWTAuth

func WithJWTAuth(token string) Option

WithJWTAuth configures JWT authentication.

func WithLogLevel

func WithLogLevel(level logger.Level) Option

WithLogLevel sets the log level.

func WithLogger

func WithLogger(l *logger.Logger) Option

WithLogger sets a custom logger.

func WithMaxDepth

func WithMaxDepth(depth int) Option

WithMaxDepth sets the maximum crawl depth.

func WithMetrics

func WithMetrics(m *metrics.Collector) Option

WithMetrics sets a custom metrics collector.

func WithOutput

func WithOutput(w io.Writer) Option

WithOutput sets the output writer.

func WithOutputFile

func WithOutputFile(path string) Option

WithOutputFile sets the output file path.

func WithPassiveDiscovery

func WithPassiveDiscovery(enabled bool) Option

WithPassiveDiscovery enables/disables passive API discovery.

func WithPrettyOutput

func WithPrettyOutput(pretty bool) Option

WithPrettyOutput enables/disables pretty JSON output.

func WithProgress

func WithProgress(enabled bool) Option

WithProgress enables/disables progress bar display.

func WithProxy

func WithProxy(proxy string) Option

WithProxy sets the proxy URL.

func WithRateLimit

func WithRateLimit(rps float64, burst int) Option

WithRateLimit sets the rate limiting configuration.

func WithRespectRobotsTxt

func WithRespectRobotsTxt(respect bool) Option

WithRespectRobotsTxt enables/disables honoring robots.txt rules.

func WithScope

func WithScope(scope ScopeRules) Option

WithScope sets the scope rules.

func WithStateFile

func WithStateFile(path string) Option

WithStateFile sets the state file path for persistence.

func WithStreamMode

func WithStreamMode(stream bool) Option

WithStreamMode enables/disables streaming output mode.

func WithTarget

func WithTarget(url string) Option

WithTarget sets the target URL to crawl.

func WithTimeout

func WithTimeout(timeout time.Duration) Option

WithTimeout sets the request timeout.

func WithUserAgent

func WithUserAgent(ua string) Option

WithUserAgent sets the user agent string.

func WithVerbose

func WithVerbose(verbose bool) Option

WithVerbose enables/disables verbose logging.

func WithWebSocketDiscovery

func WithWebSocketDiscovery(enabled bool) Option

WithWebSocketDiscovery enables/disables WebSocket discovery.

func WithWorkers

func WithWorkers(n int) Option

WithWorkers sets the number of concurrent workers.

type OutputConfig

type OutputConfig struct {
	Format     string `json:"format"` // output format; "json" is the only value noted here
	// FilePath, Pretty and StreamMode presumably correspond to WithOutputFile,
	// WithPrettyOutput and WithStreamMode — TODO confirm.
	FilePath   string `json:"file_path"`
	Pretty     bool   `json:"pretty"`
	StreamMode bool   `json:"stream_mode"`
	// NOTE(review): these fields carry no yaml tags although Config.Output
	// does; YAML key names depend on the YAML library's default field
	// mapping — confirm against LoadFromFile.
}

OutputConfig defines output configuration.

type Parameter

type Parameter struct {
	Name     string `json:"name"`
	Type     string `json:"type"` // query, body, header, path, cookie
	// Example and Required are omitted from JSON when empty/false, so an
	// encoded parameter without "required" means Required was false.
	Example  string `json:"example,omitempty"`
	Required bool   `json:"required,omitempty"`
}

Parameter represents a request parameter.

type QueueItem

type QueueItem struct {
	URL       string            `json:"url"`
	Method    string            `json:"method"`
	Depth     int               `json:"depth"`
	// ParentURL is presumably the URL from which this item was discovered —
	// TODO confirm.
	ParentURL string            `json:"parent_url"`
	Headers   map[string]string `json:"headers,omitempty"`
	// Body is a []byte, which encoding/json encodes as a base64 string.
	Body      []byte            `json:"body,omitempty"`
	// NOTE(review): whether a higher or lower Priority is dequeued first is
	// not visible here — confirm.
	Priority  int               `json:"priority"`
	Timestamp time.Time         `json:"timestamp"`
}

QueueItem represents an item in the crawl queue.

type RateLimitConfig

type RateLimitConfig struct {
	// RequestsPerSecond and Burst presumably correspond to the rps and burst
	// arguments of WithRateLimit — TODO confirm.
	RequestsPerSecond float64       `json:"requests_per_second"`
	Burst             int           `json:"burst"`
	// DelayBetween has no custom JSON encoding, so encoding/json reads and
	// writes it as an integer count of nanoseconds.
	DelayBetween      time.Duration `json:"delay_between"`
	RespectRobotsTxt  bool          `json:"respect_robots_txt"`
	// NOTE(review): these fields carry no yaml tags although Config.RateLimit
	// does; YAML key names depend on the YAML library's default field
	// mapping — confirm against LoadFromFile.
}

RateLimitConfig defines rate limiting configuration.

type ScopeRules

type ScopeRules struct {
	// NOTE(review): the pattern syntax (glob, regular expression, substring)
	// and the precedence between include and exclude patterns are not visible
	// here — confirm.
	IncludePatterns []string `json:"include_patterns"`
	ExcludePatterns []string `json:"exclude_patterns"`
	AllowedDomains  []string `json:"allowed_domains"`
	// NOTE(review): Config also declares a MaxDepth field; which of the two
	// takes effect is not visible here — confirm.
	MaxDepth        int      `json:"max_depth"`
	FollowExternal  bool     `json:"follow_external"`
	// NOTE(review): these fields carry no yaml tags although Config.Scope
	// does; YAML key names depend on the YAML library's default field
	// mapping — confirm against LoadFromFile.
}

ScopeRules defines crawling scope rules.

type SecretFinding

type SecretFinding struct {
	Type    string `json:"type"`
	// NOTE(review): Value presumably holds the matched secret text itself and
	// is always written to JSON; if so, crawl output should be treated as
	// sensitive — confirm.
	Value   string `json:"value"`
	// File and Context are omitted from JSON when empty.
	File    string `json:"file,omitempty"`
	Context string `json:"context,omitempty"`
}

SecretFinding represents a potential secret found in source code.

type StateConfig

type StateConfig struct {
	Enabled  bool   `json:"enabled"`
	FilePath string `json:"file_path"`
	AutoSave bool   `json:"auto_save"`
	// Interval is expressed in seconds: its JSON key is "interval_seconds",
	// which differs from the Go field name.
	Interval int    `json:"interval_seconds"`
	// NOTE(review): these fields carry no yaml tags although Config.State
	// does; YAML key names depend on the YAML library's default field
	// mapping — confirm against LoadFromFile.
}

StateConfig defines state persistence configuration.

type Technology

type Technology struct {
	Name       string `json:"name"`
	Category   string `json:"category"`
	// Version and Evidence are omitted from JSON when empty.
	Version    string `json:"version,omitempty"`
	// NOTE(review): the scale of Confidence is not visible here (presumably a
	// 0-100 percentage) — confirm.
	Confidence int    `json:"confidence"`
	Evidence   string `json:"evidence,omitempty"`
}

Technology represents a detected technology.

type WebSocketEndpoint

type WebSocketEndpoint struct {
	URL            string         `json:"url"`
	// DiscoveredFrom is presumably the page on which the WebSocket was
	// observed — TODO confirm.
	DiscoveredFrom string         `json:"discovered_from"`
	// SampleMessages and Protocols are omitted from JSON when empty.
	SampleMessages []WebSocketMsg `json:"sample_messages,omitempty"`
	Protocols      []string       `json:"protocols,omitempty"`
	Timestamp      time.Time      `json:"timestamp"`
}

WebSocketEndpoint represents a discovered WebSocket endpoint.

type WebSocketMsg

type WebSocketMsg struct {
	Direction string    `json:"direction"` // sent, received
	Type      string    `json:"type"`      // text, binary
	// Data is a string even when Type is "binary".
	// NOTE(review): how binary frames are represented (raw, base64, hex) is
	// not visible here — confirm.
	Data      string    `json:"data"`
	Timestamp time.Time `json:"timestamp"`
}

WebSocketMsg represents a WebSocket message.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL