types

package
v1.3.0 Latest
Published: Dec 1, 2025 License: MIT Imports: 20 Imported by: 22

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type CrawlerOptions

type CrawlerOptions struct {
	// OutputWriter is the interface for writing output
	OutputWriter output.Writer
	// RateLimit controls the rate of outgoing requests
	RateLimit *ratelimit.Limiter
	// Parser is a mechanism for extracting new URLs from responses
	Parser *parser.Parser
	// Options contains the user specified configuration options
	Options *Options
	// ExtensionsValidator is a validator for file extensions
	ExtensionsValidator *extensions.Validator
	// UniqueFilter is a filter for deduplicating crawled items
	UniqueFilter filters.Filter
	// ScopeManager is a manager for validating crawling scope
	ScopeManager *scope.Manager
	// Dialer is the global dialer instance used by the crawler
	Dialer *fastdialer.Dialer
	// Wappalyzer is the instance used for technology detection
	Wappalyzer *wappalyzer.Wappalyze
}

CrawlerOptions contains helper utilities for the crawler

func NewCrawlerOptions

func NewCrawlerOptions(options *Options) (*CrawlerOptions, error)

NewCrawlerOptions creates a new crawler options structure from user specified options.
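
A minimal end-to-end sketch adapted from the katana README: build an Options value, derive CrawlerOptions from it, and hand them to the standard engine. The specific option values and the target URL are illustrative.

package main

import (
	"math"

	"github.com/projectdiscovery/gologger"
	"github.com/projectdiscovery/katana/pkg/engine/standard"
	"github.com/projectdiscovery/katana/pkg/types"
)

func main() {
	options := &types.Options{
		MaxDepth:     3,             // maximum crawl depth
		FieldScope:   "rdn",         // default DNS scope field
		BodyReadSize: math.MaxInt,   // read response bodies in full
		Timeout:      10,            // request timeout in seconds
		Concurrency:  10,            // crawling goroutines
		Parallelism:  10,            // URL-processing goroutines
		RateLimit:    150,           // max requests per second
		Strategy:     "depth-first", // crawling strategy
	}
	crawlerOptions, err := types.NewCrawlerOptions(options)
	if err != nil {
		gologger.Fatal().Msg(err.Error())
	}
	defer crawlerOptions.Close() // the Close method documented below

	crawler, err := standard.New(crawlerOptions)
	if err != nil {
		gologger.Fatal().Msg(err.Error())
	}
	defer crawler.Close()

	if err := crawler.Crawl("https://example.com"); err != nil {
		gologger.Warning().Msgf("could not crawl: %s", err.Error())
	}
}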

func (*CrawlerOptions) Close

func (c *CrawlerOptions) Close() error

Close closes the crawler options resources

func (*CrawlerOptions) ValidatePath added in v1.0.0

func (c *CrawlerOptions) ValidatePath(path string) bool

func (*CrawlerOptions) ValidateScope added in v1.0.0

func (c *CrawlerOptions) ValidateScope(absURL, rootHostname string) (bool, error)

ValidateScope validates whether an absolute URL is within the crawl scope for the given root hostname
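
A hedged sketch of how the two checks might compose in custom engine code: ValidatePath presumably consults the extensions validator and ValidateScope the scope manager; the helper name and arguments are hypothetical.

package main

import "github.com/projectdiscovery/katana/pkg/types"

// shouldCrawl is a hypothetical helper composing the two validators.
func shouldCrawl(c *types.CrawlerOptions, absURL, rootHostname, path string) bool {
	// reject paths filtered out by extension rules
	if !c.ValidatePath(path) {
		return false
	}
	// reject URLs outside the configured crawl scope
	inScope, err := c.ValidateScope(absURL, rootHostname)
	return err == nil && inScope
}

func main() {}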

type OnResultCallback added in v0.0.3

type OnResultCallback func(output.Result)

OnResultCallback is called for each crawl result (output.Result); see the combined sketch after OnSkipURLCallback below.

type OnSkipURLCallback added in v1.2.0

type OnSkipURLCallback func(string)

OnSkipURLCallback is called with each URL (string) that the crawler skips.
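
Both callback types are plain functions, so they can be assigned inline when building Options. A minimal sketch; the OnResult usage mirrors the katana README, while the logging choices are illustrative.

package main

import (
	"github.com/projectdiscovery/gologger"
	"github.com/projectdiscovery/katana/pkg/output"
	"github.com/projectdiscovery/katana/pkg/types"
)

func main() {
	options := &types.Options{
		// invoked for every crawl result
		OnResult: func(result output.Result) {
			gologger.Info().Msg(result.Request.URL)
		},
		// invoked for every URL the crawler skips
		OnSkipURL: func(url string) {
			gologger.Debug().Msgf("skipped %s", url)
		},
	}
	_ = options // pass to types.NewCrawlerOptions as usual
}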

type Options

type Options struct {
	// URLs contains a list of URLs for crawling
	URLs goflags.StringSlice
	// Resume the scan from the state stored in the resume config file
	Resume string
	// Exclude contains filters for excluding hosts ('cdn', 'private-ips', CIDR, IP, regex)
	Exclude goflags.StringSlice
	// Scope contains a list of regexes for in-scope URLs
	Scope goflags.StringSlice
	// OutOfScope contains a list of regexes for out-of-scope URLs
	OutOfScope goflags.StringSlice
	// NoScope disables host based default scope
	NoScope bool
	// DisplayOutScope displays out of scope items in results
	DisplayOutScope bool
	// ExtensionsMatch contains extensions to match explicitly
	ExtensionsMatch goflags.StringSlice
	// ExtensionFilter contains additional items for filter list
	ExtensionFilter goflags.StringSlice
	// NoDefaultExtFilter removes the default extensions from the filter list
	NoDefaultExtFilter bool
	// OutputMatchCondition is the condition to match output
	OutputMatchCondition string
	// OutputFilterCondition is the condition to filter output
	OutputFilterCondition string
	// MaxDepth is the maximum depth to crawl
	MaxDepth int
	// BodyReadSize is the maximum size of response body to read
	BodyReadSize int
	// Timeout is the time to wait for request in seconds
	Timeout int
	// TimeStable is the time to wait until the page is stable
	TimeStable int
	// CrawlDuration is the maximum duration for which to crawl the target
	CrawlDuration time.Duration
	// Delay is the delay between crawl requests in seconds
	Delay int
	// RateLimit is the maximum number of requests to send per second
	RateLimit int
	// Retries is the number of retries for each request
	Retries int
	// RateLimitMinute is the maximum number of requests to send per minute
	RateLimitMinute int
	// Concurrency is the number of concurrent crawling goroutines
	Concurrency int
	// Parallelism is the number of URL-processing goroutines
	Parallelism int
	// FormConfig is the path to the form configuration file
	FormConfig string
	// Proxy is the URL for the proxy server
	Proxy string
	// Strategy is the crawling strategy: depth-first or breadth-first
	Strategy string
	// FieldScope is the scope field for default DNS scope
	FieldScope string
	// OutputFile is the file to write output to
	OutputFile string
	// KnownFiles enables crawling of known files like robots.txt, sitemap.xml, etc.
	KnownFiles string
	// Fields is the fields to format in output
	Fields string
	// StoreFields is the fields to store in separate per-host files
	StoreFields string
	// FieldConfig is the path to the custom field configuration file
	FieldConfig string
	// NoColors disables coloring of response output
	NoColors bool
	// JSON enables writing output in JSON format
	JSON bool
	// ExcludeOutputFields is the list of fields to exclude from the output
	ExcludeOutputFields goflags.StringSlice
	// ListOutputFields enables listing of the available output fields
	ListOutputFields bool
	// Silent shows only results in the output
	Silent bool
	// Verbose specifies showing verbose output
	Verbose bool
	// TechDetect enables technology detection
	TechDetect bool
	// Version enables showing of crawler version
	Version bool
	// ScrapeJSResponses enables scraping of relative endpoints from javascript
	ScrapeJSResponses bool
	// ScrapeJSLuiceResponses enables scraping of endpoints from javascript using jsluice
	ScrapeJSLuiceResponses bool
	// CustomHeaders is a list of custom headers to add to request
	CustomHeaders goflags.StringSlice
	// Headless enables headless scraping
	Headless bool
	// AutomaticFormFill enables optional automatic form filling and submission
	AutomaticFormFill bool
	// FormExtraction enables extraction of form, input, textarea & select elements
	FormExtraction bool
	// UseInstalledChrome skips chrome install and use local instance
	UseInstalledChrome bool
	// ShowBrowser specifies whether to show the browser in headless mode
	ShowBrowser bool
	// HeadlessOptionalArguments specifies optional arguments to pass to Chrome
	HeadlessOptionalArguments goflags.StringSlice
	// HeadlessNoSandbox specifies if chrome should be started in --no-sandbox mode
	HeadlessNoSandbox bool
	// SystemChromePath specifies the chrome binary path for headless crawling
	SystemChromePath string
	// ChromeWSUrl specifies the Chrome debugger websocket URL of a running Chrome instance to attach to
	ChromeWSUrl string
	// OnResult allows callback function on a result
	OnResult OnResultCallback
	// OnSkipURL allows callback function on a skipped url
	OnSkipURL OnSkipURLCallback
	// StoreResponse specifies if katana should store http requests/responses
	StoreResponse bool
	// StoreResponseDir specifies a custom directory for storing http requests/responses
	StoreResponseDir string
	// NoClobber specifies if katana should avoid overwriting existing output files
	NoClobber bool
	// StoreFieldDir specifies a custom directory for storing fields
	StoreFieldDir string
	// OmitRaw omits raw requests/responses from the output
	OmitRaw bool
	// OmitBody omits the response body from the output
	OmitBody bool
	// ChromeDataDir specifies the --user-data-dir passed to the chrome binary to preserve sessions
	ChromeDataDir string
	// HeadlessNoIncognito specifies if chrome should be started without incognito mode
	HeadlessNoIncognito bool
	// XhrExtraction enables extraction of XHR requests
	XhrExtraction bool
	// HealthCheck determines if a self-healthcheck should be performed
	HealthCheck bool
	// PprofServer enables pprof server
	PprofServer bool
	// ErrorLogFile specifies a file to which the errors of all requests are written
	ErrorLogFile string
	// Resolvers contains custom resolvers
	Resolvers goflags.StringSlice
	// OutputTemplate enables custom output template
	OutputTemplate string
	// OutputMatchRegex is the regex to match output url
	OutputMatchRegex goflags.StringSlice
	// OutputFilterRegex is the regex to filter output url
	OutputFilterRegex goflags.StringSlice
	// FilterRegex is the slice of compiled regexes used to filter URLs
	FilterRegex []*regexp.Regexp
	// MatchRegex is the slice of compiled regexes used to match URLs
	MatchRegex []*regexp.Regexp
	// DisableUpdateCheck disables the automatic update check
	DisableUpdateCheck bool
	// IgnoreQueryParams ignores crawling the same path with different query-param values
	IgnoreQueryParams bool
	// Debug enables debug mode
	Debug bool
	// TlsImpersonate enables experimental tls ClientHello randomization for standard crawler
	TlsImpersonate bool
	// DisableRedirects disables the following of redirects
	DisableRedirects bool
	// PathClimb enables path expansion (auto crawl discovered paths)
	PathClimb bool
	// DisableUniqueFilter disables duplicate content filtering
	DisableUniqueFilter bool
	// MaxOnclickLinks is the maximum number of onclick links to process per page (default: 10)
	MaxOnclickLinks int
}

var DefaultOptions Options

func (*Options) ConfigureOutput added in v1.1.1

func (options *Options) ConfigureOutput()

ConfigureOutput configures the output logging levels to be displayed on the screen
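
A minimal sketch of the expected call order; the flag choices here are illustrative.

package main

import "github.com/projectdiscovery/katana/pkg/types"

func main() {
	options := &types.Options{
		Verbose:  true, // show verbose output
		NoColors: true, // disable colored logging
	}
	// apply the logging-related flags before any crawling starts
	options.ConfigureOutput()
}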

func (*Options) ParseCustomHeaders

func (options *Options) ParseCustomHeaders() map[string]string
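
There is no doc comment here; judging by the field, it presumably turns the "name: value" strings in CustomHeaders into a map, mirroring katana's -H/-headers CLI flag. A hedged sketch:

package main

import (
	"fmt"

	"github.com/projectdiscovery/goflags"
	"github.com/projectdiscovery/katana/pkg/types"
)

func main() {
	options := &types.Options{
		// "name: value" pairs; the exact split on ":" is an assumption
		CustomHeaders: goflags.StringSlice{"User-Agent: katana", "Cookie: session=demo"},
	}
	for name, value := range options.ParseCustomHeaders() {
		fmt.Printf("%s: %s\n", name, value)
	}
}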

func (*Options) ParseHeadlessOptionalArguments added in v0.0.2

func (options *Options) ParseHeadlessOptionalArguments() map[string]string
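
Similarly undocumented; it presumably splits the "flag=value" entries accepted by the -headless-options CLI flag into a map. A hedged sketch (the key/value split is an assumption):

package main

import (
	"fmt"

	"github.com/projectdiscovery/goflags"
	"github.com/projectdiscovery/katana/pkg/types"
)

func main() {
	options := &types.Options{
		// extra arguments passed to headless chrome
		HeadlessOptionalArguments: goflags.StringSlice{"proxy-server=http://127.0.0.1:8080"},
	}
	for flag, value := range options.ParseHeadlessOptionalArguments() {
		fmt.Printf("%s=%s\n", flag, value)
	}
}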

func (*Options) ShouldResume added in v1.0.4

func (options *Options) ShouldResume() bool
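
Given the Resume field, this presumably reports whether a previous scan state should be loaded. A sketch with an illustrative filename:

package main

import (
	"github.com/projectdiscovery/gologger"
	"github.com/projectdiscovery/katana/pkg/types"
)

func main() {
	options := &types.Options{Resume: "resume.cfg"} // illustrative path
	if options.ShouldResume() {
		// presumably true only when the resume config file actually exists
		gologger.Info().Msg("resuming previous scan state")
	}
}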
