storage

package
v0.10.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 7, 2026 License: AGPL-3.0 Imports: 23 Imported by: 0

Documentation

Index

Constants

View Source
// Export JSONL record types: the string tag for each kind of record.
// NOTE(review): these values are presumably written into export files, so
// renaming one would likely need an ExportFormatVersion bump — confirm.
const (
	RecordMeta       = "meta"
	RecordPage       = "page"
	RecordLink       = "link"
	RecordRobots     = "robots"
	RecordSitemap    = "sitemap"
	RecordSitemapURL = "sitemap_url"
)

Export JSONL record types.

View Source
const AlterPagesV2 = `` /* 1379-byte string literal not displayed */

AlterPagesV2 adds new columns to existing pages table.

View Source
// AlterPagesV3 adds the pagerank column (Float64, default 0) to
// crawlobserver.pages, placed after found_on. It is the DDL of the
// "alter pages v3" migration; IF NOT EXISTS lets the statement be re-run
// once the column is already present.
const AlterPagesV3 = `
ALTER TABLE crawlobserver.pages
    ADD COLUMN IF NOT EXISTS pagerank Float64 DEFAULT 0 AFTER found_on
`
View Source
// AlterPagesV4 adds the body_truncated column (Bool, default false) to
// crawlobserver.pages, placed after body_html. It is the DDL of the
// "alter pages v4" migration.
const AlterPagesV4 = `
ALTER TABLE crawlobserver.pages
    ADD COLUMN IF NOT EXISTS body_truncated Bool DEFAULT false AFTER body_html
`
View Source
const AlterPagesV5 = `` /* 1456-byte string literal not displayed */
View Source
// AlterPagesV6 adds the content_hash column (UInt64, default 0) to
// crawlobserver.pages. It is positioned AFTER pagerank, so it relies on
// AlterPagesV3 having been applied first. It is the DDL of the
// "alter pages v6 content hash" migration.
const AlterPagesV6 = `
ALTER TABLE crawlobserver.pages
    ADD COLUMN IF NOT EXISTS content_hash UInt64 DEFAULT 0 AFTER pagerank
`
View Source
// AlterSessionsV2 adds a nullable project_id column (Nullable(String),
// default NULL) to crawlobserver.crawl_sessions. It is the DDL of the
// "alter sessions v2" migration.
const AlterSessionsV2 = `
ALTER TABLE crawlobserver.crawl_sessions
    ADD COLUMN IF NOT EXISTS project_id Nullable(String) DEFAULT NULL
`
View Source
const CreateApplicationLogs = `` /* 347-byte string literal not displayed */
View Source
const CreateCrawlSessions = `` /* 294-byte string literal not displayed */
View Source
// CreateDatabase creates the crawlobserver database if it does not already
// exist. It is the DDL of the "create database" migration.
const CreateDatabase = `CREATE DATABASE IF NOT EXISTS crawlobserver`
View Source
const CreateExternalLinkChecks = `` /* 363-byte string literal not displayed */
View Source
const CreateExtractions = `` /* 291-byte string literal not displayed */
View Source
const CreateGSCAnalytics = `` /* 405-byte string literal not displayed */
View Source
const CreateGSCInspection = `` /* 497-byte string literal not displayed */
View Source
const CreateLinks = `` /* 305-byte string literal not displayed */
View Source
const CreateLinksV2 = `` /* 338-byte string literal not displayed */
View Source
const CreatePageResourceChecks = `` /* 427-byte string literal not displayed */
View Source
const CreatePageResourceRefs = `` /* 304-byte string literal not displayed */
View Source
const CreatePages = `` /* 1287-byte string literal not displayed */
View Source
const CreatePagesV2 = `` /* 1357-byte string literal not displayed */

DDL for v2 tables partitioned by crawl_session_id.

View Source
const CreateProviderAPICalls = `` /* 423-byte string literal not displayed */
View Source
const CreateProviderBacklinks = `` /* 551-byte string literal not displayed */
View Source
const CreateProviderData = `` /* 605-byte string literal not displayed */
View Source
const CreateProviderDomainMetrics = `` /* 427-byte string literal not displayed */
View Source
const CreateProviderRankings = `` /* 451-byte string literal not displayed */
View Source
const CreateProviderRefDomains = `` /* 389-byte string literal not displayed */
View Source
const CreateProviderTopPages = `` /* 484-byte string literal not displayed */
View Source
const CreateProviderVisibility = `` /* 370-byte string literal not displayed */
View Source
const CreateRetryAttempts = `` /* 247-byte string literal not displayed */
View Source
const CreateRobotsTxt = `` /* 263-byte string literal not displayed */
View Source
const CreateRobotsTxtV2 = `` /* 296-byte string literal not displayed */
View Source
const CreateSitemapURLs = `` /* 265-byte string literal not displayed */
View Source
const CreateSitemapURLsV2 = `` /* 298-byte string literal not displayed */
View Source
const CreateSitemaps = `` /* 286-byte string literal not displayed */
View Source
const CreateSitemapsV2 = `` /* 319-byte string literal not displayed */
View Source
// ExportFormatVersion is the current export format version.
const ExportFormatVersion = 1

ExportFormatVersion is the current export format version.

Variables

View Source
// BacklinkFilters defines the allowed filter columns for provider_backlinks.
// Each key is a filter name; the FilterDef gives the column it targets and
// how the filter value is matched.
var BacklinkFilters = map[string]FilterDef{
	"source_url":    {Column: "source_url", Type: FilterLike},
	"target_url":    {Column: "target_url", Type: FilterLike},
	"anchor_text":   {Column: "anchor_text", Type: FilterLike},
	// trust_flow and citation_flow address differently named columns
	// (domain_rank, page_rank); BacklinkSortColumns uses the same mapping.
	"trust_flow":    {Column: "domain_rank", Type: FilterUint},
	"citation_flow": {Column: "page_rank", Type: FilterUint},
	"nofollow":      {Column: "nofollow", Type: FilterBool},
	// NOTE(review): first_seen / last_seen are matched with FilterLike
	// (substring match) rather than as dates — confirm this is intended.
	"first_seen":    {Column: "first_seen", Type: FilterLike},
	"last_seen":     {Column: "last_seen", Type: FilterLike},
}

BacklinkFilters defines the allowed filter columns for provider_backlinks.

View Source
// BacklinkSortColumns maps query param names to DB column names for
// provider_backlinks. The key set matches BacklinkFilters.
var BacklinkSortColumns = map[string]string{
	"source_url":    "source_url",
	"target_url":    "target_url",
	"anchor_text":   "anchor_text",
	// The two *_flow params sort on differently named columns.
	"trust_flow":    "domain_rank",
	"citation_flow": "page_rank",
	"nofollow":      "nofollow",
	"first_seen":    "first_seen",
	"last_seen":     "last_seen",
}

BacklinkSortColumns maps query param names to DB column names for provider_backlinks.

View Source
// ExternalCheckFilters defines the allowed filter columns for the
// external_link_checks table. Every key filters the column of the same name;
// only status_code is numeric, the rest are substring (FilterLike) filters.
var ExternalCheckFilters = map[string]FilterDef{
	"url":          {Column: "url", Type: FilterLike},
	"status_code":  {Column: "status_code", Type: FilterUint},
	"error":        {Column: "error", Type: FilterLike},
	"content_type": {Column: "content_type", Type: FilterLike},
	"redirect_url": {Column: "redirect_url", Type: FilterLike},
}

ExternalCheckFilters defines the allowed filter columns for the external_link_checks table.

View Source
// ExternalDomainCheckFilters defines the allowed filter columns for
// domain-level external checks. Only the domain itself can be filtered,
// as a substring (FilterLike) match.
var ExternalDomainCheckFilters = map[string]FilterDef{
	"domain": {Column: "domain", Type: FilterLike},
}

ExternalDomainCheckFilters defines the allowed filter columns for domain-level external checks.

View Source
// LinkFilters defines the allowed filter columns for the links table.
// All entries are substring (FilterLike) filters on the column of the
// same name.
var LinkFilters = map[string]FilterDef{
	"source_url":  {Column: "source_url", Type: FilterLike},
	"target_url":  {Column: "target_url", Type: FilterLike},
	"anchor_text": {Column: "anchor_text", Type: FilterLike},
	"rel":         {Column: "rel", Type: FilterLike},
	"tag":         {Column: "tag", Type: FilterLike},
}

LinkFilters defines the allowed filter columns for the links table.

View Source
// LinkSortColumns maps query param names to DB column names for links.
// Every param sorts on the column of the same name.
var LinkSortColumns = map[string]string{
	"source_url":  "source_url",
	"target_url":  "target_url",
	"anchor_text": "anchor_text",
	"rel":         "rel",
	"tag":         "tag",
	// crawled_at is sortable although LinkFilters has no matching filter.
	"crawled_at":  "crawled_at",
}

LinkSortColumns maps query param names to DB column names for links.

View Source
// Migrations is the ordered list of migrations. Each entry carries either a
// DDL string or a Fn (see Migration).
//
// Order is significant: later steps depend on earlier ones. For example
// "alter pages v6 content hash" positions its column AFTER pagerank, which
// is only added by "alter pages v3".
// NOTE(review): new steps are presumably meant to be appended at the end —
// confirm how the runner tracks applied migrations before reordering.
var Migrations = []Migration{
	{Name: "create database", DDL: CreateDatabase},
	{Name: "create crawl_sessions", DDL: CreateCrawlSessions},
	{Name: "create pages", DDL: CreatePages},
	{Name: "create links", DDL: CreateLinks},
	{Name: "alter pages v2", DDL: AlterPagesV2},
	{Name: "alter pages v3", DDL: AlterPagesV3},
	{Name: "alter pages v4", DDL: AlterPagesV4},
	{Name: "create robots_txt", DDL: CreateRobotsTxt},
	{Name: "alter sessions v2", DDL: AlterSessionsV2},
	{Name: "create sitemaps", DDL: CreateSitemaps},
	{Name: "create sitemap_urls", DDL: CreateSitemapURLs},
	// The only function-based step in the list; it runs Go code instead of
	// a single DDL statement.
	{Name: "repartition by session_id", Fn: migrateRepartitionBySession},
	{Name: "create gsc_analytics", DDL: CreateGSCAnalytics},
	{Name: "create gsc_inspection", DDL: CreateGSCInspection},
	{Name: "create external_link_checks", DDL: CreateExternalLinkChecks},
	{Name: "create application_logs", DDL: CreateApplicationLogs},
	{Name: "create provider_domain_metrics", DDL: CreateProviderDomainMetrics},
	{Name: "create provider_backlinks", DDL: CreateProviderBacklinks},
	{Name: "create provider_refdomains", DDL: CreateProviderRefDomains},
	{Name: "create provider_rankings", DDL: CreateProviderRankings},
	{Name: "create provider_visibility", DDL: CreateProviderVisibility},
	{Name: "create page_resource_checks", DDL: CreatePageResourceChecks},
	{Name: "create page_resource_refs", DDL: CreatePageResourceRefs},
	{Name: "alter pages v5 js rendering", DDL: AlterPagesV5},
	{Name: "alter pages v6 content hash", DDL: AlterPagesV6},
	{Name: "create provider_top_pages", DDL: CreateProviderTopPages},
	{Name: "create provider_api_calls", DDL: CreateProviderAPICalls},
	{Name: "create extractions", DDL: CreateExtractions},
	{Name: "create provider_data", DDL: CreateProviderData},
	// This step carries its DDL inline rather than through a named constant.
	{Name: "alter provider_backlinks add ttf_topic", DDL: `ALTER TABLE crawlobserver.provider_backlinks ADD COLUMN IF NOT EXISTS source_ttf_topic String DEFAULT ''`},
	{Name: "create retry_attempts", DDL: CreateRetryAttempts},
}

Migrations is the ordered list of migrations.

View Source
// PageFilters defines the allowed filter columns for the pages table.
// Entries are grouped by filter type: substring (FilterLike), numeric
// (FilterUint), boolean (FilterBool) and array (FilterArray). Every key
// filters the column of the same name.
var PageFilters = map[string]FilterDef{
	"url":                {Column: "url", Type: FilterLike},
	"content_type":       {Column: "content_type", Type: FilterLike},
	"title":              {Column: "title", Type: FilterLike},
	"canonical":          {Column: "canonical", Type: FilterLike},
	"meta_robots":        {Column: "meta_robots", Type: FilterLike},
	"meta_description":   {Column: "meta_description", Type: FilterLike},
	"meta_keywords":      {Column: "meta_keywords", Type: FilterLike},
	"lang":               {Column: "lang", Type: FilterLike},
	"og_title":           {Column: "og_title", Type: FilterLike},
	"content_encoding":   {Column: "content_encoding", Type: FilterLike},
	"index_reason":       {Column: "index_reason", Type: FilterLike},
	"error":              {Column: "error", Type: FilterLike},
	"found_on":           {Column: "found_on", Type: FilterLike},
	"status_code":        {Column: "status_code", Type: FilterUint},
	"title_length":       {Column: "title_length", Type: FilterUint},
	"meta_desc_length":   {Column: "meta_desc_length", Type: FilterUint},
	"depth":              {Column: "depth", Type: FilterUint},
	"word_count":         {Column: "word_count", Type: FilterUint},
	"internal_links_out": {Column: "internal_links_out", Type: FilterUint},
	"external_links_out": {Column: "external_links_out", Type: FilterUint},
	"images_count":       {Column: "images_count", Type: FilterUint},
	"images_no_alt":      {Column: "images_no_alt", Type: FilterUint},
	"body_size":          {Column: "body_size", Type: FilterUint},
	"fetch_duration_ms":  {Column: "fetch_duration_ms", Type: FilterUint},
	"is_indexable":       {Column: "is_indexable", Type: FilterBool},
	"canonical_is_self":  {Column: "canonical_is_self", Type: FilterBool},
	"h1":                 {Column: "h1", Type: FilterArray},
	"h2":                 {Column: "h2", Type: FilterArray},
	// NOTE(review): AlterPagesV3 declares pagerank as Float64, yet it is
	// filtered as FilterUint here; fractional filter values may be rejected
	// or truncated — confirm against BuildWhereClause / the filter parser.
	"pagerank":           {Column: "pagerank", Type: FilterUint},
}

PageFilters defines the allowed filter columns for the pages table.

View Source
// PageResourceCheckFilters defines the allowed filter columns for
// page_resource_checks. Every key filters the column of the same name.
var PageResourceCheckFilters = map[string]FilterDef{
	"url":           {Column: "url", Type: FilterLike},
	"resource_type": {Column: "resource_type", Type: FilterLike},
	"is_internal":   {Column: "is_internal", Type: FilterBool},
	"status_code":   {Column: "status_code", Type: FilterUint},
	"content_type":  {Column: "content_type", Type: FilterLike},
	"error":         {Column: "error", Type: FilterLike},
}

PageResourceCheckFilters defines the allowed filter columns for page_resource_checks.

View Source
// PageSortColumns maps query param names to DB column names for pages.
// Every param sorts on the column of the same name. The sortable set is not
// identical to PageFilters: for instance crawled_at appears only here, while
// found_on, h1 and h2 appear only in PageFilters.
var PageSortColumns = map[string]string{
	"url":                "url",
	"status_code":        "status_code",
	"title":              "title",
	"title_length":       "title_length",
	"word_count":         "word_count",
	"internal_links_out": "internal_links_out",
	"external_links_out": "external_links_out",
	"body_size":          "body_size",
	"fetch_duration_ms":  "fetch_duration_ms",
	"depth":              "depth",
	"pagerank":           "pagerank",
	"content_type":       "content_type",
	"meta_description":   "meta_description",
	"meta_desc_length":   "meta_desc_length",
	"meta_keywords":      "meta_keywords",
	"canonical":          "canonical",
	"is_indexable":       "is_indexable",
	"index_reason":       "index_reason",
	"meta_robots":        "meta_robots",
	"canonical_is_self":  "canonical_is_self",
	"images_count":       "images_count",
	"images_no_alt":      "images_no_alt",
	"content_encoding":   "content_encoding",
	"lang":               "lang",
	"og_title":           "og_title",
	"crawled_at":         "crawled_at",
}

PageSortColumns maps query param names to DB column names for pages.

View Source
// ProviderDataFilters defines the allowed filter columns for the
// provider_data table. Column is not always a bare column name: title,
// language and topic are looked up by key in str_data.
var ProviderDataFilters = map[string]FilterDef{
	"item_url":      {Column: "item_url", Type: FilterLike},
	"title":         {Column: "str_data['title']", Type: FilterLike},
	"language":      {Column: "str_data['language']", Type: FilterLike},
	"trust_flow":    {Column: "trust_flow", Type: FilterUint},
	"citation_flow": {Column: "citation_flow", Type: FilterUint},
	"ext_backlinks": {Column: "ext_backlinks", Type: FilterUint},
	"ref_domains":   {Column: "ref_domains", Type: FilterUint},
	// The public name "topic" reads the ttf_topic_0 key of str_data.
	"topic":         {Column: "str_data['ttf_topic_0']", Type: FilterLike},
}

ProviderDataFilters defines the allowed filter columns for the provider_data table.

View Source
// ProviderDataSortColumns maps query param names to DB column names for
// provider_data. Only plain columns are sortable; the str_data-backed
// filters (title, language, topic) have no sort counterpart.
var ProviderDataSortColumns = map[string]string{
	"item_url":      "item_url",
	"trust_flow":    "trust_flow",
	"citation_flow": "citation_flow",
	"ext_backlinks": "ext_backlinks",
	"ref_domains":   "ref_domains",
	// domain_rank is sortable but has no entry in ProviderDataFilters.
	"domain_rank":   "domain_rank",
}

ProviderDataSortColumns maps query param names to DB column names for provider_data.

View Source
// RedirectFilters defines the allowed filter columns for the redirect pages
// view. Columns are qualified with the "p." prefix.
// NOTE(review): this presumably requires the consuming query to alias its
// pages source as p — confirm against that query.
var RedirectFilters = map[string]FilterDef{
	"url":         {Column: "p.url", Type: FilterLike},
	"status_code": {Column: "p.status_code", Type: FilterUint},
	"final_url":   {Column: "p.final_url", Type: FilterLike},
}

RedirectFilters defines the allowed filter columns for the redirect pages view.

View Source
// RedirectSortColumns maps query param names to DB column names for
// redirect pages. The first three entries use the same "p."-qualified
// columns as RedirectFilters.
var RedirectSortColumns = map[string]string{
	"url":                    "p.url",
	"status_code":            "p.status_code",
	"final_url":              "p.final_url",
	// Unqualified and sort-only (no matching filter); presumably a column
	// computed by the redirect query itself — TODO confirm.
	"inbound_internal_links": "inbound_internal_links",
}

RedirectSortColumns maps query param names to DB column names for redirect pages.

Functions

func BuildOrderByClause added in v0.2.0

func BuildOrderByClause(sort *SortParam, defaultOrderBy string) string

BuildOrderByClause returns an ORDER BY clause using the sort param or the default.

func BuildWhereClause

func BuildWhereClause(filters []ParsedFilter) (string, []interface{}, error)

BuildWhereClause generates a SQL WHERE clause fragment and arguments from parsed filters.

Types

type AnchorCount

// AnchorCount is an anchor text + count. It is the element type of
// AuditLinks.TopAnchors.
type AnchorCount struct {
	Anchor string `json:"anchor"`
	Count  uint64 `json:"count"`
}

AnchorCount is an anchor text + count.

type AuditContent

// AuditContent holds content-related audit metrics. Every field is an
// unsigned counter serialised under the snake_case key in its json tag.
// NOTE(review): the thresholds behind the "too long", "too short" and
// "thin" buckets are not defined by this type; see the query that fills it.
type AuditContent struct {
	Total                uint64 `json:"total"`
	HTMLPages            uint64 `json:"html_pages"`
	TitleMissing         uint64 `json:"title_missing"`
	TitleTooLong         uint64 `json:"title_too_long"`
	TitleTooShort        uint64 `json:"title_too_short"`
	TitleDuplicates      uint64 `json:"title_duplicates"`
	MetaDescMissing      uint64 `json:"meta_desc_missing"`
	MetaDescTooLong      uint64 `json:"meta_desc_too_long"`
	MetaDescTooShort     uint64 `json:"meta_desc_too_short"`
	H1Missing            uint64 `json:"h1_missing"`
	H1Multiple           uint64 `json:"h1_multiple"`
	// The two Thin* names suggest word-count ranges (<100 and 100–300) —
	// TODO confirm.
	ThinUnder100         uint64 `json:"thin_under_100"`
	Thin100300           uint64 `json:"thin_100_300"`
	ImagesTotal          uint64 `json:"images_total"`
	ImagesNoAltTotal     uint64 `json:"images_no_alt_total"`
	PagesWithImagesNoAlt uint64 `json:"pages_with_images_no_alt"`
}

AuditContent holds content-related audit metrics.

type AuditInternational

// AuditInternational holds international/schema audit metrics: three page
// counters plus a per-language and a per-schema distribution.
type AuditInternational struct {
	PagesWithHreflang  uint64        `json:"pages_with_hreflang"`
	PagesWithLang      uint64        `json:"pages_with_lang"`
	PagesWithSchema    uint64        `json:"pages_with_schema"`
	// The slices carry no omitempty, so a nil slice is encoded as JSON null.
	LangDistribution   []LangCount   `json:"lang_distribution"`
	SchemaDistribution []SchemaCount `json:"schema_distribution"`
}

AuditInternational holds international/schema audit metrics.

// AuditLinks holds link audit metrics: link totals, page-level counters and
// the TopExternalDomains / TopAnchors lists.
type AuditLinks struct {
	TotalInternal        uint64           `json:"total_internal"`
	TotalExternal        uint64           `json:"total_external"`
	ExternalNofollow     uint64           `json:"external_nofollow"`
	ExternalDofollow     uint64           `json:"external_dofollow"`
	PagesNoInternalOut   uint64           `json:"pages_no_internal_out"`
	// NOTE(review): the cut-off for "high" is not visible in this type.
	PagesHighInternalOut uint64           `json:"pages_high_internal_out"`
	PagesNoExternal      uint64           `json:"pages_no_external"`
	BrokenInternal       uint64           `json:"broken_internal"`
	TopExternalDomains   []ExternalDomain `json:"top_external_domains"`
	TopAnchors           []AnchorCount    `json:"top_anchors"`
}

AuditLinks holds link audit metrics.

type AuditResult

// AuditResult is the combined audit result, with one section per audit area.
// Every section is a pointer without omitempty, so a nil section is encoded
// as JSON null rather than being left out.
type AuditResult struct {
	Content       *AuditContent       `json:"content"`
	Technical     *AuditTechnical     `json:"technical"`
	Links         *AuditLinks         `json:"links"`
	Structure     *AuditStructure     `json:"structure"`
	Sitemaps      *AuditSitemaps      `json:"sitemaps"`
	International *AuditInternational `json:"international"`
}

AuditResult is the combined audit result.

type AuditSitemaps

// AuditSitemaps holds sitemap coverage audit metrics.
// NOTE(review): InBoth / CrawledOnly / SitemapOnly look like a split of URLs
// by whether they were crawled, listed in a sitemap, or both — confirm
// against the producing query.
type AuditSitemaps struct {
	InBoth           uint64 `json:"in_both"`
	CrawledOnly      uint64 `json:"crawled_only"`
	SitemapOnly      uint64 `json:"sitemap_only"`
	TotalSitemapURLs uint64 `json:"total_sitemap_urls"`
}

AuditSitemaps holds sitemap coverage audit metrics.

type AuditStructure

// AuditStructure holds site structure audit metrics: a list of per-directory
// counts and an orphan page counter.
type AuditStructure struct {
	Directories []DirectoryCount `json:"directories"`
	OrphanPages uint64           `json:"orphan_pages"`
}

AuditStructure holds site structure audit metrics.

type AuditTechnical

// AuditTechnical holds technical audit metrics: indexability, canonical,
// redirect, response-time and error counters, plus two breakdown lists.
// NOTE(review): the response-time cut-offs separating Fast / OK / Slow /
// VerySlow are not visible in this type.
type AuditTechnical struct {
	Indexable           uint64             `json:"indexable"`
	NonIndexable        uint64             `json:"non_indexable"`
	CanonicalSelf       uint64             `json:"canonical_self"`
	CanonicalOther      uint64             `json:"canonical_other"`
	CanonicalMissing    uint64             `json:"canonical_missing"`
	HasRedirect         uint64             `json:"has_redirect"`
	RedirectChainsOver2 uint64             `json:"redirect_chains_over_2"`
	ResponseFast        uint64             `json:"response_fast"`
	ResponseOK          uint64             `json:"response_ok"`
	ResponseSlow        uint64             `json:"response_slow"`
	ResponseVerySlow    uint64             `json:"response_very_slow"`
	ErrorPages          uint64             `json:"error_pages"`
	NoindexReasons      []NoindexReason    `json:"noindex_reasons"`
	ContentTypes        []ContentTypeCount `json:"content_types"`
}

AuditTechnical holds technical audit metrics.

type BFSResult

// BFSResult holds the output of a BFS depth computation; it is the return
// type of ComputeBFSDepths. Both maps are keyed by URL.
type BFSResult struct {
	// Depths is the depth per URL. ComputeBFSDepths documents seeds as
	// depth 0 and unreachable URLs (orphans) as maxDepth+1.
	Depths  map[string]uint16
	// FoundOn is the found_on value per URL; presumably the page through
	// which the BFS first reached that URL — TODO confirm.
	FoundOn map[string]string
}

BFSResult holds the output of a BFS depth computation. (RecomputeDepths, documented separately, runs a BFS from seed URLs and updates depth/found_on in the pages table.)

func ComputeBFSDepths

func ComputeBFSDepths(seedURLs []string, crawledSet map[string]bool, adj map[string][]string) BFSResult

ComputeBFSDepths runs BFS from seedURLs over the link graph and returns the depth and found_on for every URL in crawledSet. Seeds get depth 0. Orphans (unreachable) get maxDepth+1.

type Buffer

type Buffer struct {
	// contains filtered or unexported fields
}

Buffer accumulates rows and flushes them in batches.

func NewBuffer

func NewBuffer(store PageLinkInserter, batchSize int, flushInterval time.Duration, sessionID string) *Buffer

NewBuffer creates a new write buffer.

func (*Buffer) AddExtractions added in v0.3.0

func (b *Buffer) AddExtractions(rows []extraction.ExtractionRow)

AddExtractions adds extraction rows to the buffer.

func (b *Buffer) AddLinks(links []LinkRow)

AddLinks adds link rows to the buffer.

func (*Buffer) AddPage

func (b *Buffer) AddPage(page PageRow)

AddPage adds a page row to the buffer.

func (*Buffer) Close

func (b *Buffer) Close()

Close flushes remaining data and stops the flush loop.

func (*Buffer) ErrorState

func (b *Buffer) ErrorState() BufferErrorState

ErrorState returns the current error state of the buffer for monitoring.

func (*Buffer) Flush

func (b *Buffer) Flush()

Flush writes all buffered data to ClickHouse, retrying previously failed batches first.

func (*Buffer) PageCount

func (b *Buffer) PageCount() int

PageCount returns the number of buffered pages.

func (*Buffer) SetOnDataLost

func (b *Buffer) SetOnDataLost(fn func(lostPages, lostLinks int64))

SetOnDataLost registers a callback invoked (outside the lock) whenever data is dropped.

type BufferErrorState

// BufferErrorState exposes retry/loss counters for monitoring. It is the
// value returned by Buffer.ErrorState.
type BufferErrorState struct {
	LostPages    int64 `json:"lost_pages"`
	LostLinks    int64 `json:"lost_links"`
	PendingPages int   `json:"pending_retry_pages"`
	PendingLinks int   `json:"pending_retry_links"`
	// NOTE(review): LastError is an error interface value. encoding/json
	// encodes most error values as {} because they export no fields, so the
	// message is likely lost when this struct is marshalled — confirm.
	LastError    error `json:"last_error,omitempty"`
}

BufferErrorState exposes retry/loss counters for monitoring.

type CSVImportResult added in v0.9.0

// CSVImportResult holds the outcome of a CSV import: the session involved
// and how many rows were imported or skipped.
type CSVImportResult struct {
	// NOTE(review): CrawlSession declares no json tags, so the nested
	// object would be encoded with Go field names unless it has a custom
	// marshaller — confirm.
	Session      *CrawlSession `json:"session"`
	RowsImported int           `json:"rows_imported"`
	RowsSkipped  int           `json:"rows_skipped"`
}

CSVImportResult holds the outcome of a CSV import.

type CSVSource added in v0.9.0

// CSVSource identifies the CSV format variant.
type CSVSource string

CSVSource identifies the CSV format variant.

// CSVSource values declared by this package.
const (
	CSVSourceAddressBased CSVSource = "address-based"
	CSVSourceURLBased     CSVSource = "url-based"
)

type CompareStatsResult

// CompareStatsResult holds side-by-side stats for two sessions, labelled A
// and B. StatsA and StatsB are pointers without omitempty, so a nil value is
// encoded as JSON null.
type CompareStatsResult struct {
	SessionA string        `json:"session_a"`
	SessionB string        `json:"session_b"`
	StatsA   *SessionStats `json:"stats_a"`
	StatsB   *SessionStats `json:"stats_b"`
}

CompareStatsResult holds side-by-side stats for two sessions.

type ContentTypeCount

// ContentTypeCount is a content type + count. It is the element type of
// AuditTechnical.ContentTypes.
type ContentTypeCount struct {
	ContentType string `json:"content_type"`
	Count       uint64 `json:"count"`
}

ContentTypeCount is a content type + count.

type CrawlSession

// CrawlSession represents a crawl session. The struct declares no json
// tags.
type CrawlSession struct {
	ID           string
	StartedAt    time.Time
	FinishedAt   time.Time
	Status       string // running, completed, failed, stopped
	SeedURLs     []string
	Config       string // JSON
	PagesCrawled uint64
	UserAgent    string
	// ProjectID is optional (nil when unset); presumably backed by the
	// Nullable(String) project_id column added in AlterSessionsV2.
	ProjectID    *string
}

CrawlSession represents a crawl session.

type DirectoryCount

// DirectoryCount is a URL directory prefix + count. It is the element type
// of AuditStructure.Directories.
type DirectoryCount struct {
	Directory string `json:"directory"`
	Count     uint64 `json:"count"`
}

DirectoryCount is a URL directory prefix + count.

type ExpiredDomain

// ExpiredDomain represents a registrable domain where all checked URLs had
// DNS failures, together with the source/target link pairs that point at it.
type ExpiredDomain struct {
	RegistrableDomain string                `json:"registrable_domain"`
	DeadURLsChecked   uint64                `json:"dead_urls_checked"`
	Sources           []ExpiredDomainSource `json:"sources"`
}

ExpiredDomain represents a registrable domain where all checked URLs had DNS failures.

type ExpiredDomainSource

// ExpiredDomainSource represents a source page linking to an expired domain:
// the linking page and the URL it links to.
type ExpiredDomainSource struct {
	SourceURL string `json:"source_url"`
	TargetURL string `json:"target_url"`
}

ExpiredDomainSource represents a source page linking to an expired domain.

type ExpiredDomainsResult

// ExpiredDomainsResult wraps paginated expired domain results.
type ExpiredDomainsResult struct {
	Domains []ExpiredDomain `json:"domains"`
	// Total is carried separately from Domains; since the result is
	// paginated it presumably counts all matches, not just this page —
	// TODO confirm.
	Total   uint64          `json:"total"`
}

ExpiredDomainsResult wraps paginated expired domain results.

type ExternalDomain

// ExternalDomain is a domain + link count. It is the element type of
// AuditLinks.TopExternalDomains.
type ExternalDomain struct {
	Domain string `json:"domain"`
	Count  uint64 `json:"count"`
}

ExternalDomain is a domain + link count.

type ExternalDomainCheck

// ExternalDomainCheck represents aggregated external check stats per domain:
// a URL total, per-outcome counters and an average response time.
// NOTE(review): how URLs are assigned to OK / Redirects / ClientErrors /
// ServerErrors / Unreachable is decided by the producing query, not here.
type ExternalDomainCheck struct {
	Domain        string `json:"domain"`
	TotalURLs     uint64 `json:"total_urls"`
	OK            uint64 `json:"ok"`
	Redirects     uint64 `json:"redirects"`
	ClientErrors  uint64 `json:"client_errors"`
	ServerErrors  uint64 `json:"server_errors"`
	Unreachable   uint64 `json:"unreachable"`
	AvgResponseMs uint32 `json:"avg_response_ms"`
}

ExternalDomainCheck represents aggregated external check stats per domain.

type ExternalLinkCheck

// ExternalLinkCheck represents a single external URL check result, tied to
// a crawl session. The url, status_code, error, content_type and
// redirect_url keys are the ones ExternalCheckFilters allows filtering on.
type ExternalLinkCheck struct {
	CrawlSessionID string    `json:"crawl_session_id"`
	URL            string    `json:"url"`
	StatusCode     uint16    `json:"status_code"`
	Error          string    `json:"error"`
	ContentType    string    `json:"content_type"`
	RedirectURL    string    `json:"redirect_url"`
	ResponseTimeMs uint32    `json:"response_time_ms"`
	CheckedAt      time.Time `json:"checked_at"`
}

ExternalLinkCheck represents a single external URL check result.

type ExtractionQueryRow added in v0.3.0

// ExtractionQueryRow is an extraction row for query results. It is a
// separate type from extraction.ExtractionRow, which is what
// Buffer.AddExtractions and PageLinkInserter.InsertExtractions accept.
type ExtractionQueryRow struct {
	URL           string
	ExtractorName string
	Value         string
}

ExtractionQueryRow is an extraction row for query results.

type FilterDef

// FilterDef describes one allowed filter, as used by the *Filters maps in
// this package.
type FilterDef struct {
	// Column is the SQL column or expression the filter applies to. The
	// filter maps use bare names ("url"), qualified names ("p.url") and
	// keyed lookups ("str_data['title']").
	Column string
	// Type selects how the filter value is matched (see FilterType).
	Type   FilterType
}

type FilterType

// FilterType enumerates the ways a filter value can be matched against its
// column.
type FilterType int
// Supported filter kinds; the trailing comment on each constant gives the
// SQL shape it stands for.
const (
	FilterLike  FilterType = iota // String → ILIKE '%val%'
	FilterUint                    // Numeric → =N, >N, <N, >=N, <=N
	FilterBool                    // Bool → = true/false
	FilterArray                   // Array(String) → arrayExists(x -> x ILIKE '%val%', col)
)

type GSCAnalyticsInsertRow

// GSCAnalyticsInsertRow is the input row for batch inserts. It uses 32-bit
// counters and floats, whereas the GSC*Row query types in this package use
// 64-bit ones.
type GSCAnalyticsInsertRow struct {
	Date        time.Time
	Query       string
	Page        string
	Country     string
	Device      string
	Clicks      uint32
	Impressions uint32
	CTR         float32
	Position    float32
}

GSCAnalyticsInsertRow is the input row for batch inserts.

type GSCCountryRow

// GSCCountryRow is one row of GSC metrics (clicks, impressions, CTR,
// position) keyed by country. It has the same metric fields as GSCDeviceRow,
// GSCPageRow and GSCQueryRow.
type GSCCountryRow struct {
	Country     string  `json:"country"`
	Clicks      uint64  `json:"clicks"`
	Impressions uint64  `json:"impressions"`
	CTR         float64 `json:"ctr"`
	Position    float64 `json:"position"`
}

type GSCDeviceRow

// GSCDeviceRow is one row of GSC metrics (clicks, impressions, CTR,
// position) keyed by device.
type GSCDeviceRow struct {
	Device      string  `json:"device"`
	Clicks      uint64  `json:"clicks"`
	Impressions uint64  `json:"impressions"`
	CTR         float64 `json:"ctr"`
	Position    float64 `json:"position"`
}

type GSCInspectionInsertRow

// GSCInspectionInsertRow carries the same fields as GSCInspectionRow but
// without json tags and with LastCrawlTime as a time.Time instead of a
// string. By analogy with GSCAnalyticsInsertRow it is presumably the input
// row for inserts — TODO confirm.
type GSCInspectionInsertRow struct {
	URL               string
	Verdict           string
	CoverageState     string
	IndexingState     string
	RobotsTxtState    string
	LastCrawlTime     time.Time
	CrawledAs         string
	CanonicalURL      string
	IsGoogleCanonical bool
	MobileUsability   string
	RichResultsItems  uint16
}

type GSCInspectionRow

// GSCInspectionRow is the JSON-tagged form of a GSC inspection record for
// one URL.
type GSCInspectionRow struct {
	URL               string `json:"url"`
	Verdict           string `json:"verdict"`
	CoverageState     string `json:"coverage_state"`
	IndexingState     string `json:"indexing_state"`
	RobotsTxtState    string `json:"robots_txt_state"`
	// A string here, unlike the time.Time in GSCInspectionInsertRow; the
	// string format is not visible in this type.
	LastCrawlTime     string `json:"last_crawl_time"`
	CrawledAs         string `json:"crawled_as"`
	CanonicalURL      string `json:"canonical_url"`
	IsGoogleCanonical bool   `json:"is_google_canonical"`
	MobileUsability   string `json:"mobile_usability"`
	RichResultsItems  uint16 `json:"rich_results_items"`
}

type GSCOverviewStats

// GSCOverviewStats holds overall GSC totals and averages, the covered date
// range, and query/page counts.
type GSCOverviewStats struct {
	TotalClicks      uint64  `json:"total_clicks"`
	TotalImpressions uint64  `json:"total_impressions"`
	AvgCTR           float64 `json:"avg_ctr"`
	AvgPosition      float64 `json:"avg_position"`
	// DateMin / DateMax are strings; their format is not visible here.
	DateMin          string  `json:"date_min"`
	DateMax          string  `json:"date_max"`
	TotalQueries     uint64  `json:"total_queries"`
	TotalPages       uint64  `json:"total_pages"`
}

type GSCPageRow

// GSCPageRow is one row of GSC metrics (clicks, impressions, CTR, position)
// keyed by page.
type GSCPageRow struct {
	Page        string  `json:"page"`
	Clicks      uint64  `json:"clicks"`
	Impressions uint64  `json:"impressions"`
	CTR         float64 `json:"ctr"`
	Position    float64 `json:"position"`
}

type GSCQueryRow

// GSCQueryRow is one row of GSC metrics (clicks, impressions, CTR, position)
// keyed by search query.
type GSCQueryRow struct {
	Query       string  `json:"query"`
	Clicks      uint64  `json:"clicks"`
	Impressions uint64  `json:"impressions"`
	CTR         float64 `json:"ctr"`
	Position    float64 `json:"position"`
}

type GSCTimelineRow

// GSCTimelineRow is one point of a GSC timeline: clicks and impressions for
// a date. Date is a string whose format is not visible in this type.
type GSCTimelineRow struct {
	Date        string `json:"date"`
	Clicks      uint64 `json:"clicks"`
	Impressions uint64 `json:"impressions"`
}

type GlobalSessionStats

// GlobalSessionStats holds aggregated stats for a single session: page,
// link and error counts and the average fetch time in milliseconds.
type GlobalSessionStats struct {
	SessionID  string  `json:"session_id"`
	TotalPages uint64  `json:"total_pages"`
	TotalLinks uint64  `json:"total_links"`
	ErrorCount uint64  `json:"error_count"`
	AvgFetchMs float64 `json:"avg_fetch_ms"`
}

GlobalSessionStats holds aggregated stats for a single session.

type HreflangRow

// HreflangRow represents a hreflang entry: a language value and the URL it
// refers to.
type HreflangRow struct {
	Lang string
	URL  string
}

HreflangRow represents a hreflang entry.

type LangCount

// LangCount is a language + count. It is the element type of
// AuditInternational.LangDistribution.
type LangCount struct {
	Lang  string `json:"lang"`
	Count uint64 `json:"count"`
}

LangCount is a language + count.

type LinkDiffResult

// LinkDiffResult wraps paginated link diff results.
type LinkDiffResult struct {
	Links        []LinkDiffRow `json:"links"`
	// The totals are carried separately from Links; since the result is
	// paginated they presumably cover the whole diff — TODO confirm.
	TotalAdded   uint64        `json:"total_added"`
	TotalRemoved uint64        `json:"total_removed"`
}

LinkDiffResult wraps paginated link diff results.

type LinkDiffRow

// LinkDiffRow represents a single internal link difference.
type LinkDiffRow struct {
	SourceURL  string `json:"source_url"`
	TargetURL  string `json:"target_url"`
	AnchorText string `json:"anchor_text"`
	// The set of DiffType values is not visible here; LinkDiffResult counts
	// added and removed links, which suggests those two kinds — confirm.
	DiffType   string `json:"diff_type"`
}

LinkDiffRow represents a single internal link difference.

type LinkRow

// LinkRow represents a link for storage. It is the row type accepted by
// Buffer.AddLinks and PageLinkInserter.InsertLinks. The struct declares no
// json tags.
type LinkRow struct {
	CrawlSessionID string
	SourceURL      string
	TargetURL      string
	AnchorText     string
	Rel            string
	IsInternal     bool
	Tag            string
	CrawledAt      time.Time
}

LinkRow represents a link for storage.

type Migration

// Migration represents a schema migration step. Either DDL (a SQL string)
// or Fn (a function) must be set, not both.
type Migration struct {
	// Name labels the step (for example "create pages" in Migrations).
	Name string
	// DDL is the SQL statement for a statement-based step.
	DDL  string
	// Fn is the function for a code-based step; it receives a context and
	// a driver connection.
	Fn   func(ctx context.Context, conn driver.Conn) error
}

Migration represents a schema migration step. Either DDL (a SQL string) or Fn (a function) must be set, not both.

type NearDuplicatePair

// NearDuplicatePair represents two pages with near-identical content. The
// a/b suffixed fields hold the same attributes for each page of the pair.
type NearDuplicatePair struct {
	URLa       string  `json:"url_a"`
	URLb       string  `json:"url_b"`
	TitleA     string  `json:"title_a"`
	TitleB     string  `json:"title_b"`
	CanonicalA string  `json:"canonical_a"`
	CanonicalB string  `json:"canonical_b"`
	WordCountA uint32  `json:"word_count_a"`
	WordCountB uint32  `json:"word_count_b"`
	Similarity float64 `json:"similarity"` // 0–1, 1 = exact duplicate
}

NearDuplicatePair represents two pages with near-identical content.

type NearDuplicatesResult

// NearDuplicatesResult wraps paginated near-duplicate results.
type NearDuplicatesResult struct {
	Pairs []NearDuplicatePair `json:"pairs"`
	// Total is carried separately from Pairs; since the result is paginated
	// it presumably counts all pairs, not just this page — TODO confirm.
	Total uint64              `json:"total"`
}

NearDuplicatesResult wraps paginated near-duplicate results.

type NoindexReason

// NoindexReason is a reason + count for non-indexable pages. It is the
// element type of AuditTechnical.NoindexReasons.
type NoindexReason struct {
	Reason string `json:"reason"`
	Count  uint64 `json:"count"`
}

NoindexReason is a reason + count for non-indexable pages.

type PageBody

// PageBody holds a page URL and its raw HTML body for reprocessing.
// It has the same shape as PageHTMLRow but is a distinct type.
type PageBody struct {
	URL      string
	BodyHTML string
}

PageBody holds a page URL and its raw HTML body for reprocessing.

type PageDiffResult

// PageDiffResult wraps paginated page diff results, with separate totals
// for added, removed and changed pages.
type PageDiffResult struct {
	Pages        []PageDiffRow `json:"pages"`
	// Since the result is paginated, the totals presumably cover the whole
	// diff rather than just Pages — TODO confirm.
	TotalAdded   uint64        `json:"total_added"`
	TotalRemoved uint64        `json:"total_removed"`
	TotalChanged uint64        `json:"total_changed"`
}

PageDiffResult wraps paginated page diff results.

type PageDiffRow

// PageDiffRow represents a single page difference between two crawls. The
// fields ending in A and B hold the same nine attributes for each of the
// two crawls.
type PageDiffRow struct {
	URL              string  `json:"url"`
	// The set of DiffType values is not visible here; PageDiffResult counts
	// added, removed and changed pages, which suggests those kinds — confirm.
	DiffType         string  `json:"diff_type"`
	StatusCodeA      uint16  `json:"status_code_a"`
	TitleA           string  `json:"title_a"`
	CanonicalA       string  `json:"canonical_a"`
	IsIndexableA     bool    `json:"is_indexable_a"`
	WordCountA       uint32  `json:"word_count_a"`
	DepthA           uint16  `json:"depth_a"`
	PageRankA        float64 `json:"pagerank_a"`
	MetaDescriptionA string  `json:"meta_description_a"`
	// NOTE(review): H1A / H1B are plain strings, whereas PageFilters treats
	// h1 as an array column; presumably a single or joined value — confirm.
	H1A              string  `json:"h1_a"`
	StatusCodeB      uint16  `json:"status_code_b"`
	TitleB           string  `json:"title_b"`
	CanonicalB       string  `json:"canonical_b"`
	IsIndexableB     bool    `json:"is_indexable_b"`
	WordCountB       uint32  `json:"word_count_b"`
	DepthB           uint16  `json:"depth_b"`
	PageRankB        float64 `json:"pagerank_b"`
	MetaDescriptionB string  `json:"meta_description_b"`
	H1B              string  `json:"h1_b"`
}

PageDiffRow represents a single page difference between two crawls.

type PageHTMLRow

// PageHTMLRow is a url+html pair streamed from ClickHouse.
type PageHTMLRow struct {
	URL  string
	HTML string
}

PageHTMLRow is a url+html pair streamed from ClickHouse.

type PageLinkInserter

// PageLinkInserter is the subset of Store used by Buffer for flushing data.
// NewBuffer takes it as its store argument. Each method inserts one batch of
// rows and reports failure through its error result.
type PageLinkInserter interface {
	InsertPages(ctx context.Context, pages []PageRow) error
	InsertLinks(ctx context.Context, links []LinkRow) error
	InsertExtractions(ctx context.Context, rows []extraction.ExtractionRow) error
}

PageLinkInserter is the subset of Store used by Buffer for flushing data.

type PageLinksResult

// PageLinksResult holds outbound links, inbound links (paginated), and
// counts.
// NOTE(review): LinkRow declares no json tags, so the nested link objects
// would be encoded with Go field names (e.g. "SourceURL") unless LinkRow has
// a custom marshaller — confirm.
type PageLinksResult struct {
	OutLinks      []LinkRow `json:"out_links"`
	InLinks       []LinkRow `json:"in_links"`
	OutLinksCount uint64    `json:"out_links_count"`
	InLinksCount  uint64    `json:"in_links_count"`
}

PageLinksResult holds outbound links, inbound links (paginated), and counts.

type PageRankBucket

// PageRankBucket holds one histogram bucket for PageRank distribution: its
// Min/Max bounds, the number of entries in it and their average PageRank.
// NOTE(review): whether the bounds are inclusive is not visible here.
type PageRankBucket struct {
	Min   float64 `json:"min"`
	Max   float64 `json:"max"`
	Count uint64  `json:"count"`
	AvgPR float64 `json:"avg_pr"`
}

PageRankBucket holds one histogram bucket for PageRank distribution.

type PageRankDistributionResult

// PageRankDistributionResult holds the full distribution response returned by
// Store.PageRankDistribution.
type PageRankDistributionResult struct {
	// Buckets is the histogram; the caller passes the bucket count to
	// Store.PageRankDistribution.
	Buckets     []PageRankBucket `json:"buckets"`
	// NOTE(review): presumably the number of pages that have a PageRank value —
	// confirm the exact predicate in the query.
	TotalWithPR uint64           `json:"total_with_pr"`
	// Summary statistics of the PageRank values.
	// NOTE(review): P90/P99 are presumably the 90th/99th percentiles — confirm.
	Avg         float64          `json:"avg"`
	Median      float64          `json:"median"`
	P90         float64          `json:"p90"`
	P99         float64          `json:"p99"`
}

PageRankDistributionResult holds the full distribution response.

type PageRankEntry

// PageRankEntry holds a URL and its PageRank score. It is the element type of
// SessionStats.TopPageRank.
type PageRankEntry struct {
	URL      string  `json:"url"`
	PageRank float64 `json:"pagerank"`
}

PageRankEntry holds a URL and its PageRank score.

type PageRankTopPage

// PageRankTopPage holds a single page entry for the top PageRank list.
// It is the element type of PageRankTopResult.Pages. Every field shares its
// name and type with a field of PageRow.
type PageRankTopPage struct {
	URL              string  `json:"url"`
	PageRank         float64 `json:"pagerank"`
	// Page metadata returned alongside the score.
	Depth            uint16  `json:"depth"`
	InternalLinksOut uint32  `json:"internal_links_out"`
	ExternalLinksOut uint32  `json:"external_links_out"`
	WordCount        uint32  `json:"word_count"`
	StatusCode       uint16  `json:"status_code"`
	Title            string  `json:"title"`
}

PageRankTopPage holds a single page entry for the top PageRank list.

type PageRankTopResult

// PageRankTopResult holds the paginated top PageRank pages response returned
// by Store.PageRankTop.
type PageRankTopResult struct {
	// Pages is the requested page of results.
	Pages []PageRankTopPage `json:"pages"`
	// NOTE(review): Total is presumably the number of matching pages before
	// limit/offset are applied — confirm.
	Total uint64            `json:"total"`
}

PageRankTopResult holds the paginated top PageRank pages response.

type PageRankTreemapEntry

// PageRankTreemapEntry holds aggregated PageRank data for a URL directory.
// Store.PageRankTreemap returns a slice of these, aggregated by URL directory
// prefix.
type PageRankTreemapEntry struct {
	// Path is the URL directory the entry aggregates.
	Path      string  `json:"path"`
	// PageCount is the number of pages aggregated into this entry.
	PageCount uint64  `json:"page_count"`
	// NOTE(review): presumably the sum, mean and maximum PageRank of those
	// pages — confirm against the query.
	TotalPR   float64 `json:"total_pr"`
	AvgPR     float64 `json:"avg_pr"`
	MaxPR     float64 `json:"max_pr"`
}

PageRankTreemapEntry holds aggregated PageRank data for a URL directory.

type PageResourceCheck

// PageResourceCheck represents a single page resource check result.
// Rows are written with Store.InsertPageResourceChecks and read back with
// Store.GetPageResourceChecks.
type PageResourceCheck struct {
	CrawlSessionID string    `json:"crawl_session_id"`
	// URL is presumably the resource's URL rather than the referencing page's
	// (compare PageResourceRef.ResourceURL) — TODO confirm.
	URL            string    `json:"url"`
	ResourceType   string    `json:"resource_type"`
	IsInternal     bool      `json:"is_internal"`
	StatusCode     uint16    `json:"status_code"`
	Error          string    `json:"error"`
	ContentType    string    `json:"content_type"`
	RedirectURL    string    `json:"redirect_url"`
	ResponseTimeMs uint32    `json:"response_time_ms"`
	CheckedAt      time.Time `json:"checked_at"`
	// PageCount is the page_count that Store.GetPageResourceChecks derives from
	// the resource refs. It is omitted from JSON when zero.
	PageCount      uint64    `json:"page_count,omitempty"`
}

PageResourceCheck represents a single page resource check result.

type PageResourceRef

// PageResourceRef links a page to a resource it uses.
// Rows are written with Store.InsertPageResourceRefs.
type PageResourceRef struct {
	CrawlSessionID string `json:"crawl_session_id"`
	// PageURL is the page; ResourceURL is the resource it uses.
	PageURL        string `json:"page_url"`
	ResourceURL    string `json:"resource_url"`
	ResourceType   string `json:"resource_type"`
	IsInternal     bool   `json:"is_internal"`
}

PageResourceRef links a page to a resource it uses.

type PageRow

// PageRow represents a crawled page for storage. It is the row type accepted
// by Store.InsertPages and returned by Store.GetPage and Store.ListPages.
// The struct carries no json tags.
type PageRow struct {
	CrawlSessionID   string
	URL              string
	FinalURL         string
	// StatusCode 0 marks a fetch error (see Store.FailedURLs and
	// Store.DeleteFailedPages).
	StatusCode       uint16
	ContentType      string
	Title            string
	TitleLength      uint16
	Canonical        string
	CanonicalIsSelf  bool
	IsIndexable      bool
	IndexReason      string // why not indexable
	MetaRobots       string
	MetaDescription  string
	MetaDescLength   uint16
	MetaKeywords     string
	// Heading texts, one slice per heading level.
	H1               []string
	H2               []string
	H3               []string
	H4               []string
	H5               []string
	H6               []string
	WordCount        uint32
	InternalLinksOut uint32
	ExternalLinksOut uint32
	ImagesCount      uint16
	ImagesNoAlt      uint16
	Hreflang         []HreflangRow
	Lang             string
	OGTitle          string
	OGDescription    string
	OGImage          string
	SchemaTypes      []string
	Headers          map[string]string
	RedirectChain    []RedirectHopRow
	BodySize         uint64
	FetchDurationMs  uint64
	ContentEncoding  string
	XRobotsTag       string
	Error            string
	Depth            uint16
	FoundOn          string
	// PageRank corresponds to the pagerank Float64 column (default 0).
	// NOTE(review): presumably filled in by Store.ComputePageRank — confirm.
	PageRank         float64
	// ContentHash corresponds to the content_hash UInt64 column (default 0).
	// NOTE(review): presumably the 64-bit SimHash compared by
	// Store.NearDuplicates — confirm.
	ContentHash      uint64
	// BodyHTML is the stored body_html. Store.GetPage excludes it.
	BodyHTML         string
	// BodyTruncated corresponds to the body_truncated Bool column (default false).
	BodyTruncated    bool
	CrawledAt        time.Time

	// JS Rendering
	JSRendered         bool
	JSRenderDurationMs uint64
	JSRenderError      string

	// Rendered data
	RenderedTitle           string
	RenderedMetaDescription string
	RenderedH1              []string
	RenderedWordCount       uint32
	RenderedLinksCount      uint32
	RenderedImagesCount     uint16
	RenderedCanonical       string
	RenderedMetaRobots      string
	RenderedSchemaTypes     []string
	RenderedBodyHTML        string

	// Diff flags (static vs rendered)
	JSChangedTitle       bool
	JSChangedDescription bool
	JSChangedH1          bool
	JSChangedCanonical   bool
	JSChangedContent     bool  // word count changed >20%
	JSAddedLinks         int32 // delta links
	JSAddedImages        int32 // delta images
	JSAddedSchema        bool  // new schema types appeared
}

PageRow represents a crawled page for storage.

type PageWithAuthority

// PageWithAuthority combines a crawled page with its Majestic authority data.
// Store.PagesWithAuthority produces these by joining crawled pages with
// provider top_pages.
type PageWithAuthority struct {
	// Crawl-side fields.
	URL          string  `json:"url"`
	Title        string  `json:"title"`
	PageRank     float64 `json:"pagerank"`
	WordCount    uint32  `json:"word_count"`
	StatusCode   uint16  `json:"status_code"`
	Depth        uint16  `json:"depth"`
	// Authority fields are pointers, so a nil value encodes as JSON null.
	// NOTE(review): presumably nil when the page has no matching provider
	// row — confirm.
	TrustFlow    *uint8  `json:"trust_flow"`
	CitationFlow *uint8  `json:"citation_flow"`
	ExtBackLinks *int64  `json:"ext_backlinks"`
	RefDomains   *int64  `json:"ref_domains"`
}

PageWithAuthority combines a crawled page with its Majestic authority data.

type ParsedFilter

// ParsedFilter pairs a FilterDef with a value. Slices of ParsedFilter are the
// filters argument of paginated Store queries such as ListPages,
// InternalLinksPaginated and ExternalLinksPaginated.
type ParsedFilter struct {
	// Def is the filter definition (FilterDef is declared elsewhere in the
	// package).
	Def   FilterDef
	// Value is presumably the value to match for Def — TODO confirm against
	// the filter parser.
	Value string
}

type ProviderAPICallRow

// ProviderAPICallRow is the row type written by Store.InsertProviderAPICalls
// and returned by Store.ProviderAPICalls. Unlike the other Provider*Row
// types it carries its own ProjectID; InsertProviderAPICalls takes no
// separate projectID argument.
type ProviderAPICallRow struct {
	ProjectID    string    `json:"project_id"`
	Provider     string    `json:"provider"`
	Endpoint     string    `json:"endpoint"`
	Method       string    `json:"method"`
	StatusCode   uint16    `json:"status_code"`
	DurationMs   uint32    `json:"duration_ms"`
	RowsReturned uint32    `json:"rows_returned"`
	ResponseBody string    `json:"response_body"`
	Error        string    `json:"error"`
	CalledAt     time.Time `json:"called_at"`
}

type ProviderBacklinkRow

// ProviderBacklinkRow is the row type written by Store.InsertProviderBacklinks
// and returned by Store.ProviderBacklinks. The project is passed to those
// methods separately; the row has no project field.
type ProviderBacklinkRow struct {
	Provider       string    `json:"provider"`
	Domain         string    `json:"domain"`
	SourceURL      string    `json:"source_url"`
	TargetURL      string    `json:"target_url"`
	AnchorText     string    `json:"anchor_text"`
	SourceDomain   string    `json:"source_domain"`
	LinkType       string    `json:"link_type"`
	// TrustFlow and CitationFlow are float64 here, whereas ProviderDataRow
	// and ProviderTopPageRow use uint8 for the same names.
	TrustFlow      float64   `json:"trust_flow"`
	CitationFlow   float64   `json:"citation_flow"`
	SourceTTFTopic string    `json:"source_ttf_topic"`
	Nofollow       bool      `json:"nofollow"`
	FirstSeen      time.Time `json:"first_seen"`
	LastSeen       time.Time `json:"last_seen"`
	FetchedAt      time.Time `json:"fetched_at"`
}

type ProviderDataRow added in v0.6.0

// ProviderDataRow is the row type written by Store.InsertProviderData and
// returned by Store.ProviderData. The project is passed to those methods
// separately; the row has no project field.
type ProviderDataRow struct {
	Provider     string             `json:"provider"`
	// DataType matches the dataType argument of Store.ProviderData and
	// Store.ProviderDataAge.
	DataType     string             `json:"data_type"`
	Domain       string             `json:"domain"`
	ItemURL      string             `json:"item_url"`
	TrustFlow    uint8              `json:"trust_flow"`
	CitationFlow uint8              `json:"citation_flow"`
	DomainRank   float64            `json:"domain_rank"`
	ExtBacklinks int64              `json:"ext_backlinks"`
	RefDomains   int64              `json:"ref_domains"`
	// StrData and NumData are string-keyed maps of string and numeric values.
	// NOTE(review): presumably provider-specific extras without a dedicated
	// field — confirm.
	StrData      map[string]string  `json:"str_data"`
	NumData      map[string]float64 `json:"num_data"`
	FetchedAt    time.Time          `json:"fetched_at"`
}

type ProviderDomainMetricsRow

// ProviderDomainMetricsRow is the row type written by
// Store.InsertProviderDomainMetrics; Store.ProviderDomainMetrics returns a
// single row for a project and provider. The project is passed to those
// methods separately; the row has no project field.
type ProviderDomainMetricsRow struct {
	Provider        string    `json:"provider"`
	Domain          string    `json:"domain"`
	BacklinksTotal  int64     `json:"backlinks_total"`
	RefDomainsTotal int64     `json:"refdomains_total"`
	DomainRank      float64   `json:"domain_rank"`
	OrganicKeywords int64     `json:"organic_keywords"`
	OrganicTraffic  int64     `json:"organic_traffic"`
	OrganicCost     float64   `json:"organic_cost"`
	FetchedAt       time.Time `json:"fetched_at"`
}

type ProviderRankingRow

// ProviderRankingRow is the row type written by Store.InsertProviderRankings
// and returned by Store.ProviderRankings. The project is passed to those
// methods separately; the row has no project field.
type ProviderRankingRow struct {
	Provider     string    `json:"provider"`
	Domain       string    `json:"domain"`
	Keyword      string    `json:"keyword"`
	URL          string    `json:"url"`
	SearchBase   string    `json:"search_base"`
	Position     uint16    `json:"position"`
	SearchVolume int64     `json:"search_volume"`
	CPC          float64   `json:"cpc"`
	Traffic      float64   `json:"traffic"`
	TrafficPct   float64   `json:"traffic_pct"`
	FetchedAt    time.Time `json:"fetched_at"`
}

type ProviderRefDomainRow

// ProviderRefDomainRow is the row type written by
// Store.InsertProviderRefDomains and returned by Store.ProviderRefDomains.
// The project is passed to those methods separately; the row has no project
// field.
type ProviderRefDomainRow struct {
	Provider      string    `json:"provider"`
	Domain        string    `json:"domain"`
	RefDomain     string    `json:"ref_domain"`
	BacklinkCount int64     `json:"backlink_count"`
	DomainRank    float64   `json:"domain_rank"`
	FirstSeen     time.Time `json:"first_seen"`
	LastSeen      time.Time `json:"last_seen"`
	FetchedAt     time.Time `json:"fetched_at"`
}

type ProviderTopPageRow

// ProviderTopPageRow is the row type written by Store.InsertProviderTopPages
// and returned by Store.ProviderTopPages. The project is passed to those
// methods separately; the row has no project field.
type ProviderTopPageRow struct {
	Provider         string      `json:"provider"`
	Domain           string      `json:"domain"`
	URL              string      `json:"url"`
	Title            string      `json:"title"`
	// The four authority metrics below reappear, as pointers, on
	// PageWithAuthority.
	TrustFlow        uint8       `json:"trust_flow"`
	CitationFlow     uint8       `json:"citation_flow"`
	ExtBackLinks     int64       `json:"ext_backlinks"`
	RefDomains       int64       `json:"ref_domains"`
	// TopicalTF is declared elsewhere in the package.
	TopicalTrustFlow []TopicalTF `json:"topical_trust_flow"`
	Language         string      `json:"language"`
	FetchedAt        time.Time   `json:"fetched_at"`
}

type ProviderVisibilityRow

// ProviderVisibilityRow is the row type written by
// Store.InsertProviderVisibility and returned by
// Store.ProviderVisibilityHistory. The project is passed to those methods
// separately; the row has no project field.
type ProviderVisibilityRow struct {
	Provider      string    `json:"provider"`
	Domain        string    `json:"domain"`
	SearchBase    string    `json:"search_base"`
	// Date is separate from FetchedAt.
	// NOTE(review): presumably the day the visibility value applies to —
	// confirm.
	Date          time.Time `json:"date"`
	Visibility    float64   `json:"visibility"`
	KeywordsCount int64     `json:"keywords_count"`
	FetchedAt     time.Time `json:"fetched_at"`
}

type RedirectHopRow

// RedirectHopRow represents a redirect hop for storage. It is the element
// type of PageRow.RedirectChain.
type RedirectHopRow struct {
	// URL and StatusCode describe one hop of the chain.
	URL        string
	StatusCode uint16
}

RedirectHopRow represents a redirect hop for storage.

type RedirectPageRow added in v0.2.1

// RedirectPageRow represents a redirect page with inbound internal link count.
// Store.ListRedirectPages returns these for pages with 3xx status codes.
type RedirectPageRow struct {
	URL                  string `json:"url"`
	StatusCode           uint16 `json:"status_code"`
	FinalURL             string `json:"final_url"`
	// InboundInternalLinks is the page's inbound internal link count.
	InboundInternalLinks uint64 `json:"inbound_internal_links"`
}

RedirectPageRow represents a redirect page with inbound internal link count.

type ResourceTypeSummary

// ResourceTypeSummary holds aggregated stats for one resource type.
// Store.GetPageResourceTypeSummary returns one entry per resource type.
type ResourceTypeSummary struct {
	ResourceType string `json:"resource_type"`
	Total        uint64 `json:"total"`
	// NOTE(review): Internal/External and OK/Errors presumably each partition
	// Total; the exact OK and error criteria are not visible here — confirm.
	Internal     uint64 `json:"internal"`
	External     uint64 `json:"external"`
	OK           uint64 `json:"ok"`
	Errors       uint64 `json:"errors"`
}

ResourceTypeSummary holds aggregated stats for one resource type.

type RobotsRow

// RobotsRow represents a robots.txt entry for storage. Rows are written with
// Store.InsertRobotsData and read with Store.GetRobotsHosts and
// Store.GetRobotsContent.
type RobotsRow struct {
	CrawlSessionID string
	Host           string
	StatusCode     uint16
	// Content is the robots.txt body. Store.GetRobotsHosts returns rows
	// without content; Store.GetRobotsContent returns the full content for
	// one host.
	Content        string
	FetchedAt      time.Time
}

RobotsRow represents a robots.txt entry for storage.

type SchemaCount

// SchemaCount is a schema type + count.
type SchemaCount struct {
	SchemaType string `json:"schema_type"`
	// Count is the count recorded for SchemaType.
	// NOTE(review): what is counted (presumably pages) is not visible here —
	// confirm.
	Count      uint64 `json:"count"`
}

SchemaCount is a schema type + count.

type SessionStats

// SessionStats holds aggregate stats for a crawl session. It is the result
// type of the Store.SessionStats method.
type SessionStats struct {
	TotalPages            uint64            `json:"total_pages"`
	TotalLinks            uint64            `json:"total_links"`
	InternalLinks         uint64            `json:"internal_links"`
	ExternalLinks         uint64            `json:"external_links"`
	AvgFetchMs            float64           `json:"avg_fetch_ms"`
	ErrorCount            uint64            `json:"error_count"`
	// Both maps are keyed by uint16.
	// NOTE(review): presumably status code → count and depth → count —
	// confirm.
	StatusCodes           map[uint16]uint64 `json:"status_codes"`
	DepthDistribution     map[uint16]uint64 `json:"depth_distribution"`
	PagesPerSecond        float64           `json:"pages_per_second"`
	CrawlDurationSec      float64           `json:"crawl_duration_sec"`
	TopPageRank           []PageRankEntry   `json:"top_pagerank"`
	// JS rendering aggregates; the names mirror the JSRendered/JSChanged*
	// fields of PageRow.
	JSRenderedPages       uint64            `json:"js_rendered_pages"`
	JSChangedTitleCount   uint64            `json:"js_changed_title_count"`
	JSChangedH1Count      uint64            `json:"js_changed_h1_count"`
	JSChangedContentCount uint64            `json:"js_changed_content_count"`
	AvgJSRenderMs         float64           `json:"avg_js_render_ms"`
}

SessionStats holds aggregate stats for a crawl session.

type SitemapRow

// SitemapRow represents a discovered sitemap for storage. Rows are written
// with Store.InsertSitemaps and read with Store.GetSitemaps.
type SitemapRow struct {
	CrawlSessionID string
	URL            string
	Type           string // "index" | "urlset"
	URLCount       uint32
	ParentURL      string // empty if top-level
	StatusCode     uint16
	FetchedAt      time.Time
}

SitemapRow represents a discovered sitemap for storage.

type SitemapURLRow

// SitemapURLRow represents a URL entry within a sitemap. Rows are written
// with Store.InsertSitemapURLs and read with Store.GetSitemapURLs and
// Store.GetSitemapCoverageURLs.
type SitemapURLRow struct {
	CrawlSessionID string
	// SitemapURL identifies the containing sitemap; Store.GetSitemapURLs
	// selects rows by it.
	SitemapURL     string
	Loc            string
	// LastMod, ChangeFreq and Priority are kept as strings.
	LastMod        string
	ChangeFreq     string
	Priority       string
}

SitemapURLRow represents a URL entry within a sitemap.

type SortParam added in v0.2.0

// SortParam holds a validated sort column and direction. ParseSort builds
// one from request parameters and a whitelist; paginated Store queries take
// it as their sort argument.
type SortParam struct {
	Column string // DB column name (from whitelist)
	Order  string // "ASC" or "DESC"
}

SortParam holds a validated sort column and direction.

func ParseSort added in v0.2.0

func ParseSort(sortKey, orderStr string, whitelist map[string]string) *SortParam

ParseSort validates sort/order params against a whitelist and returns a SortParam or nil.

type StatusTimelineBucket added in v0.9.0

// StatusTimelineBucket holds counts per status code category for a time
// interval. Store.StatusTimeline and Store.StatusTimelineRecent return slices
// of these.
type StatusTimelineBucket struct {
	// Timestamp identifies the bucket's interval.
	// NOTE(review): presumably the interval start — confirm.
	Timestamp  time.Time `json:"ts"`
	OK         uint64    `json:"ok"`
	Redirect   uint64    `json:"redirect"`
	Status403  uint64    `json:"s403"`
	Status429  uint64    `json:"s429"`
	ClientErr  uint64    `json:"client_err"` // 4xx excluding 403/429
	ServerErr  uint64    `json:"server_err"` // 5xx
	FetchErr   uint64    `json:"fetch_err"`  // status_code = 0
	Total      uint64    `json:"total"`
	// NOTE(review): the Retried* counters presumably count the retry attempts
	// recorded via Store.InsertRetryAttempt, split by status — confirm.
	Retried403 uint64    `json:"retried_403"`
	Retried429 uint64    `json:"retried_429"`
	Retried5xx uint64    `json:"retried_5xx"`
}

StatusTimelineBucket holds counts per status code category for a time interval.

type StorageStatsResult

// StorageStatsResult holds storage stats for all tables. It is returned by
// Store.StorageStats and, alongside the per-session stats, by
// Store.GlobalStats.
type StorageStatsResult struct {
	// Tables holds one TableStorageStats per table (the type is declared
	// elsewhere in the package).
	Tables []TableStorageStats `json:"tables"`
}

StorageStatsResult holds storage stats for all tables.

type Store

// Store manages ClickHouse connections and operations. Create one with
// NewStore and release it with Close.
type Store struct {
	// contains filtered or unexported fields
}

Store manages ClickHouse connections and operations.

func NewStore

func NewStore(host string, port int, database, username, password string) (*Store, error)

NewStore creates a new ClickHouse store.

func (*Store) Close

func (s *Store) Close() error

Close closes the ClickHouse connection.

func (*Store) CompareLinks

func (s *Store) CompareLinks(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*LinkDiffResult, error)

CompareLinks returns paginated link diffs between two sessions.

func (*Store) ComparePages

func (s *Store) ComparePages(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*PageDiffResult, error)

ComparePages returns paginated page diffs between two sessions.

func (*Store) CompareStats

func (s *Store) CompareStats(ctx context.Context, sessionA, sessionB string) (*CompareStatsResult, error)

CompareStats retrieves side-by-side stats for two sessions.

func (*Store) ComputePageRank

func (s *Store) ComputePageRank(ctx context.Context, sessionID string) error

ComputePageRank computes internal PageRank for all pages in a session. Uses uint32 IDs for memory efficiency and iterative power method. URL→ID mapping is done in ClickHouse via a Join-engine temp table, so only uint32 pairs are transferred for the link graph.

func (*Store) CountPages

func (s *Store) CountPages(ctx context.Context, sessionID string) (uint64, error)

CountPages returns the total number of pages for a session.

func (*Store) DeleteExtractions added in v0.3.0

func (s *Store) DeleteExtractions(ctx context.Context, sessionID string) error

DeleteExtractions removes all extraction data for a session.

func (*Store) DeleteFailedPages

func (s *Store) DeleteFailedPages(ctx context.Context, sessionID string) (int, error)

DeleteFailedPages removes pages with status_code = 0 for a session so they can be re-crawled.

func (*Store) DeleteGSCData

func (s *Store) DeleteGSCData(ctx context.Context, projectID string) error

func (*Store) DeletePagesByStatus

func (s *Store) DeletePagesByStatus(ctx context.Context, sessionID string, statusCode int) (int, error)

DeletePagesByStatus deletes pages with a specific status code and returns the count deleted.

func (*Store) DeleteProviderData

func (s *Store) DeleteProviderData(ctx context.Context, projectID, provider string) error

func (*Store) DeleteSession

func (s *Store) DeleteSession(ctx context.Context, sessionID string) error

DeleteSession deletes a crawl session and all its associated data. Uses DROP PARTITION for instant deletion on partitioned tables.

func (*Store) ExportLogs

func (s *Store) ExportLogs(ctx context.Context) ([]applog.LogRow, error)

ExportLogs returns all logs (up to 7 days per TTL) for JSONL export.

func (*Store) ExportSession

func (s *Store) ExportSession(ctx context.Context, sessionID string, w io.Writer, includeHTML bool) error

ExportSession streams a session's data as gzipped JSONL to w.

func (*Store) ExternalLinks

func (s *Store) ExternalLinks(ctx context.Context, sessionID string) ([]LinkRow, error)

ExternalLinks retrieves external links for a given session (or all sessions).

func (*Store) ExternalLinksPaginated

func (s *Store) ExternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]LinkRow, error)

ExternalLinksPaginated retrieves external links with pagination and optional filters.

func (*Store) FailedURLs

func (s *Store) FailedURLs(ctx context.Context, sessionID string) ([]string, error)

FailedURLs returns URLs with status_code = 0 (fetch errors) for a session.

func (*Store) GSCByCountry

func (s *Store) GSCByCountry(ctx context.Context, projectID string) ([]GSCCountryRow, error)

func (*Store) GSCByDevice

func (s *Store) GSCByDevice(ctx context.Context, projectID string) ([]GSCDeviceRow, error)

func (*Store) GSCInspectionResults

func (s *Store) GSCInspectionResults(ctx context.Context, projectID string, limit, offset int) ([]GSCInspectionRow, int, error)

func (*Store) GSCOverview

func (s *Store) GSCOverview(ctx context.Context, projectID string) (*GSCOverviewStats, error)

func (*Store) GSCTimeline

func (s *Store) GSCTimeline(ctx context.Context, projectID string) ([]GSCTimelineRow, error)

func (*Store) GSCTopPages

func (s *Store) GSCTopPages(ctx context.Context, projectID string, limit, offset int) ([]GSCPageRow, int, error)

func (*Store) GSCTopQueries

func (s *Store) GSCTopQueries(ctx context.Context, projectID string, limit, offset int) ([]GSCQueryRow, int, error)

func (*Store) GetExpiredDomains

func (s *Store) GetExpiredDomains(ctx context.Context, sessionID string, limit, offset int) (*ExpiredDomainsResult, error)

GetExpiredDomains returns registrable domains where all external checks failed with DNS errors.

func (*Store) GetExternalLinkCheckDomains

func (s *Store) GetExternalLinkCheckDomains(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]ExternalDomainCheck, error)

GetExternalLinkCheckDomains returns aggregated external check stats per domain.

func (*Store) GetExternalLinkChecks

func (s *Store) GetExternalLinkChecks(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]ExternalLinkCheck, error)

GetExternalLinkChecks returns paginated external link check results for a session.

func (*Store) GetExtractions added in v0.3.0

func (s *Store) GetExtractions(ctx context.Context, sessionID string, limit, offset int) (*extraction.ExtractionResult, error)

GetExtractions retrieves extraction results for a session, pivoted by extractor name.

func (*Store) GetPage

func (s *Store) GetPage(ctx context.Context, sessionID, url string) (*PageRow, error)

GetPage retrieves all fields for a single page (excluding body_html).

func (*Store) GetPageBodies

func (s *Store) GetPageBodies(ctx context.Context, sessionID string, limit, offset int) ([]PageBody, error)

GetPageBodies reads URL + body_html for a session in batches.

func (*Store) GetPageHTML

func (s *Store) GetPageHTML(ctx context.Context, sessionID, url string) (string, error)

GetPageHTML retrieves the raw HTML for a specific page.

func (*Store) GetPageLinks

func (s *Store) GetPageLinks(ctx context.Context, sessionID, url string, outLimit, outOffset, inLimit, inOffset int) (*PageLinksResult, error)

GetPageLinks retrieves outbound and inbound links for a URL with pagination.

func (*Store) GetPageResourceChecks

func (s *Store) GetPageResourceChecks(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]PageResourceCheck, error)

GetPageResourceChecks returns paginated resource checks with page_count from refs.

func (*Store) GetPageResourceTypeSummary

func (s *Store) GetPageResourceTypeSummary(ctx context.Context, sessionID string) ([]ResourceTypeSummary, error)

GetPageResourceTypeSummary returns aggregated stats per resource type.

func (*Store) GetRobotsContent

func (s *Store) GetRobotsContent(ctx context.Context, sessionID, host string) (*RobotsRow, error)

GetRobotsContent returns the full robots.txt content for a specific host in a session.

func (*Store) GetRobotsHosts

func (s *Store) GetRobotsHosts(ctx context.Context, sessionID string) ([]RobotsRow, error)

GetRobotsHosts returns all hosts with robots.txt data for a session (without content).

func (*Store) GetSession

func (s *Store) GetSession(ctx context.Context, sessionID string) (*CrawlSession, error)

GetSession retrieves a single crawl session by ID.

func (*Store) GetSitemapCoverageURLs added in v0.4.0

func (s *Store) GetSitemapCoverageURLs(ctx context.Context, sessionID, filter string, limit, offset int) ([]SitemapURLRow, error)

GetSitemapCoverageURLs returns paginated sitemap URLs filtered by coverage type. filter must be "sitemap_only" (in sitemap but not crawled) or "in_both" (in sitemap and crawled).

func (*Store) GetSitemapURLs

func (s *Store) GetSitemapURLs(ctx context.Context, sessionID, sitemapURL string, limit, offset int) ([]SitemapURLRow, error)

GetSitemapURLs returns paginated URLs from a specific sitemap.

func (*Store) GetSitemaps

func (s *Store) GetSitemaps(ctx context.Context, sessionID string) ([]SitemapRow, error)

GetSitemaps returns all sitemaps for a session.

func (*Store) GetURLsByHost

func (s *Store) GetURLsByHost(ctx context.Context, sessionID, host string) ([]string, error)

GetURLsByHost returns all distinct URLs for a given host in a session.

func (*Store) GlobalStats

func (s *Store) GlobalStats(ctx context.Context) ([]GlobalSessionStats, *StorageStatsResult, error)

GlobalStats retrieves aggregated stats per session across all data.

func (*Store) HasStoredHTML added in v0.3.0

func (s *Store) HasStoredHTML(ctx context.Context, sessionID string) (bool, error)

HasStoredHTML is a helper to check if stored HTML exists for a session.

func (*Store) ImportCSVSession added in v0.9.0

func (s *Store) ImportCSVSession(ctx context.Context, r io.Reader, projectID string) (*CSVImportResult, error)

ImportCSVSession reads a CSV file, auto-detects the source, maps columns to PageRow, creates a session, and batch-inserts pages.

func (*Store) ImportSession

func (s *Store) ImportSession(ctx context.Context, r io.Reader) (*CrawlSession, error)

ImportSession reads a gzipped JSONL stream and inserts the session with a new UUID.

func (*Store) InsertExternalLinkChecks

func (s *Store) InsertExternalLinkChecks(ctx context.Context, checks []ExternalLinkCheck) error

InsertExternalLinkChecks batch-inserts external link check results.

func (*Store) InsertExtractions added in v0.3.0

func (s *Store) InsertExtractions(ctx context.Context, rows []extraction.ExtractionRow) error

InsertExtractions batch inserts extraction rows.

func (*Store) InsertGSCAnalytics

func (s *Store) InsertGSCAnalytics(ctx context.Context, projectID string, rows []GSCAnalyticsInsertRow) error

func (*Store) InsertGSCInspection

func (s *Store) InsertGSCInspection(ctx context.Context, projectID string, rows []GSCInspectionInsertRow) error

func (*Store) InsertLinks

func (s *Store) InsertLinks(ctx context.Context, links []LinkRow) error

InsertLinks batch inserts link rows.

func (*Store) InsertLogs

func (s *Store) InsertLogs(ctx context.Context, logs []applog.LogRow) error

InsertLogs batch inserts application log rows.

func (*Store) InsertPageResourceChecks

func (s *Store) InsertPageResourceChecks(ctx context.Context, checks []PageResourceCheck) error

InsertPageResourceChecks batch inserts page resource check results.

func (*Store) InsertPageResourceRefs

func (s *Store) InsertPageResourceRefs(ctx context.Context, refs []PageResourceRef) error

InsertPageResourceRefs batch inserts page-to-resource references.

func (*Store) InsertPages

func (s *Store) InsertPages(ctx context.Context, pages []PageRow) error

InsertPages batch inserts page rows.

func (*Store) InsertProviderAPICalls

func (s *Store) InsertProviderAPICalls(ctx context.Context, rows []ProviderAPICallRow) error

func (*Store) InsertProviderBacklinks

func (s *Store) InsertProviderBacklinks(ctx context.Context, projectID string, rows []ProviderBacklinkRow) error

func (*Store) InsertProviderData added in v0.6.0

func (s *Store) InsertProviderData(ctx context.Context, projectID string, rows []ProviderDataRow) error

func (*Store) InsertProviderDomainMetrics

func (s *Store) InsertProviderDomainMetrics(ctx context.Context, projectID string, rows []ProviderDomainMetricsRow) error

func (*Store) InsertProviderRankings

func (s *Store) InsertProviderRankings(ctx context.Context, projectID string, rows []ProviderRankingRow) error

func (*Store) InsertProviderRefDomains

func (s *Store) InsertProviderRefDomains(ctx context.Context, projectID string, rows []ProviderRefDomainRow) error

func (*Store) InsertProviderTopPages

func (s *Store) InsertProviderTopPages(ctx context.Context, projectID string, rows []ProviderTopPageRow) error

func (*Store) InsertProviderVisibility

func (s *Store) InsertProviderVisibility(ctx context.Context, projectID string, rows []ProviderVisibilityRow) error

func (*Store) InsertRetryAttempt added in v0.9.0

func (s *Store) InsertRetryAttempt(ctx context.Context, sessionID string, attemptedAt time.Time, statusCode int, url string) error

InsertRetryAttempt records a single retry attempt.

func (*Store) InsertRobotsData

func (s *Store) InsertRobotsData(ctx context.Context, rows []RobotsRow) error

InsertRobotsData batch inserts robots.txt rows.

func (*Store) InsertSession

func (s *Store) InsertSession(ctx context.Context, session *CrawlSession) error

InsertSession inserts or updates a crawl session.

func (*Store) InsertSitemapURLs

func (s *Store) InsertSitemapURLs(ctx context.Context, rows []SitemapURLRow) error

InsertSitemapURLs inserts sitemap URL rows.

func (*Store) InsertSitemaps

func (s *Store) InsertSitemaps(ctx context.Context, rows []SitemapRow) error

InsertSitemaps inserts sitemap rows.

func (*Store) InternalLinksPaginated

func (s *Store) InternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]LinkRow, error)

InternalLinksPaginated retrieves internal links with pagination and optional filters.

func (*Store) ListLogs

func (s *Store) ListLogs(ctx context.Context, limit, offset int, level, component, search string) ([]applog.LogRow, int, error)

ListLogs returns paginated application logs with optional filters.

func (*Store) ListPages

func (s *Store) ListPages(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]PageRow, error)

ListPages retrieves pages for a session with pagination and optional filters.

func (*Store) ListRedirectPages added in v0.2.1

func (s *Store) ListRedirectPages(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]RedirectPageRow, error)

ListRedirectPages retrieves pages with 3xx status codes and their inbound internal link count.

func (*Store) ListSessions

func (s *Store) ListSessions(ctx context.Context, projectID ...string) ([]CrawlSession, error)

ListSessions retrieves crawl sessions, optionally filtered by project ID.

func (*Store) ListSessionsPaginated

func (s *Store) ListSessionsPaginated(ctx context.Context, limit, offset int, projectID, search string) ([]CrawlSession, int, error)

ListSessionsPaginated retrieves crawl sessions with pagination, optional project and search filters.

func (*Store) Migrate

func (s *Store) Migrate(ctx context.Context) error

Migrate runs all DDL migrations.

func (*Store) NearDuplicates

func (s *Store) NearDuplicates(ctx context.Context, sessionID string, threshold int, limit, offset int) (*NearDuplicatesResult, error)

NearDuplicates finds pages with similar content using SimHash Hamming distance. threshold is the max Hamming distance (e.g. 3 = ≤3 bits differ out of 64).

func (*Store) PageRankDistribution

func (s *Store) PageRankDistribution(ctx context.Context, sessionID string, buckets int) (*PageRankDistributionResult, error)

PageRankDistribution returns a histogram of PageRank values for a session.

func (*Store) PageRankTop

func (s *Store) PageRankTop(ctx context.Context, sessionID string, limit, offset int, directory string) (*PageRankTopResult, error)

PageRankTop returns the top pages by PageRank with metadata, paginated.

func (*Store) PageRankTreemap

func (s *Store) PageRankTreemap(ctx context.Context, sessionID string, depth, minPages int) ([]PageRankTreemapEntry, error)

PageRankTreemap returns PageRank aggregated by URL directory prefix.

func (*Store) PagesWithAuthority

func (s *Store) PagesWithAuthority(ctx context.Context, sessionID, projectID string, limit, offset int) ([]PageWithAuthority, int, error)

PagesWithAuthority joins crawled pages with provider top_pages (Majestic authority data).

func (*Store) ProviderAPICalls

func (s *Store) ProviderAPICalls(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderAPICallRow, int, error)

func (*Store) ProviderBacklinks

func (s *Store) ProviderBacklinks(ctx context.Context, projectID, provider string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]ProviderBacklinkRow, int, error)

func (*Store) ProviderData added in v0.6.0

func (s *Store) ProviderData(ctx context.Context, projectID, provider, dataType string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]ProviderDataRow, int, error)

func (*Store) ProviderDataAge added in v0.6.0

func (s *Store) ProviderDataAge(ctx context.Context, projectID, provider, dataType string) (time.Time, error)

func (*Store) ProviderDomainMetrics

func (s *Store) ProviderDomainMetrics(ctx context.Context, projectID, provider string) (*ProviderDomainMetricsRow, error)

func (*Store) ProviderRankings

func (s *Store) ProviderRankings(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderRankingRow, int, error)

func (*Store) ProviderRefDomains

func (s *Store) ProviderRefDomains(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderRefDomainRow, int, error)

func (*Store) ProviderTopPages

func (s *Store) ProviderTopPages(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderTopPageRow, int, error)

func (*Store) ProviderVisibilityHistory

func (s *Store) ProviderVisibilityHistory(ctx context.Context, projectID, provider string) ([]ProviderVisibilityRow, error)

func (*Store) RecomputeDepths

func (s *Store) RecomputeDepths(ctx context.Context, sessionID string, seedURLs []string) error

func (*Store) RunCustomTestsSQL

func (s *Store) RunCustomTestsSQL(ctx context.Context, sessionID string, rules []customtests.TestRule) (map[string]map[string]string, error)

RunCustomTestsSQL runs ClickHouse-native test rules as a single query. All user values are parameterized via named placeholders.

func (*Store) RunExtractionsPostCrawl added in v0.3.0

func (s *Store) RunExtractionsPostCrawl(ctx context.Context, sessionID string, extractors []extraction.Extractor) (*extraction.ExtractionResult, error)

RunExtractionsPostCrawl runs extractors against stored HTML and inserts results.

func (*Store) SessionAudit

func (s *Store) SessionAudit(ctx context.Context, sessionID string) (*AuditResult, error)

SessionAudit computes a comprehensive SEO audit for a crawl session.

func (*Store) SessionStats

func (s *Store) SessionStats(ctx context.Context, sessionID string) (*SessionStats, error)

SessionStats retrieves aggregate statistics for a crawl session.

func (*Store) SessionStorageStats

func (s *Store) SessionStorageStats(ctx context.Context) (map[string]uint64, error)

SessionStorageStats returns bytes on disk per crawl session, computed from system.parts partitions across all data tables.

func (*Store) StatusTimeline added in v0.9.0

func (s *Store) StatusTimeline(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)

StatusTimeline returns time-bucketed status code counts for a crawl session. The interval is auto-computed to produce ~60-100 buckets.

func (*Store) StatusTimelineRecent added in v0.9.0

func (s *Store) StatusTimelineRecent(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)

StatusTimelineRecent returns the last 10 minutes of crawl activity in 10-second buckets.

func (*Store) StorageStats

func (s *Store) StorageStats(ctx context.Context) (*StorageStatsResult, error)

StorageStats retrieves disk usage and row counts for all crawlobserver tables.

func (*Store) StreamCrawledURLs added in v0.9.0

func (s *Store) StreamCrawledURLs(ctx context.Context, sessionID string, fn func(string)) (int, error)

StreamCrawledURLs streams all URLs already crawled in a session, calling fn for each URL. This avoids loading the entire URL list into memory (which can cause OOM on large sites with 1M+ pages). Returns the number of URLs streamed.

func (*Store) StreamPagesHTML

func (s *Store) StreamPagesHTML(ctx context.Context, sessionID string) (<-chan PageHTMLRow, error)

StreamPagesHTML streams url+body_html pairs for a session.

func (*Store) StreamPagesHTMLForExtraction added in v0.3.0

func (s *Store) StreamPagesHTMLForExtraction(ctx context.Context, sessionID string) (<-chan PageHTMLRow, error)

StreamPagesHTMLForExtraction is an alias for StreamPagesHTML, provided for clarity in extraction contexts.

func (*Store) URLsByStatus

func (s *Store) URLsByStatus(ctx context.Context, sessionID string, statusCode int) ([]string, error)

URLsByStatus returns URLs with a specific status code for a session.

func (*Store) UncrawledURLs

func (s *Store) UncrawledURLs(ctx context.Context, sessionID string) ([]string, error)

UncrawledURLs returns internal link targets that were discovered but not crawled in a session.

func (*Store) UpdateSessionProject

func (s *Store) UpdateSessionProject(ctx context.Context, sessionID string, projectID *string) error

UpdateSessionProject re-inserts a session with a new project_id (ReplacingMergeTree pattern).

func (*Store) WeightedPageRankTop added in v0.6.1

func (s *Store) WeightedPageRankTop(ctx context.Context, sessionID, projectID string, limit, offset int, directory, sort, order string) (*WeightedPageRankResult, error)

WeightedPageRankTop returns pages ranked by a weighted PageRank that fuses internal PR with SEObserver data.

type TableStorageStats

type TableStorageStats struct {
	// Name identifies the table these stats describe; serialized as "name".
	Name        string `json:"name"`
	// BytesOnDisk is the table's on-disk size in bytes; serialized as "bytes_on_disk".
	BytesOnDisk uint64 `json:"bytes_on_disk"`
	// Rows is the table's row count; serialized as "rows".
	Rows        uint64 `json:"rows"`
}

TableStorageStats holds storage stats for a single table.

type TopicalTF

// TopicalTF pairs a topic label with a small unsigned score.
// NOTE(review): presumably a "Topical Trust Flow" entry (topic + 0-255 score)
// from an external SEO data provider — confirm against the code that fills it.
type TopicalTF struct {
	// Topic is the topic label; serialized as "topic".
	Topic string `json:"topic"`
	// Value is the score attached to Topic; serialized as "value".
	Value uint8  `json:"value"`
}

type WeightedPageRankPage added in v0.6.1

type WeightedPageRankPage struct {
	// URL is the page URL; serialized as "url".
	URL              string  `json:"url"`
	// PageRank is the page's PageRank score; serialized as "pagerank".
	// NOTE(review): presumably the internal (crawl-graph) PageRank — confirm.
	PageRank         float64 `json:"pagerank"`
	// WeightedPR is the weighted PageRank score; serialized as "weighted_pr".
	WeightedPR       float64 `json:"weighted_pr"`
	// The four pointer metrics below are optional: a nil pointer is encoded as
	// JSON null because the tags carry no omitempty.
	// NOTE(review): presumably sourced from external provider (SEObserver)
	// data and nil when no provider data exists for the URL — confirm.
	TrustFlow        *uint8  `json:"trust_flow"`
	CitationFlow     *uint8  `json:"citation_flow"`
	ExtBackLinks     *int64  `json:"ext_backlinks"`
	RefDomains       *int64  `json:"ref_domains"`
	// Depth is the page's depth value; serialized as "depth".
	// NOTE(review): presumably click depth from the seed URLs — confirm.
	Depth            uint16  `json:"depth"`
	// InternalLinksOut is the count of internal outgoing links; serialized as
	// "internal_links_out".
	InternalLinksOut uint32  `json:"internal_links_out"`
	// StatusCode is the status code recorded for the page; serialized as
	// "status_code".
	StatusCode       uint16  `json:"status_code"`
	// Title is the page title; serialized as "title".
	Title            string  `json:"title"`
	// TTFTopic is an optional topic label, encoded as JSON null when nil.
	// NOTE(review): presumably the page's top Topical Trust Flow topic — confirm.
	TTFTopic         *string `json:"ttf_topic"`
}

WeightedPageRankPage represents a page with weighted PageRank combining internal PR and SEObserver data.

type WeightedPageRankResult added in v0.6.1

type WeightedPageRankResult struct {
	// Pages holds the returned page rows; serialized as "pages". A nil slice
	// is encoded as JSON null rather than [].
	Pages []WeightedPageRankPage `json:"pages"`
	// Total is a row count serialized as "total".
	// NOTE(review): presumably the number of matching pages across all
	// result pages, not just len(Pages) — confirm against the query.
	Total uint64                 `json:"total"`
}

WeightedPageRankResult wraps paginated weighted PageRank results.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL