Documentation
¶
Index ¶
- Constants
- Variables
- func BuildOrderByClause(sort *SortParam, defaultOrderBy string) string
- func BuildWhereClause(filters []ParsedFilter) (string, []interface{}, error)
- type AnchorCount
- type AuditContent
- type AuditInternational
- type AuditLinks
- type AuditResult
- type AuditSitemaps
- type AuditStructure
- type AuditTechnical
- type BFSResult
- type Buffer
- func (b *Buffer) AddExtractions(rows []extraction.ExtractionRow)
- func (b *Buffer) AddLinks(links []LinkRow)
- func (b *Buffer) AddPage(page PageRow)
- func (b *Buffer) Close()
- func (b *Buffer) ErrorState() BufferErrorState
- func (b *Buffer) Flush()
- func (b *Buffer) PageCount() int
- func (b *Buffer) SetOnDataLost(fn func(lostPages, lostLinks int64))
- type BufferErrorState
- type CSVImportResult
- type CSVSource
- type CompareStatsResult
- type ContentTypeCount
- type CrawlSession
- type DirectoryCount
- type ExpiredDomain
- type ExpiredDomainSource
- type ExpiredDomainsResult
- type ExternalDomain
- type ExternalDomainCheck
- type ExternalLinkCheck
- type ExtractionQueryRow
- type FilterDef
- type FilterType
- type GSCAnalyticsInsertRow
- type GSCCountryRow
- type GSCDeviceRow
- type GSCInspectionInsertRow
- type GSCInspectionRow
- type GSCOverviewStats
- type GSCPageRow
- type GSCQueryRow
- type GSCTimelineRow
- type GlobalSessionStats
- type HreflangRow
- type LangCount
- type LinkDiffResult
- type LinkDiffRow
- type LinkRow
- type Migration
- type NearDuplicatePair
- type NearDuplicatesResult
- type NoindexReason
- type PageBody
- type PageDiffResult
- type PageDiffRow
- type PageHTMLRow
- type PageLinkInserter
- type PageLinksResult
- type PageRankBucket
- type PageRankDistributionResult
- type PageRankEntry
- type PageRankTopPage
- type PageRankTopResult
- type PageRankTreemapEntry
- type PageResourceCheck
- type PageResourceRef
- type PageRow
- type PageWithAuthority
- type ParsedFilter
- type ProviderAPICallRow
- type ProviderBacklinkRow
- type ProviderDataRow
- type ProviderDomainMetricsRow
- type ProviderRankingRow
- type ProviderRefDomainRow
- type ProviderTopPageRow
- type ProviderVisibilityRow
- type RedirectHopRow
- type RedirectPageRow
- type ResourceTypeSummary
- type RobotsRow
- type SchemaCount
- type SessionStats
- type SitemapRow
- type SitemapURLRow
- type SortParam
- type StatusTimelineBucket
- type StorageStatsResult
- type Store
- func (s *Store) Close() error
- func (s *Store) CompareLinks(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*LinkDiffResult, error)
- func (s *Store) ComparePages(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*PageDiffResult, error)
- func (s *Store) CompareStats(ctx context.Context, sessionA, sessionB string) (*CompareStatsResult, error)
- func (s *Store) ComputePageRank(ctx context.Context, sessionID string) error
- func (s *Store) CountPages(ctx context.Context, sessionID string) (uint64, error)
- func (s *Store) DeleteExtractions(ctx context.Context, sessionID string) error
- func (s *Store) DeleteFailedPages(ctx context.Context, sessionID string) (int, error)
- func (s *Store) DeleteGSCData(ctx context.Context, projectID string) error
- func (s *Store) DeletePagesByStatus(ctx context.Context, sessionID string, statusCode int) (int, error)
- func (s *Store) DeleteProviderData(ctx context.Context, projectID, provider string) error
- func (s *Store) DeleteSession(ctx context.Context, sessionID string) error
- func (s *Store) ExportLogs(ctx context.Context) ([]applog.LogRow, error)
- func (s *Store) ExportSession(ctx context.Context, sessionID string, w io.Writer, includeHTML bool) error
- func (s *Store) ExternalLinks(ctx context.Context, sessionID string) ([]LinkRow, error)
- func (s *Store) ExternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, ...) ([]LinkRow, error)
- func (s *Store) FailedURLs(ctx context.Context, sessionID string) ([]string, error)
- func (s *Store) GSCByCountry(ctx context.Context, projectID string) ([]GSCCountryRow, error)
- func (s *Store) GSCByDevice(ctx context.Context, projectID string) ([]GSCDeviceRow, error)
- func (s *Store) GSCInspectionResults(ctx context.Context, projectID string, limit, offset int) ([]GSCInspectionRow, int, error)
- func (s *Store) GSCOverview(ctx context.Context, projectID string) (*GSCOverviewStats, error)
- func (s *Store) GSCTimeline(ctx context.Context, projectID string) ([]GSCTimelineRow, error)
- func (s *Store) GSCTopPages(ctx context.Context, projectID string, limit, offset int) ([]GSCPageRow, int, error)
- func (s *Store) GSCTopQueries(ctx context.Context, projectID string, limit, offset int) ([]GSCQueryRow, int, error)
- func (s *Store) GetExpiredDomains(ctx context.Context, sessionID string, limit, offset int) (*ExpiredDomainsResult, error)
- func (s *Store) GetExternalLinkCheckDomains(ctx context.Context, sessionID string, limit, offset int, ...) ([]ExternalDomainCheck, error)
- func (s *Store) GetExternalLinkChecks(ctx context.Context, sessionID string, limit, offset int, ...) ([]ExternalLinkCheck, error)
- func (s *Store) GetExtractions(ctx context.Context, sessionID string, limit, offset int) (*extraction.ExtractionResult, error)
- func (s *Store) GetPage(ctx context.Context, sessionID, url string) (*PageRow, error)
- func (s *Store) GetPageBodies(ctx context.Context, sessionID string, limit, offset int) ([]PageBody, error)
- func (s *Store) GetPageHTML(ctx context.Context, sessionID, url string) (string, error)
- func (s *Store) GetPageLinks(ctx context.Context, sessionID, url string, ...) (*PageLinksResult, error)
- func (s *Store) GetPageResourceChecks(ctx context.Context, sessionID string, limit, offset int, ...) ([]PageResourceCheck, error)
- func (s *Store) GetPageResourceTypeSummary(ctx context.Context, sessionID string) ([]ResourceTypeSummary, error)
- func (s *Store) GetRobotsContent(ctx context.Context, sessionID, host string) (*RobotsRow, error)
- func (s *Store) GetRobotsHosts(ctx context.Context, sessionID string) ([]RobotsRow, error)
- func (s *Store) GetSession(ctx context.Context, sessionID string) (*CrawlSession, error)
- func (s *Store) GetSitemapCoverageURLs(ctx context.Context, sessionID, filter string, limit, offset int) ([]SitemapURLRow, error)
- func (s *Store) GetSitemapURLs(ctx context.Context, sessionID, sitemapURL string, limit, offset int) ([]SitemapURLRow, error)
- func (s *Store) GetSitemaps(ctx context.Context, sessionID string) ([]SitemapRow, error)
- func (s *Store) GetURLsByHost(ctx context.Context, sessionID, host string) ([]string, error)
- func (s *Store) GlobalStats(ctx context.Context) ([]GlobalSessionStats, *StorageStatsResult, error)
- func (s *Store) HasStoredHTML(ctx context.Context, sessionID string) (bool, error)
- func (s *Store) ImportCSVSession(ctx context.Context, r io.Reader, projectID string) (*CSVImportResult, error)
- func (s *Store) ImportSession(ctx context.Context, r io.Reader) (*CrawlSession, error)
- func (s *Store) InsertExternalLinkChecks(ctx context.Context, checks []ExternalLinkCheck) error
- func (s *Store) InsertExtractions(ctx context.Context, rows []extraction.ExtractionRow) error
- func (s *Store) InsertGSCAnalytics(ctx context.Context, projectID string, rows []GSCAnalyticsInsertRow) error
- func (s *Store) InsertGSCInspection(ctx context.Context, projectID string, rows []GSCInspectionInsertRow) error
- func (s *Store) InsertLinks(ctx context.Context, links []LinkRow) error
- func (s *Store) InsertLogs(ctx context.Context, logs []applog.LogRow) error
- func (s *Store) InsertPageResourceChecks(ctx context.Context, checks []PageResourceCheck) error
- func (s *Store) InsertPageResourceRefs(ctx context.Context, refs []PageResourceRef) error
- func (s *Store) InsertPages(ctx context.Context, pages []PageRow) error
- func (s *Store) InsertProviderAPICalls(ctx context.Context, rows []ProviderAPICallRow) error
- func (s *Store) InsertProviderBacklinks(ctx context.Context, projectID string, rows []ProviderBacklinkRow) error
- func (s *Store) InsertProviderData(ctx context.Context, projectID string, rows []ProviderDataRow) error
- func (s *Store) InsertProviderDomainMetrics(ctx context.Context, projectID string, rows []ProviderDomainMetricsRow) error
- func (s *Store) InsertProviderRankings(ctx context.Context, projectID string, rows []ProviderRankingRow) error
- func (s *Store) InsertProviderRefDomains(ctx context.Context, projectID string, rows []ProviderRefDomainRow) error
- func (s *Store) InsertProviderTopPages(ctx context.Context, projectID string, rows []ProviderTopPageRow) error
- func (s *Store) InsertProviderVisibility(ctx context.Context, projectID string, rows []ProviderVisibilityRow) error
- func (s *Store) InsertRetryAttempt(ctx context.Context, sessionID string, attemptedAt time.Time, statusCode int, ...) error
- func (s *Store) InsertRobotsData(ctx context.Context, rows []RobotsRow) error
- func (s *Store) InsertSession(ctx context.Context, session *CrawlSession) error
- func (s *Store) InsertSitemapURLs(ctx context.Context, rows []SitemapURLRow) error
- func (s *Store) InsertSitemaps(ctx context.Context, rows []SitemapRow) error
- func (s *Store) InternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, ...) ([]LinkRow, error)
- func (s *Store) ListLogs(ctx context.Context, limit, offset int, level, component, search string) ([]applog.LogRow, int, error)
- func (s *Store) ListPages(ctx context.Context, sessionID string, limit, offset int, ...) ([]PageRow, error)
- func (s *Store) ListRedirectPages(ctx context.Context, sessionID string, limit, offset int, ...) ([]RedirectPageRow, error)
- func (s *Store) ListSessions(ctx context.Context, projectID ...string) ([]CrawlSession, error)
- func (s *Store) ListSessionsPaginated(ctx context.Context, limit, offset int, projectID, search string) ([]CrawlSession, int, error)
- func (s *Store) Migrate(ctx context.Context) error
- func (s *Store) NearDuplicates(ctx context.Context, sessionID string, threshold int, limit, offset int) (*NearDuplicatesResult, error)
- func (s *Store) PageRankDistribution(ctx context.Context, sessionID string, buckets int) (*PageRankDistributionResult, error)
- func (s *Store) PageRankTop(ctx context.Context, sessionID string, limit, offset int, directory string) (*PageRankTopResult, error)
- func (s *Store) PageRankTreemap(ctx context.Context, sessionID string, depth, minPages int) ([]PageRankTreemapEntry, error)
- func (s *Store) PagesWithAuthority(ctx context.Context, sessionID, projectID string, limit, offset int) ([]PageWithAuthority, int, error)
- func (s *Store) ProviderAPICalls(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderAPICallRow, int, error)
- func (s *Store) ProviderBacklinks(ctx context.Context, projectID, provider string, limit, offset int, ...) ([]ProviderBacklinkRow, int, error)
- func (s *Store) ProviderData(ctx context.Context, projectID, provider, dataType string, limit, offset int, ...) ([]ProviderDataRow, int, error)
- func (s *Store) ProviderDataAge(ctx context.Context, projectID, provider, dataType string) (time.Time, error)
- func (s *Store) ProviderDomainMetrics(ctx context.Context, projectID, provider string) (*ProviderDomainMetricsRow, error)
- func (s *Store) ProviderRankings(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderRankingRow, int, error)
- func (s *Store) ProviderRefDomains(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderRefDomainRow, int, error)
- func (s *Store) ProviderTopPages(ctx context.Context, projectID, provider string, limit, offset int) ([]ProviderTopPageRow, int, error)
- func (s *Store) ProviderVisibilityHistory(ctx context.Context, projectID, provider string) ([]ProviderVisibilityRow, error)
- func (s *Store) RecomputeDepths(ctx context.Context, sessionID string, seedURLs []string) error
- func (s *Store) RunCustomTestsSQL(ctx context.Context, sessionID string, rules []customtests.TestRule) (map[string]map[string]string, error)
- func (s *Store) RunExtractionsPostCrawl(ctx context.Context, sessionID string, extractors []extraction.Extractor) (*extraction.ExtractionResult, error)
- func (s *Store) SessionAudit(ctx context.Context, sessionID string) (*AuditResult, error)
- func (s *Store) SessionStats(ctx context.Context, sessionID string) (*SessionStats, error)
- func (s *Store) SessionStorageStats(ctx context.Context) (map[string]uint64, error)
- func (s *Store) StatusTimeline(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)
- func (s *Store) StatusTimelineRecent(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)
- func (s *Store) StorageStats(ctx context.Context) (*StorageStatsResult, error)
- func (s *Store) StreamCrawledURLs(ctx context.Context, sessionID string, fn func(string)) (int, error)
- func (s *Store) StreamPagesHTML(ctx context.Context, sessionID string) (<-chan PageHTMLRow, error)
- func (s *Store) StreamPagesHTMLForExtraction(ctx context.Context, sessionID string) (<-chan PageHTMLRow, error)
- func (s *Store) URLsByStatus(ctx context.Context, sessionID string, statusCode int) ([]string, error)
- func (s *Store) UncrawledURLs(ctx context.Context, sessionID string) ([]string, error)
- func (s *Store) UpdateSessionProject(ctx context.Context, sessionID string, projectID *string) error
- func (s *Store) WeightedPageRankTop(ctx context.Context, sessionID, projectID string, limit, offset int, ...) (*WeightedPageRankResult, error)
- type TableStorageStats
- type TopicalTF
- type WeightedPageRankPage
- type WeightedPageRankResult
Constants ¶
const ( RecordMeta = "meta" RecordPage = "page" RecordLink = "link" RecordRobots = "robots" RecordSitemap = "sitemap" RecordSitemapURL = "sitemap_url" )
Export JSONL record types.
const AlterPagesV2 = `` /* 1379-byte string literal not displayed */
AlterPagesV2 adds new columns to existing pages table.
const AlterPagesV3 = `
ALTER TABLE crawlobserver.pages
ADD COLUMN IF NOT EXISTS pagerank Float64 DEFAULT 0 AFTER found_on
`
const AlterPagesV4 = `
ALTER TABLE crawlobserver.pages
ADD COLUMN IF NOT EXISTS body_truncated Bool DEFAULT false AFTER body_html
`
const AlterPagesV5 = `` /* 1456-byte string literal not displayed */
const AlterPagesV6 = `
ALTER TABLE crawlobserver.pages
ADD COLUMN IF NOT EXISTS content_hash UInt64 DEFAULT 0 AFTER pagerank
`
const AlterSessionsV2 = `
ALTER TABLE crawlobserver.crawl_sessions
ADD COLUMN IF NOT EXISTS project_id Nullable(String) DEFAULT NULL
`
const CreateApplicationLogs = `` /* 347-byte string literal not displayed */
const CreateCrawlSessions = `` /* 294-byte string literal not displayed */
const CreateDatabase = `CREATE DATABASE IF NOT EXISTS crawlobserver`
const CreateExternalLinkChecks = `` /* 363-byte string literal not displayed */
const CreateExtractions = `` /* 291-byte string literal not displayed */
const CreateGSCAnalytics = `` /* 405-byte string literal not displayed */
const CreateGSCInspection = `` /* 497-byte string literal not displayed */
const CreateLinks = `` /* 305-byte string literal not displayed */
const CreateLinksV2 = `` /* 338-byte string literal not displayed */
const CreatePageResourceChecks = `` /* 427-byte string literal not displayed */
const CreatePageResourceRefs = `` /* 304-byte string literal not displayed */
const CreatePages = `` /* 1287-byte string literal not displayed */
const CreatePagesV2 = `` /* 1357-byte string literal not displayed */
DDL for v2 tables partitioned by crawl_session_id.
const CreateProviderAPICalls = `` /* 423-byte string literal not displayed */
const CreateProviderBacklinks = `` /* 551-byte string literal not displayed */
const CreateProviderData = `` /* 605-byte string literal not displayed */
const CreateProviderDomainMetrics = `` /* 427-byte string literal not displayed */
const CreateProviderRankings = `` /* 451-byte string literal not displayed */
const CreateProviderRefDomains = `` /* 389-byte string literal not displayed */
const CreateProviderTopPages = `` /* 484-byte string literal not displayed */
const CreateProviderVisibility = `` /* 370-byte string literal not displayed */
const CreateRetryAttempts = `` /* 247-byte string literal not displayed */
const CreateRobotsTxt = `` /* 263-byte string literal not displayed */
const CreateRobotsTxtV2 = `` /* 296-byte string literal not displayed */
const CreateSitemapURLs = `` /* 265-byte string literal not displayed */
const CreateSitemapURLsV2 = `` /* 298-byte string literal not displayed */
const CreateSitemaps = `` /* 286-byte string literal not displayed */
const CreateSitemapsV2 = `` /* 319-byte string literal not displayed */
const ExportFormatVersion = 1
ExportFormatVersion is the current export format version.
Variables ¶
var BacklinkFilters = map[string]FilterDef{ "source_url": {Column: "source_url", Type: FilterLike}, "target_url": {Column: "target_url", Type: FilterLike}, "anchor_text": {Column: "anchor_text", Type: FilterLike}, "trust_flow": {Column: "domain_rank", Type: FilterUint}, "citation_flow": {Column: "page_rank", Type: FilterUint}, "nofollow": {Column: "nofollow", Type: FilterBool}, "first_seen": {Column: "first_seen", Type: FilterLike}, "last_seen": {Column: "last_seen", Type: FilterLike}, }
BacklinkFilters defines the allowed filter columns for provider_backlinks.
var BacklinkSortColumns = map[string]string{
"source_url": "source_url",
"target_url": "target_url",
"anchor_text": "anchor_text",
"trust_flow": "domain_rank",
"citation_flow": "page_rank",
"nofollow": "nofollow",
"first_seen": "first_seen",
"last_seen": "last_seen",
}
BacklinkSortColumns maps query param names to DB column names for provider_backlinks.
var ExternalCheckFilters = map[string]FilterDef{ "url": {Column: "url", Type: FilterLike}, "status_code": {Column: "status_code", Type: FilterUint}, "error": {Column: "error", Type: FilterLike}, "content_type": {Column: "content_type", Type: FilterLike}, "redirect_url": {Column: "redirect_url", Type: FilterLike}, }
ExternalCheckFilters defines the allowed filter columns for the external_link_checks table.
var ExternalDomainCheckFilters = map[string]FilterDef{ "domain": {Column: "domain", Type: FilterLike}, }
ExternalDomainCheckFilters defines the allowed filter columns for domain-level external checks.
var LinkFilters = map[string]FilterDef{ "source_url": {Column: "source_url", Type: FilterLike}, "target_url": {Column: "target_url", Type: FilterLike}, "anchor_text": {Column: "anchor_text", Type: FilterLike}, "rel": {Column: "rel", Type: FilterLike}, "tag": {Column: "tag", Type: FilterLike}, }
LinkFilters defines the allowed filter columns for the links table.
var LinkSortColumns = map[string]string{
"source_url": "source_url",
"target_url": "target_url",
"anchor_text": "anchor_text",
"rel": "rel",
"tag": "tag",
"crawled_at": "crawled_at",
}
LinkSortColumns maps query param names to DB column names for links.
var Migrations = []Migration{ {Name: "create database", DDL: CreateDatabase}, {Name: "create crawl_sessions", DDL: CreateCrawlSessions}, {Name: "create pages", DDL: CreatePages}, {Name: "create links", DDL: CreateLinks}, {Name: "alter pages v2", DDL: AlterPagesV2}, {Name: "alter pages v3", DDL: AlterPagesV3}, {Name: "alter pages v4", DDL: AlterPagesV4}, {Name: "create robots_txt", DDL: CreateRobotsTxt}, {Name: "alter sessions v2", DDL: AlterSessionsV2}, {Name: "create sitemaps", DDL: CreateSitemaps}, {Name: "create sitemap_urls", DDL: CreateSitemapURLs}, {Name: "repartition by session_id", Fn: migrateRepartitionBySession}, {Name: "create gsc_analytics", DDL: CreateGSCAnalytics}, {Name: "create gsc_inspection", DDL: CreateGSCInspection}, {Name: "create external_link_checks", DDL: CreateExternalLinkChecks}, {Name: "create application_logs", DDL: CreateApplicationLogs}, {Name: "create provider_domain_metrics", DDL: CreateProviderDomainMetrics}, {Name: "create provider_backlinks", DDL: CreateProviderBacklinks}, {Name: "create provider_refdomains", DDL: CreateProviderRefDomains}, {Name: "create provider_rankings", DDL: CreateProviderRankings}, {Name: "create provider_visibility", DDL: CreateProviderVisibility}, {Name: "create page_resource_checks", DDL: CreatePageResourceChecks}, {Name: "create page_resource_refs", DDL: CreatePageResourceRefs}, {Name: "alter pages v5 js rendering", DDL: AlterPagesV5}, {Name: "alter pages v6 content hash", DDL: AlterPagesV6}, {Name: "create provider_top_pages", DDL: CreateProviderTopPages}, {Name: "create provider_api_calls", DDL: CreateProviderAPICalls}, {Name: "create extractions", DDL: CreateExtractions}, {Name: "create provider_data", DDL: CreateProviderData}, {Name: "alter provider_backlinks add ttf_topic", DDL: `ALTER TABLE crawlobserver.provider_backlinks ADD COLUMN IF NOT EXISTS source_ttf_topic String DEFAULT ''`}, {Name: "create retry_attempts", DDL: CreateRetryAttempts}, }
Migrations is the ordered list of migrations.
var PageFilters = map[string]FilterDef{ "url": {Column: "url", Type: FilterLike}, "content_type": {Column: "content_type", Type: FilterLike}, "title": {Column: "title", Type: FilterLike}, "canonical": {Column: "canonical", Type: FilterLike}, "meta_robots": {Column: "meta_robots", Type: FilterLike}, "meta_description": {Column: "meta_description", Type: FilterLike}, "meta_keywords": {Column: "meta_keywords", Type: FilterLike}, "lang": {Column: "lang", Type: FilterLike}, "og_title": {Column: "og_title", Type: FilterLike}, "content_encoding": {Column: "content_encoding", Type: FilterLike}, "index_reason": {Column: "index_reason", Type: FilterLike}, "error": {Column: "error", Type: FilterLike}, "found_on": {Column: "found_on", Type: FilterLike}, "status_code": {Column: "status_code", Type: FilterUint}, "title_length": {Column: "title_length", Type: FilterUint}, "meta_desc_length": {Column: "meta_desc_length", Type: FilterUint}, "depth": {Column: "depth", Type: FilterUint}, "word_count": {Column: "word_count", Type: FilterUint}, "internal_links_out": {Column: "internal_links_out", Type: FilterUint}, "external_links_out": {Column: "external_links_out", Type: FilterUint}, "images_count": {Column: "images_count", Type: FilterUint}, "images_no_alt": {Column: "images_no_alt", Type: FilterUint}, "body_size": {Column: "body_size", Type: FilterUint}, "fetch_duration_ms": {Column: "fetch_duration_ms", Type: FilterUint}, "is_indexable": {Column: "is_indexable", Type: FilterBool}, "canonical_is_self": {Column: "canonical_is_self", Type: FilterBool}, "h1": {Column: "h1", Type: FilterArray}, "h2": {Column: "h2", Type: FilterArray}, "pagerank": {Column: "pagerank", Type: FilterUint}, }
PageFilters defines the allowed filter columns for the pages table.
var PageResourceCheckFilters = map[string]FilterDef{ "url": {Column: "url", Type: FilterLike}, "resource_type": {Column: "resource_type", Type: FilterLike}, "is_internal": {Column: "is_internal", Type: FilterBool}, "status_code": {Column: "status_code", Type: FilterUint}, "content_type": {Column: "content_type", Type: FilterLike}, "error": {Column: "error", Type: FilterLike}, }
PageResourceCheckFilters defines the allowed filter columns for page_resource_checks.
var PageSortColumns = map[string]string{
"url": "url",
"status_code": "status_code",
"title": "title",
"title_length": "title_length",
"word_count": "word_count",
"internal_links_out": "internal_links_out",
"external_links_out": "external_links_out",
"body_size": "body_size",
"fetch_duration_ms": "fetch_duration_ms",
"depth": "depth",
"pagerank": "pagerank",
"content_type": "content_type",
"meta_description": "meta_description",
"meta_desc_length": "meta_desc_length",
"meta_keywords": "meta_keywords",
"canonical": "canonical",
"is_indexable": "is_indexable",
"index_reason": "index_reason",
"meta_robots": "meta_robots",
"canonical_is_self": "canonical_is_self",
"images_count": "images_count",
"images_no_alt": "images_no_alt",
"content_encoding": "content_encoding",
"lang": "lang",
"og_title": "og_title",
"crawled_at": "crawled_at",
}
PageSortColumns maps query param names to DB column names for pages.
var ProviderDataFilters = map[string]FilterDef{ "item_url": {Column: "item_url", Type: FilterLike}, "title": {Column: "str_data['title']", Type: FilterLike}, "language": {Column: "str_data['language']", Type: FilterLike}, "trust_flow": {Column: "trust_flow", Type: FilterUint}, "citation_flow": {Column: "citation_flow", Type: FilterUint}, "ext_backlinks": {Column: "ext_backlinks", Type: FilterUint}, "ref_domains": {Column: "ref_domains", Type: FilterUint}, "topic": {Column: "str_data['ttf_topic_0']", Type: FilterLike}, }
ProviderDataFilters defines the allowed filter columns for the provider_data table.
var ProviderDataSortColumns = map[string]string{
"item_url": "item_url",
"trust_flow": "trust_flow",
"citation_flow": "citation_flow",
"ext_backlinks": "ext_backlinks",
"ref_domains": "ref_domains",
"domain_rank": "domain_rank",
}
ProviderDataSortColumns maps query param names to DB column names for provider_data.
var RedirectFilters = map[string]FilterDef{ "url": {Column: "p.url", Type: FilterLike}, "status_code": {Column: "p.status_code", Type: FilterUint}, "final_url": {Column: "p.final_url", Type: FilterLike}, }
RedirectFilters defines the allowed filter columns for the redirect pages view.
var RedirectSortColumns = map[string]string{
"url": "p.url",
"status_code": "p.status_code",
"final_url": "p.final_url",
"inbound_internal_links": "inbound_internal_links",
}
RedirectSortColumns maps query param names to DB column names for redirect pages.
Functions ¶
func BuildOrderByClause ¶ added in v0.2.0
BuildOrderByClause returns an ORDER BY clause using the sort param or the default.
func BuildWhereClause ¶
func BuildWhereClause(filters []ParsedFilter) (string, []interface{}, error)
BuildWhereClause generates a SQL WHERE clause fragment and arguments from parsed filters.
Types ¶
type AnchorCount ¶
AnchorCount is an anchor text + count.
type AuditContent ¶
type AuditContent struct {
Total uint64 `json:"total"`
HTMLPages uint64 `json:"html_pages"`
TitleMissing uint64 `json:"title_missing"`
TitleTooLong uint64 `json:"title_too_long"`
TitleTooShort uint64 `json:"title_too_short"`
TitleDuplicates uint64 `json:"title_duplicates"`
MetaDescMissing uint64 `json:"meta_desc_missing"`
MetaDescTooLong uint64 `json:"meta_desc_too_long"`
MetaDescTooShort uint64 `json:"meta_desc_too_short"`
H1Missing uint64 `json:"h1_missing"`
H1Multiple uint64 `json:"h1_multiple"`
ThinUnder100 uint64 `json:"thin_under_100"`
Thin100300 uint64 `json:"thin_100_300"`
ImagesTotal uint64 `json:"images_total"`
ImagesNoAltTotal uint64 `json:"images_no_alt_total"`
PagesWithImagesNoAlt uint64 `json:"pages_with_images_no_alt"`
}
AuditContent holds content-related audit metrics.
type AuditInternational ¶
type AuditInternational struct {
PagesWithHreflang uint64 `json:"pages_with_hreflang"`
PagesWithLang uint64 `json:"pages_with_lang"`
PagesWithSchema uint64 `json:"pages_with_schema"`
LangDistribution []LangCount `json:"lang_distribution"`
SchemaDistribution []SchemaCount `json:"schema_distribution"`
}
AuditInternational holds international/schema audit metrics.
type AuditLinks ¶
type AuditLinks struct {
TotalInternal uint64 `json:"total_internal"`
TotalExternal uint64 `json:"total_external"`
ExternalNofollow uint64 `json:"external_nofollow"`
ExternalDofollow uint64 `json:"external_dofollow"`
PagesNoInternalOut uint64 `json:"pages_no_internal_out"`
PagesHighInternalOut uint64 `json:"pages_high_internal_out"`
PagesNoExternal uint64 `json:"pages_no_external"`
BrokenInternal uint64 `json:"broken_internal"`
TopExternalDomains []ExternalDomain `json:"top_external_domains"`
TopAnchors []AnchorCount `json:"top_anchors"`
}
AuditLinks holds link audit metrics.
type AuditResult ¶
type AuditResult struct {
Content *AuditContent `json:"content"`
Technical *AuditTechnical `json:"technical"`
Links *AuditLinks `json:"links"`
Structure *AuditStructure `json:"structure"`
Sitemaps *AuditSitemaps `json:"sitemaps"`
International *AuditInternational `json:"international"`
}
AuditResult is the combined audit result.
type AuditSitemaps ¶
type AuditSitemaps struct {
InBoth uint64 `json:"in_both"`
CrawledOnly uint64 `json:"crawled_only"`
SitemapOnly uint64 `json:"sitemap_only"`
TotalSitemapURLs uint64 `json:"total_sitemap_urls"`
}
AuditSitemaps holds sitemap coverage audit metrics.
type AuditStructure ¶
type AuditStructure struct {
Directories []DirectoryCount `json:"directories"`
OrphanPages uint64 `json:"orphan_pages"`
}
AuditStructure holds site structure audit metrics.
type AuditTechnical ¶
type AuditTechnical struct {
Indexable uint64 `json:"indexable"`
NonIndexable uint64 `json:"non_indexable"`
CanonicalSelf uint64 `json:"canonical_self"`
CanonicalOther uint64 `json:"canonical_other"`
CanonicalMissing uint64 `json:"canonical_missing"`
HasRedirect uint64 `json:"has_redirect"`
RedirectChainsOver2 uint64 `json:"redirect_chains_over_2"`
ResponseFast uint64 `json:"response_fast"`
ResponseOK uint64 `json:"response_ok"`
ResponseSlow uint64 `json:"response_slow"`
ResponseVerySlow uint64 `json:"response_very_slow"`
ErrorPages uint64 `json:"error_pages"`
NoindexReasons []NoindexReason `json:"noindex_reasons"`
ContentTypes []ContentTypeCount `json:"content_types"`
}
AuditTechnical holds technical audit metrics.
type BFSResult ¶
BFSResult holds the output of a BFS depth computation. It is produced by RecomputeDepths, which runs a BFS from seed URLs and updates depth/found_on in the pages table.
func ComputeBFSDepths ¶
func ComputeBFSDepths(seedURLs []string, crawledSet map[string]bool, adj map[string][]string) BFSResult
ComputeBFSDepths runs BFS from seedURLs over the link graph and returns the depth and found_on for every URL in crawledSet. Seeds get depth 0. Orphans (unreachable) get maxDepth+1.
type Buffer ¶
type Buffer struct {
// contains filtered or unexported fields
}
Buffer accumulates rows and flushes them in batches.
func NewBuffer ¶
func NewBuffer(store PageLinkInserter, batchSize int, flushInterval time.Duration, sessionID string) *Buffer
NewBuffer creates a new write buffer.
func (*Buffer) AddExtractions ¶ added in v0.3.0
func (b *Buffer) AddExtractions(rows []extraction.ExtractionRow)
AddExtractions adds extraction rows to the buffer.
func (*Buffer) Close ¶
func (b *Buffer) Close()
Close flushes remaining data and stops the flush loop.
func (*Buffer) ErrorState ¶
func (b *Buffer) ErrorState() BufferErrorState
ErrorState returns the current error state of the buffer for monitoring.
func (*Buffer) Flush ¶
func (b *Buffer) Flush()
Flush writes all buffered data to ClickHouse, retrying previously failed batches first.
func (*Buffer) SetOnDataLost ¶
SetOnDataLost registers a callback invoked (outside the lock) whenever data is dropped.
type BufferErrorState ¶
type BufferErrorState struct {
LostPages int64 `json:"lost_pages"`
LostLinks int64 `json:"lost_links"`
PendingPages int `json:"pending_retry_pages"`
PendingLinks int `json:"pending_retry_links"`
LastError error `json:"last_error,omitempty"`
}
BufferErrorState exposes retry/loss counters for monitoring.
type CSVImportResult ¶ added in v0.9.0
type CSVImportResult struct {
Session *CrawlSession `json:"session"`
RowsImported int `json:"rows_imported"`
RowsSkipped int `json:"rows_skipped"`
}
CSVImportResult holds the outcome of a CSV import.
type CompareStatsResult ¶
type CompareStatsResult struct {
SessionA string `json:"session_a"`
SessionB string `json:"session_b"`
StatsA *SessionStats `json:"stats_a"`
StatsB *SessionStats `json:"stats_b"`
}
CompareStatsResult holds side-by-side stats for two sessions.
type ContentTypeCount ¶
type ContentTypeCount struct {
ContentType string `json:"content_type"`
Count uint64 `json:"count"`
}
ContentTypeCount is a content type + count.
type CrawlSession ¶
type CrawlSession struct {
ID string
StartedAt time.Time
FinishedAt time.Time
Status string // running, completed, failed, stopped
SeedURLs []string
Config string // JSON
PagesCrawled uint64
UserAgent string
ProjectID *string
}
CrawlSession represents a crawl session.
type DirectoryCount ¶
DirectoryCount is a URL directory prefix + count.
type ExpiredDomain ¶
type ExpiredDomain struct {
RegistrableDomain string `json:"registrable_domain"`
DeadURLsChecked uint64 `json:"dead_urls_checked"`
Sources []ExpiredDomainSource `json:"sources"`
}
ExpiredDomain represents a registrable domain where all checked URLs had DNS failures.
type ExpiredDomainSource ¶
type ExpiredDomainSource struct {
SourceURL string `json:"source_url"`
TargetURL string `json:"target_url"`
}
ExpiredDomainSource represents a source page linking to an expired domain.
type ExpiredDomainsResult ¶
type ExpiredDomainsResult struct {
Domains []ExpiredDomain `json:"domains"`
Total uint64 `json:"total"`
}
ExpiredDomainsResult wraps paginated expired domain results.
type ExternalDomain ¶
ExternalDomain is a domain + link count.
type ExternalDomainCheck ¶
type ExternalDomainCheck struct {
Domain string `json:"domain"`
TotalURLs uint64 `json:"total_urls"`
OK uint64 `json:"ok"`
Redirects uint64 `json:"redirects"`
ClientErrors uint64 `json:"client_errors"`
ServerErrors uint64 `json:"server_errors"`
Unreachable uint64 `json:"unreachable"`
AvgResponseMs uint32 `json:"avg_response_ms"`
}
ExternalDomainCheck represents aggregated external check stats per domain.
type ExternalLinkCheck ¶
type ExternalLinkCheck struct {
CrawlSessionID string `json:"crawl_session_id"`
URL string `json:"url"`
StatusCode uint16 `json:"status_code"`
Error string `json:"error"`
ContentType string `json:"content_type"`
RedirectURL string `json:"redirect_url"`
ResponseTimeMs uint32 `json:"response_time_ms"`
CheckedAt time.Time `json:"checked_at"`
}
ExternalLinkCheck represents a single external URL check result.
type ExtractionQueryRow ¶ added in v0.3.0
ExtractionQueryRow is an ExtractionRow shaped for query results.
type FilterDef ¶
type FilterDef struct {
Column string
Type FilterType
}
type FilterType ¶
type FilterType int
const (
	FilterLike  FilterType = iota // String → ILIKE '%val%'
	FilterUint                    // Numeric → =N, >N, <N, >=N, <=N
	FilterBool                    // Bool → = true/false
	FilterArray                   // Array(String) → arrayExists(x -> x ILIKE '%val%', col)
)
type GSCAnalyticsInsertRow ¶
type GSCAnalyticsInsertRow struct {
Date time.Time
Query string
Page string
Country string
Device string
Clicks uint32
Impressions uint32
CTR float32
Position float32
}
GSCAnalyticsInsertRow is the input row for batch inserts.
type GSCCountryRow ¶
type GSCDeviceRow ¶
type GSCInspectionInsertRow ¶
type GSCInspectionRow ¶
type GSCInspectionRow struct {
URL string `json:"url"`
Verdict string `json:"verdict"`
CoverageState string `json:"coverage_state"`
IndexingState string `json:"indexing_state"`
RobotsTxtState string `json:"robots_txt_state"`
LastCrawlTime string `json:"last_crawl_time"`
CrawledAs string `json:"crawled_as"`
CanonicalURL string `json:"canonical_url"`
IsGoogleCanonical bool `json:"is_google_canonical"`
MobileUsability string `json:"mobile_usability"`
RichResultsItems uint16 `json:"rich_results_items"`
}
type GSCOverviewStats ¶
type GSCOverviewStats struct {
TotalClicks uint64 `json:"total_clicks"`
TotalImpressions uint64 `json:"total_impressions"`
AvgCTR float64 `json:"avg_ctr"`
AvgPosition float64 `json:"avg_position"`
DateMin string `json:"date_min"`
DateMax string `json:"date_max"`
TotalQueries uint64 `json:"total_queries"`
TotalPages uint64 `json:"total_pages"`
}
type GSCPageRow ¶
type GSCQueryRow ¶
type GSCTimelineRow ¶
type GlobalSessionStats ¶
type GlobalSessionStats struct {
SessionID string `json:"session_id"`
TotalPages uint64 `json:"total_pages"`
TotalLinks uint64 `json:"total_links"`
ErrorCount uint64 `json:"error_count"`
AvgFetchMs float64 `json:"avg_fetch_ms"`
}
GlobalSessionStats holds aggregated stats for a single session.
type HreflangRow ¶
HreflangRow represents a hreflang entry.
type LinkDiffResult ¶
type LinkDiffResult struct {
Links []LinkDiffRow `json:"links"`
TotalAdded uint64 `json:"total_added"`
TotalRemoved uint64 `json:"total_removed"`
}
LinkDiffResult wraps paginated link diff results.
type LinkDiffRow ¶
type LinkDiffRow struct {
SourceURL string `json:"source_url"`
TargetURL string `json:"target_url"`
AnchorText string `json:"anchor_text"`
DiffType string `json:"diff_type"`
}
LinkDiffRow represents a single internal link difference.
type LinkRow ¶
type LinkRow struct {
CrawlSessionID string
SourceURL string
TargetURL string
AnchorText string
Rel string
IsInternal bool
Tag string
CrawledAt time.Time
}
LinkRow represents a link for storage.
type Migration ¶
type Migration struct {
Name string
DDL string
Fn func(ctx context.Context, conn driver.Conn) error
}
Migration represents a schema migration step. Either DDL (a SQL string) or Fn (a function) must be set, not both.
type NearDuplicatePair ¶
type NearDuplicatePair struct {
URLa string `json:"url_a"`
URLb string `json:"url_b"`
TitleA string `json:"title_a"`
TitleB string `json:"title_b"`
CanonicalA string `json:"canonical_a"`
CanonicalB string `json:"canonical_b"`
WordCountA uint32 `json:"word_count_a"`
WordCountB uint32 `json:"word_count_b"`
Similarity float64 `json:"similarity"` // 0–1, 1 = exact duplicate
}
NearDuplicatePair represents two pages with near-identical content.
type NearDuplicatesResult ¶
type NearDuplicatesResult struct {
Pairs []NearDuplicatePair `json:"pairs"`
Total uint64 `json:"total"`
}
NearDuplicatesResult wraps paginated near-duplicate results.
type NoindexReason ¶
NoindexReason is a reason + count for non-indexable pages.
type PageDiffResult ¶
type PageDiffResult struct {
Pages []PageDiffRow `json:"pages"`
TotalAdded uint64 `json:"total_added"`
TotalRemoved uint64 `json:"total_removed"`
TotalChanged uint64 `json:"total_changed"`
}
PageDiffResult wraps paginated page diff results.
type PageDiffRow ¶
type PageDiffRow struct {
URL string `json:"url"`
DiffType string `json:"diff_type"`
StatusCodeA uint16 `json:"status_code_a"`
TitleA string `json:"title_a"`
CanonicalA string `json:"canonical_a"`
IsIndexableA bool `json:"is_indexable_a"`
WordCountA uint32 `json:"word_count_a"`
DepthA uint16 `json:"depth_a"`
PageRankA float64 `json:"pagerank_a"`
MetaDescriptionA string `json:"meta_description_a"`
H1A string `json:"h1_a"`
StatusCodeB uint16 `json:"status_code_b"`
TitleB string `json:"title_b"`
CanonicalB string `json:"canonical_b"`
IsIndexableB bool `json:"is_indexable_b"`
WordCountB uint32 `json:"word_count_b"`
DepthB uint16 `json:"depth_b"`
PageRankB float64 `json:"pagerank_b"`
MetaDescriptionB string `json:"meta_description_b"`
H1B string `json:"h1_b"`
}
PageDiffRow represents a single page difference between two crawls.
type PageHTMLRow ¶
PageHTMLRow is a url+html pair streamed from ClickHouse.
type PageLinkInserter ¶
type PageLinkInserter interface {
InsertPages(ctx context.Context, pages []PageRow) error
InsertLinks(ctx context.Context, links []LinkRow) error
InsertExtractions(ctx context.Context, rows []extraction.ExtractionRow) error
}
PageLinkInserter is the subset of Store used by Buffer for flushing data.
type PageLinksResult ¶
type PageLinksResult struct {
OutLinks []LinkRow `json:"out_links"`
InLinks []LinkRow `json:"in_links"`
OutLinksCount uint64 `json:"out_links_count"`
InLinksCount uint64 `json:"in_links_count"`
}
PageLinksResult holds outbound links, inbound links (paginated), and counts.
type PageRankBucket ¶
type PageRankBucket struct {
Min float64 `json:"min"`
Max float64 `json:"max"`
Count uint64 `json:"count"`
AvgPR float64 `json:"avg_pr"`
}
PageRankBucket holds one histogram bucket for PageRank distribution.
type PageRankDistributionResult ¶
type PageRankDistributionResult struct {
Buckets []PageRankBucket `json:"buckets"`
TotalWithPR uint64 `json:"total_with_pr"`
Avg float64 `json:"avg"`
Median float64 `json:"median"`
P90 float64 `json:"p90"`
P99 float64 `json:"p99"`
}
PageRankDistributionResult holds the full distribution response.
type PageRankEntry ¶
PageRankEntry holds a URL and its PageRank score.
type PageRankTopPage ¶
type PageRankTopPage struct {
URL string `json:"url"`
PageRank float64 `json:"pagerank"`
Depth uint16 `json:"depth"`
InternalLinksOut uint32 `json:"internal_links_out"`
ExternalLinksOut uint32 `json:"external_links_out"`
WordCount uint32 `json:"word_count"`
StatusCode uint16 `json:"status_code"`
Title string `json:"title"`
}
PageRankTopPage holds a single page entry for the top PageRank list.
type PageRankTopResult ¶
type PageRankTopResult struct {
Pages []PageRankTopPage `json:"pages"`
Total uint64 `json:"total"`
}
PageRankTopResult holds the paginated top PageRank pages response.
type PageRankTreemapEntry ¶
type PageRankTreemapEntry struct {
Path string `json:"path"`
PageCount uint64 `json:"page_count"`
TotalPR float64 `json:"total_pr"`
AvgPR float64 `json:"avg_pr"`
MaxPR float64 `json:"max_pr"`
}
PageRankTreemapEntry holds aggregated PageRank data for a URL directory.
type PageResourceCheck ¶
type PageResourceCheck struct {
CrawlSessionID string `json:"crawl_session_id"`
URL string `json:"url"`
ResourceType string `json:"resource_type"`
IsInternal bool `json:"is_internal"`
StatusCode uint16 `json:"status_code"`
Error string `json:"error"`
ContentType string `json:"content_type"`
RedirectURL string `json:"redirect_url"`
ResponseTimeMs uint32 `json:"response_time_ms"`
CheckedAt time.Time `json:"checked_at"`
PageCount uint64 `json:"page_count,omitempty"`
}
PageResourceCheck represents a single page resource check result.
type PageResourceRef ¶
type PageResourceRef struct {
CrawlSessionID string `json:"crawl_session_id"`
PageURL string `json:"page_url"`
ResourceURL string `json:"resource_url"`
ResourceType string `json:"resource_type"`
IsInternal bool `json:"is_internal"`
}
PageResourceRef links a page to a resource it uses.
type PageRow ¶
type PageRow struct {
CrawlSessionID string
URL string
FinalURL string
StatusCode uint16
ContentType string
Title string
TitleLength uint16
Canonical string
CanonicalIsSelf bool
IsIndexable bool
IndexReason string // why not indexable
MetaRobots string
MetaDescription string
MetaDescLength uint16
MetaKeywords string
H1 []string
H2 []string
H3 []string
H4 []string
H5 []string
H6 []string
WordCount uint32
InternalLinksOut uint32
ExternalLinksOut uint32
ImagesCount uint16
ImagesNoAlt uint16
Hreflang []HreflangRow
Lang string
OGTitle string
OGDescription string
OGImage string
SchemaTypes []string
Headers map[string]string
RedirectChain []RedirectHopRow
BodySize uint64
FetchDurationMs uint64
ContentEncoding string
XRobotsTag string
Error string
Depth uint16
FoundOn string
PageRank float64
ContentHash uint64
BodyHTML string
BodyTruncated bool
CrawledAt time.Time
// JS Rendering
JSRendered bool
JSRenderDurationMs uint64
JSRenderError string
// Rendered data
RenderedTitle string
RenderedMetaDescription string
RenderedH1 []string
RenderedWordCount uint32
RenderedLinksCount uint32
RenderedImagesCount uint16
RenderedCanonical string
RenderedMetaRobots string
RenderedSchemaTypes []string
RenderedBodyHTML string
// Diff flags (static vs rendered)
JSChangedTitle bool
JSChangedDescription bool
JSChangedH1 bool
JSChangedCanonical bool
JSChangedContent bool // word count changed >20%
JSAddedLinks int32 // delta links
JSAddedImages int32 // delta images
JSAddedSchema bool // new schema types appeared
}
PageRow represents a crawled page for storage.
type PageWithAuthority ¶
type PageWithAuthority struct {
URL string `json:"url"`
Title string `json:"title"`
PageRank float64 `json:"pagerank"`
WordCount uint32 `json:"word_count"`
StatusCode uint16 `json:"status_code"`
Depth uint16 `json:"depth"`
TrustFlow *uint8 `json:"trust_flow"`
CitationFlow *uint8 `json:"citation_flow"`
ExtBackLinks *int64 `json:"ext_backlinks"`
RefDomains *int64 `json:"ref_domains"`
}
PageWithAuthority combines a crawled page with its Majestic authority data.
type ParsedFilter ¶
type ProviderAPICallRow ¶
type ProviderAPICallRow struct {
ProjectID string `json:"project_id"`
Provider string `json:"provider"`
Endpoint string `json:"endpoint"`
Method string `json:"method"`
StatusCode uint16 `json:"status_code"`
DurationMs uint32 `json:"duration_ms"`
RowsReturned uint32 `json:"rows_returned"`
ResponseBody string `json:"response_body"`
Error string `json:"error"`
CalledAt time.Time `json:"called_at"`
}
type ProviderBacklinkRow ¶
type ProviderBacklinkRow struct {
Provider string `json:"provider"`
Domain string `json:"domain"`
SourceURL string `json:"source_url"`
TargetURL string `json:"target_url"`
AnchorText string `json:"anchor_text"`
SourceDomain string `json:"source_domain"`
LinkType string `json:"link_type"`
TrustFlow float64 `json:"trust_flow"`
CitationFlow float64 `json:"citation_flow"`
SourceTTFTopic string `json:"source_ttf_topic"`
Nofollow bool `json:"nofollow"`
FirstSeen time.Time `json:"first_seen"`
LastSeen time.Time `json:"last_seen"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderDataRow ¶ added in v0.6.0
type ProviderDataRow struct {
Provider string `json:"provider"`
DataType string `json:"data_type"`
Domain string `json:"domain"`
ItemURL string `json:"item_url"`
TrustFlow uint8 `json:"trust_flow"`
CitationFlow uint8 `json:"citation_flow"`
DomainRank float64 `json:"domain_rank"`
ExtBacklinks int64 `json:"ext_backlinks"`
RefDomains int64 `json:"ref_domains"`
StrData map[string]string `json:"str_data"`
NumData map[string]float64 `json:"num_data"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderDomainMetricsRow ¶
type ProviderDomainMetricsRow struct {
Provider string `json:"provider"`
Domain string `json:"domain"`
BacklinksTotal int64 `json:"backlinks_total"`
RefDomainsTotal int64 `json:"refdomains_total"`
DomainRank float64 `json:"domain_rank"`
OrganicKeywords int64 `json:"organic_keywords"`
OrganicTraffic int64 `json:"organic_traffic"`
OrganicCost float64 `json:"organic_cost"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderRankingRow ¶
type ProviderRankingRow struct {
Provider string `json:"provider"`
Domain string `json:"domain"`
Keyword string `json:"keyword"`
URL string `json:"url"`
SearchBase string `json:"search_base"`
Position uint16 `json:"position"`
SearchVolume int64 `json:"search_volume"`
CPC float64 `json:"cpc"`
Traffic float64 `json:"traffic"`
TrafficPct float64 `json:"traffic_pct"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderRefDomainRow ¶
type ProviderRefDomainRow struct {
Provider string `json:"provider"`
Domain string `json:"domain"`
RefDomain string `json:"ref_domain"`
BacklinkCount int64 `json:"backlink_count"`
DomainRank float64 `json:"domain_rank"`
FirstSeen time.Time `json:"first_seen"`
LastSeen time.Time `json:"last_seen"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderTopPageRow ¶
type ProviderTopPageRow struct {
Provider string `json:"provider"`
Domain string `json:"domain"`
URL string `json:"url"`
Title string `json:"title"`
TrustFlow uint8 `json:"trust_flow"`
CitationFlow uint8 `json:"citation_flow"`
ExtBackLinks int64 `json:"ext_backlinks"`
RefDomains int64 `json:"ref_domains"`
TopicalTrustFlow []TopicalTF `json:"topical_trust_flow"`
Language string `json:"language"`
FetchedAt time.Time `json:"fetched_at"`
}
type ProviderVisibilityRow ¶
type RedirectHopRow ¶
RedirectHopRow represents a redirect hop for storage.
type RedirectPageRow ¶ added in v0.2.1
type RedirectPageRow struct {
URL string `json:"url"`
StatusCode uint16 `json:"status_code"`
FinalURL string `json:"final_url"`
InboundInternalLinks uint64 `json:"inbound_internal_links"`
}
RedirectPageRow represents a redirect page with inbound internal link count.
type ResourceTypeSummary ¶
type ResourceTypeSummary struct {
ResourceType string `json:"resource_type"`
Total uint64 `json:"total"`
Internal uint64 `json:"internal"`
External uint64 `json:"external"`
OK uint64 `json:"ok"`
Errors uint64 `json:"errors"`
}
ResourceTypeSummary holds aggregated stats for one resource type.
type RobotsRow ¶
type RobotsRow struct {
CrawlSessionID string
Host string
StatusCode uint16
Content string
FetchedAt time.Time
}
RobotsRow represents a robots.txt entry for storage.
type SchemaCount ¶
SchemaCount is a schema type + count.
type SessionStats ¶
type SessionStats struct {
TotalPages uint64 `json:"total_pages"`
TotalLinks uint64 `json:"total_links"`
InternalLinks uint64 `json:"internal_links"`
ExternalLinks uint64 `json:"external_links"`
AvgFetchMs float64 `json:"avg_fetch_ms"`
ErrorCount uint64 `json:"error_count"`
StatusCodes map[uint16]uint64 `json:"status_codes"`
DepthDistribution map[uint16]uint64 `json:"depth_distribution"`
PagesPerSecond float64 `json:"pages_per_second"`
CrawlDurationSec float64 `json:"crawl_duration_sec"`
TopPageRank []PageRankEntry `json:"top_pagerank"`
JSRenderedPages uint64 `json:"js_rendered_pages"`
JSChangedTitleCount uint64 `json:"js_changed_title_count"`
JSChangedH1Count uint64 `json:"js_changed_h1_count"`
JSChangedContentCount uint64 `json:"js_changed_content_count"`
AvgJSRenderMs float64 `json:"avg_js_render_ms"`
}
SessionStats holds aggregate stats for a crawl session.
type SitemapRow ¶
type SitemapRow struct {
CrawlSessionID string
URL string
Type string // "index" | "urlset"
URLCount uint32
ParentURL string // empty if top-level
StatusCode uint16
FetchedAt time.Time
}
SitemapRow represents a discovered sitemap for storage.
type SitemapURLRow ¶
type SitemapURLRow struct {
CrawlSessionID string
SitemapURL string
Loc string
LastMod string
ChangeFreq string
Priority string
}
SitemapURLRow represents a URL entry within a sitemap.
type SortParam ¶ added in v0.2.0
type SortParam struct {
Column string // DB column name (from whitelist)
Order string // "ASC" or "DESC"
}
SortParam holds a validated sort column and direction.
type StatusTimelineBucket ¶ added in v0.9.0
type StatusTimelineBucket struct {
Timestamp time.Time `json:"ts"`
OK uint64 `json:"ok"`
Redirect uint64 `json:"redirect"`
Status403 uint64 `json:"s403"`
Status429 uint64 `json:"s429"`
ClientErr uint64 `json:"client_err"` // 4xx excluding 403/429
ServerErr uint64 `json:"server_err"` // 5xx
FetchErr uint64 `json:"fetch_err"` // status_code = 0
Total uint64 `json:"total"`
Retried403 uint64 `json:"retried_403"`
Retried429 uint64 `json:"retried_429"`
Retried5xx uint64 `json:"retried_5xx"`
}
StatusTimelineBucket holds counts per status code category for a time interval.
type StorageStatsResult ¶
type StorageStatsResult struct {
Tables []TableStorageStats `json:"tables"`
}
StorageStatsResult holds storage stats for all tables.
type Store ¶
type Store struct {
// contains filtered or unexported fields
}
Store manages ClickHouse connections and operations.
func (*Store) CompareLinks ¶
func (s *Store) CompareLinks(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*LinkDiffResult, error)
CompareLinks returns paginated link diffs between two sessions.
func (*Store) ComparePages ¶
func (s *Store) ComparePages(ctx context.Context, sessionA, sessionB, diffType string, limit, offset int) (*PageDiffResult, error)
ComparePages returns paginated page diffs between two sessions.
func (*Store) CompareStats ¶
func (s *Store) CompareStats(ctx context.Context, sessionA, sessionB string) (*CompareStatsResult, error)
CompareStats retrieves side-by-side stats for two sessions.
func (*Store) ComputePageRank ¶
ComputePageRank computes internal PageRank for all pages in a session. Uses uint32 IDs for memory efficiency and iterative power method. URL→ID mapping is done in ClickHouse via a Join-engine temp table, so only uint32 pairs are transferred for the link graph.
func (*Store) CountPages ¶
CountPages returns the total number of pages for a session.
func (*Store) DeleteExtractions ¶ added in v0.3.0
DeleteExtractions removes all extraction data for a session.
func (*Store) DeleteFailedPages ¶
DeleteFailedPages removes pages with status_code = 0 for a session so they can be re-crawled.
func (*Store) DeleteGSCData ¶
func (*Store) DeletePagesByStatus ¶
func (s *Store) DeletePagesByStatus(ctx context.Context, sessionID string, statusCode int) (int, error)
DeletePagesByStatus deletes pages with a specific status code and returns the count deleted.
func (*Store) DeleteProviderData ¶
func (*Store) DeleteSession ¶
DeleteSession deletes a crawl session and all its associated data. Uses DROP PARTITION for instant deletion on partitioned tables.
func (*Store) ExportLogs ¶
ExportLogs returns all logs (up to 7 days per TTL) for JSONL export.
func (*Store) ExportSession ¶
func (s *Store) ExportSession(ctx context.Context, sessionID string, w io.Writer, includeHTML bool) error
ExportSession streams a session's data as gzipped JSONL to w.
func (*Store) ExternalLinks ¶
ExternalLinks retrieves external links for a given session (or all sessions).
func (*Store) ExternalLinksPaginated ¶
func (s *Store) ExternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]LinkRow, error)
ExternalLinksPaginated retrieves external links with pagination and optional filters.
func (*Store) FailedURLs ¶
FailedURLs returns URLs with status_code = 0 (fetch errors) for a session.
func (*Store) GSCByCountry ¶
func (*Store) GSCByDevice ¶
func (*Store) GSCInspectionResults ¶
func (*Store) GSCOverview ¶
func (*Store) GSCTimeline ¶
func (*Store) GSCTopPages ¶
func (*Store) GSCTopQueries ¶
func (*Store) GetExpiredDomains ¶
func (s *Store) GetExpiredDomains(ctx context.Context, sessionID string, limit, offset int) (*ExpiredDomainsResult, error)
GetExpiredDomains returns registrable domains where all external checks failed with DNS errors.
func (*Store) GetExternalLinkCheckDomains ¶
func (s *Store) GetExternalLinkCheckDomains(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]ExternalDomainCheck, error)
GetExternalLinkCheckDomains returns aggregated external check stats per domain.
func (*Store) GetExternalLinkChecks ¶
func (s *Store) GetExternalLinkChecks(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]ExternalLinkCheck, error)
GetExternalLinkChecks returns paginated external link check results for a session.
func (*Store) GetExtractions ¶ added in v0.3.0
func (s *Store) GetExtractions(ctx context.Context, sessionID string, limit, offset int) (*extraction.ExtractionResult, error)
GetExtractions retrieves extraction results for a session, pivoted by extractor name.
func (*Store) GetPageBodies ¶
func (s *Store) GetPageBodies(ctx context.Context, sessionID string, limit, offset int) ([]PageBody, error)
GetPageBodies reads URL + body_html for a session in batches.
func (*Store) GetPageHTML ¶
GetPageHTML retrieves the raw HTML for a specific page.
func (*Store) GetPageLinks ¶
func (s *Store) GetPageLinks(ctx context.Context, sessionID, url string, outLimit, outOffset, inLimit, inOffset int) (*PageLinksResult, error)
GetPageLinks retrieves outbound and inbound links for a URL with pagination.
func (*Store) GetPageResourceChecks ¶
func (s *Store) GetPageResourceChecks(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter) ([]PageResourceCheck, error)
GetPageResourceChecks returns paginated resource checks with page_count from refs.
func (*Store) GetPageResourceTypeSummary ¶
func (s *Store) GetPageResourceTypeSummary(ctx context.Context, sessionID string) ([]ResourceTypeSummary, error)
GetPageResourceTypeSummary returns aggregated stats per resource type.
func (*Store) GetRobotsContent ¶
GetRobotsContent returns the full robots.txt content for a specific host in a session.
func (*Store) GetRobotsHosts ¶
GetRobotsHosts returns all hosts with robots.txt data for a session (without content).
func (*Store) GetSession ¶
GetSession retrieves a single crawl session by ID.
func (*Store) GetSitemapCoverageURLs ¶ added in v0.4.0
func (s *Store) GetSitemapCoverageURLs(ctx context.Context, sessionID, filter string, limit, offset int) ([]SitemapURLRow, error)
GetSitemapCoverageURLs returns paginated sitemap URLs filtered by coverage type. filter must be "sitemap_only" (in sitemap but not crawled) or "in_both" (in sitemap and crawled).
func (*Store) GetSitemapURLs ¶
func (s *Store) GetSitemapURLs(ctx context.Context, sessionID, sitemapURL string, limit, offset int) ([]SitemapURLRow, error)
GetSitemapURLs returns paginated URLs from a specific sitemap.
func (*Store) GetSitemaps ¶
GetSitemaps returns all sitemaps for a session.
func (*Store) GetURLsByHost ¶
GetURLsByHost returns all distinct URLs for a given host in a session.
func (*Store) GlobalStats ¶
func (s *Store) GlobalStats(ctx context.Context) ([]GlobalSessionStats, *StorageStatsResult, error)
GlobalStats retrieves aggregated stats per session across all data.
func (*Store) HasStoredHTML ¶ added in v0.3.0
HasStoredHTML reports whether stored HTML exists for a session.
func (*Store) ImportCSVSession ¶ added in v0.9.0
func (s *Store) ImportCSVSession(ctx context.Context, r io.Reader, projectID string) (*CSVImportResult, error)
ImportCSVSession reads a CSV file, auto-detects the source, maps columns to PageRow, creates a session, and batch-inserts pages.
func (*Store) ImportSession ¶
ImportSession reads a gzipped JSONL stream and inserts the session with a new UUID.
func (*Store) InsertExternalLinkChecks ¶
func (s *Store) InsertExternalLinkChecks(ctx context.Context, checks []ExternalLinkCheck) error
InsertExternalLinkChecks batch-inserts external link check results.
func (*Store) InsertExtractions ¶ added in v0.3.0
func (s *Store) InsertExtractions(ctx context.Context, rows []extraction.ExtractionRow) error
InsertExtractions batch inserts extraction rows.
func (*Store) InsertGSCAnalytics ¶
func (*Store) InsertGSCInspection ¶
func (*Store) InsertLinks ¶
InsertLinks batch inserts link rows.
func (*Store) InsertLogs ¶
InsertLogs batch inserts application log rows.
func (*Store) InsertPageResourceChecks ¶
func (s *Store) InsertPageResourceChecks(ctx context.Context, checks []PageResourceCheck) error
InsertPageResourceChecks batch inserts page resource check results.
func (*Store) InsertPageResourceRefs ¶
func (s *Store) InsertPageResourceRefs(ctx context.Context, refs []PageResourceRef) error
InsertPageResourceRefs batch inserts page-to-resource references.
func (*Store) InsertPages ¶
InsertPages batch inserts page rows.
func (*Store) InsertProviderAPICalls ¶
func (s *Store) InsertProviderAPICalls(ctx context.Context, rows []ProviderAPICallRow) error
func (*Store) InsertProviderBacklinks ¶
func (*Store) InsertProviderData ¶ added in v0.6.0
func (*Store) InsertProviderDomainMetrics ¶
func (*Store) InsertProviderRankings ¶
func (*Store) InsertProviderRefDomains ¶
func (*Store) InsertProviderTopPages ¶
func (*Store) InsertProviderVisibility ¶
func (*Store) InsertRetryAttempt ¶ added in v0.9.0
func (s *Store) InsertRetryAttempt(ctx context.Context, sessionID string, attemptedAt time.Time, statusCode int, url string) error
InsertRetryAttempt records a single retry attempt.
func (*Store) InsertRobotsData ¶
InsertRobotsData batch inserts robots.txt rows.
func (*Store) InsertSession ¶
func (s *Store) InsertSession(ctx context.Context, session *CrawlSession) error
InsertSession inserts or updates a crawl session.
func (*Store) InsertSitemapURLs ¶
func (s *Store) InsertSitemapURLs(ctx context.Context, rows []SitemapURLRow) error
InsertSitemapURLs inserts sitemap URL rows.
func (*Store) InsertSitemaps ¶
func (s *Store) InsertSitemaps(ctx context.Context, rows []SitemapRow) error
InsertSitemaps inserts sitemap rows.
func (*Store) InternalLinksPaginated ¶
func (s *Store) InternalLinksPaginated(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]LinkRow, error)
InternalLinksPaginated retrieves internal links with pagination and optional filters.
func (*Store) ListLogs ¶
func (s *Store) ListLogs(ctx context.Context, limit, offset int, level, component, search string) ([]applog.LogRow, int, error)
ListLogs returns paginated application logs with optional filters.
func (*Store) ListPages ¶
func (s *Store) ListPages(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]PageRow, error)
ListPages retrieves pages for a session with pagination and optional filters.
func (*Store) ListRedirectPages ¶ added in v0.2.1
func (s *Store) ListRedirectPages(ctx context.Context, sessionID string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]RedirectPageRow, error)
ListRedirectPages retrieves pages with 3xx status codes and their inbound internal link count.
func (*Store) ListSessions ¶
ListSessions retrieves crawl sessions, optionally filtered by project ID.
func (*Store) ListSessionsPaginated ¶
func (s *Store) ListSessionsPaginated(ctx context.Context, limit, offset int, projectID, search string) ([]CrawlSession, int, error)
ListSessionsPaginated retrieves crawl sessions with pagination, optional project and search filters.
func (*Store) NearDuplicates ¶
func (s *Store) NearDuplicates(ctx context.Context, sessionID string, threshold int, limit, offset int) (*NearDuplicatesResult, error)
NearDuplicates finds pages with similar content using SimHash Hamming distance. threshold is the max Hamming distance (e.g. 3 = ≤3 bits differ out of 64).
func (*Store) PageRankDistribution ¶
func (s *Store) PageRankDistribution(ctx context.Context, sessionID string, buckets int) (*PageRankDistributionResult, error)
PageRankDistribution returns a histogram of PageRank values for a session.
func (*Store) PageRankTop ¶
func (s *Store) PageRankTop(ctx context.Context, sessionID string, limit, offset int, directory string) (*PageRankTopResult, error)
PageRankTop returns the top pages by PageRank with metadata, paginated.
func (*Store) PageRankTreemap ¶
func (s *Store) PageRankTreemap(ctx context.Context, sessionID string, depth, minPages int) ([]PageRankTreemapEntry, error)
PageRankTreemap returns PageRank aggregated by URL directory prefix.
func (*Store) PagesWithAuthority ¶
func (s *Store) PagesWithAuthority(ctx context.Context, sessionID, projectID string, limit, offset int) ([]PageWithAuthority, int, error)
PagesWithAuthority joins crawled pages with provider top_pages (Majestic authority data).
func (*Store) ProviderAPICalls ¶
func (*Store) ProviderBacklinks ¶
func (s *Store) ProviderBacklinks(ctx context.Context, projectID, provider string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]ProviderBacklinkRow, int, error)
func (*Store) ProviderData ¶ added in v0.6.0
func (s *Store) ProviderData(ctx context.Context, projectID, provider, dataType string, limit, offset int, filters []ParsedFilter, sort *SortParam) ([]ProviderDataRow, int, error)
func (*Store) ProviderDataAge ¶ added in v0.6.0
func (*Store) ProviderDomainMetrics ¶
func (*Store) ProviderRankings ¶
func (*Store) ProviderRefDomains ¶
func (*Store) ProviderTopPages ¶
func (*Store) ProviderVisibilityHistory ¶
func (*Store) RecomputeDepths ¶
func (*Store) RunCustomTestsSQL ¶
func (s *Store) RunCustomTestsSQL(ctx context.Context, sessionID string, rules []customtests.TestRule) (map[string]map[string]string, error)
RunCustomTestsSQL runs ClickHouse-native test rules as a single query. All user values are parameterized via named placeholders.
func (*Store) RunExtractionsPostCrawl ¶ added in v0.3.0
func (s *Store) RunExtractionsPostCrawl(ctx context.Context, sessionID string, extractors []extraction.Extractor) (*extraction.ExtractionResult, error)
RunExtractionsPostCrawl runs extractors against stored HTML and inserts results.
func (*Store) SessionAudit ¶
SessionAudit computes a comprehensive SEO audit for a crawl session.
func (*Store) SessionStats ¶
SessionStats retrieves aggregate statistics for a crawl session.
func (*Store) SessionStorageStats ¶
SessionStorageStats returns bytes on disk per crawl session, computed from system.parts partitions across all data tables.
func (*Store) StatusTimeline ¶ added in v0.9.0
func (s *Store) StatusTimeline(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)
StatusTimeline returns time-bucketed status code counts for a crawl session. The interval is auto-computed to produce ~60-100 buckets.
func (*Store) StatusTimelineRecent ¶ added in v0.9.0
func (s *Store) StatusTimelineRecent(ctx context.Context, sessionID string) ([]StatusTimelineBucket, error)
StatusTimelineRecent returns the last 10 minutes of crawl activity in 10-second buckets.
func (*Store) StorageStats ¶
func (s *Store) StorageStats(ctx context.Context) (*StorageStatsResult, error)
StorageStats retrieves disk usage and row counts for all crawlobserver tables.
func (*Store) StreamCrawledURLs ¶ added in v0.9.0
func (s *Store) StreamCrawledURLs(ctx context.Context, sessionID string, fn func(string)) (int, error)
StreamCrawledURLs streams all URLs already crawled in a session, calling fn for each URL. This avoids loading the entire URL list into memory (which can cause OOM on large sites with 1M+ pages). Returns the number of URLs streamed.
func (*Store) StreamPagesHTML ¶
StreamPagesHTML streams url+body_html pairs for a session.
func (*Store) StreamPagesHTMLForExtraction ¶ added in v0.3.0
func (s *Store) StreamPagesHTMLForExtraction(ctx context.Context, sessionID string) (<-chan PageHTMLRow, error)
StreamPagesHTMLForExtraction is an alias for StreamPagesHTML, provided for clarity in extraction contexts.
func (*Store) URLsByStatus ¶
func (s *Store) URLsByStatus(ctx context.Context, sessionID string, statusCode int) ([]string, error)
URLsByStatus returns URLs with a specific status code for a session.
func (*Store) UncrawledURLs ¶
UncrawledURLs returns internal link targets that were discovered but not crawled in a session.
func (*Store) UpdateSessionProject ¶
func (s *Store) UpdateSessionProject(ctx context.Context, sessionID string, projectID *string) error
UpdateSessionProject re-inserts a session with a new project_id (ReplacingMergeTree pattern).
func (*Store) WeightedPageRankTop ¶ added in v0.6.1
func (s *Store) WeightedPageRankTop(ctx context.Context, sessionID, projectID string, limit, offset int, directory, sort, order string) (*WeightedPageRankResult, error)
WeightedPageRankTop returns pages ranked by a weighted PageRank that fuses internal PR with SEObserver data.
type TableStorageStats ¶
type TableStorageStats struct {
Name string `json:"name"`
BytesOnDisk uint64 `json:"bytes_on_disk"`
Rows uint64 `json:"rows"`
}
TableStorageStats holds storage stats for a single table.
type WeightedPageRankPage ¶ added in v0.6.1
type WeightedPageRankPage struct {
URL string `json:"url"`
PageRank float64 `json:"pagerank"`
WeightedPR float64 `json:"weighted_pr"`
TrustFlow *uint8 `json:"trust_flow"`
CitationFlow *uint8 `json:"citation_flow"`
ExtBackLinks *int64 `json:"ext_backlinks"`
RefDomains *int64 `json:"ref_domains"`
Depth uint16 `json:"depth"`
InternalLinksOut uint32 `json:"internal_links_out"`
StatusCode uint16 `json:"status_code"`
Title string `json:"title"`
TTFTopic *string `json:"ttf_topic"`
}
WeightedPageRankPage represents a page with weighted PageRank combining internal PR and SEObserver data.
type WeightedPageRankResult ¶ added in v0.6.1
type WeightedPageRankResult struct {
Pages []WeightedPageRankPage `json:"pages"`
Total uint64 `json:"total"`
}
WeightedPageRankResult wraps paginated weighted PageRank results.
Source Files ¶
- buffer.go
- clickhouse.go
- clickhouse_compare.go
- clickhouse_custom_tests.go
- clickhouse_external_checks.go
- clickhouse_extractions.go
- clickhouse_gsc.go
- clickhouse_links.go
- clickhouse_logs.go
- clickhouse_pages.go
- clickhouse_providers.go
- clickhouse_resources.go
- clickhouse_sessions.go
- clickhouse_sitemaps.go
- export.go
- filters.go
- import.go
- import_csv.go
- models.go
- schema.go