Documentation
¶
Overview ¶
Package metadata provides functionality for extracting and processing document metadata. It extracts metadata from HTML documents including title, description, author, and Schema.org data.
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type MetaTag ¶
type MetaTag struct {
Name *string `json:"name,omitempty"`
Property *string `json:"property,omitempty"`
Content *string `json:"content"`
}
MetaTag represents a meta tag item from an HTML document. Original TypeScript code:
export interface MetaTagItem {
name?: string | null;
property?: string | null;
content: string | null;
}
type Metadata ¶
type Metadata struct {
Title string `json:"title"`
Description string `json:"description"`
Domain string `json:"domain"`
Favicon string `json:"favicon"`
Image string `json:"image"`
ParseTime int64 `json:"parseTime"`
Published string `json:"published"`
Author string `json:"author"`
Site string `json:"site"`
SchemaOrgData interface{} `json:"schemaOrgData"`
WordCount int `json:"wordCount"`
}
Metadata represents the metadata extracted from a document. Original TypeScript code:
export interface DefuddleMetadata {
title: string;
description: string;
domain: string;
favicon: string;
image: string;
parseTime: number;
published: string;
author: string;
site: string;
schemaOrgData: any;
wordCount: number;
}
func Extract ¶
func Extract(doc *goquery.Document, schemaOrgData interface{}, metaTags []MetaTag, baseURL string) *Metadata
Extract extracts metadata from a document. Original TypeScript code:
static extract(doc: Document, schemaOrgData: any, metaTags: MetaTagItem[]): DefuddleMetadata {
let domain = '';
let url = '';
try {
// Try to get URL from document location
url = doc.location?.href || '';
// If no URL from location, try other sources
if (!url) {
url = this.getMetaContent(metaTags, "property", "og:url") ||
this.getMetaContent(metaTags, "property", "twitter:url") ||
this.getSchemaProperty(schemaOrgData, 'url') ||
this.getSchemaProperty(schemaOrgData, 'mainEntityOfPage.url') ||
this.getSchemaProperty(schemaOrgData, 'mainEntity.url') ||
this.getSchemaProperty(schemaOrgData, 'WebSite.url') ||
doc.querySelector('link[rel="canonical"]')?.getAttribute('href') || '';
}
if (url) {
try {
domain = new URL(url).hostname.replace(/^www\./, '');
} catch (e) {
console.warn('Failed to parse URL:', e);
}
}
} catch (e) {
// If URL parsing fails, try to get from base tag
const baseTag = doc.querySelector('base[href]');
if (baseTag) {
try {
url = baseTag.getAttribute('href') || '';
domain = new URL(url).hostname.replace(/^www\./, '');
} catch (e) {
console.warn('Failed to parse base URL:', e);
}
}
}
return {
title: this.getTitle(doc, schemaOrgData, metaTags),
description: this.getDescription(doc, schemaOrgData, metaTags),
domain,
favicon: this.getFavicon(doc, url, metaTags),
image: this.getImage(doc, schemaOrgData, metaTags),
published: this.getPublished(doc, schemaOrgData, metaTags),
author: this.getAuthor(doc, schemaOrgData, metaTags),
site: this.getSite(doc, schemaOrgData, metaTags),
schemaOrgData,
wordCount: 0,
parseTime: 0
};
}
Click to show internal directories.
Click to hide internal directories.