metadata

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 11, 2025 License: MIT Imports: 5 Imported by: 0

Documentation

Overview

Package metadata provides functionality for extracting and processing document metadata. It extracts metadata from HTML documents including title, description, author, and Schema.org data.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type MetaTag

// MetaTag holds the values of a single HTML meta tag item.
// Every field is a *string, so each one can be nil.
type MetaTag struct {
	// Name is optional: with omitempty, the "name" key is left out of the
	// JSON output when the pointer is nil.
	Name     *string `json:"name,omitempty"`
	// Property is optional: with omitempty, the "property" key is left out
	// of the JSON output when the pointer is nil.
	Property *string `json:"property,omitempty"`
	// Content has no omitempty, so the "content" key is always written and
	// a nil pointer is encoded as JSON null.
	Content  *string `json:"content"`
}

MetaTag represents a meta tag item from HTML. JavaScript original code:

/**
 * A single meta tag item, as passed to the metadata extractor in an array.
 */
export interface MetaTagItem {
  /** Optional: the key may be absent, or present with a null value. */
  name?: string | null;
  /** Optional: the key may be absent, or present with a null value. */
  property?: string | null;
  /** Required key; the value itself may be null. */
  content: string | null;
}

type Metadata

// Metadata is the result record produced when metadata is extracted from a
// document. Each field is serialized under the JSON key given in its tag;
// none use omitempty, so every key is always present in the output.
type Metadata struct {
	// Title is serialized as "title".
	Title         string      `json:"title"`
	// Description is serialized as "description".
	Description   string      `json:"description"`
	// Domain is serialized as "domain".
	Domain        string      `json:"domain"`
	// Favicon is serialized as "favicon".
	Favicon       string      `json:"favicon"`
	// Image is serialized as "image".
	Image         string      `json:"image"`
	// ParseTime is serialized as "parseTime".
	// NOTE(review): the unit is not stated here; confirm against the caller.
	ParseTime     int64       `json:"parseTime"`
	// Published is serialized as "published". It is a plain string; no date
	// format is enforced by the type.
	Published     string      `json:"published"`
	// Author is serialized as "author".
	Author        string      `json:"author"`
	// Site is serialized as "site".
	Site          string      `json:"site"`
	// SchemaOrgData is untyped (interface{}) and serialized as
	// "schemaOrgData"; a nil value is encoded as JSON null.
	SchemaOrgData interface{} `json:"schemaOrgData"`
	// WordCount is serialized as "wordCount".
	WordCount     int         `json:"wordCount"`
}

Metadata represents extracted metadata from a document. JavaScript original code:

/**
 * Metadata record describing a document. All properties are required.
 */
export interface DefuddleMetadata {
  /** Document title. */
  title: string;
  /** Document description. */
  description: string;
  /** Domain string for the document. */
  domain: string;
  /** Favicon reference, as a string. */
  favicon: string;
  /** Image reference, as a string. */
  image: string;
  /** Parse time as a number. NOTE(review): unit not stated here; confirm. */
  parseTime: number;
  /** Publication value as a plain string; no date format is enforced by the type. */
  published: string;
  /** Author string. */
  author: string;
  /** Site string. */
  site: string;
  /** Schema.org data; typed `any`, so its shape is not checked by the compiler. */
  schemaOrgData: any;
  /** Word count as a number. */
  wordCount: number;
}

func Extract

func Extract(doc *goquery.Document, schemaOrgData interface{}, metaTags []MetaTag, baseURL string) *Metadata

Extract extracts metadata from a document. JavaScript original code:

/**
 * Builds a DefuddleMetadata record for a document.
 *
 * First resolves a page URL and, from it, a domain; then delegates every
 * other field to the sibling getters (getTitle, getDescription, ...).
 *
 * @param doc - Document to read from (location, canonical link, base tag).
 * @param schemaOrgData - Schema.org data; queried via getSchemaProperty and
 *   also returned unchanged in the result.
 * @param metaTags - Meta tag items; queried via getMetaContent.
 * @returns The metadata record. `domain` is '' when no URL could be parsed;
 *   `wordCount` and `parseTime` are always 0 here.
 */
static extract(doc: Document, schemaOrgData: any, metaTags: MetaTagItem[]): DefuddleMetadata {
  let domain = '';
  let url = '';

  try {
    // Try to get URL from document location
    url = doc.location?.href || '';

    // If no URL from location, try other sources
    // The `||` chain takes the first truthy candidate, in this order:
    // og:url, twitter:url, schema.org url properties, then the canonical link.
    if (!url) {
      url = this.getMetaContent(metaTags, "property", "og:url") ||
        this.getMetaContent(metaTags, "property", "twitter:url") ||
        this.getSchemaProperty(schemaOrgData, 'url') ||
        this.getSchemaProperty(schemaOrgData, 'mainEntityOfPage.url') ||
        this.getSchemaProperty(schemaOrgData, 'mainEntity.url') ||
        this.getSchemaProperty(schemaOrgData, 'WebSite.url') ||
        doc.querySelector('link[rel="canonical"]')?.getAttribute('href') || '';
    }

    if (url) {
      try {
        // Domain is the hostname with a leading "www." stripped.
        domain = new URL(url).hostname.replace(/^www\./, '');
      } catch (e) {
        // A malformed URL is logged and swallowed here; domain stays ''.
        console.warn('Failed to parse URL:', e);
      }
    }
  } catch (e) {
    // If URL parsing fails, try to get from base tag
    // NOTE(review): a `new URL(url)` failure is already caught by the inner
    // try/catch above, so this fallback only runs when something else in the
    // outer try throws (the location read or one of the lookups).
    const baseTag = doc.querySelector('base[href]');
    if (baseTag) {
      try {
        url = baseTag.getAttribute('href') || '';
        domain = new URL(url).hostname.replace(/^www\./, '');
      } catch (e) {
        // This `e` shadows the outer catch variable; only the base-URL
        // parse error is logged.
        console.warn('Failed to parse base URL:', e);
      }
    }
  }

  return {
    title: this.getTitle(doc, schemaOrgData, metaTags),
    description: this.getDescription(doc, schemaOrgData, metaTags),
    domain,
    // The favicon lookup receives the resolved url (possibly '').
    favicon: this.getFavicon(doc, url, metaTags),
    image: this.getImage(doc, schemaOrgData, metaTags),
    published: this.getPublished(doc, schemaOrgData, metaTags),
    author: this.getAuthor(doc, schemaOrgData, metaTags),
    site: this.getSite(doc, schemaOrgData, metaTags),
    schemaOrgData,
    // Placeholders: both are set to 0 here (presumably filled in by the
    // caller — confirm).
    wordCount: 0,
    parseTime: 0
  };
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL