Documentation
¶
Overview ¶
Package extractors provides site-specific content extraction functionality.
Index ¶
- Variables
- func ClearCache()
- func InitializeBuiltins()
- func Register(mapping ExtractorMapping)
- type BaseExtractor
- type ChatGPTExtractor
- func (c *ChatGPTExtractor) CanExtract() bool
- func (c *ChatGPTExtractor) Extract() *ExtractorResult
- func (c *ChatGPTExtractor) ExtractMessages() []ConversationMessage
- func (c *ChatGPTExtractor) GetFootnotes() []Footnote
- func (c *ChatGPTExtractor) GetMetadata() ConversationMetadata
- func (c *ChatGPTExtractor) GetName() string
- type ClaudeExtractor
- func (c *ClaudeExtractor) CanExtract() bool
- func (c *ClaudeExtractor) Extract() *ExtractorResult
- func (c *ClaudeExtractor) ExtractMessages() []ConversationMessage
- func (c *ClaudeExtractor) GetFootnotes() []Footnote
- func (c *ClaudeExtractor) GetMetadata() ConversationMetadata
- func (c *ClaudeExtractor) GetName() string
- type ConversationExtractor
- type ConversationExtractorBase
- type ConversationMessage
- type ConversationMetadata
- type ExtractorBase
- func (e *ExtractorBase) GetAttribute(sel *goquery.Selection, attr string) string
- func (e *ExtractorBase) GetDocument() *goquery.Document
- func (e *ExtractorBase) GetHTMLContent(sel *goquery.Selection) string
- func (e *ExtractorBase) GetSchemaOrgData() interface{}
- func (e *ExtractorBase) GetTextContent(sel *goquery.Selection) string
- func (e *ExtractorBase) GetURL() string
- type ExtractorConstructor
- type ExtractorMapping
- type ExtractorResult
- type Footnote
- type GeminiExtractor
- func (g *GeminiExtractor) CanExtract() bool
- func (g *GeminiExtractor) Extract() *ExtractorResult
- func (g *GeminiExtractor) ExtractMessages() []ConversationMessage
- func (g *GeminiExtractor) GetFootnotes() []Footnote
- func (g *GeminiExtractor) GetMetadata() ConversationMetadata
- func (g *GeminiExtractor) GetName() string
- type GitHubExtractor
- type GrokExtractor
- func (g *GrokExtractor) CanExtract() bool
- func (g *GrokExtractor) Extract() *ExtractorResult
- func (g *GrokExtractor) ExtractMessages() []ConversationMessage
- func (g *GrokExtractor) GetFootnotes() []Footnote
- func (g *GrokExtractor) GetMetadata() ConversationMetadata
- func (g *GrokExtractor) GetName() string
- type HackerNewsExtractor
- type RedditExtractor
- type Registry
- type TwitterExtractor
- type UserInfo
- type YouTubeExtractor
Constants ¶
This section is empty.
Variables ¶
var (
	// DefaultRegistry is the global registry instance that can be extended by users.
	DefaultRegistry = NewRegistry()
)
Functions ¶
func InitializeBuiltins ¶
func InitializeBuiltins()
InitializeBuiltins initializes all built-in extractors. TypeScript original code:
ExtractorRegistry.initialize();
func Register ¶
func Register(mapping ExtractorMapping)
Register adds a mapping to the default registry. TypeScript original code: ExtractorRegistry.register (static method).
Types ¶
type BaseExtractor ¶
type BaseExtractor interface {
CanExtract() bool
Extract() *ExtractorResult
GetName() string
}
BaseExtractor defines the interface for site-specific extractors TypeScript original code:
export abstract class BaseExtractor {
protected document: Document;
protected url: string;
protected schemaOrgData?: any;
constructor(document: Document, url: string, schemaOrgData?: any) {
this.document = document;
this.url = url;
this.schemaOrgData = schemaOrgData;
}
abstract canExtract(): boolean;
abstract extract(): ExtractorResult;
abstract getName(): string;
}
func FindExtractor ¶
func FindExtractor(document *goquery.Document, url string, schemaOrgData interface{}) BaseExtractor
FindExtractor finds an extractor using the default registry. TypeScript original code: ExtractorRegistry.findExtractor (static method).
type ChatGPTExtractor ¶
type ChatGPTExtractor struct {
*ConversationExtractorBase
// contains filtered or unexported fields
}
ChatGPTExtractor handles ChatGPT conversation content extraction. TypeScript original code:
import { ConversationExtractor } from './_conversation';
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
export class ChatGPTExtractor extends ConversationExtractor {
private articles: NodeListOf<Element> | null;
private footnotes: Footnote[];
private footnoteCounter: number;
constructor(document: Document, url: string) {
super(document, url);
this.articles = document.querySelectorAll('article[data-testid^="conversation-turn-"]');
this.footnotes = [];
this.footnoteCounter = 0;
}
canExtract(): boolean {
return !!this.articles && this.articles.length > 0;
}
protected extractMessages(): ConversationMessage[] {
const messages: ConversationMessage[] = [];
this.footnotes = [];
this.footnoteCounter = 0;
if (!this.articles) return messages;
this.articles.forEach((article) => {
// Get the localized author text from the sr-only heading and clean it
const authorElement = article.querySelector('h5.sr-only, h6.sr-only');
const authorText = authorElement?.textContent
?.trim()
?.replace(/:\s*$/, '') // Remove colon and any trailing whitespace
|| '';
let currentAuthorRole = '';
const authorRole = article.getAttribute('data-message-author-role');
if (authorRole) {
currentAuthorRole = authorRole;
}
let messageContent = article.innerHTML || '';
messageContent = messageContent.replace(/\u200B/g, '');
// Remove specific elements from the message content
const tempDiv = document.createElement('div');
tempDiv.innerHTML = messageContent;
tempDiv.querySelectorAll('h5.sr-only, h6.sr-only, span[data-state="closed"]').forEach(el => el.remove());
messageContent = tempDiv.innerHTML;
// Process inline references using regex to find the containers
// Look for spans containing citation links (a[target=_blank][rel=noopener]), replacing entire structure
// Also capture optional preceding ZeroWidthSpace
const citationPattern = /(​)?(<span[^>]*?>\s*<a(?=[^>]*?href="([^"]+)")(?=[^>]*?target="_blank")(?=[^>]*?rel="noopener")[^>]*?>[\s\S]*?<\/a>\s*<\/span>)/gi;
messageContent = messageContent.replace(citationPattern, (match, zws, spanStructure, url) => {
// url is captured group 3
let domain = '';
let fragmentText = '';
try {
// Extract domain without www.
domain = new URL(url).hostname.replace(/^www\./, '');
// Extract and decode the fragment text if it exists
const hashParts = url.split('#:~:text=');
if (hashParts.length > 1) {
fragmentText = decodeURIComponent(hashParts[1]);
fragmentText = fragmentText.replace(/%2C/g, ',');
const parts = fragmentText.split(',');
if (parts.length > 1 && parts[0].trim()) {
fragmentText = ` — ${parts[0].trim()}...`;
} else if (parts[0].trim()) {
fragmentText = ` — ${fragmentText.trim()}`;
} else {
fragmentText = '';
}
}
} catch (e) {
console.error(`Failed to parse URL: ${url}`, e);
domain = url;
}
// Check if this URL already exists in our footnotes
let footnoteIndex = this.footnotes.findIndex(fn => fn.url === url);
let footnoteNumber: number;
if (footnoteIndex === -1) {
this.footnoteCounter++;
footnoteNumber = this.footnoteCounter;
this.footnotes.push({
url,
text: `<a href="${url}">${domain}</a>${fragmentText}`
});
} else {
footnoteNumber = footnoteIndex + 1;
}
// Return just the footnote reference, replacing the ZWS (if captured) and the entire span structure
return `<sup id="fnref:${footnoteNumber}"><a href="#fn:${footnoteNumber}">${footnoteNumber}</a></sup>`;
});
// Clean up any stray empty paragraph tags
messageContent = messageContent
.replace(/<p[^>]*>\s*<\/p>/g, '');
messages.push({
author: authorText,
content: messageContent.trim(),
metadata: {
role: currentAuthorRole || 'unknown'
}
});
});
return messages;
}
protected getFootnotes(): Footnote[] {
return this.footnotes;
}
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messages = this.extractMessages();
return {
title,
site: 'ChatGPT',
url: this.url,
messageCount: messages.length,
description: `ChatGPT conversation with ${messages.length} messages`
};
}
private getTitle(): string {
// Try to get the page title first
const pageTitle = this.document.title?.trim();
if (pageTitle && pageTitle !== 'ChatGPT') {
return pageTitle;
}
// Fall back to first user message
const firstUserTurn = this.articles?.item(0)?.querySelector('.text-message');
if (firstUserTurn) {
const text = firstUserTurn.textContent || '';
// Truncate to first 50 characters if longer
return text.length > 50 ? text.slice(0, 50) + '...' : text;
}
return 'ChatGPT Conversation';
}
}
func NewChatGPTExtractor ¶
func NewChatGPTExtractor(document *goquery.Document, urlStr string, schemaOrgData interface{}) *ChatGPTExtractor
NewChatGPTExtractor creates a new ChatGPT extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
this.articles = document.querySelectorAll('article[data-testid^="conversation-turn-"]');
this.footnotes = [];
this.footnoteCounter = 0;
}
func (*ChatGPTExtractor) CanExtract ¶
func (c *ChatGPTExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.articles && this.articles.length > 0;
}
func (*ChatGPTExtractor) Extract ¶
func (c *ChatGPTExtractor) Extract() *ExtractorResult
Extract extracts the ChatGPT conversation. TypeScript original code:
extract(): ExtractorResult {
const messages = this.extractMessages();
const metadata = this.getMetadata();
const footnotes = this.getFootnotes();
const rawContentHtml = this.createContentHtml(messages, footnotes);
// ... rest of extract method
}
func (*ChatGPTExtractor) ExtractMessages ¶
func (c *ChatGPTExtractor) ExtractMessages() []ConversationMessage
ExtractMessages extracts conversation messages. TypeScript original code (improved version):
protected extractMessages(): ConversationMessage[] {
const messages: ConversationMessage[] = [];
this.footnotes = [];
this.footnoteCounter = 0;
if (!this.articles) return messages;
this.articles.forEach((article) => {
// Get the localized author text from the sr-only heading and clean it
const authorElement = article.querySelector('h5.sr-only, h6.sr-only');
const authorText = authorElement?.textContent
?.trim()
?.replace(/:\s*$/, '') // Remove colon and any trailing whitespace
|| '';
let currentAuthorRole = '';
const authorRole = article.getAttribute('data-message-author-role');
if (authorRole) {
currentAuthorRole = authorRole;
}
let messageContent = article.innerHTML || '';
messageContent = messageContent.replace(/\u200B/g, '');
// Remove specific elements from the message content
const tempDiv = document.createElement('div');
tempDiv.innerHTML = messageContent;
tempDiv.querySelectorAll('h5.sr-only, h6.sr-only, span[data-state="closed"]').forEach(el => el.remove());
messageContent = tempDiv.innerHTML;
// Process inline references
messageContent = this.processFootnotes(messageContent);
// Clean up any stray empty paragraph tags
messageContent = messageContent.replace(/<p[^>]*>\s*<\/p>/g, '');
messages.push({
author: authorText,
content: messageContent.trim(),
metadata: {
role: currentAuthorRole || 'unknown'
}
});
});
return messages;
}
func (*ChatGPTExtractor) GetFootnotes ¶
func (c *ChatGPTExtractor) GetFootnotes() []Footnote
GetFootnotes returns the conversation footnotes. TypeScript original code:
protected getFootnotes(): Footnote[] {
return this.footnotes;
}
func (*ChatGPTExtractor) GetMetadata ¶
func (c *ChatGPTExtractor) GetMetadata() ConversationMetadata
GetMetadata returns conversation metadata. TypeScript original code:
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messages = this.extractMessages();
return {
title,
site: 'ChatGPT',
url: this.url,
messageCount: messages.length,
description: `ChatGPT conversation with ${messages.length} messages`
};
}
func (*ChatGPTExtractor) GetName ¶
func (c *ChatGPTExtractor) GetName() string
GetName returns the name of the extractor
type ClaudeExtractor ¶
type ClaudeExtractor struct {
*ConversationExtractorBase
// contains filtered or unexported fields
}
ClaudeExtractor handles Claude conversation content extraction. TypeScript original code:
import { ConversationExtractor } from './_conversation';
import { ConversationMessage, ConversationMetadata } from '../types/extractors';
export class ClaudeExtractor extends ConversationExtractor {
private articles: NodeListOf<Element> | null;
constructor(document: Document, url: string) {
super(document, url);
// Find all message blocks - both user and assistant messages
this.articles = document.querySelectorAll('div[data-testid="user-message"], div[data-testid="assistant-message"], div.font-claude-message');
}
canExtract(): boolean {
return !!this.articles && this.articles.length > 0;
}
protected extractMessages(): ConversationMessage[] {
const messages: ConversationMessage[] = [];
if (!this.articles) return messages;
this.articles.forEach((article) => {
let role: string;
let content: string;
if (article.hasAttribute('data-testid')) {
// Handle user messages
if (article.getAttribute('data-testid') === 'user-message') {
role = 'you';
content = article.innerHTML;
}
// Skip non-message elements
else {
return;
}
} else if (article.classList.contains('font-claude-message')) {
// Handle Claude messages
role = 'assistant';
content = article.innerHTML;
} else {
// Skip unknown elements
return;
}
if (content) {
messages.push({
author: role === 'you' ? 'You' : 'Claude',
content: content.trim(),
metadata: {
role: role
}
});
}
});
return messages;
}
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messages = this.extractMessages();
return {
title,
site: 'Claude',
url: this.url,
messageCount: messages.length,
description: `Claude conversation with ${messages.length} messages`
};
}
private getTitle(): string {
// Try to get the page title first
const pageTitle = this.document.title?.trim();
if (pageTitle && pageTitle !== 'Claude') {
// Remove ' - Claude' suffix if present
return pageTitle.replace(/ - Claude$/, '');
}
// Try to get title from header
const headerTitle = this.document.querySelector('header .font-tiempos')?.textContent?.trim();
if (headerTitle) {
return headerTitle;
}
// Fall back to first user message
const firstUserMessage = this.articles?.item(0)?.querySelector('[data-testid="user-message"]');
if (firstUserMessage) {
const text = firstUserMessage.textContent || '';
// Truncate to first 50 characters if longer
return text.length > 50 ? text.slice(0, 50) + '...' : text;
}
return 'Claude Conversation';
}
}
func NewClaudeExtractor ¶
func NewClaudeExtractor(document *goquery.Document, urlStr string, schemaOrgData interface{}) *ClaudeExtractor
NewClaudeExtractor creates a new Claude extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
// Find all message blocks - both user and assistant messages
this.articles = document.querySelectorAll('div[data-testid="user-message"], div[data-testid="assistant-message"], div.font-claude-message');
}
func (*ClaudeExtractor) CanExtract ¶
func (c *ClaudeExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.articles && this.articles.length > 0;
}
func (*ClaudeExtractor) Extract ¶
func (c *ClaudeExtractor) Extract() *ExtractorResult
Extract extracts the Claude conversation. TypeScript original code:
extract(): ExtractorResult {
return this.extractWithDefuddle(this);
}
func (*ClaudeExtractor) ExtractMessages ¶
func (c *ClaudeExtractor) ExtractMessages() []ConversationMessage
ExtractMessages extracts conversation messages. TypeScript original code:
protected extractMessages(): ConversationMessage[] {
const messages: ConversationMessage[] = [];
if (!this.articles) return messages;
this.articles.forEach((article) => {
let role: string;
let content: string;
if (article.hasAttribute('data-testid')) {
// Handle user messages
if (article.getAttribute('data-testid') === 'user-message') {
role = 'you';
content = article.innerHTML;
}
// Skip non-message elements
else {
return;
}
} else if (article.classList.contains('font-claude-message')) {
// Handle Claude messages
role = 'assistant';
content = article.innerHTML;
} else {
// Skip unknown elements
return;
}
if (content) {
messages.push({
author: role === 'you' ? 'You' : 'Claude',
content: content.trim(),
metadata: {
role: role
}
});
}
});
return messages;
}
func (*ClaudeExtractor) GetFootnotes ¶
func (c *ClaudeExtractor) GetFootnotes() []Footnote
GetFootnotes returns the conversation footnotes. TypeScript original code:
protected getFootnotes(): Footnote[] {
return [];
}
func (*ClaudeExtractor) GetMetadata ¶
func (c *ClaudeExtractor) GetMetadata() ConversationMetadata
GetMetadata returns conversation metadata. TypeScript original code:
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messages = this.extractMessages();
return {
title,
site: 'Claude',
url: this.url,
messageCount: messages.length,
description: `Claude conversation with ${messages.length} messages`
};
}
func (*ClaudeExtractor) GetName ¶
func (c *ClaudeExtractor) GetName() string
GetName returns the name of the extractor
type ConversationExtractor ¶
type ConversationExtractor interface {
BaseExtractor
ExtractMessages() []ConversationMessage
GetMetadata() ConversationMetadata
GetFootnotes() []Footnote
}
ConversationExtractor defines the interface for conversation extractors. TypeScript original code:
export abstract class ConversationExtractor extends BaseExtractor {
protected abstract extractMessages(): ConversationMessage[];
protected abstract getMetadata(): ConversationMetadata;
protected getFootnotes(): Footnote[] {
return [];
}
}
type ConversationExtractorBase ¶
type ConversationExtractorBase struct {
*ExtractorBase
}
ConversationExtractorBase provides common functionality for conversation extractors. It is the implementation corresponding to the TypeScript ConversationExtractor abstract class.
func NewConversationExtractorBase ¶
func NewConversationExtractorBase(document *goquery.Document, url string, schemaOrgData interface{}) *ConversationExtractorBase
NewConversationExtractorBase creates a new conversation extractor base. TypeScript original code:
constructor(document: Document, url: string, schemaOrgData?: any) {
super(document, url, schemaOrgData);
}
func (*ConversationExtractorBase) CreateContentHTML ¶
func (c *ConversationExtractorBase) CreateContentHTML(messages []ConversationMessage, footnotes []Footnote) string
CreateContentHTML creates formatted HTML content from messages and footnotes. TypeScript original code:
protected createContentHtml(messages: ConversationMessage[], footnotes: Footnote[]): string {
const messagesHtml = messages.map((message, index) => {
const timestampHtml = message.timestamp ?
`<div class="message-timestamp">${message.timestamp}</div>` : '';
// Check if content already has paragraph tags
const hasParagraphs = /<p[^>]*>[\s\S]*?<\/p>/i.test(message.content);
const contentHtml = hasParagraphs ? message.content : `<p>${message.content}</p>`;
// Add metadata to data attributes
const dataAttributes = message.metadata ?
Object.entries(message.metadata)
.map(([key, value]) => `data-${key}="${value}"`)
.join(' ') : '';
return `
<div class="message message-${message.author.toLowerCase()}" ${dataAttributes}>
<div class="message-header">
<p class="message-author"><strong>${message.author}</strong></p>
${timestampHtml}
</div>
<div class="message-content">
${contentHtml}
</div>
</div>${index < messages.length - 1 ? '\n<hr>' : ''}`;
}).join('\n').trim();
// Add footnotes section if we have any
const footnotesHtml = footnotes.length > 0 ? `
<div id="footnotes">
<ol>
${footnotes.map((footnote, index) => `
<li class="footnote" id="fn:${index + 1}">
<p>
<a href="${footnote.url}" target="_blank">${footnote.text}</a> <a href="#fnref:${index + 1}" class="footnote-backref">↩</a>
</p>
</li>
`).join('')}
</ol>
</div>` : '';
return `${messagesHtml}\n${footnotesHtml}`.trim();
}
func (*ConversationExtractorBase) ExtractWithDefuddle ¶
func (c *ConversationExtractorBase) ExtractWithDefuddle(extractor ConversationExtractor) *ExtractorResult
ExtractWithDefuddle extracts conversation content in the same way as the TypeScript implementation. TypeScript original code:
extract(): ExtractorResult {
const messages = this.extractMessages();
const metadata = this.getMetadata();
const footnotes = this.getFootnotes();
const rawContentHtml = this.createContentHtml(messages, footnotes);
// Create a temporary document to run Defuddle on our content
const tempDoc = document.implementation.createHTMLDocument();
const container = tempDoc.createElement('article');
container.innerHTML = rawContentHtml;
tempDoc.body.appendChild(container);
// Run Defuddle on our formatted content
const defuddled = new Defuddle(tempDoc).parse();
const contentHtml = defuddled.content;
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
messageCount: messages.length.toString(),
},
variables: {
title: metadata.title || 'Conversation',
site: metadata.site,
description: metadata.description || `${metadata.site} conversation with ${messages.length} messages`,
wordCount: defuddled.wordCount?.toString() || '',
}
};
}
type ConversationMessage ¶
type ConversationMessage struct {
Author string `json:"author"`
Content string `json:"content"`
Timestamp string `json:"timestamp,omitempty"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
ConversationMessage represents a single message in a conversation. It corresponds to the TypeScript interface ConversationMessage.
type ConversationMetadata ¶
type ConversationMetadata struct {
Title string `json:"title"`
Site string `json:"site"`
URL string `json:"url"`
MessageCount int `json:"messageCount"`
Description string `json:"description"`
}
ConversationMetadata represents metadata about the conversation. It corresponds to the TypeScript interface ConversationMetadata.
type ExtractorBase ¶
type ExtractorBase struct {
// contains filtered or unexported fields
}
ExtractorBase provides common functionality for extractors. It implements the protected properties of the TypeScript BaseExtractor.
func NewExtractorBase ¶
func NewExtractorBase(document *goquery.Document, url string, schemaOrgData interface{}) *ExtractorBase
NewExtractorBase creates a new base extractor. TypeScript original code:
constructor(document: Document, url: string, schemaOrgData?: any) {
this.document = document;
this.url = url;
this.schemaOrgData = schemaOrgData;
}
func (*ExtractorBase) GetAttribute ¶
func (e *ExtractorBase) GetAttribute(sel *goquery.Selection, attr string) string
GetAttribute safely gets an attribute value
func (*ExtractorBase) GetDocument ¶
func (e *ExtractorBase) GetDocument() *goquery.Document
GetDocument returns the document
func (*ExtractorBase) GetHTMLContent ¶
func (e *ExtractorBase) GetHTMLContent(sel *goquery.Selection) string
GetHTMLContent safely extracts HTML content from a selection
func (*ExtractorBase) GetSchemaOrgData ¶
func (e *ExtractorBase) GetSchemaOrgData() interface{}
GetSchemaOrgData returns the schema.org data
func (*ExtractorBase) GetTextContent ¶
func (e *ExtractorBase) GetTextContent(sel *goquery.Selection) string
GetTextContent safely extracts text content from a selection
type ExtractorConstructor ¶
type ExtractorConstructor func(document *goquery.Document, url string, schemaOrgData interface{}) BaseExtractor
ExtractorConstructor represents a function that creates an extractor. TypeScript original code:
type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any) => BaseExtractor;
type ExtractorMapping ¶
type ExtractorMapping struct {
Patterns []interface{} // Can be string or *regexp.Regexp
Extractor ExtractorConstructor
}
ExtractorMapping represents the mapping configuration for an extractor. TypeScript original code:
interface ExtractorMapping {
patterns: (string | RegExp)[];
extractor: ExtractorConstructor;
}
type ExtractorResult ¶
type ExtractorResult struct {
Content string `json:"content"`
ContentHTML string `json:"contentHtml"`
ExtractedContent map[string]interface{} `json:"extractedContent,omitempty"`
Variables map[string]string `json:"variables,omitempty"`
}
ExtractorResult represents the result of content extraction. It corresponds to the TypeScript interface ExtractorResult.
type Footnote ¶
Footnote represents a footnote in the conversation. It corresponds to the TypeScript interface Footnote.
type GeminiExtractor ¶
type GeminiExtractor struct {
*ConversationExtractorBase
// contains filtered or unexported fields
}
GeminiExtractor handles Gemini conversation content extraction. TypeScript original code:
import { ConversationExtractor } from './_conversation';
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
export class GeminiExtractor extends ConversationExtractor {
private conversationContainers: NodeListOf<Element> | null;
private footnotes: Footnote[];
private messageCount: number | null = null;
constructor(document: Document, url: string) {
super(document, url);
this.conversationContainers = document.querySelectorAll('div.conversation-container');
this.footnotes = [];
}
canExtract(): boolean {
return !!this.conversationContainers && this.conversationContainers.length > 0;
}
protected extractMessages(): ConversationMessage[] {
this.messageCount = 0;
const messages: ConversationMessage[] = [];
if (!this.conversationContainers) return messages;
this.extractSources();
this.conversationContainers.forEach((container) => {
const userQuery = container.querySelector('user-query');
if (userQuery) {
const queryText = userQuery.querySelector('.query-text');
if (queryText) {
const content = queryText.innerHTML || '';
messages.push({
author: 'You',
content: content.trim(),
metadata: { role: 'user' }
});
}
}
const modelResponse = container.querySelector('model-response');
if (modelResponse) {
const regularContent = modelResponse.querySelector('.model-response-text .markdown');
const extendedContent = modelResponse.querySelector('#extended-response-markdown-content');
const contentElement = extendedContent || regularContent;
if (contentElement) {
let content = contentElement.innerHTML || '';
const tempDiv = document.createElement('div');
tempDiv.innerHTML = content;
tempDiv.querySelectorAll('.table-content').forEach(el => {
// `table-content` is a PARTIAL selector in defuddle (table of contents, will be removed), but a real table in Gemini (should be kept).
el.classList.remove('table-content');
});
content = tempDiv.innerHTML;
messages.push({
author: 'Gemini',
content: content.trim(),
metadata: { role: 'assistant' }
});
}
}
});
this.messageCount = messages.length;
return messages;
}
private extractSources(): void {
const browseItems = this.document.querySelectorAll('browse-item');
if (browseItems && browseItems.length > 0) {
browseItems.forEach(item => {
const link = item.querySelector('a');
if (link instanceof HTMLAnchorElement) {
const url = link.href;
const domain = link.querySelector('.domain')?.textContent?.trim() || '';
const title = link.querySelector('.title')?.textContent?.trim() || '';
if (url && (domain || title)) {
this.footnotes.push({
url,
text: title ? `${domain}: ${title}` : domain
});
}
}
});
}
}
protected getFootnotes(): Footnote[] {
return this.footnotes;
}
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messageCount = this.messageCount ?? this.extractMessages().length;
return {
title,
site: 'Gemini',
url: this.url,
messageCount,
description: `Gemini conversation with ${messageCount} messages`
};
}
private getTitle(): string {
const pageTitle = this.document.title?.trim();
if (pageTitle && pageTitle !== 'Gemini' && !pageTitle.includes('Gemini')) {
return pageTitle;
}
const researchTitle = this.document.querySelector('.title-text')?.textContent?.trim();
if (researchTitle) {
return researchTitle;
}
const firstUserQuery = this.conversationContainers?.item(0)?.querySelector('.query-text');
if (firstUserQuery) {
const text = firstUserQuery.textContent || '';
return text.length > 50 ? text.slice(0, 50) + '...' : text;
}
return 'Gemini Conversation';
}
}
func NewGeminiExtractor ¶
func NewGeminiExtractor(document *goquery.Document, urlStr string, schemaOrgData interface{}) *GeminiExtractor
NewGeminiExtractor creates a new Gemini extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
this.conversationContainers = document.querySelectorAll('div.conversation-container');
this.footnotes = [];
}
func (*GeminiExtractor) CanExtract ¶
func (g *GeminiExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.conversationContainers && this.conversationContainers.length > 0;
}
func (*GeminiExtractor) Extract ¶
func (g *GeminiExtractor) Extract() *ExtractorResult
Extract extracts the Gemini conversation
func (*GeminiExtractor) ExtractMessages ¶
func (g *GeminiExtractor) ExtractMessages() []ConversationMessage
ExtractMessages extracts conversation messages. TypeScript original code:
protected extractMessages(): ConversationMessage[] {
this.messageCount = 0;
const messages: ConversationMessage[] = [];
if (!this.conversationContainers) return messages;
this.extractSources();
this.conversationContainers.forEach((container) => {
const userQuery = container.querySelector('user-query');
if (userQuery) {
const queryText = userQuery.querySelector('.query-text');
if (queryText) {
const content = queryText.innerHTML || '';
messages.push({
author: 'You',
content: content.trim(),
metadata: { role: 'user' }
});
}
}
const modelResponse = container.querySelector('model-response');
if (modelResponse) {
const regularContent = modelResponse.querySelector('.model-response-text .markdown');
const extendedContent = modelResponse.querySelector('#extended-response-markdown-content');
const contentElement = extendedContent || regularContent;
if (contentElement) {
let content = contentElement.innerHTML || '';
const tempDiv = document.createElement('div');
tempDiv.innerHTML = content;
tempDiv.querySelectorAll('.table-content').forEach(el => {
// `table-content` is a PARTIAL selector in defuddle (table of contents, will be removed), but a real table in Gemini (should be kept).
el.classList.remove('table-content');
});
content = tempDiv.innerHTML;
messages.push({
author: 'Gemini',
content: content.trim(),
metadata: { role: 'assistant' }
});
}
}
});
this.messageCount = messages.length;
return messages;
}
func (*GeminiExtractor) GetFootnotes ¶
func (g *GeminiExtractor) GetFootnotes() []Footnote
GetFootnotes returns the conversation footnotes. TypeScript original code:
protected getFootnotes(): Footnote[] {
return this.footnotes;
}
func (*GeminiExtractor) GetMetadata ¶
func (g *GeminiExtractor) GetMetadata() ConversationMetadata
GetMetadata returns conversation metadata. TypeScript original code:
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messageCount = this.messageCount ?? this.extractMessages().length;
return {
title,
site: 'Gemini',
url: this.url,
messageCount,
description: `Gemini conversation with ${messageCount} messages`
};
}
func (*GeminiExtractor) GetName ¶
func (g *GeminiExtractor) GetName() string
GetName returns the name of the extractor
type GitHubExtractor ¶ added in v0.1.4
type GitHubExtractor struct {
*ExtractorBase
}
GitHubExtractor handles GitHub content extraction. TypeScript original code:
export class GitHubExtractor extends BaseExtractor {
canExtract(): boolean {
const githubIndicators = [
'meta[name="expected-hostname"][content="github.com"]',
'meta[name="octolytics-url"]',
'meta[name="github-keyboard-shortcuts"]',
'.js-header-wrapper',
'#js-repo-pjax-container',
];
const githubPageIndicators = {
issue: [
'[data-testid="issue-metadata-sticky"]',
'[data-testid="issue-title"]',
],
}
return githubIndicators.some(selector => this.document.querySelector(selector) !== null)
&& Object.values(githubPageIndicators).some(selectors => selectors.some(selector => this.document.querySelector(selector) !== null));
}
extract(): ExtractorResult {
return this.extractIssue();
}
func NewGitHubExtractor ¶ added in v0.1.4
func NewGitHubExtractor(document *goquery.Document, url string, schemaOrgData interface{}) *GitHubExtractor
NewGitHubExtractor creates a new GitHub extractor
func (*GitHubExtractor) CanExtract ¶ added in v0.1.4
func (g *GitHubExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
const githubIndicators = [
'meta[name="expected-hostname"][content="github.com"]',
'meta[name="octolytics-url"]',
'meta[name="github-keyboard-shortcuts"]',
'.js-header-wrapper',
'#js-repo-pjax-container',
];
const githubPageIndicators = {
issue: [
'[data-testid="issue-metadata-sticky"]',
'[data-testid="issue-title"]',
],
}
return githubIndicators.some(selector => this.document.querySelector(selector) !== null)
&& Object.values(githubPageIndicators).some(selectors => selectors.some(selector => this.document.querySelector(selector) !== null));
}
func (*GitHubExtractor) Extract ¶ added in v0.1.4
func (g *GitHubExtractor) Extract() *ExtractorResult
Extract extracts the GitHub content. TypeScript original code:
extract(): ExtractorResult {
return this.extractIssue();
}
func (*GitHubExtractor) GetName ¶ added in v0.1.4
func (g *GitHubExtractor) GetName() string
GetName returns the name of the extractor
type GrokExtractor ¶
type GrokExtractor struct {
*ConversationExtractorBase
// contains filtered or unexported fields
}
GrokExtractor handles Grok (X.AI) conversation content extraction. TypeScript original code:
import { ConversationExtractor } from './_conversation';
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
export class GrokExtractor extends ConversationExtractor {
// Note: This selector relies heavily on CSS utility classes and may break if Grok's UI changes.
private messageContainerSelector = '.relative.group.flex.flex-col.justify-center.w-full';
private messageBubbles: NodeListOf<Element> | null;
private footnotes: Footnote[];
private footnoteCounter: number;
constructor(document: Document, url: string) {
super(document, url);
this.messageBubbles = document.querySelectorAll(this.messageContainerSelector);
this.footnotes = [];
this.footnoteCounter = 0;
}
}
func NewGrokExtractor ¶
func NewGrokExtractor(document *goquery.Document, urlStr string, schemaOrgData interface{}) *GrokExtractor
NewGrokExtractor creates a new Grok extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
// Note: This selector relies heavily on CSS utility classes and may break if Grok's UI changes.
this.messageContainerSelector = '.relative.group.flex.flex-col.justify-center.w-full';
this.messageBubbles = document.querySelectorAll(this.messageContainerSelector);
this.footnotes = [];
this.footnoteCounter = 0;
}
func (*GrokExtractor) CanExtract ¶
func (g *GrokExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.messageBubbles && this.messageBubbles.length > 0;
}
func (*GrokExtractor) Extract ¶
func (g *GrokExtractor) Extract() *ExtractorResult
Extract extracts the Grok conversation. TypeScript original code:
extract(): ExtractorResult {
return this.extractWithDefuddle(this);
}
func (*GrokExtractor) ExtractMessages ¶
func (g *GrokExtractor) ExtractMessages() []ConversationMessage
ExtractMessages extracts conversation messages. TypeScript original code:
protected extractMessages(): ConversationMessage[] {
const messages: ConversationMessage[] = [];
this.footnotes = [];
this.footnoteCounter = 0;
if (!this.messageBubbles || this.messageBubbles.length === 0) return messages;
this.messageBubbles.forEach((container) => {
// Note: Relies on layout classes 'items-end' and 'items-start' which might change.
const isUserMessage = container.classList.contains('items-end');
const isGrokMessage = container.classList.contains('items-start');
if (!isUserMessage && !isGrokMessage) return; // Skip elements that aren't clearly user or Grok messages
const messageBubble = container.querySelector('.message-bubble');
if (!messageBubble) return; // Skip if the core message bubble isn't found
let content: string = '';
let role: string = '';
let author: string = '';
if (isUserMessage) {
// Assume user message bubble's textContent is the desired content.
// This is simpler and potentially less brittle than selecting specific spans.
content = messageBubble.textContent || '';
role = 'user';
author = 'You'; // Or potentially extract from an attribute if available later
} else if (isGrokMessage) {
role = 'assistant';
author = 'Grok'; // Or potentially extract from an attribute if available later
// Clone the bubble to modify it without affecting the original page
const clonedBubble = messageBubble.cloneNode(true) as Element;
// Remove known non-content elements like the DeepSearch artifact
clonedBubble.querySelector('.relative.border.border-border-l1.bg-surface-base')?.remove();
// Add selectors here for any other known elements to remove (e.g., buttons, toolbars within the bubble)
content = clonedBubble.innerHTML;
// Process footnotes/links in the cleaned content
content = this.processFootnotes(content);
}
if (content.trim()) {
messages.push({
author: author,
content: content.trim(),
metadata: {
role: role
}
});
}
});
return messages;
}
func (*GrokExtractor) GetFootnotes ¶
func (g *GrokExtractor) GetFootnotes() []Footnote
GetFootnotes returns the conversation footnotes. TypeScript original code:
protected getFootnotes(): Footnote[] {
return this.footnotes;
}
func (*GrokExtractor) GetMetadata ¶
func (g *GrokExtractor) GetMetadata() ConversationMetadata
GetMetadata returns conversation metadata. TypeScript original code:
protected getMetadata(): ConversationMetadata {
const title = this.getTitle();
const messageCount = this.messageBubbles?.length || 0;
return {
title,
site: 'Grok',
url: this.url,
messageCount: messageCount, // Use estimated count
description: `Grok conversation with ${messageCount} messages`
};
}
func (*GrokExtractor) GetName ¶
func (g *GrokExtractor) GetName() string
GetName returns the name of the extractor
type HackerNewsExtractor ¶
type HackerNewsExtractor struct {
*ExtractorBase
// contains filtered or unexported fields
}
HackerNewsExtractor handles Hacker News content extraction. TypeScript original code:
import { BaseExtractor } from './_base';
import { ExtractorResult } from '../types/extractors';
export class HackerNewsExtractor extends BaseExtractor {
private mainPost: Element | null;
private isCommentPage: boolean;
private mainComment: Element | null;
constructor(document: Document, url: string) {
super(document, url);
this.mainPost = document.querySelector('.fatitem');
this.isCommentPage = this.detectCommentPage();
this.mainComment = this.isCommentPage ? this.findMainComment() : null;
}
}
func NewHackerNewsExtractor ¶
func NewHackerNewsExtractor(document *goquery.Document, url string, schemaOrgData interface{}) *HackerNewsExtractor
NewHackerNewsExtractor creates a new Hacker News extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
this.mainPost = document.querySelector('.fatitem');
this.isCommentPage = this.detectCommentPage();
this.mainComment = this.isCommentPage ? this.findMainComment() : null;
}
func (*HackerNewsExtractor) CanExtract ¶
func (h *HackerNewsExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.mainPost;
}
func (*HackerNewsExtractor) Extract ¶
func (h *HackerNewsExtractor) Extract() *ExtractorResult
Extract extracts the Hacker News content. TypeScript original code:
extract(): ExtractorResult {
const postContent = this.getPostContent();
const comments = this.extractComments();
const contentHtml = this.createContentHtml(postContent, comments);
const postTitle = this.getPostTitle();
const postAuthor = this.getPostAuthor();
const description = this.createDescription();
const published = this.getPostDate();
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
postId: this.getPostId(),
postAuthor,
},
variables: {
title: postTitle,
author: postAuthor,
site: 'Hacker News',
description,
published,
}
};
}
func (*HackerNewsExtractor) GetName ¶
func (h *HackerNewsExtractor) GetName() string
GetName returns the name of the extractor
type RedditExtractor ¶
type RedditExtractor struct {
*ExtractorBase
// contains filtered or unexported fields
}
RedditExtractor handles Reddit post and comment content extraction. TypeScript original code:
import { BaseExtractor } from './_base';
import { ExtractorResult } from '../types/extractors';
export class RedditExtractor extends BaseExtractor {
private shredditPost: Element | null;
constructor(document: Document, url: string) {
super(document, url);
this.shredditPost = document.querySelector('shreddit-post');
}
}
func NewRedditExtractor ¶
func NewRedditExtractor(document *goquery.Document, url string, schemaOrgData interface{}) *RedditExtractor
NewRedditExtractor creates a new Reddit extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
this.shredditPost = document.querySelector('shreddit-post');
}
func (*RedditExtractor) CanExtract ¶
func (r *RedditExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.shredditPost;
}
func (*RedditExtractor) Extract ¶
func (r *RedditExtractor) Extract() *ExtractorResult
Extract extracts the Reddit post and comments. TypeScript original code:
extract(): ExtractorResult {
const postContent = this.getPostContent();
const comments = this.extractComments();
const contentHtml = this.createContentHtml(postContent, comments);
const postTitle = this.document.querySelector('h1')?.textContent?.trim() || '';
const subreddit = this.getSubreddit();
const postAuthor = this.getPostAuthor();
const description = this.createDescription(postContent);
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
postId: this.getPostId(),
subreddit,
postAuthor,
},
variables: {
title: postTitle,
author: postAuthor,
site: `r/${subreddit}`,
description,
}
};
}
func (*RedditExtractor) GetName ¶
func (r *RedditExtractor) GetName() string
GetName returns the name of the extractor
type Registry ¶
type Registry struct {
// contains filtered or unexported fields
}
Registry manages site-specific extractors with a clean, extensible API. TypeScript original code:
export class ExtractorRegistry {
private static mappings: ExtractorMapping[] = [];
private static domainCache: Map<string, ExtractorConstructor | null> = new Map();
}
func NewRegistry ¶
func NewRegistry() *Registry
NewRegistry creates a new extractor registry. TypeScript original code:
constructor() { this.initialize(); }
func (*Registry) ClearCache ¶
ClearCache clears the domain cache. TypeScript original code:
static clearCache() {
this.domainCache.clear();
}
func (*Registry) FindExtractor ¶
func (r *Registry) FindExtractor(document *goquery.Document, urlStr string, schemaOrgData interface{}) BaseExtractor
FindExtractor finds the appropriate extractor for the given URL. TypeScript original code:
static findExtractor(document: Document, url: string, schemaOrgData?: any): BaseExtractor | null {
try {
const domain = new URL(url).hostname;
// Check cache first
if (this.domainCache.has(domain)) {
const cachedExtractor = this.domainCache.get(domain);
return cachedExtractor ? new cachedExtractor(document, url, schemaOrgData) : null;
}
// Find matching extractor
for (const { patterns, extractor } of this.mappings) {
const matches = patterns.some(pattern => {
if (pattern instanceof RegExp) {
return pattern.test(url);
}
return domain.includes(pattern);
});
if (matches) {
// Cache the result
this.domainCache.set(domain, extractor);
return new extractor(document, url, schemaOrgData);
}
}
// Cache the negative result
this.domainCache.set(domain, null);
return null;
} catch (error) {
console.error('Error in findExtractor:', error);
return null;
}
}
func (*Registry) GetMappings ¶
func (r *Registry) GetMappings() []ExtractorMapping
GetMappings returns a copy of the current mappings (read-only access). This is a Go-specific method for introspection.
func (*Registry) Register ¶
func (r *Registry) Register(mapping ExtractorMapping) *Registry
Register adds a new extractor mapping to the registry. TypeScript original code:
static register(mapping: ExtractorMapping) {
this.mappings.push(mapping);
}
type TwitterExtractor ¶
type TwitterExtractor struct {
*ExtractorBase
// contains filtered or unexported fields
}
TwitterExtractor handles Twitter/X content extraction. TypeScript original code:
import { BaseExtractor } from './_base';
import { ExtractorResult } from '../types/extractors';
export class TwitterExtractor extends BaseExtractor {
private mainTweet: Element | null = null;
private threadTweets: Element[] = [];
constructor(document: Document, url: string) {
super(document, url);
// Get all tweets from the timeline
const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
if (!timeline) {
// Try to find a single tweet if not in timeline view
const singleTweet = document.querySelector('article[data-testid="tweet"]');
if (singleTweet) {
this.mainTweet = singleTweet;
}
return;
}
// Get all tweets before any section with "Discover more" or similar headings
const allTweets = Array.from(timeline.querySelectorAll('article[data-testid="tweet"]'));
const firstSection = timeline.querySelector('section, h2')?.parentElement;
if (firstSection) {
// Filter out tweets that appear after the first section
allTweets.forEach((tweet, index) => {
if (firstSection.compareDocumentPosition(tweet) & Node.DOCUMENT_POSITION_FOLLOWING) {
allTweets.splice(index);
return false;
}
});
}
// Set main tweet and thread tweets
this.mainTweet = allTweets[0] || null;
this.threadTweets = allTweets.slice(1);
}
}
func NewTwitterExtractor ¶
func NewTwitterExtractor(document *goquery.Document, url string, schemaOrgData interface{}) *TwitterExtractor
NewTwitterExtractor creates a new Twitter extractor. TypeScript original code:
constructor(document: Document, url: string) {
super(document, url);
// Get all tweets from the timeline
const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
if (!timeline) {
// Try to find a single tweet if not in timeline view
const singleTweet = document.querySelector('article[data-testid="tweet"]');
if (singleTweet) {
this.mainTweet = singleTweet;
}
return;
}
// Get all tweets before any section with "Discover more" or similar headings
const allTweets = Array.from(timeline.querySelectorAll('article[data-testid="tweet"]'));
// Set main tweet and thread tweets
if (allTweets.length > 0) {
this.mainTweet = allTweets[0];
this.threadTweets = allTweets.slice(1);
}
}
func (*TwitterExtractor) CanExtract ¶
func (t *TwitterExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return !!this.mainTweet;
}
func (*TwitterExtractor) Extract ¶
func (t *TwitterExtractor) Extract() *ExtractorResult
Extract extracts the Twitter content. TypeScript original code:
extract(): ExtractorResult {
const mainContent = this.extractTweet(this.mainTweet);
const threadContents = this.threadTweets
.map(tweet => this.extractTweet(tweet))
.filter(content => content);
const threadContent = threadContents.join('\n<hr>\n');
let contentHtml = '<div class="tweet-thread">';
contentHtml += '<div class="main-tweet">' + mainContent + '</div>';
if (threadContent) {
contentHtml += '<hr><div class="thread-tweets">' + threadContent + '</div>';
}
contentHtml += '</div>';
const tweetId = this.getTweetId();
const tweetAuthor = this.getTweetAuthor();
const description = this.createDescription(this.mainTweet);
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
tweetId: tweetId,
tweetAuthor: tweetAuthor
},
variables: {
title: `Thread by ${tweetAuthor}`,
author: tweetAuthor,
site: 'X (Twitter)',
description: description
}
};
}
func (*TwitterExtractor) GetName ¶
func (t *TwitterExtractor) GetName() string
GetName returns the name of the extractor
type YouTubeExtractor ¶
type YouTubeExtractor struct {
*ExtractorBase
// contains filtered or unexported fields
}
YouTubeExtractor handles YouTube content extraction. TypeScript original code:
import { BaseExtractor } from './_base';
import { ExtractorResult } from '../types/extractors';
export class YoutubeExtractor extends BaseExtractor {
private videoElement: HTMLVideoElement | null;
protected override schemaOrgData: any;
constructor(document: Document, url: string, schemaOrgData?: any) {
super(document, url, schemaOrgData);
this.videoElement = document.querySelector('video');
this.schemaOrgData = schemaOrgData;
}
canExtract(): boolean {
return true;
}
extract(): ExtractorResult {
const videoData = this.getVideoData();
const description = videoData.description || '';
const formattedDescription = this.formatDescription(description);
const contentHtml = `<iframe width="560" height="315" src="https://www.youtube.com/embed/${this.getVideoId()}?si=_m0qv33lAuJFoGNh" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe><br>${formattedDescription}`;
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
videoId: this.getVideoId(),
author: videoData.author || '',
},
variables: {
title: videoData.name || '',
author: videoData.author || '',
site: 'YouTube',
image: Array.isArray(videoData.thumbnailUrl) ? videoData.thumbnailUrl[0] || '' : '',
published: videoData.uploadDate,
description: description.slice(0, 200).trim(),
}
};
}
}
func NewYouTubeExtractor ¶
func NewYouTubeExtractor(document *goquery.Document, url string, schemaOrgData interface{}) *YouTubeExtractor
NewYouTubeExtractor creates a new YouTube extractor. TypeScript original code:
constructor(document: Document, url: string, schemaOrgData?: any) {
super(document, url, schemaOrgData);
this.videoElement = document.querySelector('video');
this.schemaOrgData = schemaOrgData;
}
func (*YouTubeExtractor) CanExtract ¶
func (y *YouTubeExtractor) CanExtract() bool
CanExtract checks if the extractor can extract content. TypeScript original code:
canExtract(): boolean {
return true; // YouTube extractor can always extract
}
func (*YouTubeExtractor) Extract ¶
func (y *YouTubeExtractor) Extract() *ExtractorResult
Extract extracts the YouTube content. TypeScript original code:
extract(): ExtractorResult {
const videoData = this.getVideoData();
const description = videoData.description || '';
const formattedDescription = this.formatDescription(description);
const contentHtml = `<iframe width="560" height="315" src="https://www.youtube.com/embed/${this.getVideoId()}?si=_m0qv33lAuJFoGNh" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe><br>${formattedDescription}`;
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
videoId: this.getVideoId(),
author: videoData.author || '',
},
variables: {
title: videoData.name || '',
author: videoData.author || '',
site: 'YouTube',
image: Array.isArray(videoData.thumbnailUrl) ? videoData.thumbnailUrl[0] || '' : '',
published: videoData.uploadDate,
description: description.slice(0, 200).trim(),
}
};
}
func (*YouTubeExtractor) GetName ¶
func (y *YouTubeExtractor) GetName() string
GetName returns the name of the extractor