document

package
v0.26.0-beta Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 29, 2024 License: MIT Imports: 23 Imported by: 1

README

---
title: "Document"
lang: "en-US"
draft: false
description: "Learn how to set up a VDP Document component https://github.com/instill-ai/instill-core"
---

The Document component is an operator component that allows users to manipulate document files.
It can carry out the following tasks:

- [Convert To Markdown](#convert-to-markdown)
- [Convert To Text](#convert-to-text)
- [Convert To Images](#convert-to-images)



## Release Stage

`Alpha`



## Configuration

The component configuration is defined and maintained [here](https://github.com/instill-ai/component/blob/main/operator/document/v0/config/definition.json).





## Supported Tasks

### Convert To Markdown

Convert document to text in Markdown format.


| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_MARKDOWN` |
| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Body | `body` | string | Markdown text converted from the document |
| Filename (optional) | `filename` | string | The name of the file |






### Convert To Text

Convert document to text.


| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_TEXT` |
| Document (required) | `document` | string | Base64 encoded PDF/DOC/DOCX/XML/HTML/RTF/MD/PPTX/ODT/TIF/CSV/TXT/PNG document to be converted to plain text |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Body | `body` | string | Plain text converted from the document |
| Filename (optional) | `filename` | string | The name of the file |
| Meta | `meta` | object | Metadata extracted from the document |
| MSecs | `msecs` | number | Time taken to convert the document |
| Error | `error` | string | Error message if any during the conversion process |






### Convert To Images

Convert PDF to images.


| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_IMAGES` |
| PDF (required) | `pdf` | string | Base64 encoded PDF to be converted to images |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Images | `images` | array[string] | Images converted from the PDF document |
| Filenames (optional) | `filenames` | array[string] | The filenames of the images. The page number is appended to each filename, e.g. 'example-1.jpg' |







Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConvertToPDF

func ConvertToPDF(base64Encoded, fileExtension string) (string, error)

func Init

func Init(bc base.Component) *component

Types

type ConvertDocumentToMarkdownInput

// ConvertDocumentToMarkdownInput holds the input fields of the Markdown
// conversion; its "document" and "filename" JSON keys match the input table
// of the Convert To Markdown task.
//
// Document is the Base64-encoded document to convert, and Filename is the
// name of the file including its extension (e.g. "example.pdf").
// DisplayImageTag ("display-image-tag") and Converter ("converter") are not
// described in the task tables; presumably they toggle image tags in the
// output and select the conversion backend — TODO confirm.
type ConvertDocumentToMarkdownInput struct {
	Document        string `json:"document"`
	DisplayImageTag bool   `json:"display-image-tag"`
	Converter       string `json:"converter"`
	Filename        string `json:"filename"`
}

type ConvertDocumentToMarkdownOutput

// ConvertDocumentToMarkdownOutput holds the result of the Markdown
// conversion; its JSON keys match the output table of the Convert To
// Markdown task.
//
// Body is the Markdown text converted from the document and Filename is the
// name of the file.
type ConvertDocumentToMarkdownOutput struct {
	Body     string `json:"body"`
	Filename string `json:"filename"`
}

type ConvertPDFToImagesInput

// ConvertPDFToImagesInput is the argument type of ConvertPDFToImage; its JSON
// keys match the input table of the Convert To Images task.
//
// PDF is the Base64-encoded PDF to be converted to images and Filename is the
// name of the file including its extension (e.g. "example.pdf").
type ConvertPDFToImagesInput struct {
	PDF      string `json:"pdf"`
	Filename string `json:"filename"`
}

type ConvertPDFToImagesOutput

// ConvertPDFToImagesOutput is the result type of ConvertPDFToImage; its JSON
// keys match the output table of the Convert To Images task.
//
// Images holds the images converted from the PDF document. Filenames holds
// the image filenames, each with the page number appended
// (e.g. "example-1.jpg").
type ConvertPDFToImagesOutput struct {
	Images    []string `json:"images"`
	Filenames []string `json:"filenames"`
}

func ConvertPDFToImage

func ConvertPDFToImage(inputStruct *ConvertPDFToImagesInput) (*ConvertPDFToImagesOutput, error)

type ConvertToTextInput

type ConvertToTextInput struct {
	// Document: Base64-encoded document to convert to plain text
	Document string `json:"document"`
	// Filename: Name of the file, including its extension (e.g. "example.pdf")
	Filename string `json:"filename"`
}

ConvertToTextInput defines the input for the convert-to-text task.

type ConvertToTextOutput

type ConvertToTextOutput struct {
	// Body: Plain text converted from the document
	Body string `json:"body"`
	// Meta: Metadata extracted from the document
	Meta map[string]string `json:"meta"`
	// MSecs: Time taken to convert the document
	// (presumably in milliseconds, going by the name — TODO confirm)
	MSecs uint32 `json:"msecs"`
	// Error: Error message if any during the conversion process
	// Filename (declared below Error): The name of the file
	Error    string `json:"error"`
	Filename string `json:"filename"`
}

ConvertToTextOutput defines the output for the convert-to-text task.

func ConvertToText

func ConvertToText(input ConvertToTextInput) (ConvertToTextOutput, error)

type DocxDocToMarkdownTransformer

// DocxDocToMarkdownTransformer carries the parameters used by its Transform
// method, which returns a string and an error.
//
// NOTE(review): going by the names, Base64EncodedText is the Base64-encoded
// DOCX/DOC payload and FileExtension its extension, while DisplayImageTag and
// Converter mirror the same-named ConvertDocumentToMarkdownInput fields —
// confirm against the implementation.
type DocxDocToMarkdownTransformer struct {
	Base64EncodedText string
	FileExtension     string
	DisplayImageTag   bool
	Converter         string
}

func (DocxDocToMarkdownTransformer) Transform

func (t DocxDocToMarkdownTransformer) Transform() (string, error)

type HTMLToMarkdownTransformer

// HTMLToMarkdownTransformer carries the parameters used by its Transform
// method, which returns a string and an error.
//
// NOTE(review): going by the names, Base64EncodedText is the Base64-encoded
// HTML payload and FileExtension its extension, while DisplayImageTag and
// Converter mirror the same-named ConvertDocumentToMarkdownInput fields —
// confirm against the implementation.
type HTMLToMarkdownTransformer struct {
	Base64EncodedText string
	FileExtension     string
	DisplayImageTag   bool
	Converter         string
}

func (HTMLToMarkdownTransformer) Transform

func (t HTMLToMarkdownTransformer) Transform() (string, error)

type MarkdownTransformer

// MarkdownTransformer is the interface returned by GetMarkdownTransformer.
// Any type with a Transform method of this signature satisfies it; the
// DocxDoc, HTML, PDF, PptPptx and Xlsx transformer types in this package each
// declare one.
type MarkdownTransformer interface {
	// Transform returns the conversion result as a string, or an error.
	// Presumably the string is Markdown text, going by the interface name —
	// TODO confirm.
	Transform() (string, error)
}

func GetMarkdownTransformer

func GetMarkdownTransformer(fileExtension string, inputStruct *ConvertDocumentToMarkdownInput) (MarkdownTransformer, error)

type MarkdownTransformerGetterFunc

// MarkdownTransformerGetterFunc is a function type with the same signature as
// GetMarkdownTransformer: it takes a file extension and the Markdown
// conversion input and returns a MarkdownTransformer or an error.
type MarkdownTransformerGetterFunc func(fileExtension string, inputStruct *ConvertDocumentToMarkdownInput) (MarkdownTransformer, error)

type PDFToMarkdownTransformer

// PDFToMarkdownTransformer carries the parameters used by its Transform
// method, which returns a string and an error.
//
// NOTE(review): going by the names, Base64EncodedText is the Base64-encoded
// PDF payload and FileExtension its extension, while DisplayImageTag and
// Converter mirror the same-named ConvertDocumentToMarkdownInput fields —
// confirm against the implementation.
type PDFToMarkdownTransformer struct {
	Base64EncodedText string
	FileExtension     string
	DisplayImageTag   bool
	Converter         string
}

func (PDFToMarkdownTransformer) Transform

func (t PDFToMarkdownTransformer) Transform() (string, error)

type PptPptxToMarkdownTransformer

// PptPptxToMarkdownTransformer carries the parameters used by its Transform
// method, which returns a string and an error.
//
// NOTE(review): going by the names, Base64EncodedText is the Base64-encoded
// PPT/PPTX payload and FileExtension its extension, while DisplayImageTag and
// Converter mirror the same-named ConvertDocumentToMarkdownInput fields —
// confirm against the implementation.
type PptPptxToMarkdownTransformer struct {
	Base64EncodedText string
	FileExtension     string
	DisplayImageTag   bool
	Converter         string
}

func (PptPptxToMarkdownTransformer) Transform

func (t PptPptxToMarkdownTransformer) Transform() (string, error)

type XlsxToMarkdownTransformer

// XlsxToMarkdownTransformer carries the single parameter used by its
// Transform method, which returns a string and an error. Unlike the other
// transformer types listed here, it has no FileExtension, DisplayImageTag or
// Converter field.
//
// NOTE(review): going by the names, Base64EncodedText is the Base64-encoded
// XLSX payload — confirm against the implementation.
type XlsxToMarkdownTransformer struct {
	Base64EncodedText string
}

func (XlsxToMarkdownTransformer) Transform

func (t XlsxToMarkdownTransformer) Transform() (string, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL