lm

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 24, 2024 License: MIT Imports: 24 Imported by: 0

Documentation

Overview

Package lm provides a library for storing large n-gram language models in memory. Mostly inspired by https://code.google.com/archive/p/berkeleylm/

Index

Constants

View Source
const (
	// UnknownWordID is an index of an unregistered word
	UnknownWordID = uint32(0xffffffff)
	// UnknownWordSymbol is a symbol that is returned on an unregistered word
	UnknownWordSymbol = "<UNK>"
)
View Source
const (
	// InvalidContextOffset is context id that represents invalid context offset
	InvalidContextOffset = maxContextOffset - 1
)
View Source
const (
	// UnknownWordScore is the score for unknown phrases
	UnknownWordScore = -100.0
)

Variables

View Source
var (
	// ErrContextOverflow tells that there was an attempt to go beyond maxContextOffset
	ErrContextOverflow = errors.New("out of maxContextOffset")
)
View Source
var ErrInvalidIndex = errors.New("index is not exists")

ErrInvalidIndex tells that there is no data for the provided index

View Source
var ErrNGramOrderIsOutOfRange = errors.New("nGrams order is out of range")

ErrNGramOrderIsOutOfRange informs that the given nGram order is out of range for the given model

Functions

func NewTokenizer

func NewTokenizer(alphabet alphabet.Alphabet) analysis.Tokenizer

NewTokenizer creates a new instance of Tokenizer

func StoreBinaryLMFromGoogleFormat

func StoreBinaryLMFromGoogleFormat(directory store.Directory, config *Config) error

StoreBinaryLMFromGoogleFormat creates a ngram language model from the google ngram format

Types

type Config

type Config struct {
	Name        string   `json:"name"`
	NGramOrder  uint8    `json:"nGramOrder"`
	SourcePath  string   `json:"source"`
	OutputPath  string   `json:"output"`
	Alphabet    []string `json:"alphabet"`
	Separators  []string `json:"separators"`
	StartSymbol string   `json:"startSymbol"`
	EndSymbol   string   `json:"endSymbol"`
	// contains filtered or unexported fields
}

Config represents a configuration of a language model

func ReadConfig

func ReadConfig(configPath string) (*Config, error)

ReadConfig reads a language model config from the given config path

func (*Config) GetBinaryPath

func (c *Config) GetBinaryPath() string

GetBinaryPath returns a stored path for the binary lm

func (*Config) GetDictionaryPath

func (c *Config) GetDictionaryPath() string

GetDictionaryPath returns a stored path for the dictionary

func (*Config) GetOutputPath

func (c *Config) GetOutputPath() string

GetOutputPath returns an output path of the built index

func (*Config) GetSeparatorsAlphabet

func (c *Config) GetSeparatorsAlphabet() alphabet.Alphabet

GetSeparatorsAlphabet returns a separators alphabet corresponding to the declaration

func (*Config) GetSourcePath

func (c *Config) GetSourcePath() string

GetSourcePath returns a source path of the index description

func (*Config) GetWordsAlphabet

func (c *Config) GetWordsAlphabet() alphabet.Alphabet

GetWordsAlphabet returns a word alphabet corresponding to the declaration

type ContextOffset

type ContextOffset = uint32

ContextOffset represents the id of parent nGram path

type CountTrie

type CountTrie interface {
	// Put increments WordCount for last element of given sequence.
	Put(sentence Sentence, count WordCount)
	// Walk iterates through trie and calls walker function on each element.
	Walk(walker TrieIterator) error
}

CountTrie represents a data structure for counting ngrams.

func NewCountTrie

func NewCountTrie() CountTrie

NewCountTrie creates a new instance of CountTrie

type Indexer

type Indexer interface {
	// Returns the index for the word, otherwise returns UnknownWordID
	Get(token Token) (WordID, error)
	// Find a token by the given index
	Find(id WordID) (Token, error)
}

Indexer enumerates words in the vocabulary of a language model. Stores a two-way mapping between uint32 and strings.

func BuildIndexer

func BuildIndexer(dict dictionary.Dictionary) (Indexer, error)

BuildIndexer builds an indexer from the given dictionary

func NewIndexer

func NewIndexer(dict dictionary.Dictionary, table mph.MPH) Indexer

NewIndexer creates a new instance of indexer

type Key

type Key uint64

Key represents a NGramNode key as a composition of a NGram context and wordID

func MakeKey

func MakeKey(word WordID, context ContextOffset) Key

MakeKey creates uint64 key for the given pair (word, context)

func (Key) GetContext

func (k Key) GetContext() ContextOffset

GetContext returns the context for the given key

func (Key) GetWordID

func (k Key) GetWordID() WordID

GetWordID returns the wordID for the given key

type LanguageModel

type LanguageModel interface {
	// ScoreSentence scores and returns a lm weight for the given sentence
	ScoreSentence(sentence Sentence) (float64, error)
	// ScoreWordIDs scores and returns a lm weight for the given sequence of nGrams
	ScoreWordIDs(sequence []WordID) float64
	// GetWordID returns id for the given token
	GetWordID(token Token) (WordID, error)
	// Next returns the list of candidates for the given sequence
	Next(sequence []WordID) (ScorerNext, error)
}

LanguageModel is an interface for an n-gram language model

func NewLanguageModel

func NewLanguageModel(
	model NGramModel,
	indexer Indexer,
	config *Config,
) (LanguageModel, error)

NewLanguageModel creates a new instance of a LanguageModel

func RetrieveLMFromBinary

func RetrieveLMFromBinary(directory store.Directory, config *Config) (LanguageModel, error)

RetrieveLMFromBinary retrieves a language model from the binary format

type NGramBuilder

type NGramBuilder struct {
	// contains filtered or unexported fields
}

NGramBuilder is an entity that is responsible for creating CountTrie

func NewNGramBuilder

func NewNGramBuilder(
	startSymbol, endSymbol string,
) *NGramBuilder

NewNGramBuilder returns new instance of NGramBuilder

func (*NGramBuilder) Build

func (nb *NGramBuilder) Build(retriever SentenceRetriever, nGramOrder uint8) CountTrie

Build builds CountTrie with nGrams

type NGramModel

type NGramModel interface {
	store.Marshaler
	store.Unmarshaler

	// Score returns a lm value of the given sequence of WordID
	Score(nGrams []WordID) float64
	// Next returns a list of WordID which follow after the given sequence of nGrams
	Next(nGrams []WordID) (ScorerNext, error)
}

NGramModel is an entity that is responsible for scoring the given nGrams

func NewNGramModel

func NewNGramModel(indices []NGramVector) NGramModel

NewNGramModel creates a NGramModel from the given indices.

type NGramNode

type NGramNode struct {
	Key   Key
	Count WordCount
}

NGramNode represents tree node for the given nGram

func (*NGramNode) Less

func (n *NGramNode) Less(other rbtree.Item) bool

Less tells whether the current element is bigger than the other

type NGramReader

type NGramReader interface {
	// Read builds NGramModel from the given list of readers
	Read() (NGramModel, error)
}

NGramReader is responsible for creating NGramModel from the files

func NewGoogleNGramReader

func NewGoogleNGramReader(nGramOrder uint8, indexer Indexer, directory store.Directory) NGramReader

NewGoogleNGramReader creates new instance of NGramReader

type NGramVector

type NGramVector interface {
	store.Marshaler
	store.Unmarshaler

	// GetCount returns WordCount and Node ContextOffset for the given pair (word, context)
	GetCount(word WordID, context ContextOffset) (WordCount, ContextOffset)
	// GetContextOffset returns the given node context offset
	GetContextOffset(word WordID, context ContextOffset) ContextOffset
	// CorpusCount returns size of all counts in the collection
	CorpusCount() WordCount
	// SubVector returns NGramVector for the given context
	SubVector(context ContextOffset) NGramVector
}

NGramVector represents one level of nGram trie

func CreatePackedArray

func CreatePackedArray(ch <-chan NGramNode) NGramVector

CreatePackedArray creates a NGramVector from the channel of NGramNodes.

func NewNGramVector

func NewNGramVector() NGramVector

NewNGramVector creates a new instance of NGramVector.

type NGramVectorBuilder

type NGramVectorBuilder interface {
	// Put adds the given sequence of nGrams and count to model
	Put(nGrams []WordID, count WordCount) error
	// Build creates new instance of NGramVector
	Build() NGramVector
}

NGramVectorBuilder is an entity that is responsible for building NGramVector

func NewNGramVectorBuilder

func NewNGramVectorBuilder(parents []NGramVector, factory NGramVectorFactory) NGramVectorBuilder

NewNGramVectorBuilder creates new instance of NGramVectorBuilder

type NGramVectorFactory

type NGramVectorFactory func(ch <-chan NGramNode) NGramVector

NGramVectorFactory represents a factory method for creating a NGramVector instance.

type NGramWriter

type NGramWriter interface {
	// Write persists the given trie to a storage
	Write(trie CountTrie) error
}

NGramWriter is the interface that persists the NGram Count Trie to a storage

func NewGoogleNGramWriter

func NewGoogleNGramWriter(nGramOrder uint8, directory store.Directory) NGramWriter

NewGoogleNGramWriter creates new instance of NGramWriter that persists the given NGram Count Trie with Google NGram Format negotiations

type NGrams

type NGrams = [][]WordID

NGrams is the result of splitting the given sequence of words into nGrams

type ScorerNext

type ScorerNext interface {
	// ScoreNext calculates the score for the given nGram built on the parent context
	ScoreNext(nGram WordID) float64
}

ScorerNext represents the entity that is responsible for scoring the word using the parent context

type Sentence

type Sentence = []Token

Sentence is a sequence of tokens

type SentenceRetriever

type SentenceRetriever interface {
	// Retrieves and returns the next sentence from the source
	Retrieve() Sentence
}

SentenceRetriever is an entity that is responsible for retrieving sentences from the given source

func NewSentenceRetriever

func NewSentenceRetriever(tokenizer analysis.Tokenizer, reader io.Reader, alphabet alphabet.Alphabet) SentenceRetriever

NewSentenceRetriever creates new instance of sentence retriever

type Token

type Token = analysis.Token

Token is a string with an assigned and thus identified meaning

type TrieIterator

type TrieIterator = func(path Sentence, count WordCount) error

TrieIterator is a callback that is called for each path of the given trie

type WordCount

type WordCount = uint32

WordCount is a count of a corresponding path

type WordID

type WordID = uint32

WordID is an index of the corresponding word

func MapIntoListOfWordIDs

func MapIntoListOfWordIDs(lm LanguageModel, sentence Sentence) ([]WordID, error)

MapIntoListOfWordIDs maps the given sentence into a list of WordIDs

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL