cmd

package

v0.2.0 Latest Latest Go to latest Published: Feb 2, 2024 License: MIT Imports: 41 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/shenwei356/LexicMap

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
func ClearSubstrPairs(subs *[]*SubstrPair, k int)
func Combinations2(set []uint64) [][2]uint64
func Execute()
func IntSlice2StringSlice(vals []int) []string
func MeanStdev(values []float64) (float64, float64)
func RC(s []byte) []byte
func RecycleChaining2Result(chains *[]*[]int)
func RecycleChainingResult(chains *[]*[]int)
func RecycleSeqComparatorResult(r *SeqComparatorResult)
func RecycleSubstrPairs(subs *[]*SubstrPair)
type Chainer
- func NewChainer(options *ChainingOptions) *Chainer
- func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64)
type Chainer2
- func NewChainer2(options *Chaining2Options) *Chainer2
- func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*[]int, int, int)
type Chaining2Options
type ChainingOptions
type Index
- func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)
- func (idx *Index) Close() error
- func (idx *Index) RecycleSearchResult(r *SearchResult)
- func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
- func (idx *Index) Search(s []byte) (*[]*SearchResult, error)
- func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
type IndexBuildingOptions
type IndexInfo
type IndexSearchingOptions
type Options
type Query
- func (q *Query) Reset()
type SearchResult
- func (r *SearchResult) Reset()
type SeqComparator
- func NewSeqComparator(options *SeqComparatorOptions) *SeqComparator
- func (cpr *SeqComparator) Compare(s []byte) (*SeqComparatorResult, error)
- func (cpr *SeqComparator) Index(s []byte) error
- func (cpr *SeqComparator) RecycleIndex()
type SeqComparatorOptions
type SeqComparatorResult
type SimilarityDetail
type SubstrPair
- func (s SubstrPair) String() string
type Uint64Slice
- func (s Uint64Slice) Len() int
- func (s Uint64Slice) Less(i, j int) bool
- func (s *Uint64Slice) Pop() interface{}
- func (s *Uint64Slice) Push(x interface{})
- func (s Uint64Slice) Swap(i, j int)

Constants ¶

View Source

const DirGenomes = "genomes"

DirGenomes is the directory of genome data

View Source

const DirSeeds = "seeds"

DirSeeds is the directory of k-mer-value data files

View Source

const ExtSeeds = ".bin"

ExtSeeds is the file extension of k-mer-value data files

View Source

const ExtTmpDir = ".tmp"

ExtTmpDir is the path extension for temporary files

View Source

const FileGenomeIndex = "genomes.map.bin"

FileGenomeIndex maps genome id to genome batch id and index in the batch

View Source

const FileGenomes = "genomes.bin"

FileGenomes is the name of each genome file

View Source

const FileInfo = "info.toml"

FileInfo is the summary file

View Source

const FileMasks = "masks.bin"

FileMasks is the file for storing lexichash mask

Variables ¶

View Source

var BufferSize = 65536 // os.Getpagesize()

BufferSize is size of buffer

View Source

var DefaultChaining2Options = Chaining2Options{
	MaxGap:   32,
	MinScore: 20,

	MaxDistance: 50,
	Band:        20,
}

DefaultChaining2Options is the default value of Chaining2Options.

View Source

var DefaultChainingOptions = ChainingOptions{
	MaxGap:   5000,
	MinScore: 40,
}

DefaultChainingOptions is the default value of ChainingOptions.

View Source

var DefaultIndexSearchingOptions = IndexSearchingOptions{
	NumCPUs:      runtime.NumCPU(),
	MaxOpenFiles: 512,

	MinPrefix:       15,
	MaxMismatch:     -1,
	MinSinglePrefix: 20,
	TopN:            10,

	MaxGap: 5000,
}

View Source

var DefaultSeqComparatorOptions = SeqComparatorOptions{
	K:         32,
	MinPrefix: 11,

	Chaining2Options: Chaining2Options{

		MaxGap: 32,

		MinScore: 20,

		MaxDistance: 50,

		Band: 20,
	},

	MinAlignedFraction: 70,
	MinIdentity:        70,
}

DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.

View Source

var MainVersion uint8 = 0

MainVersion is used for checking compatibility

View Source

var MinorVersion uint8 = 1

MinorVersion is less important

View Source

var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of microbial genomes",
	Long: fmt.Sprintf(`
    Program: LexicMap: efficient sequence alignment against millions of microbial genomes
    Version: v%s
  Documents: https://bioinf.shenwei.me/LexicMap
Source code: https://github.com/shenwei356/LexicMap

`, VERSION),
}

RootCmd represents the base command when called without any subcommands

View Source

var Strands = [2]byte{'+', '-'}

Strands could be used to output strand for a reverse complement flag

View Source

var VERSION = "0.2.0"

VERSION is the version

Functions ¶

func BuildIndex ¶ added in v0.2.0

func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error

BuildIndex builds index from a list of input files

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error

CheckIndexBuildingOptions checks some important options

func CheckIndexSearchingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error

func ClearSubstrPairs ¶ added in v0.2.0

func ClearSubstrPairs(subs *[]*SubstrPair, k int)

ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.

func Execute ¶

func Execute()

Execute adds all child commands to the root command and sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.

func IntSlice2StringSlice ¶

func IntSlice2StringSlice(vals []int) []string

func MeanStdev ¶

func MeanStdev(values []float64) (float64, float64)

func RC ¶ added in v0.2.0

func RC(s []byte) []byte

RC computes the reverse complement sequence

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChaining2Result(chains *[]*[]int)

RecycleChaining2Result recycles the chaining paths. Please remember to call this after using the results.

func RecycleChainingResult ¶ added in v0.2.0

func RecycleChainingResult(chains *[]*[]int)

RecycleChainingResult recycles the chaining results. Please remember to call this after using the results.

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSeqComparatorResult(r *SeqComparatorResult)

RecycleSeqComparatorResult recycles a SeqComparatorResult

func RecycleSubstrPairs ¶ added in v0.2.0

func RecycleSubstrPairs(subs *[]*SubstrPair)

RecycleSubstrPairs recycles a list of SubstrPairs

Types ¶

type Chainer ¶ added in v0.2.0

type Chainer struct {
	// contains filtered or unexported fields
}

Chainer is an object for chaining the lexichash substrings between query and reference sequences.

func NewChainer ¶ added in v0.2.0

func NewChainer(options *ChainingOptions) *Chainer

NewChainer creates a new chainer.

func (*Chainer) Chain ¶ added in v0.2.0

func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64)

Chain finds the possible seed paths. Please remember to call RecycleChainingResult after using the results.

type Chainer2 ¶ added in v0.2.0

type Chainer2 struct {
	// contains filtered or unexported fields
}

Chainer2 is an object for chaining the anchors in two similar sequences. Different from Chainer, Chainer2 finds chains with no overlaps. Anchors/seeds/substrings in Chainer2 are denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer2 ¶ added in v0.2.0

func NewChainer2(options *Chaining2Options) *Chainer2

NewChainer2 creates a new chainer.

func (*Chainer2) Chain ¶ added in v0.2.0

func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*[]int, int, int)

Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:

Paths.
The number of matched bases.
The number of aligned bases.

type Chaining2Options ¶ added in v0.2.0

type Chaining2Options struct {
	MaxGap   int
	MinScore int // minimum score of a chain

	// only used in Chain2
	MaxDistance int
	Band        int // only check i in range of  i − A < j < i
}

Chaining2Options contains all options in chaining.

type ChainingOptions ¶ added in v0.2.0

type ChainingOptions struct {
	MaxGap   float64
	MinScore float64
}

ChainingOptions contains all options in chaining.

type Index ¶ added in v0.2.0

type Index struct {

	// k-mer-value searchers
	Searchers []*kv.Searcher
	// contains filtered or unexported fields
}

Index creates LexicMap index from a path and supports searching with a query sequence.

func NewIndexSearcher ¶ added in v0.2.0

func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)

func (*Index) Close ¶ added in v0.2.0

func (idx *Index) Close() error

Close closes the searcher.

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (idx *Index) RecycleSearchResult(r *SearchResult)

RecycleSearchResult recycles a search result object

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)

RecycleSearchResults recycles search results objects

func (*Index) Search ¶ added in v0.2.0

func (idx *Index) Search(s []byte) (*[]*SearchResult, error)

Search queries the index with a sequence. After using the result, do not forget to call RecycleSearchResults().

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)

SetSeqCompareOptions sets the sequence comparing options

type IndexBuildingOptions ¶ added in v0.2.0

type IndexBuildingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	Force        bool // force overwrite existed index
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	K                int   // k-mer size
	Masks            int   // number of masks
	RandSeed         int64 // random seed
	PrefixForCheckLC int   // length of prefix for checking low-complexity

	Chunks     int // the number of chunks for storing k-mer data
	Partitions int // the number of partitions for indexing k-mer data

	GenomeBatchSize int // the maximum number of genomes of a batch

	ReRefName    *regexp.Regexp   // for extracting genome id from the file name
	ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern
}

IndexBuildingOptions contains all options for building a LexicMap index.

type IndexInfo ¶ added in v0.2.0

type IndexInfo struct {
	MainVersion     uint8 `toml:"main-version" comment:"Index format"`
	MinorVersion    uint8 `toml:"minor-version"`
	K               uint8 `toml:"max-K" comment:"LexicHash"`
	Masks           int   `toml:"masks"`
	RandSeed        int64 `toml:"rand-seed"`
	Chunks          int   `toml:"chunks" comment:"Seeds (k-mer-value data) files"`
	Partitions      int   `toml:"index-partitions"`
	Genomes         int   `toml:"genomes" comment:"Genome data"`
	GenomeBatchSize int   `toml:"genome-batch-size"`
	GenomeBatches   int   `toml:"genome-batches"`
}

IndexInfo contains summary of the index

type IndexSearchingOptions ¶ added in v0.2.0

type IndexSearchingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	// seed searching
	MinPrefix       uint8 // minimum prefix length, e.g., 15
	MaxMismatch     int   // maximum mismatch, e.g., 3
	MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20
	TopN            int   // keep the topN scores, e.g, 10

	// seeds chaining
	MaxGap float64 // e.g., 5000
}

IndexSearchingOptions contains all options for searching

type Options ¶

type Options struct {
	NumCPUs int
	Verbose bool

	LogFile  string
	Log2File bool

	Compress         bool
	CompressionLevel int
}

Options contains the global flags

type Query ¶

type Query struct {
	// contains filtered or unexported fields
}

Query is an object for each query sequence, it also contains the query result.

func (*Query) Reset ¶

func (q *Query) Reset()

Reset resets the data for the next round of use

type SearchResult ¶ added in v0.2.0

type SearchResult struct {
	GenomeBatch int
	GenomeIndex int
	ID          []byte
	GenomeSize  int

	Subs *[]*SubstrPair // matched substring pairs (query,target)

	Score  float64 //  score for sorting
	Chains *[]*[]int

	// more about the alignment detail
	SimilarityDetails *[]*SimilarityDetail // sequence comparing
}

SearchResult stores a search result for the given query sequence.

func (*SearchResult) Reset ¶ added in v0.2.0

func (r *SearchResult) Reset()

type SeqComparator ¶ added in v0.2.0

type SeqComparator struct {
	// contains filtered or unexported fields
}

SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).

func NewSeqComparator ¶ added in v0.2.0

func NewSeqComparator(options *SeqComparatorOptions) *SeqComparator

NewSeqComparator creates a new SeqComparator with given options. No options checking now.

func (*SeqComparator) Compare ¶ added in v0.2.0

func (cpr *SeqComparator) Compare(s []byte) (*SeqComparatorResult, error)

Compare matches k-mers for the query sequence, chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.

func (*SeqComparator) Index ¶ added in v0.2.0

func (cpr *SeqComparator) Index(s []byte) error

Index initializes the SeqComparator with a sequence.

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

func (cpr *SeqComparator) RecycleIndex()

RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorOptions struct {
	// indexing
	K         uint8
	MinPrefix uint8

	// chaining
	Chaining2Options

	// seq similarity
	MinAlignedFraction float64 // percentage
	MinIdentity        float64 // percentage
}

SeqComparatorOptions contains options for comparing two sequences.

type SeqComparatorResult ¶ added in v0.2.0

type SeqComparatorResult struct {
	MatchedBases int // The number of matched bases.
	AlignedBases int // The number of aligned bases.
	NumChains    int // The number of chains

	AlignedFraction float64 // aligned fraction, percentage
	Identity        float64 // identity (fraction of same bases), percentage
}

SeqComparatorResult contains the details of a seq comparison result.

type SimilarityDetail ¶ added in v0.2.0

type SimilarityDetail struct {
	TBegin int
	TEnd   int
	RC     bool

	SimilarityScore float64
	Similarity      *SeqComparatorResult
	Chain           *[]int

	// sequence details
	SeqLen int
	SeqID  []byte // seqid of the region
}

type SubstrPair ¶ added in v0.2.0

type SubstrPair struct {
	QBegin int    // start position of the substring (0-based) in query
	TBegin int    // start position of the substring (0-based) in reference
	Len    int    // prefix length
	Code   uint64 // k-mer, only for debugging

	Mismatch uint8 // number of mismatches

	RC bool // is the substring from the reference seq on the negative strand.
}

SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.

func (SubstrPair) String ¶ added in v0.2.0

func (s SubstrPair) String() string

type Uint64Slice ¶

type Uint64Slice []uint64

func (Uint64Slice) Len ¶

func (s Uint64Slice) Len() int

func (Uint64Slice) Less ¶

func (s Uint64Slice) Less(i, j int) bool

func (*Uint64Slice) Pop ¶

func (s *Uint64Slice) Pop() interface{}

func (*Uint64Slice) Push ¶

func (s *Uint64Slice) Push(x interface{})

func (Uint64Slice) Swap ¶

func (s Uint64Slice) Swap(i, j int)

Directories ¶

Path	Synopsis
genome
kv
tree
util

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func BuildIndex ¶ added in v0.2.0

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions ¶ added in v0.2.0

func ClearSubstrPairs ¶ added in v0.2.0

func Combinations2 ¶

func Execute ¶

func IntSlice2StringSlice ¶

func MeanStdev ¶

func RC ¶ added in v0.2.0

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChainingResult ¶ added in v0.2.0

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSubstrPairs ¶ added in v0.2.0

Types ¶

type Chainer ¶ added in v0.2.0

func NewChainer ¶ added in v0.2.0

func (*Chainer) Chain ¶ added in v0.2.0

type Chainer2 ¶ added in v0.2.0

func NewChainer2 ¶ added in v0.2.0

func (*Chainer2) Chain ¶ added in v0.2.0

type Chaining2Options ¶ added in v0.2.0

type ChainingOptions ¶ added in v0.2.0

type Index ¶ added in v0.2.0

func NewIndexSearcher ¶ added in v0.2.0

func (*Index) Close ¶ added in v0.2.0

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (*Index) Search ¶ added in v0.2.0

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

type IndexBuildingOptions ¶ added in v0.2.0

type IndexInfo ¶ added in v0.2.0

type IndexSearchingOptions ¶ added in v0.2.0

type Options ¶

type Query ¶

func (*Query) Reset ¶

type SearchResult ¶ added in v0.2.0

func (*SearchResult) Reset ¶ added in v0.2.0

type SeqComparator ¶ added in v0.2.0

func NewSeqComparator ¶ added in v0.2.0

func (*SeqComparator) Compare ¶ added in v0.2.0

func (*SeqComparator) Index ¶ added in v0.2.0

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorResult ¶ added in v0.2.0

type SimilarityDetail ¶ added in v0.2.0

type SubstrPair ¶ added in v0.2.0

func (SubstrPair) String ¶ added in v0.2.0

type Uint64Slice ¶

func (Uint64Slice) Len ¶

func (Uint64Slice) Less ¶

func (*Uint64Slice) Pop ¶

func (*Uint64Slice) Push ¶

func (Uint64Slice) Swap ¶

Source Files ¶

Directories ¶