cmd

package
v0.3.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 14, 2024 License: MIT Imports: 51 Imported by: 0

Documentation

Index

Constants

View Source
const DirGenomes = "genomes"

DirGenomes is the directory of genomes datas

View Source
const DirSeeds = "seeds"

DirSeeds is the directory of k-mer-value data files

View Source
const ExtSeeds = ".bin"

ExtSeeds is file extention of k-mer-value data files

View Source
const ExtTmpDir = ".tmp"

ExtTmpDir is the path extension for temporary files

View Source
const FileGenomeIndex = "genomes.map.bin"

FileGenomeIndex maps genome id to genome batch id and index in the batch

View Source
const FileGenomes = "genomes.bin"

FileGenomes is the name of each genome file

View Source
const FileInfo = "info.toml"

FileInfo is the summary file

View Source
const FileMasks = "masks.bin"

FileMasks is the file for storing lexichash mask

View Source
const FileSeedPositions = "seed_positions.bin"

FileSeedPositions is the name of seed position file

Variables

View Source
var BufferSize = 65536 // os.Getpagesize()

BufferSize is size of buffer

View Source
var DefaultChaining2Options = Chaining2Options{
	MaxGap:      50,
	MinScore:    50,
	MinAlignLen: 50,

	MaxDistance: 50,
	Band:        100,
}

DefaultChaining2Options is the defalt vaule of Chaining2Option.

View Source
var DefaultChainingOptions = ChainingOptions{
	MaxGap:      5000,
	MinScore:    40,
	MaxDistance: 10000,
}

DefaultChainingOptions is the defalt vaule of ChainingOption.

View Source
var DefaultIndexSearchingOptions = IndexSearchingOptions{
	NumCPUs:      runtime.NumCPU(),
	MaxOpenFiles: 512,

	MinPrefix:       15,
	MaxMismatch:     -1,
	MinSinglePrefix: 20,
	TopN:            500,

	MaxGap:      5000,
	MaxDistance: 10000,

	ExtendLength:                     2000,
	MinQueryAlignedFractionInAGenome: 70,
}
View Source
var DefaultSeqComparatorOptions = SeqComparatorOptions{
	K:         32,
	MinPrefix: 11,

	Chaining2Options: Chaining2Options{

		MaxGap: 50,

		MinScore: 50,

		MaxDistance: 50,

		Band: 100,
	},

	MinAlignedFraction: 0,
}

DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.

View Source
var MainVersion uint8 = 1

MainVersion is use for checking compatibility

View Source
var MinorVersion uint8 = 1

MinorVersion is less important

View Source
var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of prokaryotic genomes",
	Long: fmt.Sprintf(`
   LexicMap: efficient sequence alignment against millions of prokaryotic genomes

    Version: v%s
  Documents: https://bioinf.shenwei.me/LexicMap
Source code: https://github.com/shenwei356/LexicMap

`, VERSION),
}

RootCmd represents the base command when called without any subcommands

View Source
var Strands = [2]byte{'+', '-'}

Strands could be used to output strand for a reverse complement flag

View Source
var VERSION = "0.3.0"

VERSION is the version

Functions

func BuildIndex added in v0.2.0

func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error

BuildIndex builds index from a list of input files

func CheckIndexBuildingOptions added in v0.2.0

func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error

CheckIndexBuildingOptions checks some important options

func CheckIndexSearchingOptions added in v0.2.0

func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error

func ClearSubstrPairs added in v0.2.0

func ClearSubstrPairs(subs *[]*SubstrPair, k int)

ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.

func Combinations2

func Combinations2(set []uint64) [][2]uint64

Note: set should not have duplicates

func Execute

func Execute()

Execute adds all child commands to the root command sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.

func GenerateMasks added in v0.3.0

func GenerateMasks(files []string, opt *IndexBuildingOptions, outFile string) ([]uint64, []string, error)

GenerateMasks generates Masks from the top N largest genomes.

func IntSlice2StringSlice

func IntSlice2StringSlice(vals []int) []string

func MeanStdev

func MeanStdev(values []float64) (float64, float64)

func RC added in v0.2.0

func RC(s []byte) []byte

RC computes the reverse complement sequence

func RecycleChaining2Result added in v0.2.0

func RecycleChaining2Result(chains *[]*Chain2Result)

RecycleChaining2Result reycles the chaining paths. Please remember to call this after using the results.

func RecycleChainingResult added in v0.2.0

func RecycleChainingResult(chains *[]*[]int)

RecycleChainingResult reycles the chaining results. Please remember to call this after using the results.

func RecycleSeqComparatorResult added in v0.2.0

func RecycleSeqComparatorResult(r *SeqComparatorResult)

RecycleSeqComparatorResult recycles a SeqComparatorResult

func RecycleSubstrPairs added in v0.2.0

func RecycleSubstrPairs(subs *[]*SubstrPair)

RecycleSubstrPairs recycles a list of SubstrPairs

Types

type Chain2Result added in v0.3.0

type Chain2Result struct {
	NAnchors int // The number of substrings

	MatchedBases  int     // The number of matched bases.
	AlignedBasesQ int     // The number of aligned bases in Query sequence
	AlignedBasesT int     // The number of aligned bases in Subject sequence
	Pident        float64 // percentage of identity
	QBegin, QEnd  int     // Query begin/end position (0-based)
	TBegin, TEnd  int     // Target begin/end position (0-based)
}

Chain2Result represents a result of a chain

func (*Chain2Result) Reset added in v0.3.0

func (r *Chain2Result) Reset()

Reset resets a Chain2Result

type Chainer added in v0.2.0

type Chainer struct {
	// contains filtered or unexported fields
}

Chainer is an object for chaining the lexichash substrings between query and reference sequences.

func NewChainer added in v0.2.0

func NewChainer(options *ChainingOptions) *Chainer

NewChainer creates a new chainer.

func (*Chainer) Chain added in v0.2.0

func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64)

Chain finds the possible seed paths. Please remember to call RecycleChainingResult after using the results.

type Chainer2 added in v0.2.0

type Chainer2 struct {
	// contains filtered or unexported fields
}

Chainer2 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer2 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer2 added in v0.2.0

func NewChainer2(options *Chaining2Options) *Chainer2

NewChainer creates a new chainer.

func (*Chainer2) Chain added in v0.2.0

func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)

Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:

  1. Chain2Results.
  2. The number of matched bases.
  3. The number of aligned bases.
  4. QBegin.
  5. QEnd.
  6. TBegin.
  7. TEnd.

type Chaining2Options added in v0.2.0

type Chaining2Options struct {
	MaxGap      int
	MinScore    int // minimum score of a chain
	MinAlignLen int
	MinIdentity float64
	MaxDistance int
	Band        int // only check i in range of  i − A < j < i
}

Chaining2Options contains all options in chaining.

type ChainingOptions added in v0.2.0

type ChainingOptions struct {
	MaxGap      float64
	MinScore    float64
	MaxDistance float64
}

ChainingOptions contains all options in chaining.

type File2GSize added in v0.3.0

type File2GSize struct {
	Size int
	File string

} // An IntHeap is a min-heap of ints.

type File2GSizes added in v0.3.0

type File2GSizes []File2GSize

func (File2GSizes) Len added in v0.3.0

func (h File2GSizes) Len() int

func (File2GSizes) Less added in v0.3.0

func (h File2GSizes) Less(i, j int) bool

func (*File2GSizes) Pop added in v0.3.0

func (h *File2GSizes) Pop() any

func (*File2GSizes) Push added in v0.3.0

func (h *File2GSizes) Push(x any)

func (File2GSizes) Swap added in v0.3.0

func (h File2GSizes) Swap(i, j int)

type Index added in v0.2.0

type Index struct {

	// k-mer-value searchers
	Searchers         []*kv.Searcher
	InMemorySearchers []*kv.InMemorySearcher
	// contains filtered or unexported fields
}

Index creates a LexicMap index from a path and supports searching with query sequences.

func NewIndexSearcher added in v0.2.0

func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)

NewIndexSearcher creates a new searcher

func (*Index) Close added in v0.2.0

func (idx *Index) Close() error

Close closes the searcher.

func (*Index) RecycleSearchResult added in v0.2.0

func (idx *Index) RecycleSearchResult(r *SearchResult)

RecycleSearchResults recycles a search result object

func (*Index) RecycleSearchResults added in v0.2.0

func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)

RecycleSearchResults recycles search results objects

func (*Index) RecycleSimilarityDetails added in v0.3.0

func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)

RecycleSimilarityDetails recycles a list of SimilarityDetails

func (*Index) Search added in v0.2.0

func (idx *Index) Search(s []byte) (*[]*SearchResult, error)

Search queries the index with a sequence. After using the result, do not forget to call RecycleSearchResult().

func (*Index) SetSeqCompareOptions added in v0.2.0

func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)

SetSeqCompareOptions sets the sequence comparing options

type IndexBuildingOptions added in v0.2.0

type IndexBuildingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	Force        bool // force overwrite existed index
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	// skipping extremely large genome
	MaxGenomeSize int    // Maximum genome size. Extremely large genomes (non-isolate assemblies) will be skipped
	BigGenomeFile string // Out file of skipped files with genomes

	// LexicHash
	MaskFile string // file of custom masks

	K        int   // k-mer size
	Masks    int   // number of masks
	RandSeed int64 // random seed

	// generate mask randomly
	Prefix int // length of prefix for checking low-complexity and choosing k-mers to fill deserts

	// filling sketching deserts
	DesertMaxLen           uint32 // maxi length of sketching deserts
	DesertExpectedSeedDist int    // expected distance between seeds
	DesertSeedPosRange     int    // the upstream and down stream region for adding a seeds

	// generate mask from the top N biggest genomes
	TopN      int // Select the the top N largest genomes for generating masks
	PrefixExt int // Extension length of prefixes

	Chunks     int // the number of chunks for storing k-mer data
	Partitions int // the number of partitions for indexing k-mer data

	GenomeBatchSize int // the maximum number of genomes of a batch

	ReRefName    *regexp.Regexp   // for extracting genome id from the file name
	ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern

	ContigInterval int // the length of N's between contigs

	SaveSeedPositions bool
}

IndexBuildingOptions contains all options for building an LexicMap index.

type IndexInfo added in v0.2.0

type IndexInfo struct {
	MainVersion     uint8 `toml:"main-version" comment:"Index format"`
	MinorVersion    uint8 `toml:"minor-version"`
	K               uint8 `toml:"max-K" comment:"LexicHash"`
	Masks           int   `toml:"masks"`
	RandSeed        int64 `toml:"rand-seed"`
	Chunks          int   `toml:"chunks" comment:"Seeds (k-mer-value data) files"`
	Partitions      int   `toml:"index-partitions"`
	Genomes         int   `toml:"genomes" comment:"Genome data"`
	GenomeBatchSize int   `toml:"genome-batch-size"`
	GenomeBatches   int   `toml:"genome-batches"`
	ContigInterval  int   `toml:"contig-interval"`
}

IndexInfo contains summary of the index

type IndexSearchingOptions added in v0.2.0

type IndexSearchingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	// seed searching
	InMemorySearch  bool  // load the seed/kv data into memory
	MinPrefix       uint8 // minimum prefix length, e.g., 15
	MaxMismatch     int   // maximum mismatch, e.g., 3
	MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20
	TopN            int   // keep the topN scores, e.g, 10

	// seeds chaining
	MaxGap      float64 // e.g., 5000
	MaxDistance float64 // e.g., 20k

	// alignment
	ExtendLength int // the length of extra sequence on the flanking of seeds.
	// seq similarity
	MinQueryAlignedFractionInAGenome float64 // minimum query aligned fraction in the target genome

	// Output
	OutputSeq bool
}

IndexSearchingOptions contains all options for searching

type Kmer2Locs added in v0.3.0

type Kmer2Locs [][2]uint64

func (Kmer2Locs) Len added in v0.3.0

func (h Kmer2Locs) Len() int

func (Kmer2Locs) Less added in v0.3.0

func (h Kmer2Locs) Less(i, j int) bool

func (Kmer2Locs) Swap added in v0.3.0

func (h Kmer2Locs) Swap(i, j int)

type Options

type Options struct {
	NumCPUs int
	Verbose bool

	LogFile  string
	Log2File bool

	Compress         bool
	CompressionLevel int
}

Options contains the global flags

type Query

type Query struct {
	// contains filtered or unexported fields
}

Query is an object for each query sequence, it also contains the query result.

func (*Query) Reset

func (q *Query) Reset()

Reset reset the data for next round of using

type SearchResult added in v0.2.0

type SearchResult struct {
	GenomeBatch int
	GenomeIndex int
	ID          []byte
	GenomeSize  int

	Subs *[]*SubstrPair // matched substring pairs (query,target)

	Score  float64 //  score for soring
	Chains *[]*[]int

	// more about the alignment detail
	SimilarityDetails *[]*SimilarityDetail // sequence comparing
	AlignedFraction   float64              // query coverage per genome
}

SearchResult stores a search result for the given query sequence.

func (*SearchResult) Reset added in v0.2.0

func (r *SearchResult) Reset()

type SeqComparator added in v0.2.0

type SeqComparator struct {
	// contains filtered or unexported fields
}

SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).

func NewSeqComparator added in v0.2.0

func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator

NewSeqComparator creates a new SeqComparator with given options. No options checking now.

func (*SeqComparator) Compare added in v0.2.0

func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)

Compare matchs k-mers for the query sequence (begin: end), chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.

func (*SeqComparator) Index added in v0.2.0

func (cpr *SeqComparator) Index(s []byte) error

Index initializes the SeqComparator with the query sequence.

func (*SeqComparator) RecycleIndex added in v0.2.0

func (cpr *SeqComparator) RecycleIndex()

RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.

type SeqComparatorOptions added in v0.2.0

type SeqComparatorOptions struct {
	// indexing
	K         uint8
	MinPrefix uint8

	// chaining
	Chaining2Options

	// seq similarity
	MinAlignedFraction float64 // minimum query aligned fraction in a HSP

	MinIdentity float64
}

SeqComparatorOptions contains options for comparing two sequences.

type SeqComparatorResult added in v0.2.0

type SeqComparatorResult struct {
	AlignedBases int // The number of aligned bases.

	AlignedFraction float64 // query (original query) coverage per HSP

	MatchedBases int
	PIdent       float64

	QueryLen int // length of the original query, used to compute/update AlignedFraction

	QBegin int
	QEnd   int
	TBegin int
	TEnd   int

	TSeq []byte // target seq

	Chains *[]*Chain2Result
}

SeqComparatorResult contains the details of a seq comparison result.

func (*SeqComparatorResult) Update added in v0.3.0

func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)

Update updates the data with new chains.

type SimilarityDetail added in v0.2.0

type SimilarityDetail struct {
	// QBegin int
	// QEnd   int
	// TBegin int
	// TEnd   int
	RC bool

	SimilarityScore float64
	Similarity      *SeqComparatorResult
	// Chain           *[]int
	NSeeds int

	// sequence details
	SeqLen int
	SeqID  []byte // seqid of the region
}

SimilarityDetail is the similarity detail of one reference sequence

type SubstrPair added in v0.2.0

type SubstrPair struct {
	QBegin int32 // start position of the substring (0-based) in query
	TBegin int32 // start position of the substring (0-based) in reference

	Len      uint8 // prefix length
	Mismatch uint8 // number of mismatches

	TRC bool // is the substring from the reference seq on the negative strand.
	QRC bool // is the substring from the query seq on the negative strand.
}

SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.

func (SubstrPair) String added in v0.2.0

func (s SubstrPair) String() string

type Uint64Slice

type Uint64Slice []uint64

func (Uint64Slice) Len

func (s Uint64Slice) Len() int

func (Uint64Slice) Less

func (s Uint64Slice) Less(i, j int) bool

func (*Uint64Slice) Pop

func (s *Uint64Slice) Pop() interface{}

func (*Uint64Slice) Push

func (s *Uint64Slice) Push(x interface{})

func (Uint64Slice) Swap

func (s Uint64Slice) Swap(i, j int)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL