cmd

package

v0.6.0 Latest Latest Go to latest Published: Mar 25, 2025 License: MIT Imports: 58 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/shenwei356/LexicMap

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)
func Combinations2(set []uint64) [][2]uint64
func Execute()
func IntSlice2StringSlice(vals []int) []string
func MeanStdev(values []float64) (float64, float64)
func ParseByteSize(val string) (int64, error)
func RC(s []byte) []byte
func RecycleChaining2Result(chains *[]*Chain2Result)
func RecycleChainingResult(chains *[]*[]int)
func RecycleSeqComparatorResult(r *SeqComparatorResult)
func RecycleSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair)
func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)
type Chain2Result
- func (r *Chain2Result) Reset()
type Chain3Result
- func (r *Chain3Result) Reset()
type Chainer
- func NewChainer(options *ChainingOptions) *Chainer
- func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64)
type Chainer2
- func NewChainer2(options *Chaining2Options) *Chainer2
- func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)
type Chainer3
- func NewChainer3(options *Chaining3Options) *Chainer3
- func (ce *Chainer3) Chain(subs *[]*SubstrPair) *Chain3Result
type Chaining2Options
type Chaining3Options
type ChainingOptions
type Index
- func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)
- func (idx *Index) Close() error
- func (idx *Index) RecycleSearchResult(r *SearchResult)
- func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
- func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
- func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
- func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
type IndexBuildingOptions
type IndexInfo
type IndexSearchingOptions
type Options
type Query
- func (q *Query) Reset()
type SearchResult
- func (r *SearchResult) Reset()
- func (sr *SearchResult) SortBySeqID()
type SeqComparator
- func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator
- func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)
- func (cpr *SeqComparator) Index(s []byte) error
- func (cpr *SeqComparator) RecycleIndex()
type SeqComparatorOptions
type SeqComparatorResult
- func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)
- func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)
type SimilarityDetail
type SubstrPair
- func (s SubstrPair) String() string
type Uint64Slice
- func (s Uint64Slice) Len() int
- func (s Uint64Slice) Less(i, j int) bool
- func (s *Uint64Slice) Pop() interface{}
- func (s *Uint64Slice) Push(x interface{})
- func (s Uint64Slice) Swap(i, j int)

Constants ¶

View Source

const BITS_BATCH_IDX = 17

BITS_BATCH_IDX is the number of bits to store the genome batch index.

View Source

const BITS_FLAGS = BITS_STRAND + BITS_REVERSE

BITS_FLAGS is the number of bits to store two bits

View Source

const BITS_GENOME_IDX = 17

BITS_GENOME_IDX is the number of bits to store the genome index.

View Source

const BITS_IDX = BITS_BATCH_IDX + BITS_BATCH_IDX

BITS_IDX is the number of bits to strore batch index and genome index.

View Source

const BITS_IDX_FLAGS = BITS_IDX + BITS_FLAGS

BITS_IDX_FLAGS is the sum of BITS_IDX and BITS_FLAGS

View Source

const BITS_NONE_IDX = 64 - BITS_BATCH_IDX - BITS_GENOME_IDX

BITS_NONE_IDX is the number of bits to store data except for batch index and genome index.

View Source

const BITS_NONE_POS = 64 - BITS_POSITION

BITS_NONE_POS is the number of bits except for position

View Source

const BITS_POSITION = 28

BITS_POSITION is the number of bits to store the k-mer position/coordinate.

View Source

const BITS_REVERSE = 1

BITS_SUFFIX_IDX is the flag to indicate if the k-mer is reversed.

View Source

const BITS_STRAND = 1

BITS_STRAND is the flag to indicate if the k-mer is from the reverse complement strand.

View Source

const DirGenomes = "genomes"

DirGenomes is the directory of genomes datas

View Source

const DirSeeds = "seeds"

DirSeeds is the directory of k-mer-value data files

View Source

const ExtSeeds = ".bin"

ExtSeeds is file extention of k-mer-value data files

View Source

const ExtTmpDir = ".tmp"

ExtTmpDir is the path extension for temporary files

View Source

const FileGenomeChunks = "genomes.chunks.bin"

FileGenomeChunks store lists of batch+genome index of genome chunks

View Source

const FileGenomeIndex = "genomes.map.bin"

FileGenomeIndex maps genome id to genome batch id and index in the batch

View Source

const FileGenomes = "genomes.bin"

FileGenomes is the name of each genome file

View Source

const FileInfo = "info.toml"

FileInfo is the summary file

View Source

const FileMasks = "masks.bin"

FileMasks is the file for storing lexichash mask

View Source

const FileSeedPositions = "seed_positions.bin"

FileSeedPositions is the name of seed position file

View Source

const MASK_GENOME_IDX = (1 << BITS_GENOME_IDX) - 1

MASK_GENOME_IDX is the mask of genome index.

View Source

const MASK_NONE_IDX = (1 << BITS_NONE_IDX) - 1

MASK_NONE_IDX is the mask of non-index data

View Source

const MASK_REVERSE = 1

MASK_REVERSE is the mask of reversed flag

View Source

const MAX_GENOME_SIZE = 1 << BITS_POSITION

MAX_GENOME_SIZE is the maximum genome size, 268435456

View Source

const NO_VALID_SEQS = "no_valid_seqs"

NO_VALID_SEQS means there are no valid sequences in a genome file.

View Source

const OpD = uint64('D')

View Source

const OpH = uint64('H')

View Source

const OpI = uint64('I')

View Source

const OpM = uint64('M')

View Source

const OpX = uint64('X')

View Source

const TOO_LARGE_GENOME = "too_large_genome"

TOO_LARGE_GENOME means the genome is too big to index.

View Source

const TOO_MANY_SEQS = "too_many_seqs"

TOO_MANY_SEQS means there are too many sequences, as we require: $total_bases + ($num_contigs - 1) * $interval_size <= 268,435,456

Variables ¶

View Source

var BufferSize = 65536 // os.Getpagesize()

BufferSize is size of buffer

View Source

var COMMIT = func() string {
	if info, ok := debug.ReadBuildInfo(); ok {
		for _, setting := range info.Settings {
			if setting.Key == "vcs.revision" {
				return setting.Value[:7]
			}
		}
	}
	return ""
}()

COMMIT is the last commit

View Source

var DefaultChaining2Options = Chaining2Options{
	MaxGap:      50,
	MinScore:    50,
	MinAlignLen: 50,

	MaxDistance: 100,
	BandCount:   50,
	BandBase:    100,
}

DefaultChaining2Options is the defalt vaule of Chaining2Option.

View Source

var DefaultChaining3Options = Chaining3Options{
	MaxGap:      5,
	MinScore:    1,
	MinAlignLen: 2,

	MaxDistance: 10,
	BandCount:   20,
	BandBase:    10,
}

DefaultChaining3Options is the defalt vaule of Chaining2Option.

View Source

var DefaultChainingOptions = ChainingOptions{
	MaxGap:      5000,
	MinScore:    40,
	MaxDistance: 10000,
}

DefaultChainingOptions is the defalt vaule of ChainingOption.

View Source

var DefaultIndexSearchingOptions = IndexSearchingOptions{
	NumCPUs:      runtime.NumCPU(),
	MaxOpenFiles: 512,

	MinPrefix: 15,

	MinSinglePrefix: 17,

	TopN: 500,

	MaxGap:      5000,
	MaxDistance: 10000,

	ExtendLength:                     2000,
	ExtendLength2:                    50,
	MinQueryAlignedFractionInAGenome: 70,
	MaxEvalue:                        10,
}

View Source

var DefaultSeqComparatorOptions = SeqComparatorOptions{
	K:         32,
	MinPrefix: 11,

	Chaining2Options: Chaining2Options{

		MaxGap: 50,

		MinScore: 50,

		MaxDistance: 50,

		BandBase: 100,
	},

	MinAlignedFraction: 0,
}

DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.

View Source

var MainVersion uint8 = 3

MainVersion is use for checking compatibility

View Source

var MinorVersion uint8 = 3

MinorVersion is less important

View Source

var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of prokaryotic genomes",
	Long: fmt.Sprintf(`
   LexicMap: efficient sequence alignment against millions of prokaryotic genomes

    Version: v%s (%s)
  Documents: https://bioinf.shenwei.me/LexicMap
Source code: https://github.com/shenwei356/LexicMap

`, VERSION, COMMIT),
}

RootCmd represents the base command when called without any subcommands

View Source

var Strands = [2]byte{'+', '-'}

Strands could be used to output strand for a reverse complement flag

View Source

var VERSION = "0.6.0"

VERSION is the version

Functions ¶

func BuildIndex ¶ added in v0.2.0

func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error

BuildIndex builds index from a list of input files

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error

CheckIndexBuildingOptions checks some important options

func CheckIndexSearchingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error

func ClearSubstrPairs ¶ added in v0.2.0

func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)

ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.

func Execute ¶

func Execute()

Execute adds all child commands to the root command sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.

func IntSlice2StringSlice ¶

func IntSlice2StringSlice(vals []int) []string

func MeanStdev ¶

func MeanStdev(values []float64) (float64, float64)

func ParseByteSize ¶ added in v0.4.0

func ParseByteSize(val string) (int64, error)

ParseByteSize parses byte size from string

func RC ¶ added in v0.2.0

func RC(s []byte) []byte

RC computes the reverse complement sequence

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChaining2Result(chains *[]*Chain2Result)

RecycleChaining2Result reycles the chaining paths. Please remember to call this after using the results.

func RecycleChainingResult ¶ added in v0.2.0

func RecycleChainingResult(chains *[]*[]int)

RecycleChainingResult reycles the chaining results. Please remember to call this after using the results.

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSeqComparatorResult(r *SeqComparatorResult)

RecycleSeqComparatorResult recycles a SeqComparatorResult

func RecycleSubstrPairs ¶ added in v0.2.0

func RecycleSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair)

RecycleSubstrPairs recycles a list of SubstrPairs

func TrimSubStrPairs ¶ added in v0.5.0

func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)

TrimSubStrPairs trims anchors for query/subjects with tandem repeats in either end.

case 1: embeded anchor in query/target

61: 156-186 (+) vs 1163-1193 (+), len:31
62: 157-187 (-) vs 1164-1194 (-), len:31
63: 158-188 (+) vs 1165-1195 (+), len:31
64: 168-195 (-) vs 1168-1195 (-), len:28
65: 175-202 (-) vs 1168-1195 (-), len:28 <---
66: 182-209 (-) vs 1168-1195 (-), len:28 <---
67: 189-216 (-) vs 1168-1195 (-), len:28 <---
68: 196-223 (+) vs 1168-1195 (+), len:28 <---
69: 203-230 (+) vs 1168-1195 (+), len:28 <---
70: 210-237 (-) vs 1168-1195 (-), len:28 <---
71: 217-244 (-) vs 1168-1195 (-), len:28 <--- gap=7, overlap=28 (28/28)

case 2: big overlap + big gap

727: 789-819 (-) vs 789-819 (-), len:31
728: 790-820 (-) vs 790-820 (-), len:31
729: 804-821 (-) vs 821-838 (-), len:18 <--- gap=17, overlap=17 (17/18)

Types ¶

type Chain2Result ¶ added in v0.3.0

type Chain2Result struct {
	NAnchors int // The number of substrings

	AlignedFraction float64

	MatchedBases  int     // The number of matched bases.
	AlignedBasesQ int     // The number of aligned bases in Query sequence
	AlignedBasesT int     // The number of aligned bases in Subject sequence
	PIdent        float64 // percentage of identity
	AlignedLength int     // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
	Gaps          int     // The number of gaps

	QBegin, QEnd int // Query begin/end position (0-based)
	TBegin, TEnd int // Target begin/end position (0-based)

	MaxExtLen int // max extend length

	// for output
	CIGAR     []byte // cigar string
	QSeq      []byte // query seq
	TSeq      []byte // target seq
	Alignment []byte // alignment text

	// statistic
	Score    int
	BitScore int
	Evalue   float64
	// contains filtered or unexported fields
}

Chain2Result represents a result of a chain

func (*Chain2Result) Reset ¶ added in v0.3.0

func (r *Chain2Result) Reset()

Reset resets a Chain2Result

type Chain3Result ¶ added in v0.6.0

type Chain3Result struct {
	NAnchors int // The number of substrings

	AlignedFraction float64

	MatchedBases  int     // The number of matched bases.
	AlignedBasesQ int     // The number of aligned bases in Query sequence
	AlignedBasesT int     // The number of aligned bases in Subject sequence
	PIdent        float64 // percentage of identity
	AlignedLength int     // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
	Gaps          int     // The number of gaps

	QBegin, QEnd int // Query begin/end position (0-based)
	TBegin, TEnd int // Target begin/end position (0-based)
}

Chain3Result represents a result of a chain

func (*Chain3Result) Reset ¶ added in v0.6.0

func (r *Chain3Result) Reset()

Reset resets a Chain3Result

type Chainer ¶ added in v0.2.0

type Chainer struct {
	// contains filtered or unexported fields
}

Chainer is an object for chaining the lexichash substrings between query and reference sequences.

func NewChainer ¶ added in v0.2.0

func NewChainer(options *ChainingOptions) *Chainer

NewChainer creates a new chainer.

func (*Chainer) Chain ¶ added in v0.2.0

func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64)

Chain finds the possible seed paths. Please remember to call RecycleChainingResult after using the results.

type Chainer2 ¶ added in v0.2.0

type Chainer2 struct {
	// contains filtered or unexported fields
}

Chainer2 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer2 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer2 ¶ added in v0.2.0

func NewChainer2(options *Chaining2Options) *Chainer2

NewChainer creates a new chainer.

func (*Chainer2) Chain ¶ added in v0.2.0

func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)

Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:

Chain2Results.
The number of matched bases.
The number of aligned bases.
QBegin.
QEnd.
TBegin.
TEnd.

type Chainer3 ¶ added in v0.6.0

type Chainer3 struct {
	// contains filtered or unexported fields
}

Chainer3 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer3 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer3 ¶ added in v0.6.0

func NewChainer3(options *Chaining3Options) *Chainer3

NewChainer creates a new chainer.

func (*Chainer3) Chain ¶ added in v0.6.0

func (ce *Chainer3) Chain(subs *[]*SubstrPair) *Chain3Result

Chain finds the possible chain path. Please remember to call RecycleChaining3Result after using the results. Returned results:

Chain3Result.
The number of matched bases.
The number of aligned bases.
QBegin.
QEnd.
TBegin.
TEnd.

type Chaining2Options ¶ added in v0.2.0

type Chaining2Options struct {
	MaxGap      int
	MinScore    int // minimum score of a chain
	MinAlignLen int
	MinIdentity float64
	MaxDistance int
	BandCount   int // only check i in range of  i − A < j < i
	BandBase    int // only check i where i.Qstart+i.Len + A < j.Qstart
}

Chaining2Options contains all options in chaining.

type Chaining3Options ¶ added in v0.6.0

type Chaining3Options struct {
	MaxGap      int
	MinScore    int // minimum score of a chain
	MinAlignLen int
	MinIdentity float64
	MaxDistance int
	BandCount   int // only check i in range of  i − A < j < i
	BandBase    int // only check i where i.Qstart+i.Len + A < j.Qstart
}

Chaining3Options contains all options in chaining.

type ChainingOptions ¶ added in v0.2.0

type ChainingOptions struct {
	MaxGap      float64
	MinLen      uint8
	MinScore    float64
	MaxDistance float64
}

ChainingOptions contains all options in chaining.

type Index ¶ added in v0.2.0

type Index struct {

	// k-mer-value searchers
	Searchers         []*kv.Searcher
	InMemorySearchers []*kv.InMemorySearcher
	// contains filtered or unexported fields
}

Index creates a LexicMap index from a path and supports searching with query sequences.

func NewIndexSearcher ¶ added in v0.2.0

func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)

NewIndexSearcher creates a new searcher

func (*Index) Close ¶ added in v0.2.0

func (idx *Index) Close() error

Close closes the searcher.

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (idx *Index) RecycleSearchResult(r *SearchResult)

RecycleSearchResults recycles a search result object

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)

RecycleSearchResults recycles search results objects

func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0

func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)

RecycleSimilarityDetails recycles a list of SimilarityDetails

func (*Index) Search ¶ added in v0.2.0

func (idx *Index) Search(query *Query) (*[]*SearchResult, error)

Search queries the index with a sequence. After using the result, do not forget to call RecycleSearchResult().

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)

SetSeqCompareOptions sets the sequence comparing options

type IndexBuildingOptions ¶ added in v0.2.0

type IndexBuildingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	Force        bool // force overwrite existed index
	MaxOpenFiles int  // maximum opened files, used in merging indexes
	MergeThreads int  // Maximum Concurrent Merge Jobs

	MinSeqLen int // minimum sequence length, should be >= k

	// skipping extremely large genome
	MaxGenomeSize int    // Maximum genome size. Extremely large genomes (non-isolate assemblies) will be skipped
	BigGenomeFile string // Out file of skipped files with genomes

	// LexicHash
	MaskFile string // file of custom masks

	K        int   // k-mer size
	Masks    int   // number of masks
	RandSeed int64 // random seed

	// filling sketching deserts
	DisableDesertFilling   bool   // disable desert filling (just for analysis index)
	DesertMaxLen           uint32 // maxi length of sketching deserts
	DesertExpectedSeedDist int    // expected distance between seeds
	DesertSeedPosRange     int    // the upstream and down stream region for adding a seeds

	// generate mask from the top N biggest genomes
	TopN      int // Select the the top N largest genomes for generating masks
	PrefixExt int // Extension length of prefixes

	Chunks     int // the number of chunks for storing k-mer data
	Partitions int // the number of partitions for indexing k-mer data

	GenomeBatchSize int // the maximum number of genomes of a batch

	ReRefName    *regexp.Regexp   // for extracting genome id from the file name
	ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern

	ContigInterval int // the length of N's between contigs

	SaveSeedPositions bool

	Debug bool
}

IndexBuildingOptions contains all options for building an LexicMap index.

type IndexInfo ¶ added in v0.2.0

type IndexInfo struct {
	MainVersion      uint8 `toml:"main-version" comment:"Index format"`
	MinorVersion     uint8 `toml:"minor-version"`
	K                uint8 `toml:"max-K" comment:"LexicHash"`
	Masks            int   `toml:"masks"`
	RandSeed         int64 `toml:"rand-seed"`
	MaxDesert        int   `toml:"max-seed-dist" comment:"Seed distance"`
	SeedDistInDesert int   `toml:"seed-dist-in-desert"`
	Chunks           int   `toml:"chunks" comment:"Seeds (k-mer-value data) files"`
	Partitions       int   `toml:"index-partitions"`
	InputGenomes     int   `toml:"input-genomes" comment:"Input genomes"`
	InputBases       int64 `toml:"input-bases" comment:"Input bases"`
	Genomes          int   `` /* 239-byte string literal not displayed */
	GenomeBatchSize  int   `toml:"genome-batch-size"`
	GenomeBatches    int   `toml:"genome-batches"`
	ContigInterval   int   `toml:"contig-interval"`
}

IndexInfo contains summary of the index

type IndexSearchingOptions ¶ added in v0.2.0

type IndexSearchingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	InMemorySearch bool  // load the seed/kv data into memory
	MinPrefix      uint8 // minimum prefix length, e.g., 15
	// MaxMismatch     int   // maximum mismatch, e.g., 3
	MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20
	// MinMatchedBases uint8 // the total matched bases
	TopN int // keep the topN scores, e.g, 10

	// seeds chaining
	MaxGap      float64 // e.g., 5000
	MaxDistance float64 // e.g., 20k

	// alignment
	ExtendLength  int // the length of extra sequence on the flanking of seeds.
	ExtendLength2 int // the length of extra sequence on the flanking of pseudo-alignment region.

	// alignment filtering
	MinQueryAlignedFractionInAGenome float64 // minimum query aligned fraction in the target genome
	MaxEvalue                        float64

	// Output
	OutputSeq bool

	//
	Debug bool
}

IndexSearchingOptions contains all options for searching

type Options ¶

type Options struct {
	NumCPUs int
	Verbose bool

	LogFile  string
	Log2File bool

	Compress         bool
	CompressionLevel int
}

Options contains the global flags

type Query ¶

type Query struct {
	// contains filtered or unexported fields
}

Query is an object for each query sequence, it also contains the query result.

func (*Query) Reset ¶

func (q *Query) Reset()

Reset reset the data for next round of using

type SearchResult ¶ added in v0.2.0

type SearchResult struct {
	BatchGenomeIndex uint64 // just for finding genome chunks of the same genome

	GenomeBatch int
	GenomeIndex int
	ID          []byte
	GenomeSize  int

	Subs *[]*SubstrPair // matched substring pairs (query,target)

	Score  float64 //  score for soring
	Chains *[]*[]int

	// more about the alignment detail
	SimilarityDetails *[]*SimilarityDetail // sequence comparing
	AlignedFraction   float64              // query coverage per genome
}

SearchResult stores a search result in a genome for the given query sequence.

func (*SearchResult) Reset ¶ added in v0.2.0

func (r *SearchResult) Reset()

func (*SearchResult) SortBySeqID ¶ added in v0.4.0

func (sr *SearchResult) SortBySeqID()

type SeqComparator ¶ added in v0.2.0

type SeqComparator struct {
	// contains filtered or unexported fields
}

SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).

func NewSeqComparator ¶ added in v0.2.0

func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator

NewSeqComparator creates a new SeqComparator with given options. No options checking now.

func (*SeqComparator) Compare ¶ added in v0.2.0

func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)

Compare matchs k-mers for the query sequence (begin: end), chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.

func (*SeqComparator) Index ¶ added in v0.2.0

func (cpr *SeqComparator) Index(s []byte) error

Index initializes the SeqComparator with the query sequence.

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

func (cpr *SeqComparator) RecycleIndex()

RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorOptions struct {
	// indexing
	K         uint8
	MinPrefix uint8

	// chaining
	Chaining2Options

	// seq similarity
	MinAlignedFraction float64 // minimum query aligned fraction in a HSP

	MinIdentity float64
}

SeqComparatorOptions contains options for comparing two sequences.

type SeqComparatorResult ¶ added in v0.2.0

type SeqComparatorResult struct {
	AlignedBases int // The number of aligned bases.

	AlignedFraction float64 // query (original query) coverage per HSP

	MatchedBases int
	PIdent       float64

	QueryLen int // length of the original query, used to compute/update AlignedFraction

	QBegin int
	QEnd   int
	TBegin int
	TEnd   int

	TSeq []byte // target seq

	Chains *[]*Chain2Result
}

SeqComparatorResult contains the details of a seq comparison result.

func (*SeqComparatorResult) Update ¶ added in v0.3.0

func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)

Update updates the data with new chains. However it does not considerate gaps.

func (*SeqComparatorResult) Update2 ¶ added in v0.4.0

func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)

Update2 only compute the aligned fraction for all chains

type SimilarityDetail ¶ added in v0.2.0

type SimilarityDetail struct {
	// QBegin int
	// QEnd   int
	// TBegin int
	// TEnd   int
	RC bool

	SimilarityScore float64
	Similarity      *SeqComparatorResult
	// Chain           *[]int
	NSeeds int

	// sequence details
	SeqLen int
	SeqID  []byte // seqid of the region
}

SimilarityDetail is the similarity detail of one reference sequence

type SubstrPair ¶ added in v0.2.0

type SubstrPair struct {
	QBegin int32 // start position of the substring (0-based) in query
	TBegin int32 // start position of the substring (0-based) in reference

	Len uint8 // prefix length

	TRC bool // is the substring from the reference seq on the negative strand.
	QRC bool // is the substring from the query seq on the negative strand.

}

SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.

func (SubstrPair) String ¶ added in v0.2.0

func (s SubstrPair) String() string

type Uint64Slice ¶

type Uint64Slice []uint64

func (Uint64Slice) Len ¶

func (s Uint64Slice) Len() int

func (Uint64Slice) Less ¶

func (s Uint64Slice) Less(i, j int) bool

func (*Uint64Slice) Pop ¶

func (s *Uint64Slice) Pop() interface{}

func (*Uint64Slice) Push ¶

func (s *Uint64Slice) Push(x interface{})

func (Uint64Slice) Swap ¶

func (s Uint64Slice) Swap(i, j int)

Directories ¶

Path	Synopsis
genome
kv
seedposition
tree
util

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func BuildIndex ¶ added in v0.2.0

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions ¶ added in v0.2.0

func ClearSubstrPairs ¶ added in v0.2.0

func Combinations2 ¶

func Execute ¶

func IntSlice2StringSlice ¶

func MeanStdev ¶

func ParseByteSize ¶ added in v0.4.0

func RC ¶ added in v0.2.0

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChainingResult ¶ added in v0.2.0

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSubstrPairs ¶ added in v0.2.0

func TrimSubStrPairs ¶ added in v0.5.0

Types ¶

type Chain2Result ¶ added in v0.3.0

func (*Chain2Result) Reset ¶ added in v0.3.0

type Chain3Result ¶ added in v0.6.0

func (*Chain3Result) Reset ¶ added in v0.6.0

type Chainer ¶ added in v0.2.0

func NewChainer ¶ added in v0.2.0

func (*Chainer) Chain ¶ added in v0.2.0

type Chainer2 ¶ added in v0.2.0

func NewChainer2 ¶ added in v0.2.0

func (*Chainer2) Chain ¶ added in v0.2.0

type Chainer3 ¶ added in v0.6.0

func NewChainer3 ¶ added in v0.6.0

func (*Chainer3) Chain ¶ added in v0.6.0

type Chaining2Options ¶ added in v0.2.0

type Chaining3Options ¶ added in v0.6.0

type ChainingOptions ¶ added in v0.2.0

type Index ¶ added in v0.2.0

func NewIndexSearcher ¶ added in v0.2.0

func (*Index) Close ¶ added in v0.2.0

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0

func (*Index) Search ¶ added in v0.2.0

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

type IndexBuildingOptions ¶ added in v0.2.0

type IndexInfo ¶ added in v0.2.0

type IndexSearchingOptions ¶ added in v0.2.0

type Options ¶

type Query ¶

func (*Query) Reset ¶

type SearchResult ¶ added in v0.2.0

func (*SearchResult) Reset ¶ added in v0.2.0

func (*SearchResult) SortBySeqID ¶ added in v0.4.0

type SeqComparator ¶ added in v0.2.0

func NewSeqComparator ¶ added in v0.2.0

func (*SeqComparator) Compare ¶ added in v0.2.0

func (*SeqComparator) Index ¶ added in v0.2.0

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorResult ¶ added in v0.2.0

func (*SeqComparatorResult) Update ¶ added in v0.3.0

func (*SeqComparatorResult) Update2 ¶ added in v0.4.0

type SimilarityDetail ¶ added in v0.2.0

type SubstrPair ¶ added in v0.2.0

func (SubstrPair) String ¶ added in v0.2.0

type Uint64Slice ¶

func (Uint64Slice) Len ¶

func (Uint64Slice) Less ¶

func (*Uint64Slice) Pop ¶

func (*Uint64Slice) Push ¶

func (Uint64Slice) Swap ¶

Source Files ¶

Directories ¶