database

package
v0.1.0-alpha.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 12, 2026 License: MIT Imports: 8 Imported by: 0

Documentation

Overview

Package database provides SQLite-backed storage for SRA metadata records including studies, experiments, samples, runs, submissions, and analyses.

Package database provides safe SQL utilities to prevent SQL injection.

Index

Constants

This section is empty.

Variables

View Source
// AllowedColumns is the whitelist of valid column names, consulted whenever a
// query selects columns dynamically. A name maps to true exactly when it is
// permitted; any other lookup yields false.
var AllowedColumns = func() map[string]bool {
	names := []string{
		// Columns ending in _accession.
		"study_accession",
		"experiment_accession",
		"sample_accession",
		"run_accession",
		"submission_accession",
		"analysis_accession",

		// Descriptive, taxonomy, platform and library columns.
		"title",
		"abstract",
		"description",
		"organism",
		"scientific_name",
		"taxon_id",
		"platform",
		"instrument_model",
		"library_strategy",
		"library_source",
		"library_selection",
		"library_layout",

		// Date and timestamp columns.
		"created_at",
		"updated_at",
		"submission_date",
		"first_public",
		"last_update",

		// Remaining whitelisted columns.
		"table_name",
		"row_count",
	}

	whitelist := make(map[string]bool, len(names))
	for _, name := range names {
		whitelist[name] = true
	}
	return whitelist
}()

AllowedColumns is the whitelist of valid column names. This is used for dynamic column selection in queries.

View Source
// AllowedTables is the whitelist of valid table names in the SRAKE database.
// A name maps to true exactly when it is permitted; any table name absent
// from this map is to be rejected to prevent SQL injection.
var AllowedTables = func() map[string]bool {
	names := []string{
		// One table per record kind.
		"studies",
		"experiments",
		"samples",
		"runs",
		"submissions",
		"analyses",

		// Pooling, identifier and link tables.
		"sample_pool",
		"identifiers",
		"links",
		"experiment_samples",

		// Tables carrying the fts_ prefix.
		"fts_accessions",
		"fts_samples",
		"fts_runs",

		// Statistics and progress-tracking tables.
		"statistics",
		"sync_status",
		"progress",
		"index_progress",
	}

	whitelist := make(map[string]bool, len(names))
	for _, name := range names {
		whitelist[name] = true
	}
	return whitelist
}()

AllowedTables is the whitelist of valid table names in the SRAKE database. Any table name not in this list will be rejected to prevent SQL injection.

View Source
// ErrInvalidColumnName is the sentinel error returned when a column name is
// not in the AllowedColumns whitelist.
var ErrInvalidColumnName = fmt.Errorf("invalid column name")

ErrInvalidColumnName is returned when a column name is not in the whitelist.

View Source
// ErrInvalidTableName is the sentinel error returned when a table name is
// not in the AllowedTables whitelist.
var ErrInvalidTableName = fmt.Errorf("invalid table name")

ErrInvalidTableName is returned when a table name is not in the whitelist.

Functions

func MustColumnName

func MustColumnName(column string) string

MustColumnName returns the column name if valid, panics otherwise. Use this only for hardcoded column names that are known to be valid.

func MustTableName

func MustTableName(table string) string

MustTableName returns the table name if valid, panics otherwise. Use this only for hardcoded table names that are known to be valid.

func SafeColumnName

func SafeColumnName(column string) (string, error)

SafeColumnName returns the column name if valid, otherwise returns an error. Use this when you need the column name for SQL construction.

func SafeTableName

func SafeTableName(table string) (string, error)

SafeTableName returns the table name if valid, otherwise returns an error. Use this when you need the table name for SQL construction.

func ValidateColumnName

func ValidateColumnName(column string) error

ValidateColumnName checks if a column name is in the allowed list. Returns nil if valid, ErrInvalidColumnName otherwise.

func ValidateIdentifier

func ValidateIdentifier(identifier string) error

ValidateIdentifier checks if a string is a valid SQL identifier format. This is a fallback for dynamic identifiers not in the whitelists. Valid format: starts with letter or underscore, followed by alphanumeric or underscore.

func ValidateTableName

func ValidateTableName(table string) error

ValidateTableName checks if a table name is in the allowed list. Returns nil if valid, ErrInvalidTableName otherwise.

Types

type AccessionResult

// AccessionResult holds a single accession match from an FTS5 search,
// including its BM25 relevance score.
type AccessionResult struct {
	Accession string
	Type      string
	Title     string
	Metadata  string
	// Score is the BM25 relevance score of this match.
	Score     float64
}

AccessionResult holds a single accession match from an FTS5 search, including its BM25 relevance score.

type Analysis

// Analysis represents an analysis record with comprehensive fields.
//
// Structured sub-documents (targets, data blocks, links, attributes, ...) are
// carried as JSON text in string fields, as noted on each field.
type Analysis struct {
	AnalysisAccession string     `json:"analysis_accession"`
	Alias             string     `json:"alias"`
	CenterName        string     `json:"center_name"`
	BrokerName        string     `json:"broker_name"`
	AnalysisCenter    string     `json:"analysis_center"`
	// AnalysisDate is a pointer, so it may be nil.
	AnalysisDate      *time.Time `json:"analysis_date"`
	StudyAccession    string     `json:"study_accession"`
	Title             string     `json:"title"`
	Description       string     `json:"description"`
	AnalysisType      string     `json:"analysis_type"`

	// Analysis-specific fields
	Targets     string `json:"targets"`      // JSON array of target SRA objects
	DataBlocks  string `json:"data_blocks"`  // JSON array of data blocks
	AssemblyRef string `json:"assembly_ref"` // JSON object for assembly reference
	RunLabels   string `json:"run_labels"`   // JSON array for run label mappings
	SeqLabels   string `json:"seq_labels"`   // JSON array for sequence label mappings
	Processing  string `json:"processing"`   // JSON object for pipeline info

	// Links and attributes
	AnalysisLinks      string `json:"analysis_links"`      // JSON array
	AnalysisAttributes string `json:"analysis_attributes"` // JSON array
	Metadata           string `json:"metadata"`            // JSON
}

Analysis represents an analysis record with comprehensive fields

type DB

// DB wraps the SQL database connection.
type DB struct {
	// The embedded *sql.DB promotes its exported methods onto DB.
	*sql.DB
	// contains filtered or unexported fields
}

DB wraps the SQL database connection

func Initialize

func Initialize(path string) (*DB, error)

Initialize creates and configures the database connection

func (*DB) BatchInsertExperiments

func (db *DB) BatchInsertExperiments(experiments []Experiment) error

BatchInsertExperiments inserts multiple experiments in a single transaction for performance.

func (*DB) CountSamplePools

func (db *DB) CountSamplePools() (int, error)

CountSamplePools counts total number of pool relationships

func (*DB) CountTable

func (db *DB) CountTable(table string) (int64, error)

CountTable counts rows in a table. The table name is validated against the AllowedTables whitelist to prevent SQL injection attacks.

func (*DB) FindRecordsByIdentifier

func (db *DB) FindRecordsByIdentifier(idValue string) ([]Identifier, error)

FindRecordsByIdentifier finds records with a specific identifier value

func (*DB) FullTextSearch

func (db *DB) FullTextSearch(query string) (interface{}, error)

FullTextSearch performs a LIKE-based text search across studies and experiments, returning results from both tables ranked by relevance.

func (*DB) GetAnalysis

func (db *DB) GetAnalysis(accession string) (*Analysis, error)

GetAnalysis retrieves an analysis by its accession identifier. Returns an error if the analysis is not found.

func (*DB) GetAveragePoolSize

func (db *DB) GetAveragePoolSize() (float64, error)

GetAveragePoolSize returns the average pool size

func (*DB) GetExperiment

func (db *DB) GetExperiment(accession string) (*Experiment, error)

GetExperiment retrieves an experiment by its accession identifier. Returns an error if the experiment is not found.

func (*DB) GetIdentifiers

func (db *DB) GetIdentifiers(recordType, recordAccession string) ([]Identifier, error)

GetIdentifiers retrieves identifiers for a record

func (*DB) GetInfo

func (db *DB) GetInfo() (*DatabaseInfo, error)

GetInfo returns database information

func (db *DB) GetLinks(recordType, recordAccession string) ([]Link, error)

GetLinks retrieves links for a record

func (*DB) GetMaxPoolSize

func (db *DB) GetMaxPoolSize() (int, error)

GetMaxPoolSize returns the maximum pool size

func (*DB) GetRun

func (db *DB) GetRun(accession string) (*Run, error)

GetRun retrieves a run by its accession identifier. Returns an error if the run is not found.

func (*DB) GetSQLDB

func (db *DB) GetSQLDB() *sql.DB

GetSQLDB returns the underlying SQL database connection

func (*DB) GetSample

func (db *DB) GetSample(accession string) (*Sample, error)

GetSample retrieves a sample by its accession identifier. Returns an error if the sample is not found.

func (*DB) GetSamplePools

func (db *DB) GetSamplePools(parentSample string) ([]SamplePool, error)

GetSamplePools retrieves pool relationships for a parent sample

func (*DB) GetStatistics

func (db *DB) GetStatistics() (map[string]int64, error)

GetStatistics retrieves cached statistics from the statistics table

func (*DB) GetStats

func (db *DB) GetStats() (*DatabaseStats, error)

GetStats returns live row counts for all core SRA tables.

func (*DB) GetStudiesBatch

func (db *DB) GetStudiesBatch(offset, limit int) ([]*Study, error)

GetStudiesBatch retrieves a batch of studies with pagination

func (*DB) GetStudy

func (db *DB) GetStudy(accession string) (*Study, error)

GetStudy retrieves a study by its accession identifier. Returns an error if the study is not found.

func (*DB) GetSubmission

func (db *DB) GetSubmission(accession string) (*Submission, error)

GetSubmission retrieves a submission by its accession identifier. Returns an error if the submission is not found.

func (*DB) InitializeStatistics

func (db *DB) InitializeStatistics() error

InitializeStatistics ensures the statistics table exists but does NOT populate it. Population happens only via UpdateStatistics() after ingestion.

func (*DB) InsertAnalysis

func (db *DB) InsertAnalysis(analysis *Analysis) error

InsertAnalysis inserts or replaces an analysis record in the database.

func (*DB) InsertExperiment

func (db *DB) InsertExperiment(exp *Experiment) error

InsertExperiment inserts or replaces an experiment record in the database.

func (*DB) InsertIdentifier

func (db *DB) InsertIdentifier(identifier *Identifier) error

InsertIdentifier inserts a structured identifier

func (db *DB) InsertLink(link *Link) error

InsertLink inserts a structured link

func (*DB) InsertRun

func (db *DB) InsertRun(run *Run) error

InsertRun inserts or replaces a run record in the database.

func (*DB) InsertSample

func (db *DB) InsertSample(sample *Sample) error

InsertSample inserts or replaces a sample record in the database.

func (*DB) InsertSamplePool

func (db *DB) InsertSamplePool(pool *SamplePool) error

InsertSamplePool inserts a pool relationship

func (*DB) InsertStudy

func (db *DB) InsertStudy(study *Study) error

InsertStudy inserts or replaces a study record in the database.

func (*DB) InsertSubmission

func (db *DB) InsertSubmission(submission *Submission) error

InsertSubmission inserts or replaces a submission record in the database.

func (*DB) Ping

func (db *DB) Ping() error

Ping verifies database connection

func (*DB) Query

func (db *DB) Query(query string, args ...interface{}) (*sql.Rows, error)

Query executes a query that returns rows

func (*DB) QueryRow

func (db *DB) QueryRow(query string, args ...interface{}) *sql.Row

QueryRow executes a query that returns at most one row

func (*DB) ScanExperiment

func (db *DB) ScanExperiment(scanner interface{}, exp *Experiment) error

ScanExperiment scans a row into an Experiment struct

func (*DB) ScanRun

func (db *DB) ScanRun(scanner interface{}, run *Run) error

ScanRun scans a row into a Run struct

func (*DB) ScanSample

func (db *DB) ScanSample(scanner interface{}, sample *Sample) error

ScanSample scans a row into a Sample struct

func (*DB) ScanStudy

func (db *DB) ScanStudy(scanner interface{}, study *Study) error

ScanStudy scans a row into a Study struct

func (*DB) SearchByLibraryStrategy

func (db *DB) SearchByLibraryStrategy(strategy string, limit int) ([]Experiment, error)

SearchByLibraryStrategy returns experiments matching the given library strategy (e.g., RNA-Seq, WGS).

func (*DB) SearchByOrganism

func (db *DB) SearchByOrganism(organism string, limit int) ([]Sample, error)

SearchByOrganism returns samples matching the given organism name or scientific name.

func (*DB) UpdateStatistics

func (db *DB) UpdateStatistics() error

UpdateStatistics recalculates and updates the statistics table. This should be called only after batch operations complete.

type DatabaseInfo

// DatabaseInfo holds database file size and cached table row counts.
type DatabaseInfo struct {
	// Size is the database file size.
	// NOTE(review): presumably in bytes — confirm against GetInfo.
	Size        int64
	// Row counts for the studies, experiments, samples and runs tables.
	Studies     int64
	Experiments int64
	Samples     int64
	Runs        int64
}

DatabaseInfo holds database file size and cached table row counts.

type DatabaseStats

// DatabaseStats holds aggregate counts for the core SRA tables.
//
// NOTE(review): only studies, experiments, samples and runs are counted here;
// submissions and analyses have no field — confirm whether that is intended.
type DatabaseStats struct {
	TotalStudies     int       `json:"total_studies"`
	TotalExperiments int       `json:"total_experiments"`
	TotalSamples     int       `json:"total_samples"`
	TotalRuns        int       `json:"total_runs"`
	// NOTE(review): how LastUpdate is populated is not visible here — confirm
	// against GetStats.
	LastUpdate       time.Time `json:"last_update"`
}

DatabaseStats holds aggregate counts for all core SRA tables.

type Experiment

// Experiment represents a comprehensive SRA experiment record.
//
// Structured sub-documents are carried as JSON text in string fields, as
// noted on each field.
type Experiment struct {
	// Primary key
	ExperimentAccession string `json:"experiment_accession"`

	// NameGroup attributes
	Alias      string `json:"alias"`
	CenterName string `json:"center_name"`
	BrokerName string `json:"broker_name"`

	// References
	StudyAccession  string `json:"study_accession"`
	SampleAccession string `json:"sample_accession"`

	// Core fields
	Title             string `json:"title"`
	DesignDescription string `json:"design_description"`

	// Library information
	LibraryName                 string `json:"library_name"`
	LibraryStrategy             string `json:"library_strategy"`
	LibrarySource               string `json:"library_source"`
	LibrarySelection            string `json:"library_selection"`
	LibraryLayout               string `json:"library_layout"` // 'SINGLE' or 'PAIRED'
	LibraryConstructionProtocol string `json:"library_construction_protocol"`

	// Paired-end specific
	NominalLength int     `json:"nominal_length"`
	NominalSdev   float64 `json:"nominal_sdev"`

	// Platform information
	Platform        string `json:"platform"`
	InstrumentModel string `json:"instrument_model"`

	// Targeted sequencing
	TargetedLoci string `json:"targeted_loci"` // JSON array

	// Pooling information
	PoolMemberCount int    `json:"pool_member_count"`
	PoolInfo        string `json:"pool_info"` // JSON object

	// Links and attributes
	ExperimentLinks      string `json:"experiment_links"`      // JSON array
	ExperimentAttributes string `json:"experiment_attributes"` // JSON array

	// Spot descriptor
	SpotLength     int    `json:"spot_length"`
	SpotDecodeSpec string `json:"spot_decode_spec"` // JSON object

	// Full metadata
	Metadata string `json:"metadata"` // JSON
}

Experiment represents a comprehensive SRA experiment record

type FTS5Manager

// FTS5Manager manages SQLite FTS5 tables for fast text search.
// Construct one with NewFTS5Manager.
type FTS5Manager struct {
	// contains filtered or unexported fields
}

FTS5Manager manages SQLite FTS5 tables for fast text search

func NewFTS5Manager

func NewFTS5Manager(db *DB) *FTS5Manager

NewFTS5Manager creates a new FTS5 manager

func (*FTS5Manager) CreateFTSTables

func (f *FTS5Manager) CreateFTSTables() error

CreateFTSTables creates FTS5 tables for tier 3 search (samples and runs)

func (*FTS5Manager) GetFTSStats

func (f *FTS5Manager) GetFTSStats() (map[string]int64, error)

GetFTSStats returns statistics about FTS5 tables

func (*FTS5Manager) OptimizeFTSTables

func (f *FTS5Manager) OptimizeFTSTables() error

OptimizeFTSTables optimizes FTS5 tables for better performance

func (*FTS5Manager) SearchAccessions

func (f *FTS5Manager) SearchAccessions(query string, limit int) ([]AccessionResult, error)

SearchAccessions searches for accessions using FTS5

func (*FTS5Manager) SearchRuns

func (f *FTS5Manager) SearchRuns(query string, limit int) ([]RunResult, error)

SearchRuns searches runs using FTS5

func (*FTS5Manager) SearchSamples

func (f *FTS5Manager) SearchSamples(query string, limit int) ([]SampleResult, error)

SearchSamples searches samples using FTS5

type Identifier

// Identifier represents a structured identifier attached to a record.
type Identifier struct {
	// RecordType and RecordAccession name the record this identifier belongs
	// to; they correspond to the arguments of DB.GetIdentifiers.
	RecordType      string `json:"record_type"`
	RecordAccession string `json:"record_accession"`
	IDType          string `json:"id_type"`
	IDNamespace     string `json:"id_namespace"`
	// IDValue is the identifier value; DB.FindRecordsByIdentifier looks
	// records up by it.
	IDValue         string `json:"id_value"`
	IDLabel         string `json:"id_label"`
}

Identifier represents a structured identifier

// Link represents a structured link attached to a record.
type Link struct {
	// RecordType and RecordAccession name the record this link belongs to;
	// they correspond to the arguments of DB.GetLinks.
	RecordType      string `json:"record_type"`
	RecordAccession string `json:"record_accession"`
	LinkType        string `json:"link_type"`
	DB              string `json:"db"`
	ID              string `json:"id"`
	Label           string `json:"label"`
	URL             string `json:"url"`
}

Link represents a structured link

type Run

// Run represents a comprehensive SRA run record.
//
// Structured sub-documents are carried as JSON text in string fields, as
// noted on each field.
type Run struct {
	// Primary key
	RunAccession string `json:"run_accession"`

	// NameGroup attributes
	Alias      string `json:"alias"`
	CenterName string `json:"center_name"`
	BrokerName string `json:"broker_name"`
	RunCenter  string `json:"run_center"`

	// References
	ExperimentAccession string `json:"experiment_accession"`

	// Core fields
	// RunDate is a pointer, so it may be nil.
	Title   string     `json:"title"`
	RunDate *time.Time `json:"run_date"`

	// Statistics
	TotalSpots int64  `json:"total_spots"`
	TotalBases int64  `json:"total_bases"`
	TotalSize  int64  `json:"total_size"`
	LoadDone   bool   `json:"load_done"`
	Published  string `json:"published"`

	// File information
	DataFiles string `json:"data_files"` // JSON array

	// Links and attributes
	RunLinks      string `json:"run_links"`      // JSON array
	RunAttributes string `json:"run_attributes"` // JSON array

	// Quality metrics
	QualityScoreMean float64 `json:"quality_score_mean"`
	QualityScoreStd  float64 `json:"quality_score_std"`
	ReadCountR1      int64   `json:"read_count_r1"`
	ReadCountR2      int64   `json:"read_count_r2"`

	// Full metadata
	Metadata string `json:"metadata"` // JSON
}

Run represents a comprehensive SRA run record

type RunResult

// RunResult holds a single run match from an FTS5 search, including its BM25
// relevance score.
type RunResult struct {
	RunAccession        string
	ExperimentAccession string
	// NOTE(review): TotalSpots and TotalBases are strings here, whereas the
	// Run type stores them as int64 — presumably they are read as text from
	// the FTS table; confirm before parsing them numerically.
	TotalSpots          string
	TotalBases          string
	// Score is the BM25 relevance score of this match.
	Score               float64
}

RunResult holds a single run match from an FTS5 search, including its BM25 relevance score.

type Sample

// Sample represents a comprehensive SRA sample record.
//
// Structured sub-documents are carried as JSON text in string fields, as
// noted on each field.
type Sample struct {
	// Primary key
	SampleAccession string `json:"sample_accession"`

	// NameGroup attributes
	Alias      string `json:"alias"`
	CenterName string `json:"center_name"`
	BrokerName string `json:"broker_name"`

	// Core fields
	Title       string `json:"title"`
	Description string `json:"description"`

	// Taxonomy
	TaxonID        int    `json:"taxon_id"`
	ScientificName string `json:"scientific_name"`
	CommonName     string `json:"common_name"`
	Organism       string `json:"organism"`

	// Sample source information
	Tissue    string `json:"tissue"`
	CellType  string `json:"cell_type"`
	CellLine  string `json:"cell_line"`
	Strain    string `json:"strain"`
	Sex       string `json:"sex"`
	Age       string `json:"age"`
	Disease   string `json:"disease"`
	Treatment string `json:"treatment"`

	// Geographic/environmental
	// All of these, including LatLon and CollectionDate, are plain strings.
	GeoLocName     string `json:"geo_loc_name"`
	LatLon         string `json:"lat_lon"`
	CollectionDate string `json:"collection_date"`
	EnvBiome       string `json:"env_biome"`
	EnvFeature     string `json:"env_feature"`
	EnvMaterial    string `json:"env_material"`

	// Links and attributes
	SampleLinks      string `json:"sample_links"`      // JSON array
	SampleAttributes string `json:"sample_attributes"` // JSON array

	// BioSample/BioProject references
	BiosampleAccession  string `json:"biosample_accession"`
	BioprojectAccession string `json:"bioproject_accession"`

	// Full metadata
	Metadata string `json:"metadata"` // JSON
}

Sample represents a comprehensive SRA sample record

type SamplePool

// SamplePool represents a pool/multiplex relationship between a parent
// sample and one member sample.
type SamplePool struct {
	PoolID       int     `json:"pool_id"`
	// ParentSample is the key DB.GetSamplePools looks relationships up by.
	ParentSample string  `json:"parent_sample"`
	MemberSample string  `json:"member_sample"`
	MemberName   string  `json:"member_name"`
	// NOTE(review): the scale of Proportion (fraction vs. percentage) is not
	// visible here — confirm against the writer.
	Proportion   float64 `json:"proportion"`
	ReadLabel    string  `json:"read_label"`
}

SamplePool represents a pool/multiplex relationship

type SampleResult

// SampleResult holds a single sample match from an FTS5 search, including
// its BM25 relevance score.
type SampleResult struct {
	SampleAccession string
	Description     string
	Organism        string
	ScientificName  string
	// Score is the BM25 relevance score of this match.
	Score           float64
}

SampleResult holds a single sample match from an FTS5 search, including its BM25 relevance score.

type Study

// Study represents a comprehensive SRA study record.
//
// Structured sub-documents are carried as JSON text in string fields, as
// noted on each field.
type Study struct {
	// Primary key
	StudyAccession string `json:"study_accession"`

	// NameGroup attributes
	Alias      string `json:"alias"`
	CenterName string `json:"center_name"`
	BrokerName string `json:"broker_name"`

	// Core fields
	StudyTitle        string `json:"study_title"`
	StudyType         string `json:"study_type"`
	StudyAbstract     string `json:"study_abstract"`
	StudyDescription  string `json:"study_description"`
	CenterProjectName string `json:"center_project_name"`

	// Dates
	// Each date is a pointer, so it may be nil.
	SubmissionDate *time.Time `json:"submission_date"`
	FirstPublic    *time.Time `json:"first_public"`
	LastUpdate     *time.Time `json:"last_update"`

	// Identifiers (JSON)
	PrimaryID    string `json:"primary_id"`
	SecondaryIDs string `json:"secondary_ids"` // JSON array
	ExternalIDs  string `json:"external_ids"`  // JSON array
	SubmitterIDs string `json:"submitter_ids"` // JSON array

	// Links and attributes (JSON)
	StudyLinks      string `json:"study_links"`      // JSON array
	StudyAttributes string `json:"study_attributes"` // JSON array
	RelatedStudies  string `json:"related_studies"`  // JSON array

	// Extracted organism
	Organism string `json:"organism"`

	// Full metadata
	Metadata string `json:"metadata"` // JSON
}

Study represents a comprehensive SRA study record

type Submission

// Submission represents a submission record with enhanced fields.
//
// Structured sub-documents are carried as JSON text in string fields, as
// noted on each field.
type Submission struct {
	SubmissionAccession  string     `json:"submission_accession"`
	Alias                string     `json:"alias"`
	CenterName           string     `json:"center_name"`
	BrokerName           string     `json:"broker_name"`
	LabName              string     `json:"lab_name"`
	Title                string     `json:"title"`
	// SubmissionDate is a pointer, so it may be nil.
	SubmissionDate       *time.Time `json:"submission_date"`
	SubmissionComment    string     `json:"submission_comment"`
	Contacts             string     `json:"contacts"`              // JSON array of contacts
	Actions              string     `json:"actions"`               // JSON array of actions
	SubmissionLinks      string     `json:"submission_links"`      // JSON array
	SubmissionAttributes string     `json:"submission_attributes"` // JSON array
	Metadata             string     `json:"metadata"`              // JSON
}

Submission represents a submission record with enhanced fields

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL