-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1.10: Separate hmm_mincov params for domains and proteins
- Loading branch information
1 parent
effc5af
commit 3f8502c
Showing
6 changed files
with
769 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
{% set version = "1.9.15" %} | ||
{% set version = "1.10" %} | ||
|
||
package: | ||
name: pfitmap-db | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,10 +6,8 @@ SQLITE_SELECT_CLASSIFICATION = sqlite3 [email protected] "select '--> accessions <--'; | |
SQLITE_SELECT_GTDB_CLASSIFICATION = sqlite3 $@.sqlite3 "SELECT '--> accessions <--'; SELECT accno, genome_accno, db FROM accessions ORDER BY accno, genome_accno, db; SELECT '--> proteins <--'; SELECT accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_FROM, hmm_to, ali_FROM, ali_to, env_FROM, env_to FROM proteins ORDER BY accno, profile; SELECT '--> hmm_profiles <--'; SELECT profile, psuperfamily, pfamily, pclass, psubclass, pgroup, prank, version, plen FROM hmm_profiles ORDER BY profile; SELECT '--> taxa <--'; SELECT genome_accno, ncbi_taxon_id, tdomain, tphylum, tclass, torder, tfamily, tgenus, tspecies, trank FROM taxa ORDER BY genome_accno; SELECT '--> dbsources <--'; SELECT source, name, version FROM dbsources; SELECT '--> domains <--'; SELECT accno, profile, score, evalue FROM domains ORDER BY accno, profile, score; SELECT '--> tblout <--'; SELECT * FROM tblout ORDER BY accno, profile; SELECT '--> domtblout <--'; SELECT * FROM domtblout ORDER BY accno, profile, i;" > $@.out | ||
SQLITE_SELECT_CLASSIFY_SEQUENCES = sqlite3 $@.sqlite3 "SELECT '--> sequences <--'; SELECT accno, sequence FROM sequences ORDER BY accno;" >> $@.out | ||
SQLITE_SELECT_FETCHSEQS = sqlite3 $@.sqlite3 "SELECT accno, sequence FROM sequences ORDER BY accno;" > $@.out | ||
#SQLITE_SELECT_CLASSIFICATION = sqlite3 [email protected] "select accno, accto, taxon, db from accessions order by accno, accto, taxon, db; select accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to from proteins order by accno, profile; select accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to from dupfree_proteins order by accno, profile; select accno_from, accno_to from dupfree_accessions order by accno, accto; select profile, psuperfamily, pfamily, pclass, psubclass, pgroup, prank, version, plen from hmm_profiles order by profile; select ncbi_taxon_id, tdomain, tkingdom, tphylum, tclass, torder, tfamily, tgenus, tspecies, taxon, trank from taxa order by ncbi_taxon_id; SELECT source, name, version FROM dbsources; SELECT accno, profile, i, n, dom_c_evalue, dom_i_evalue, dom_score, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to FROM domains ORDER BY accno, profile, i; SELECT * FROM tblout ORDER BY accno, profile; SELECT * FROM domtblout ORDER BY accno, profile, i;" > [email protected] | ||
|
||
DB2FEATHER_OUT = for f in $@.*.feather; do Rscript --default-packages=dplyr,feather -e "print(paste0('$$f', ': ', read_feather('$$f') %>% nrow(), ' rows'))"; done > $@.out | ||
#DB2FEATHER_CONTENT = for f in $@.*.feather; do echo "--> $$f <--"; Rscript --default-packages=dplyr,feather -e "options(width = 1e4); print(arrange(read_feather('$$f')), n = 10000, width = Inf)"; done > [email protected] | ||
DB2FEATHER_CONTENT = for f in $@.*.feather; do echo "--> $$f <--"; Rscript --default-packages=dplyr,feather -e "options(width = 1e4); as.data.frame(arrange(read_feather('$$f')))"; done > $@.out | ||
|
||
all: pf-classify.gtdb pf-classify pf-db2feather pf-fetchseqs | ||
|
@@ -19,7 +17,7 @@ clean: | |
|
||
all.sqlite: pf-classify.02 pf-classify.03 pf-classify.04 | ||
|
||
pf-classify.gtdb: pf-classify.gtdb.00 pf-classify.gtdb.01 pf-classify.gtdb.02 pf-classify.gtdb.03 pf-classify.gtdb.04 pf-classify.gtdb.05 pf-classify.gtdb.06 pf-classify.gtdb.07 pf-classify.gtdb.08 | ||
pf-classify.gtdb: pf-classify.gtdb.00 pf-classify.gtdb.01 pf-classify.gtdb.02 pf-classify.gtdb.03 pf-classify.gtdb.04 pf-classify.gtdb.05 pf-classify.gtdb.06 pf-classify.gtdb.07 pf-classify.gtdb.08 pf-classify.gtdb.09 | ||
|
||
pf-classify.ncbi: pf-classify.00 pf-classify.01 pf-classify.02 pf-classify.03 pf-classify.04 pf-classify.05 pf-classify.06 pf-classify.07 pf-classify.08 pf-classify.09 | ||
|
||
|
@@ -38,85 +36,92 @@ Nrd.test.tar.gz: | |
|
||
pf-classify.gtdb.00: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --singletable=$@.out --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --singletable=$@.out --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.01: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_GTDB_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.02: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_GTDB_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.03: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_GTDB_CLASSIFICATION) | ||
@$(SQLITE_SELECT_CLASSIFY_SEQUENCES) | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.04: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.05: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa.gz $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa.gz $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@$(CHECK) | ||
|
||
pf-classify.gtdb.06: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@$(CHECK) | ||
|
||
# File output with missing genomes | ||
pf-classify.gtdb.07: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa --missing=$@.missing $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa --missing=$@.missing $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@cat $@.missing >> $@.out | ||
@$(CHECK) | ||
|
||
# Short and other problematic proteins | ||
pf-classify.gtdb.08: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --hmm_mincov=0.1 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.1 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@$(CHECK) | ||
|
||
# Test the domain_hmm_mincov param | ||
pf-classify.gtdb.09: | ||
@rm -f $@.*.feather | ||
../R/pf-classify.r --domain_hmm_mincov=0.9 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout | ||
@$(DB2FEATHER_CONTENT) | ||
@$(CHECK) | ||
|
||
pf-classify.00: | ||
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(CHECK) | ||
|
||
pf-classify.01: | ||
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(CHECK) | ||
|
||
pf-classify.02: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
pf-classify.03: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
# Test the fuzzy_factor to produce a taxon reduced protein list | ||
pf-classify.04: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --fuzzy_factor=30 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --fuzzy_factor=30 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
|
@@ -126,25 +131,25 @@ pf-classify.05: | |
@$(CHECK) | ||
|
||
pf-classify.06: | ||
( ../R/pf-classify.r --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'dbsource is required' > $@.out ) | ||
( ../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'dbsource is required' > $@.out ) | ||
@$(CHECK) | ||
|
||
pf-classify.07: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.95 --protein_hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
# Thought I had problems with the 4CON structure... | ||
pf-classify.08: | ||
@rm -f $@.sqlite3 | ||
../R/pf-classify.r --hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
../R/pf-classify.r --domain_hmm_mincov=0.95 --protein_hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout | ||
@$(SQLITE_SELECT_CLASSIFICATION) | ||
@$(CHECK) | ||
|
||
# Check that the script fails when the hmm_profiles table is not unique on profile | ||
pf-classify.09: | ||
( ../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'The profile column in the hmm_profiles table .* needs to be unique' > $@.out ) | ||
( ../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'The profile column in the hmm_profiles table .* needs to be unique' > $@.out ) | ||
@$(CHECK) | ||
|
||
pf-db2feather.00: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pf-classify.gtdb.03.d |
Oops, something went wrong.