Skip to content

Commit

Permalink
Fixed problem with empty tblout/domtblout files
Browse files Browse the repository at this point in the history
  • Loading branch information
erikrikarddaniel committed Nov 6, 2020
1 parent cbc6b6c commit 81e55b5
Show file tree
Hide file tree
Showing 9 changed files with 58 additions and 50 deletions.
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = "1.9.11" %}
{% set version = "1.9.13" %}

package:
name: pfitmap-db
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
100 changes: 54 additions & 46 deletions src/R/pf-classify.r
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(feather))

SCRIPT_VERSION = "1.9.11"
SCRIPT_VERSION = "1.9.13"
ROWS_PER_SEQUENCE_TSV = 1e7

options(warn = 1)
Expand Down Expand Up @@ -187,25 +187,29 @@ accessions <- data.table(accno = character(), accto = character())

# Read all the tblout files
for ( tbloutfile in grep('\\.tblout', opt$args, value=TRUE) ) {
logmsg(sprintf("Reading %s", tbloutfile), 'DEBUG')
t = read_fwf(
tbloutfile, fwf_cols(content = c(1, NA)),
col_types = cols(content = col_character()),
comment='#'
) %>%
separate(
content,
c('accno', 't0', 'profile', 't1', 'evalue', 'score', 'bias', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'rest'),
'\\s+',
extra='merge',
convert = T
) %>%
as.data.table()
tblout <- funion(tblout, t[, .(accno, profile, evalue, score, bias)])
accessions <- funion(
accessions,
t[, { accno <- accno; accto <- sprintf("%s %s", accno, rest); .(accno = accno, accto = accto) }]
)
# Skip zero-byte tblout files; parse everything else and merge the rows into
# the running `tblout` and `accessions` tables.
#
# isTRUE() keeps the test a strict scalar TRUE/FALSE: for a path that does not
# exist, file.info()$size is NA, and a bare `if (NA)` would abort with
# "missing value where TRUE/FALSE needed". With isTRUE() such a path falls
# through to read_fwf(), which reports the actual problem.
#
# NOTE(review): a file that contains only '#' comment lines has a non-zero
# size and is therefore still parsed -- confirm that the zero-row result of
# that path unions cleanly with `tblout`.
if ( isTRUE(file.info(tbloutfile)$size == 0) ) {
  logmsg(sprintf("Skipping %s -- empty", tbloutfile), 'DEBUG')
} else {
  logmsg(sprintf("Reading %s", tbloutfile), 'DEBUG')
  # Each non-comment line is read as one string and then split on runs of
  # whitespace. extra = 'merge' stops splitting at the last name, so the
  # trailing text of the line stays together in `rest`.
  t <- read_fwf(
    tbloutfile, fwf_cols(content = c(1, NA)),
    col_types = cols(content = col_character()),
    comment = '#'
  ) %>%
    separate(
      content,
      c('accno', 't0', 'profile', 't1', 'evalue', 'score', 'bias', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'rest'),
      '\\s+',
      extra = 'merge',
      convert = TRUE
    ) %>%
    as.data.table()
  # Only the hit-level columns are kept for the tblout table.
  tblout <- funion(tblout, t[, .(accno, profile, evalue, score, bias)])
  # accto = "<accno> <rest>" for each row; it is split further downstream.
  accessions <- funion(
    accessions,
    t[, .(accno = accno, accto = sprintf("%s %s", accno, rest))]
  )
}
}

# Split the accto field (data.table magic I don't understand, see https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows/31514711#31514711)
Expand All @@ -230,32 +234,36 @@ domtblout <- data.table(

# Read all the domtblout files
for ( domtbloutfile in grep('\\.domtblout', opt$args, value=TRUE) ) {
logmsg(sprintf("Reading %s", domtbloutfile), 'DEBUG')
t <- read_fwf(
domtbloutfile, fwf_cols(content = c(1, NA)),
col_types = cols(content = col_character()),
comment='#'
) %>%
separate(
content,
c(
'accno', 't0', 'tlen', 'profile', 't1', 'qlen', 'evalue', 'score', 'bias', 'i', 'n',
'dom_c_evalue', 'dom_i_evalue', 'dom_score', 'dom_bias',
'hmm_from', 'hmm_to', 'ali_from', 'ali_to', 'env_from', 'env_to', 'acc', 'rest'
),
'\\s+',
extra='merge',
convert = T
) %>%
as.data.table()

domtblout <- funion(
domtblout,
t[, .(
accno, tlen, profile, qlen, i, n, dom_c_evalue, dom_i_evalue, dom_score, dom_bias,
hmm_from, hmm_to, ali_from, ali_to, env_from, env_to
)]
)
# Skip zero-byte domtblout files; parse everything else and merge the selected
# columns into the running `domtblout` table.
#
# isTRUE() keeps the test a strict scalar TRUE/FALSE: for a path that does not
# exist, file.info()$size is NA, and a bare `if (NA)` would abort with
# "missing value where TRUE/FALSE needed". With isTRUE() such a path falls
# through to read_fwf(), which reports the actual problem.
#
# NOTE(review): a file that contains only '#' comment lines has a non-zero
# size and is therefore still parsed -- confirm that the zero-row result of
# that path unions cleanly with `domtblout`.
if ( isTRUE(file.info(domtbloutfile)$size == 0) ) {
  logmsg(sprintf("Skipping %s -- empty", domtbloutfile), 'DEBUG')
} else {
  logmsg(sprintf("Reading %s", domtbloutfile), 'DEBUG')
  # Each non-comment line is read as one string and then split on runs of
  # whitespace. extra = 'merge' stops splitting at the last name, so the
  # trailing text of the line stays together in `rest`.
  t <- read_fwf(
    domtbloutfile, fwf_cols(content = c(1, NA)),
    col_types = cols(content = col_character()),
    comment = '#'
  ) %>%
    separate(
      content,
      c(
        'accno', 't0', 'tlen', 'profile', 't1', 'qlen', 'evalue', 'score', 'bias', 'i', 'n',
        'dom_c_evalue', 'dom_i_evalue', 'dom_score', 'dom_bias',
        'hmm_from', 'hmm_to', 'ali_from', 'ali_to', 'env_from', 'env_to', 'acc', 'rest'
      ),
      '\\s+',
      extra = 'merge',
      convert = TRUE
    ) %>%
    as.data.table()

  # Keep the per-domain columns; t0, t1, evalue, score, bias, acc and rest
  # are dropped here.
  domtblout <- funion(
    domtblout,
    t[, .(
      accno, tlen, profile, qlen, i, n, dom_c_evalue, dom_i_evalue, dom_score, dom_bias,
      hmm_from, hmm_to, ali_from, ali_to, env_from, env_to
    )]
  )
}
}

# Calculate lengths:
Expand Down
2 changes: 1 addition & 1 deletion src/R/pf-db2feather.r
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(stringr))

SCRIPT_VERSION = "1.9.8"
SCRIPT_VERSION = "1.9.13"

# Options for testing: opt <- list(options = list(gtdb = TRUE, verbose = TRUE, prefix='testing'), args = 'pf-classify.02.sqlite3')
# Get arguments
Expand Down
2 changes: 1 addition & 1 deletion src/R/pf-fasta-unique-taxon-protein.r
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))

SCRIPT_VERSION = "1.9.8"
SCRIPT_VERSION = "1.9.13"

# Testing arguments: opt <- list('options' = list('featherprefix' = 'pf-fasta-unique-taxon-protein.01', 'prank' = 'psubclass', 'trank' = 'tspecies'), args = c('pf-fasta-unique-taxon-protein.01.faa'))
# Get arguments
Expand Down
2 changes: 1 addition & 1 deletion src/R/pf-fetchseqs.r
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
suppressPackageStartupMessages(library(optparse))

# Arguments for testing: opt <- list(options = list(sqlitedb = 'pf-fetchseqs.07.original.sqlite3', fetch = TRUE, verbose = TRUE, sourcedbs = 'refseq,pdb', faalevel='pfamily', faadir='.'))
SCRIPT_VERSION = "1.9.8"
SCRIPT_VERSION = "1.9.13"

# Get arguments
option_list = list(
Expand Down

0 comments on commit 81e55b5

Please sign in to comment.