diff --git a/ChangeLog b/ChangeLog index f68f8035..692e1c38 100644 --- a/ChangeLog +++ b/ChangeLog @@ -10,12 +10,124 @@ # changes in development version since last release ################################################################################ +################################################################################ +################################################################################ + +################################################################################ +### version 3.1.2 +################################################################################ + +# IntaRNA +- `--outSep` = user-defined column separator for tabular CSV output +- bugfix non-overlapping suboptimal enumeration +- bugfix noLP optimization (missing case of direct left-stack extension) +- CSV output + - new ensemble energy and partition function output for intra-molecular + structures formed by seq1 and seq2 (`Eall1, Eall2, Zall1, Zall2`) + - new total energy output `Etotal` = (E+Eall1+Eall2) and + `EallTotal` = (Eall+Eall1+Eall2) + - new `RT` output + +# auxiliary R scripts +- plotRegions.py - visualization of sequence regions covered by IntaRNA + predictions, similar to the IntaRNA webserver output (thanks to @dgelsin) + +################################################################################ + +191030 Martin Raden + * IntaRNA/OutputHandlerCsv : + + RT + * string2list() : + + support of '*' encoding to generate full list + * bin/CommandLineParsing : + + docu and implementation of '*' outCsvCol behaviour + * IntaRNA/InteractionEnergyBasePair : + + computeIntraEall() : computes Eall1|2 via NussinovHandler + * getEall1|2() : call computeIntraEall if needed + * README.md + + RT CSV col + +191029 Martin Raden + * IntaRNA/PredictorMfe : + * getNextBest() : + * bugfix: energy check was applying duplicated ED values + (thanks to Jens Georg) + * IntaRNA/PredictorMfe2dSeedExtension : + * IntaRNA/PredictorMfe2dHeuristicSeedExtension : + * IntaRNA/PredictorMfeEns2dSeedExtension : + * fillHybrid*_left() : + * traceBack() : + * bugfix: missing case in noLP mode (direct left-stack extension) + * IntaRNA/AccessibilityVrna : + + addConstraints() : dedicated function to add constraint to VrnaFoldCompound + * fillByRNAplfold() : using addConstraints() + * IntaRNA/InteractionEnergy : + + getBoltzmannWeight( Z_type ) : conversion from kcal/mol-based energies + + getEall1|2() : ensemble energy for seq1|2 + * IntaRNA/InteractionEnergyBasePair : + + getEall1|2() : NOT IMPLEMENTED YET + * IntaRNA/InteractionEnergyIdxOffset : + + getEall1|2() : forward to wrapped energy handler + * IntaRNA/InteractionEnergyVrna : + + Eall1|2 : ensemble energies for seq1|2 + + getEall1|2() : lazy computation of Eall1|2 using computeIntraEall() + + computeIntraEall() : Eall* computation via vrna_pf() using the respective + accessibility constraints + * IntaRNA/OutputHandlerEnsemble : + - no output of Zall + + output of RT, Eall1, Eall2, EallTotal + * IntaRNA/OutputHandlerCsv : + + Eall1, Eall2, EallTotal, Etotal, Zall1, Zall2 + * README.md : + + docu of new CSV columns (Eall1, Eall2, Zall1, Zall2, Etotal, EallTotal) + * ensemble output docu updated + +191021 Martin Raden + + R/plotRegions.R : visualization of sequence regions covered by RRI + + R/README.md : plotRegions docu + * README.md : link to R/README.md + +191008 Martin Raden + * IntaRNA/OutputHandlerCsv : + + bpList output : list of base pairs + * README.md : + + bpList + + outSep + +191008 Martin Raden + * bin/CommandLineParsing : + + 'outSep' argument to set column separator for tabular CSV output + * outSep applied to PredictionTracker* + * IntaRNA/PredictionTracker* : + + explicit output column separator + * IntaRNA/OutputHandlerCsv : + * needsZall() : + * needBPs() : + - colSep argument : obsolete + +191007 Martin Raden + * bin/CommandLineParsing : + * bugfix accNoLP and accNoGUend checks for energy!=V + * IntaRNA/NussinovHandler : + * getQ() : + * fix: return 1 if (i==j) + * IntaRNA/AccessibilityVrna : + * construction() : + + bugfix: missing ED initialization for short sequences + * tests/AccessibilityBasePair : + + short sequence test + + tests/AccessibilityVrna : + + short sequence test to validate ED initialization + +################################################################################ +### version 3.1.1 +################################################################################ + # IntaRNA - base pairs details only computed if needed for output (speedup for large -n) - predefined parameter sets for loading (Turner04, Turner99, Andronescu07) - 'tRegion' and 'qRegion' now available for and applied to multi-sequence input -################################################################################ ################################################################################ 190924 Martin Raden diff --git a/R/README.md b/R/README.md new file mode 100644 index 00000000..0690fa97 --- /dev/null +++ b/R/README.md @@ -0,0 +1,35 @@ + +# Auxiliary R scripts of the IntaRNA package + + + +# `plotRegions.R` - Visualization of RRI-covered regions + +To visualize sequences' regions covered by RNA-RNA interactions predicted by +IntaRNA, you can use `plotRegions.R` by providing the following arguments (in +the given order) + +1. CSV-IntaRNA output file (semicolon separated) covering the columns `start,end,id` + with suffix `1` or `2` to plot target or query regions, respectively +2. `1` or `2` to select whether to plot target or query regions +3. output file name with a file-format-specific suffix from `.pdf`, `.png`, + `.svg`, `.eps`, `.ps`, `.jpeg`, `.tiff` + +An example is given below, when calling +```bash +Rscript --vanilla plotRegions.R pred.csv 1 regions.png +``` + +with `pred.csv` containing +``` +id1;start1;end1;id2;start2;end2 +b0001;266;273;query;116;123 +b0002;204;231;query;85;111 +b0003;229;262;query;96;125 +b0004;265;300;query;10;38 +b0005;281;295;query;5;22 +``` + +will produce the output +![plotRegions.R example](plotRegions.example.png) + diff --git a/R/plotRegions.R b/R/plotRegions.R new file mode 100644 index 00000000..3cb2b55e --- /dev/null +++ b/R/plotRegions.R @@ -0,0 +1,151 @@ +#!/usr/bin/env Rscript + +#################################################################### +# Visualization of sequence regions covered by RNA-RNA interactions +# predicted by IntaRNA. +# +# arguments: <1|2> +# 1 = ";"-separated CSV output of IntaRNA +# 2 <1|2> = suffix of "start,end,id" CSV cols to plot +# 3 = file name of the otuput figure suffixed +# by one of ".pdf",".png",".svg",".eps",".ps",".jpeg",".tiff" +# +# example call: +# +# Rscript --slave -f plotRegions.R --args pred.csv 1 regions.png +# +# This script is part of the IntaRNA source code package. See +# respective licence and documentation for further information. +# +# https://github.com/BackofenLab/IntaRNA +# +#################################################################### +# check and load dependencies +#################################################################### + +options(warn=-1) +suppressPackageStartupMessages(require(ggplot2)) +suppressPackageStartupMessages(require(ggalt)) +suppressPackageStartupMessages(require(cowplot)) # cowplot starts with a note +options(warn=0) + +theme_set(theme_cowplot()) + +#################################################################### +# get command line arguments +#################################################################### + +args = commandArgs(trailingOnly=TRUE) +# check and parse +if (length(args)!=3) { stop("call with <1|2> ", call.=FALSE) } + +intarnaOutputFile = args[1]; +if (!file.exists(intarnaOutputFile )) { stop("intarna-csv-output file '", intarnaOutputFile, "' does not exist!", call.=FALSE) } + +seqNr = args[2]; +if (seqNr != "1" && seqNr != "2") { stop("second call argument as to be '1' or '2' to specify which regions to plot"); } +# compile column names +id = paste("id",seqNr,sep=""); +start = paste("start",seqNr,sep=""); +end = paste("end",seqNr,sep=""); + +outFile = args[3]; +fileExtensions = c(".pdf",".png",".svg",".eps",".ps",".jpeg",".tiff"); +outFileExtOk = FALSE; +for( ext in fileExtensions ) { outFileExtOk = outFileExtOk || endsWith(outFile,ext); } +if ( !outFileExtOk ) {stop(" has to have one of the following file extensions ",paste(fileExtensions,sep=" "), call.=FALSE);} + +# if set to some x-position, this will trigger the plotting of a vertical line at that location +xVline = NA; +#xVline = 250; # DEBUG TEST VALUE + + +#################################################################### +# parse IntaRNA output +#################################################################### + +d = read.csv2( intarnaOutputFile ) +# check if all columns present +for( x in c(id,start,end)) { + if (!is.element(x, colnames(d))) { + stop("'",id,"' is not among the column names of '",intarnaOutputFile,"'", call.=FALSE); + } +} + +#################################################################### +# create count plot +#################################################################### + +allPos = c(); +for( i in 1:nrow(d) ) { + allPos = c( allPos, d[i,start]:d[i,end] ); +} +allPos = as.data.frame(allPos,ncol=1) +#allPos # DEBUG OUT + +coveragePlot = + ggplot( allPos, aes(x=allPos, stat(count))) + + geom_density() + + ylab("coverage") + + xlab("position") + + scale_y_continuous(position = "right", expand=expand_scale(mult = c(0, .02))) + + scale_x_continuous(expand = c(0, 0)) + + theme( axis.title.x=element_blank() ) + +if ( ! is.na(xVline)) { + coveragePlot = coveragePlot + + geom_vline(aes(xintercept=xVline)); +} + +#################################################################### +# create region plot +#################################################################### + +dRegion = data.frame() +dRegion[1:nrow(d),1:3] = d[,c(start,end,id)] +dRegion[1:nrow(d),4] = factor(sprintf("%08d",nrow(d):1)) +colnames(dRegion) = c("start","end","id","idx"); +#dRegion # DEBUG OUT + +yLabelScale = 0.6 # if you have to alter for more/less sequence IDs per inch; see below + +regionPlot = + ggplot(dRegion, aes(x=start,xend=end,y=idx)) + + geom_dumbbell(color="dodgerblue", size=2) + + xlab("position") + + scale_x_continuous( expand = c(0, 0) ) + + ylab("") + + scale_y_discrete(position = "right", breaks=dRegion$idx, labels=dRegion$id) + + geom_vline(aes(xintercept=min(allPos))) + + theme(panel.grid.major.y=element_line(size=0.7,color="lightgray") + , axis.text.y=element_text(size=rel(yLabelScale)) + ) + +if ( ! is.na(xVline)) { + regionPlot = regionPlot + + geom_vline(aes(xintercept=xVline)); +} + +#################################################################### +# plot to file +#################################################################### + +plotWidth = 6 +plotHeightDensity = 2 +plotHeight = plotHeightDensity + max(2,nrow(d)/9) +plotHeightDensityRel = plotHeightDensity / plotHeight + + +plot_grid( coveragePlot, regionPlot + , nrow=2, ncol=1 + , align = "hv" + , axis = "r" + , rel_heights= c( plotHeightDensityRel, 1.0-plotHeightDensityRel) +) + +ggsave( outFile + , width= plotWidth + , height= plotHeight +); + +#############################################################EOF diff --git a/R/plotRegions.example.png b/R/plotRegions.example.png new file mode 100644 index 00000000..5f6f080b Binary files /dev/null and b/R/plotRegions.example.png differ diff --git a/README.md b/README.md index 816f8220..bbdcbeb1 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,9 @@ The following topics are covered by this documentation: - [IntaRNAhelix - helix-based predictions](#IntaRNAhelix) - [IntaRNAexact - exact predictions like RNAup](#IntaRNAexact) - [IntaRNAduplex - hybrid-only optimization like RNAduplex](#IntaRNAduplex) + - [IntaRNAsTar - optimized for sRNA-target prediction](#IntaRNAsTar) + - [IntaRNAseed - identifys and reports seed interactions only](#IntaRNAseed) + - [IntaRNAens - ensemble-based prediction and partition function computation](#IntaRNAens) - [How to constrain predicted interactions](#constraintSetup) - [Interaction restrictions](#interConstr) - [Seed constraints](#seed) @@ -116,6 +119,8 @@ The following topics are covered by this documentation: - [Constrain regions to be accessible or blocked](#accConstraints) - [Read/write accessibility from/to file or stream](#accFromFile) - [Library for integration in external tools](#lib) +- [Auxiliary R scripts for output visualization etc.](/R) +- [Auxiliary python scripts for IntaRNA-based pipelines](/python)



@@ -849,8 +854,8 @@ Both personalities make use of the slower `--model=S` [prediction strategy](#interactionModel-ssUnconstraintMfe) first introduced for IntaRNA version 1.0. -Note, since IntaRNA v1.* used the old `Turner99` energy parameters, you also -have to provide the respective [energy file](#energy) when using *IntaRNA1*. +Note, IntaRNA v1.* used the old `Turner99` energy parameters, such that the +`--energyVRNA=Turner99` [energy set](#energy) is preset, when using *IntaRNA1*. [![up](doc/figures/icon-up.28.png) back to overview](#overview) @@ -874,7 +879,7 @@ In contrast to RNAup, it also - enforces [seed constraints](#seed), - uses a much faster [seed-extension-based computation model](#predModel), -- allows longer interaction length (RNAup restricts ot length 25), and +- allows longer interaction length, and - enables much more flexible output options. If needed, you can disables these features. @@ -1326,7 +1331,9 @@ target;5;16;query;5;16;ACCCCCGGUGGU&ACCCCCGGUGGU;(((.((((.(((&))).)))).)));-6.76 target;6;16;query;5;15;CCCCCGGUGGU&ACCCCCGGUGG;((.((((.(((&))).)))).));-5.56 target;7;16;query;5;15;CCCCGGUGGU&ACCCCCGGUGG;((((((.(((&))).)))).));-5.55 ``` -For each prediction, a row in the CSV is generated. +For each prediction, a row in the CSV is generated. The column separator within +the tabular CSV output can be changed using `--outSep`, e.g. to produce tab-separated +`.tsv` output. Using the argument `--outCsvCols`, the user can specify what columns are printed to the output using a comma-separated list of colIds. Available colIds @@ -1348,6 +1355,7 @@ are - `hybridDPfull` : hybrid in VRNA dot-bracket notation (full sequence length) - `hybridDB` : hybrid in dot-bar notation (interactin sites only) - `hybridDBfull` : hybrid in dot-bar notation (full sequence length) +- `bpList` : list of hybrid base pairs, e.g. '(4,3):(5,2):(7,1)' - `E` : overall interaction energy - `ED1` : ED value of seq1 - `ED2` : ED value of seq2 @@ -1373,9 +1381,16 @@ are - `seedED2` : ED value of seq2 of the seed only (excluding rest) (* see below) - `seedPu1` : probability of seed region to be accessible for seq1 (* see below) - `seedPu2` : probability of seed region to be accessible for seq2 (* see below) -- `Eall` : ensemble energy of all considered interactions (-RT*log(Zall)) +- `Eall` : ensemble energy of all considered interactions (-RT*log(`Zall`)) - `Zall` : partition function of all considered interactions +- `Eall1` : ensemble energy of all considered intra-molecular structures of seq1 (given its accessibility constraints) +- `Eall2` : ensemble energy of all considered intra-molecular structures of seq2 (given its accessibility constraints) +- `EallTotal` : total ensemble energy of all considered interactions including the ensemble energies of intra-molecular structure formation (`Eall+Eall1+Eall2`) +- `Etotal` : total energy of an interaction including the ensemble energies of intra-molecular structure formation (`E+Eall1+Eall2`) +- `Zall1` : partition function represented by `Eall1` (exp(-`Eall1`/RT)) +- `Zall2` : partition function represented by `Eall2` (exp(-`Eall2`/RT)) - `P_E` : probability of an interaction (site) within the considered ensemble +- `RT` : normalized temperature used for Boltzmann weight computation (*) Note, since an interaction can cover more than one seed, all `seed*` columns might contain multiple entries separated by ':' symbols. In order to print only @@ -1385,7 +1400,7 @@ to the call. Note further, `Pu` values are recomputed from rounded `ED` values and are thus not equal to the RNAplfold values from which the ED values are derived from! -Using `--outCsvCols ''`, all available columns are added to the output. +Using `--outCsvCols '*'`, all available columns are added to the output. Energies are provided in unit *kcal/mol*, probabilities in the interval [0,1]. Position annotations start indexing with 1 (or the values set via `--qIdxPos0` and `--tIdxPos0`). @@ -1438,8 +1453,11 @@ column labels introduced for the CVS output: - `id1` : id of first sequence (target) - `id2` : id of second sequence (query) -- `Zall` : partition function of all considered interactions +- `RT`: the scaled temperature used for Boltzmann-weight computation - `Eall` : ensemble energy of all considered interactions (-RT*log(Zall)) +- `Eall1` : ensemble energy of all considered intra-molecular structures of seq1 (given its accessibility constraints) +- `Eall2` : ensemble energy of all considered intra-molecular structures of seq2 (given its accessibility constraints) +- `EallTotal` : total ensemble energy of all considered interactions including the ensemble energies of intra-molecular structure formation (`Eall+Eall1+Eall2`) Note, `Zall` depends on the selected [prediction mode](#predModes) and @@ -1448,7 +1466,7 @@ It holds `Zall(--model=S) <= Zall(--model=P)` as well as `Zall(--mode=H) <= Zall(--mode=M)`. Thus, most accurate results are computed using ``` -IntaRNA --model=P --mode=M --out=E ... +IntaRNA --model=P --mode=M --outMode=E ... ``` @@ -1633,12 +1651,17 @@ targeted file/stream name: - `qAcc:`/`tAcc:` the [query/target's ED accessibility values](#accessibility) (RNAplfold-like format), respectively - `qPu:`/`tPu:` the [query/target's unpaired probabilities](#accessibility) (RNAplfold format; rounded!!), respectively -Note, for *multiple sequences* in FASTA input, the provided file names -are suffixed with `-q#t#` (where `#` denotes the according sequence number -within the input where indexing starts with 1 or the values set via `--qIdxPos0` and `--tIdxPos0`). +Note, for *multiple sequences* in FASTA input, the provided file names are suffixed by +- `-t#q#` : for `*spotProb` and `*minE` output, and +- `-s#` : for `*Acc` and `*Pu` output, +where `#` denotes the according target/query sequence number +within the input where numbering starts with 1. + +The column separator within tabular CSV output (defaulting to `;`) can be changed +using `--outSep`, e.g. to produce tab-separated `.tsv` output. Note further, `qPu:`|`tPu:` will report unpaired probability values based on rounded accessibility (ED) values. -Thus, these values will most likely differ from values eg. produced by RNAplfold. +Thus, these values will most likely differ from values eg. produced by the program RNAplfold. We therefore strongly recommend to store `qAcc:`|`tAcc:` values when you want to use them as input for subsequent IntaRNA calls! @@ -2002,9 +2025,9 @@ IntaRNA [..] --out=tAcc:STDOUT | IntaRNA [..] --tAcc=E --tAccFile=STDIN Note, for *multiple sequences* in FASTA input, one can also load the accessibilities (for *all* sequencces) from file. To this end, the file names -have to be prefixed with with `s` and the according sequence's number (where indexing +have to be suffixed with with `-s#`, where `#` denotes the respective sequence's number (where indexing starts with 1) within the FASTA input using a common suffix after the index. -This suffix is to be provided to the according `--?AccFile` argument. +The file name without `-s#` is to be provided to the according `--?AccFile` argument. The files generated by `--out=?Acc:...` are already conform to this requirement, such that you can use the use case examples from above also for multi-sequence FASTA input. diff --git a/configure.ac b/configure.ac index 54fb2c9e..44fe1fcf 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ AC_PREREQ([2.65]) # 5 argument version only available with aclocal >= 2.64 -AC_INIT([IntaRNA], [3.1.1], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) +AC_INIT([IntaRNA], [3.1.2], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) # minimal required version of the boost library BOOST_REQUIRED_VERSION=1.50.0 diff --git a/doc/figures/interaction-context.svg b/doc/figures/interaction-context.svg index ad951342..f382cc51 100644 --- a/doc/figures/interaction-context.svg +++ b/doc/figures/interaction-context.svg @@ -24,9 +24,9 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="1.8101934" - inkscape:cx="1032.5094" - inkscape:cy="186.33596" + inkscape:zoom="0.9050967" + inkscape:cx="541.81219" + inkscape:cy="-121.09379" inkscape:document-units="px" inkscape:current-layer="layer1" showgrid="false" @@ -36,9 +36,9 @@ fit-margin-left="0" fit-margin-right="0" fit-margin-bottom="0" - inkscape:window-width="1266" - inkscape:window-height="746" - inkscape:window-x="92" + inkscape:window-width="1820" + inkscape:window-height="1178" + inkscape:window-x="1458" inkscape:window-y="-8" inkscape:window-maximized="1" /> + y="437.22537" /> + + + + - - + + + + + + + + + + + + @@ -672,18 +779,6 @@ y="370.66202" style="font-size:24px;font-weight:bold;-inkscape-font-specification:DejaVu Sans Bold">e - e h h + n diff --git a/src/IntaRNA/AccessibilityVrna.cpp b/src/IntaRNA/AccessibilityVrna.cpp index fcd1b2cb..100c3396 100644 --- a/src/IntaRNA/AccessibilityVrna.cpp +++ b/src/IntaRNA/AccessibilityVrna.cpp @@ -56,6 +56,13 @@ AccessibilityVrna::AccessibilityVrna( , (plFoldW==0? getSequence().size() : std::min(plFoldW,getSequence().size())) , getAccConstraint().getMaxBpSpan() ); + } else { + // init ED values for short sequences + for (auto row = edValues.begin1(); row != edValues.end1(); row++) { + for (auto ed = row.begin(); ed != row.end(); ed++) { + *ed = 0; + } + } } } @@ -115,6 +122,58 @@ callbackForStorage(FLT_OR_DBL *pr, } +/////////////////////////////////////////////////////////////////////////////// + +/** + * Adds constraints to the VRNA fold compound that correspond to the present + * AccessibilityConstraints + * @param fold_compound INOUT the object to extend + */ +void +AccessibilityVrna:: +addConstraints( vrna_fold_compound_t & fold_compound, const Accessibility & acc ) +{ + // setup folding constraints + if ( ! acc.getAccConstraint().isEmpty() ) { + + const int length = (int)acc.getSequence().size(); + + // copy structure constraint + char * structure = structure = (char *) vrna_alloc(sizeof(char) * (length + 1)); + for (int i=0; iclear(); } // copy data - *seed = *(toCopy.seed); + seed->insert(toCopy.seed->begin(), toCopy.seed->end()); } else { // remove seed information if present INTARNA_CLEANUP(seed); diff --git a/src/IntaRNA/InteractionEnergy.h b/src/IntaRNA/InteractionEnergy.h index 7eeab883..53bbfd17 100644 --- a/src/IntaRNA/InteractionEnergy.h +++ b/src/IntaRNA/InteractionEnergy.h @@ -122,11 +122,12 @@ class InteractionEnergy { , const E_type hybridE ) const; /** - * Provides the ensemble energy for a given partition function Z. + * Provides the ensemble energy (in internal energy representation) + * for a given partition function Z. * * @param Z the ensemble's partition function to convert * - * @return E = -RT * log( Z ) + * @return E = -RT * log( Z ) in internal energy representation */ virtual E_type @@ -480,13 +481,22 @@ class InteractionEnergy { /** * Provides the Boltzmann weight for a given energy. - * @param energy the energy the Boltzmann weight is to be computed for + * @param energy the energy (internal representation) the Boltzmann weight is to be computed for * @return the Boltzmann weight, i.e. exp( - energy / RT ); */ virtual Z_type getBoltzmannWeight( const E_type energy ) const ; + /** + * Provides the Boltzmann weight for a given energy. + * @param energy the energy (in kcal/mol) the Boltzmann weight is to be computed for + * @return the Boltzmann weight, i.e. exp( - energy / RT ); + */ + virtual + Z_type + getBoltzmannWeight( const Z_type energy ) const ; + /** * Provides the base pair encoding for the given indices. @@ -553,6 +563,23 @@ class InteractionEnergy { bool isInternalLoopGUallowed() const; + /** + * Provides the overall ensemble energy for sequence 1 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq1) + */ + virtual + E_type + getEall1() const = 0; + + /** + * Provides the overall ensemble energy for sequence 2 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq2) + */ + virtual + E_type + getEall2() const = 0; protected: @@ -761,6 +788,17 @@ getBoltzmannWeight( const E_type e ) const //////////////////////////////////////////////////////////////////////////// +inline +Z_type +InteractionEnergy:: +getBoltzmannWeight( const Z_type e ) const +{ + // TODO can be optimized when using exp-energies from VRNA + return Z_exp( - e / getRT() ); +} + +//////////////////////////////////////////////////////////////////////////// + inline Interaction::BasePair InteractionEnergy:: @@ -954,7 +992,6 @@ getE( const Z_type Z ) const return Z_2_E( - getRT() * Z_log( Z ) ); } - //////////////////////////////////////////////////////////////////////////// inline diff --git a/src/IntaRNA/InteractionEnergyBasePair.cpp b/src/IntaRNA/InteractionEnergyBasePair.cpp index 1dc49e3a..68b84e5a 100644 --- a/src/IntaRNA/InteractionEnergyBasePair.cpp +++ b/src/IntaRNA/InteractionEnergyBasePair.cpp @@ -3,6 +3,8 @@ namespace IntaRNA { +//////////////////////////////////////////////////////////////////////////// + void InteractionEnergyBasePair::computeES(const RnaSequence &seq, NussinovHandler::E2dMatrix &logQ) { const size_t N = seq.size(); @@ -30,4 +32,68 @@ void InteractionEnergyBasePair::computeES(const RnaSequence &seq, } } +//////////////////////////////////////////////////////////////////////////// + +inline +E_type +InteractionEnergyBasePair:: +getEall1() const +{ + // compute Z if needed + if (E_isINF(Eall1)) { + Eall1 = computeIntraEall( accS1 ); + } + return Eall1; +} + +//////////////////////////////////////////////////////////////////////////// + +inline +E_type +InteractionEnergyBasePair:: +getEall2() const +{ + // compute Z if needed + if (E_isINF(Eall2)) { + Eall2 = computeIntraEall( accS2.getAccessibilityOrigin() ); + } + return Eall2; +} + +//////////////////////////////////////////////////////////////////////////// + +E_type +InteractionEnergyBasePair:: +computeIntraEall( const Accessibility & acc ) const +{ + if ( !acc.getAccConstraint().isEmpty() ) { + INTARNA_NOT_IMPLEMENTED("InteractionEnergyBasePair: accessibility constraints for ensemble energy computation not supported"); + } + + const size_t N = acc.getSequence().size(); + + // check if any base pair can be formed + if (N < minLoopLength) { + return 0; + } + + // create temporary matrices for ED computation + NussinovHandler::Z2dMatrix Q(N, N); + NussinovHandler::Z2dMatrix Qb(N, N); + + // init temporary matrices + for (size_t i = 0u; i < N; ++i) { + for (size_t j = i; j < N; ++j) { + Q(i, j) = -1.0; + Qb(i, j) = -1.0; + } + } + + // compute partition function and convert to ensemble energy + return getE( NussinovHandler::getQ( 0, N-1, acc.getSequence(), basePairWeight, minLoopLength, Q, Qb) ); + +} + +//////////////////////////////////////////////////////////////////////////// + } // namespace IntaRNA diff --git a/src/IntaRNA/InteractionEnergyBasePair.h b/src/IntaRNA/InteractionEnergyBasePair.h index 1ae48695..27804aeb 100644 --- a/src/IntaRNA/InteractionEnergyBasePair.h +++ b/src/IntaRNA/InteractionEnergyBasePair.h @@ -242,29 +242,60 @@ class InteractionEnergyBasePair: public InteractionEnergy { E_type getE_basePair() const; + /** + * Provides the overall ensemble energy for sequence 1 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq1) + */ + virtual + E_type + getEall1() const; + + /** + * Provides the overall ensemble energy for sequence 2 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq2) + */ + virtual + E_type + getEall2() const; + private: //! energy of an individual base pair - const E_type basePairEnergy; + const E_type basePairEnergy; //! temperature constant for normalization - const Z_type RT; - //! Boltzmann Energy weight - const Z_type basePairWeight; - //! minimum length of loops within intramolecular structures (for ES values etc.) - const size_t minLoopLength; - - //! ES values for seq1 - NussinovHandler::E2dMatrix logQ1; - //! ES values for seq2 - NussinovHandler::E2dMatrix logQ2; - - /*** - * Compute the ES for a given sequence and store it in the given lookup table. - * @param seq RNASequence - * @param logQ The resulting lookuptable - */ - void computeES(const RnaSequence &seq, NussinovHandler::E2dMatrix &logQ); + const Z_type RT; + //! Boltzmann Energy weight + const Z_type basePairWeight; + //! minimum length of loops within intramolecular structures (for ES values etc.) + const size_t minLoopLength; + //! ensemble energy of seq1 (lazy computation if needed) + mutable E_type Eall1; + //! ensemble energy of seq2 (lazy computation if needed) + mutable E_type Eall2; + + //! ES values for seq1 + NussinovHandler::E2dMatrix logQ1; + //! ES values for seq2 + NussinovHandler::E2dMatrix logQ2; + + /*** + * Compute the ES for a given sequence and store it in the given lookup table. + * @param seq RNASequence + * @param logQ The resulting lookuptable + */ + void computeES(const RnaSequence &seq, NussinovHandler::E2dMatrix &logQ); + + /** + * Computes the ensemble energy of all intra-molecular structures that + * are conform to the accessibility constraints + * @param acc the Accessibility object to access sequence and constraints + * @return the computed ensemble energy + */ + E_type + computeIntraEall( const Accessibility & acc ) const; }; @@ -296,6 +327,8 @@ InteractionEnergyBasePair::InteractionEnergyBasePair( basePairEnergy(bpEnergy), minLoopLength(minLoopLen), basePairWeight(Z_exp(E_2_Z(-bpEnergy) / _RT)), + Eall1(E_INF), + Eall2(E_INF), logQ1(), logQ2() { diff --git a/src/IntaRNA/InteractionEnergyIdxOffset.h b/src/IntaRNA/InteractionEnergyIdxOffset.h index e81ece8c..1d0c7b69 100644 --- a/src/IntaRNA/InteractionEnergyIdxOffset.h +++ b/src/IntaRNA/InteractionEnergyIdxOffset.h @@ -469,6 +469,24 @@ class InteractionEnergyIdxOffset : public InteractionEnergy bool isValidInternalLoop( const size_t i1, const size_t j1, const size_t i2, const size_t j2 ) const; + /** + * Provides the overall ensemble energy for sequence 1 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq1) + */ + virtual + E_type + getEall1() const; + + /** + * Provides the overall ensemble energy for sequence 2 + * given its accessibility constraints + * @return Eall(constraint-conform intra-molecular structures for seq2) + */ + virtual + E_type + getEall2() const; + protected: @@ -869,7 +887,26 @@ isValidInternalLoop( const size_t i1, const size_t j1, const size_t i2, const si { return energyOriginal.isValidInternalLoop( i1+offset1, j1+offset1, i2+offset2, j2+offset2 ); } - + +//////////////////////////////////////////////////////////////////////////// + +inline +E_type +InteractionEnergyIdxOffset:: +getEall1() const +{ + return energyOriginal.getEall1(); +} + +//////////////////////////////////////////////////////////////////////////// + +inline +E_type +InteractionEnergyIdxOffset:: +getEall2() const +{ + return energyOriginal.getEall2(); +} //////////////////////////////////////////////////////////////////////////// diff --git a/src/IntaRNA/InteractionEnergyVrna.cpp b/src/IntaRNA/InteractionEnergyVrna.cpp index cd152821..bbd6f36c 100644 --- a/src/IntaRNA/InteractionEnergyVrna.cpp +++ b/src/IntaRNA/InteractionEnergyVrna.cpp @@ -1,5 +1,6 @@ #include "IntaRNA/InteractionEnergyVrna.h" +#include "IntaRNA/AccessibilityVrna.h" #include #include @@ -10,9 +11,12 @@ extern "C" { #include #include #include + #include #include } + + namespace IntaRNA { //////////////////////////////////////////////////////////////////////////// @@ -38,6 +42,8 @@ InteractionEnergyVrna::InteractionEnergyVrna( , bpGC( BP_pair[RnaSequence::getCodeForChar('G')][RnaSequence::getCodeForChar('C')] ) , esValues1(NULL) , esValues2(NULL) + , Eall1(E_INF) + , Eall2(E_INF) { vrna_md_defaults_reset( &foldModel ); @@ -160,5 +166,41 @@ computeES( const Accessibility & acc, InteractionEnergyVrna::EsMatrix & esToFill //////////////////////////////////////////////////////////////////////////// +E_type +InteractionEnergyVrna:: +computeIntraEall( const Accessibility & acc ) const +{ + + vrna_md_t curModel = foldModel; + // avoid computation of base pair probabilities + curModel.compute_bpp = 0; + + const int length = acc.getSequence().size(); + + // copy sequence into C data structure + char * sequence = (char *) vrna_alloc(sizeof(char) * (length + 1)); + for (int i=0; i j || j >= seq.size()) { + if (i >= j || j >= seq.size()) { return 1.0; } Z_type &ret = Q(i, j); @@ -156,19 +156,20 @@ NussinovHandler::getPu(const size_t i, const size_t j, const RnaSequence &seq, if (i > j || j >= seq.size()) { return 0.0; } - Z_type &ret = Pu(i, j); + // referencing cell access to enable Pu update if needed + Z_type &curPu = Pu(i, j); // If value is already computed, return it - if (ret > -0.5) { - return ret; + if (curPu > -0.5) { + return curPu; } // Else compute Pu - ret = ((i>0?getQ(0, i - 1, seq, bpWeight, minLoopLength, Q, Qb):1.0) * + curPu = ((i>0?getQ(0, i - 1, seq, bpWeight, minLoopLength, Q, Qb):1.0) * getQ(j + 1, seq.size() - 1, seq, bpWeight, minLoopLength, Q, Qb) / getQ(0, seq.size() - 1, seq, bpWeight, minLoopLength, Q, Qb)); for (size_t p = 0; p < i; ++p) { for (size_t q = j + 1; q < seq.size(); ++q) { if (p+minLoopLengthgetId(), colSep, "_"); break; case id2: // ensure no colSeps are contained - outTmp <getId(), colSep, "_"); break; case seq1: - outTmp <asString(); break; case seq2: - outTmp <asString(); break; case subseq1: - outTmp <asString().substr(i1, j1-i1+1); break; case subseq2: - outTmp <asString().substr(j2, i2-j2+1); break; case subseqDP: - outTmp <asString().substr(i1, j1-i1+1) <<'&' - <asString().substr(j2, i2-j2+1); break; case subseqDB: outTmp <getInOutIndex(i1) - <asString().substr(i1, j1-i1+1) <<'&' <getInOutIndex(j2) - <asString().substr(j2, i2-j2+1); break; case start1: @@ -188,10 +192,25 @@ add( const Interaction & i ) outTmp <getInOutIndex(bp->first) <<',' <getInOutIndex(bp->second) <<')' ; + for( ++bp; bp != i.basePairs.end(); bp++ ) { + outTmp << listSep <<'(' <getInOutIndex(bp->first) <<',' <getInOutIndex(bp->second) <<')'; + }; + break;} + case E: outTmp <(i) ) == colType2string.end() ) { @@ -210,6 +226,9 @@ class OutputHandlerCsv : public OutputHandler * Converts string representation of a list of Coltypes to the respective * list. * + * If the provided string encoding is empty or equals '*', all available + * columns are returned + * * @param listString the string representation of the list * @return the parsed list * @@ -222,24 +241,22 @@ class OutputHandlerCsv : public OutputHandler /** * Checks whether or not Zall needs to be computed to generate all colTypes * @param colTypes the list of column types to consider - * @param colSep the column separator to be used * @return true if one of the colTypes requires Zall computation; * false otherwise. */ static bool - needsZall( const ColTypeList & colTypes, const std::string & colSep = ";" ); + needsZall( const ColTypeList & colTypes ); /** * Checks whether or not interaction base pairs are needed to generate all colTypes * @param colTypes the list of column types to consider - * @param colSep the column separator to be used * @return true if one of the colTypes requires Zall computation; * false otherwise. */ static bool - needBPs( const ColTypeList & colTypes, const std::string & colSep = ";" ); + needBPs( const ColTypeList & colTypes ); /** * Generates the header line for a given list of columns @@ -249,7 +266,7 @@ class OutputHandlerCsv : public OutputHandler */ static std::string - getHeader( const ColTypeList & colTypes, const std::string& colSep = ";" ); + getHeader( const ColTypeList & colTypes, const std::string& colSep ); protected: @@ -322,7 +339,7 @@ string2list( const std::string & stringEncoding ) ColTypeList list; // empty string : give full list - if (stringEncoding.empty()) { + if (stringEncoding.empty() || stringEncoding == "*") { // generate list of all types for (size_t c = 0; c < (size_t)ColTypeNumber; c++) { list.push_back( static_cast(c) ); @@ -362,12 +379,13 @@ string2list( const std::string & stringEncoding ) inline bool OutputHandlerCsv:: -needsZall( const ColTypeList & colTypes, const std::string & colSep ) +needsZall( const ColTypeList & colTypes ) { for (auto it = colTypes.begin(); it != colTypes.end(); it++ ) { // check if type requires Zall computation switch ( *it ) { case Eall: + case EallTotal: case Zall: case P_E: return true; @@ -381,7 +399,7 @@ needsZall( const ColTypeList & colTypes, const std::string & colSep ) inline bool OutputHandlerCsv:: -needBPs( const ColTypeList & colTypes, const std::string & colSep ) +needBPs( const ColTypeList & colTypes ) { for (auto it = colTypes.begin(); it != colTypes.end(); it++ ) { // check if type requires Zall computation @@ -390,6 +408,7 @@ needBPs( const ColTypeList & colTypes, const std::string & colSep ) case hybridDBfull: case hybridDP: case hybridDPfull: + case bpList: return true; } } diff --git a/src/IntaRNA/OutputHandlerEnsemble.cpp b/src/IntaRNA/OutputHandlerEnsemble.cpp index 9b04b69e..d1977dc6 100644 --- a/src/IntaRNA/OutputHandlerEnsemble.cpp +++ b/src/IntaRNA/OutputHandlerEnsemble.cpp @@ -37,10 +37,16 @@ OutputHandlerEnsemble:: outTmp <<"id1 " < "nt_index" ... starting with index 1 out <<"minE"; for (size_t j=rna2Str.size(); j-- > 0; ) { - out <<';' < 0; ) { // out separator - out <<';'; + out < E2dMatrix; @@ -112,13 +119,15 @@ class PredictionTrackerPairMinE: public PredictionTracker * @param pairMinE the minE data to write * @param energy the energy function used * @param E_INF_string the string to be used for E_INF entries + * @param sep_string the output string representation of column separators */ static void writeData( std::ostream &out , const E2dMatrix & pairMinE , const InteractionEnergy & energy - , const std::string & E_INF_string ); + , const std::string & E_INF_string + , const std::string & sep_string ); }; diff --git a/src/IntaRNA/PredictionTrackerProfileMinE.cpp b/src/IntaRNA/PredictionTrackerProfileMinE.cpp index d1d5d2c6..8467a233 100644 --- a/src/IntaRNA/PredictionTrackerProfileMinE.cpp +++ b/src/IntaRNA/PredictionTrackerProfileMinE.cpp @@ -10,7 +10,8 @@ PredictionTrackerProfileMinE( const InteractionEnergy & energy , const std::string & seq1streamName , const std::string & seq2streamName - , const std::string E_INF_string + , const std::string & E_INF_string + , const std::string & sep_string ) : PredictionTracker() , energy(energy) @@ -18,9 +19,16 @@ PredictionTrackerProfileMinE( , seq1stream(NULL) , seq2stream(NULL) , E_INF_string(E_INF_string) + , sep_string(sep_string) , seq1minE( seq1streamName.empty() ? 0 : energy.size1(), E_INF ) // init E_INF , seq2minE( seq2streamName.empty() ? 0 : energy.size2(), E_INF ) // init E_INF { +#if INTARNA_IN_DEBUG_MODE + // check separator + if (sep_string.empty()) { + throw std::runtime_error("PredictionTrackerProfileMinE() empty separator provided"); + } +#endif // open streams if (!seq1streamName.empty()) { seq1stream = newOutputStream( seq1streamName ); @@ -47,7 +55,8 @@ PredictionTrackerProfileMinE( const InteractionEnergy & energy , std::ostream * seq1stream , std::ostream * seq2stream - , const std::string E_INF_string + , const std::string & E_INF_string + , const std::string & sep_string ) : PredictionTracker() , energy(energy) @@ -55,9 +64,16 @@ PredictionTrackerProfileMinE( , seq1stream(seq1stream) , seq2stream(seq2stream) , E_INF_string(E_INF_string) + , sep_string(sep_string) , seq1minE( seq1stream==NULL ? 0 : energy.size1(), E_INF ) // init E_INF , seq2minE( seq2stream==NULL ? 0 : energy.size2(), E_INF ) // init E_INF { +#if INTARNA_IN_DEBUG_MODE + // check separator + if (sep_string.empty()) { + throw std::runtime_error("PredictionTrackerProfileMinE() empty separator provided"); + } +#endif } ////////////////////////////////////////////////////////////////////// @@ -71,14 +87,18 @@ PredictionTrackerProfileMinE:: , seq1minE.begin() , seq1minE.end() , energy.getAccessibility1().getSequence() - , E_INF_string ); + , E_INF_string + , sep_string + ); } if (seq2stream != NULL) { writeProfile( *seq2stream , seq2minE.begin() , seq2minE.end() , energy.getAccessibility2().getAccessibilityOrigin().getSequence() - , E_INF_string ); + , E_INF_string + , sep_string + ); } // clean up if file pointers were created in constructor diff --git a/src/IntaRNA/PredictionTrackerProfileMinE.h b/src/IntaRNA/PredictionTrackerProfileMinE.h index e821cd31..f8747456 100644 --- a/src/IntaRNA/PredictionTrackerProfileMinE.h +++ b/src/IntaRNA/PredictionTrackerProfileMinE.h @@ -37,12 +37,14 @@ class PredictionTrackerProfileMinE: public PredictionTracker * if empty, no data for seq2 is collected * @param E_INF_string the output string representation of E_INF values in * the profile output + * @param sep the column separator to be used in profile output */ PredictionTrackerProfileMinE( const InteractionEnergy & energy , const std::string & seq1streamName , const std::string & seq2streamName - , const std::string E_INF_string = "NA" + , const std::string & E_INF_string = "NA" + , const std::string & sep = ";" ); /** @@ -56,12 +58,14 @@ class PredictionTrackerProfileMinE: public PredictionTracker * is to be written to; if NULL, no data for seq2 is collected * @param E_INF_string the output string representation of E_INF values in * the profile output + * @param sep the column separator to be used in profile output */ PredictionTrackerProfileMinE( const InteractionEnergy & energy , std::ostream * seq1stream , std::ostream * seq2stream - , const std::string E_INF_string = "NA" + , const std::string & E_INF_string = "NA" + , const std::string & sep = ";" ); /** @@ -104,6 +108,9 @@ class PredictionTrackerProfileMinE: public PredictionTracker //! the output string representation of E_INF values in the profile output const std::string E_INF_string; + //! the output string representation of column separators + const std::string sep_string; + //! container definition for minE profile data typedef std::vector MinEProfile; @@ -144,7 +151,9 @@ class PredictionTrackerProfileMinE: public PredictionTracker , const MinEProfileIterator & begin , const MinEProfileIterator & end , const RnaSequence & rna - , const std::string & E_INF_string ); + , const std::string & E_INF_string + , const std::string & sep_string + ); }; @@ -159,20 +168,22 @@ writeProfile( std::ostream &out , const MinEProfileIterator & begin , const MinEProfileIterator & end , const RnaSequence & rna - , const std::string & E_INF_string ) + , const std::string & E_INF_string + , const std::string & sep_string + ) { // write in CSV-like format (column data) // print header : seq.ID ; minE - out <<"idx;"< ZProfile; @@ -145,6 +152,7 @@ class PredictionTrackerProfileSpotProb: public PredictionTracker * normalization * @param rna the RNA the data is about * @param NA_string the string to be used for missing entries + * @param sep the column separator to be used in profile output */ template < typename ZProfileIterator > static @@ -154,7 +162,9 @@ class PredictionTrackerProfileSpotProb: public PredictionTracker , const ZProfileIterator & end , const Z_type overallZ , const RnaSequence & rna - , const std::string & NA_string ); + , const std::string & NA_string + , const std::string & sep + ); }; @@ -170,22 +180,24 @@ writeProfile( std::ostream &out , const ZProfileIterator & end , const Z_type overallZ , const RnaSequence & rna - , const std::string & NA_string ) + , const std::string & NA_string + , const std::string & sep + ) { // write in CSV-like format (column data) const bool noZ = (overallZ==0.0); // print header : seq.ID ; spotProb - out <<"idx;"<flush(); diff --git a/src/IntaRNA/PredictionTrackerSpotProb.h b/src/IntaRNA/PredictionTrackerSpotProb.h index 29a3a185..a6e9afc4 100644 --- a/src/IntaRNA/PredictionTrackerSpotProb.h +++ b/src/IntaRNA/PredictionTrackerSpotProb.h @@ -40,11 +40,13 @@ class PredictionTrackerSpotProb: public PredictionTracker * @param outStreamName the stream name where the probability data * is to be written to. use STDOUT/STDERR for the respective stream. * Otherwise, an according file is created + * @param sep the column separator to be used in profile output */ PredictionTrackerSpotProb( const InteractionEnergy & energy , const std::string & spots , const std::string & outStreamName + , const std::string & sep = ";" ); /** @@ -56,11 +58,13 @@ class PredictionTrackerSpotProb: public PredictionTracker * @param spots the interaction spots to track for their probabilities * @param outStream the stream where the probability data * is to be written to. + * @param sep the column separator to be used in profile output */ PredictionTrackerSpotProb( const InteractionEnergy & energy , const std::string & spots , std::ostream & outStream + , const std::string & sep = ";" ); @@ -128,6 +132,9 @@ class PredictionTrackerSpotProb: public PredictionTracker //! tracking information for each spot std::vector spots; + //! the output column separator to be used + const std::string sep; + //! partition function of all interaction not covering a tracked spot Z_type noSpotZ; diff --git a/src/IntaRNA/PredictionTrackerSpotProbAll.cpp b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp index 4f1abdd6..03602da5 100644 --- a/src/IntaRNA/PredictionTrackerSpotProbAll.cpp +++ b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp @@ -9,13 +9,15 @@ PredictionTrackerSpotProbAll:: PredictionTrackerSpotProbAll( const InteractionEnergy & energy , const std::string & streamName - , const std::string NA_string + , const std::string & NA_string + , const std::string & sep ) : PredictionTracker() , energy(energy) , deleteStreamsOnDestruction(true) , outStream(NULL) , NA_string(NA_string) + , sep(sep) , overallZ( Z_type(0.0) ) , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 { @@ -23,6 +25,10 @@ PredictionTrackerSpotProbAll( if (streamName.empty()) { throw std::runtime_error("PredictionTrackerSpotProbAll() : streamName empty"); } + // check separator + if (sep.empty()) { + throw std::runtime_error("PredictionTrackerSpotProbAll() empty separator provided"); + } #endif // open streams outStream = newOutputStream( streamName ); @@ -37,16 +43,24 @@ PredictionTrackerSpotProbAll:: PredictionTrackerSpotProbAll( const InteractionEnergy & energy , std::ostream * outStream - , const std::string NA_string + , const std::string & NA_string + , const std::string & sep ) : PredictionTracker() , energy(energy) , deleteStreamsOnDestruction(false) , outStream(outStream) , NA_string(NA_string) + , sep(sep) , overallZ( Z_type(0.0) ) , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 { +#if INTARNA_IN_DEBUG_MODE + // check separator + if (sep.empty()) { + throw std::runtime_error("PredictionTrackerSpotProbAll() empty separator provided"); + } +#endif } ////////////////////////////////////////////////////////////////////// @@ -58,7 +72,8 @@ PredictionTrackerSpotProbAll:: , pairZ , overallZ , energy - , NA_string ); + , NA_string + , sep ); // clean up if file pointers were created in constructor if (deleteStreamsOnDestruction) { @@ -113,7 +128,8 @@ writeData( std::ostream &out , const Z2dMatrix & pairZ , const Z_type & overallZ , const InteractionEnergy & energy - , const std::string & NA_string ) + , const std::string & NA_string + , const std::string & sep ) { // direct access to sequence string information const RnaSequence & rna1 = energy.getAccessibility1().getSequence(); @@ -129,7 +145,7 @@ writeData( std::ostream &out // print header : spotProb; "nt_index" ... starting with index 1 out <<"spotProb"; for (size_t j=rna2.size(); j-- > 0; ) { - out <<';' < 0; ) { // out separator - out <<';'; + out <clear(); + INTARNA_CLEANUP(curBest.seed); + } // identify cell with next best non-overlapping interaction site // iterate (decreasingly) over all left interaction starts @@ -328,9 +333,7 @@ getNextBest( Interaction & curBest ) // pointer access to cell value curCell = &(curLeftBound->second); // get overall energy of the interaction - curCellE = energy.getE(curLeftBound->first.first, curLeftBound->second.j1 - , curLeftBound->first.second, curLeftBound->second.j2 - , curLeftBound->second.val ); + curCellE = curLeftBound->second.val; // deal with degeneracy of energy model, ie. multiple sites with bestE if (E_equal(curCellE,curBestCellE)) { // select left most to be deterministic in output diff --git a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp index 4131915b..4e124e0f 100644 --- a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp @@ -222,13 +222,14 @@ fillHybridE_left( const size_t si1, const size_t si2 ) const size_t noLpShift = outConstraint.noLP ? 1 : 0; E_type iStackE = E_type(0); - // iterate over all window starts j1 (seq1) and j2 (seq2) - for (i1=si1; si1-i1 < hybridE_left.size1(); i1--) { - // screen for right boundaries in seq2 - for (i2=si2; si2-i2 < hybridE_left.size2(); i2--) { + // iterate over all window starts + for (size_t l1=0; l1 < hybridE_left.size1(); l1++) { + for (size_t l2=0; l2 < hybridE_left.size2(); l2++) { + i1 = si1-l1; + i2 = si2-l2; // referencing cell access - E_type & curMinE = hybridE_left(si1-i1,si2-i2); + E_type & curMinE = hybridE_left(l1,l2); // init cell curMinE = (i1==si1 && i2==si2) ? energy.getE_init() : E_INF; // check if complementary @@ -242,10 +243,8 @@ fillHybridE_left( const size_t si1, const size_t si2 ) } // get stacking energy to avoid recomputation in recursion below iStackE = energy.getE_interLeft(i1,i1+noLpShift,i2,i2+noLpShift); - // check just stacked seed extension - if (i1+noLpShift==si1 && i2+noLpShift==si2) { - curMinE = std::min( curMinE, iStackE + hybridE_left(0,0) ); - } + // check just stacked + curMinE = std::min( curMinE, iStackE + hybridE_left(l1-noLpShift,l2-noLpShift)); } // check all combinations of decompositions into (i1,i2)..(k1,k2)-(j1,j2) diff --git a/src/IntaRNA/PredictorMfe2dSeedExtension.cpp b/src/IntaRNA/PredictorMfe2dSeedExtension.cpp index 9bcbf799..3d2e5d75 100644 --- a/src/IntaRNA/PredictorMfe2dSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfe2dSeedExtension.cpp @@ -99,12 +99,12 @@ predict( const IndexRange & r1, const IndexRange & r2 ) // update Optimum for all boundary combinations for (int i1 = 0; i1 < hybridE_left.size1(); i1++) { - const size_t j1max = std::min(maxMatrixLen1-i1, hybridE_right.size1()); - // ensure max interaction length in seq 1 - for (int j1 = 0; j1 < j1max; j1++) { - assert(sj1+j1-si1+i1 < energy.getAccessibility1().getMaxLength()); - for (int i2 = 0; i2 < hybridE_left.size2(); i2++) { - if (E_isINF(hybridE_left(i1,i2))) continue; + for (int i2 = 0; i2 < hybridE_left.size2(); i2++) { + if (E_isINF(hybridE_left(i1,i2))) continue; + const size_t j1max = std::min(maxMatrixLen1-i1, hybridE_right.size1()); + // ensure max interaction length in seq 1 + for (int j1 = 0; j1 < j1max; j1++) { + assert(sj1+j1-si1+i1 < energy.getAccessibility1().getMaxLength()); const size_t j2max = std::min(maxMatrixLen2-i2, hybridE_right.size2()); // ensure max interaction length in seq 2 for (int j2 = 0; j2 < j2max; j2++) { @@ -147,14 +147,15 @@ fillHybridE_left( const size_t si1, const size_t si2 ) E_type iStackE = E_type(0); // iterate over all window starts j1 (seq1) and j2 (seq2) - for (i1=si1; si1-i1 < hybridE_left.size1(); i1--) { - for (i2=si2; si2-i2 < hybridE_left.size2(); i2--) { + for (size_t l1=0; l1 < hybridE_left.size1(); l1++) { + for (size_t l2=0; l2 < hybridE_left.size2(); l2++) { + i1 = si1-l1; + i2 = si2-l2; // referencing cell access - E_type & curE = hybridE_left(si1-i1,si2-i2); - // init current cell (e_init if just left (i1,i2) base pair) + E_type & curE = hybridE_left(l1,l2); + // init current cell (e_init if just left (i1,i2) base pair; assuming seed is internally stacked on the left end if noLP) curE = (i1==si1 && i2==si2) ? energy.getE_init() : E_INF; - // check if complementary if( i1 energy.getMaxInternalLoopSize1()+1) break; - for (k2=i2+noLpShift; k2++ < si2; ) { - // ensure maximal loop length - if (k2-i2-noLpShift > energy.getMaxInternalLoopSize2()+1) break; - // check if (k1,k2) are valid left boundary - if ( E_isNotINF( hybridE_left(si1-k1,si2-k2) ) ) { - curE = std::min( curE, - (iStackE - + energy.getE_interLeft(i1+noLpShift,k1,i2+noLpShift,k2) - + hybridE_left(si1-k1,si2-k2) ) - ); - } - } // k2 - } // k1 + for (k2=i2+noLpShift; k2++ < si2; ) { + // ensure maximal loop length + if (k2-i2-noLpShift > energy.getMaxInternalLoopSize2()+1) break; + // check if (k1,k2) are valid left boundary + if ( E_isNotINF( hybridE_left(si1-k1,si2-k2) ) ) { + curE = std::min( curE, + (iStackE + + energy.getE_interLeft(i1+noLpShift,k1,i2+noLpShift,k2) + + hybridE_left(si1-k1,si2-k2) ) + ); + } + } // k2 + } // k1 } // i is valid right end } // i2 } // i1 @@ -379,14 +378,22 @@ traceBack( Interaction & interaction ) // check all interval splits if ( (si1-i1-noLpShift) > 1 && (si2-i2-noLpShift) > 1) { - // temp variables - size_t k1,k2; bool traceNotFound = true; + // temp variables + size_t k1=i1+noLpShift,k2=i2+noLpShift; + if (outConstraint.noLP && E_equal( curE, (iStackE + hybridE_left(si1-k1,si2-k2)) )) { + traceNotFound = false; + interaction.basePairs.push_back( energy.getBasePair(k1,k2) ); + // trace right part of stack + i1=k1; + i2=k2; + curE = hybridE_left(si1-i1,si2-i2); + } // check all combinations of decompositions into (i1,i2)..(k1,k2)-(si1,si2) const size_t k1max = std::min(si1-1,i1+noLpShift+energy.getMaxInternalLoopSize1()+1); const size_t k2max = std::min(si2-1,i2+noLpShift+energy.getMaxInternalLoopSize2()+1); - for (k1=i1+noLpShift+1; traceNotFound && k1<=k1max; k1++) { - for (k2=i2+noLpShift+1; traceNotFound && k2<=k2max; k2++) { + for (k1++; traceNotFound && k1<=k1max; k1++) { + for (k2++; traceNotFound && k2<=k2max; k2++) { // check if (k1,k2) are valid left boundary if ( E_isNotINF( hybridE_left(si1-k1,si2-k2) ) ) { if ( E_equal( curE, diff --git a/src/IntaRNA/PredictorMfeEns2dSeedExtension.cpp b/src/IntaRNA/PredictorMfeEns2dSeedExtension.cpp index ab0d7433..cc7aa459 100644 --- a/src/IntaRNA/PredictorMfeEns2dSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfeEns2dSeedExtension.cpp @@ -200,8 +200,10 @@ fillHybridZ_left( const size_t si1, const size_t si2 ) Z_type iStackZ = Z_type(1); // iterate over all window starts i1 (seq1) and i2 (seq2) - for (i1=si1; si1-i1 < hybridZ_left.size1(); i1-- ) { - for (i2=si2; si2-i2 < hybridZ_left.size2(); i2-- ) { + for (size_t l1=0; l1 < hybridZ_left.size1(); l1++) { + for (size_t l2=0; l2 < hybridZ_left.size2(); l2++) { + i1 = si1-l1; + i2 = si2-l2; // referencing cell access Z_type & curZ = hybridZ_left(si1-i1,si2-i2); @@ -220,10 +222,8 @@ fillHybridZ_left( const size_t si1, const size_t si2 ) } // get stacking energy to avoid recomputation in recursion below iStackZ = energy.getBoltzmannWeight(energy.getE_interLeft(i1,i1+noLpShift,i2,i2+noLpShift)); - // check just stacked seed extension - if (i1+noLpShift==si1 && i2+noLpShift==si2) { - curZ += iStackZ * hybridZ_left(0,0); - } + // check just stacked + curZ += iStackZ + hybridZ_left(l1-noLpShift,l2-noLpShift); } // check all combinations of decompositions into (i1,i2)..(k1,k2)-(j1,j2) diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index e8cd6c19..fd37c11c 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -81,8 +81,7 @@ const std::string CommandLineParsing::outCsvCols_default = "id1,start1,end1,id2, //////////////////////////////////////////////////////////////////////////// -const std::string CommandLineParsing::outCsvColSep = ";"; -const std::string CommandLineParsing::outCsvLstSep = ","; +const std::string CommandLineParsing::outCsvLstSep = ":"; //////////////////////////////////////////////////////////////////////////// @@ -195,6 +194,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) outBestSeedOnly(false), outNoLP(false), outNoGUend(false), + outSep(";"), outCsvCols(outCsvCols_default), outPerRegion(false), outSpotProbSpots(""), @@ -835,6 +835,12 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) "\n 'spotProb:' (target+query) tracks for a given set of interaction spots their probability to be covered by an interaction. If no spots are provided, probabilities for all index combinations are computed. Spots are encoded by comma-separated 'idxT&idxQ' pairs (target-query). For each spot a probability is provided in concert with the probability that none of the spots (encoded by '0&0') is covered (CSV format). The spot encoding is followed colon-separated by the output stream/file name, eg. '--out=\"spotProb:3&76,59&2:STDERR\"'. NOTE: value has to be quoted due to '&' symbol!" "\nFor each, provide a file name or STDOUT/STDERR to write to the respective output stream." ).c_str()) + ("outSep" + , value< std::string >(&(outSep)) + ->composing() + ->default_value(outSep.c_str()) + ->notifier(boost::bind(&CommandLineParsing::validate_outSep,this,_1)) + , std::string("column separator to be used in tabular CSV output").c_str()) (outMode.name.c_str() , value(&(outMode.val)) ->default_value(outMode.def) @@ -901,7 +907,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ->default_value(outCsvCols,"see text") ->notifier(boost::bind(&CommandLineParsing::validate_outCsvCols,this,_1)) , std::string("output : comma separated list of CSV column IDs to print if outMode=C." - " An empty argument (using '') prints all possible columns from the following available ID list: " + " Using '*' or an empty argument ('') prints all possible columns from the following available ID list: " + OutputHandlerCsv::list2string(OutputHandlerCsv::string2list(""),", ")+"." + "\nDefault = '"+outCsvCols+"'." ).c_str()) @@ -1282,8 +1288,8 @@ parse(int argc, char** argv) case 'C' : { if (!qAccFile.empty()) LOG(INFO) <<"qAcc = "<getOutStream() - <getOutStream(), energy ); case 'C' : // ensure that Zall is computed if needed - outNeedsZall = outNeedsZall || OutputHandlerCsv::needsZall(OutputHandlerCsv::string2list( outCsvCols ), outCsvColSep); + outNeedsZall = outNeedsZall || OutputHandlerCsv::needsZall(OutputHandlerCsv::string2list( outCsvCols )); // check whether interaction details are needed - outNeedsBPs = OutputHandlerCsv::needBPs(OutputHandlerCsv::string2list( outCsvCols ), outCsvColSep);; + outNeedsBPs = OutputHandlerCsv::needBPs(OutputHandlerCsv::string2list( outCsvCols ));; // create output handler - return new OutputHandlerCsv( getOutputConstraint(), outStreamHandler->getOutStream(), energy, OutputHandlerCsv::string2list( outCsvCols ), outCsvColSep, false, outCsvLstSep ); + return new OutputHandlerCsv( getOutputConstraint(), outStreamHandler->getOutStream(), energy, OutputHandlerCsv::string2list( outCsvCols ), outSep, false, outCsvLstSep ); default : INTARNA_NOT_IMPLEMENTED("Output mode "+toString(outMode.val)+" not implemented yet"); } diff --git a/src/bin/CommandLineParsing.h b/src/bin/CommandLineParsing.h index e279240b..cea60b04 100644 --- a/src/bin/CommandLineParsing.h +++ b/src/bin/CommandLineParsing.h @@ -642,9 +642,9 @@ class CommandLineParsing { bool outNoLP; //! whether or not GU base pairs are allowed at interaction and helix ends bool outNoGUend; - //! the CSV column separator - static const std::string outCsvColSep; - //! the CSV list separator within individual columns + //! the column separator to be used for tabular output + std::string outSep; + //! the list separator within individual columns of tabular output static const std::string outCsvLstSep; //! the CSV column selection std::string outCsvCols; @@ -873,6 +873,13 @@ class CommandLineParsing { */ void validate_out(const std::vector & list); + /** + * Validates the tabular column separator argument. + * + * @param sep the argument value to validate + */ + void validate_outSep(const std::string & sep); + /** * Validates the outCsvCols argument. * @param value the argument value to validate @@ -1514,6 +1521,27 @@ void CommandLineParsing::validate_outputTarget( //////////////////////////////////////////////////////////////////////////// +inline +void CommandLineParsing::validate_outSep(const std::string & outSep) { + + // check if empty + if ( outSep.empty() ) { + LOG(ERROR) <<"no column separator for tabular CSV output defined (empty)"; + updateParsingCode(ReturnCode::STOP_PARSING_ERROR); + return; + } + + // check if distinct from value list separator + if ( outSep == outCsvLstSep ) { + LOG(ERROR) <<"the selected column separator '"< & list) { diff --git a/tests/AccessibilityBasePair_test.cpp b/tests/AccessibilityBasePair_test.cpp index d77ad3f8..0dda2556 100644 --- a/tests/AccessibilityBasePair_test.cpp +++ b/tests/AccessibilityBasePair_test.cpp @@ -8,7 +8,6 @@ using namespace IntaRNA; -const std::string seq = "gguccacguccaa"; TEST_CASE("AccessibilityBasePair", "[AccessibilityBasePair]") { @@ -16,10 +15,11 @@ TEST_CASE("AccessibilityBasePair", "[AccessibilityBasePair]") { // setup easylogging++ stuff if not already done #include "testEasyLoggingSetup.icc" - RnaSequence rna("test", seq); + SECTION("ED test long") { - SECTION("Basepair Energy difference") { - AccessibilityBasePair acc(rna, 10, NULL); + const std::string seq = "gguccacguccaa"; + RnaSequence rna("test", seq); + AccessibilityBasePair acc(rna, 10, NULL); // std::cout <<"\n\n########\n"; // for (int i =0; i +#include "IntaRNA/AccessibilityVrna.h" + +using namespace IntaRNA; + + + +TEST_CASE("AccessibilityVrna", "[AccessibilityVrna]") { + + // setup easylogging++ stuff if not already done + #include "testEasyLoggingSetup.icc" + + + SECTION("ED test short") { + + const std::string seq = "GG"; + RnaSequence rna("test", seq); + VrnaHandler vrnaHandler(37,"Turner04",false,false); + AccessibilityVrna acc(rna, 2, NULL,vrnaHandler,2); + + REQUIRE( E_equal( acc.getED(0, 0), 0 ) ); + REQUIRE( E_equal( acc.getED(1, 1), 0 ) ); + REQUIRE( E_equal( acc.getED(0, 1), 0 ) ); + + } +} diff --git a/tests/Makefile.am b/tests/Makefile.am index b2f67b71..ff862502 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -27,6 +27,7 @@ runApiTests_SOURCES = \ AccessibilityConstraint_test.cpp \ AccessibilityFromStream_test.cpp \ AccessibilityBasePair_test.cpp \ + AccessibilityVrna_test.cpp \ HelixConstraint_test.cpp \ HelixHandlerNoBulgeMax_test.cpp \ HelixHandlerNoBulgeMaxSeed_test.cpp \