diff --git a/ChangeLog b/ChangeLog index 7784a54d..0b2b9d38 100644 --- a/ChangeLog +++ b/ChangeLog @@ -10,9 +10,84 @@ # changes in development version since last release ################################################################################ + + ################################################################################ ################################################################################ +################################################################################ +### version 3.2.0 +################################################################################ + +# IntaRNA +- BUGFIX: accessibility blocking constraints were only applied to seed location +- improved Zall estimate (and depending values) for `--model=X` (default) +- new arguments: + - `qId|tId` : optional id (FASTA prefix) setup for sequence naming + - `acc|accW|accL` : meta accessibility setup for both query and target + - `intLenMax|intLoopMax` : meta interaction and interior loop length setup + +# CopomuS +- Compensatory mutation selector to support interaction validation experiments + +################################################################################ + +200302 Martin Raden + * IntaRNA/IndexRange : + * fromString() : + * bugfix: + missing parsing of negative indices + * IntaRNA/InteractionEnergy : + * areComplementary() : + + check if both positions are accessible (to avoid additional checks in + predictor recursions) + * bin/IntaRNA : + * ambiguous nt warning now in verbose log (was info log) + +200217 Martin Raden + * bin/CommandLineParsing : + * NumericParameter : + * CharParameter : + + isSet() : checks if value and default differ + + qId|tId : id (prefix) for sequence naming + + acc|accW|accL : meta for q|t* + + intLenMax|intLoopMax : meta for q|t* + * resetParamDefault() : + + overwrite value (since set to default in constructor) + + validate_region|shape|shapeMethod|shapeConversion : generic checks for q|t + + validate_seedRange : generic checks for q|t + + validate_numberArgumentExcludeRange() : generic check excluding a range + + validate_id() : checks for line breaks in q|tId + * parseSequences() : + * parseSequencesFasta() : + + idPrefix handling + - obsolete functions + - validate_query|target + - validate_qAccW|L|Constr + - validate_q|tRegion|Shape|ShapeMethod|ShapeConversion + - validate_seedQ|TRange + * constructor() : + + extended default reset for new arguments + * q|tAcc* arguments now hidden + * validation calls refactored + + model|acc|intLenMax|intLoopMax now general arguments + * parse() : + + additional checks for meta arguments with setup of q|t variables + + usage of .isSet() where appropriate + + check that outSep is not within id prefix in outMode=C + + ensure intLenMax >= seedBP + +200207 Martin Raden + Fabio Gutmann + + python/CopomuS.py : Compensatory mutation selector to support interaction + validation experiments + + python/copomus/* : utility functions of CopomuS + * python/README.md : links to dedicated README.md of subfolders + +200131 Martin Raden + * IntaRNA/PredictorMfe2dHeuristicSeedExtension : + * fillHybridE_left() : + + extended Zall update (additional save cases considered) + + ################################################################################ ### version 3.1.5 ################################################################################ diff --git a/README.md b/README.md index bbdcbeb1..9cb92b71 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ not needed to be installed separately): The data provided within the github repository (or within the `Source code` archives provided at the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases)) +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest)) is no complete distribution and lacks all system specifically generated files. Thus, in order to get started with a fresh clone of the IntaRNA source code repository you have to run the GNU autotools @@ -214,7 +214,7 @@ Afterwards, you can continue as if you would have downloaded an ## IntaRNA package distribution (e.g. `intaRNA-2.0.0.tar.gz`) When downloading an IntaRNA package distribution (e.g. `intaRNA-2.0.0.tar.gz`) from the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases), you should +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest), you should first ensure, that you have all [dependencies](#deps) installed. If so, you can simply run the following (assuming `bash` shell). ```bash @@ -277,10 +277,10 @@ and follow either [install from github](#instgithub) or ### ... using pre-compiled binaries For some releases, we also provide precompiled binary packages for Microsoft Windows at the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases) +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest) that enable 'out-of-the-box' usage. If you want to use them: -- [download](https://github.com/BackofenLab/IntaRNA/releases) the according ZIP archive and extract +- [download](https://github.com/BackofenLab/IntaRNA/releases/latest) the according ZIP archive and extract - open a [Windows command prompt](https://www.lifewire.com/how-to-open-command-prompt-2618089) - [run IntaRNA](#usage) @@ -331,7 +331,7 @@ brew install viennarna brew install doxygen ``` -Download and extract the IntaRNA source code package (e.g. `intaRNA-2.0.2.tar.gz`) from the [release page](releases/). +Download and extract the IntaRNA source code package (e.g. `intaRNA-2.0.2.tar.gz`) from the [release page](https://github.com/BackofenLab/IntaRNA/releases/latest). ```[bash] ./configure CC=gcc-6 CXX=g++-6 @@ -412,9 +412,12 @@ interaction energy = -10.85 kcal/mol ``` -or provide (multiple) sequence(s) in [FASTA-format](#https://en.wikipedia.org/wiki/FASTA_format). -It is possible to provide either file input or to read the FASTA input from the -STDIN stream. +In case you need specific RNA names in your output, you can provide ID strings +for each RNA using e.g. `--tId="mRNA with GC"` or `--qId="sRNA-example"`. + +Multiple sequences can be provided in [FASTA-format](#https://en.wikipedia.org/wiki/FASTA_format). +It is possible to use either file input or to read the FASTA input from the +`STDIN` stream. ```bash # running IntaRNA with FASTA files @@ -497,10 +500,8 @@ mode = M # no seed constraint noSeed = true # full global accessibility computation -qAccW = 0 -qAccL = 0 -tAccW = 0 -tAccL = 0 +accW = 0 +accL = 0 ``` where you use the long parameter names without the leading `--`. @@ -750,7 +751,7 @@ by running IntaRNA when disabling both the seed constraint as well as the accessibility integration using ```bash # prediction results similar to TargetScan/RNAhybrid -IntaRNA [..] --noSeed --qAcc=N --tAcc=N +IntaRNA [..] --noSeed --acc=N ``` We *add seed-constraint support to TargetScan/RNAhybrid-like computations* by removing the `--noSeed` flag from the above call. @@ -767,7 +768,7 @@ Finally, RNAup only predicts interactions for subsequences of length 25. All this can be setup using ```bash # prediction results similar to 'RNAup -b' (incorporating accessibility of both RNAs) -IntaRNA --mode=M --noSeed --qAccW=0 --qAccL=0 --qIntLenMax=25 --tAccW=0 --tAccL=0 --tIntLenMax=25 +IntaRNA --mode=M --noSeed --accW=0 --accL=0 --intLenMax=25 ``` We *add seed-constraint support to RNAup-like computations* by removing the `--noSeed` flag from the above call. @@ -1055,10 +1056,10 @@ which results in two shorter ranges. If a range is shorter than `--seedBP`, it is completely removed. Finally, it is possible to restrict the overall length an interaction is allowed -to have. This can be done independently for the query and target sequence using -`--qIntLenMax` and `--tIntLenMax`, respectively. By setting both to 0 (default), +to have via `--intLenMax`. This can be done independently for the query and target sequence using +`--qIntLenMax` and `--tIntLenMax`, respectively. By setting to 0 (default), the smaller of the full sequence length and the maximal accessibility-window -size (`--tAccW`, `--qAccW`) is used. +size is used (see `--accW`, `--tAccW`, or `--qAccW`). @@ -1610,8 +1611,8 @@ If no value for `--energyVRNA` is provided, the default model of the underlying Vienna RNA package is used (see respective documentation). To increase prediction quality and to reduce the computational complexity, the -number of unpaired bases between intermolecular base pairs is restricted -(similar to internal loop length restrictions in single RNA folding algorithm). The +number of unpaired bases between intermolecular base pairs is restricted via +`--intLoopMax` (similar to internal loop length restrictions in single RNA folding algorithm). The upper bound can be set independently for the query and target sequence via `--qIntLoopMax` and `--tIntLoopMax`, respectively, and defaults to 16. @@ -1856,10 +1857,11 @@ penalty for a subsequence partaking in an interaction is given by *ED=-RT log(Pu where *Pu* denotes the unpaired probability of the subsequence. Within the IntaRNA energy model, *ED* values for both interacting subsequences are considered. -Accessibility incorporation can be disabled for query or target sequences using +To globally turn off accessibility consideration, set `--acc=N`. +Accessibility incorporation can be disabled separately for query or target sequences using `--qAcc=N` or `--tAcc=N`, respectively. -A setup of `--qAcc=C` or `--tAcc=C` (default) enables accessibility computation +A setup of `--acc=C` (default, as for `--qAcc=C` and `--tAcc=C`) enables accessibility computation using the selected [energy model](#energy) for query or target sequences, respectively. Using `--accNoLP` and `--accNoGUend`, the consideration of lonely base pairs @@ -1885,7 +1887,8 @@ unpaired probabilities within the window (while only intramolecular base pairs within the window are considered). IntaRNA enables both global as well as local unpaired probability computation. -To this end, the sliding window length has to be specified in order to enable/disable +To this end, the sliding window length `--accW` and the maximal base pair span +`--accL` (<= `--accW`) have to be specified in order to enable/disable local folding. ##### Use case examples global/local unpaired probability computation @@ -1899,7 +1902,7 @@ i.e. the maximal number of positions enclosed by a base pair independently while respecting `AccL <= AccW`. ```bash # using global accessibilities for target and query -IntaRNA [..] --qAccW=0 --qAccL=0 --tAccW=0 --qAccL=0 +IntaRNA [..] --accW=0 --accL=0 # using global accessibilities for query and local ones for target IntaRNA [..] --qAccW=0 --qAccL=0 --tAccW=150 --qAccL=100 ``` diff --git a/configure.ac b/configure.ac index 1b48235f..4e2ad09f 100644 --- a/configure.ac +++ b/configure.ac @@ -1,13 +1,13 @@ AC_PREREQ([2.65]) # 5 argument version only available with aclocal >= 2.64 -AC_INIT([IntaRNA], [3.1.5], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) +AC_INIT([IntaRNA], [3.2.0], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) # minimal required version of the boost library BOOST_REQUIRED_VERSION=1.50.0 # minimal required version of the Vienna RNA library VRNA_REQUIRED_VERSION=2.4.14 -# minimal required version of python interpreter +# minimal required version of python interpreter (to install intarnapvalue) PYTHON_REQUIRED_VERSION=3.6 @@ -436,6 +436,7 @@ AC_CONFIG_FILES([R/Makefile]) AC_CONFIG_FILES([python/Makefile]) AC_CONFIG_FILES([python/bin/Makefile]) AC_CONFIG_FILES([python/intarnapvalue/Makefile]) +AC_CONFIG_FILES([python/copomus/Makefile]) AC_CONFIG_FILES([tests/Makefile]) AC_CONFIG_FILES([tests/data/Makefile]) AC_CONFIG_FILES([doc/Makefile]) diff --git a/python/CopomuS.py b/python/CopomuS.py new file mode 100644 index 00000000..26c3af4b --- /dev/null +++ b/python/CopomuS.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 + +# Copyright 2019 +# Author: Fabio Gutmann + + +__intarna_min__ = '3.1.2' +__python_min__ = '3.7.0' + +import os +import sys +from platform import python_version + +if python_version() < __python_min__: + sys.stderr.write(f'CopomuS requires python >= {__python_min__}\n') + sys.exit(1) + +import csv +from tempfile import gettempdir +from argparse import ArgumentParser, FileType, SUPPRESS +from typing import List +from copomus.mutation import Mutation +from copomus.IntaRNA import IntaRNA +from copomus.candidate_selectors import get_selector +from copomus.candidate_filters import get_filter +from copomus.mutation_generators import get_generator + + + +class CopomuS: + def __init__(self): + self.query = '' + self.target = '' + self.qidxpos0 = 1 + self.tidxpos0 = 1 + self.param_file = '' + self.measures = [] + self.candidate_selection = '' + self.candidate_filters = [] + self.output = '' + self.delimiter = '' + self.mutation_generator = '' + self.mutation_encoding = '' + self.alpha, self.beta = 0.0, 0.0 + self.threads = 0 + + def main(self): + # PARSE COMMAND LINE ARGS + self.parse_cmd_args() + + # CHECK INTARNA BIN AND VERSION + IntaRNA.check_binary(__intarna_min__) + + # SELECT CANDIDATES + bp_func = get_selector(self.candidate_selection) if not self.mutation_encoding else get_selector('mutEnc') + bp = bp_func(self.query, self.target, self.qidxpos0, self.tidxpos0, self.param_file, self.threads, + enc=self.mutation_encoding) + + # FILTER OUT CANDIDATES + if not self.mutation_encoding: + for f in self.candidate_filters: + func = get_filter(f) + bp = func(bp, self.query, self.target, self.qidxpos0, self.tidxpos0) + + if not bp: + sys.stderr.write('There are no base pair candidates left after filtering!\n') + sys.exit(1) + + # CREATE MUTATIONS + if not self.mutation_encoding: + gen_func = get_generator(self.mutation_generator) + mutations = gen_func(self.query, self.target, self.qidxpos0, self.tidxpos0, bp, self.measures, + self.alpha, self.beta) + else: + gen_func = get_generator('specific') + mutations = gen_func(self.query, self.target, self.qidxpos0, self.tidxpos0, self.measures, + self.alpha, self.beta, self.mutation_encoding) + + # RUN MEASUREMENTS + self.run_tests(mutations) + + # SORT MUTATIONS + mutations.sort() + + # CALCULATE RANKS + self.calculate_ranks(mutations) + + # OUTPUT + self.output_csv(mutations) + + def parse_cmd_args(self): + defaults = { + 'measures': ['mfeCover', 'E', 'minDeltaE'], + 'selector': 'mfe', + 'filters': [], + 'generator': 'flip', + 'alpha': 1.0, + 'beta': 1.0, + 'threads': 0 + } + + parser = ArgumentParser(description='Checks different measures for rating mutations') + parser.add_argument('-q', '--query', dest='query', required=True, help='The query sequence.') + parser.add_argument('-t', '--target', dest='target', required=True, help='The target sequence.') + parser.add_argument('--qIdxPos0', dest='qidxpos0', default=1, + type=int, help='The starting index for the query. (Default: 1)') + parser.add_argument('--tIdxPos0', dest='tidxpos0', default=1, type=int, + help='The starting index for the target. (Default: 1)') + parser.add_argument('-m', '--measure', dest='measures', action='append', default=[], + help='Which measure to add to the output, can be used multiple times. ' + 'Output will be sorted in order of measures specified. ' + f"(Default: {defaults['measures']})", + choices=['E', 'minDeltaE', 'mfeCover', 'EDqi', 'cEDqi', 'Eqi', 'Eti']) + parser.add_argument('-s', '--candidateSelection', dest='candidate_selection', default=defaults['selector'], + help='Defines the method used to select candidate base pairs. ' + f"(Default: {defaults['selector']})", + choices=['mfe', 'mfeSO']) + parser.add_argument('-f', '--candidateFilter', dest='candidate_filters', action='append', + default=[], + help='Filters candidate base pairs, can be used multiple times. ' + f"(Default: {defaults['filters']})", + choices=['GU', 'AU', 'CG', 'lp', 'lpMfe', 'he', 'heMfe']) + parser.add_argument('-g', '--generator', dest='mutation_generator', default=defaults['generator'], + choices=['flip', 'any'], help='Defines the method used for generating mutated sequences. ' + f"(Default: {defaults['generator']})") + parser.add_argument('--mutationEncoding', dest='mutation_encoding', + help='Allows direct candidate selection by specifying a mutation encoding. ' + 'Overwrites options -s, -f, and -g') + parser.add_argument('-o', '--output', dest='output', nargs='?', type=FileType('w'), default=sys.stdout, + help='Which file the output should be written to. (Default: STDOUT)') + parser.add_argument('-d', '--delimiter', dest='delimiter', default='\t', + help='Defines the delimiter used to separate columns in the output, default tab. ' + '(Default: \\t)') + parser.add_argument('-p', '--parameterFile', dest='param_file', default='', + help='Optional parameter file for IntaRNA to provide further parameters and ' + 'prediction constraints.') + parser.add_argument('--threads', dest='threads', default=defaults['threads'], type=int, + help='Threads used for IntaRNA call') + parser.add_argument('--alpha', dest='alpha', type=float, default=defaults['alpha'], help=SUPPRESS) + parser.add_argument('--beta', dest='beta', type=float, default=defaults['beta'], help=SUPPRESS) + + args = parser.parse_args() + + mm = [] # filter out any duplicate entries for measures + args.measures = defaults['measures'] if not args.measures else args.measures + for m in args.measures: + if m not in mm: + mm.append(m) + args.measures = mm + + ff = [] # filter out any duplicate entries for filters + args.candidate_filters = defaults['filters'] if not args.candidate_filters else args.candidate_filters + for f in args.candidate_filters: + if f not in ff: + ff.append(f) + args.candidate_filters = ff + + args.query = args.query.upper().replace('T', 'U') # capitalize sequences and replace T -> U + args.target = args.target.upper().replace('T', 'U') + + if args.param_file and not os.path.exists(args.param_file): + sys.stderr.write(f'Cannot find provided parameter file: {args.param_file}\n') + sys.exit(1) + + for key, value in args.__dict__.items(): + self.__setattr__(key, value) + + def run_tests(self, mutations: List[Mutation]): + for mut in mutations: + outcsvcols = list(set([j for i in [mm.outcsvcols for mm in mut.measures] for j in i])) # combine lists + outcsvcols = ','.join(outcsvcols) # join columns + outcsvcols = 'id1' if not outcsvcols else outcsvcols # if no col -> select id1 + + out_pipes = [j for i in [m.out_pipes for m in mut.measures] for j in i] # combine lists + + # create args for IntaRNA call + out_param = [f'--out={x}:{os.path.join(gettempdir(), f"CopomuS_{x}")}_{os.getpid()}.temp' for x in out_pipes] + + for mode in ['ww', 'wm', 'mw', 'mm']: # go through each mutation mode + q = mut.qw if mode[0] == 'w' else mut.qm # get actual sequences + t = mut.tw if mode[1] == 'w' else mut.tm + + i = IntaRNA() # call IntaRNA, we have 4 calls per base pair, one for each mutation mode + data = i.run(q, t, mut.qidxpos0, mut.tidxpos0, outcsvcols, self.threads, 1, self.param_file, out_param) + + for m in mut.measures: # add results to each measure + m.calculate_score(data, mode, mut, self.param_file) + + for m in mut.measures: + m.post_compute() + + def calculate_ranks(self, mutations: List[Mutation]): + # Calculate total ranking order + rank = 1 + mutations[0].ranks['total'] = rank + for i in range(1, len(mutations)): + if mutations[i - 1] < mutations[i]: + rank += 1 + mutations[i].ranks['total'] = rank + + # Calculate ranking for each measurement + for i in range(len(self.measures)): + rank = 1 + m_id = self.measures[i] + + measures = [(x.measures[i], j) for j, x in enumerate(mutations)] + measures.sort(key=lambda x: x[0]) + + mutations[measures[0][1]].ranks[m_id] = rank + for j in range(1, len(measures)): + measure, mut_id = measures[j] + if measures[j - 1][0] < measures[j][0]: + rank += 1 + mutations[mut_id].ranks[m_id] = rank + + def output_csv(self, mutations: List[Mutation]): + # CREATE CSV WRITER + writer = csv.writer(self.output, delimiter=self.delimiter) + + # WRITE CSV + headers = [j for i in + [[f'{m.id}_rank'] + [f'{m.id}_{k}' for k in m.score.keys()] for m in mutations[0].measures] + for j in i] + writer.writerow(['mutation', 'rank', 'qIndex', 'tIndex', 'bpWildtype', 'bpMutated'] + headers) + + for m in mutations: + values = [] + for x in m.measures: + values.append(m.ranks[x.id]) + scores = list(x.score.values()) + for i in range(len(scores)): # replace None with 'NA' + if scores[i] is None: + scores[i] = 'NA' + values += scores + # values = [j for i in [list(x.score.values()) for x in r.measures] for j in i] + writer.writerow([str(m), m.ranks['total'], m.query_n, m.target_n, m.bp_ww, m.bp_mm] + values) # write csv + + +if __name__ == '__main__': + c = CopomuS() + c.main() diff --git a/python/Makefile.am b/python/Makefile.am index a9ce82fa..f5681a3d 100644 --- a/python/Makefile.am +++ b/python/Makefile.am @@ -1,12 +1,19 @@ # sub directories to check for Makefiles -SUBDIRS = intarnapvalue bin +SUBDIRS = copomus intarnapvalue bin -noinst_PYTHON = setup.py +# scripts to be installed in $(bindir) directly + +dist_bin_SCRIPTS = \ + CopomuS.py + +# auxiliary scripts to be part of distribution +noinst_PYTHON = setup.py # install python modules if HAVE_PYTHON + if INSTALL_INTARNAPVALUE # ensure pip is installed @@ -18,4 +25,5 @@ install-exec-hook: $(PYTHON) setup.py install --prefix=${prefix} endif + endif diff --git a/python/README.md b/python/README.md index 7c67eb11..d74a0b2f 100644 --- a/python/README.md +++ b/python/README.md @@ -1,86 +1,14 @@ -# IntaRNApvalue -A tool for calculating p-values of IntaRNA energy scores. - -### How it works: -This tool shuffles one or more of the given sequences (depending on shuffle mode) while preserving mono- and dinucleotide frequences. -Thus the resulting sequences are very similar with similar properties to the original one. -It then calculates the IntaRNA interaction scores for these newly generated sequences and uses them to approximate the p-value of the original target/query combination. -This p-value is a score for the likelihood that a sequence with a better interaction score than the original ones can randomly occur. -It can thus be used as a measurement for how good an interaction between two sequences is. - -## Dependencies: -- IntaRNA -##### And if you want to run it with python or compile it yourself: -- Python 3 (>= 3.6) -- numpy -- scipy -## Installation: -You can either run this tool directly with python, install it with setuptools or run it as a compiled binary. -##### Run with python directly: -You don't have to install it, if you just plan on running it from a certain directory. -Simply copy the "intarnapvalue" folder where you want to run it from. -You have to get the dependencies yourself in this case. +# auxiliary tools -To use the tool from anywhere however, you need to install it as a python module. -All dependencies (except IntaRNA) will be installed automatically. -This way it can also be installed in a virtual environment. -```console -python setup.py install -``` +## [IntaRNApvalue](intarnapvalue) -##### Run as binary: -Go into the bin directory. You can find pre-compiled builds for linux and windows for x64 directly. -You don't need to install any dependencies for running these, on Windows however you will need Microsoft Visual C++ Redistributable. -If you want to compile it yourself, you need to get the dependencies and PyInstaller. -I had problems with numpy 1.17.0, so I had to use 1.16.4. -Simply run the build.py with the python binary that has the dependencies and PyInstaller installed. -You will find your binary in the build folder. - -## Usage: -Run it as a module without installing (you need to be in the parent dir of intarnapvalue): -```console -python3 -m intarnapvalue -``` - -Run it as a module from anywhere with python (need to run setup.py first, see [Installation](#installation)): -```console -python3 -m intarnapvalue -``` -Or use as a compiled binary: -```console -./IntaRNApvalue -``` +A tool for calculating p-values of IntaRNA energy scores. -Or import it and use it from within other python code: -```python -from intarnapvalue.intarna_pvalue import IntaRNApvalue -IntaRNApvalue(['--flag1', 'arg1']) -``` +## [CopomuS](copomus) -## Arguments: +The Compensatory Point Mutation Selector helps to identify the most promising +base pairs from an IntaRNA mfe prediction to verify the interaction via +compensatory mutation experiments. -| Flag | Value | Default | Description | -| ------------------ |:---------------------- | :------ | -------------------- | -| -h, --help | | | Gives detailed command help. | -| -q, --query | Sequence or FASTA file | | Query as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | -| -t, --target | Sequence or FASTA file | | Target as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | -| -c, --cardinality | Integer | | How many sequence pairs are randomly permuted and considered for p-value calculation. | -| -m, --shuffle-mode | {q, t, b} | | Which sequence will be shuffled: Query, Target or both. | -| -d, --distribution | {gev, gumbel, gauss} | gev | (optional) The distribution used for p-value calculation: Generalized Extreme Value Distribution, Gumbel Distribution or Gauss. | -| -o, --output | {pvalue, scores} | pvalue | (optional) If set to p-value, will only output p-value. If set to scores, will output every score from randomly generated sequences, but no p-value. | -| --threads | 0 - {max threads} | 0 | (optional) How many threads IntaRNA uses for score calculation. If set to 0 it will use all available threads. | -| --randSeed | Integer | None | (optional) The seed used for generating random sequences. | -| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | -## Example -An example use-case could look like this with sequences in raw format: -```console -$ python3 -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0 -0.2421277140114205 -``` -Or you could specify any of the sequences as files: -```console -$ python3 -m intarnapvalue --query query.fasta --target target.fasta --scores 10000 --shuffle-mode b --threads 0 -0.2421277140114205 -``` \ No newline at end of file diff --git a/python/copomus/CopomuS-logo.jpg b/python/copomus/CopomuS-logo.jpg new file mode 100644 index 00000000..383ab203 Binary files /dev/null and b/python/copomus/CopomuS-logo.jpg differ diff --git a/python/copomus/IntaRNA.py b/python/copomus/IntaRNA.py new file mode 100644 index 00000000..67efd434 --- /dev/null +++ b/python/copomus/IntaRNA.py @@ -0,0 +1,127 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import sys +from typing import Dict, Union, List +from subprocess import Popen, PIPE, run + + +class IntaRNA: + """IntaRNA python3 wrapper""" + def __init__(self): + pass + + @staticmethod + def check_binary(required_version: str): + """ + Checks weather or not the IntaRNA binary can be found in PATH and exits with returncode 1 if not found. + :param required_version: The minimum required version for IntaRNA + """ + p = run('IntaRNA --version', shell=True, stdout=PIPE, stderr=PIPE, universal_newlines=True) + if p.returncode: + sys.stderr.write('Please add the IntaRNA binary to your PATH.\n') + sys.exit(1) + if p.stdout.split('\n')[0].split()[1] < required_version: + sys.stderr.write(f'This tool requires IntaRNA version {required_version}, please upgrade\n') + sys.exit(1) + + def run(self, query: str, target: str, qidxpos0: int, tidxpos0: int, outcsvcols: str, threads: int, n: int = 1, + param_file: str = '', extra_params: list = '', raw_stdout: bool = False) -> \ + Union[Dict[str, any], List[Dict[str, any]], str]: + """ + Runs IntaRNA on a pair of sequences and returns a dictionary with the output + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The starting index for the query sequence + :param tidxpos0: The starting index for the target sequence + :param outcsvcols: The csvcols returned by IntaRNA + :param threads: Thread count used by IntaRNA + :param n: The number of suboptimal interactions returned + :param param_file: Optional path to a file with additional parameters + :param extra_params: Optional list of additional parameters + :param raw_stdout: Weather or not stdout should be returned in raw format, not as a dict + :return: Dictionary with column name as key and payload as value + """ + if not extra_params: + extra_params = [] + param_file = f'--parameterFile={param_file}' if param_file else '' + p = Popen(['IntaRNA', '-q', query, '-t', target, '--outMode=C', f'--outcsvcols={outcsvcols}', + f'--qIdxPos0={qidxpos0}', f'--tIdxPos0={tidxpos0}', f'--outNumber={n}', f'--threads={threads}', param_file] + + extra_params, stdout=PIPE, stderr=PIPE, universal_newlines=True) + + stdout, stderr = p.communicate() + if p.returncode: + sys.stderr.write(f'IntaRNA Error: {stdout} {stderr}\n') + sys.exit(p.returncode) + if not raw_stdout: + if n == 1: + d = self.output_to_dict(stdout) + if type(d) != bool: + return d[0] + else: + return d + return self.output_to_dict(stdout) + else: + return stdout + + @staticmethod + def output_to_dict(data: str) -> Union[List[Dict[str, any]], bool]: + """ + Transforms IntaRNA output into a dictionary + :param data: Raw IntaRNA output + :return: dictionary with colname as key and payload as value (casted into fitting type) + >>> d = IntaRNA.output_to_dict('start1;end1;id1;id2;E\\n34;42;target;query;-4.7\\n') + >>> d + [{'start1': 34, 'end1': 42, 'id1': 'target', 'id2': 'query', 'E': -4.7}] + >>> [type(x) for x in d[0].values()] + [, , , , ] + """ + subopts = [] + + lines = data.rstrip().split('\n') + if len(lines) == 2: + headers, entries = lines + entries = [entries] + elif len(lines) > 2: + headers = lines[0] + entries = lines[1:] + else: + headers = lines[0] + entries = [''] + headers = headers.split(';') + + for values in entries: + d = {} + values = values.split(';') + if len(headers) != len(values): # no interaction between query and target + return False + + for i, h in enumerate(headers): + try: + value = int(values[i]) # try to cast to int + except ValueError: + try: + value = float(values[i]) # try to cast to float + except ValueError: + value = values[i] # leave as string + d[h] = value + subopts.append(d) + + for subopt in subopts: + for k, v in subopt.items(): + if not v: + subopt[k] = None + + return subopts + + @staticmethod + def to_fasta(sequences: Dict[str, str]) -> str: + """ + Transforms sequences into a FASTA string + :param sequences: A dictionary with sequence name as key and raw sequence as tuple + :return: A FASTA string containing all sequences + """ + fasta_str = '' + for name, seq in sequences.items(): + fasta_str += f'>{name}\n{seq}\n' + return fasta_str diff --git a/python/copomus/Makefile.am b/python/copomus/Makefile.am new file mode 100644 index 00000000..3ab43800 --- /dev/null +++ b/python/copomus/Makefile.am @@ -0,0 +1,12 @@ + +copomus_DATA = \ + __init__.py \ + candidate_filters.py \ + candidate_selectors.py \ + indexing.py \ + measures.py \ + mutation.py \ + mutation_generators.py + +copomusdir = $(bindir)/copomus + diff --git a/python/copomus/README.md b/python/copomus/README.md new file mode 100644 index 00000000..bc237923 --- /dev/null +++ b/python/copomus/README.md @@ -0,0 +1,97 @@ +# CopomuS - Compensatory point mutation selector for sRNA-RNA interaction verification + +![CopomuS logo](CopomuS-logo.jpg) + +## How it works + +CopomuS is a tool for selecting mutation candidate points for sRNA-RNA interactions. +For this it takes different measures for evaluating and ranking each position. +CopomuS is highly configurable for specific purposes, see below for an explanation of all arguments. + +## Overview +- [Dependencies](#dependencies) +- [Usage](#usage) +- [Arguments](#arguments) + - [Measures](#measures) + - [Candidate Selection](#candidate-selection) + - [Candidate Filters](#candidate-filters) + - [Mutation Generators](#mutation-generators) + +## Dependencies +- [IntaRNA](https://github.com/BackofenLab/IntaRNA) >= 3.1.2 (needs to be in PATH) +- [python](https://www.python.org/) >= 3.7.0 + +## Usage +```console +./CopomoS.py -q -t + +mutation rank qIndex tIndex bpWildtype bpMutated mfeCover_rank mfeCover_ww mfeCover_wm mfeCover_mw mfeCover_mm mfeCover_is_valid E_rank E_ww E_wm E_mw E_mm E_is_valid minDeltaE_rank minDeltaE_wm/ww minDeltaE_mw/ww minDeltaE_wm/mm minDeltaE_mw/mm minDeltaE_ww/mm minDeltaE_min +U4A&A11U 1 4 11 UA AU 1 True False False True 1 1 -9.32 -5.43 -7.25 -8.46 1 3 3.89 2.07 3.03 1.21 -0.86 1.21 +C16G&G1C 2 16 1 CG GC 1 True False True True 1 2 -9.32 -8.11 -5.31 -5.99 0 5 1.21 4.01 -2.12 0.68 -3.33 -2.12 +G15C&C2G 2 15 2 GC CG 1 True True False True 1 2 -9.32 -12.37 -8.12 -12.2 0 5 -3.05 1.2 -0.17 4.08 2.88 -3.05 +... +``` + +## Arguments + +| Flag | Value | Default | Description | +| :------------------ |:---------------------- | :------ | :-------------------- | +| -h, --help | | | Gives detailed command help. | +| -q, --query | sequence | | Query as a raw sequence. | +| -t, --target | sequence | | Target as a raw sequence. | +| -m, --measure | {E, ed1, ed2, maxED, mfeCover, mfeOverlap, spotProb, qAccessProfile, tAccessProfile, qEnergyProfile, tEnergyProfile} | E | The measure used to calculate the rank of a base pair. Can be used multiple times. In that case, base pairs are ranked in order of measurements specified. | +| --qIdxPos0 | integer | 1 | (Optional) starting index for the query. | +| --tIdxPos0 | integer | 1 | (Optional) starting index for the target. | +| -c, --candidateSelection | {mfe, mfeSO} | mfe | (Optional) method by which base pair candidates are selected from the sequences. | +| -f, --candidateFilters | {GU, AU, CG, lp, lpMfe, he, heMfe} | None | (Optional) filter by which base pair candidates are filtered. Can be used multiple times to combine filters. | +| -g, --generator | {flip, any} | flip | (Optional) method used to generate mutated sequences by which base pairs are ranked. | +| --mutationEncoding | string (ex. G1C&U7G) | | (Optional) allows specific selection of candidates by specifying a mutation encoding. If this option is specified, -c, -f and -g will be ignored. | +| -o, --output | file name | STDOUT | (Optional) file to which the output should be written to. | +| -d, --delimiter | character | "\t" (tab) | (Optional) delimiter used to separate csv output. | +| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | +| -t, --threads | integer | 0 (all) | (Optional) thread count used by IntaRNA. | + +#### Measures + +| Name | Description | +| :---- | :-------------------------------------------| +| E | (class) Interaction Energy between query and target. | +| minDeltaE | (sort) Energy difference between different combinations of mutation types (ww, wm, mw, mm). | +| mfeCover | (class) Mutation position has to be in MFE. | +| Eqi | (class) Energy profile on query at mutation position. | +| Eti | (class) Energy profile on target at mutation position. | +| EDqi | (sort) Accessibility profile on query at mutation position. | +| cEDqi | (class) Accessibility profile on query at mutation position needs to be constant. | + +The default measure combination is `-m mfeCover -m E -m minDeltaE`. +It is advised to always use a sorting measure at the end, to improve the resolution of the result, +since class measures only split mutations in 3 ranks (valid, invalid, no more interaction). + +#### Candidate Selection +The choice specified by this argument defines the method, which is used to select candidate base pairs from the given sequences. + +| Name | Description | +| :---- | :-------------------------------------------| +| mfe | Selects all base pairs from the MFE region. | +| mfeSO | Selects all base pairs from all suboptimal interactions that are in the MFE region. | + +#### Candidate Filters +The filters filter out base pairs from all candidates that do not comply with the constraints. + +| Name | Description | +| :---- | :-------------------------------------------| +| GU | Filters out any GU or UG base pairs. | +| AU | Filters out any AU or UA base pairs. | +| GC | Filters out any GC or CG base pairs. | +| lp | Filters lonely base pairs that can not stack (both neighbors not in MFE and both neighbors cant pair). | +| lpMfe | Filters lonely base pairs in MFE interaction (both neighbors not in MFE). | +| he | Filters helix ends (One neighbor can pair, the other one can not). | +| heMfe | Filters helix ends (One neighbor in MFE, the other one is not). | + +#### Mutation Generators +The mutation generator chooses the method by which mutated sequences are generated. + +| Name | Description | +| :---- | :-------------------------------------------| +| flip | Flips a base pair to generate the mutated sequences (for example GC --> CG) | +| any | Generates all logical base pair combinations from a given base pair (for example GC --> CG, UG, UA) | \ No newline at end of file diff --git a/python/copomus/__init__.py b/python/copomus/__init__.py new file mode 100644 index 00000000..6f56e119 --- /dev/null +++ b/python/copomus/__init__.py @@ -0,0 +1 @@ +__all__ = ['mutation', 'indexing', 'IntaRNA', 'measures.py'] diff --git a/python/copomus/candidate_filters.py b/python/copomus/candidate_filters.py new file mode 100644 index 00000000..70baabcc --- /dev/null +++ b/python/copomus/candidate_filters.py @@ -0,0 +1,154 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from typing import Tuple, List +from utils.indexing import idx_to_array_index + + +def _filter_bp_type(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int, bp_type='') -> \ + List[Tuple[int, int]]: + """ + Filters out base pairs of a specified type + :param bps: List of tuples in form (q_index, t_index) representing the base pairs + :param q: The query sequence + :param t: The target sequence + :param qidxpos0: Starting index for the query + :param tidxpos0: Starting index for the target + :param bp_type: The type of base pairs to filter out, for example GU, CG, AU, GC, ... + :return: List of tuples in form (q_index, t_index) representing the base pairs + >>> q, t = 'GCUACGAUC', 'UUUGCGAGCAGCUAGG' + >>> bps = [(1,1),(2,2),(6,13),(8,15),(9,16)] + >>> _filter_bp_type(bps, q, t, 1, 1, 'GU') + [(2, 2), (9, 16)] + >>> _filter_bp_type(bps, q, t, 1, 1, 'GC') + [(1, 1), (2, 2), (6, 13), (8, 15)] + """ + new_bps = [] + + for q_index, t_index in bps: + q_array_index = idx_to_array_index(q_index, qidxpos0) + t_array_index = idx_to_array_index(t_index, tidxpos0) + if f'{q[q_array_index]}{t[t_array_index]}' in [bp_type, bp_type[::-1]]: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def _neighbors_in_mfe(q_index: int, t_index: int, bps: List[Tuple[int, int]]) -> int: + """ + Counts how many neighboring base pairs are in the MFE region + :param q_index: Base pair index on the query + :param t_index: Base pair index on the target + :param bps: List of tuples in form (q_index, t_index) representing the base pairs + :return: Count of neighboring base pairs that are in the MFE region + >>> _neighbors_in_mfe(3, 5, [(1, 7), (6, 9), (2, 6), (4, 4), (7, 4)]) + 2 + >>> _neighbors_in_mfe(3, 5, [(1, 7), (6, 9), (4, 4), (7, 4)]) + 1 + """ + count = 0 + for q_o, t_o in [(+1, -1), (-1, +1)]: # get neighbors by offset + if (q_index+q_o, t_index+t_o) in bps: + count += 1 + return count + + +def _neighbors_can_pair(q_index: int, t_index: int, q: str, t: str, qidxpos0: int, tidxpos0: int) -> int: + """ + Counts how many neighboring base pairs can pair + :param q_index: Base pair index on the query + :param t_index: Base pair index on the target + :param q: The query sequence + :param t: The target sequence + :param qidxpos0: Starting index for the query + :param tidxpos0: Starting index for the target + :return: Count of neighboring base pairs that can pair + >>> _neighbors_can_pair(1, 5, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 0 + >>> _neighbors_can_pair(3, 3, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 1 + >>> _neighbors_can_pair(6, 9, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 2 + """ + count = 0 + q_array_index = idx_to_array_index(q_index, qidxpos0) + t_array_index = idx_to_array_index(t_index, tidxpos0) + pairable_bps = ['GC', 'CG', 'AU', 'UA', 'GU', 'UG'] + for q_o, t_o in [(+1, -1), (-1, +1)]: # get neighbors by offset + if q_array_index+q_o in range(len(q)) and t_array_index+t_o in range(len(t)): + if f'{q[q_array_index+q_o]}{t[t_array_index+t_o]}' in pairable_bps: + count += 1 + return count + + +def filter_gu(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters any GU or UG base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='GU') + + +def filter_au(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters any AU or UA base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='AU') + + +def filter_gc(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters ay GC or CG base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='GC') + + +def filter_lp(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters lonely base pairs that can not stack (both neighbors not in MFE and both neighbors cant pair)""" + new_bps = [] + + for q_index, t_index in bps: + if not _neighbors_in_mfe(q_index, t_index, bps) and \ + not _neighbors_can_pair(q_index, t_index, q, t, qidxpos0, tidxpos0): + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_lp_mfe(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters lonely base pairs in MFE interaction (both neighbors not in MFE)""" + new_bps = [] + + for q_index, t_index in bps: + if not _neighbors_in_mfe(q_index, t_index, bps): + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_he(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters helix ends (One neighbor can pair, the other one can not)""" + new_bps = [] + + for q_index, t_index in bps: + if _neighbors_can_pair(q_index, t_index, q, t, qidxpos0, tidxpos0) == 1: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_he_mfe(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters helix ends (One neighbor in MFE, the other one is not)""" + new_bps = [] + + for q_index, t_index in bps: + if _neighbors_in_mfe(q_index, t_index, bps) == 1: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def get_filter(string: str) -> any: + candidate_filters = { + 'GU': filter_gu, + 'AU': filter_au, + 'GC': filter_gc, + 'lp': filter_lp, + 'lpMfe': filter_lp_mfe, + 'he': filter_he, + 'heMfe': filter_he_mfe + } + return candidate_filters.get(string) diff --git a/python/copomus/candidate_selectors.py b/python/copomus/candidate_selectors.py new file mode 100644 index 00000000..898a77fc --- /dev/null +++ b/python/copomus/candidate_selectors.py @@ -0,0 +1,103 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import sys +from typing import List, Tuple +from utils.IntaRNA import IntaRNA +from utils.indexing import idx_to_array_index as idx_aidx + + +def get_mfe_bps(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, threads: int, **kwargs) -> \ + List[Tuple[int, int]]: + """ + Gets a list of all base pairs for a given interaction + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :param threads: Thread count used by IntaRNA + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + candidates = [] + i = IntaRNA() + data = i.run(query, target, qidxpos0, tidxpos0, 'bpList', threads, param_file=param_file) + if not data or not data['bpList']: + print('No favorable interaction between the specified sequences!') + sys.exit(1) + for t in data['bpList'].strip().split(':'): + t_index, q_index = [int(x) for x in t.strip('(').strip(')').split(',')] + candidates.append((q_index, t_index)) # note we do query first, then target + return candidates + + +def get_mfe_bps_so(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, threads: int, **kwargs) -> \ + List[Tuple[int, int]]: + """ + Gets a list of all suboptimal base pairs for a given interaction + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :param threads: Thread count used by IntaRNA + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + candidates = [] + i = IntaRNA() + data = i.run(query, target, qidxpos0, tidxpos0, 'start1,end1,start2,end2', threads, param_file=param_file) + if not data: + print('No favorable interaction between the specified sequences!') + sys.exit(1) + data = i.run(query, target, qidxpos0, tidxpos0, 'bpList', threads, 1000, param_file, + ['--qRegion', f"{data['start2']}-{data['end2']}", '--tRegion', f"{data['start1']}-{data['end1']}"]) + for subopt in data: + for t in subopt['bpList'].strip().split(':'): + t_index, q_index = [int(x) for x in t.strip('(').strip(')').split(',')] + if (q_index, t_index) not in candidates: + candidates.append((q_index, t_index)) # note we do query first, then target + return candidates + + +def get_mutation_enc(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, **kwargs) \ + -> List[Tuple[int, int]]: + """ + Only selects the base pairs specified by the mutation encoding + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + try: + candidates = [] + enc = kwargs['enc'] + for mut in enc.split(','): # split block mutations into single mutations + q, t = mut.split('&') + q_index = int(q[1:len(q) - 1]) # mutation indices + t_index = int(t[1:len(t) - 1]) + + if query[idx_aidx(q_index, qidxpos0)] != q[0] or target[idx_aidx(t_index, tidxpos0)] != t[0]: + print('The encoding you specified does not match with the sequences you specified!') + sys.exit(1) + candidates.append((q_index, t_index)) + return candidates + + except (IndexError, ValueError, KeyError): + print('Error parsing the encoding you specified!') + sys.exit(1) + + +def get_selector(string: str) -> any: + """ + Gets a selector by its string name + :param string: The name of the selector + :return: A reference to a selector function + """ + selectors = { + 'mfe': get_mfe_bps, + 'mfeSO': get_mfe_bps_so, + 'mutEnc': get_mutation_enc + } + return selectors.get(string) diff --git a/python/copomus/indexing.py b/python/copomus/indexing.py new file mode 100644 index 00000000..6b08be81 --- /dev/null +++ b/python/copomus/indexing.py @@ -0,0 +1,50 @@ +# Copyright 2019 +# Author: Fabio Gutmann + + +def idx_to_array_index(idx: int, idxpos0: int): + """ + Converts a nucleotide index to an 0-included array index + :param idx: The index to convert + :param idxpos0: The start index + :return: The index of the element in the array + >>> idx_to_array_index(10, 5) + 5 + >>> idx_to_array_index(10, -200) + 209 + >>> idx_to_array_index(1, -1) + 1 + """ + return idx - idxpos0 - (1 if (idxpos0 < 0 < idx or idx < 0 < idxpos0) else 0) + + +def array_index_to_idx(i: int, idxpos0: int): + """ + Converts a nucleotide index to an 0-included array index + :param i: The array index + :param idxpos0: The start index + :return: The index of the element in the array + >>> array_index_to_idx(5, 5) + 10 + >>> array_index_to_idx(209, -200) + 10 + >>> array_index_to_idx(1, -1) + 1 + """ + return idxpos0 + i + (1 if (idxpos0 < 0 < i or i < 0 < idxpos0) else 0) + + +def add_sub_idx(i1: int, change: int): + """ + Adds or subtracts an index from another one and skips 0 if encountered. + :param i1: The start index + :param change: The change (+ or -) + :return: The new index + >>> add_sub_idx(-10, 10) + 1 + >>> add_sub_idx(10, -10) + -1 + >>> add_sub_idx(-5, 10) + 6 + """ + return i1 + change + (1 if i1 < 0 < i1+change+1 else -1 if i1+change-1 < 0 < i1 else 0) diff --git a/python/copomus/measures.py b/python/copomus/measures.py new file mode 100644 index 00000000..602a1902 --- /dev/null +++ b/python/copomus/measures.py @@ -0,0 +1,334 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import os +from tempfile import gettempdir +from utils.mutation import Mutation +from utils.indexing import idx_to_array_index + + +class Measure: + def __init__(self): + self.outcsvcols = [] # list with all needed outcsvcols + self.out_pipes = [] + self.score = {'ww': None, 'wm': None, 'mw': None, 'mm': None} # measurement scores + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + if data: + self.score[mut_type] = data[self.outcsvcols[0]] + + def post_compute(self): + pass + + def read_temp_file(self): + data = [] + for pipe in self.out_pipes: + file_name = f'CopomuS_{pipe}_{os.getpid()}.temp' + with open(os.path.join(gettempdir(), file_name), 'r') as f: + data.append(f.read()) + os.remove(os.path.join(gettempdir(), file_name)) + return data + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None) -> bool: + beta = alpha if not beta else beta + + res = (self.score['ww'] < 0 and self.score['mm'] < 0 and + (self.score['mw'] >= 0 or + ((self.score['ww'] + alpha < self.score['mw']) and (self.score['mm'] + beta < self.score['mw']))) + and + (self.score['wm'] >= 0 or + ((self.score['ww'] + alpha < self.score['wm']) and (self.score['mm'] + beta < self.score['wm']))) + ) + return res + + def non_i(self) -> bool: + """Checks if the Measure as values from non-interactions""" + return None in self.score.values() + + def __lt__(self, other): + return False # overwrite for custom comparisons + + def __eq__(self, other): + return self.score == other.score + + +class Energy(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.outcsvcols = ['E'] + self.id = 'E' + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class MinDeltaEnergy(Measure): + def __init__(self): + super().__init__() + self.outcsvcols = ['E'] + self.id = 'minDeltaE' + + def post_compute(self): + score = { + 'wm/ww': round(self.score['wm'] - self.score['ww'], 2) if self.score['wm'] and self.score['ww'] else '', + 'mw/ww': round(self.score['mw'] - self.score['ww'], 2) if self.score['mw'] and self.score['ww'] else '', + 'wm/mm': round(self.score['wm'] - self.score['mm'], 2) if self.score['wm'] and self.score['mm'] else '', + 'mw/mm': round(self.score['mw'] - self.score['mm'], 2) if self.score['mw'] and self.score['mm'] else '', + 'ww/mm': round(self.score['ww'] - self.score['mm'], 2) if self.score['ww'] and self.score['mm'] else '' + } + self.score = score + entries = [v for k, v in self.score.items() if v != '' and k != 'ww/mm'] + self.score['min'] = min(entries) if entries else None + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.score['min'] > 0 > other.score['min'] or 0 < other.score['min'] < self.score['min'] + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.score['min'] == other.score['min'] + + +class MFECover(Measure): + def __init__(self): + super().__init__() + self.outcsvcols = ['start1', 'end1', 'start2', 'end2'] + self.id = 'mfeCover' + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.in_mfe()) + + def in_mfe(self): + return self.score['ww'] and self.score['mm'] + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + if not data: + return + mfe_cover = (m.target_n in range(data['start1'], data['end1']+1)) and \ + (m.query_n in range(data['start2'], data['end2']+1)) + # noinspection PyTypeChecker + self.score[mut_type] = mfe_cover + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.in_mfe() and not other.in_mfe() + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.in_mfe() == other.in_mfe() + + +class EnergyProfileQuery(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.id = 'Eqi' + self.out_pipes = ['qMinE'] + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + value = data[idx_to_array_index(m.query_n, m.qidxpos0) + 1].split(';')[2] + if value == 'NA': + self.score[mut_type] = None + else: + # noinspection PyTypeChecker + self.score[mut_type] = float(value) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class EnergyProfileTarget(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.id = 'Eti' + self.out_pipes = ['tMinE'] + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + # noinspection PyTypeChecker + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + value = data[idx_to_array_index(m.target_n, m.tidxpos0) + 1].split(';')[2] + if value == 'NA': + self.score[mut_type] = None + else: + # noinspection PyTypeChecker + self.score[mut_type] = float(value) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class AccessProfileQuery(Measure): + def __init__(self): + super().__init__() + self.id = 'EDqi' + self.out_pipes = ['qAcc'] + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(0)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + # noinspection PyTypeChecker + self.score[mut_type] = float(data[idx_to_array_index(m.query_n, m.qidxpos0) + 2].split('\t')[1]) + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None, gamma: float = 0) -> bool: + return self.score['ww'] < self.score['mm'] + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return (self.is_valid(0) and not other.is_valid(0)) or ( + self.is_valid(0) and other.is_valid(0) and + self.score['mm'] - self.score['ww'] < other.score['mm'] - other.score['ww']) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(0) == other.is_valid(0) and \ + self.score['mm'] - self.score['ww'] == other.score['mm'] - other.score['ww'] + + +class ConstantAccessProfileQuery(Measure): + def __init__(self): + super().__init__() + self.id = 'cEDqi' + self.out_pipes = ['qAcc'] + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(1)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + # noinspection PyTypeChecker + self.score[mut_type] = float(data[idx_to_array_index(m.query_n, m.qidxpos0) + 2].split('\t')[1]) + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None, gamma: float = 0) -> bool: + return abs(self.score['ww'] - self.score['mm']) < alpha + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(1) and not other.is_valid(1) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(1) == other.is_valid(1) + + +def get_measure(string: str, alpha: float, beta: float) -> Measure: + mm = { + 'E': Energy(alpha, beta), + 'minDeltaE': MinDeltaEnergy(), + 'mfeCover': MFECover(), + 'EDqi': AccessProfileQuery(), + 'cEDqi': ConstantAccessProfileQuery(), + 'Eqi': EnergyProfileQuery(alpha, beta), + 'Eti': EnergyProfileTarget(alpha, beta) + } + return mm.get(string) diff --git a/python/copomus/mutation.py b/python/copomus/mutation.py new file mode 100644 index 00000000..af163340 --- /dev/null +++ b/python/copomus/mutation.py @@ -0,0 +1,35 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from dataclasses import dataclass, field +from typing import List +from utils.indexing import idx_to_array_index + + +@dataclass() +class Mutation: + measures: List[any] + qw: str # wildtype sequences + tw: str + qidxpos0: int # indexing start of query + tidxpos0: int + query_n: int # index of mutation on query + target_n: int + bp_mm: str # mm base pair + bp_ww: str = '' # ww base pair + qm: str = '' # mutated sequences + tm: str = '' + ranks: dict = field(default_factory=dict) + + def __post_init__(self): + mut_pos_q = idx_to_array_index(self.query_n, self.qidxpos0) + mut_pos_t = idx_to_array_index(self.target_n, self.tidxpos0) + self.bp_ww = f'{self.qw[mut_pos_q]}{self.tw[mut_pos_t]}' + self.qm = self.qw[0:mut_pos_q] + self.bp_mm[0] + self.qw[mut_pos_q+1:] + self.tm = self.tw[0:mut_pos_t] + self.bp_mm[1] + self.tw[mut_pos_t+1:] + + def __lt__(self, other): + return self.measures < other.measures + + def __str__(self): + return f'{self.bp_ww[0]}{self.query_n}{self.bp_mm[0]}&{self.bp_ww[1]}{self.target_n}{self.bp_mm[1]}' diff --git a/python/copomus/mutation_generators.py b/python/copomus/mutation_generators.py new file mode 100644 index 00000000..0825de43 --- /dev/null +++ b/python/copomus/mutation_generators.py @@ -0,0 +1,92 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from copy import deepcopy +from utils.indexing import idx_to_array_index +from utils.measures import get_measure +from utils.mutation import Mutation +from typing import List, Tuple + + +def gen_flip(query: str, target: str, qidxpos0: int, tidxpos0: int, bp_list: List[Tuple[int, int]], mms: List[str], + alpha: float, beta: float) -> List[Mutation]: + mutations = [] + + for q_index, t_index in bp_list: + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + q_index_a = idx_to_array_index(q_index, qidxpos0) + t_index_a = idx_to_array_index(t_index, tidxpos0) + + bp_mm = f'{target[t_index_a]}{query[q_index_a]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def gen_any(query: str, target: str, qidxpos0: int, tidxpos0: int, bp_list: List[Tuple[int, int]], mms: List[str], + alpha: float, beta: float) -> List[Mutation]: + mutations = [] + + possible_mutations = { + 'AU': ['UA', 'UG', 'CG'], + 'UA': ['AU', 'GU', 'GC'], + 'GC': ['UA', 'UG', 'CG'], + 'CG': ['AU', 'GU', 'GC'], + 'GU': ['UA', 'UG', 'CG'], + 'UG': ['AU', 'GU', 'GC'] + } + + for q_index, t_index in bp_list: + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + q_index_a = idx_to_array_index(q_index, qidxpos0) + t_index_a = idx_to_array_index(t_index, tidxpos0) + + pos = possible_mutations[f'{query[q_index_a]}{target[t_index_a]}'] + + for p in pos: + bp_mm = f'{p[0]}{p[1]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def gen_specific(query: str, target: str, qidxpos0: int, tidxpos0: int, mms: List[str], alpha: float, beta: float, + encoding: str) -> List[Mutation]: + mutations = [] + + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + for mut in encoding.split(','): # split block mutations into single mutations + q, t = mut.split('&') + q_index = int(q[1:len(q)-1]) # mutation indices + t_index = int(t[1:len(t)-1]) + + bp_mm = f'{q[len(q)-1]}{t[len(t)-1]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def get_generator(string: str) -> any: + """ + Gets a mutation generator by its string name + :param string: The string name + :return: A reference to a generator function + """ + generators = { + 'flip': gen_flip, + 'any': gen_any, + 'specific': gen_specific + } + return generators.get(string) diff --git a/python/intarnapvalue/README.md b/python/intarnapvalue/README.md new file mode 100644 index 00000000..7c67eb11 --- /dev/null +++ b/python/intarnapvalue/README.md @@ -0,0 +1,86 @@ +# IntaRNApvalue +A tool for calculating p-values of IntaRNA energy scores. + +### How it works: +This tool shuffles one or more of the given sequences (depending on shuffle mode) while preserving mono- and dinucleotide frequences. +Thus the resulting sequences are very similar with similar properties to the original one. +It then calculates the IntaRNA interaction scores for these newly generated sequences and uses them to approximate the p-value of the original target/query combination. +This p-value is a score for the likelihood that a sequence with a better interaction score than the original ones can randomly occur. +It can thus be used as a measurement for how good an interaction between two sequences is. + +## Dependencies: +- IntaRNA +##### And if you want to run it with python or compile it yourself: +- Python 3 (>= 3.6) +- numpy +- scipy + +## Installation: +You can either run this tool directly with python, install it with setuptools or run it as a compiled binary. +##### Run with python directly: +You don't have to install it, if you just plan on running it from a certain directory. +Simply copy the "intarnapvalue" folder where you want to run it from. +You have to get the dependencies yourself in this case. + +To use the tool from anywhere however, you need to install it as a python module. +All dependencies (except IntaRNA) will be installed automatically. +This way it can also be installed in a virtual environment. +```console +python setup.py install +``` + +##### Run as binary: +Go into the bin directory. You can find pre-compiled builds for linux and windows for x64 directly. +You don't need to install any dependencies for running these, on Windows however you will need Microsoft Visual C++ Redistributable. +If you want to compile it yourself, you need to get the dependencies and PyInstaller. +I had problems with numpy 1.17.0, so I had to use 1.16.4. +Simply run the build.py with the python binary that has the dependencies and PyInstaller installed. +You will find your binary in the build folder. + +## Usage: +Run it as a module without installing (you need to be in the parent dir of intarnapvalue): +```console +python3 -m intarnapvalue +``` + +Run it as a module from anywhere with python (need to run setup.py first, see [Installation](#installation)): +```console +python3 -m intarnapvalue +``` +Or use as a compiled binary: +```console +./IntaRNApvalue +``` + +Or import it and use it from within other python code: +```python +from intarnapvalue.intarna_pvalue import IntaRNApvalue +IntaRNApvalue(['--flag1', 'arg1']) +``` + +## Arguments: + +| Flag | Value | Default | Description | +| ------------------ |:---------------------- | :------ | -------------------- | +| -h, --help | | | Gives detailed command help. | +| -q, --query | Sequence or FASTA file | | Query as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | +| -t, --target | Sequence or FASTA file | | Target as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | +| -c, --cardinality | Integer | | How many sequence pairs are randomly permuted and considered for p-value calculation. | +| -m, --shuffle-mode | {q, t, b} | | Which sequence will be shuffled: Query, Target or both. | +| -d, --distribution | {gev, gumbel, gauss} | gev | (optional) The distribution used for p-value calculation: Generalized Extreme Value Distribution, Gumbel Distribution or Gauss. | +| -o, --output | {pvalue, scores} | pvalue | (optional) If set to p-value, will only output p-value. If set to scores, will output every score from randomly generated sequences, but no p-value. | +| --threads | 0 - {max threads} | 0 | (optional) How many threads IntaRNA uses for score calculation. If set to 0 it will use all available threads. | +| --randSeed | Integer | None | (optional) The seed used for generating random sequences. | +| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | + +## Example +An example use-case could look like this with sequences in raw format: +```console +$ python3 -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0 +0.2421277140114205 +``` +Or you could specify any of the sequences as files: +```console +$ python3 -m intarnapvalue --query query.fasta --target target.fasta --scores 10000 --shuffle-mode b --threads 0 +0.2421277140114205 +``` \ No newline at end of file diff --git a/src/IntaRNA/IndexRange.h b/src/IntaRNA/IndexRange.h index c5258b4e..5b4285db 100644 --- a/src/IntaRNA/IndexRange.h +++ b/src/IntaRNA/IndexRange.h @@ -176,8 +176,12 @@ class IndexRange { if( ! boost::regex_match(stringEncoding, regex, boost::match_perl) ) { throw std::runtime_error("IndexRange::fromString("+stringEncoding+") uses no valid index range string encoding"); } - // find split position - const size_t splitPos = stringEncoding.find('-'); + // find split position assuming second index to be negative + size_t splitPos = stringEncoding.find("--"); + if (splitPos == std::string::npos) { + // last index should not be negative, thus, split at last '-' + splitPos = stringEncoding.find_last_of('-'); + } // parse interval boundaries if (seq == NULL) { from = boost::lexical_cast(stringEncoding.substr(0,splitPos)); diff --git a/src/IntaRNA/InteractionEnergy.h b/src/IntaRNA/InteractionEnergy.h index 53bbfd17..0241bde5 100644 --- a/src/IntaRNA/InteractionEnergy.h +++ b/src/IntaRNA/InteractionEnergy.h @@ -148,7 +148,8 @@ class InteractionEnergy { * Checks whether or not two positions can form a base pair * @param i1 index in first sequence * @param i2 index in second sequence - * @return true if seq1(i1) can form a base pair with seq2(i2) + * @return true if seq1(i1) can form a base pair with seq2(i2) and both + * positions are accessible */ virtual bool @@ -685,7 +686,9 @@ bool InteractionEnergy:: areComplementary( const size_t i1, const size_t i2 ) const { - return RnaSequence::areComplementary( accS1.getSequence(), accS2.getSequence(), i1, i2); + return RnaSequence::areComplementary( accS1.getSequence(), accS2.getSequence(), i1, i2) + && isAccessible1(i1) + && isAccessible2(i2); } //////////////////////////////////////////////////////////////////////////// diff --git a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp index 4e124e0f..86abd0c6 100644 --- a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp @@ -267,8 +267,12 @@ fillHybridE_left( const size_t si1, const size_t si2 ) } // update mfe if needed if ( E_isNotINF( curMinE ) ) { + // safe cases for Zall update: + // - i==si && j==sj : seed only + // - i!=si && and no seed between i (incl.) and si (excl) : si == left-most seed + // leftE (incl. E_init) + seedE - updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, i1==si1 && i2==si2 ); + updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, (i1==si1 && i2==si2) ); // check if right opt != right seed boundary // check for max interaction length @@ -276,9 +280,16 @@ fillHybridE_left( const size_t si1, const size_t si2 ) && (j1opt+1-i1)<=energy.getAccessibility1().getMaxLength() && (j2opt+1-i2)<=energy.getAccessibility2().getMaxLength() ) { + // check if no seed between i and si + // or no Z update for left extensions to avoid duplicated handling (underestimates Zall) + bool noSeedLeftOfsi = (i1 < si1 && !seedHandler.isSeedBound(i1,i2)); + if (noSeedLeftOfsi) { + size_t x1=si1, x2=si2; + // check of no seed start between i and si + noSeedLeftOfsi = !(seedHandler.updateToNextSeed(x1,x2, i1+1,si1-1, i2+1, si2-1)); + } // leftE (incl. E_init) + seedE + rightOptE - // no Z update for left extensions to avoid duplicated handling (underestimates Zall) - updateOptima( i1,j1opt,i2,j2opt, curMinE + seedE + rightOptE, true, false ); + updateOptima( i1,j1opt,i2,j2opt, curMinE + seedE + rightOptE, true, noSeedLeftOfsi ); } } } diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index 9d56491f..697e13b5 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -142,6 +142,13 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) tShapeMethod("Zb0.89"), tShapeConversion("Os1.6i-2.29"), + // meta constraints + acc("acc","NC", 'C'), + accW("accW", 0, 99999, 150), + accL("accL", 0, 99999, 100), + intLenMax("intLenMax", 0, 99999, 0), + intLoopMax("intLoopMax", 0, 30, 10), + // Helix Constraints helixMinBP("helixMinBP",2,4,2), helixMaxBP("helixMaxBP",2,20,10), @@ -228,6 +235,10 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 0); resetParamDefault<>(tAccL, 0); resetParamDefault<>(tIntLoopMax, 16); + resetParamDefault<>(accW, 0); + resetParamDefault<>(accL, 0); + resetParamDefault<>(intLenMax, 0); + resetParamDefault<>(intLoopMax, 16); #if INTARNA_MULITHREADING resetParamDefault<>(threads, 1); #endif @@ -259,6 +270,10 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 150); resetParamDefault<>(tAccL, 100); resetParamDefault<>(tIntLoopMax, 16); + resetParamDefault<>(accW, 150); + resetParamDefault<>(accL, 100); + resetParamDefault<>(intLenMax, 0); + resetParamDefault<>(intLoopMax, 16); resetParamDefault<>(energyFile, std::string(VrnaHandler::Turner04), "energyVRNA"); #if INTARNA_MULITHREADING resetParamDefault<>(threads, 1); @@ -302,6 +317,9 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 0); resetParamDefault<>(tAccL, 0); resetParamDefault<>(tIntLenMax, 60); + resetParamDefault<>(accW, 0); + resetParamDefault<>(accL, 0); + resetParamDefault<>(intLenMax, 60); break; case IntaRNAseed : // seed-only prediction @@ -316,6 +334,8 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tIntLoopMax, 8); resetParamDefault<>(qIntLenMax, 60); resetParamDefault<>(qIntLoopMax, 8); + resetParamDefault<>(intLenMax, 60); + resetParamDefault<>(intLoopMax, 8); resetParamDefault<>(outMinPu, 0.001); // new features added with 3.1.0 resetParamDefault<>(outNoLP, true, "outNoLP"); @@ -346,11 +366,29 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ("query,q" , value(&queryArg) ->required() - ->notifier(boost::bind(&CommandLineParsing::validate_query,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_sequenceArgument,this,"query",_1)) , "either an RNA sequence or the stream/file name from where to read" " the query sequences (should be the shorter sequences to increase efficiency);" " use 'STDIN' to read from standard input stream; sequences have to use" " IUPAC nucleotide encoding; output alias is [seq2]") + ; + opts_cmdline_short.add(opts_query); + opts_query.add_options() + ("qId" + , value(&qId) + ->default_value("") + ->notifier(boost::bind(&CommandLineParsing::validate_id,this,"qId",_1)) + , std::string("id (FASTA-prefix) to be used for query sequence naming.").c_str()) + (qIdxPos0.name.c_str() + , value(&(qIdxPos0.val)) + ->default_value(qIdxPos0.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,qIdxPos0,_1)) + , std::string("index of first (5') sequence position of all query sequences" + " (arg in range ["+toString(qIdxPos0.min)+","+toString(qIdxPos0.max)+"];").c_str()) + ("qSet" + , value(&(qSetString)) + ->notifier(boost::bind(&CommandLineParsing::validate_qSet,this,_1)) + , std::string("query subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) (qAcc.name.c_str() , value(&(qAcc.val)) ->default_value(qAcc.def) @@ -364,7 +402,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (qAccW.name.c_str() , value(&(qAccW.val)) ->default_value(qAccW.def) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccW,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,qAccW,_1,1,2)) , std::string("accessibility computation : sliding window size for query accessibility computation" " (arg in range ["+toString(qAccW.min)+","+toString(qAccW.max)+"];" " 0 will use to the full sequence length)." @@ -373,25 +411,12 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (qAccL.name.c_str() , value(&(qAccL.val)) ->default_value(qAccL.def) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccL,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,qAccL,_1,1,2)) , std::string("accessibility computation : maximal loop length (base pair span) for query accessibility computation" " (arg in range ["+toString(qAccL.min)+","+toString(qAccL.max)+"]; 0 will use to sliding window size 'qAccW')").c_str()) - ; - opts_cmdline_short.add(opts_query); - opts_query.add_options() - (qIdxPos0.name.c_str() - , value(&(qIdxPos0.val)) - ->default_value(qIdxPos0.def) - ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,qIdxPos0,_1)) - , std::string("index of first (5') sequence position of all query sequences" - " (arg in range ["+toString(qIdxPos0.min)+","+toString(qIdxPos0.max)+"];").c_str()) - ("qSet" - , value(&(qSetString)) - ->notifier(boost::bind(&CommandLineParsing::validate_qSet,this,_1)) - , std::string("query subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) ("qAccConstr" , value(&(qAccConstr)) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccConstr,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_structureConstraintArgument,this,"qAccConstr",_1)) , std::string("accessibility computation : structure constraint :" "\n EITHER a string of query sequence length encoding for each position:" " '.' no constraint," @@ -424,7 +449,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(qIntLoopMax.min)+","+toString(qIntLoopMax.max)+"]; 0 enforces stackings only)").c_str()) ("qRegion" , value(&(qRegionString)) - ->notifier(boost::bind(&CommandLineParsing::validate_qRegion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validateRegion,this,"qRegion",_1)) , std::string("interaction site : query regions to be considered for" " interaction prediction. Format =" " 'from1-to1,from2-to2,..' where indexing starts with 'qIdxPos0'." @@ -450,8 +475,26 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ("target,t" , value(&targetArg) ->required() - ->notifier(boost::bind(&CommandLineParsing::validate_target,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_sequenceArgument,this,"target",_1)) , "either an RNA sequence or the stream/file name from where to read the target sequences (should be the longer sequences to increase efficiency); use 'STDIN' to read from standard input stream; sequences have to use IUPAC nucleotide encoding; output alias is [seq1]") + ; + opts_cmdline_short.add(opts_target); + opts_target.add_options() + ("tId" + , value(&tId) + ->default_value("") + ->notifier(boost::bind(&CommandLineParsing::validate_id,this,"tId",_1)) + , std::string("id (FASTA-prefix) to be used for target sequence naming.").c_str()) + (tIdxPos0.name.c_str() + , value(&(tIdxPos0.val)) + ->default_value(tIdxPos0.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,tIdxPos0,_1)) + , std::string("index of first (5') sequence position of all target sequences" + " (arg in range ["+toString(tIdxPos0.min)+","+toString(tIdxPos0.max)+"];").c_str()) + ("tSet" + , value(&(tSetString)) + ->notifier(boost::bind(&CommandLineParsing::validate_tSet,this,_1)) + , std::string("target subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) (tAcc.name.c_str() , value(&(tAcc.val)) ->default_value(tAcc.def) @@ -465,8 +508,8 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (tAccW.name.c_str() , value(&(tAccW.val)) ->default_value(tAccW.def) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccW,this,_1)) - , std::string("accessibility computation : sliding window size for query accessibility computation" + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,tAccW,_1,1,2)) + , std::string("accessibility computation : sliding window size for target accessibility computation" " (arg in range ["+toString(tAccW.min)+","+toString(tAccW.max)+"];" " 0 will use the full sequence length)" " Note, this also restricts the maximal interaction length (see --tIntLenMax)." @@ -474,25 +517,12 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (tAccL.name.c_str() , value(&(tAccL.val)) ->default_value(tAccL.def) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccL,this,_1)) - , std::string("accessibility computation : maximal loop size (base pair span) for query accessibility computation" + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,tAccL,_1,1,2)) + , std::string("accessibility computation : maximal loop size (base pair span) for target accessibility computation" " (arg in range ["+toString(tAccL.min)+","+toString(tAccL.max)+"]; 0 will use the sliding window size 'tAccW')").c_str()) - ; - opts_cmdline_short.add(opts_target); - opts_target.add_options() - (tIdxPos0.name.c_str() - , value(&(tIdxPos0.val)) - ->default_value(tIdxPos0.def) - ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,tIdxPos0,_1)) - , std::string("index of first (5') sequence position of all target sequences" - " (arg in range ["+toString(tIdxPos0.min)+","+toString(tIdxPos0.max)+"];").c_str()) - ("tSet" - , value(&(tSetString)) - ->notifier(boost::bind(&CommandLineParsing::validate_tSet,this,_1)) - , std::string("target subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) ("tAccConstr" , value(&(tAccConstr)) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccConstr,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_structureConstraintArgument,this,"tAccConstr",_1)) , std::string("accessibility computation : structure constraint :" "\n EITHER a string of target sequence length encoding for each position:" " '.' no constraint," @@ -525,7 +555,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(tIntLoopMax.min)+","+toString(tIntLoopMax.max)+"]; 0 enforces stackings only)").c_str()) ("tRegion" , value(&(tRegionString)) - ->notifier(boost::bind(&CommandLineParsing::validate_tRegion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validateRegion,this,"tRegion",_1)) , std::string("interaction site : target regions to be considered for" " interaction prediction. Format =" " 'from1-to1,from2-to2,..' where indexing starts with 'tIdxPos0'." @@ -653,11 +683,11 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(seedMinPu.min)+","+toString(seedMinPu.max)+"]).").c_str()) ("seedQRange" , value(&(seedQRange)) - ->notifier(boost::bind(&CommandLineParsing::validate_seedQRange,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_seedRange,this,"seedQRange",_1)) , std::string("interval(s) in the query to search for seeds in format 'from1-to1,from2-to2,...' (Note, only for single query)").c_str()) ("seedTRange" , value(&(seedTRange)) - ->notifier(boost::bind(&CommandLineParsing::validate_seedTRange,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_seedRange,this,"seedTRange",_1)) , std::string("interval(s) in the target to search for seeds in format 'from1-to1,from2-to2,...' (Note, only for single target)").c_str()) ("seedNoGU", value(&seedNoGU) ->default_value(seedNoGU) @@ -674,15 +704,15 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) opts_shape.add_options() ("qShape" , value(&qShape) - ->notifier(boost::bind(&CommandLineParsing::validate_qShape,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shape,this,"qShape",_1)) , "file name from where to read the query sequence's SHAPE reactivity data to guide its accessibility computation") ("tShape" , value(&tShape) - ->notifier(boost::bind(&CommandLineParsing::validate_tShape,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shape,this,"tShape",_1)) , "SHAPE: file name from where to read the target sequence's SHAPE reactivity data to guide its accessibility computation") ("qShapeMethod" , value(&qShapeMethod) - ->notifier(boost::bind(&CommandLineParsing::validate_qShapeMethod,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeMethod,this,"qShapeMethod",_1)) , std::string("SHAPE: method how to integrate SHAPE reactivity data into query accessibility computation via pseudo energies:" "\n" " 'D': Convert by using a linear equation according to Deigan et al. (2009). The calculated pseudo energies will " @@ -697,13 +727,13 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ).c_str()) ("tShapeMethod" , value(&tShapeMethod) - ->notifier(boost::bind(&CommandLineParsing::validate_tShapeMethod,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeMethod,this,"tShapeMethod",_1)) , std::string("SHAPE: method how to integrate SHAPE reactivity data into target accessibility computation via pseudo energies.\n" "[for encodings see --qShapeMethod]" ).c_str()) ("qShapeConversion" , value(&qShapeConversion) - ->notifier(boost::bind(&CommandLineParsing::validate_qShapeConversion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeConversion,this,"qShapeConversion",_1)) , std::string("SHAPE: method how to convert SHAPE reactivities to pairing probabilities for query accessibility computation. " "This parameter is useful when dealing with the SHAPE incorporation according to Zarringhalam et al. (2012). " "The following methods can be used to convert SHAPE reactivities into the probability for a certain nucleotide to be unpaired:" @@ -720,7 +750,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ).c_str()) ("tShapeConversion" , value(&tShapeConversion) - ->notifier(boost::bind(&CommandLineParsing::validate_tShapeConversion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeConversion,this,"tShapeConversion",_1)) , std::string("SHAPE: method how to convert SHAPE reactivities to pairing probabilities for target accessibility computation.\n" "[for encodings see --qShapeConversion]" ).c_str()) @@ -738,9 +768,6 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) "\n 'M' = exact (slow), " "\n 'S' = seed-only" ).c_str()) - ; - opts_cmdline_short.add(opts_inter); - opts_inter.add_options() (model.name.c_str() , value(&(model.val)) ->default_value(model.def) @@ -751,6 +778,48 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) "\n 'B' = single-site, helix-block-based, minimum-free-energy interaction (blocks of stable helices and interior loops only), " "\n 'P' = single-site interaction with minimal free ensemble energy per site (interior loops only)" ).c_str()) + (acc.name.c_str() + , value(&(acc.val)) + ->default_value(acc.def) + ->notifier(boost::bind(&CommandLineParsing::validate_charArgument,this,acc,_1)) + , std::string("accessibility computation :" + "\n 'N' no accessibility contributions" + "\n 'C' computation of accessibilities (see --accW and --accL)" + ).c_str()) + (intLenMax.name.c_str() + , value(&(intLenMax.val)) + ->default_value(intLenMax.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,intLenMax,_1)) + , std::string("interaction site : maximal window size to be considered for" + " interaction" + " (arg in range ["+toString(intLenMax.min)+","+toString(intLenMax.max)+"];" + " 0 refers to the full sequence length)." + " If --accW is provided, the smaller window size of both is used." + ).c_str()) + (intLoopMax.name.c_str() + , value(&(intLoopMax.val)) + ->default_value(intLoopMax.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,intLoopMax,_1)) + , std::string("interaction site : maximal number of unpaired bases between neighbored interacting bases to be considered in interactions" + " (arg in range ["+toString(intLoopMax.min)+","+toString(intLoopMax.max)+"]; 0 enforces stackings only)").c_str()) + ; + opts_cmdline_short.add(opts_inter); + opts_inter.add_options() + (accW.name.c_str() + , value(&(accW.val)) + ->default_value(accW.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,accW,_1,1,2)) + , std::string("accessibility computation : sliding window size for accessibility computation" + " (arg in range ["+toString(accW.min)+","+toString(accW.max)+"];" + " 0 will use the full sequence length)" + " Note, this also restricts the maximal interaction length (see --intLenMax)." + ).c_str()) + (accL.name.c_str() + , value(&(accL.val)) + ->default_value(accL.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,accL,_1,1,2)) + , std::string("accessibility computation : maximal loop size (base pair span) for accessibility computation" + " (arg in range ["+toString(tAccL.min)+","+toString(accL.max)+"]; 0 will use the sliding window size 'accW')").c_str()) ((energy.name+",e").c_str() , value(&(energy.val)) ->default_value(energy.def) @@ -1104,7 +1173,6 @@ parse(int argc, char** argv) // parsing escape literals outSep = unescaped_string::getUnescaped( outSep ); - // open output stream { // open according stream @@ -1118,13 +1186,33 @@ parse(int argc, char** argv) } // parse the sequences - parseSequences("query",queryArg,query,qSet,qIdxPos0.val); - parseSequences("target",targetArg,target,tSet,tIdxPos0.val); + parseSequences("query",qId,queryArg,query,qSet,qIdxPos0.val); + parseSequences("target",tId,targetArg,target,tSet,tIdxPos0.val); // validate accessibility input from file (requires parsed sequences) validate_qAccFile( qAccFile ); validate_tAccFile( tAccFile ); + //////////////// INTERACTION CONSTRAINTS /////////////////// + + // interaction length: meta vs. special + if (intLenMax.isSet()) { + if (qIntLenMax.isSet() && intLenMax.val != qIntLenMax.val) { throw error("--intLenMax and --qIntLenMax are set to different values"); } + if (tIntLenMax.isSet() && intLenMax.val != tIntLenMax.val) { throw error("--intLenMax and --tIntLenMax are set to different values"); } + qIntLenMax.val = intLenMax.val; + tIntLenMax.val = intLenMax.val; + } + + // interior loop length: meta vs. special + if (intLoopMax.isSet()) { + if (qIntLoopMax.isSet() && intLoopMax.val != qIntLoopMax.val) { throw error("--intLoopMax and --qIntLoopMax are set to different values"); } + if (tIntLoopMax.isSet() && intLoopMax.val != tIntLoopMax.val) { throw error("--intLoopMax and --tIntLoopMax are set to different values"); } + qIntLoopMax.val = intLoopMax.val; + tIntLoopMax.val = intLoopMax.val; + } + + //////////////// SEED //////////////////////// + // check seed setup if (noSeedRequired) { // reset model if needed @@ -1135,13 +1223,13 @@ parse(int argc, char** argv) if (mode.val == 'S') throw error("mode=S not applicable for non-seed predictions!"); // input sanity check : maybe seed constraints defined -> warn if (!seedTQ.empty()) LOG(INFO) <<"no seed constraint wanted, but explicit seedTQ provided (will be ignored)"; - if (seedBP.val != seedBP.def) LOG(INFO) <<"no seed constraint wanted, but seedBP provided (will be ignored)"; - if (seedMaxUP.val != seedMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxUP provided (will be ignored)"; - if (seedQMaxUP.val != seedQMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedQMaxUP provided (will be ignored)"; - if (seedTMaxUP.val != seedTMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedTMaxUP provided (will be ignored)"; - if (seedMaxE.val != seedMaxE.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxE provided (will be ignored)"; - if (seedMaxEhybrid.val != seedMaxEhybrid.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxEhybrid provided (will be ignored)"; - if (seedMinPu.val != seedMinPu.def) LOG(INFO) <<"no seed constraint wanted, but seedMinPu provided (will be ignored)"; + if (seedBP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedBP provided (will be ignored)"; + if (seedMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxUP provided (will be ignored)"; + if (seedQMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedQMaxUP provided (will be ignored)"; + if (seedTMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedTMaxUP provided (will be ignored)"; + if (seedMaxE.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxE provided (will be ignored)"; + if (seedMaxEhybrid.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxEhybrid provided (will be ignored)"; + if (seedMinPu.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMinPu provided (will be ignored)"; if (!seedQRange.empty()) LOG(INFO) <<"no seed constraint wanted, but seedQRange provided (will be ignored)"; if (!seedTRange.empty()) LOG(INFO) <<"no seed constraint wanted, but seedTRange provided (will be ignored)"; if (seedNoGU) LOG(INFO) <<"no seed constraint wanted, but seedNoGU provided (will be ignored)"; @@ -1183,18 +1271,21 @@ parse(int argc, char** argv) throw error("explicit seed definition only for single query/target available"); } // input sanity check : maybe seed constraints defined -> warn - if (seedBP.val != seedBP.def) LOG(INFO) <<"explicit seeds defined, but seedBP provided (will be ignored)"; - if (seedMaxUP.val != seedMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedMaxUP provided (will be ignored)"; - if (seedQMaxUP.val != seedQMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedQMaxUP provided (will be ignored)"; - if (seedTMaxUP.val != seedTMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedTMaxUP provided (will be ignored)"; - if (seedMaxE.val != seedMaxE.def) LOG(INFO) <<"explicit seeds defined, but seedMaxE provided (will be ignored)"; - if (seedMaxEhybrid.val != seedMaxEhybrid.def) LOG(INFO) <<"explicit seeds defined, but seedMaxEhybrid provided (will be ignored)"; - if (seedMinPu.val != seedMinPu.def) LOG(INFO) <<"explicit seeds defined, but seedMinPu provided (will be ignored)"; + if (seedBP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedBP provided (will be ignored)"; + if (seedMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxUP provided (will be ignored)"; + if (seedQMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedQMaxUP provided (will be ignored)"; + if (seedTMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedTMaxUP provided (will be ignored)"; + if (seedMaxE.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxE provided (will be ignored)"; + if (seedMaxEhybrid.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxEhybrid provided (will be ignored)"; + if (seedMinPu.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMinPu provided (will be ignored)"; if (!seedQRange.empty()) LOG(INFO) <<"explicit seeds defined, but seedQRange provided (will be ignored)"; if (!seedTRange.empty()) LOG(INFO) <<"explicit seeds defined, but seedTRange provided (will be ignored)"; if (seedNoGU) LOG(INFO) <<"explicit seeds defined, but seedNoGU provided (will be ignored)"; if (seedNoGUend) LOG(INFO) <<"explicit seeds defined, but seedNoGUend provided (will be ignored)"; } + // compare maximal interaction length with minimal seed length + if (qIntLenMax.val > 0 && qIntLenMax.val < seedBP.val) { throw error("maximal query interaction length < seedBP"); } + if (tIntLenMax.val > 0 && tIntLenMax.val < seedBP.val) { throw error("maximal target interaction length < seedBP"); } } /////////////// PARSE AND PREPARE PREDICTION RANGES ////////////// @@ -1216,55 +1307,27 @@ parse(int argc, char** argv) parseRegion( "qRegion", qRegionString, query, qRegion ); parseRegion( "tRegion", tRegionString, target, tRegion ); + //////////////// ACCESSIBILITY CONSTRAINTS /////////////////// - //////////////// WINDOW-BASED COMPUTATION /////////////////// - - // check if window-based computation enabled - if (windowWidth.val > 0) { - // minimal window width - if (windowWidth.val < 10) { - throw error("window-based computation: --windowWidth should be at least 10 but is "+toString(windowWidth.val)); - } - // ensure interaction length is restricted - size_t maxIntLength = 0; - if (qAcc.val != 'N') { // check if accessibility computation enabled - if (qIntLenMax.val == 0 && qAccW.val == 0) { - throw error("window-based computation: maximal query interaction length has to be restricted either via --qAccW or --qIntLenMax"); - } else { // update maximum - maxIntLength = std::max(qIntLenMax.val, qAccW.val); - } - } else { // max interaction length of query - if (qIntLenMax.val == 0) { - throw error("window-based computation: maximal query interaction length has to be restricted via --qIntLenMax"); - } else { - maxIntLength = qIntLenMax.val; - } - } - if (tAcc.val != 'N') { - if (tIntLenMax.val == 0 && tAccW.val == 0) { - throw error("window-based computation: maximal target interaction length has to be restricted either via --tAccW or --tIntLenMax"); - } else { // update maximum - maxIntLength = maxIntLength < std::max(tAccW.val,tIntLenMax.val) ? std::max(tAccW.val,tIntLenMax.val) : maxIntLength; - } - } else { - if (tIntLenMax.val == 0) { - throw error("window-based computation: maximal target interaction length has to be restricted via --tIntLenMax"); - } else { // update maximum - maxIntLength = maxIntLength < tIntLenMax.val ? tIntLenMax.val : maxIntLength; - } - } - if (windowOverlap.val < maxIntLength) { - throw error("window-based computation: --windowOverlap ("+toString(windowOverlap.val)+") has to be at least as large as the maximal interaction length, i.e. max of --q|tAccW and --q|tIntLenMax ("+toString(maxIntLength)+")"); - } - if (windowWidth.val <= windowOverlap.val) { - throw error("window-based computation: --windowWidth ("+toString(windowWidth.val)+") has to exceed --windowOverlap ("+toString(windowOverlap.val)+")"); - } - if (outNumber.val > 1 && outOverlap.val != 'B') { - throw error("window-based computation: non-overlapping subopt output (-n > 1) only supported for --outOverlap=B"); - } + // accessibility setup: meta vs. special + if (acc.isSet()) { + if (qAcc.isSet() && acc.val != qAcc.val) { throw error("--acc and --qAcc are set to different values"); } + if (tAcc.isSet() && acc.val != tAcc.val) { throw error("--acc and --tAcc are set to different values"); } + qAcc.val = acc.val; + tAcc.val = acc.val; + } + if (accW.isSet()) { + if (qAccW.isSet() && accW.val != qAccW.val) { throw error("--accW and --qAccW are set to different values"); } + if (tAccW.isSet() && accW.val != tAccW.val) { throw error("--accW and --tAccW are set to different values"); } + qAccW.val = accW.val; + tAccW.val = accW.val; + } + if (accL.isSet()) { + if (qAccL.isSet() && accL.val != qAccL.val) { throw error("--accL and --qAccL are set to different values"); } + if (tAccL.isSet() && accL.val != tAccL.val) { throw error("--accL and --tAccL are set to different values"); } + qAccL.val = accL.val; + tAccL.val = accL.val; } - - //////////////// ACCESSIBILITY CONSTRAINTS /////////////////// // check qAccConstr - query sequence compatibility if (vm.count("qAccConstr") > 0) { @@ -1306,8 +1369,8 @@ parse(int argc, char** argv) break; } // drop to next handling case 'N' : { - if (qAccL.val != qAccL.def) LOG(INFO) <<"qAcc = "< 1 || query.size() > 1) { throw error("outmode=E allows only single sequence input for query and target"); } + // ensure separator is not used within sequence name + if (qId.find(outSep) != std::string::npos) { throw error("--qId contains --outSep separator..."); } + if (tId.find(outSep) != std::string::npos) { throw error("--tId contains --outSep separator..."); } + break; + case 'C' : + // ensure separator is not used within sequence name + if (qId.find(outSep) != std::string::npos) { throw error("--qId contains --outSep separator..."); } + if (tId.find(outSep) != std::string::npos) { throw error("--tId contains --outSep separator..."); } + break; + default: + break; } // check CSV stuff @@ -1394,6 +1472,57 @@ parse(int argc, char** argv) } } + + //////////////// WINDOW-BASED COMPUTATION /////////////////// + + // check if window-based computation enabled + if (windowWidth.val > 0) { + // minimal window width + if (windowWidth.val < 10) { + throw error("window-based computation: --windowWidth should be at least 10 but is "+toString(windowWidth.val)); + } + // ensure interaction length is restricted + size_t maxIntLength = 0; + if (qAcc.val != 'N') { // check if accessibility computation enabled + if (qIntLenMax.val == 0 && qAccW.val == 0) { + throw error("window-based computation: maximal query interaction length has to be restricted either via --qAccW or --qIntLenMax"); + } else { // update maximum + maxIntLength = std::max(qIntLenMax.val, qAccW.val); + } + } else { // max interaction length of query + if (qIntLenMax.val == 0) { + throw error("window-based computation: maximal query interaction length has to be restricted via --qIntLenMax"); + } else { + maxIntLength = qIntLenMax.val; + } + } + if (tAcc.val != 'N') { + if (tIntLenMax.val == 0 && tAccW.val == 0) { + throw error("window-based computation: maximal target interaction length has to be restricted either via --tAccW or --tIntLenMax"); + } else { // update maximum + maxIntLength = maxIntLength < std::max(tAccW.val,tIntLenMax.val) ? std::max(tAccW.val,tIntLenMax.val) : maxIntLength; + } + } else { + if (tIntLenMax.val == 0) { + throw error("window-based computation: maximal target interaction length has to be restricted via --tIntLenMax"); + } else { // update maximum + maxIntLength = maxIntLength < tIntLenMax.val ? tIntLenMax.val : maxIntLength; + } + } + if (windowOverlap.val < maxIntLength) { + throw error("window-based computation: --windowOverlap ("+toString(windowOverlap.val)+") has to be at least as large as the maximal interaction length, i.e. max of --q|tAccW and --q|tIntLenMax ("+toString(maxIntLength)+")"); + } + if (windowWidth.val <= windowOverlap.val) { + throw error("window-based computation: --windowWidth ("+toString(windowWidth.val)+") has to exceed --windowOverlap ("+toString(windowOverlap.val)+")"); + } + if (outNumber.val > 1 && outOverlap.val != 'B') { + throw error("window-based computation: non-overlapping subopt output (-n > 1) only supported for --outOverlap=B"); + } + } + + + //////////////// MODEL //////////////////////// + // final checks if parameter set compatible with model switch (model.val) { // ensemble based predictions @@ -1908,6 +2037,7 @@ getOutputConstraint() const void CommandLineParsing:: parseSequences(const std::string & paramName, + const std::string & idPrefix, const std::string& paramArg, RnaSequenceVec& sequences, const IndexRangeList & seqSubset, @@ -1919,13 +2049,13 @@ parseSequences(const std::string & paramName, // read FASTA from STDIN stream if (boost::iequals(paramArg,"STDIN")) { - parseSequencesFasta(paramName, std::cin, sequences, seqSubset,idxPos0); + parseSequencesFasta(paramName,idPrefix, std::cin, sequences, seqSubset,idxPos0); } else if (RnaSequence::isValidSequenceIUPAC(paramArg)) { // check if sequence is to be stored if (seqSubset.empty() || seqSubset.covers(1)) { // direct sequence input - sequences.push_back(RnaSequence(paramName,paramArg,idxPos0)); + sequences.push_back(RnaSequence(idPrefix.empty()?paramName:idPrefix,paramArg,idxPos0)); } else { // would result in no sequence -> error LOG(ERROR) <<"Parsing of "< qIdxPos0; //! subset of query sequence indices to be processed @@ -541,6 +553,8 @@ class CommandLineParsing { std::string targetArg; //! the container holding all target sequences RnaSequenceVec target; + //! the id (prefix) to be used for target naming + std::string tId; //! in/output index of pos 0 (of all targets) NumberParameter tIdxPos0; //! subset of target sequence indices to be processed @@ -578,6 +592,22 @@ class CommandLineParsing { //! probabilities for according accessibility prediction std::string tShapeConversion; + // META PARAMETER applied to both query and target + //! accessibility computation mode + CharParameter acc; + //! window length for accessibility computation (plFold) + NumberParameter accW; + //! maximal base pair span for accessibility computation (plFold) + NumberParameter accL; + //! whether or not lonely base pairs are allowed in accessibility computation + bool accNoLP; + //! whether or not GU base pairs are allowed to close loops in accessibility computation + bool accNoGUend; + //! window length to be considered accessible/interacting + NumberParameter intLenMax; + //! maximal internal loop length to be considered accessible/interacting + NumberParameter intLoopMax; + //! the minimal number of base pairs allowed in the helix (>2) NumberParameter helixMinBP; @@ -650,11 +680,6 @@ class CommandLineParsing { bool energyNoDangles; - //! whether or not lonely base pairs are allowed in accessibility computation - bool accNoLP; - //! whether or not GU base pairs are allowed to close loops in accessibility computation - bool accNoGUend; - //! where to write the output to and for each in what format // std::string out; std::vector< std::string > out; @@ -731,6 +756,7 @@ class CommandLineParsing { resetParamDefault( Param & param, Value value ) { if (param.def != value) { param.def = value; + param.val = value; VLOG(1) <<" "< void validate_numberArgument(const NumberParameter & param, const T& value) { - // alphabet check + // range check if ( ! param.isInRange(value) ) { LOG(ERROR) < + void validate_numberArgumentExcludeRange(const NumberParameter & param, const T& value, const T& minExcl, const T& maxExcl) + { + // standard check + validate_numberArgument( param, value ); + + // check excluded range + if (param.val >= minExcl && param.val <= maxExcl) { + LOG(ERROR) <<"\n "< "<(qAccW,value); - // check lower bound - if (qAccW.val > 0 && qAccW.val < 3) { - LOG(ERROR) <<"\n qAccW = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_qAccL(const int & value) -{ - // standard check - validate_numberArgument(qAccL,value); - // check lower bound - if (qAccL.val > 0 && qAccL.val < 3) { - LOG(ERROR) <<"qAccL = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_qAccConstr(const std::string & value) -{ - // forward check to general method - validate_structureConstraintArgument("qAccConstr", value); -} - -//////////////////////////////////////////////////////////////////////////// - inline void CommandLineParsing::validate_qAccFile(const std::string & value) { @@ -1234,7 +1179,7 @@ void CommandLineParsing::validate_qAccFile(const std::string & value) // if not STDIN if ( boost::iequals(value,"STDIN") ) { if (getTargetSequences().size()>1) { - LOG(ERROR) <<"reading quary accessibilities for multiple sequences from '"<(tAccW,value); - // check lower bound - if (tAccW.val > 0 && tAccW.val < 3) { - LOG(ERROR) <<"tAccW = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tAccL(const int & value) -{ - // standard check - validate_numberArgument(tAccL,value); - // check lower bound - if (tAccL.val > 0 && tAccL.val < 3) { - LOG(ERROR) <<"tAccL = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tAccConstr(const std::string & value) -{ - // forward check to general method - validate_structureConstraintArgument("tAccConstr", value); -} - -//////////////////////////////////////////////////////////////////////////// - inline void CommandLineParsing::validate_tAccFile(const std::string & value) { @@ -1387,48 +1279,6 @@ void CommandLineParsing::validate_tAccFile(const std::string & value) //////////////////////////////////////////////////////////////////////////// -inline -void CommandLineParsing::validate_tRegion(const std::string & value) { - // check and store region information - validateRegion( "tRegion", value ); -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tShape( const std::string & value ) -{ - if (!validateFile( value )) { - LOG(ERROR) <<"Can not access/read target's SHAPE reactivity file '" <getSequence().getId() + VLOG(1) <<"Sequence '"<getSequence().getId() <<"' contains ambiguous nucleotide encodings. These positions are ignored for interaction computation."; } #if INTARNA_MULITHREADING @@ -182,8 +182,8 @@ int main(int argc, char **argv){ #if INTARNA_MULITHREADING #pragma omp critical(intarna_omp_logOutput) #endif - { LOG(INFO) <<"Sequence '"<getSequence().getId() - <<"' contains ambiguous IUPAC nucleotide encodings. These positions are ignored for interaction computation and replaced by 'N'.";} + { VLOG(1) <<"Sequence '"<getSequence().getId() + <<"' contains ambiguous IUPAC nucleotide encodings. These positions are ignored for interaction computation and are replaced by 'N'.";} } // second: iterate over all query sequences