diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index b1f775da..91b75084 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -7,7 +7,7 @@ # Minimal settings required before this configuration can be executed: # - set your environment, paths to tools and databases (at the end of this file) # - under "global", set prefix -# - under "align_1" and "align_2", set the monomer sequence_id +# - under "align_1" and "align_2", set the monomer sequence_id # - run it! :) # Configuration rules: @@ -29,12 +29,12 @@ stages: - compare - mutate - fold - + # Global job settings. These will override settings of the same name in each of the stages. # These are typically the settings you want to modify for each of your jobs, together with some settings in the align stage. global: # mandatory output prefix of the job (e.g. output/HRAS will store outputs in folder "output", using files prefixed with "HRAS") - prefix: + prefix: # Clustering threshold for downweighting redudant sequences (Meff computation). E.g. 0.8 will cluster sequences # at a 80% sequence identity cutoff @@ -47,20 +47,20 @@ global: align_1: # use complex protocol to properly prepare inputs for concatenation protocol: complex - + # monomer alignment creation protocol to nest within the complex alignment protocol # choose either existing (below) to use a previously created alignment - # or standard to construct an alignment + # or standard to construct an alignment alignment_protocol: standard # Mandatory: specify the sequence identifier # Region can be left blank # Sequence file can be left blank - sequence_id: - region: + sequence_id: + region: sequence_file: - + # The following typically do not need to be set because 'global' overrides them # prefix: # theta: @@ -87,7 +87,7 @@ align_1: # sequence database (specify possible databases and paths in "databases" section below) # note: use uniprot for genome distance based concatenation - database: uniref100 + database: uniprot # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. # To save compute time, this computation is normally carried out in the couplings stage @@ -134,10 +134,10 @@ align_1: # minimum_column_coverage: 70 # extract_annotation: True -# # if using existing alignment protocol, provide a path to the annotations.csv file +# # if using existing alignment protocol, provide a path to the annotations.csv file # # from the monomer run that generated the input alignment # # Needed to correctly find the species identifiers for best hit concatenation -# override_annotation_file: +# override_annotation_file: # Sequence alignment generation/processing for the second monomer. align_2: @@ -148,10 +148,10 @@ align_2: alignment_protocol: standard # Mandatory: specify the sequence identifier and region # Sequence file can be left blank - sequence_id: + sequence_id: region: sequence_file: - + # The following typically do not need to be set because 'global' overrides them # prefix: # theta: @@ -178,7 +178,7 @@ align_2: # sequence database (specify possible databases and paths in "databases" section below) # note: use uniprot for genome distance based concatenation - database: uniref100 + database: uniprot # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. 
# To save compute time, this computation is normally carried out in the couplings stage @@ -224,10 +224,10 @@ align_2: # minimum_sequence_coverage: 50 # minimum_column_coverage: 70 # extract_annotation: True -# # if using existing alignment protocol, provide a path to the annotations.csv file +# # if using existing alignment protocol, provide a path to the annotations.csv file # # from the monomer run that generated the input alignment # # Needed to correctly find the species identifiers for best hit concatenation -# override_annotation_file: +# override_annotation_file: #Generation of concatenated sequence alignment for evolutionary couplings calculation concatenate: @@ -238,31 +238,35 @@ concatenate: second_alignment_file: # Select protocol for concatenation of sequence alignments - # Available protocols: + # Available protocols: # genome_distance: pair sequences that are closest neighbors on the genome # best_hit: for each genome, pair the sequences that have the highest % identity to the target sequence # for best hit protocol, user can set use_best_reciprocal to take the best reciprocal hits only (recommended) protocol: best_hit use_best_reciprocal: true - + # Maximum genome distance in bases allowed between pairs # Required for genome_distance protocol only genome_distance_threshold: 10000 - + # Maximum sequence identity allowed for hits to be designated # as paralogs. Required for best_hit in best reciprocal mode only paralog_identity_threshold: 0.95 - + + # forbid overlapping regions of the same seqeunce ID from being concatenated + # for typical heteromultimeric complexes, this should be true + forbid_overlapping_concatenation: true + # Parameters for filtering of concatenated alignment - + # Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in # the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence # already present in the alignment). If blank, no filtering. If filtering, HHfilter must be installed. seqid_filter: - + # Only keep sequences that align to at least x% of the target sequence (i.e. remove fragments) minimum_sequence_coverage: 50 - + # Only include alignment columns with at least x% residues (rather than gaps) during model inference minimum_column_coverage: 50 @@ -305,12 +309,12 @@ couplings: # Sequence separation filter for generation of CouplingScores_longrange.csv table (i.e. to take out short-range # ECs from table, only pairs with abs(i-j)>=min_sequence_distance will be kept. min_sequence_distance: 6 - + # Parameters specific to complex pipeline scoring # Scoring model to assess confidence in computed ECs # available options: skewnormal, normal, evcomplex scoring_model: skewnormal - + # Specify whether to use all ECs or only inter-molecular ECs for scoring use_all_ecs_for_scoring: False @@ -327,7 +331,7 @@ couplings: compare: # Current options: standard, complex protocol: complex - + # Following parameters will be usually overriden by global settings / output of previous stage prefix: ec_file: @@ -340,28 +344,30 @@ compare: # sequence_id and SIFTS database (sequence_id must be UniProt AC/ID in this case) first_by_alignment: True second_by_alignment: True - # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch - # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
- # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). - # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. + # Alignment method to use to find sequences corresponding to PDB structures. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search using an HMM built from the output monomer alignment (less stringent). + # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. first_pdb_alignment_method: jackhmmer second_pdb_alignment_method: jackhmmer - + # Leave this parameter empty to use all PDB structures for given sequence_id, otherwise # will be limited to the given IDs (single value or list). Important: note that this acts only as a filter on the # structures found by alignment or in the SIFTS table (!) - pdb_ids: + inter_pdb_ids: first_pdb_ids: second_pdb_ids: # Limit number of structures and chains for comparison - # Note - the intersection of the monomer structural hits is taken to find the - # Inter-protein structures. If you limit the number of monomer structures found in this step, - # you may miss some inter-protein structures - first_max_num_structures: 100 - first_max_num_hits: 100 - second_max_num_structures: 100 - second_max_num_hits: 100 + inter_max_num_structures: 10 + inter_max_num_hits: 10 + + # Limit number chains and structures to use for each monomer comparison, IN ADDITION to those found from the + # inter protein compariso + first_max_num_structures: 10 + first_max_num_hits: 10 + second_max_num_structures: 10 + second_max_num_hits: 10 # compare to multimer contacts (if multiple chains of the same sequence or its homologs are present in a structure) first_compare_multimer: True @@ -376,7 +382,7 @@ compare: first_use_bitscores: True first_domain_threshold: 0.5 first_sequence_threshold: 0.5 - + second_sequence_file: second_first_index: second_region: @@ -386,10 +392,10 @@ compare: second_sequence_threshold: 0.5 # Comparison and plotting settings - + # Return an error if we fail to automatically retrieve information about a given pdb id raise_missing: False - + # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances, @@ -406,7 +412,7 @@ compare: plot_probability_cutoffs: [0.90, 0.99] # Plot fixed numbers of inter-protein ECS, and all intra ECs scoring at least as high - # As those inter-protein ECs. + # As those inter-protein ECs. # Use integers only plot_lowest_count: 5 plot_highest_count: 10 @@ -421,7 +427,7 @@ compare: # draw secondary structure on contact map plots draw_secondary_structure: True - + # Settings for Mutation effect predictions mutate: # Options: standard, complex @@ -489,7 +495,7 @@ environment: # command that will be executed before running actual computation (can be used to set up environment) configuration: - + # Paths to databases used by evcouplings. databases: @@ -510,14 +516,22 @@ databases: # Periodically delete these files to more recent versions of SIFTS are used. 
sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta - + # the following two databases are exclusive to EVcomplex and need to be manually downloaded and saved locally # then add the paths to your local copies of the database - # Download urls: + # Download urls: # ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt - # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt + # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt uniprot_to_embl_table: /n/groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt ena_genome_location_table: /n/groups/marks/databases/complexes/ena/2017_02/cds_pro.txt + structurefree_model_file: /n/groups/marks/users/agreen/dev/EVcouplings/evcouplings/compare/aux/residue_strucfree.saved + structureaware_model_file: /n/groups/marks/users/agreen/dev/EVcouplings/evcouplings/compare/aux/residue_strucaware.saved + + complex_strucfree_model_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucfree.saved + complex_strucfree_scaler_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucfree.scaler + complex_strucaware_model_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucaware.saved + complex_strucaware_scaler_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucaware.saved + # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required. tools: @@ -529,4 +543,4 @@ tools: psipred: /n/groups/marks/software/runpsipred cns: /n/groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns maxcluster: /n/groups/marks/pipelines/evcouplings/software/maxcluster64bit - + dssp: /n/groups/marks/software/dssp diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index 084bbd61..67df56b1 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -278,9 +278,9 @@ compare: # print information about used PDB structures on contact map plots print_pdb_information: True - # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch - # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). - # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). + # Alignment method to use to find sequences corresponding to PDB structures. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search using an HMM built from the output monomer alignment (less stringent). # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
pdb_alignment_method: jackhmmer diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 06fc1902..08334f72 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -48,7 +48,7 @@ def _verify_sequence_id(sequence_id): Verify if a target sequence identifier is in proper format for the pipeline to run without errors (not none, and contains no whitespace) - + Parameters ---------- id : str @@ -80,8 +80,8 @@ def _verify_sequence_id(sequence_id): def _make_hmmsearch_raw_fasta(alignment_result, prefix): """ HMMsearch results do not contain the query sequence - so we must construct a raw_fasta file with the query - sequence as the first hit, to ensure proper numbering. + so we must construct a raw_fasta file with the query + sequence as the first hit, to ensure proper numbering. The search result is filtered to only contain the columns with match states to the HMM, which has a one to one mapping to the query sequence. @@ -110,7 +110,7 @@ def _add_gaps_to_query(query_sequence_ali, ali): i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x" ] - # ensure that the length of the match states + # ensure that the length of the match states # match the length of the sequence if len(match_index) != query_sequence_ali.L: raise ValueError( @@ -142,7 +142,7 @@ def _add_gaps_to_query(query_sequence_ali, ali): with open(alignment_result["target_sequence_file"]) as a: query_sequence_ali = Alignment.from_file(a, format="fasta") - # if the provided alignment is empty, just return the target sequence + # if the provided alignment is empty, just return the target sequence raw_focus_alignment_file = prefix + "_raw.fasta" if not valid_file(alignment_result["raw_alignment_file"]): # write the query sequence to a fasta file @@ -162,14 +162,14 @@ def _add_gaps_to_query(query_sequence_ali, ali): "Stockholm alignment {} missing RF" " annotation of match states".format(alignment_result["raw_alignment_file"]) ) - + # add insertions to the query sequence in order to preserve correct # numbering of match sequences gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali) - # write a new alignment file with the query sequence as + # write a new alignment file with the query sequence as # the first entry - + with open(raw_focus_alignment_file, "w") as of: gapped_sequence_ali.write(of) ali.write(of) @@ -1164,9 +1164,9 @@ def hmmbuild_and_search(**kwargs): """ Protocol: - Build HMM from sequence alignment using hmmbuild and + Build HMM from sequence alignment using hmmbuild and search against a sequence database using hmmsearch. - + Parameters ---------- Mandatory kwargs arguments: @@ -1298,7 +1298,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): return focus_fasta_file, target_sequence_file, region_start, region_end - # define the gap threshold for inclusion in HMM's build by HMMbuild. + # define the gap threshold for inclusion in HMM's build by HMMbuild. 
SYMFRAC_HMMBUILD = 0.0 # check for required options @@ -1343,7 +1343,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for hmmsearch - sequence_length = region_end - region_start + 1 + sequence_length = region_end - region_start + 1 seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], @@ -1371,7 +1371,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): seq_threshold=seq_threshold, nobias=kwargs["nobias"], cpu=kwargs["cpu"], - binary=kwargs["hmmsearch"], + binary=kwargs["hmmsearch"], ) # get rid of huge stdout log file immediately @@ -1400,7 +1400,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): "hittable_file": ali["domtblout"], } - # convert the raw output alignment to fasta format + # convert the raw output alignment to fasta format # and add the appropriate query sequecne raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix) outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file @@ -1605,13 +1605,13 @@ def complex(**kwargs): annotation_data = pd.read_csv(kwargs["override_annotation_file"]) annotation_data.to_csv(outcfg["annotation_file"]) - # extract cds identifiers for alignment uniprot IDs + # # extract cds identifiers for alignment uniprot IDs cds_ids = extract_cds_ids( outcfg["alignment_file"], kwargs["uniprot_to_embl_table"] ) - # extract genome location information from ENA + ##extract genome location information from ENA genome_location_filename = prefix + "_genome_location.csv" genome_location_table = extract_embl_annotation( diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index 8db22f9b..8a21a225 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -1290,11 +1290,13 @@ def _get_chains(sifts_result): individual_distance_maps = [] # determine which combinations of chains to look at - # (anything that has same PDB identifier) + # (anything that has same PDB identifier)... combis = sifts_result_i.hits.reset_index().merge( sifts_result_j.hits.reset_index(), on="pdb_id", suffixes=("_i", "_j") ) + # but not the same chain (else we get i to i contacts) + combis = combis.query("pdb_chain_i != pdb_chain_j") # extract chains for each subunit chains_i = _get_chains(sifts_result_i) diff --git a/evcouplings/compare/ecs.py b/evcouplings/compare/ecs.py index e3c69a7e..d895b84d 100644 --- a/evcouplings/compare/ecs.py +++ b/evcouplings/compare/ecs.py @@ -76,7 +76,7 @@ def add_precision(ec_table, dist_cutoff=5, score="cn", ec_table = ec_table.sort_values(by=score, ascending=False) if min_sequence_dist is not None: - ec_table = ec_table.query("abs(i - j) >= @min_sequence_dist") + ec_table = ec_table.query("abs(i - j) >= @min_sequence_dist or segment_i != segment_j") ec_table = ec_table.copy() @@ -136,7 +136,7 @@ def coupling_scores_compared(ec_table, dist_map, dist_map_multimer=None, ) if min_sequence_dist is not None: - x = x.query("abs(i - j) >= @min_sequence_dist") + x = x.query("abs(i - j) >= @min_sequence_dist or segment_i != segment_j") # if distance cutoff is given, add precision if dist_cutoff is not None: diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index 1f07002f..f088f095 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -5,7 +5,7 @@ Thomas A. Hopf Anna G. 
Green (complex and _make_complex_contact_maps) """ - +import joblib from copy import deepcopy from math import ceil import pandas as pd @@ -20,7 +20,7 @@ ) from evcouplings.utils.system import ( - create_prefix_folders, insert_dir, verify_resources, + create_prefix_folders, insert_dir, verify_resources, valid_file ) from evcouplings.couplings import Segment from evcouplings.compare.pdb import load_structures @@ -112,21 +112,77 @@ def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR, ) -def _identify_structures(**kwargs): +def complex_probability(ecs, scoring_model, use_all_ecs=False, + score="cn", Neff_over_L=None): """ - Identify set of 3D structures for comparison + Adds confidence measure for complex evolutionary couplings Parameters ---------- - **kwargs - See check_required in code below + ecs : pandas.DataFrame + Table with evolutionary couplings + scoring_model : {"skewnormal", "normal", "evcomplex"} + Use this scoring model to assign EC confidence measure + use_all_ecs : bool, optional (default: False) + If true, fits the scoring model to all ECs; + if false, fits the model to only the inter ECs. + score : str, optional (default: "cn") + Use this score column for confidence assignment Returns ------- - SIFTSResult - Identified structures and residue index mappings + ecs : pandas.DataFrame + EC table with additional column "probability" + containing confidence measure """ + from evcouplings.couplings.pairs import add_mixture_probability + if use_all_ecs: + ecs = add_mixture_probability( + ecs, model=scoring_model + ) + else: + inter_ecs = ecs.query("segment_i != segment_j") + intra_ecs = ecs.query("segment_i == segment_j") + + intra_ecs.assign(scoring_model = np.nan) + + inter_ecs = add_mixture_probability( + inter_ecs, model=scoring_model, score=score, Neff_over_L=Neff_over_L + ) + + ecs = pd.concat( + [intra_ecs, inter_ecs] + ).sort_values( + score, ascending=False + ) + + return ecs + + +def _filter_structures(sifts_map, pdb_ids=None, max_num_hits=None, max_num_structures=None): + + """ + Filters input SIFTSResult for specific pdb ids and/or number of hits + + Parameters + ---------- + sifts_map: SIFTSResult + Identified structures and residue index mappings + pdb_ids: list of str, optional (default: None) + List of PDB ids to be used for comparison + max_num_hits: int, optional (default: None) + Number of PDB hits to be used for comparison. + Different chains from the same PDB count as multiple hits. + max_num_structures: int, optional (default: None) + Number of unique PDB ids to be used for comparison. + Different chains from the same PDB count as one hit. 
+ + Returns + ------- + SIFTSResult + Filtered identified structures and residue index mappings + """ def _filter_by_id(x, id_list): x = deepcopy(x) x.hits = x.hits.loc[ @@ -134,6 +190,43 @@ def _filter_by_id(x, id_list): ] return x + + # filter ID list down to manually selected PDB entries + if pdb_ids is not None: + + # make sure we have a list of PDB IDs + if not isinstance(pdb_ids, list): + pdb_ids = [pdb_ids] + + pdb_ids = [x.lower() for x in pdb_ids] + sifts_map = _filter_by_id(sifts_map, pdb_ids) + + # limit number of hits and structures + if max_num_hits is not None: + sifts_map.hits = sifts_map.hits.iloc[:max_num_hits] + + if max_num_structures is not None: + keep_ids = sifts_map.hits.pdb_id.unique() + keep_ids = keep_ids[:max_num_structures] + sifts_map = _filter_by_id(sifts_map, keep_ids) + + return sifts_map + +def _identify_structures(**kwargs): + """ + Identify set of 3D structures for comparison + + Parameters + ---------- + **kwargs + See check_required in code below + + Returns + ------- + SIFTSResult + Identified structures and residue index mappings + """ + check_required( kwargs, [ @@ -183,28 +276,13 @@ def _filter_by_id(x, id_list): kwargs["sequence_id"], reduce_chains=reduce_chains ) + # Save the pre-filtered SIFTs map sifts_map_full = deepcopy(sifts_map) - # filter ID list down to manually selected PDB entries - if kwargs["pdb_ids"] is not None: - pdb_ids = kwargs["pdb_ids"] - - # make sure we have a list of PDB IDs - if not isinstance(pdb_ids, list): - pdb_ids = [pdb_ids] - - pdb_ids = [x.lower() for x in pdb_ids] - - sifts_map = _filter_by_id(sifts_map, pdb_ids) - - # limit number of hits and structures - if kwargs["max_num_hits"] is not None: - sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]] - - if kwargs["max_num_structures"] is not None: - keep_ids = sifts_map.hits.pdb_id.unique() - keep_ids = keep_ids[:kwargs["max_num_structures"]] - sifts_map = _filter_by_id(sifts_map, keep_ids) + #Filter the SIFTS map + sifts_map = _filter_structures( + sifts_map, kwargs["pdb_ids"], kwargs["max_num_hits"], kwargs["max_num_structures"] + ) return sifts_map, sifts_map_full @@ -295,6 +373,7 @@ def plot_cm(ecs, output_file=None): "draw_secondary_structure" ] ) + prefix = kwargs["prefix"] cm_files = [] @@ -345,43 +424,12 @@ def _discrete_count(x): # give back list of all contact map file names return cm_files - -def _make_complex_contact_maps(ec_table, d_intra_i, d_multimer_i, - d_intra_j, d_multimer_j, - d_inter, first_segment_name, - second_segment_name, **kwargs): - """ - Plot contact maps with all ECs above a certain probability threshold, - or a given count of ECs - - Parameters - ---------- - ec_table : pandas.DataFrame - Full set of evolutionary couplings (all pairs) - d_intra_i, d_intra_j: DistanceMap - Computed residue-residue distances within chains for - monomers i and j - d_multimer_i, d_multimer_j : DistanceMap - Computed residue-residue distances between homomultimeric - chains for monomers i and j - d_inter: DistanceMap - Computed residue-residue distances between heteromultimeric - chains i and j - first_segment_name, second_segment_name: str - Name of segment i and segment j in the ec_table - **kwargs - Further plotting parameters, see check_required in code - for necessary values. 
- - Returns - ------- - cm_files : list(str) - Paths of generated contact map files - """ - - def plot_complex_cm(ecs_i, ecs_j, ecs_inter, - first_segment_name, - second_segment_name, output_file=None): +def plot_complex_cm(ecs_i, ecs_j, ecs_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, output_file, + **kwargs): """ Simple wrapper for contact map plotting """ @@ -404,7 +452,7 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, if len(ecs_inter) == 0: ecs_inter = None - # Currently, we require at least one of the monomer + # Currently, we require at least one of the monomer # to have either ECs or distances in order to make a plot if ((ecs_i is None or ecs_i.empty) and d_intra_i is None and d_multimer_i is None) \ or ((ecs_j is None or ecs_j.empty) and d_intra_j is None and d_multimer_j is None): @@ -421,7 +469,8 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, margin=5, boundaries=kwargs["boundaries"], scale_sizes=kwargs["scale_sizes"], - show_secstruct=kwargs["draw_secondary_structure"] + show_secstruct=kwargs["draw_secondary_structure"], + distance_cutoff=kwargs["distance_cutoff"] ) # Add title to the plot @@ -430,7 +479,7 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, else: ec_len = len(ecs_inter) plt.suptitle( - "{} inter-molecule evolutionary couplings".format(ec_len), + "{} inter-molecule evolutionary couplings".format(ec_len), fontsize=14 ) @@ -441,6 +490,56 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, return True + +def _make_complex_contact_maps(ec_table, d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs): + """ + Plot contact maps with all ECs above a certain probability threshold, + or a given count of ECs + + Parameters + ---------- + ec_table : pandas.DataFrame + Full set of evolutionary couplings (all pairs) + d_intra_i, d_intra_j: DistanceMap + Computed residue-residue distances within chains for + monomers i and j + d_multimer_i, d_multimer_j : DistanceMap + Computed residue-residue distances between homomultimeric + chains for monomers i and j + d_inter: DistanceMap + Computed residue-residue distances between heteromultimeric + chains i and j + first_segment_name, second_segment_name: str + Name of segment i and segment j in the ec_table + **kwargs + Further plotting parameters, see check_required in code + for necessary values. 
+ + Returns + ------- + cm_files : list(str) + Paths of generated contact map files + """ + + # transform fraction of number of sites into discrete number of ECs + def _discrete_count(x): + if isinstance(x, float): + num_sites = 0 + for seg_name in [first_segment_name, second_segment_name]: + num_sites += len( + set.union( + set(ec_table.query("segment_i == @seg_name").i.unique()), + set(ec_table.query("segment_j == @seg_name").j.unique()) + ) + ) + + x = ceil(x * num_sites) + + return int(x) + check_required( kwargs, [ @@ -479,28 +578,15 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, output_file = prefix + "_significant_ECs_{}.pdf".format(c) plot_completed = plot_complex_cm( ec_set_i, ec_set_j, ec_set_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, second_segment_name, output_file=output_file ) if plot_completed: cm_files.append(output_file) - # transform fraction of number of sites into discrete number of ECs - def _discrete_count(x): - if isinstance(x, float): - num_sites = 0 - for seg_name in [first_segment_name, second_segment_name]: - num_sites += len( - set.union( - set(ec_table.query("segment_i == @seg_name").i.unique()), - set(ec_table.query("segment_j == @seg_name").j.unique()) - ) - ) - - x = ceil(x * num_sites) - - return int(x) - # range of plots to make lowest = _discrete_count(kwargs["plot_lowest_count"]) highest = _discrete_count(kwargs["plot_highest_count"]) @@ -875,7 +961,7 @@ def complex(**kwargs): "first_sequence_id", "second_sequence_id", "first_sequence_file", "second_sequence_file", "first_target_sequence_file", "second_target_sequence_file", - "scale_sizes" + "scale_sizes", "structurefree_model_file", "structureaware_model_file" ] ) @@ -885,11 +971,8 @@ def complex(**kwargs): # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", - "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", - - # initialize output inter distancemap files - "distmap_inter": prefix + "_distmap_inter", - "inter_contacts_file": prefix + "_inter_contacts_file" + "ec_compared_inter_file": prefix + "_CouplingsScoresCompared_inter.csv", + "distmap_inter": prefix + "_distmap_inter" } # Add PDB comparison files for first and second monomer @@ -930,36 +1013,74 @@ def complex(**kwargs): # Step 1: Identify 3D structures for comparison def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer - # remove the "prefix" kwargs so that we can replace with the + # any prefix that starts with a name_prefix will overwrite prefixes that do not start + # eg, "first_sequence_file" will overwrite "sequence_file" + monomer_kwargs = deepcopy(kwargs) + for k,v in kwargs.items(): + if name_prefix + "_" in k: + # only replace first occurrence of name_prefix + monomer_kwargs[k.replace(name_prefix + "_", "", 1)] = v + + # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures - # only replace first occurrence of name_prefix monomer_kwargs = { - k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k + k: v for k, v in monomer_kwargs.items() if "prefix" not in k } - # this field needs to be set explicitly else it gets overwritten by concatenated file - monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] - monomer_kwargs["raw_focus_alignment_file"] = 
kwargs[name_prefix + "_raw_focus_alignment_file"] - # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures( **monomer_kwargs, prefix=aux_prefix ) + # save full list of hits + sifts_map_full.hits.to_csv( + outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False + ) + + return outcfg, sifts_map, sifts_map_full + + outcfg, first_sifts_map, first_sifts_map_full = _identify_monomer_structures("first", outcfg, first_aux_prefix) + outcfg, second_sifts_map, second_sifts_map_full = _identify_monomer_structures("second", outcfg, second_aux_prefix) + + # Determine the inter-protein PDB hits based on the full sifts map for each monomer + # initialize output inter distancemap files + inter_protein_hits_full = first_sifts_map_full.hits.merge( + second_sifts_map_full.hits, on="pdb_id", how="inner", suffixes=["_1", "_2"] + ) + outcfg["structure_hits_unfiltered_file"] = prefix + "_inter_structure_hits_unfiltered.csv" + inter_protein_hits_full.to_csv(outcfg["structure_hits_unfiltered_file"]) + + # Filter for the number of PDB ids to use + inter_protein_sifts = SIFTSResult(hits=inter_protein_hits_full, mapping=None) + inter_protein_sifts = _filter_structures( + inter_protein_sifts, + kwargs["inter_pdb_ids"], + kwargs["inter_max_num_hits"], + kwargs["inter_max_num_structures"] + ) + + outcfg["structure_hits_file"] = prefix + "_inter_structure_hits.csv" + inter_protein_sifts.hits.to_csv(outcfg["structure_hits_file"]) + + def _add_inter_pdbs(inter_protein_sifts, sifts_map, sifts_map_full, name_prefix,): + """ + ensures that all pdbs used for inter comparison end up in the monomer SIFTS hits + """ + + lines_to_keep = sifts_map_full.hits.query("pdb_id in @inter_protein_sifts.hits.pdb_id").index + sifts_map.hits = pd.concat([ + sifts_map.hits, sifts_map_full.hits.loc[lines_to_keep, :] + ]).drop_duplicates() + # save selected PDB hits sifts_map.hits.to_csv( outcfg[name_prefix + "_pdb_structure_hits_file"], index=False ) + return sifts_map - # also save full list of hits - sifts_map_full.hits.to_csv( - outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False - ) - return outcfg, sifts_map - - outcfg, first_sifts_map = _identify_monomer_structures("first", outcfg, first_aux_prefix) - outcfg, second_sifts_map = _identify_monomer_structures("second", outcfg, second_aux_prefix) + first_sifts_map = _add_inter_pdbs(inter_protein_sifts, first_sifts_map, first_sifts_map_full, "first") + second_sifts_map = _add_inter_pdbs(inter_protein_sifts, second_sifts_map, second_sifts_map_full, "second") # get the segment names from the kwargs segment_list = kwargs["segments"] @@ -976,6 +1097,17 @@ def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): first_chain_name = Segment.from_list(kwargs["segments"][0]).default_chain_name() second_chain_name = Segment.from_list(kwargs["segments"][1]).default_chain_name() + # load all structures at once + all_ids = set(first_sifts_map.hits.pdb_id).union( + set(second_sifts_map.hits.pdb_id) + ) + + structures = load_structures( + all_ids, + kwargs["pdb_mmtf_dir"], + raise_missing=False + ) + # Step 2: Compute distance maps def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): @@ -1024,7 +1156,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # if we have a multimer contact map, save it if d_multimer is not None: d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"]) - outcfg[name_prefix + "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv" + 
outcfg[name_prefix + "_multimer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts( @@ -1042,8 +1174,8 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): outcfg[name_prefix + "_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( - sifts_map, aux_prefix, seqmap, chain_name=chain_name, - raise_missing=kwargs["raise_missing"] + sifts_map, aux_prefix, None, chain_name=chain_name, + raise_missing=kwargs["raise_missing"], atom_filter=None ).items() } @@ -1057,16 +1189,6 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): return d_intra, d_multimer, seqmap - # load all structures for both monomers - all_structures = set(first_sifts_map.hits.pdb_id).union( - set(second_sifts_map.hits.pdb_id) - ) - structures = load_structures( - all_structures, - kwargs["pdb_mmtf_dir"], - raise_missing=False - ) - d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps( first_sifts_map, "first", first_chain_name ) @@ -1085,6 +1207,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): d_inter.to_file(outcfg["distmap_inter"]) # save contacts to separate file + outcfg["inter_contacts_file"] = prefix + "_inter_contacts.csv" d_inter.contacts( kwargs["distance_cutoff"] ).to_csv( @@ -1137,7 +1260,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): ecs_inter, d_inter, dist_map_multimer=None, dist_cutoff=kwargs["distance_cutoff"], output_file=None, - min_sequence_dist=None # does not apply for inter-protein ECs + min_sequence_dist=None # does not apply for inter-protein ECs ) else: ecs_inter_compared = ecs_inter.assign(dist=np.nan) @@ -1161,7 +1284,8 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # TODO: implement different cutoffs for intra vs inter contacts ec_table_compared = add_precision( ec_table_compared, - dist_cutoff=kwargs["distance_cutoff"] + dist_cutoff=kwargs["distance_cutoff"], + min_sequence_dist=min_seq_dist ) # save to file @@ -1194,7 +1318,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): remap_complex_chains( first_sifts_map, second_sifts_map, seqmap_i, seqmap_j, output_prefix=aux_prefix, - raise_missing=kwargs["raise_missing"] + raise_missing=kwargs["raise_missing"], atom_filter=None ).items() } @@ -1209,14 +1333,13 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): return outcfg - # list of available EC comparison protocols PROTOCOLS = { # standard monomer comparison protocol "standard": standard, # comparison for protein complexes - "complex": complex + "complex": complex, } diff --git a/evcouplings/compare/tools.py b/evcouplings/compare/tools.py new file mode 100644 index 00000000..748e2aa7 --- /dev/null +++ b/evcouplings/compare/tools.py @@ -0,0 +1,38 @@ +""" +Wrappers for tools for comparison of evolutionary +couplings to experimental structures + +Authors: + Anna G. 
Green +""" +from evcouplings.utils.system import ( + run, verify_resources +) + +def run_dssp(infile, outfile, binary="dssp"): + """ + Runs DSSP on an input pdb file + + Parameters + ---------- + infile: str + path to input file + outfile: str + path to output file + binary: str + path to DSSP binary + """ + cmd = [ + binary, + "-i", infile, + "-o", outfile + ] + return_code, stdout, stderr = run(cmd, check_returncode=False) + + # verify_resources( + # "DSSP returned empty file: " + # "stdout={} stderr={} file={}".format( + # stdout, stderr, outfile + # ), + # outfile + # ) \ No newline at end of file diff --git a/evcouplings/complex/protocol.py b/evcouplings/complex/protocol.py index 0ec996b9..8481b5e1 100644 --- a/evcouplings/complex/protocol.py +++ b/evcouplings/complex/protocol.py @@ -21,13 +21,14 @@ create_prefix_folders, verify_resources ) from evcouplings.align.protocol import modify_alignment +from evcouplings.align.alignment import parse_header from evcouplings.complex.alignment import ( write_concatenated_alignment ) from evcouplings.complex.distance import ( find_possible_partners, best_reciprocal_matching, - plot_distance_distribution + plot_distance_distribution, get_distance ) from evcouplings.complex.similarity import ( read_species_annotation_table, @@ -35,6 +36,46 @@ filter_best_reciprocal, find_paralogs ) +from evcouplings.couplings.mapping import ( + SegmentIndexMapper, Segment, segment_map_ecs +) + +def remove_overlapping_ids(id_dataframe): + """ + Removes pairs of ids that are from the same sequence + and overlap by at least a certain amount + + Paramters + --------- + id_dataframe: pd.DataFrame + contains columns id_1, id_2 + amount_overlap_allowed: int + threshold of amount overlap allowed between two sequecnes + + Returns + pd.Dataframe + """ + temp_dataframe = id_dataframe.copy() + + id_strings = [parse_header(x) for x in temp_dataframe.id_1] + temp_dataframe["id_string_1"], temp_dataframe["r_s_1"], temp_dataframe["r_e_1"] = \ + zip(*id_strings) + + id_strings = [parse_header(x) for x in temp_dataframe.id_2] + temp_dataframe["id_string_2"], temp_dataframe["r_s_2"], temp_dataframe["r_e_2"] = \ + zip(*id_strings) + + temp_dataframe["overlap"] = None + + for idx, row in temp_dataframe.iterrows(): + temp_dataframe.loc[idx, "overlap"] = get_distance( + (row["r_s_1"], row["r_e_1"]), + (row["r_s_2"], row["r_e_2"]) + ) + + to_keep = temp_dataframe.query("id_string_1 != id_string_2 or overlap > 0") + + return to_keep[id_dataframe.columns] def modify_complex_segments(outcfg, **kwargs): """ @@ -74,6 +115,35 @@ def _modify_segments(seg_list, seg_prefix): return outcfg +def map_frequencies_file(frequencies_file, outcfg, **kwargs): + """ + Renumbers the frequencies file created in complex concatenation + with correct monomer numbering based on segments - otherwise numbering + will be continuous for the concatenated sequence. 
+ + Parameters + ---------- + frequencies_file: str + path to the frequencies file to be renumbered + outcfg : dict + The output configuration + + """ + + frequencies = pd.read_csv(frequencies_file) + + segments = [ + Segment.from_list(s) for s in outcfg["segments"] + ] + seg_mapper = SegmentIndexMapper( + kwargs["first_focus_mode"], outcfg["region_start"], *segments + ) + frequencies = segment_map_ecs( + frequencies, seg_mapper, map_i=True, map_j=False + ) + return frequencies + + def _run_describe_concatenation(outcfg, **kwargs): """ calculate some basic statistics on the concatenated alignment @@ -94,26 +164,26 @@ def describe_concatenation(annotation_file_1, annotation_file_2, genome_location_filename_1, genome_location_filename_2, outfile): """ - Describes properties of concatenated alignment. + Describes properties of concatenated alignment. Writes a csv with the following columns num_seqs_1 : number of sequences in the first monomer alignment num_seqs_2 : number of sequences in the second monomer alignment - num_nonred_species_1 : number of unique species annotations in the + num_nonred_species_1 : number of unique species annotations in the first monomer alignment - num_nonred_species_2 : number of unique species annotations in the + num_nonred_species_2 : number of unique species annotations in the second monomer alignment num_species_overlap: number of unique species found in both alignments - median_num_per_species_1 : median number of paralogs per species in the + median_num_per_species_1 : median number of paralogs per species in the first monomer alignmment - median_num_per_species_2 : median number of paralogs per species in + median_num_per_species_2 : median number of paralogs per species in the second monomer alignment - num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the + num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the first monomer alignment (relevant to distance concatention only) - num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the + num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the first monomer alignment (relevant to distance concatention only) - + Parameters ---------- annotation_file_1 : str @@ -139,11 +209,11 @@ def describe_concatenation(annotation_file_1, annotation_file_2, annotation_file_2 ) species_2 = annotations_2.species.values - + # calculate the number of sequences found in each alignment num_seqs_1 = len(annotations_1) num_seqs_2 = len(annotations_2) - + # calculate the number of species found in each alignment # where a species is defined as a unique OS or Tax annotation field nonredundant_annotations_1 = len(set(species_1)) @@ -154,7 +224,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, set(species_1).intersection(set(species_2)) ) n_species_overlap = len(species_overlap) - + # calculate the median number of paralogs per species n_paralogs_1 = float( # counts the number of times each species occurs in the list @@ -165,7 +235,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, n_paralogs_2 = float( np.median(list(Counter(species_2).values())) ) - + # If the user provided genome location files, calculate the number # of ids for which we found an embl CDS. 
Default value is np.nan embl_cds1 = np.nan @@ -177,11 +247,20 @@ def describe_concatenation(annotation_file_1, annotation_file_2, genome_location_table_1 = pd.read_csv(genome_location_filename_1) genome_location_table_2 = pd.read_csv(genome_location_filename_2) + if not genome_location_table_1.empty: # Number uniprot IDs with EMBL CDS that is not NA - if "uniprot_ac" in genome_location_table_1.columns: embl_cds1 = len(list(set(genome_location_table_1.uniprot_ac))) - if "uniprot_ac" in genome_location_table_2.columns: + else: + embl_cds1 = np.nan + + if not genome_location_table_2.empty: embl_cds2 = len(list(set(genome_location_table_2.uniprot_ac))) + else: + embl_cds2 = np.nan + + else: + embl_cds1 = np.nan + embl_cds2 = np.nan concatenation_data = [ num_seqs_1, @@ -194,7 +273,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, embl_cds1, embl_cds2, ] - + cols = [ "num_seqs_1", "num_seqs_2", @@ -308,6 +387,10 @@ def genome_distance(**kwargs): id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"] + # filter for overlapping concatenation of same id + if kwargs["forbid_overlapping_concatenation"]: + id_pairing = remove_overlapping_ids(id_pairing) + # write concatenated alignment with distance filtering # TODO: save monomer alignments? target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ @@ -326,11 +409,11 @@ def genome_distance(**kwargs): mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: - mon_ali_1.write(of) + mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: - mon_ali_2.write(of) + mon_ali_2.write(of) # filter the alignment aln_outcfg, _ = modify_alignment( @@ -361,6 +444,11 @@ def genome_distance(**kwargs): outcfg["distance_plot_file"] = prefix + "_distplot.pdf" plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"]) + # Correct the nubering in the frequencies file + map_frequencies_file( + outcfg["frequencies_file"], outcfg, **kwargs + ).to_csv(outcfg["frequencies_file"]) + return outcfg @@ -368,7 +456,7 @@ def best_hit(**kwargs): """ Protocol: - Concatenate alignments based on the best hit + Concatenate alignments based on the best hit to the focus sequence in each species Parameters @@ -404,7 +492,8 @@ def best_hit(**kwargs): "first_segments", "second_segments", "first_identities_file", "second_identities_file", "first_annotation_file", "second_annotation_file", - "use_best_reciprocal", "paralog_identity_threshold" + "use_best_reciprocal", "paralog_identity_threshold", + "forbid_overlapping_concatenation" ] ) @@ -430,7 +519,9 @@ def _load_monomer_info(annotations_file, identities_file, similarities = pd.read_csv(identities_file) # create a pd.DataFrame containing the best hit in each organism - most_similar_in_species = most_similar_by_organism(similarities, annotation_table) + most_similar_in_species = most_similar_by_organism( + similarities, annotation_table + ) if use_best_reciprocal: paralogs = find_paralogs( @@ -463,7 +554,7 @@ def _load_monomer_info(annotations_file, identities_file, kwargs["paralog_identity_threshold"] ) - # merge the two dataframes to get all species found in + # merge the two dataframes to get all species found in # both alignments species_intersection = most_similar_in_species_1.merge( most_similar_in_species_2, @@ -472,6 +563,10 @@ def _load_monomer_info(annotations_file, identities_file, suffixes=("_1", "_2") ) + # filter for 
overlapping concatenation of same id + if kwargs["forbid_overlapping_concatenation"]: + species_intersection = remove_overlapping_ids(species_intersection) + # write concatenated alignment with distance filtering # TODO: save monomer alignments? target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ @@ -504,6 +599,8 @@ def _load_monomer_info(annotations_file, identities_file, **kwargs ) + # remap the + # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target @@ -520,6 +617,11 @@ def _load_monomer_info(annotations_file, identities_file, # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) + # Correct the numbering in the frequencies file + map_frequencies_file( + outcfg["frequencies_file"], outcfg, **kwargs + ).to_csv(outcfg["frequencies_file"]) + return outcfg @@ -529,7 +631,8 @@ def _load_monomer_info(annotations_file, identities_file, "genome_distance": genome_distance, # concatenate based on best hit per genome ("species") - "best_hit": best_hit + "best_hit": best_hit, + } diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index 404b1e1f..07b3375f 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -13,7 +13,6 @@ import pandas as pd import numpy as np - class Segment: """ Represents a continuous stretch of sequence in a sequence @@ -306,16 +305,24 @@ def to_model(self, x): """ return self.__map(x, self.target_to_model) - -def segment_map_ecs(ecs, mapper): +def segment_map_ecs(ecs, mapper, map_i=True, map_j=True): """ Map EC dataframe in model numbering into - segment numbering + segment numbering. + + Can also be used to map dataframes with only a column i (no column j), + such as amino acid frequencies, by using map_i=True and map_j=False Parameters ---------- ecs : pandas.DataFrame EC table (with columns i and j) + mapper : SegmentIndexMapper + Mapper for renumbering the segments + map_i : Boolean, optional (default=True) + Map ecs in column i + map_j : Boolean, optional (default=True) + Map ecs in column j Returns ------- @@ -324,7 +331,7 @@ def segment_map_ecs(ecs, mapper): mapped, and additional columns segment_i and segment_j) """ - ecs = deepcopy(ecs) + ecs = ecs.copy() def _map_column(col): seg_col = "segment_" + col @@ -340,8 +347,10 @@ def _map_column(col): ecs.loc[:, seg_col] = col_m.loc[:, seg_col].values # map both position columns (and add segment id) - _map_column("i") - _map_column("j") + if map_i: + _map_column("i") + if map_j: + _map_column("j") return ecs @@ -374,7 +383,9 @@ def __init__(self, filename, *segments, # initialize the segment index mapper to update model numbering if len(segments) == 0: - raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") + raise ValueError( + "Must provide at least one segment for MultiSegmentCouplingsModel" + ) first_segment = segments[0] index_start = first_segment.region_start diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 2de96d97..79c49d38 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -4,32 +4,33 @@ .. todo:: 1. clean up - 2. add Pompom score - 3. add mapping tools (multidomain, complexes) - 4. ECs to matrix - 5. APC on subsets of positions (e.g. for complexes) + 2. add mapping tools (multidomain, complexes) + 3. ECs to matrix + 4. APC on subsets of positions (e.g. 
for complexes) Authors: Thomas A. Hopf Agnes Toth-Petroczy (original mixture model code) John Ingraham (skew normal mixture model) - Anna G. Green (EVComplex Score code) + Anna G. Green (EVcomplex Score code, segment-aware enrichment calculation) """ -from math import ceil -from copy import deepcopy -from pkg_resources import resource_filename - import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.optimize as op + +from math import ceil, sqrt +from copy import deepcopy +from pkg_resources import resource_filename + from scipy import stats from sklearn.linear_model import LogisticRegression from evcouplings.utils.calculations import median_absolute_deviation from evcouplings.utils.config import read_config_file +DEFAULT_SEGMENT_NAME = "A_1" def read_raw_ec_file(filename, sort=True, score="cn"): """ @@ -70,14 +71,14 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): Calculate EC "enrichment" as first described in Hopf et al., Cell, 2012. - .. todo:: - - Make this handle segments if they are in EC table Parameters ---------- ecs : pd.DataFrame Dataframe containing couplings + segment_name: str, optional (default: "A_1") + Value to serve as placeholder name for segment, in the + case of no segments found in EC dataframe num_pairs : int or float, optional (default: 1.0) Number of ECs to use for enrichment calculation. - If float, will be interpreted as fraction of the @@ -96,37 +97,55 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): Sorted table with enrichment values for each position in the sequence """ - # determine how many positions ECs are over - pos = set(ecs.i.unique()) | set(ecs.j.unique()) - num_pos = len(pos) + + # check if the provided table has segments... + has_segments = "segment_i" in ecs.columns and "segment_j" in ecs.columns + + # ...and if not, create them (will be deleted before return) + if not has_segments: + ecs = ecs.assign({ + "segment_i": DEFAULT_SEGMENT_NAME, + "segment_j": DEFAULT_SEGMENT_NAME + }) + + + # stack dataframe so it contains each + # EC twice as forward and backward pairs + # (i, j) and (j, i) + flipped = ecs.rename( + columns={ + "i": "j", + "j": "i", + "A_i": "A_j", + "A_j": "A_i", + "segment_i": "segment_j", + "segment_j": "segment_i" + } + ) + + stacked_ecs = ecs.append(flipped) + + # determine how many positions ECs are over using the combined dataframe + num_pos = len(stacked_ecs.groupby(["i", "A_i", "segment_i"])) # calculate absolute number of pairs if # fraction of length is given if isinstance(num_pairs, float): num_pairs = int(ceil(num_pairs * num_pos)) - # get longrange ECs and sort by score - sorted_ecs = ecs.query( + # sort the stacked ECs + stacked_sorted_ecs = stacked_ecs.query( "abs(i-j) >= {}".format(min_seqdist) ).sort_values( by=score, ascending=False ) - # select top EC pairs - top_ecs = sorted_ecs.iloc[0:num_pairs] + # take the top num * 2 (because each EC represented twice) + top_ecs = stacked_sorted_ecs[0:num_pairs * 2] - # stack dataframe so it contains each - # EC twice as forward and backward pairs - # (i, j) and (j, i) - flipped = top_ecs.rename( - columns={"i": "j", "j": "i", "A_i": "A_j", "A_j": "A_i"} - ) - - stacked_ecs = top_ecs.append(flipped) - - # now sum cumulative strength of EC for each position + # calculate sum of EC scores for each position ec_sums = pd.DataFrame( - stacked_ecs.groupby(["i", "A_i"]).sum() + top_ecs.groupby(["i", "A_i", "segment_i"]).sum() ) # average EC strength for top ECs @@ -136,7 +155,12 @@ def enrichment(ecs, num_pairs=1.0, 
score="cn", min_seqdist=6): # an individual position exceeds average strength in top ec_sums.loc[:, "enrichment"] = ec_sums.loc[:, score] / avg_degree - e = ec_sums.reset_index().loc[:, ["i", "A_i", "enrichment"]] + # if the ecs had segment information, return a segment column + if has_segments: + e = ec_sums.reset_index().loc[:, ["i", "A_i", "segment_i", "enrichment"]] + else: + e = ec_sums.reset_index().loc[:, ["i", "A_i", "enrichment"]] + return e.sort_values(by="enrichment", ascending=False) @@ -644,11 +668,8 @@ class EVComplexScoreModel: Assign to each EC score a (unnormalized) EVcomplex score as described in Hopf, Schärfe et al. (2014). - TODO: this implementation currently does not take into account - score normalization for the number of sequences and length of - the model """ - def __init__(self, x): + def __init__(self, x, Neff_over_L=None): """ Initialize EVcomplex score model @@ -656,10 +677,14 @@ def __init__(self, x): ---------- x : np.array (or list-like) EC scores from which to infer the mixture model + Neff_over_L : float (optional, default None) + Effective number of sequences divided by L (number + of sites in model) """ self.x = np.array(x) + self.Neff_over_L = Neff_over_L - def probability(self, x, plot=False): + def probability(self, x): """ Calculates evcomplex score as cn_score / min_cn_score. TODO: plotting functionality not yet implemented @@ -679,11 +704,19 @@ def probability(self, x, plot=False): # Calculate the minimum score min_score = abs(np.min(self.x)) - return x / min_score + raw_evcomplex = x / min_score + + # if we have Neff_over_L, use it to calculate the corrected score + if self.Neff_over_L: + return raw_evcomplex / (1 + np.power(self.Neff_over_L, -0.5)) + + # else return the raw score + else: + return raw_evcomplex def add_mixture_probability(ecs, model="skewnormal", score="cn", - clamp_mu=False, plot=False): + clamp_mu=False, plot=False, Neff_over_L=None): """ Add lognormal mixture model probability to EC table. @@ -691,9 +724,10 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", ---------- ecs : pd.DataFrame EC table with scores - model : {"skewnormal", "normal"}, optional (default: skewnormal) + model : {"skewnormal", "normal", "evcomplex", "evcomplex_uncorrected"}, optional (default: skewnormal) Use model with skew-normal or normal distribution - for the noise component of mixture model + for the noise component of mixture model. 
+ For complexes, apply evcomplex or evcomplex_uncorrected scoring score : str, optional (default: "cn") Score on which mixture model will be based clamp_mu : bool, optional (default: False) @@ -701,6 +735,9 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", fitting it based on data plot : bool, optional (default: False) Plot score distribution and probabilities + Neff_over_L : float, optional (default: None) + Effective number of sequences divided by model length, needed for corrected EVcomplex + score calculation (Hopf et al., 2014 Elife) Returns ------- @@ -717,17 +754,26 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", mm = ScoreMixtureModel(ecs.loc[:, score].values) elif model == "normal": mm = LegacyScoreMixtureModel(ecs.loc[:, score].values, clamp_mu) - elif model == "evcomplex": + elif model == "evcomplex_uncorrected": mm = EVComplexScoreModel(ecs.loc[:, score].values) + elif model == "evcomplex": + if Neff_over_L is None: + raise ValueError( + "must provide Neff over L for EVcomplex score calculation, " + "or select model evcomplex_uncorrected" + ) + mm = EVComplexScoreModel(ecs.loc[:, score].values, Neff_over_L) + else: raise ValueError( "Invalid model selection, valid options are: " - "skewnormal, normal, evcomplex" + "skewnormal, normal\n" + "Complexes only: evcomplex, evcomplex_uncorrected" ) # assign probability ec_prob.loc[:, "probability"] = mm.probability( - ec_prob.loc[:, score].values, plot=plot + ec_prob.loc[:, score].values ) return ec_prob diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index 54fe1851..71ae55b1 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -10,6 +10,7 @@ import string import pandas as pd import numpy as np +from itertools import combinations from evcouplings.couplings import tools as ct from evcouplings.couplings import pairs, mapping @@ -52,18 +53,21 @@ "evcomplex", ) +# Define default segment names for complexes +FIRST_SEGMENT_NAME = "A_1" +SECOND_SEGMENT_NAME = "B_1" def infer_plmc(**kwargs): """ Run EC computation on alignment. This function contains the functionality shared between monomer and complex EC inference. - + Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required - + Returns ------- outcfg : dict @@ -430,7 +434,7 @@ def standard(**kwargs): def complex_probability(ecs, scoring_model, use_all_ecs=False, - score="cn"): + n_effective=None, num_sites=None, score="cn"): """ Adds confidence measure for complex evolutionary couplings @@ -443,33 +447,52 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, use_all_ecs : bool, optional (default: False) If true, fits the scoring model to all ECs; if false, fits the model to only the inter ECs. + n_effective : float, optional (default: None) + Effective number of sequences in alignment. + Used for EVcomplex score calculation + num_sites : int, optional (default: None) + Number of positions in alignment score : str, optional (default: "cn") Use this score column for confidence assignment - + Returns ------- ecs : pandas.DataFrame EC table with additional column "probability" containing confidence measure """ + + # if user provided both parameters, calculate n_effective / length + if n_effective and num_sites: + neff_over_l = n_effective / num_sites + else: + neff_over_l = None + if use_all_ecs: + # TODO: user proof so that no one runs evcomplex on all ECs? 
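+            # fit the selected scoring model to the full EC table
+            # (intra- and inter-protein ECs together)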
ecs = pairs.add_mixture_proability( ecs, model=scoring_model ) + else: inter_ecs = ecs.query("segment_i != segment_j") - intra_ecs = ecs.query("segment_i == segment_j") + intra1_ecs = ecs.query("segment_i == segment_j == @FIRST_SEGMENT_NAME") + intra2_ecs = ecs.query("segment_i == segment_j == @SECOND_SEGMENT_NAME") + + intra1_ecs = pairs.add_mixture_probability( + intra1_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l + ) - intra_ecs = pairs.add_mixture_probability( - intra_ecs, model=scoring_model, score=score + intra2_ecs = pairs.add_mixture_probability( + intra2_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l ) inter_ecs = pairs.add_mixture_probability( - inter_ecs, model=scoring_model, score=score + inter_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l ) ecs = pd.concat( - [intra_ecs, inter_ecs] + [intra1_ecs, intra2_ecs, inter_ecs] ).sort_values( score, ascending=False ) @@ -477,6 +500,97 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, return ecs +def by_segment_ec_enrichment(ecs, outcfg, **kwargs): + """ + Calculates enrichment within all segments and between all segments. + + Calculates within-segment enrichment using intra-protein ECs only and + between segment enrichment using inter-protein ECs only. + + Parameters + ---------- + ecs: pd.DataFrame + EC table + outcfg: dict + Output configuration of the pipeline + mandatory kwargs arguments: + prefix + + Returns + ------- + dict + Output configuration of the pipeline, including + the following new fields: + enrichment_inter_file + enrichment_intra_file + pd.DataFrame + Intra-molecule EC enrichment (all molecules included, indexed by segment) + pd.DataFrame + Inter-molecule EC enrichment (all molecules included, indexed by segment) + """ + + # determine the number of segments in the ECs + has_segments = "segment_i" in ecs.columns and "segment_j" in ecs.columns + + if not has_segments: + ecs.loc[:,"segment_i"] = FIRST_SEGMENT_NAME + ecs.loc[:,"segment_j"] = FIRST_SEGMENT_NAME + + # get a list of unique segments + segments = list( + set(ecs.segment_i.unique()).union( + set(ecs.segment_j.unique())) + ) + + # Calculate all the intra-segment enrichments + intra_enrichment_dfs = [] + for segment in segments: + intra_ecs = ecs.query( + "segment_i == segment_j == @segment" + ) + intra_enrichment = pairs.enrichment( + intra_ecs, min_seqdist=kwargs["min_sequence_distance"] + ) + intra_enrichment.loc[:,"segment_i"] = segment + + intra_enrichment_dfs.append(intra_enrichment) + + all_intra_enrichment = pd.concat(intra_enrichment_dfs) + + # Calculate the inter-segment enrichments + inter_enrichment_dfs = [] + for (segment1, segment2) in combinations(segments, 2): + + # Agnostic to ordering of segments + inter_ecs1 = ecs.query( + "segment_i == @segment1 and segment_j == @segment2" + ) + inter_ecs2 = ecs.query( + "segment_i == @segment2 and segment_j == @segment1" + ) + inter_ecs = pd.concat([inter_ecs1, inter_ecs2]) + + # for inter ECs, sequence distance is 0 + inter_enrichment = pairs.enrichment(inter_ecs, min_seqdist=0) + inter_enrichment_dfs.append(inter_enrichment) + + all_inter_enrichment = pd.concat(inter_enrichment_dfs) + + # Save to a file + outcfg["enrichment_intra_file"] = "{}_enrichment_intra.csv".format( + kwargs["prefix"] + ) + all_intra_enrichment.to_csv(outcfg["enrichment_intra_file"], index=None) + + if len(all_inter_enrichment) > 0: + outcfg["enrichment_inter_file"] = "{}_enrichment_inter.csv".format( + kwargs["prefix"] + ) + 
inter_enrichment.to_csv(outcfg["enrichment_inter_file"], index=None) + + return outcfg, all_intra_enrichment, all_inter_enrichment + + def complex(**kwargs): """ Protocol: @@ -531,7 +645,7 @@ def complex(**kwargs): use_all_ecs = False ecs = complex_probability( - ecs, kwargs["scoring_model"], use_all_ecs + ecs, kwargs["scoring_model"], use_all_ecs, outcfg["effective_sequences"], outcfg["num_sites"] ) else: @@ -552,17 +666,18 @@ def complex(**kwargs): ) ) + is_single_segment = segments is None or len(segments) == 1 outcfg = { **outcfg, **_postprocess_inference( ecs, kwargs, model, outcfg, prefix, - generate_line_plot=True, - generate_enrichment=False, + generate_line_plot=is_single_segment, + generate_enrichment=is_single_segment, ec_filter="segment_i != segment_j or abs(i - j) >= {}", chain=chain_mapping ) } - + # save just the inter protein ECs ## TODO: eventually have this accomplished by _postprocess_inference ## right now avoiding a second call with a different ec_filter @@ -575,12 +690,7 @@ def complex(**kwargs): write_config_file(prefix + ".couplings_complex.outcfg", outcfg) # TODO: make the following complex-ready - # EC enrichment: - # - # 1) think about making EC enrichment complex-ready and add - # it back here - so far it only makes sense if all ECs are - # on one segment - # + # EVzoom: # # 1) at the moment, EVzoom will use numbering before remapping @@ -594,47 +704,34 @@ def complex(**kwargs): return outcfg -def mean_field(**kwargs): - """ - Protocol: - - Infer ECs from alignment using mean field direct coupling analysis. - - For now, mean field DCA can only be run in focus mode, gaps - included. - - Parameters - ---------- - Mandatory kwargs arguments: - See list below in code where calling check_required. - - Returns - ------- - outcfg : dict - Output configuration of the pipeline, including - the following fields: - - * raw_ec_file - * model_file - * num_sites - * num_sequences - * effective_sequences - - * focus_mode (passed through) - * focus_sequence (passed through) - * segments (passed through) +def infer_mean_field(**kwargs): """ - check_required( - kwargs, - [ - "prefix", "alignment_file", "segments", - "focus_mode", "focus_sequence", "theta", - "pseudo_count", "alphabet", - "min_sequence_distance", # "save_model", - "ec_score_type", - ] - ) - + Run mean field EC computation on alignment. This function + contains the functionality shared between monomer and complex + EC inference. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + raw_ec_file + model_file + num_sites + num_valid_sequences + effective_sequences + + focus_mode (passed through) + focus_sequence (passed through) + segments (passed through) + + """ if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." @@ -642,24 +739,15 @@ def mean_field(**kwargs): prefix = kwargs["prefix"] - # option to save model disabled - """ - if kwargs["save_model"]: - model = prefix + ".model" - else: - model = None - """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", - # TODO: the following are passed through stage... - # keep this or unnecessary? 
"focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], - "segments": kwargs["segments"], + "segments": kwargs["segments"] } # make sure input alignment exists @@ -672,12 +760,6 @@ def mean_field(**kwargs): # make sure output directory exists create_prefix_folders(prefix) - segments = kwargs["segments"] - if segments is not None: - segments = [ - mapping.Segment.from_list(s) for s in segments - ] - # determine alphabet # default is protein if kwargs["alphabet"] is None: @@ -725,6 +807,195 @@ def mean_field(**kwargs): "region_start": int(model.index_list[0]), }) + return outcfg, model + + +def complex_mean_field(**kwargs): + """ + Protocol: + + Infer ECs for protein complexes from alignment using mean field. + Allows user to select scoring protocol. + + For now, mean field DCA can only be run in focus mode, gaps + included. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required. + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + raw_ec_file + model_file + num_sites + num_sequences + effective_sequences + + focus_mode (passed through) + focus_sequence (passed through) + segments (passed through) + """ + # for additional required parameters, see mean field info + check_required( + kwargs, + [ + "prefix", "min_sequence_distance", + "scoring_model", "use_all_ecs_for_scoring", + "alignment_file","segments","focus_mode", + "focus_sequence","theta","pseudo_count", + "alphabet" + ] + ) + + # Run the mean field inference + outcfg, model = infer_mean_field(**kwargs) + prefix = kwargs["prefix"] + + # read and sort ECs + # Note: this now deviates from the original EC format + # file because it has 4 score columns to accomodate + # MI (raw), MI (APC-corrected), DI, CN; + ecs = pd.read_csv( + outcfg["raw_ec_file"], sep=" ", + names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"] + ) + + # If the ECs have segments, remap + segments = kwargs["segments"] + if segments is not None: + segments = [ + mapping.Segment.from_list(s) for s in segments + ] + + seg_mapper = mapping.SegmentIndexMapper( + kwargs["focus_mode"], kwargs["region_start"], *segments + ) + + # apply to EC table + ecs = mapping.segment_map_ecs(ecs,seg_mapper) + + # add mixture model probability + if kwargs["scoring_model"] in SCORING_MODELS: + if kwargs["use_all_ecs_for_scoring"] is not None: + use_all_ecs = kwargs["use_all_ecs_for_scoring"] + else: + use_all_ecs = False + + ecs = complex_probability( + ecs, kwargs["scoring_model"], use_all_ecs + ) + + else: + raise InvalidParameterError( + "Invalid scoring_model parameter: " + + "{}. Valid options are: {}".format( + kwargs["protocol"], ", ".join(SCORING_MODELS) + ) + ) + + # create line-drawing script (for multiple chains) + # by convention, we map first segment to chain A, + # second to B, a.s.f. 
+ chain_mapping = dict( + zip( + [s.segment_id for s in segments], + string.ascii_uppercase, + ) + ) + + is_single_segment = segments is None or len(segments) == 1 + outcfg = { + **outcfg, + **_postprocess_inference( + ecs, kwargs, model, outcfg, prefix, + generate_line_plot=is_single_segment, + generate_enrichment=is_single_segment, + ec_filter="segment_i != segment_j or abs(i - j) >= {}", + chain=chain_mapping + ) + } + + # save just the inter protein ECs + ## TODO: eventually have this accomplished by _postprocess_inference + ## right now avoiding a second call with a different ec_filter + ecs = pd.read_csv(outcfg["ec_file"]) + outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv" + inter_ecs = ecs.query("segment_i != segment_j") + inter_ecs.to_csv(outcfg["inter_ec_file"], index=False) + + # dump output config to YAML file for debugging/logging + write_config_file(prefix + ".couplings_complex.outcfg", outcfg) + + # TODO: make the following complex-ready + # EVzoom: + # + # 1) at the moment, EVzoom will use numbering before remapping + # we should eventually get this to a point where segments + residue + # index are displayed on EVzoom + # + # 2) note that this will currently use the default mixture model + # selection for determining the EC cutoff, rather than the selection + # used for the EC table above + + return outcfg + + +def mean_field(**kwargs): + """ + Protocol: + + Infer ECs from alignment using mean field direct coupling analysis. + + For now, mean field DCA can only be run in focus mode, gaps + included. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required. + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + * raw_ec_file + * model_file + * num_sites + * num_sequences + * effective_sequences + + * focus_mode (passed through) + * focus_sequence (passed through) + * segments (passed through) + """ + check_required( + kwargs, + [ + "prefix", "alignment_file", "segments", + "focus_mode", "focus_sequence", "theta", + "pseudo_count", "alphabet", + "min_sequence_distance", # "save_model", + "ec_score_type", + ] + ) + + outcfg, model = infer_mean_field(**kwargs) + + # establish segments for model + segments = kwargs["segments"] + if segments is not None: + segments = [ + mapping.Segment.from_list(s) for s in segments + ] + # read and sort ECs # Note: this now deviates from the original EC format # file because it has 4 score columns to accomodate @@ -870,27 +1141,29 @@ def _postprocess_inference(ecs, kwargs, model, outcfg, prefix, generate_line_plo # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) - if generate_enrichment: - ext_outcfg["enrichment_file"] = prefix + "_enrichment.csv" - min_seqdist = kwargs["min_sequence_distance"] - if min_seqdist is None: - min_seqdist = 0 + min_seqdist = kwargs["min_sequence_distance"] + if min_seqdist is None: + min_seqdist = 0 - ecs_enriched = pairs.enrichment( - ecs, score=score, min_seqdist=min_seqdist - ) - ecs_enriched.to_csv(ext_outcfg["enrichment_file"], index=False) + ## TODO: need to make by_segment_ec_enrichment compatible with new enrichment refactoring + ## TODO: MAKE SURE this computes all needed output files - discrepancy with new release as of Oct 2020 + # enrichment refactoring + outcfg, intra_ecs_enriched, _ = by_segment_ec_enrichment(ecs, outcfg, **kwargs) + ext_outcfg["enrichment_file"] = prefix + "_enrichment.csv" + 
intra_ecs_enriched.to_csv(ext_outcfg["enrichment_file"], index=False) + for segment, ecs_enriched in intra_ecs_enriched.groupby("segment_i"): # create corresponding enrichment pymol scripts ext_outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ - (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") + (True, f"{segment}_enrichment_spheres.pml"), (False, f"{segment}_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) ext_outcfg["enrichment_pml_files"].append(pml_file) + # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not None: ext_outcfg["evzoom_file"] = prefix + "_evzoom.json" @@ -928,6 +1201,9 @@ def _postprocess_inference(ecs, kwargs, model, outcfg, prefix, generate_line_plo # inference protocol using mean field approximation "mean_field": mean_field, + + # runs DCA/meanfield for complexes + "complex_mean_field": complex_mean_field } diff --git a/evcouplings/score_interaction/__init__.py b/evcouplings/score_interaction/__init__.py new file mode 100644 index 00000000..2126dd1d --- /dev/null +++ b/evcouplings/score_interaction/__init__.py @@ -0,0 +1,3 @@ +from evcouplings.score_interaction.asa import * +from evcouplings.score_interaction.protocol import * + diff --git a/evcouplings/score_interaction/asa.py b/evcouplings/score_interaction/asa.py new file mode 100644 index 00000000..f3cf380d --- /dev/null +++ b/evcouplings/score_interaction/asa.py @@ -0,0 +1,204 @@ +import pandas as pd +import numpy as np +from evcouplings.utils.system import verify_resources, valid_file +from evcouplings.compare.tools import run_dssp +from evcouplings.utils.constants import AA_SURFACE_AREA +from Bio.PDB import make_dssp_dict + +def read_dssp_output(filename): + """ + Reads the output files from DSSP and converts them into a pandas DataFrame + + Parameters + ---------- + filename: str + Path to output file from DSSP + + Returns + ------- + pd.DataFrame with columns i, res, asa + Representing residue number, identity, and accesisble surface area + """ + + if not valid_file(filename): + return pd.DataFrame({ + "i": [], + "res": [], + "asa": [] + }) + + dssp_dict, _ = make_dssp_dict(filename) + data = [] + for key, value in dssp_dict.items(): + # keys are formatted as (chain, ("", i, "")) + chain, (_, i, inscode) = key + + res = value[0] + asa = value[2] + + data.append({ + "i": i, + "res": res, + "asa": asa + }) + + return pd.DataFrame(data) + + +def calculate_rsa(dataframe, aa_surface_area=None, output_column="rsa"): + """ + Converts raw accessible surface area to relative accessible surface area, + by dividing the raw accessible surface area by the max accessible surface area + + Parameters + ---------- + dataframe: pd.DataFrame + Dataframe of raw accessible surface area + AA_SURFACE_AREA: dict of str: numeric + Values of max accessible surface area to use for conversion + output_column: str + Name of output column to create + + Returns + ------- + pd.DataFrame + """ + if aa_surface_area is None: + aa_surface_area = AA_SURFACE_AREA + + modified_dataframe = dataframe.copy() + modified_dataframe.loc[:, output_column] = [x.asa/aa_surface_area[x.res] for _, x in modified_dataframe.iterrows()] + return modified_dataframe + + +def asa_run(pdb_file, dssp_output_file, rsa_output_file, dssp_binary): + """ + Paramters + --------- + file: str + path to pdb file on which to run DSSP + dssp_output_file: str + path to save dssp file + rsa_output_file: str + path to 
save rsa output file + dssp_binary: str + path to dssp binary + + Returns + ------- + pd.DataFrame with relative accessible surface area for each position in PDB + """ + + verify_resources( + "PDB file input to DSSP does not exist", + pdb_file + ) + + run_dssp(pdb_file, dssp_output_file, dssp_binary) + + d = read_dssp_output(dssp_output_file) + d = calculate_rsa(d, AA_SURFACE_AREA) + + return d + + +def combine_asa(remapped_pdb_files, dssp_binary, prefix, outcfg): + """ + Parameters + ---------- + remapped_pdb_files: list of str + path to all remapped pdb files to be analyzed + prefix: str + path to DSSP binary + outcfg: dict + output configuration + """ + + outcfg["dssp_output_files"] = [] + outcfg["rsa_output_files"] = [] + + # If no remapped pdb files, return empty df + if remapped_pdb_files is None or len(remapped_pdb_files) == 0: + return pd.DataFrame({ + "i": [], + "mean": [], + "max": [], + "min": [] + }), outcfg + + # run dssp for each remapped_pdb_file + d_list = [] + for file in remapped_pdb_files: + if valid_file(file): + + # prepare output file paths + file_path_prefix = file.rsplit(".pdb", maxsplit=1)[0] + pdb_prefix = file_path_prefix.rsplit("/", maxsplit=1)[-1] + + dssp_output_file = prefix + pdb_prefix + ".dssp" + rsa_output_file = prefix + pdb_prefix + "_rsa.csv" + + d = asa_run(file, dssp_output_file, rsa_output_file, dssp_binary) + + # add information to combined dataframe + d_list.append(d) + + # save the output files + outcfg["dssp_output_files"].append(dssp_output_file) + outcfg["rsa_output_files"].append(rsa_output_file) + + #print(d_list) + data = pd.concat(d_list) + + # group the dataframe of RSA by residue + means = data.groupby("i").rsa.mean() + maxes = data.groupby("i").rsa.max() + mins = data.groupby("i").rsa.min() + + return pd.DataFrame({ + "i": means.index, + "mean": list(means), + "max": list(maxes), + "min": list(mins) + }), outcfg + + +def add_asa(ec_df, asa, asa_column): + """ + Add a column for the accessible surface area for each residue i and j to a DataFrame + + Parameters + ---------- + ec_df: pd.DataFrame + dataframe with columns i, j, segment_i, and segment_j + asa: pd.DataFrame + dataframe containing accessible surface area information + asa_column: str + name of column in asa df to use + + Returns + ------- + pd.DataFrame + """ + # make a dictionary of residue and segment pointing to accesible surface area value + s_to_e = { + (x, y): z for x, y, z in zip( + asa.i, asa.segment_i, asa[asa_column] + ) + } + + # Add the accesible surface area for position i + ec_df["asa_i"] =[ + s_to_e[(x, y)] if (x, y) in s_to_e else np.nan for x, y in zip( + ec_df.i, ec_df.segment_i + ) + ] + + # Add the accessible surface area for position j + ec_df["asa_j"] =[ + s_to_e[(x, y)] if (x, y) in s_to_e else np.nan for x, y in zip( + ec_df.j, ec_df.segment_j + ) + ] + + return ec_df diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved new file mode 100644 index 00000000..d7666adf Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler new file mode 100644 index 00000000..8eed199e Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler differ diff --git 
a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved new file mode 100644 index 00000000..717711fd Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler new file mode 100644 index 00000000..16f653d8 Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved b/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved new file mode 100644 index 00000000..a45806e8 Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__struc.saved b/evcouplings/score_interaction/aux/20200717_frozen__struc.saved new file mode 100644 index 00000000..e861249d Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__struc.saved differ diff --git a/evcouplings/score_interaction/protocol.py b/evcouplings/score_interaction/protocol.py new file mode 100644 index 00000000..dc0fcd45 --- /dev/null +++ b/evcouplings/score_interaction/protocol.py @@ -0,0 +1,532 @@ +""" +Fix EVcomplex2 models to inter-protein ECs + +Authors: + Anna G. Green +""" +import joblib +import pandas as pd +import numpy as np + +from evcouplings.utils.config import ( + check_required, InvalidParameterError +) + +from evcouplings.utils.system import ( + valid_file +) +from evcouplings.couplings import enrichment, Segment +from evcouplings.score_interaction.asa import combine_asa, add_asa +from evcouplings.compare.distances import DistanceMap +from evcouplings.compare.protocol import complex_probability, plot_complex_cm +from evcouplings.align import ALPHABET_PROTEIN +from evcouplings.utils.constants import HYDROPATHY_INDEX + +X_STRUCFREE = [ + "evcomplex_normed", + "conservation_max", + "intra_enrich_max", + "inter_relative_rank_longrange" + +] +X_STRUCAWARE = [ + "evcomplex_normed", + "asa_min", + "precision", + "conservation_max", + "intra_enrich_max", + "inter_relative_rank_longrange", +] + +X_COMPLEX_STRUCFREE = [0, 2] + +X_COMPLEX_STRUCAWARE = [0, 3, 4, 7] + + +def fit_model(data, model_file, X, column_name): + """ + Fits a model to predict p(residue interaction) + + data: pd.DataFrame + has columns X used as features in model + model_file: str + path to file containing joblib dumped model (here, an sklearn logistic regression) + X: list of str + the columns to be input as features to the model. 
N.B., MUST be in the same order + as when the model was originally fit + column_name: str + name of column to create + + Returns + pd.DataFrame of ECs with new column column_name containing the fit model, + or np.nan if the model could not be fit due to missing data + """ + + model = joblib.load(model_file) + + # the default score is np.nan + data[column_name] = np.nan + + # if any of the needed columns are missing, return data + for col in X: + if col not in data.columns: + print("missing", col) + return data + + # drop rows with missing info + subset_data = data.dropna(subset=X) + if len(subset_data) == 0: + return data + + X_var = subset_data[X] + predicted = model.predict_proba(X_var)[:, 1] + + # make prediction and save to correct row + data.loc[subset_data.index, column_name] = predicted + + return data + + +def fit_complex_model(ecs, model_file, scaler_file, residue_score_column, output_column, scores_to_use): + """ + Fits a model to predict p(protein interaction) + + data: pd.DataFrame + has columns X used as features in model + model_file: str + path to file containing joblib dumped model (here, an sklearn logistic regression) + scaler_file: str + path to file containing joblib dumped Scaler object + residue_score_column: str + a column name in data to be used as input to model + output_column: str + name of column to create + scores_to_use: list of int + name of column to create + + Returns + pd.DataFrame of ECs with new column column_name containing the fit model, + or np.nan if the model could not be fit due to missing data + """ + # load the model and scaler + model = joblib.load(model_file) + scaler = joblib.load(scaler_file) + + # sort by the residue score column, and take the instances input + ecs = ecs.sort_values(residue_score_column, ascending=False) + X = list( + ecs[residue_score_column].iloc[scores_to_use] + ) + [ecs.inter_relative_rank_longrange.min()] + + # reshape and clean the data + X = np.array(X).astype(float) + X = X.transpose() + X = np.array(X).reshape(1, -1) + + X = np.nan_to_num(X) + + # transform with the scaler + X = scaler.transform(X) + + ecs.loc[:, output_column] =model.predict_proba(X)[:, 1] + return ecs + + +def _make_complex_contact_maps_probability(ec_table, d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs): + """ + Plot contact maps with all ECs above a certain probability threshold, + or a given count of ECs + + Parameters + ---------- + ec_table : pandas.DataFrame + Full set of evolutionary couplings (all pairs) + d_intra_i, d_intra_j: DistanceMap + Computed residue-residue distances within chains for + monomers i and j + d_multimer_i, d_multimer_j : DistanceMap + Computed residue-residue distances between homomultimeric + chains for monomers i and j + d_inter: DistanceMap + Computed residue-residue distances between heteromultimeric + chains i and j + first_segment_name, second_segment_name: str + Name of segment i and segment j in the ec_table + **kwargs + Further plotting parameters, see check_required in code + for necessary values. 
+ + Returns + ------- + cm_files : list(str) + Paths of generated contact map files + """ + check_required( + kwargs, + [ + "prefix", + "plot_probability_cutoffs", + "boundaries", + "draw_secondary_structure", + "scale_sizes" + ] + ) + + prefix = kwargs["prefix"] + + cm_files = [] + + ecs_longrange = ec_table.query( + "abs(i - j) >= {} or segment_i != segment_j".format(kwargs["min_sequence_distance"]) + ) + + # create plots based on significance cutoff + if kwargs["plot_probability_cutoffs"]: + for column, cutoffs in kwargs["plot_probability_cutoffs"].items(): + + if not isinstance(cutoffs, list): + cutoffs = [cutoffs] + + for cutoff in cutoffs: + ec_set_i = ecs_longrange.query("segment_i == segment_j == @first_segment_name") + ec_set_j = ecs_longrange.query("segment_i == segment_j == @second_segment_name") + ec_set = ecs_longrange.loc[ecs_longrange[column] >= cutoff, :] + + ec_set_inter = ec_set.query("segment_i != segment_j") + + output_file = prefix + "_{}_significant_ECs_{}.pdf".format(column, cutoff) + plot_completed = plot_complex_cm( + ec_set_i, ec_set_j, ec_set_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, + first_segment_name, second_segment_name, + output_file=output_file, **kwargs + ) + + if plot_completed: + cm_files.append(output_file) + + # give back list of all contact map file names + return cm_files + + +def standard(**kwargs): + """ + Protocol: + Compare ECs for a complex to + 3D structure + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + * ec_file_compared_all + * ec_file_compared_all_longrange + * pdb_structure_hits + * distmap_monomer + * distmap_multimer + * contact_map_files + * remapped_pdb_files + """ + check_required( + kwargs, + [ + "structurefree_model_file", "structureaware_model_file", + "ec_compared_longrange_file", "ec_longrange_file", + "first_remapped_pdb_files", "second_remapped_pdb_files", + "frequencies_file" + ] + ) + + prefix = kwargs["prefix"] + + outcfg = { + # initialize output EC files + "ec_calibration_file": prefix + "_inter_calibration.csv", + "ec_prediction_file": prefix + "_inter_prediction.csv", + } + + # create an inter-ecs file with extra information for calibration purposes + def _calibration_file(prefix, ec_file, outcfg): + + """ + Adds values to the dataframe of ECs that will later be used + for score fitting + """ + + ecs = pd.read_csv(ec_file, index_col=0) + + # calculate intra-protein enrichment + def _add_enrichment(ecs): + + # Calculate the intra-protein enrichment + intra1_ecs = ecs.query("segment_i == segment_j == 'A_1'") + intra2_ecs = ecs.query("segment_i == segment_j == 'B_1'") + + intra1_enrichment = enrichment(intra1_ecs, min_seqdist=6) + intra1_enrichment["segment_i"] = "A_1" + + intra2_enrichment = enrichment(intra2_ecs, min_seqdist=6) + intra2_enrichment["segment_i"] = "B_1" + + enrichment_table = pd.concat([intra1_enrichment, intra2_enrichment]) + + def _seg_to_enrich(enrich_df, ec_df, enrichment_column): + """ + combines the enrichment table with the EC table + """ + s_to_e = {(x, y):z for x, y, z in zip( + enrich_df.i, enrich_df.segment_i, enrich_df[enrichment_column] + )} + + # enrichment for residues in column i + ec_df["enrichment_i"] = [s_to_e[(x, y)] if (x, y) in s_to_e else 0 for x, y in zip( + ec_df.i, ec_df.segment_i + )] + + # enrichment for residues in column j + ec_df["enrichment_j"] = [s_to_e[(x, y)] if (x, 
y) in s_to_e else 0 for x, y in zip( + ec_df.j, ec_df.segment_j + )] + + return ec_df + + # add the intra-protein enrichment to the EC table + ecs = _seg_to_enrich(enrichment_table, ecs, "enrichment") + # larger of two enrichment values + ecs["intra_enrich_max"] = ecs[["enrichment_i", "enrichment_j"]].max(axis=1) + # smaller of two enrichment values + ecs["intra_enrich_min"] = ecs[["enrichment_i", "enrichment_j"]].min(axis=1) + + return ecs + + ecs = _add_enrichment(ecs) + ecs = ecs.reset_index() + + # get just the inter ECs and calculate Z-score + ecs = ecs.query("segment_i != segment_j") + mean_ec = ecs.cn.mean() + std_ec = ecs.cn.std() + ecs.loc[:, 'Z_score'] = (ecs.cn - mean_ec) / std_ec + + # add the evcomplex score with neffL correction + N_effL = kwargs["effective_sequences"] / kwargs["num_sites"] + ecs = complex_probability(ecs, "evcomplex", use_all_ecs=False, + score="cn", Neff_over_L=N_effL) + ecs.loc[:, "evcomplex_normed"] = ecs.loc[:, "probability"] + + # get only the top 100 inter ECs + L = len(ecs.i.unique()) + len(ecs.j.unique()) + ecs = ecs[0:100] + + # add rank + ecs["inter_relative_rank_longrange"] = ecs.index / L + print(ecs.head()) + # calculate the ASA for the first and second segments by combining asa from all remapped pdb files + first_asa, outcfg = combine_asa(kwargs["first_remapped_pdb_files"], kwargs["dssp"], prefix, outcfg) + first_asa["segment_i"] = "A_1" + + second_asa, outcfg = combine_asa(kwargs["second_remapped_pdb_files"], kwargs["dssp"], prefix, outcfg) + second_asa["segment_i"] = "B_1" + + # save the ASA to a file + asa = pd.concat([first_asa, second_asa]) + outcfg["asa_file"] = prefix + "_surface_area.csv" + asa.to_csv(outcfg["asa_file"]) + + # Add the ASA to the ECs and compute the max and min for each position pair + ecs = add_asa(ecs, asa, asa_column="mean") + ecs["asa_max"] = ecs[["asa_i", "asa_j"]].max(axis=1) + ecs["asa_min"] = ecs[["asa_i", "asa_j"]].min(axis=1) + + # Add min and max conservation to EC file + frequency_file = kwargs["frequencies_file"] + d = pd.read_csv(frequency_file) + conservation = {(x,y):z for x,y,z in zip(d.segment_i, d.i, d.conservation)} + + ecs["conservation_i"] = [ + conservation[(x, y)] if (x, y) in conservation else np.nan for x, y in zip(ecs.segment_i, ecs.i) + ] + ecs["conservation_j"] = [ + conservation[(x, y)] if (x, y) in conservation else np.nan for x, y in zip(ecs.segment_j, ecs.j) + ] + ecs["conservation_max"] = ecs[["conservation_i", "conservation_j"]].max(axis=1) + ecs["conservation_min"] = ecs[["conservation_i", "conservation_j"]].min(axis=1) + + # # amino acid frequencies + # for char in list(ALPHABET_PROTEIN): + # # Frequency of amino acid 'char' in position i + # ecs = ecs.merge(d[["i", "segment_i", char]], on=["i","segment_i"], how="left", suffixes=["", "_1"]) + # ecs = ecs.rename({char: f"f{char}_i"}, axis=1) + # if "i_1" in ecs.columns: + # ecs = ecs.drop(columns=["i_1", "segment_i_1"]) + # + # # Frequency of amino acid 'char' in position j + # ecs = ecs.merge( + # d[["i", "segment_i", char]], left_on=["j", "segment_j"], + # right_on=["i", "segment_i"], how="left", suffixes=["", "_1"] + # ) + # ecs = ecs.rename({char: f"f{char}_j"}, axis=1) + # if "j_1" in ecs.columns: + # ecs = ecs.drop(columns=["j_1", "segment_j_1"]) + # + # # summed frequency of amino acid char in both positions i and j + # # ie, each pair i,j now gets one combined frequency + # print("computing frequencies") + # for char in list(ALPHABET_PROTEIN): + # ecs[f"f{char}"] = ecs[f"f{char}_i"] + ecs[f"f{char}_j"] + # + # # Compute the 
weighted sum of hydropathy for pair i, j + # hydrophilicity = [] + # + # # For each EC + # print("computing hydrophilicty") + # for _, row in ecs.iterrows(): + # # frequncy of amino acid char * hydopathy index of that AA + # hydro = sum([ + # HYDROPATHY_INDEX[char] * float(row[[f'f{char}']]) for char in list(ALPHABET_PROTEIN) + # ]) + # hydrophilicity.append(hydro) + # + # ecs["f_hydrophilicity"] = hydrophilicity + + # save the calibration file + ecs.to_csv(outcfg["ec_calibration_file"]) + + # Compute the calibration file + if valid_file(kwargs["ec_compared_longrange_file"]): + _calibration_file(prefix, kwargs["ec_compared_longrange_file"], outcfg) + elif valid_file(kwargs["ec_longrange_file"]): + _calibration_file(prefix, kwargs["ec_longrange_file"], outcfg) + else: + raise InvalidParameterError("No valid EC file provided as input for modeling") + + calibration_ecs = pd.read_csv(outcfg["ec_calibration_file"], index_col=0) + calibration_ecs = calibration_ecs.sort_values("cn", ascending=False)[0:20] + + # Fit the structure free model file + print('fitting the model') + calibration_ecs = fit_model( + calibration_ecs, + kwargs["structurefree_model_file"], + X_STRUCFREE, + "residue_prediction_strucfree" + ) + + # Fit the structure aware model prediction file + calibration_ecs = fit_model( + calibration_ecs, + kwargs["structureaware_model_file"], + X_STRUCAWARE, + "residue_prediction_strucaware" + ) + + # Fit the structure free complex model + calibration_ecs = fit_complex_model( + calibration_ecs, + kwargs["complex_strucfree_model_file"], + kwargs["complex_strucfree_scaler_file"], + "residue_prediction_strucfree", + "complex_prediction_strucfree", + X_COMPLEX_STRUCFREE + ) + + # Fit the structure aware complex model + calibration_ecs = fit_complex_model( + calibration_ecs, + kwargs["complex_strucaware_model_file"], + kwargs["complex_strucaware_scaler_file"], + "residue_prediction_strucaware", + "complex_prediction_strucaware", + X_COMPLEX_STRUCAWARE + ) + + calibration_ecs[[ + "inter_relative_rank_longrange", "i", "A_i", "j", "A_j", + "segment_i", "segment_j", "cn", "dist", "precision", "evcomplex_normed", + "Z_score", "asa_min", "conservation_max", "intra_enrich_max", + "residue_prediction_strucaware", "residue_prediction_strucfree", + "complex_prediction_strucaware", "complex_prediction_strucfree" + ]].to_csv(outcfg["ec_prediction_file"]) + + # Step 4: Make contact map plots + # if no structures available, defaults to EC-only plot + def _load_distmap(distmap_file): + if distmap_file is not None and valid_file(distmap_file + ".csv"): + distmap = DistanceMap.from_file(distmap_file) + else: + distmap = None + return distmap + + d_intra_i = _load_distmap(kwargs["first_distmap_monomer"]) + d_multimer_i = _load_distmap(kwargs["first_distmap_multimer"]) + d_intra_j = _load_distmap(kwargs["second_distmap_monomer"]) + d_multimer_j = _load_distmap(kwargs["second_distmap_multimer"]) + d_inter = _load_distmap(kwargs["distmap_inter"]) + + first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id + second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id + + # Add the top L intra ECs to the ec_table + ecs = pd.read_csv(kwargs["ec_file"]).query("segment_i == segment_j") + ecs = ecs.query("abs(i - j) >= {} or segment_i != segment_j".format(kwargs["min_sequence_distance"])) + ecs_intra = ecs.iloc[0:kwargs["num_sites"]] + + calibration_ecs = pd.concat([calibration_ecs, ecs_intra]) + + outcfg["contact_map_files"] = _make_complex_contact_maps_probability( + calibration_ecs, 
d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs + ) + + return outcfg + + +# list of available EC comparison protocols +PROTOCOLS = { + # standard complex model fitting protocol + "standard": standard, +} + + +def run(**kwargs): + """ + Run inference protocol to calculate ECs from + input sequence alignment. + + Parameters + ---------- + Mandatory kwargs arguments: + protocol: EC protocol to run + prefix: Output prefix for all generated files + + Returns + ------- + outcfg : dict + Output configuration of stage + (see individual protocol for fields) + """ + check_required(kwargs, ["protocol"]) + + if kwargs["protocol"] not in PROTOCOLS: + raise InvalidParameterError( + "Invalid protocol selection: " + + "{}. Valid protocols are: {}".format( + kwargs["protocol"], ", ".join(PROTOCOLS.keys()) + ) + ) + + return PROTOCOLS[kwargs["protocol"]](**kwargs) diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index dd1ff573..e941afb7 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -428,11 +428,11 @@ def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_err def run(**kwargs): """ Exposes command line interface as a Python function. - + Parameters ---------- kwargs - See click.option decorators for app() function + See click.option decorators for app() function """ # substitute commmand line options in config file config = substitute_config(**kwargs) diff --git a/evcouplings/utils/batch.py b/evcouplings/utils/batch.py index ddedd338..6a9d95ac 100644 --- a/evcouplings/utils/batch.py +++ b/evcouplings/utils/batch.py @@ -579,7 +579,7 @@ def _prepare_resources(self, resources): class SlurmSubmitter(AClusterSubmitter): """ - Implements an LSF submitter + Implements a Slurm submitter """ __name = "slurm" __submit = "sbatch --job-name={name} {dependent} {resources} --wrap 'srun {cmd}'" @@ -624,7 +624,7 @@ def __del__(self): os.remove(self.__db_path) except AttributeError: pass - + @property def isBlocking(self): return self.__blocking diff --git a/evcouplings/utils/constants.py b/evcouplings/utils/constants.py index bf361796..a5bc4a1c 100644 --- a/evcouplings/utils/constants.py +++ b/evcouplings/utils/constants.py @@ -3,7 +3,9 @@ Authors: Thomas A. Hopf + Anna G. 
Green """ +import numpy as np # amino acid one-letter code to three-letter code AA1_to_AA3 = { @@ -36,3 +38,53 @@ AA3_to_AA1 = { v: k for k, v in AA1_to_AA3.items() } + +# Amino acid surface area values from Tien et al, 2013 (empirical values) +AA_SURFACE_AREA = { + "A": 121, + "R": 265, + "D": 187, + "N": 187, + "C": 148, + "E": 214, + "Q": 214, + "G": 97, + "H": 216, + "I": 195, + "L": 191, + "K": 230, + "M": 103, + "F": 228, + "P": 154, + "S": 143, + "T": 163, + "W": 264, + "Y": 255, + "V": 165, + "X": np.nan +} + +#Hydropathy index from Lehninger Principles of Biochemistry, 5th Edition, table 3-1 +HYDROPATHY_INDEX = { + "G": -.4, + "A": 1.8, + "P": -1.6, + "V": 4.2, + "L": 3.8, + "I": 4.5, + "M": 1.9, + "F": 2.8, + "Y": -1.3, + "W": -0.9, + "S": -0.8, + "T": -0.7, + "C": 2.5, + "N": -3.5, + "Q": -3.5, + "K": -3.9, + "H": -3.2, + "R": -4.5, + "D": -3.5, + "E": -3.5, + "-": 0 +} \ No newline at end of file diff --git a/evcouplings/utils/pipeline.py b/evcouplings/utils/pipeline.py index 08278466..990a8bf0 100644 --- a/evcouplings/utils/pipeline.py +++ b/evcouplings/utils/pipeline.py @@ -40,6 +40,7 @@ import evcouplings.mutate.protocol as mt import evcouplings.fold.protocol as fd import evcouplings.complex.protocol as pp +import evcouplings.probability.protocol as pb # supported pipelines # @@ -64,6 +65,7 @@ ("concatenate", pp.run, None), ("couplings", cp.run, None), ("compare", cm.run, None), + ("probability", pb.run, None), ("mutate", mt.run, None), ("fold", fd.run, None) ] @@ -199,6 +201,7 @@ def execute(**config): # one less stage to put through after we ran this... num_stages_to_run -= 1 + else: # skip state by injecting state from previous run verify_resources( @@ -214,9 +217,13 @@ def execute(**config): # verify all the output files are there outfiles = [ filepath for f, filepath in outcfg.items() - if f.endswith("_file") and filepath is not None + if f.endswith("_file") and filepath is not None and f!="model_file" \ + and not filepath.endswith("raw_focus.fasta") and not filepath.endswith("raw.fasta") \ + and not filepath.endswith("monomer_1.fasta") and not filepath.endswith("monomer_2.fasta") \ + and not filepath.endswith("longrange.csv") and not filepath.endswith("_ECs.txt") \ + and not filepath.endswith(".pml") and not filepath.endswith("json") ] - + verify_resources( "Output files from stage '{}' " "missing".format(stage), @@ -377,7 +384,7 @@ def verify_prefix(verify_subdir=True, **config): Check if configuration contains a prefix, and that prefix is a valid directory we can write to on the filesystem - + Parameters ---------- verify_subdir : bool, optional (default: True) @@ -386,7 +393,7 @@ def verify_prefix(verify_subdir=True, **config): app loop. 
**config Input configuration for pipeline - + Returns ------- prefix : str @@ -550,7 +557,7 @@ def run(**kwargs): EVcouplings pipeline execution from a configuration file (single thread, no batch or environment configuration) - + Parameters ---------- kwargs diff --git a/evcouplings/visualize/pairs.py b/evcouplings/visualize/pairs.py index 079e8c60..e310e32a 100644 --- a/evcouplings/visualize/pairs.py +++ b/evcouplings/visualize/pairs.py @@ -439,7 +439,7 @@ def complex_contact_map(intra1_ecs, intra2_ecs, inter_ecs, ) ) - # Don't compute inter boundaries unless we have inter + # Don't compute inter boundaries unless we have inter # ecs or distances if (inter_ecs is not None and not inter_ecs.empty) or d_inter is not None: inter_boundaries = list( @@ -457,7 +457,7 @@ def _boundary_union(original_boundaries, new_boundaries_axis1, # of the protein. # Default is to update both axes updated_boundaries = original_boundaries - # increase the axis 1 boundaries if the new boundaries + # increase the axis 1 boundaries if the new boundaries # cover more range if axis1: updated_boundaries[0] = ( @@ -491,7 +491,7 @@ def _boundary_union(original_boundaries, new_boundaries_axis1, ) intra2_boundaries = _boundary_union( - intra2_boundaries, inter_boundaries, inter_boundaries, + intra2_boundaries, inter_boundaries, inter_boundaries, axis1=False, axis2=True, symmetric=True ) @@ -1276,7 +1276,7 @@ def ec_lines_pymol_script(ec_table, output_file, distance_cutoff=5, """ Create a Pymol .pml script to visualize ECs on a 3D structure - + Parameters ---------- ec_table : pandas.DataFrame @@ -1349,7 +1349,7 @@ def enrichment_pymol_script(enrichment_table, output_file, ---------- enrichment_table : pandas.DataFrame Mapping of position (column i) to EC enrichment - (column enrichemnt), as returned by + (column enrichemnt), as returned by evcouplings.couplings.pairs.enrichment() output_file : str File path where to store pml script @@ -1364,7 +1364,7 @@ def enrichment_pymol_script(enrichment_table, output_file, Use legacy (2011) red and yellow colormap for EC enrichment """ - if legacy: + if legacy: t = enrichment_table.query("enrichment > 1") t.loc[:, "b_factor"] = t.enrichment # compute boundaries for highly coupled residues @@ -1386,7 +1386,7 @@ def enrichment_pymol_script(enrichment_table, output_file, # set the boundary for number of residues to be rendered as spheres sphere_boundary = boundary2 - + else: t = deepcopy(enrichment_table) t.loc[:, "b_factor"] = t.enrichment @@ -1457,11 +1457,10 @@ def enrichment_pymol_script(enrichment_table, output_file, for idx, c in enumerate(color_list): f.write("set_color color{}, [{},{},{}]\n".format(idx, c[0], c[1], c[2])) f.write("color color{}{}\n".format(len(boundary_list) - 1, chain_sel)) - + f.write("as cartoon{}\n".format(chain_sel)) pymol_mapping(t, f, chain) if not sphere_view: f.write("cartoon putty{}\n".format(chain_sel)) - diff --git a/notebooks/output_files_tutorial.ipynb b/notebooks/output_files_tutorial.ipynb index e6f0ca9b..bdc96d22 100644 --- a/notebooks/output_files_tutorial.ipynb +++ b/notebooks/output_files_tutorial.ipynb @@ -829,6 +829,28 @@ "- enrichment: Sum of the evolutionary coupling scores that involve this residue, normalized by the background level of coupling" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cumulative coupling score files for complex ECs (_enrichment_intra.csv and _enrichment_inter.csv)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same as the monomer cumulative coupling score 
file, except calculated using the intra-protein or inter-protein ECs\n", + "\n", + "*output key: enrichment_intra_file and enrichment_inter_file*\n", + "\n", + "Columns:\n", + "- i: position i\n", + "- A_i: character i\n", + "- segment_i: segment of origin of position i\n", + "- enrichment: Sum of the evolutionary coupling scores that involve this residue, normalized by the background level of coupling" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1290,21 +1312,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.6.5" } }, "nbformat": 4, diff --git a/test/TestComplex.py b/test/TestComplex.py index 873b24f7..6ee30bbc 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -70,16 +70,23 @@ def __init__(self, *args, **kwargs): paralog_file = "{}/concatenate/test_new_paralog_table.csv".format(TRAVIS_PATH) self.paralog_table = pd.read_csv(paralog_file, index_col=0, header=0) + # table of frequencies + self.uncorrected_frequency_file = "{}/concatenate/test_new_frequencies_uncorrected.csv".format(TRAVIS_PATH) + corrected_frequency_file = "{}/concatenate/test_new_frequencies.csv".format(TRAVIS_PATH) + self.freq_file = corrected_frequency_file + self.freqs = pd.read_csv(corrected_frequency_file, index_col=0, header=0) + # input and output configuration with open("{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH)) as inf: self.incfg = yaml.safe_load(inf) + self.incfg["forbid_overlapping_concatenation"] = True with open("{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH)) as inf: self.outcfg = yaml.safe_load(inf) def test_genome_distance(self): """ - tests the genome distance concatenation protocol. + Tests the genome distance concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). Verifies that the output configuration has all of the necessary keys @@ -123,7 +130,7 @@ def test_genome_distance(self): def test_best_hit_normal(self): """ - tests the genome distance concatenation protocol. + Tests the best hit concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). Verifies that the output configuration has all of the necessary keys @@ -173,7 +180,7 @@ def test_best_hit_normal(self): def test_best_hit_reciprocal(self): """ - tests the genome distance concatenation protocol. + Tests the best reciprocal hit concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). 
         Verifies that the output configuration has all of the necessary keys
@@ -223,8 +230,7 @@ def test_best_hit_reciprocal(self):
 
     def test_modify_complex_segments(self):
         """
-        tests that modify_complex_segments adds the correct "segments" field to the outcfg dictionary
-        :return:
+        Tests that modify_complex_segments adds the correct "segments" field to the outcfg dictionary
         """
         test_configuration = deepcopy(self.incfg)
         cfg = deepcopy(test_configuration)
@@ -236,9 +242,29 @@ def test_modify_complex_segments(self):
         test_configuration["segments"] = self.outcfg["segments"]
         self.assertDictEqual(test_configuration, _outcfg)
 
+    def test_remove_overlap(self):
+        """
+        Tests that remove_overlapping_ids drops pairings built from overlapping regions of the same sequence
+        """
+        df = pd.DataFrame({
+            "id_1": ["a/1-100", "b/1-200", "c/10-100"],
+            "id_2": ["a/2-200", "b/300-400", "d/10-100"],
+            "species": ["A", "B", "C"]
+        })
+
+        df1 = remove_overlapping_ids(df).reset_index(drop=True)
+
+        desired_output = pd.DataFrame({
+            "id_1": ["b/1-200", "c/10-100"],
+            "id_2": ["b/300-400", "d/10-100"],
+            "species": ["B", "C"]
+        })
+
+        pd.testing.assert_frame_equal(df1, desired_output)
+
     def test_describe_concatenation(self):
         """
-        test whether describe_concatenation writes a file with the correct contents
+        Tests whether describe_concatenation writes a file with the correct contents
         """
         outfile = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
@@ -260,9 +286,8 @@ def test_describe_concatenation(self):
 
     def test_write_concatenated_alignment(self):
         """
-        tests whether write_concatenated alignment returns the correct 3
+        Tests whether write_concatenated_alignment returns the correct 3
         alignment objects
-
         """
         alignment_1_file = "{}/concatenate/test_new_monomer_1.fasta".format(TRAVIS_PATH)
         with(open(alignment_1_file)) as inf:
@@ -302,8 +327,7 @@ def _test_aln_equivalence(ali1, ali2):
 
     def test_read_species_annotation_table_uniprot(self):
         """
-        tests whether a uniprot annotation table is read correctly
-
+        Tests whether a uniprot annotation table is read correctly
         """
         annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH)
         annotation_data = read_species_annotation_table(annotation_file_uniprot)
@@ -311,8 +335,7 @@ def test_read_species_annotation_table_uniprot(self):
 
     def test_read_species_annotation_table_uniref(self):
         """
-        tests whether a uniref annotation table is read correctly
-
+        Tests whether a uniref annotation table is read correctly
         """
         _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format(TRAVIS_PATH)
         _annotation_data = read_species_annotation_table(_annotation_file_uniref)
@@ -322,16 +345,14 @@ def test_read_species_annotation_table_uniref(self):
 
     def test_most_similar_by_organism(self):
         """
-        tests whether most_similar_by_organism returns the correct dataframe
-
+        Tests whether most_similar_by_organism returns the correct dataframe
         """
         annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data)
         pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id)
 
     def test_find_paralogs(self):
         """
-        tests whether find_paralogs returns the correct dataframe
-
+        Tests whether find_paralogs returns the correct dataframe
         """
         target_id = "DINJ_ECOLI"
         paralog_table = find_paralogs(target_id, self.annotation_data, self.similarities, 0.9)
@@ -339,8 +360,7 @@ def test_find_paralogs(self):
 
     def test_filter_best_reciprocal(self):
         """
-        tests whether filter_best_reciprocal returns the correct dataframe
-
+        Tests whether filter_best_reciprocal returns the correct dataframe
         """
         alignment_file = "{}/align_1/test_new.a2m".format(TRAVIS_PATH)
         best_recip = pd.read_csv("{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), index_col=0)
@@ -349,8 +369,7 @@ def test_filter_best_reciprocal(self):
 
     def test_get_distance_overlap(self):
         """
-        tests whether get_distance returns 0 for overlapping genes
-
+        Tests whether get_distance returns 0 for overlapping genes
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1400, 1700)
@@ -359,8 +378,7 @@ def test_get_distance_overlap(self):
 
     def test_get_distance_reverse(self):
         """
-        tests whether get distance correctly measures distance of two genes with opposite strand
-
+        Tests whether get_distance correctly measures the distance between two genes on opposite strands
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1800, 1700)
@@ -369,8 +387,7 @@ def test_get_distance_reverse(self):
 
     def test_get_distance_increasing(self):
         """
-        tests whether get_distance correctly measures distance of two genes with same strand
-
+        Tests whether get_distance correctly measures the distance between two genes on the same strand
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1700, 1800)
@@ -379,8 +396,7 @@ def test_get_distance_increasing(self):
 
     def test_best_reciprocal_matching(self):
         """
-        tests whether best_reciprocal_matchin generates the correct pd.DataFrame
-
+        Tests whether best_reciprocal_matching generates the correct pd.DataFrame
         """
         id_pairing = best_reciprocal_matching(self.possible_partners)
         id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"])
@@ -389,8 +405,7 @@ def test_best_reciprocal_matching(self):
 
     def test_find_possible_partners(self):
         """
-        tests whether find_possible partners generates the correct pd.DataFrame
-
+        Tests whether find_possible_partners generates the correct pd.DataFrame
         """
 
         _possible_partners = find_possible_partners(
@@ -408,8 +423,7 @@ def test_find_possible_partners(self):
 
     def test_plot_distance_distribution(self):
         """
-        tests whether plot_distance_distribution generates a non-empty PDF
-
+        Tests whether plot_distance_distribution generates a non-empty PDF
         """
         outfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
         plot_distance_distribution(self.id_pairing, outfile.name)
@@ -418,5 +432,12 @@ def test_plot_distance_distribution(self):
         outfile.close()
         os.unlink(outfile.name)
 
+    def test_map_frequencies(self):
+        """
+        Tests map_frequencies_file
+        """
+        freqs = map_frequencies_file(self.uncorrected_frequency_file, self.outcfg, **self.incfg)
+        pd.testing.assert_frame_equal(freqs, self.freqs)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/TestCouplings.py b/test/TestCouplings.py
new file mode 100644
index 00000000..e2464618
--- /dev/null
+++ b/test/TestCouplings.py
@@ -0,0 +1,139 @@
+"""
+Test cases for the couplings stage of the EVCouplings pipeline
+
+Author:
+    Anna G. Green
+"""
+
+import unittest
+import os
+import tempfile
+import pandas as pd
+import ruamel.yaml as yaml
+from unittest import TestCase
+from evcouplings.couplings.pairs import *
+
+TRAVIS_PATH = "/home/travis/evcouplings_test_cases/complex_test"
+TRAVIS_PATH_MONOMER = "/home/travis/evcouplings_test_cases/monomer_test/couplings/RASH_HUMAN_b03"
+
+#TRAVIS_PATH = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/complex_test"
+#TRAVIS_PATH_MONOMER = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/monomer_test/couplings/RASH_HUMAN_b03"
+class TestCouplings(TestCase):
+
+
+    def __init__(self, *args, **kwargs):
+        super(TestCouplings, self).__init__(*args, **kwargs)
+        self.raw_ec_file = "{}/couplings/test_new_ECs.txt".format(TRAVIS_PATH)
+        self.couplings_file = "{}/couplings/test_new_CouplingScores.csv".format(TRAVIS_PATH)
+        self.ecs = pd.read_csv(self.couplings_file)
+        self.inter_ecs = self.ecs.query("segment_i != segment_j")
+
+        with open("{}/couplings/test_new_couplings.outcfg".format(TRAVIS_PATH)) as inf:
+            self.outcfg = yaml.safe_load(inf)
+
+        self.monomer_couplings = "{}_CouplingScores.csv".format(TRAVIS_PATH_MONOMER)
+        self.enrichment_file = "{}_enrichment.csv".format(TRAVIS_PATH_MONOMER)
+
+    def test_read_raw_ec_file(self):
+        """
+        Tests the sorted and unsorted modes of read_raw_ec_file
+        """
+        ecs = pd.read_csv(self.raw_ec_file, sep=" ", names=["i", "A_i", "j", "A_j", "fn", "cn"])
+        _test_ecs = read_raw_ec_file(self.raw_ec_file, sort=False, score="cn")
+
+        pd.testing.assert_frame_equal(ecs, _test_ecs)
+
+        sorted_ecs = ecs.sort_values(by="cn", ascending=False)
+        _sorted_test_ecs = read_raw_ec_file(self.raw_ec_file, sort=True, score="cn")
+
+        pd.testing.assert_frame_equal(sorted_ecs, _sorted_test_ecs)
+
+    def test_enrichment(self):
+        """
+        Tests the EC enrichment function when the monomer ECs have no segments
+        """
+        enrichment_scores = pd.read_csv(self.enrichment_file)
+        ecs = pd.read_csv(self.monomer_couplings)
+        ecs = ecs.drop(["segment_i", "segment_j"], axis=1)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_enrichment_monomer_segment(self):
+        """
+        Tests the EC enrichment function when the monomer ECs have segments
+        """
+        enrichment_scores = pd.read_csv(self.enrichment_file)
+        enrichment_scores["segment_i"] = "A"
+        enrichment_scores = enrichment_scores[['i', 'A_i', 'segment_i', 'enrichment']]
+
+        ecs = pd.read_csv(self.monomer_couplings)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_enrichment_complex_segment(self):
+        """
+        Tests the EC enrichment function with complex ECs
+        """
+
+        ecs = pd.read_csv(self.couplings_file)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+        enrichment_scores = pd.read_csv(("{0}/couplings/test_new_enrichment.csv".format(TRAVIS_PATH)))
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_add_mixture_probability_skewnormal(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="skewnormal"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_skewnormal.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_normal(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="normal"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_normal.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex_uncorrected(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="evcomplex_uncorrected"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_evc_raw.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex(self):
+        NeffL = self.outcfg["effective_sequences"] / self.outcfg["num_sites"]
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="evcomplex", N_effL=NeffL
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_evc.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex_error(self):
+        """
+        Tests that the EVcomplex score cannot be calculated without a user-supplied N_effL
+        :return:
+        """
+        with self.assertRaises(ValueError):
+            add_mixture_probability(self.ecs, model="evcomplex")
+
+    def test_add_mixture_probability_invalid_selection(self):
+        with self.assertRaises(ValueError):
+            add_mixture_probability(self.ecs, model="fake news")
+
+if __name__ == '__main__':
+    unittest.main()
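For reference, the pairing rule exercised by test_remove_overlap above can be sketched as follows. This is a minimal, hypothetical reimplementation for illustration only, assuming the "name/start-end" identifier format used in the test fixtures; the helper names here are not part of the patch, and the production logic lives in evcouplings' remove_overlapping_ids.

import pandas as pd

def _parse(full_id):
    # "a/1-100" -> ("a", 1, 100); format assumed from the test fixtures above
    seq_id, region = full_id.rsplit("/", 1)
    start, end = (int(x) for x in region.split("-"))
    return seq_id, start, end

def drop_overlapping_pairs(df):
    # Keep a row unless id_1 and id_2 name the same sequence AND their regions overlap
    def keep(row):
        id_1, start_1, end_1 = _parse(row["id_1"])
        id_2, start_2, end_2 = _parse(row["id_2"])
        if id_1 != id_2:
            return True
        return end_1 < start_2 or end_2 < start_1
    return df[df.apply(keep, axis=1)].reset_index(drop=True)

# Applied to the fixture in test_remove_overlap, only the "b/1-200" and "c/10-100"
# rows survive, matching desired_output in the test.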