diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index b1f775da..91b75084 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -7,7 +7,7 @@ # Minimal settings required before this configuration can be executed: # - set your environment, paths to tools and databases (at the end of this file) # - under "global", set prefix -# - under "align_1" and "align_2", set the monomer sequence_id +# - under "align_1" and "align_2", set the monomer sequence_id # - run it! :) # Configuration rules: @@ -29,12 +29,12 @@ stages: - compare - mutate - fold - + # Global job settings. These will override settings of the same name in each of the stages. # These are typically the settings you want to modify for each of your jobs, together with some settings in the align stage. global: # mandatory output prefix of the job (e.g. output/HRAS will store outputs in folder "output", using files prefixed with "HRAS") - prefix: + prefix: # Clustering threshold for downweighting redudant sequences (Meff computation). E.g. 0.8 will cluster sequences # at a 80% sequence identity cutoff @@ -47,20 +47,20 @@ global: align_1: # use complex protocol to properly prepare inputs for concatenation protocol: complex - + # monomer alignment creation protocol to nest within the complex alignment protocol # choose either existing (below) to use a previously created alignment - # or standard to construct an alignment + # or standard to construct an alignment alignment_protocol: standard # Mandatory: specify the sequence identifier # Region can be left blank # Sequence file can be left blank - sequence_id: - region: + sequence_id: + region: sequence_file: - + # The following typically do not need to be set because 'global' overrides them # prefix: # theta: @@ -87,7 +87,7 @@ align_1: # sequence database (specify possible databases and paths in "databases" section below) # note: use uniprot for genome distance based concatenation - database: uniref100 + database: uniprot # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. # To save compute time, this computation is normally carried out in the couplings stage @@ -134,10 +134,10 @@ align_1: # minimum_column_coverage: 70 # extract_annotation: True -# # if using existing alignment protocol, provide a path to the annotations.csv file +# # if using existing alignment protocol, provide a path to the annotations.csv file # # from the monomer run that generated the input alignment # # Needed to correctly find the species identifiers for best hit concatenation -# override_annotation_file: +# override_annotation_file: # Sequence alignment generation/processing for the second monomer. align_2: @@ -148,10 +148,10 @@ align_2: alignment_protocol: standard # Mandatory: specify the sequence identifier and region # Sequence file can be left blank - sequence_id: + sequence_id: region: sequence_file: - + # The following typically do not need to be set because 'global' overrides them # prefix: # theta: @@ -178,7 +178,7 @@ align_2: # sequence database (specify possible databases and paths in "databases" section below) # note: use uniprot for genome distance based concatenation - database: uniref100 + database: uniprot # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. 
# To save compute time, this computation is normally carried out in the couplings stage @@ -224,10 +224,10 @@ align_2: # minimum_sequence_coverage: 50 # minimum_column_coverage: 70 # extract_annotation: True -# # if using existing alignment protocol, provide a path to the annotations.csv file +# # if using existing alignment protocol, provide a path to the annotations.csv file # # from the monomer run that generated the input alignment # # Needed to correctly find the species identifiers for best hit concatenation -# override_annotation_file: +# override_annotation_file: #Generation of concatenated sequence alignment for evolutionary couplings calculation concatenate: @@ -238,31 +238,35 @@ concatenate: second_alignment_file: # Select protocol for concatenation of sequence alignments - # Available protocols: + # Available protocols: # genome_distance: pair sequences that are closest neighbors on the genome # best_hit: for each genome, pair the sequences that have the highest % identity to the target sequence # for best hit protocol, user can set use_best_reciprocal to take the best reciprocal hits only (recommended) protocol: best_hit use_best_reciprocal: true - + # Maximum genome distance in bases allowed between pairs # Required for genome_distance protocol only genome_distance_threshold: 10000 - + # Maximum sequence identity allowed for hits to be designated # as paralogs. Required for best_hit in best reciprocal mode only paralog_identity_threshold: 0.95 - + + # forbid overlapping regions of the same seqeunce ID from being concatenated + # for typical heteromultimeric complexes, this should be true + forbid_overlapping_concatenation: true + # Parameters for filtering of concatenated alignment - + # Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in # the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence # already present in the alignment). If blank, no filtering. If filtering, HHfilter must be installed. seqid_filter: - + # Only keep sequences that align to at least x% of the target sequence (i.e. remove fragments) minimum_sequence_coverage: 50 - + # Only include alignment columns with at least x% residues (rather than gaps) during model inference minimum_column_coverage: 50 @@ -305,12 +309,12 @@ couplings: # Sequence separation filter for generation of CouplingScores_longrange.csv table (i.e. to take out short-range # ECs from table, only pairs with abs(i-j)>=min_sequence_distance will be kept. min_sequence_distance: 6 - + # Parameters specific to complex pipeline scoring # Scoring model to assess confidence in computed ECs # available options: skewnormal, normal, evcomplex scoring_model: skewnormal - + # Specify whether to use all ECs or only inter-molecular ECs for scoring use_all_ecs_for_scoring: False @@ -327,7 +331,7 @@ couplings: compare: # Current options: standard, complex protocol: complex - + # Following parameters will be usually overriden by global settings / output of previous stage prefix: ec_file: @@ -340,28 +344,30 @@ compare: # sequence_id and SIFTS database (sequence_id must be UniProt AC/ID in this case) first_by_alignment: True second_by_alignment: True - # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch - # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
- # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). - # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. + # Alignment method to use to find sequences corresponding to PDB structures. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search using an HMM built from the output monomer alignment (less stringent). + # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. first_pdb_alignment_method: jackhmmer second_pdb_alignment_method: jackhmmer - + # Leave this parameter empty to use all PDB structures for given sequence_id, otherwise # will be limited to the given IDs (single value or list). Important: note that this acts only as a filter on the # structures found by alignment or in the SIFTS table (!) - pdb_ids: + inter_pdb_ids: first_pdb_ids: second_pdb_ids: # Limit number of structures and chains for comparison - # Note - the intersection of the monomer structural hits is taken to find the - # Inter-protein structures. If you limit the number of monomer structures found in this step, - # you may miss some inter-protein structures - first_max_num_structures: 100 - first_max_num_hits: 100 - second_max_num_structures: 100 - second_max_num_hits: 100 + inter_max_num_structures: 10 + inter_max_num_hits: 10 + + # Limit number chains and structures to use for each monomer comparison, IN ADDITION to those found from the + # inter protein compariso + first_max_num_structures: 10 + first_max_num_hits: 10 + second_max_num_structures: 10 + second_max_num_hits: 10 # compare to multimer contacts (if multiple chains of the same sequence or its homologs are present in a structure) first_compare_multimer: True @@ -376,7 +382,7 @@ compare: first_use_bitscores: True first_domain_threshold: 0.5 first_sequence_threshold: 0.5 - + second_sequence_file: second_first_index: second_region: @@ -386,10 +392,10 @@ compare: second_sequence_threshold: 0.5 # Comparison and plotting settings - + # Return an error if we fail to automatically retrieve information about a given pdb id raise_missing: False - + # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances, @@ -406,7 +412,7 @@ compare: plot_probability_cutoffs: [0.90, 0.99] # Plot fixed numbers of inter-protein ECS, and all intra ECs scoring at least as high - # As those inter-protein ECs. + # As those inter-protein ECs. # Use integers only plot_lowest_count: 5 plot_highest_count: 10 @@ -421,7 +427,7 @@ compare: # draw secondary structure on contact map plots draw_secondary_structure: True - + # Settings for Mutation effect predictions mutate: # Options: standard, complex @@ -489,7 +495,7 @@ environment: # command that will be executed before running actual computation (can be used to set up environment) configuration: - + # Paths to databases used by evcouplings. databases: @@ -510,14 +516,22 @@ databases: # Periodically delete these files to more recent versions of SIFTS are used. 
sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta - + # the following two databases are exclusive to EVcomplex and need to be manually downloaded and saved locally # then add the paths to your local copies of the database - # Download urls: + # Download urls: # ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt - # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt + # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt uniprot_to_embl_table: /n/groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt ena_genome_location_table: /n/groups/marks/databases/complexes/ena/2017_02/cds_pro.txt + structurefree_model_file: /n/groups/marks/users/agreen/dev/EVcouplings/evcouplings/compare/aux/residue_strucfree.saved + structureaware_model_file: /n/groups/marks/users/agreen/dev/EVcouplings/evcouplings/compare/aux/residue_strucaware.saved + + complex_strucfree_model_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucfree.saved + complex_strucfree_scaler_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucfree.scaler + complex_strucaware_model_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucaware.saved + complex_strucaware_scaler_file: /n/home/ag300/EVcouplings/compare/aux/complex_strucaware.saved + # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required. tools: @@ -529,4 +543,4 @@ tools: psipred: /n/groups/marks/software/runpsipred cns: /n/groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns maxcluster: /n/groups/marks/pipelines/evcouplings/software/maxcluster64bit - + dssp: /n/groups/marks/software/dssp diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index 084bbd61..67df56b1 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -278,9 +278,9 @@ compare: # print information about used PDB structures on contact map plots print_pdb_information: True - # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch - # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). - # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). + # Alignment method to use to find sequences corresponding to PDB structures. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search using an HMM built from the output monomer alignment (less stringent). # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
pdb_alignment_method: jackhmmer diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 06fc1902..08334f72 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -48,7 +48,7 @@ def _verify_sequence_id(sequence_id): Verify if a target sequence identifier is in proper format for the pipeline to run without errors (not none, and contains no whitespace) - + Parameters ---------- id : str @@ -80,8 +80,8 @@ def _verify_sequence_id(sequence_id): def _make_hmmsearch_raw_fasta(alignment_result, prefix): """ HMMsearch results do not contain the query sequence - so we must construct a raw_fasta file with the query - sequence as the first hit, to ensure proper numbering. + so we must construct a raw_fasta file with the query + sequence as the first hit, to ensure proper numbering. The search result is filtered to only contain the columns with match states to the HMM, which has a one to one mapping to the query sequence. @@ -110,7 +110,7 @@ def _add_gaps_to_query(query_sequence_ali, ali): i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x" ] - # ensure that the length of the match states + # ensure that the length of the match states # match the length of the sequence if len(match_index) != query_sequence_ali.L: raise ValueError( @@ -142,7 +142,7 @@ def _add_gaps_to_query(query_sequence_ali, ali): with open(alignment_result["target_sequence_file"]) as a: query_sequence_ali = Alignment.from_file(a, format="fasta") - # if the provided alignment is empty, just return the target sequence + # if the provided alignment is empty, just return the target sequence raw_focus_alignment_file = prefix + "_raw.fasta" if not valid_file(alignment_result["raw_alignment_file"]): # write the query sequence to a fasta file @@ -162,14 +162,14 @@ def _add_gaps_to_query(query_sequence_ali, ali): "Stockholm alignment {} missing RF" " annotation of match states".format(alignment_result["raw_alignment_file"]) ) - + # add insertions to the query sequence in order to preserve correct # numbering of match sequences gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali) - # write a new alignment file with the query sequence as + # write a new alignment file with the query sequence as # the first entry - + with open(raw_focus_alignment_file, "w") as of: gapped_sequence_ali.write(of) ali.write(of) @@ -1164,9 +1164,9 @@ def hmmbuild_and_search(**kwargs): """ Protocol: - Build HMM from sequence alignment using hmmbuild and + Build HMM from sequence alignment using hmmbuild and search against a sequence database using hmmsearch. - + Parameters ---------- Mandatory kwargs arguments: @@ -1298,7 +1298,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): return focus_fasta_file, target_sequence_file, region_start, region_end - # define the gap threshold for inclusion in HMM's build by HMMbuild. + # define the gap threshold for inclusion in HMM's build by HMMbuild. 
SYMFRAC_HMMBUILD = 0.0 # check for required options @@ -1343,7 +1343,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for hmmsearch - sequence_length = region_end - region_start + 1 + sequence_length = region_end - region_start + 1 seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], @@ -1371,7 +1371,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): seq_threshold=seq_threshold, nobias=kwargs["nobias"], cpu=kwargs["cpu"], - binary=kwargs["hmmsearch"], + binary=kwargs["hmmsearch"], ) # get rid of huge stdout log file immediately @@ -1400,7 +1400,7 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): "hittable_file": ali["domtblout"], } - # convert the raw output alignment to fasta format + # convert the raw output alignment to fasta format # and add the appropriate query sequecne raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix) outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file @@ -1605,13 +1605,13 @@ def complex(**kwargs): annotation_data = pd.read_csv(kwargs["override_annotation_file"]) annotation_data.to_csv(outcfg["annotation_file"]) - # extract cds identifiers for alignment uniprot IDs + # # extract cds identifiers for alignment uniprot IDs cds_ids = extract_cds_ids( outcfg["alignment_file"], kwargs["uniprot_to_embl_table"] ) - # extract genome location information from ENA + ##extract genome location information from ENA genome_location_filename = prefix + "_genome_location.csv" genome_location_table = extract_embl_annotation( diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index 8db22f9b..8a21a225 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -1290,11 +1290,13 @@ def _get_chains(sifts_result): individual_distance_maps = [] # determine which combinations of chains to look at - # (anything that has same PDB identifier) + # (anything that has same PDB identifier)... combis = sifts_result_i.hits.reset_index().merge( sifts_result_j.hits.reset_index(), on="pdb_id", suffixes=("_i", "_j") ) + # but not the same chain (else we get i to i contacts) + combis = combis.query("pdb_chain_i != pdb_chain_j") # extract chains for each subunit chains_i = _get_chains(sifts_result_i) diff --git a/evcouplings/compare/ecs.py b/evcouplings/compare/ecs.py index e3c69a7e..d895b84d 100644 --- a/evcouplings/compare/ecs.py +++ b/evcouplings/compare/ecs.py @@ -76,7 +76,7 @@ def add_precision(ec_table, dist_cutoff=5, score="cn", ec_table = ec_table.sort_values(by=score, ascending=False) if min_sequence_dist is not None: - ec_table = ec_table.query("abs(i - j) >= @min_sequence_dist") + ec_table = ec_table.query("abs(i - j) >= @min_sequence_dist or segment_i != segment_j") ec_table = ec_table.copy() @@ -136,7 +136,7 @@ def coupling_scores_compared(ec_table, dist_map, dist_map_multimer=None, ) if min_sequence_dist is not None: - x = x.query("abs(i - j) >= @min_sequence_dist") + x = x.query("abs(i - j) >= @min_sequence_dist or segment_i != segment_j") # if distance cutoff is given, add precision if dist_cutoff is not None: diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index 1f07002f..f088f095 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -5,7 +5,7 @@ Thomas A. Hopf Anna G. 
Green (complex and _make_complex_contact_maps) """ - +import joblib from copy import deepcopy from math import ceil import pandas as pd @@ -20,7 +20,7 @@ ) from evcouplings.utils.system import ( - create_prefix_folders, insert_dir, verify_resources, + create_prefix_folders, insert_dir, verify_resources, valid_file ) from evcouplings.couplings import Segment from evcouplings.compare.pdb import load_structures @@ -112,21 +112,77 @@ def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR, ) -def _identify_structures(**kwargs): +def complex_probability(ecs, scoring_model, use_all_ecs=False, + score="cn", Neff_over_L=None): """ - Identify set of 3D structures for comparison + Adds confidence measure for complex evolutionary couplings Parameters ---------- - **kwargs - See check_required in code below + ecs : pandas.DataFrame + Table with evolutionary couplings + scoring_model : {"skewnormal", "normal", "evcomplex"} + Use this scoring model to assign EC confidence measure + use_all_ecs : bool, optional (default: False) + If true, fits the scoring model to all ECs; + if false, fits the model to only the inter ECs. + score : str, optional (default: "cn") + Use this score column for confidence assignment Returns ------- - SIFTSResult - Identified structures and residue index mappings + ecs : pandas.DataFrame + EC table with additional column "probability" + containing confidence measure """ + from evcouplings.couplings.pairs import add_mixture_probability + if use_all_ecs: + ecs = add_mixture_probability( + ecs, model=scoring_model + ) + else: + inter_ecs = ecs.query("segment_i != segment_j") + intra_ecs = ecs.query("segment_i == segment_j") + + intra_ecs.assign(scoring_model = np.nan) + + inter_ecs = add_mixture_probability( + inter_ecs, model=scoring_model, score=score, Neff_over_L=Neff_over_L + ) + + ecs = pd.concat( + [intra_ecs, inter_ecs] + ).sort_values( + score, ascending=False + ) + + return ecs + + +def _filter_structures(sifts_map, pdb_ids=None, max_num_hits=None, max_num_structures=None): + + """ + Filters input SIFTSResult for specific pdb ids and/or number of hits + + Parameters + ---------- + sifts_map: SIFTSResult + Identified structures and residue index mappings + pdb_ids: list of str, optional (default: None) + List of PDB ids to be used for comparison + max_num_hits: int, optional (default: None) + Number of PDB hits to be used for comparison. + Different chains from the same PDB count as multiple hits. + max_num_structures: int, optional (default: None) + Number of unique PDB ids to be used for comparison. + Different chains from the same PDB count as one hit. 
+ + Returns + ------- + SIFTSResult + Filtered identified structures and residue index mappings + """ def _filter_by_id(x, id_list): x = deepcopy(x) x.hits = x.hits.loc[ @@ -134,6 +190,43 @@ def _filter_by_id(x, id_list): ] return x + + # filter ID list down to manually selected PDB entries + if pdb_ids is not None: + + # make sure we have a list of PDB IDs + if not isinstance(pdb_ids, list): + pdb_ids = [pdb_ids] + + pdb_ids = [x.lower() for x in pdb_ids] + sifts_map = _filter_by_id(sifts_map, pdb_ids) + + # limit number of hits and structures + if max_num_hits is not None: + sifts_map.hits = sifts_map.hits.iloc[:max_num_hits] + + if max_num_structures is not None: + keep_ids = sifts_map.hits.pdb_id.unique() + keep_ids = keep_ids[:max_num_structures] + sifts_map = _filter_by_id(sifts_map, keep_ids) + + return sifts_map + +def _identify_structures(**kwargs): + """ + Identify set of 3D structures for comparison + + Parameters + ---------- + **kwargs + See check_required in code below + + Returns + ------- + SIFTSResult + Identified structures and residue index mappings + """ + check_required( kwargs, [ @@ -183,28 +276,13 @@ def _filter_by_id(x, id_list): kwargs["sequence_id"], reduce_chains=reduce_chains ) + # Save the pre-filtered SIFTs map sifts_map_full = deepcopy(sifts_map) - # filter ID list down to manually selected PDB entries - if kwargs["pdb_ids"] is not None: - pdb_ids = kwargs["pdb_ids"] - - # make sure we have a list of PDB IDs - if not isinstance(pdb_ids, list): - pdb_ids = [pdb_ids] - - pdb_ids = [x.lower() for x in pdb_ids] - - sifts_map = _filter_by_id(sifts_map, pdb_ids) - - # limit number of hits and structures - if kwargs["max_num_hits"] is not None: - sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]] - - if kwargs["max_num_structures"] is not None: - keep_ids = sifts_map.hits.pdb_id.unique() - keep_ids = keep_ids[:kwargs["max_num_structures"]] - sifts_map = _filter_by_id(sifts_map, keep_ids) + #Filter the SIFTS map + sifts_map = _filter_structures( + sifts_map, kwargs["pdb_ids"], kwargs["max_num_hits"], kwargs["max_num_structures"] + ) return sifts_map, sifts_map_full @@ -295,6 +373,7 @@ def plot_cm(ecs, output_file=None): "draw_secondary_structure" ] ) + prefix = kwargs["prefix"] cm_files = [] @@ -345,43 +424,12 @@ def _discrete_count(x): # give back list of all contact map file names return cm_files - -def _make_complex_contact_maps(ec_table, d_intra_i, d_multimer_i, - d_intra_j, d_multimer_j, - d_inter, first_segment_name, - second_segment_name, **kwargs): - """ - Plot contact maps with all ECs above a certain probability threshold, - or a given count of ECs - - Parameters - ---------- - ec_table : pandas.DataFrame - Full set of evolutionary couplings (all pairs) - d_intra_i, d_intra_j: DistanceMap - Computed residue-residue distances within chains for - monomers i and j - d_multimer_i, d_multimer_j : DistanceMap - Computed residue-residue distances between homomultimeric - chains for monomers i and j - d_inter: DistanceMap - Computed residue-residue distances between heteromultimeric - chains i and j - first_segment_name, second_segment_name: str - Name of segment i and segment j in the ec_table - **kwargs - Further plotting parameters, see check_required in code - for necessary values. 
- - Returns - ------- - cm_files : list(str) - Paths of generated contact map files - """ - - def plot_complex_cm(ecs_i, ecs_j, ecs_inter, - first_segment_name, - second_segment_name, output_file=None): +def plot_complex_cm(ecs_i, ecs_j, ecs_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, output_file, + **kwargs): """ Simple wrapper for contact map plotting """ @@ -404,7 +452,7 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, if len(ecs_inter) == 0: ecs_inter = None - # Currently, we require at least one of the monomer + # Currently, we require at least one of the monomer # to have either ECs or distances in order to make a plot if ((ecs_i is None or ecs_i.empty) and d_intra_i is None and d_multimer_i is None) \ or ((ecs_j is None or ecs_j.empty) and d_intra_j is None and d_multimer_j is None): @@ -421,7 +469,8 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, margin=5, boundaries=kwargs["boundaries"], scale_sizes=kwargs["scale_sizes"], - show_secstruct=kwargs["draw_secondary_structure"] + show_secstruct=kwargs["draw_secondary_structure"], + distance_cutoff=kwargs["distance_cutoff"] ) # Add title to the plot @@ -430,7 +479,7 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, else: ec_len = len(ecs_inter) plt.suptitle( - "{} inter-molecule evolutionary couplings".format(ec_len), + "{} inter-molecule evolutionary couplings".format(ec_len), fontsize=14 ) @@ -441,6 +490,56 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, return True + +def _make_complex_contact_maps(ec_table, d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs): + """ + Plot contact maps with all ECs above a certain probability threshold, + or a given count of ECs + + Parameters + ---------- + ec_table : pandas.DataFrame + Full set of evolutionary couplings (all pairs) + d_intra_i, d_intra_j: DistanceMap + Computed residue-residue distances within chains for + monomers i and j + d_multimer_i, d_multimer_j : DistanceMap + Computed residue-residue distances between homomultimeric + chains for monomers i and j + d_inter: DistanceMap + Computed residue-residue distances between heteromultimeric + chains i and j + first_segment_name, second_segment_name: str + Name of segment i and segment j in the ec_table + **kwargs + Further plotting parameters, see check_required in code + for necessary values. 
+ + Returns + ------- + cm_files : list(str) + Paths of generated contact map files + """ + + # transform fraction of number of sites into discrete number of ECs + def _discrete_count(x): + if isinstance(x, float): + num_sites = 0 + for seg_name in [first_segment_name, second_segment_name]: + num_sites += len( + set.union( + set(ec_table.query("segment_i == @seg_name").i.unique()), + set(ec_table.query("segment_j == @seg_name").j.unique()) + ) + ) + + x = ceil(x * num_sites) + + return int(x) + check_required( kwargs, [ @@ -479,28 +578,15 @@ def plot_complex_cm(ecs_i, ecs_j, ecs_inter, output_file = prefix + "_significant_ECs_{}.pdf".format(c) plot_completed = plot_complex_cm( ec_set_i, ec_set_j, ec_set_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, second_segment_name, output_file=output_file ) if plot_completed: cm_files.append(output_file) - # transform fraction of number of sites into discrete number of ECs - def _discrete_count(x): - if isinstance(x, float): - num_sites = 0 - for seg_name in [first_segment_name, second_segment_name]: - num_sites += len( - set.union( - set(ec_table.query("segment_i == @seg_name").i.unique()), - set(ec_table.query("segment_j == @seg_name").j.unique()) - ) - ) - - x = ceil(x * num_sites) - - return int(x) - # range of plots to make lowest = _discrete_count(kwargs["plot_lowest_count"]) highest = _discrete_count(kwargs["plot_highest_count"]) @@ -875,7 +961,7 @@ def complex(**kwargs): "first_sequence_id", "second_sequence_id", "first_sequence_file", "second_sequence_file", "first_target_sequence_file", "second_target_sequence_file", - "scale_sizes" + "scale_sizes", "structurefree_model_file", "structureaware_model_file" ] ) @@ -885,11 +971,8 @@ def complex(**kwargs): # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", - "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", - - # initialize output inter distancemap files - "distmap_inter": prefix + "_distmap_inter", - "inter_contacts_file": prefix + "_inter_contacts_file" + "ec_compared_inter_file": prefix + "_CouplingsScoresCompared_inter.csv", + "distmap_inter": prefix + "_distmap_inter" } # Add PDB comparison files for first and second monomer @@ -930,36 +1013,74 @@ def complex(**kwargs): # Step 1: Identify 3D structures for comparison def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer - # remove the "prefix" kwargs so that we can replace with the + # any prefix that starts with a name_prefix will overwrite prefixes that do not start + # eg, "first_sequence_file" will overwrite "sequence_file" + monomer_kwargs = deepcopy(kwargs) + for k,v in kwargs.items(): + if name_prefix + "_" in k: + # only replace first occurrence of name_prefix + monomer_kwargs[k.replace(name_prefix + "_", "", 1)] = v + + # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures - # only replace first occurrence of name_prefix monomer_kwargs = { - k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k + k: v for k, v in monomer_kwargs.items() if "prefix" not in k } - # this field needs to be set explicitly else it gets overwritten by concatenated file - monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] - monomer_kwargs["raw_focus_alignment_file"] = 
kwargs[name_prefix + "_raw_focus_alignment_file"] - # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures( **monomer_kwargs, prefix=aux_prefix ) + # save full list of hits + sifts_map_full.hits.to_csv( + outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False + ) + + return outcfg, sifts_map, sifts_map_full + + outcfg, first_sifts_map, first_sifts_map_full = _identify_monomer_structures("first", outcfg, first_aux_prefix) + outcfg, second_sifts_map, second_sifts_map_full = _identify_monomer_structures("second", outcfg, second_aux_prefix) + + # Determine the inter-protein PDB hits based on the full sifts map for each monomer + # initialize output inter distancemap files + inter_protein_hits_full = first_sifts_map_full.hits.merge( + second_sifts_map_full.hits, on="pdb_id", how="inner", suffixes=["_1", "_2"] + ) + outcfg["structure_hits_unfiltered_file"] = prefix + "_inter_structure_hits_unfiltered.csv" + inter_protein_hits_full.to_csv(outcfg["structure_hits_unfiltered_file"]) + + # Filter for the number of PDB ids to use + inter_protein_sifts = SIFTSResult(hits=inter_protein_hits_full, mapping=None) + inter_protein_sifts = _filter_structures( + inter_protein_sifts, + kwargs["inter_pdb_ids"], + kwargs["inter_max_num_hits"], + kwargs["inter_max_num_structures"] + ) + + outcfg["structure_hits_file"] = prefix + "_inter_structure_hits.csv" + inter_protein_sifts.hits.to_csv(outcfg["structure_hits_file"]) + + def _add_inter_pdbs(inter_protein_sifts, sifts_map, sifts_map_full, name_prefix,): + """ + ensures that all pdbs used for inter comparison end up in the monomer SIFTS hits + """ + + lines_to_keep = sifts_map_full.hits.query("pdb_id in @inter_protein_sifts.hits.pdb_id").index + sifts_map.hits = pd.concat([ + sifts_map.hits, sifts_map_full.hits.loc[lines_to_keep, :] + ]).drop_duplicates() + # save selected PDB hits sifts_map.hits.to_csv( outcfg[name_prefix + "_pdb_structure_hits_file"], index=False ) + return sifts_map - # also save full list of hits - sifts_map_full.hits.to_csv( - outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False - ) - return outcfg, sifts_map - - outcfg, first_sifts_map = _identify_monomer_structures("first", outcfg, first_aux_prefix) - outcfg, second_sifts_map = _identify_monomer_structures("second", outcfg, second_aux_prefix) + first_sifts_map = _add_inter_pdbs(inter_protein_sifts, first_sifts_map, first_sifts_map_full, "first") + second_sifts_map = _add_inter_pdbs(inter_protein_sifts, second_sifts_map, second_sifts_map_full, "second") # get the segment names from the kwargs segment_list = kwargs["segments"] @@ -976,6 +1097,17 @@ def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): first_chain_name = Segment.from_list(kwargs["segments"][0]).default_chain_name() second_chain_name = Segment.from_list(kwargs["segments"][1]).default_chain_name() + # load all structures at once + all_ids = set(first_sifts_map.hits.pdb_id).union( + set(second_sifts_map.hits.pdb_id) + ) + + structures = load_structures( + all_ids, + kwargs["pdb_mmtf_dir"], + raise_missing=False + ) + # Step 2: Compute distance maps def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): @@ -1024,7 +1156,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # if we have a multimer contact map, save it if d_multimer is not None: d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"]) - outcfg[name_prefix + "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv" + 
outcfg[name_prefix + "_multimer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts( @@ -1042,8 +1174,8 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): outcfg[name_prefix + "_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( - sifts_map, aux_prefix, seqmap, chain_name=chain_name, - raise_missing=kwargs["raise_missing"] + sifts_map, aux_prefix, None, chain_name=chain_name, + raise_missing=kwargs["raise_missing"], atom_filter=None ).items() } @@ -1057,16 +1189,6 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): return d_intra, d_multimer, seqmap - # load all structures for both monomers - all_structures = set(first_sifts_map.hits.pdb_id).union( - set(second_sifts_map.hits.pdb_id) - ) - structures = load_structures( - all_structures, - kwargs["pdb_mmtf_dir"], - raise_missing=False - ) - d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps( first_sifts_map, "first", first_chain_name ) @@ -1085,6 +1207,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): d_inter.to_file(outcfg["distmap_inter"]) # save contacts to separate file + outcfg["inter_contacts_file"] = prefix + "_inter_contacts.csv" d_inter.contacts( kwargs["distance_cutoff"] ).to_csv( @@ -1137,7 +1260,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): ecs_inter, d_inter, dist_map_multimer=None, dist_cutoff=kwargs["distance_cutoff"], output_file=None, - min_sequence_dist=None # does not apply for inter-protein ECs + min_sequence_dist=None # does not apply for inter-protein ECs ) else: ecs_inter_compared = ecs_inter.assign(dist=np.nan) @@ -1161,7 +1284,8 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # TODO: implement different cutoffs for intra vs inter contacts ec_table_compared = add_precision( ec_table_compared, - dist_cutoff=kwargs["distance_cutoff"] + dist_cutoff=kwargs["distance_cutoff"], + min_sequence_dist=min_seq_dist ) # save to file @@ -1194,7 +1318,7 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): remap_complex_chains( first_sifts_map, second_sifts_map, seqmap_i, seqmap_j, output_prefix=aux_prefix, - raise_missing=kwargs["raise_missing"] + raise_missing=kwargs["raise_missing"], atom_filter=None ).items() } @@ -1209,14 +1333,13 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): return outcfg - # list of available EC comparison protocols PROTOCOLS = { # standard monomer comparison protocol "standard": standard, # comparison for protein complexes - "complex": complex + "complex": complex, } diff --git a/evcouplings/compare/tools.py b/evcouplings/compare/tools.py new file mode 100644 index 00000000..748e2aa7 --- /dev/null +++ b/evcouplings/compare/tools.py @@ -0,0 +1,38 @@ +""" +Wrappers for tools for comparison of evolutionary +couplings to experimental structures + +Authors: + Anna G. 
Green +""" +from evcouplings.utils.system import ( + run, verify_resources +) + +def run_dssp(infile, outfile, binary="dssp"): + """ + Runs DSSP on an input pdb file + + Parameters + ---------- + infile: str + path to input file + outfile: str + path to output file + binary: str + path to DSSP binary + """ + cmd = [ + binary, + "-i", infile, + "-o", outfile + ] + return_code, stdout, stderr = run(cmd, check_returncode=False) + + # verify_resources( + # "DSSP returned empty file: " + # "stdout={} stderr={} file={}".format( + # stdout, stderr, outfile + # ), + # outfile + # ) \ No newline at end of file diff --git a/evcouplings/complex/protocol.py b/evcouplings/complex/protocol.py index 0ec996b9..8481b5e1 100644 --- a/evcouplings/complex/protocol.py +++ b/evcouplings/complex/protocol.py @@ -21,13 +21,14 @@ create_prefix_folders, verify_resources ) from evcouplings.align.protocol import modify_alignment +from evcouplings.align.alignment import parse_header from evcouplings.complex.alignment import ( write_concatenated_alignment ) from evcouplings.complex.distance import ( find_possible_partners, best_reciprocal_matching, - plot_distance_distribution + plot_distance_distribution, get_distance ) from evcouplings.complex.similarity import ( read_species_annotation_table, @@ -35,6 +36,46 @@ filter_best_reciprocal, find_paralogs ) +from evcouplings.couplings.mapping import ( + SegmentIndexMapper, Segment, segment_map_ecs +) + +def remove_overlapping_ids(id_dataframe): + """ + Removes pairs of ids that are from the same sequence + and overlap by at least a certain amount + + Paramters + --------- + id_dataframe: pd.DataFrame + contains columns id_1, id_2 + amount_overlap_allowed: int + threshold of amount overlap allowed between two sequecnes + + Returns + pd.Dataframe + """ + temp_dataframe = id_dataframe.copy() + + id_strings = [parse_header(x) for x in temp_dataframe.id_1] + temp_dataframe["id_string_1"], temp_dataframe["r_s_1"], temp_dataframe["r_e_1"] = \ + zip(*id_strings) + + id_strings = [parse_header(x) for x in temp_dataframe.id_2] + temp_dataframe["id_string_2"], temp_dataframe["r_s_2"], temp_dataframe["r_e_2"] = \ + zip(*id_strings) + + temp_dataframe["overlap"] = None + + for idx, row in temp_dataframe.iterrows(): + temp_dataframe.loc[idx, "overlap"] = get_distance( + (row["r_s_1"], row["r_e_1"]), + (row["r_s_2"], row["r_e_2"]) + ) + + to_keep = temp_dataframe.query("id_string_1 != id_string_2 or overlap > 0") + + return to_keep[id_dataframe.columns] def modify_complex_segments(outcfg, **kwargs): """ @@ -74,6 +115,35 @@ def _modify_segments(seg_list, seg_prefix): return outcfg +def map_frequencies_file(frequencies_file, outcfg, **kwargs): + """ + Renumbers the frequencies file created in complex concatenation + with correct monomer numbering based on segments - otherwise numbering + will be continuous for the concatenated sequence. 
+ + Parameters + ---------- + frequencies_file: str + path to the frequencies file to be renumbered + outcfg : dict + The output configuration + + """ + + frequencies = pd.read_csv(frequencies_file) + + segments = [ + Segment.from_list(s) for s in outcfg["segments"] + ] + seg_mapper = SegmentIndexMapper( + kwargs["first_focus_mode"], outcfg["region_start"], *segments + ) + frequencies = segment_map_ecs( + frequencies, seg_mapper, map_i=True, map_j=False + ) + return frequencies + + def _run_describe_concatenation(outcfg, **kwargs): """ calculate some basic statistics on the concatenated alignment @@ -94,26 +164,26 @@ def describe_concatenation(annotation_file_1, annotation_file_2, genome_location_filename_1, genome_location_filename_2, outfile): """ - Describes properties of concatenated alignment. + Describes properties of concatenated alignment. Writes a csv with the following columns num_seqs_1 : number of sequences in the first monomer alignment num_seqs_2 : number of sequences in the second monomer alignment - num_nonred_species_1 : number of unique species annotations in the + num_nonred_species_1 : number of unique species annotations in the first monomer alignment - num_nonred_species_2 : number of unique species annotations in the + num_nonred_species_2 : number of unique species annotations in the second monomer alignment num_species_overlap: number of unique species found in both alignments - median_num_per_species_1 : median number of paralogs per species in the + median_num_per_species_1 : median number of paralogs per species in the first monomer alignmment - median_num_per_species_2 : median number of paralogs per species in + median_num_per_species_2 : median number of paralogs per species in the second monomer alignment - num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the + num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the first monomer alignment (relevant to distance concatention only) - num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the + num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the first monomer alignment (relevant to distance concatention only) - + Parameters ---------- annotation_file_1 : str @@ -139,11 +209,11 @@ def describe_concatenation(annotation_file_1, annotation_file_2, annotation_file_2 ) species_2 = annotations_2.species.values - + # calculate the number of sequences found in each alignment num_seqs_1 = len(annotations_1) num_seqs_2 = len(annotations_2) - + # calculate the number of species found in each alignment # where a species is defined as a unique OS or Tax annotation field nonredundant_annotations_1 = len(set(species_1)) @@ -154,7 +224,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, set(species_1).intersection(set(species_2)) ) n_species_overlap = len(species_overlap) - + # calculate the median number of paralogs per species n_paralogs_1 = float( # counts the number of times each species occurs in the list @@ -165,7 +235,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, n_paralogs_2 = float( np.median(list(Counter(species_2).values())) ) - + # If the user provided genome location files, calculate the number # of ids for which we found an embl CDS. 
Default value is np.nan embl_cds1 = np.nan @@ -177,11 +247,20 @@ def describe_concatenation(annotation_file_1, annotation_file_2, genome_location_table_1 = pd.read_csv(genome_location_filename_1) genome_location_table_2 = pd.read_csv(genome_location_filename_2) + if not genome_location_table_1.empty: # Number uniprot IDs with EMBL CDS that is not NA - if "uniprot_ac" in genome_location_table_1.columns: embl_cds1 = len(list(set(genome_location_table_1.uniprot_ac))) - if "uniprot_ac" in genome_location_table_2.columns: + else: + embl_cds1 = np.nan + + if not genome_location_table_2.empty: embl_cds2 = len(list(set(genome_location_table_2.uniprot_ac))) + else: + embl_cds2 = np.nan + + else: + embl_cds1 = np.nan + embl_cds2 = np.nan concatenation_data = [ num_seqs_1, @@ -194,7 +273,7 @@ def describe_concatenation(annotation_file_1, annotation_file_2, embl_cds1, embl_cds2, ] - + cols = [ "num_seqs_1", "num_seqs_2", @@ -308,6 +387,10 @@ def genome_distance(**kwargs): id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"] + # filter for overlapping concatenation of same id + if kwargs["forbid_overlapping_concatenation"]: + id_pairing = remove_overlapping_ids(id_pairing) + # write concatenated alignment with distance filtering # TODO: save monomer alignments? target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ @@ -326,11 +409,11 @@ def genome_distance(**kwargs): mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: - mon_ali_1.write(of) + mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: - mon_ali_2.write(of) + mon_ali_2.write(of) # filter the alignment aln_outcfg, _ = modify_alignment( @@ -361,6 +444,11 @@ def genome_distance(**kwargs): outcfg["distance_plot_file"] = prefix + "_distplot.pdf" plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"]) + # Correct the nubering in the frequencies file + map_frequencies_file( + outcfg["frequencies_file"], outcfg, **kwargs + ).to_csv(outcfg["frequencies_file"]) + return outcfg @@ -368,7 +456,7 @@ def best_hit(**kwargs): """ Protocol: - Concatenate alignments based on the best hit + Concatenate alignments based on the best hit to the focus sequence in each species Parameters @@ -404,7 +492,8 @@ def best_hit(**kwargs): "first_segments", "second_segments", "first_identities_file", "second_identities_file", "first_annotation_file", "second_annotation_file", - "use_best_reciprocal", "paralog_identity_threshold" + "use_best_reciprocal", "paralog_identity_threshold", + "forbid_overlapping_concatenation" ] ) @@ -430,7 +519,9 @@ def _load_monomer_info(annotations_file, identities_file, similarities = pd.read_csv(identities_file) # create a pd.DataFrame containing the best hit in each organism - most_similar_in_species = most_similar_by_organism(similarities, annotation_table) + most_similar_in_species = most_similar_by_organism( + similarities, annotation_table + ) if use_best_reciprocal: paralogs = find_paralogs( @@ -463,7 +554,7 @@ def _load_monomer_info(annotations_file, identities_file, kwargs["paralog_identity_threshold"] ) - # merge the two dataframes to get all species found in + # merge the two dataframes to get all species found in # both alignments species_intersection = most_similar_in_species_1.merge( most_similar_in_species_2, @@ -472,6 +563,10 @@ def _load_monomer_info(annotations_file, identities_file, suffixes=("_1", "_2") ) + # filter for 
overlapping concatenation of same id + if kwargs["forbid_overlapping_concatenation"]: + species_intersection = remove_overlapping_ids(species_intersection) + # write concatenated alignment with distance filtering # TODO: save monomer alignments? target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ @@ -504,6 +599,8 @@ def _load_monomer_info(annotations_file, identities_file, **kwargs ) + # remap the + # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target @@ -520,6 +617,11 @@ def _load_monomer_info(annotations_file, identities_file, # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) + # Correct the numbering in the frequencies file + map_frequencies_file( + outcfg["frequencies_file"], outcfg, **kwargs + ).to_csv(outcfg["frequencies_file"]) + return outcfg @@ -529,7 +631,8 @@ def _load_monomer_info(annotations_file, identities_file, "genome_distance": genome_distance, # concatenate based on best hit per genome ("species") - "best_hit": best_hit + "best_hit": best_hit, + } diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index 404b1e1f..07b3375f 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -13,7 +13,6 @@ import pandas as pd import numpy as np - class Segment: """ Represents a continuous stretch of sequence in a sequence @@ -306,16 +305,24 @@ def to_model(self, x): """ return self.__map(x, self.target_to_model) - -def segment_map_ecs(ecs, mapper): +def segment_map_ecs(ecs, mapper, map_i=True, map_j=True): """ Map EC dataframe in model numbering into - segment numbering + segment numbering. + + Can also be used to map dataframes with only a column i (no column j), + such as amino acid frequencies, by using map_i=True and map_j=False Parameters ---------- ecs : pandas.DataFrame EC table (with columns i and j) + mapper : SegmentIndexMapper + Mapper for renumbering the segments + map_i : Boolean, optional (default=True) + Map ecs in column i + map_j : Boolean, optional (default=True) + Map ecs in column j Returns ------- @@ -324,7 +331,7 @@ def segment_map_ecs(ecs, mapper): mapped, and additional columns segment_i and segment_j) """ - ecs = deepcopy(ecs) + ecs = ecs.copy() def _map_column(col): seg_col = "segment_" + col @@ -340,8 +347,10 @@ def _map_column(col): ecs.loc[:, seg_col] = col_m.loc[:, seg_col].values # map both position columns (and add segment id) - _map_column("i") - _map_column("j") + if map_i: + _map_column("i") + if map_j: + _map_column("j") return ecs @@ -374,7 +383,9 @@ def __init__(self, filename, *segments, # initialize the segment index mapper to update model numbering if len(segments) == 0: - raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") + raise ValueError( + "Must provide at least one segment for MultiSegmentCouplingsModel" + ) first_segment = segments[0] index_start = first_segment.region_start diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 2de96d97..79c49d38 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -4,32 +4,33 @@ .. todo:: 1. clean up - 2. add Pompom score - 3. add mapping tools (multidomain, complexes) - 4. ECs to matrix - 5. APC on subsets of positions (e.g. for complexes) + 2. add mapping tools (multidomain, complexes) + 3. ECs to matrix + 4. APC on subsets of positions (e.g. 
for complexes) Authors: Thomas A. Hopf Agnes Toth-Petroczy (original mixture model code) John Ingraham (skew normal mixture model) - Anna G. Green (EVComplex Score code) + Anna G. Green (EVcomplex Score code, segment-aware enrichment calculation) """ -from math import ceil -from copy import deepcopy -from pkg_resources import resource_filename - import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.optimize as op + +from math import ceil, sqrt +from copy import deepcopy +from pkg_resources import resource_filename + from scipy import stats from sklearn.linear_model import LogisticRegression from evcouplings.utils.calculations import median_absolute_deviation from evcouplings.utils.config import read_config_file +DEFAULT_SEGMENT_NAME = "A_1" def read_raw_ec_file(filename, sort=True, score="cn"): """ @@ -70,14 +71,14 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): Calculate EC "enrichment" as first described in Hopf et al., Cell, 2012. - .. todo:: - - Make this handle segments if they are in EC table Parameters ---------- ecs : pd.DataFrame Dataframe containing couplings + segment_name: str, optional (default: "A_1") + Value to serve as placeholder name for segment, in the + case of no segments found in EC dataframe num_pairs : int or float, optional (default: 1.0) Number of ECs to use for enrichment calculation. - If float, will be interpreted as fraction of the @@ -96,37 +97,55 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): Sorted table with enrichment values for each position in the sequence """ - # determine how many positions ECs are over - pos = set(ecs.i.unique()) | set(ecs.j.unique()) - num_pos = len(pos) + + # check if the provided table has segments... + has_segments = "segment_i" in ecs.columns and "segment_j" in ecs.columns + + # ...and if not, create them (will be deleted before return) + if not has_segments: + ecs = ecs.assign({ + "segment_i": DEFAULT_SEGMENT_NAME, + "segment_j": DEFAULT_SEGMENT_NAME + }) + + + # stack dataframe so it contains each + # EC twice as forward and backward pairs + # (i, j) and (j, i) + flipped = ecs.rename( + columns={ + "i": "j", + "j": "i", + "A_i": "A_j", + "A_j": "A_i", + "segment_i": "segment_j", + "segment_j": "segment_i" + } + ) + + stacked_ecs = ecs.append(flipped) + + # determine how many positions ECs are over using the combined dataframe + num_pos = len(stacked_ecs.groupby(["i", "A_i", "segment_i"])) # calculate absolute number of pairs if # fraction of length is given if isinstance(num_pairs, float): num_pairs = int(ceil(num_pairs * num_pos)) - # get longrange ECs and sort by score - sorted_ecs = ecs.query( + # sort the stacked ECs + stacked_sorted_ecs = stacked_ecs.query( "abs(i-j) >= {}".format(min_seqdist) ).sort_values( by=score, ascending=False ) - # select top EC pairs - top_ecs = sorted_ecs.iloc[0:num_pairs] + # take the top num * 2 (because each EC represented twice) + top_ecs = stacked_sorted_ecs[0:num_pairs * 2] - # stack dataframe so it contains each - # EC twice as forward and backward pairs - # (i, j) and (j, i) - flipped = top_ecs.rename( - columns={"i": "j", "j": "i", "A_i": "A_j", "A_j": "A_i"} - ) - - stacked_ecs = top_ecs.append(flipped) - - # now sum cumulative strength of EC for each position + # calculate sum of EC scores for each position ec_sums = pd.DataFrame( - stacked_ecs.groupby(["i", "A_i"]).sum() + top_ecs.groupby(["i", "A_i", "segment_i"]).sum() ) # average EC strength for top ECs @@ -136,7 +155,12 @@ def enrichment(ecs, num_pairs=1.0, 
score="cn", min_seqdist=6): # an individual position exceeds average strength in top ec_sums.loc[:, "enrichment"] = ec_sums.loc[:, score] / avg_degree - e = ec_sums.reset_index().loc[:, ["i", "A_i", "enrichment"]] + # if the ecs had segment information, return a segment column + if has_segments: + e = ec_sums.reset_index().loc[:, ["i", "A_i", "segment_i", "enrichment"]] + else: + e = ec_sums.reset_index().loc[:, ["i", "A_i", "enrichment"]] + return e.sort_values(by="enrichment", ascending=False) @@ -644,11 +668,8 @@ class EVComplexScoreModel: Assign to each EC score a (unnormalized) EVcomplex score as described in Hopf, Schärfe et al. (2014). - TODO: this implementation currently does not take into account - score normalization for the number of sequences and length of - the model """ - def __init__(self, x): + def __init__(self, x, Neff_over_L=None): """ Initialize EVcomplex score model @@ -656,10 +677,14 @@ def __init__(self, x): ---------- x : np.array (or list-like) EC scores from which to infer the mixture model + Neff_over_L : float (optional, default None) + Effective number of sequences divided by L (number + of sites in model) """ self.x = np.array(x) + self.Neff_over_L = Neff_over_L - def probability(self, x, plot=False): + def probability(self, x): """ Calculates evcomplex score as cn_score / min_cn_score. TODO: plotting functionality not yet implemented @@ -679,11 +704,19 @@ def probability(self, x, plot=False): # Calculate the minimum score min_score = abs(np.min(self.x)) - return x / min_score + raw_evcomplex = x / min_score + + # if we have Neff_over_L, use it to calculate the corrected score + if self.Neff_over_L: + return raw_evcomplex / (1 + np.power(self.Neff_over_L, -0.5)) + + # else return the raw score + else: + return raw_evcomplex def add_mixture_probability(ecs, model="skewnormal", score="cn", - clamp_mu=False, plot=False): + clamp_mu=False, plot=False, Neff_over_L=None): """ Add lognormal mixture model probability to EC table. @@ -691,9 +724,10 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", ---------- ecs : pd.DataFrame EC table with scores - model : {"skewnormal", "normal"}, optional (default: skewnormal) + model : {"skewnormal", "normal", "evcomplex", "evcomplex_uncorrected"}, optional (default: skewnormal) Use model with skew-normal or normal distribution - for the noise component of mixture model + for the noise component of mixture model. 
+ For complexes, apply evcomplex or evcomplex_uncorrected scoring score : str, optional (default: "cn") Score on which mixture model will be based clamp_mu : bool, optional (default: False) @@ -701,6 +735,9 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", fitting it based on data plot : bool, optional (default: False) Plot score distribution and probabilities + Neff_over_L : float, optional (default: None) + Effective number of sequences divided by model length, needed for corrected EVcomplex + score calculation (Hopf et al., 2014 Elife) Returns ------- @@ -717,17 +754,26 @@ def add_mixture_probability(ecs, model="skewnormal", score="cn", mm = ScoreMixtureModel(ecs.loc[:, score].values) elif model == "normal": mm = LegacyScoreMixtureModel(ecs.loc[:, score].values, clamp_mu) - elif model == "evcomplex": + elif model == "evcomplex_uncorrected": mm = EVComplexScoreModel(ecs.loc[:, score].values) + elif model == "evcomplex": + if Neff_over_L is None: + raise ValueError( + "must provide Neff over L for EVcomplex score calculation, " + "or select model evcomplex_uncorrected" + ) + mm = EVComplexScoreModel(ecs.loc[:, score].values, Neff_over_L) + else: raise ValueError( "Invalid model selection, valid options are: " - "skewnormal, normal, evcomplex" + "skewnormal, normal\n" + "Complexes only: evcomplex, evcomplex_uncorrected" ) # assign probability ec_prob.loc[:, "probability"] = mm.probability( - ec_prob.loc[:, score].values, plot=plot + ec_prob.loc[:, score].values ) return ec_prob diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index 54fe1851..71ae55b1 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -10,6 +10,7 @@ import string import pandas as pd import numpy as np +from itertools import combinations from evcouplings.couplings import tools as ct from evcouplings.couplings import pairs, mapping @@ -52,18 +53,21 @@ "evcomplex", ) +# Define default segment names for complexes +FIRST_SEGMENT_NAME = "A_1" +SECOND_SEGMENT_NAME = "B_1" def infer_plmc(**kwargs): """ Run EC computation on alignment. This function contains the functionality shared between monomer and complex EC inference. - + Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required - + Returns ------- outcfg : dict @@ -430,7 +434,7 @@ def standard(**kwargs): def complex_probability(ecs, scoring_model, use_all_ecs=False, - score="cn"): + n_effective=None, num_sites=None, score="cn"): """ Adds confidence measure for complex evolutionary couplings @@ -443,33 +447,52 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, use_all_ecs : bool, optional (default: False) If true, fits the scoring model to all ECs; if false, fits the model to only the inter ECs. + n_effective : float, optional (default: None) + Effective number of sequences in alignment. + Used for EVcomplex score calculation + num_sites : int, optional (default: None) + Number of positions in alignment score : str, optional (default: "cn") Use this score column for confidence assignment - + Returns ------- ecs : pandas.DataFrame EC table with additional column "probability" containing confidence measure """ + + # if user provided both parameters, calculate n_effective / length + if n_effective and num_sites: + neff_over_l = n_effective / num_sites + else: + neff_over_l = None + if use_all_ecs: + # TODO: user proof so that no one runs evcomplex on all ECs? 
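+            # fit the selected scoring model to the full EC table
+            # (intra- and inter-protein ECs together)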
ecs = pairs.add_mixture_proability( ecs, model=scoring_model ) + else: inter_ecs = ecs.query("segment_i != segment_j") - intra_ecs = ecs.query("segment_i == segment_j") + intra1_ecs = ecs.query("segment_i == segment_j == @FIRST_SEGMENT_NAME") + intra2_ecs = ecs.query("segment_i == segment_j == @SECOND_SEGMENT_NAME") + + intra1_ecs = pairs.add_mixture_probability( + intra1_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l + ) - intra_ecs = pairs.add_mixture_probability( - intra_ecs, model=scoring_model, score=score + intra2_ecs = pairs.add_mixture_probability( + intra2_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l ) inter_ecs = pairs.add_mixture_probability( - inter_ecs, model=scoring_model, score=score + inter_ecs, model=scoring_model, score=score, Neff_over_L=neff_over_l ) ecs = pd.concat( - [intra_ecs, inter_ecs] + [intra1_ecs, intra2_ecs, inter_ecs] ).sort_values( score, ascending=False ) @@ -477,6 +500,97 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, return ecs +def by_segment_ec_enrichment(ecs, outcfg, **kwargs): + """ + Calculates enrichment within all segments and between all segments. + + Calculates within-segment enrichment using intra-protein ECs only and + between segment enrichment using inter-protein ECs only. + + Parameters + ---------- + ecs: pd.DataFrame + EC table + outcfg: dict + Output configuration of the pipeline + mandatory kwargs arguments: + prefix + + Returns + ------- + dict + Output configuration of the pipeline, including + the following new fields: + enrichment_inter_file + enrichment_intra_file + pd.DataFrame + Intra-molecule EC enrichment (all molecules included, indexed by segment) + pd.DataFrame + Inter-molecule EC enrichment (all molecules included, indexed by segment) + """ + + # determine the number of segments in the ECs + has_segments = "segment_i" in ecs.columns and "segment_j" in ecs.columns + + if not has_segments: + ecs.loc[:,"segment_i"] = FIRST_SEGMENT_NAME + ecs.loc[:,"segment_j"] = FIRST_SEGMENT_NAME + + # get a list of unique segments + segments = list( + set(ecs.segment_i.unique()).union( + set(ecs.segment_j.unique())) + ) + + # Calculate all the intra-segment enrichments + intra_enrichment_dfs = [] + for segment in segments: + intra_ecs = ecs.query( + "segment_i == segment_j == @segment" + ) + intra_enrichment = pairs.enrichment( + intra_ecs, min_seqdist=kwargs["min_sequence_distance"] + ) + intra_enrichment.loc[:,"segment_i"] = segment + + intra_enrichment_dfs.append(intra_enrichment) + + all_intra_enrichment = pd.concat(intra_enrichment_dfs) + + # Calculate the inter-segment enrichments + inter_enrichment_dfs = [] + for (segment1, segment2) in combinations(segments, 2): + + # Agnostic to ordering of segments + inter_ecs1 = ecs.query( + "segment_i == @segment1 and segment_j == @segment2" + ) + inter_ecs2 = ecs.query( + "segment_i == @segment2 and segment_j == @segment1" + ) + inter_ecs = pd.concat([inter_ecs1, inter_ecs2]) + + # for inter ECs, sequence distance is 0 + inter_enrichment = pairs.enrichment(inter_ecs, min_seqdist=0) + inter_enrichment_dfs.append(inter_enrichment) + + all_inter_enrichment = pd.concat(inter_enrichment_dfs) + + # Save to a file + outcfg["enrichment_intra_file"] = "{}_enrichment_intra.csv".format( + kwargs["prefix"] + ) + all_intra_enrichment.to_csv(outcfg["enrichment_intra_file"], index=None) + + if len(all_inter_enrichment) > 0: + outcfg["enrichment_inter_file"] = "{}_enrichment_inter.csv".format( + kwargs["prefix"] + ) + 
inter_enrichment.to_csv(outcfg["enrichment_inter_file"], index=None) + + return outcfg, all_intra_enrichment, all_inter_enrichment + + def complex(**kwargs): """ Protocol: @@ -531,7 +645,7 @@ def complex(**kwargs): use_all_ecs = False ecs = complex_probability( - ecs, kwargs["scoring_model"], use_all_ecs + ecs, kwargs["scoring_model"], use_all_ecs, outcfg["effective_sequences"], outcfg["num_sites"] ) else: @@ -552,17 +666,18 @@ def complex(**kwargs): ) ) + is_single_segment = segments is None or len(segments) == 1 outcfg = { **outcfg, **_postprocess_inference( ecs, kwargs, model, outcfg, prefix, - generate_line_plot=True, - generate_enrichment=False, + generate_line_plot=is_single_segment, + generate_enrichment=is_single_segment, ec_filter="segment_i != segment_j or abs(i - j) >= {}", chain=chain_mapping ) } - + # save just the inter protein ECs ## TODO: eventually have this accomplished by _postprocess_inference ## right now avoiding a second call with a different ec_filter @@ -575,12 +690,7 @@ def complex(**kwargs): write_config_file(prefix + ".couplings_complex.outcfg", outcfg) # TODO: make the following complex-ready - # EC enrichment: - # - # 1) think about making EC enrichment complex-ready and add - # it back here - so far it only makes sense if all ECs are - # on one segment - # + # EVzoom: # # 1) at the moment, EVzoom will use numbering before remapping @@ -594,47 +704,34 @@ def complex(**kwargs): return outcfg -def mean_field(**kwargs): - """ - Protocol: - - Infer ECs from alignment using mean field direct coupling analysis. - - For now, mean field DCA can only be run in focus mode, gaps - included. - - Parameters - ---------- - Mandatory kwargs arguments: - See list below in code where calling check_required. - - Returns - ------- - outcfg : dict - Output configuration of the pipeline, including - the following fields: - - * raw_ec_file - * model_file - * num_sites - * num_sequences - * effective_sequences - - * focus_mode (passed through) - * focus_sequence (passed through) - * segments (passed through) +def infer_mean_field(**kwargs): """ - check_required( - kwargs, - [ - "prefix", "alignment_file", "segments", - "focus_mode", "focus_sequence", "theta", - "pseudo_count", "alphabet", - "min_sequence_distance", # "save_model", - "ec_score_type", - ] - ) - + Run mean field EC computation on alignment. This function + contains the functionality shared between monomer and complex + EC inference. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + raw_ec_file + model_file + num_sites + num_valid_sequences + effective_sequences + + focus_mode (passed through) + focus_sequence (passed through) + segments (passed through) + + """ if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." @@ -642,24 +739,15 @@ def mean_field(**kwargs): prefix = kwargs["prefix"] - # option to save model disabled - """ - if kwargs["save_model"]: - model = prefix + ".model" - else: - model = None - """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", - # TODO: the following are passed through stage... - # keep this or unnecessary? 
"focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], - "segments": kwargs["segments"], + "segments": kwargs["segments"] } # make sure input alignment exists @@ -672,12 +760,6 @@ def mean_field(**kwargs): # make sure output directory exists create_prefix_folders(prefix) - segments = kwargs["segments"] - if segments is not None: - segments = [ - mapping.Segment.from_list(s) for s in segments - ] - # determine alphabet # default is protein if kwargs["alphabet"] is None: @@ -725,6 +807,195 @@ def mean_field(**kwargs): "region_start": int(model.index_list[0]), }) + return outcfg, model + + +def complex_mean_field(**kwargs): + """ + Protocol: + + Infer ECs for protein complexes from alignment using mean field. + Allows user to select scoring protocol. + + For now, mean field DCA can only be run in focus mode, gaps + included. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required. + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + raw_ec_file + model_file + num_sites + num_sequences + effective_sequences + + focus_mode (passed through) + focus_sequence (passed through) + segments (passed through) + """ + # for additional required parameters, see mean field info + check_required( + kwargs, + [ + "prefix", "min_sequence_distance", + "scoring_model", "use_all_ecs_for_scoring", + "alignment_file","segments","focus_mode", + "focus_sequence","theta","pseudo_count", + "alphabet" + ] + ) + + # Run the mean field inference + outcfg, model = infer_mean_field(**kwargs) + prefix = kwargs["prefix"] + + # read and sort ECs + # Note: this now deviates from the original EC format + # file because it has 4 score columns to accomodate + # MI (raw), MI (APC-corrected), DI, CN; + ecs = pd.read_csv( + outcfg["raw_ec_file"], sep=" ", + names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"] + ) + + # If the ECs have segments, remap + segments = kwargs["segments"] + if segments is not None: + segments = [ + mapping.Segment.from_list(s) for s in segments + ] + + seg_mapper = mapping.SegmentIndexMapper( + kwargs["focus_mode"], kwargs["region_start"], *segments + ) + + # apply to EC table + ecs = mapping.segment_map_ecs(ecs,seg_mapper) + + # add mixture model probability + if kwargs["scoring_model"] in SCORING_MODELS: + if kwargs["use_all_ecs_for_scoring"] is not None: + use_all_ecs = kwargs["use_all_ecs_for_scoring"] + else: + use_all_ecs = False + + ecs = complex_probability( + ecs, kwargs["scoring_model"], use_all_ecs + ) + + else: + raise InvalidParameterError( + "Invalid scoring_model parameter: " + + "{}. Valid options are: {}".format( + kwargs["protocol"], ", ".join(SCORING_MODELS) + ) + ) + + # create line-drawing script (for multiple chains) + # by convention, we map first segment to chain A, + # second to B, a.s.f. 
+ chain_mapping = dict( + zip( + [s.segment_id for s in segments], + string.ascii_uppercase, + ) + ) + + is_single_segment = segments is None or len(segments) == 1 + outcfg = { + **outcfg, + **_postprocess_inference( + ecs, kwargs, model, outcfg, prefix, + generate_line_plot=is_single_segment, + generate_enrichment=is_single_segment, + ec_filter="segment_i != segment_j or abs(i - j) >= {}", + chain=chain_mapping + ) + } + + # save just the inter protein ECs + ## TODO: eventually have this accomplished by _postprocess_inference + ## right now avoiding a second call with a different ec_filter + ecs = pd.read_csv(outcfg["ec_file"]) + outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv" + inter_ecs = ecs.query("segment_i != segment_j") + inter_ecs.to_csv(outcfg["inter_ec_file"], index=False) + + # dump output config to YAML file for debugging/logging + write_config_file(prefix + ".couplings_complex.outcfg", outcfg) + + # TODO: make the following complex-ready + # EVzoom: + # + # 1) at the moment, EVzoom will use numbering before remapping + # we should eventually get this to a point where segments + residue + # index are displayed on EVzoom + # + # 2) note that this will currently use the default mixture model + # selection for determining the EC cutoff, rather than the selection + # used for the EC table above + + return outcfg + + +def mean_field(**kwargs): + """ + Protocol: + + Infer ECs from alignment using mean field direct coupling analysis. + + For now, mean field DCA can only be run in focus mode, gaps + included. + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required. + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + * raw_ec_file + * model_file + * num_sites + * num_sequences + * effective_sequences + + * focus_mode (passed through) + * focus_sequence (passed through) + * segments (passed through) + """ + check_required( + kwargs, + [ + "prefix", "alignment_file", "segments", + "focus_mode", "focus_sequence", "theta", + "pseudo_count", "alphabet", + "min_sequence_distance", # "save_model", + "ec_score_type", + ] + ) + + outcfg, model = infer_mean_field(**kwargs) + + # establish segments for model + segments = kwargs["segments"] + if segments is not None: + segments = [ + mapping.Segment.from_list(s) for s in segments + ] + # read and sort ECs # Note: this now deviates from the original EC format # file because it has 4 score columns to accomodate @@ -870,27 +1141,29 @@ def _postprocess_inference(ecs, kwargs, model, outcfg, prefix, generate_line_plo # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) - if generate_enrichment: - ext_outcfg["enrichment_file"] = prefix + "_enrichment.csv" - min_seqdist = kwargs["min_sequence_distance"] - if min_seqdist is None: - min_seqdist = 0 + min_seqdist = kwargs["min_sequence_distance"] + if min_seqdist is None: + min_seqdist = 0 - ecs_enriched = pairs.enrichment( - ecs, score=score, min_seqdist=min_seqdist - ) - ecs_enriched.to_csv(ext_outcfg["enrichment_file"], index=False) + ## TODO: need to make by_segment_ec_enrichment compatible with new enrichment refactoring + ## TODO: MAKE SURE this computes all needed output files - discrepancy with new release as of Oct 2020 + # enrichment refactoring + outcfg, intra_ecs_enriched, _ = by_segment_ec_enrichment(ecs, outcfg, **kwargs) + ext_outcfg["enrichment_file"] = prefix + "_enrichment.csv" + 
intra_ecs_enriched.to_csv(ext_outcfg["enrichment_file"], index=False) + for segment, ecs_enriched in intra_ecs_enriched.groupby("segment_i"): # create corresponding enrichment pymol scripts ext_outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ - (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") + (True, f"{segment}_enrichment_spheres.pml"), (False, f"{segment}_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) ext_outcfg["enrichment_pml_files"].append(pml_file) + # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not None: ext_outcfg["evzoom_file"] = prefix + "_evzoom.json" @@ -928,6 +1201,9 @@ def _postprocess_inference(ecs, kwargs, model, outcfg, prefix, generate_line_plo # inference protocol using mean field approximation "mean_field": mean_field, + + # runs DCA/meanfield for complexes + "complex_mean_field": complex_mean_field } diff --git a/evcouplings/score_interaction/__init__.py b/evcouplings/score_interaction/__init__.py new file mode 100644 index 00000000..2126dd1d --- /dev/null +++ b/evcouplings/score_interaction/__init__.py @@ -0,0 +1,3 @@ +from evcouplings.score_interaction.asa import * +from evcouplings.score_interaction.protocol import * + diff --git a/evcouplings/score_interaction/asa.py b/evcouplings/score_interaction/asa.py new file mode 100644 index 00000000..f3cf380d --- /dev/null +++ b/evcouplings/score_interaction/asa.py @@ -0,0 +1,204 @@ +import pandas as pd +import numpy as np +from evcouplings.utils.system import verify_resources, valid_file +from evcouplings.compare.tools import run_dssp +from evcouplings.utils.constants import AA_SURFACE_AREA +from Bio.PDB import make_dssp_dict + +def read_dssp_output(filename): + """ + Reads the output files from DSSP and converts them into a pandas DataFrame + + Parameters + ---------- + filename: str + Path to output file from DSSP + + Returns + ------- + pd.DataFrame with columns i, res, asa + Representing residue number, identity, and accesisble surface area + """ + + if not valid_file(filename): + return pd.DataFrame({ + "i": [], + "res": [], + "asa": [] + }) + + dssp_dict, _ = make_dssp_dict(filename) + data = [] + for key, value in dssp_dict.items(): + # keys are formatted as (chain, ("", i, "")) + chain, (_, i, inscode) = key + + res = value[0] + asa = value[2] + + data.append({ + "i": i, + "res": res, + "asa": asa + }) + + return pd.DataFrame(data) + + +def calculate_rsa(dataframe, aa_surface_area=None, output_column="rsa"): + """ + Converts raw accessible surface area to relative accessible surface area, + by dividing the raw accessible surface area by the max accessible surface area + + Parameters + ---------- + dataframe: pd.DataFrame + Dataframe of raw accessible surface area + AA_SURFACE_AREA: dict of str: numeric + Values of max accessible surface area to use for conversion + output_column: str + Name of output column to create + + Returns + ------- + pd.DataFrame + """ + if aa_surface_area is None: + aa_surface_area = AA_SURFACE_AREA + + modified_dataframe = dataframe.copy() + modified_dataframe.loc[:, output_column] = [x.asa/aa_surface_area[x.res] for _, x in modified_dataframe.iterrows()] + return modified_dataframe + + +def asa_run(pdb_file, dssp_output_file, rsa_output_file, dssp_binary): + """ + Paramters + --------- + file: str + path to pdb file on which to run DSSP + dssp_output_file: str + path to save dssp file + rsa_output_file: str + path to 
save rsa output file + dssp_binary: str + path to dssp binary + + Returns + ------- + pd.DataFrame with relative accessible surface area for each position in PDB + """ + + verify_resources( + "PDB file input to DSSP does not exist", + pdb_file + ) + + run_dssp(pdb_file, dssp_output_file, dssp_binary) + + d = read_dssp_output(dssp_output_file) + d = calculate_rsa(d, AA_SURFACE_AREA) + + return d + + +def combine_asa(remapped_pdb_files, dssp_binary, prefix, outcfg): + """ + Parameters + ---------- + remapped_pdb_files: list of str + path to all remapped pdb files to be analyzed + prefix: str + path to DSSP binary + outcfg: dict + output configuration + """ + + outcfg["dssp_output_files"] = [] + outcfg["rsa_output_files"] = [] + + # If no remapped pdb files, return empty df + if remapped_pdb_files is None or len(remapped_pdb_files) == 0: + return pd.DataFrame({ + "i": [], + "mean": [], + "max": [], + "min": [] + }), outcfg + + # run dssp for each remapped_pdb_file + d_list = [] + for file in remapped_pdb_files: + if valid_file(file): + + # prepare output file paths + file_path_prefix = file.rsplit(".pdb", maxsplit=1)[0] + pdb_prefix = file_path_prefix.rsplit("/", maxsplit=1)[-1] + + dssp_output_file = prefix + pdb_prefix + ".dssp" + rsa_output_file = prefix + pdb_prefix + "_rsa.csv" + + d = asa_run(file, dssp_output_file, rsa_output_file, dssp_binary) + + # add information to combined dataframe + d_list.append(d) + + # save the output files + outcfg["dssp_output_files"].append(dssp_output_file) + outcfg["rsa_output_files"].append(rsa_output_file) + + #print(d_list) + data = pd.concat(d_list) + + # group the dataframe of RSA by residue + means = data.groupby("i").rsa.mean() + maxes = data.groupby("i").rsa.max() + mins = data.groupby("i").rsa.min() + + return pd.DataFrame({ + "i": means.index, + "mean": list(means), + "max": list(maxes), + "min": list(mins) + }), outcfg + + +def add_asa(ec_df, asa, asa_column): + """ + Add a column for the accessible surface area for each residue i and j to a DataFrame + + Parameters + ---------- + ec_df: pd.DataFrame + dataframe with columns i, j, segment_i, and segment_j + asa: pd.DataFrame + dataframe containing accessible surface area information + asa_column: str + name of column in asa df to use + + Returns + ------- + pd.DataFrame + """ + # make a dictionary of residue and segment pointing to accesible surface area value + s_to_e = { + (x, y): z for x, y, z in zip( + asa.i, asa.segment_i, asa[asa_column] + ) + } + + # Add the accesible surface area for position i + ec_df["asa_i"] =[ + s_to_e[(x, y)] if (x, y) in s_to_e else np.nan for x, y in zip( + ec_df.i, ec_df.segment_i + ) + ] + + # Add the accessible surface area for position j + ec_df["asa_j"] =[ + s_to_e[(x, y)] if (x, y) in s_to_e else np.nan for x, y in zip( + ec_df.j, ec_df.segment_j + ) + ] + + return ec_df diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved new file mode 100644 index 00000000..d7666adf Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler new file mode 100644 index 00000000..8eed199e Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucaware.scaler differ diff --git 
a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved new file mode 100644 index 00000000..717711fd Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler new file mode 100644 index 00000000..16f653d8 Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__complex_strucfree.scaler differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved b/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved new file mode 100644 index 00000000..a45806e8 Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__nostruc.saved differ diff --git a/evcouplings/score_interaction/aux/20200717_frozen__struc.saved b/evcouplings/score_interaction/aux/20200717_frozen__struc.saved new file mode 100644 index 00000000..e861249d Binary files /dev/null and b/evcouplings/score_interaction/aux/20200717_frozen__struc.saved differ diff --git a/evcouplings/score_interaction/protocol.py b/evcouplings/score_interaction/protocol.py new file mode 100644 index 00000000..dc0fcd45 --- /dev/null +++ b/evcouplings/score_interaction/protocol.py @@ -0,0 +1,532 @@ +""" +Fix EVcomplex2 models to inter-protein ECs + +Authors: + Anna G. Green +""" +import joblib +import pandas as pd +import numpy as np + +from evcouplings.utils.config import ( + check_required, InvalidParameterError +) + +from evcouplings.utils.system import ( + valid_file +) +from evcouplings.couplings import enrichment, Segment +from evcouplings.score_interaction.asa import combine_asa, add_asa +from evcouplings.compare.distances import DistanceMap +from evcouplings.compare.protocol import complex_probability, plot_complex_cm +from evcouplings.align import ALPHABET_PROTEIN +from evcouplings.utils.constants import HYDROPATHY_INDEX + +X_STRUCFREE = [ + "evcomplex_normed", + "conservation_max", + "intra_enrich_max", + "inter_relative_rank_longrange" + +] +X_STRUCAWARE = [ + "evcomplex_normed", + "asa_min", + "precision", + "conservation_max", + "intra_enrich_max", + "inter_relative_rank_longrange", +] + +X_COMPLEX_STRUCFREE = [0, 2] + +X_COMPLEX_STRUCAWARE = [0, 3, 4, 7] + + +def fit_model(data, model_file, X, column_name): + """ + Fits a model to predict p(residue interaction) + + data: pd.DataFrame + has columns X used as features in model + model_file: str + path to file containing joblib dumped model (here, an sklearn logistic regression) + X: list of str + the columns to be input as features to the model. 
N.B., MUST be in the same order + as when the model was originally fit + column_name: str + name of column to create + + Returns + pd.DataFrame of ECs with new column column_name containing the fit model, + or np.nan if the model could not be fit due to missing data + """ + + model = joblib.load(model_file) + + # the default score is np.nan + data[column_name] = np.nan + + # if any of the needed columns are missing, return data + for col in X: + if col not in data.columns: + print("missing", col) + return data + + # drop rows with missing info + subset_data = data.dropna(subset=X) + if len(subset_data) == 0: + return data + + X_var = subset_data[X] + predicted = model.predict_proba(X_var)[:, 1] + + # make prediction and save to correct row + data.loc[subset_data.index, column_name] = predicted + + return data + + +def fit_complex_model(ecs, model_file, scaler_file, residue_score_column, output_column, scores_to_use): + """ + Fits a model to predict p(protein interaction) + + data: pd.DataFrame + has columns X used as features in model + model_file: str + path to file containing joblib dumped model (here, an sklearn logistic regression) + scaler_file: str + path to file containing joblib dumped Scaler object + residue_score_column: str + a column name in data to be used as input to model + output_column: str + name of column to create + scores_to_use: list of int + name of column to create + + Returns + pd.DataFrame of ECs with new column column_name containing the fit model, + or np.nan if the model could not be fit due to missing data + """ + # load the model and scaler + model = joblib.load(model_file) + scaler = joblib.load(scaler_file) + + # sort by the residue score column, and take the instances input + ecs = ecs.sort_values(residue_score_column, ascending=False) + X = list( + ecs[residue_score_column].iloc[scores_to_use] + ) + [ecs.inter_relative_rank_longrange.min()] + + # reshape and clean the data + X = np.array(X).astype(float) + X = X.transpose() + X = np.array(X).reshape(1, -1) + + X = np.nan_to_num(X) + + # transform with the scaler + X = scaler.transform(X) + + ecs.loc[:, output_column] =model.predict_proba(X)[:, 1] + return ecs + + +def _make_complex_contact_maps_probability(ec_table, d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs): + """ + Plot contact maps with all ECs above a certain probability threshold, + or a given count of ECs + + Parameters + ---------- + ec_table : pandas.DataFrame + Full set of evolutionary couplings (all pairs) + d_intra_i, d_intra_j: DistanceMap + Computed residue-residue distances within chains for + monomers i and j + d_multimer_i, d_multimer_j : DistanceMap + Computed residue-residue distances between homomultimeric + chains for monomers i and j + d_inter: DistanceMap + Computed residue-residue distances between heteromultimeric + chains i and j + first_segment_name, second_segment_name: str + Name of segment i and segment j in the ec_table + **kwargs + Further plotting parameters, see check_required in code + for necessary values. 
+ + Returns + ------- + cm_files : list(str) + Paths of generated contact map files + """ + check_required( + kwargs, + [ + "prefix", + "plot_probability_cutoffs", + "boundaries", + "draw_secondary_structure", + "scale_sizes" + ] + ) + + prefix = kwargs["prefix"] + + cm_files = [] + + ecs_longrange = ec_table.query( + "abs(i - j) >= {} or segment_i != segment_j".format(kwargs["min_sequence_distance"]) + ) + + # create plots based on significance cutoff + if kwargs["plot_probability_cutoffs"]: + for column, cutoffs in kwargs["plot_probability_cutoffs"].items(): + + if not isinstance(cutoffs, list): + cutoffs = [cutoffs] + + for cutoff in cutoffs: + ec_set_i = ecs_longrange.query("segment_i == segment_j == @first_segment_name") + ec_set_j = ecs_longrange.query("segment_i == segment_j == @second_segment_name") + ec_set = ecs_longrange.loc[ecs_longrange[column] >= cutoff, :] + + ec_set_inter = ec_set.query("segment_i != segment_j") + + output_file = prefix + "_{}_significant_ECs_{}.pdf".format(column, cutoff) + plot_completed = plot_complex_cm( + ec_set_i, ec_set_j, ec_set_inter, + d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, + first_segment_name, second_segment_name, + output_file=output_file, **kwargs + ) + + if plot_completed: + cm_files.append(output_file) + + # give back list of all contact map file names + return cm_files + + +def standard(**kwargs): + """ + Protocol: + Compare ECs for a complex to + 3D structure + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the pipeline, including + the following fields: + + * ec_file_compared_all + * ec_file_compared_all_longrange + * pdb_structure_hits + * distmap_monomer + * distmap_multimer + * contact_map_files + * remapped_pdb_files + """ + check_required( + kwargs, + [ + "structurefree_model_file", "structureaware_model_file", + "ec_compared_longrange_file", "ec_longrange_file", + "first_remapped_pdb_files", "second_remapped_pdb_files", + "frequencies_file" + ] + ) + + prefix = kwargs["prefix"] + + outcfg = { + # initialize output EC files + "ec_calibration_file": prefix + "_inter_calibration.csv", + "ec_prediction_file": prefix + "_inter_prediction.csv", + } + + # create an inter-ecs file with extra information for calibration purposes + def _calibration_file(prefix, ec_file, outcfg): + + """ + Adds values to the dataframe of ECs that will later be used + for score fitting + """ + + ecs = pd.read_csv(ec_file, index_col=0) + + # calculate intra-protein enrichment + def _add_enrichment(ecs): + + # Calculate the intra-protein enrichment + intra1_ecs = ecs.query("segment_i == segment_j == 'A_1'") + intra2_ecs = ecs.query("segment_i == segment_j == 'B_1'") + + intra1_enrichment = enrichment(intra1_ecs, min_seqdist=6) + intra1_enrichment["segment_i"] = "A_1" + + intra2_enrichment = enrichment(intra2_ecs, min_seqdist=6) + intra2_enrichment["segment_i"] = "B_1" + + enrichment_table = pd.concat([intra1_enrichment, intra2_enrichment]) + + def _seg_to_enrich(enrich_df, ec_df, enrichment_column): + """ + combines the enrichment table with the EC table + """ + s_to_e = {(x, y):z for x, y, z in zip( + enrich_df.i, enrich_df.segment_i, enrich_df[enrichment_column] + )} + + # enrichment for residues in column i + ec_df["enrichment_i"] = [s_to_e[(x, y)] if (x, y) in s_to_e else 0 for x, y in zip( + ec_df.i, ec_df.segment_i + )] + + # enrichment for residues in column j + ec_df["enrichment_j"] = [s_to_e[(x, y)] if (x, 
y) in s_to_e else 0 for x, y in zip( + ec_df.j, ec_df.segment_j + )] + + return ec_df + + # add the intra-protein enrichment to the EC table + ecs = _seg_to_enrich(enrichment_table, ecs, "enrichment") + # larger of two enrichment values + ecs["intra_enrich_max"] = ecs[["enrichment_i", "enrichment_j"]].max(axis=1) + # smaller of two enrichment values + ecs["intra_enrich_min"] = ecs[["enrichment_i", "enrichment_j"]].min(axis=1) + + return ecs + + ecs = _add_enrichment(ecs) + ecs = ecs.reset_index() + + # get just the inter ECs and calculate Z-score + ecs = ecs.query("segment_i != segment_j") + mean_ec = ecs.cn.mean() + std_ec = ecs.cn.std() + ecs.loc[:, 'Z_score'] = (ecs.cn - mean_ec) / std_ec + + # add the evcomplex score with neffL correction + N_effL = kwargs["effective_sequences"] / kwargs["num_sites"] + ecs = complex_probability(ecs, "evcomplex", use_all_ecs=False, + score="cn", Neff_over_L=N_effL) + ecs.loc[:, "evcomplex_normed"] = ecs.loc[:, "probability"] + + # get only the top 100 inter ECs + L = len(ecs.i.unique()) + len(ecs.j.unique()) + ecs = ecs[0:100] + + # add rank + ecs["inter_relative_rank_longrange"] = ecs.index / L + print(ecs.head()) + # calculate the ASA for the first and second segments by combining asa from all remapped pdb files + first_asa, outcfg = combine_asa(kwargs["first_remapped_pdb_files"], kwargs["dssp"], prefix, outcfg) + first_asa["segment_i"] = "A_1" + + second_asa, outcfg = combine_asa(kwargs["second_remapped_pdb_files"], kwargs["dssp"], prefix, outcfg) + second_asa["segment_i"] = "B_1" + + # save the ASA to a file + asa = pd.concat([first_asa, second_asa]) + outcfg["asa_file"] = prefix + "_surface_area.csv" + asa.to_csv(outcfg["asa_file"]) + + # Add the ASA to the ECs and compute the max and min for each position pair + ecs = add_asa(ecs, asa, asa_column="mean") + ecs["asa_max"] = ecs[["asa_i", "asa_j"]].max(axis=1) + ecs["asa_min"] = ecs[["asa_i", "asa_j"]].min(axis=1) + + # Add min and max conservation to EC file + frequency_file = kwargs["frequencies_file"] + d = pd.read_csv(frequency_file) + conservation = {(x,y):z for x,y,z in zip(d.segment_i, d.i, d.conservation)} + + ecs["conservation_i"] = [ + conservation[(x, y)] if (x, y) in conservation else np.nan for x, y in zip(ecs.segment_i, ecs.i) + ] + ecs["conservation_j"] = [ + conservation[(x, y)] if (x, y) in conservation else np.nan for x, y in zip(ecs.segment_j, ecs.j) + ] + ecs["conservation_max"] = ecs[["conservation_i", "conservation_j"]].max(axis=1) + ecs["conservation_min"] = ecs[["conservation_i", "conservation_j"]].min(axis=1) + + # # amino acid frequencies + # for char in list(ALPHABET_PROTEIN): + # # Frequency of amino acid 'char' in position i + # ecs = ecs.merge(d[["i", "segment_i", char]], on=["i","segment_i"], how="left", suffixes=["", "_1"]) + # ecs = ecs.rename({char: f"f{char}_i"}, axis=1) + # if "i_1" in ecs.columns: + # ecs = ecs.drop(columns=["i_1", "segment_i_1"]) + # + # # Frequency of amino acid 'char' in position j + # ecs = ecs.merge( + # d[["i", "segment_i", char]], left_on=["j", "segment_j"], + # right_on=["i", "segment_i"], how="left", suffixes=["", "_1"] + # ) + # ecs = ecs.rename({char: f"f{char}_j"}, axis=1) + # if "j_1" in ecs.columns: + # ecs = ecs.drop(columns=["j_1", "segment_j_1"]) + # + # # summed frequency of amino acid char in both positions i and j + # # ie, each pair i,j now gets one combined frequency + # print("computing frequencies") + # for char in list(ALPHABET_PROTEIN): + # ecs[f"f{char}"] = ecs[f"f{char}_i"] + ecs[f"f{char}_j"] + # + # # Compute the 
weighted sum of hydropathy for pair i, j + # hydrophilicity = [] + # + # # For each EC + # print("computing hydrophilicty") + # for _, row in ecs.iterrows(): + # # frequncy of amino acid char * hydopathy index of that AA + # hydro = sum([ + # HYDROPATHY_INDEX[char] * float(row[[f'f{char}']]) for char in list(ALPHABET_PROTEIN) + # ]) + # hydrophilicity.append(hydro) + # + # ecs["f_hydrophilicity"] = hydrophilicity + + # save the calibration file + ecs.to_csv(outcfg["ec_calibration_file"]) + + # Compute the calibration file + if valid_file(kwargs["ec_compared_longrange_file"]): + _calibration_file(prefix, kwargs["ec_compared_longrange_file"], outcfg) + elif valid_file(kwargs["ec_longrange_file"]): + _calibration_file(prefix, kwargs["ec_longrange_file"], outcfg) + else: + raise InvalidParameterError("No valid EC file provided as input for modeling") + + calibration_ecs = pd.read_csv(outcfg["ec_calibration_file"], index_col=0) + calibration_ecs = calibration_ecs.sort_values("cn", ascending=False)[0:20] + + # Fit the structure free model file + print('fitting the model') + calibration_ecs = fit_model( + calibration_ecs, + kwargs["structurefree_model_file"], + X_STRUCFREE, + "residue_prediction_strucfree" + ) + + # Fit the structure aware model prediction file + calibration_ecs = fit_model( + calibration_ecs, + kwargs["structureaware_model_file"], + X_STRUCAWARE, + "residue_prediction_strucaware" + ) + + # Fit the structure free complex model + calibration_ecs = fit_complex_model( + calibration_ecs, + kwargs["complex_strucfree_model_file"], + kwargs["complex_strucfree_scaler_file"], + "residue_prediction_strucfree", + "complex_prediction_strucfree", + X_COMPLEX_STRUCFREE + ) + + # Fit the structure aware complex model + calibration_ecs = fit_complex_model( + calibration_ecs, + kwargs["complex_strucaware_model_file"], + kwargs["complex_strucaware_scaler_file"], + "residue_prediction_strucaware", + "complex_prediction_strucaware", + X_COMPLEX_STRUCAWARE + ) + + calibration_ecs[[ + "inter_relative_rank_longrange", "i", "A_i", "j", "A_j", + "segment_i", "segment_j", "cn", "dist", "precision", "evcomplex_normed", + "Z_score", "asa_min", "conservation_max", "intra_enrich_max", + "residue_prediction_strucaware", "residue_prediction_strucfree", + "complex_prediction_strucaware", "complex_prediction_strucfree" + ]].to_csv(outcfg["ec_prediction_file"]) + + # Step 4: Make contact map plots + # if no structures available, defaults to EC-only plot + def _load_distmap(distmap_file): + if distmap_file is not None and valid_file(distmap_file + ".csv"): + distmap = DistanceMap.from_file(distmap_file) + else: + distmap = None + return distmap + + d_intra_i = _load_distmap(kwargs["first_distmap_monomer"]) + d_multimer_i = _load_distmap(kwargs["first_distmap_multimer"]) + d_intra_j = _load_distmap(kwargs["second_distmap_monomer"]) + d_multimer_j = _load_distmap(kwargs["second_distmap_multimer"]) + d_inter = _load_distmap(kwargs["distmap_inter"]) + + first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id + second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id + + # Add the top L intra ECs to the ec_table + ecs = pd.read_csv(kwargs["ec_file"]).query("segment_i == segment_j") + ecs = ecs.query("abs(i - j) >= {} or segment_i != segment_j".format(kwargs["min_sequence_distance"])) + ecs_intra = ecs.iloc[0:kwargs["num_sites"]] + + calibration_ecs = pd.concat([calibration_ecs, ecs_intra]) + + outcfg["contact_map_files"] = _make_complex_contact_maps_probability( + calibration_ecs, 
d_intra_i, d_multimer_i, + d_intra_j, d_multimer_j, + d_inter, first_segment_name, + second_segment_name, **kwargs + ) + + return outcfg + + +# list of available EC comparison protocols +PROTOCOLS = { + # standard complex model fitting protocol + "standard": standard, +} + + +def run(**kwargs): + """ + Run inference protocol to calculate ECs from + input sequence alignment. + + Parameters + ---------- + Mandatory kwargs arguments: + protocol: EC protocol to run + prefix: Output prefix for all generated files + + Returns + ------- + outcfg : dict + Output configuration of stage + (see individual protocol for fields) + """ + check_required(kwargs, ["protocol"]) + + if kwargs["protocol"] not in PROTOCOLS: + raise InvalidParameterError( + "Invalid protocol selection: " + + "{}. Valid protocols are: {}".format( + kwargs["protocol"], ", ".join(PROTOCOLS.keys()) + ) + ) + + return PROTOCOLS[kwargs["protocol"]](**kwargs) diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index dd1ff573..e941afb7 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -428,11 +428,11 @@ def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_err def run(**kwargs): """ Exposes command line interface as a Python function. - + Parameters ---------- kwargs - See click.option decorators for app() function + See click.option decorators for app() function """ # substitute commmand line options in config file config = substitute_config(**kwargs) diff --git a/evcouplings/utils/batch.py b/evcouplings/utils/batch.py index ddedd338..6a9d95ac 100644 --- a/evcouplings/utils/batch.py +++ b/evcouplings/utils/batch.py @@ -579,7 +579,7 @@ def _prepare_resources(self, resources): class SlurmSubmitter(AClusterSubmitter): """ - Implements an LSF submitter + Implements a Slurm submitter """ __name = "slurm" __submit = "sbatch --job-name={name} {dependent} {resources} --wrap 'srun {cmd}'" @@ -624,7 +624,7 @@ def __del__(self): os.remove(self.__db_path) except AttributeError: pass - + @property def isBlocking(self): return self.__blocking diff --git a/evcouplings/utils/constants.py b/evcouplings/utils/constants.py index bf361796..a5bc4a1c 100644 --- a/evcouplings/utils/constants.py +++ b/evcouplings/utils/constants.py @@ -3,7 +3,9 @@ Authors: Thomas A. Hopf + Anna G. 
Green """ +import numpy as np # amino acid one-letter code to three-letter code AA1_to_AA3 = { @@ -36,3 +38,53 @@ AA3_to_AA1 = { v: k for k, v in AA1_to_AA3.items() } + +# Amino acid surface area values from Tien et al, 2013 (empirical values) +AA_SURFACE_AREA = { + "A": 121, + "R": 265, + "D": 187, + "N": 187, + "C": 148, + "E": 214, + "Q": 214, + "G": 97, + "H": 216, + "I": 195, + "L": 191, + "K": 230, + "M": 103, + "F": 228, + "P": 154, + "S": 143, + "T": 163, + "W": 264, + "Y": 255, + "V": 165, + "X": np.nan +} + +#Hydropathy index from Lehninger Principles of Biochemistry, 5th Edition, table 3-1 +HYDROPATHY_INDEX = { + "G": -.4, + "A": 1.8, + "P": -1.6, + "V": 4.2, + "L": 3.8, + "I": 4.5, + "M": 1.9, + "F": 2.8, + "Y": -1.3, + "W": -0.9, + "S": -0.8, + "T": -0.7, + "C": 2.5, + "N": -3.5, + "Q": -3.5, + "K": -3.9, + "H": -3.2, + "R": -4.5, + "D": -3.5, + "E": -3.5, + "-": 0 +} \ No newline at end of file diff --git a/evcouplings/utils/pipeline.py b/evcouplings/utils/pipeline.py index 08278466..990a8bf0 100644 --- a/evcouplings/utils/pipeline.py +++ b/evcouplings/utils/pipeline.py @@ -40,6 +40,7 @@ import evcouplings.mutate.protocol as mt import evcouplings.fold.protocol as fd import evcouplings.complex.protocol as pp +import evcouplings.probability.protocol as pb # supported pipelines # @@ -64,6 +65,7 @@ ("concatenate", pp.run, None), ("couplings", cp.run, None), ("compare", cm.run, None), + ("probability", pb.run, None), ("mutate", mt.run, None), ("fold", fd.run, None) ] @@ -199,6 +201,7 @@ def execute(**config): # one less stage to put through after we ran this... num_stages_to_run -= 1 + else: # skip state by injecting state from previous run verify_resources( @@ -214,9 +217,13 @@ def execute(**config): # verify all the output files are there outfiles = [ filepath for f, filepath in outcfg.items() - if f.endswith("_file") and filepath is not None + if f.endswith("_file") and filepath is not None and f!="model_file" \ + and not filepath.endswith("raw_focus.fasta") and not filepath.endswith("raw.fasta") \ + and not filepath.endswith("monomer_1.fasta") and not filepath.endswith("monomer_2.fasta") \ + and not filepath.endswith("longrange.csv") and not filepath.endswith("_ECs.txt") \ + and not filepath.endswith(".pml") and not filepath.endswith("json") ] - + verify_resources( "Output files from stage '{}' " "missing".format(stage), @@ -377,7 +384,7 @@ def verify_prefix(verify_subdir=True, **config): Check if configuration contains a prefix, and that prefix is a valid directory we can write to on the filesystem - + Parameters ---------- verify_subdir : bool, optional (default: True) @@ -386,7 +393,7 @@ def verify_prefix(verify_subdir=True, **config): app loop. 
**config Input configuration for pipeline - + Returns ------- prefix : str @@ -550,7 +557,7 @@ def run(**kwargs): EVcouplings pipeline execution from a configuration file (single thread, no batch or environment configuration) - + Parameters ---------- kwargs diff --git a/evcouplings/visualize/pairs.py b/evcouplings/visualize/pairs.py index 079e8c60..e310e32a 100644 --- a/evcouplings/visualize/pairs.py +++ b/evcouplings/visualize/pairs.py @@ -439,7 +439,7 @@ def complex_contact_map(intra1_ecs, intra2_ecs, inter_ecs, ) ) - # Don't compute inter boundaries unless we have inter + # Don't compute inter boundaries unless we have inter # ecs or distances if (inter_ecs is not None and not inter_ecs.empty) or d_inter is not None: inter_boundaries = list( @@ -457,7 +457,7 @@ def _boundary_union(original_boundaries, new_boundaries_axis1, # of the protein. # Default is to update both axes updated_boundaries = original_boundaries - # increase the axis 1 boundaries if the new boundaries + # increase the axis 1 boundaries if the new boundaries # cover more range if axis1: updated_boundaries[0] = ( @@ -491,7 +491,7 @@ def _boundary_union(original_boundaries, new_boundaries_axis1, ) intra2_boundaries = _boundary_union( - intra2_boundaries, inter_boundaries, inter_boundaries, + intra2_boundaries, inter_boundaries, inter_boundaries, axis1=False, axis2=True, symmetric=True ) @@ -1276,7 +1276,7 @@ def ec_lines_pymol_script(ec_table, output_file, distance_cutoff=5, """ Create a Pymol .pml script to visualize ECs on a 3D structure - + Parameters ---------- ec_table : pandas.DataFrame @@ -1349,7 +1349,7 @@ def enrichment_pymol_script(enrichment_table, output_file, ---------- enrichment_table : pandas.DataFrame Mapping of position (column i) to EC enrichment - (column enrichemnt), as returned by + (column enrichemnt), as returned by evcouplings.couplings.pairs.enrichment() output_file : str File path where to store pml script @@ -1364,7 +1364,7 @@ def enrichment_pymol_script(enrichment_table, output_file, Use legacy (2011) red and yellow colormap for EC enrichment """ - if legacy: + if legacy: t = enrichment_table.query("enrichment > 1") t.loc[:, "b_factor"] = t.enrichment # compute boundaries for highly coupled residues @@ -1386,7 +1386,7 @@ def enrichment_pymol_script(enrichment_table, output_file, # set the boundary for number of residues to be rendered as spheres sphere_boundary = boundary2 - + else: t = deepcopy(enrichment_table) t.loc[:, "b_factor"] = t.enrichment @@ -1457,11 +1457,10 @@ def enrichment_pymol_script(enrichment_table, output_file, for idx, c in enumerate(color_list): f.write("set_color color{}, [{},{},{}]\n".format(idx, c[0], c[1], c[2])) f.write("color color{}{}\n".format(len(boundary_list) - 1, chain_sel)) - + f.write("as cartoon{}\n".format(chain_sel)) pymol_mapping(t, f, chain) if not sphere_view: f.write("cartoon putty{}\n".format(chain_sel)) - diff --git a/notebooks/output_files_tutorial.ipynb b/notebooks/output_files_tutorial.ipynb index e6f0ca9b..bdc96d22 100644 --- a/notebooks/output_files_tutorial.ipynb +++ b/notebooks/output_files_tutorial.ipynb @@ -829,6 +829,28 @@ "- enrichment: Sum of the evolutionary coupling scores that involve this residue, normalized by the background level of coupling" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cumulative coupling score files for complex ECs (_enrichment_intra.csv and _enrichment_inter.csv)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same as the monomer cumulative coupling score 
file, except calculated using the intra-protein or inter-protein ECs\n", + "\n", + "*output key: enrichment_intra_file and enrichment_inter_file*\n", + "\n", + "Columns:\n", + "- i: position i\n", + "- A_i: character i\n", + "- segment_i: segment of origin of position i\n", + "- enrichment: Sum of the evolutionary coupling scores that involve this residue, normalized by the background level of coupling" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1290,21 +1312,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.6.5" } }, "nbformat": 4, diff --git a/test/TestComplex.py b/test/TestComplex.py index 873b24f7..6ee30bbc 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -70,16 +70,23 @@ def __init__(self, *args, **kwargs): paralog_file = "{}/concatenate/test_new_paralog_table.csv".format(TRAVIS_PATH) self.paralog_table = pd.read_csv(paralog_file, index_col=0, header=0) + # table of frequencies + self.uncorrected_frequency_file = "{}/concatenate/test_new_frequencies_uncorrected.csv".format(TRAVIS_PATH) + corrected_frequency_file = "{}/concatenate/test_new_frequencies.csv".format(TRAVIS_PATH) + self.freq_file = corrected_frequency_file + self.freqs = pd.read_csv(corrected_frequency_file, index_col=0, header=0) + # input and output configuration with open("{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH)) as inf: self.incfg = yaml.safe_load(inf) + self.incfg["forbid_overlapping_concatenation"] = True with open("{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH)) as inf: self.outcfg = yaml.safe_load(inf) def test_genome_distance(self): """ - tests the genome distance concatenation protocol. + Tests the genome distance concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). Verifies that the output configuration has all of the necessary keys @@ -123,7 +130,7 @@ def test_genome_distance(self): def test_best_hit_normal(self): """ - tests the genome distance concatenation protocol. + Tests the best hit concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). Verifies that the output configuration has all of the necessary keys @@ -173,7 +180,7 @@ def test_best_hit_normal(self): def test_best_hit_reciprocal(self): """ - tests the genome distance concatenation protocol. + Tests the best reciprocal hit concatenation protocol. Verifies that all of the outfiles are created and non-empty (but does not check their contents). 
         Verifies that the output configuration has all of the necessary keys
@@ -223,8 +230,7 @@ def test_best_hit_reciprocal(self):
 
     def test_modify_complex_segments(self):
         """
-        tests that modify_complex_segments adds the correct "segments" field to the outcfg dictionary
-        :return:
+        Tests that modify_complex_segments adds the correct "segments" field to the outcfg dictionary
         """
         test_configuration = deepcopy(self.incfg)
         cfg = deepcopy(test_configuration)
@@ -236,9 +242,29 @@ def test_modify_complex_segments(self):
         test_configuration["segments"] = self.outcfg["segments"]
         self.assertDictEqual(test_configuration, _outcfg)
 
+    def test_remove_overlap(self):
+        """
+        Tests that remove_overlapping_ids drops pairings built from overlapping regions of the same sequence
+        """
+        df = pd.DataFrame({
+            "id_1": ["a/1-100", "b/1-200", "c/10-100"],
+            "id_2": ["a/2-200", "b/300-400", "d/10-100"],
+            "species": ["A", "B", "C"]
+        })
+
+        df1 = remove_overlapping_ids(df).reset_index(drop=True)
+
+        desired_output = pd.DataFrame({
+            "id_1": ["b/1-200", "c/10-100"],
+            "id_2": ["b/300-400", "d/10-100"],
+            "species": ["B", "C"]
+        })
+
+        pd.testing.assert_frame_equal(df1, desired_output)
+
     def test_describe_concatenation(self):
         """
-        test whether describe_concatenation writes a file with the correct contents
+        Tests whether describe_concatenation writes a file with the correct contents
         """
         outfile = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
@@ -260,9 +286,8 @@ def test_describe_concatenation(self):
 
     def test_write_concatenated_alignment(self):
         """
-        tests whether write_concatenated alignment returns the correct 3
+        Tests whether write_concatenated_alignment returns the correct 3
         alignment objects
-
         """
         alignment_1_file = "{}/concatenate/test_new_monomer_1.fasta".format(TRAVIS_PATH)
         with(open(alignment_1_file)) as inf:
@@ -302,8 +327,7 @@ def _test_aln_equivalence(ali1, ali2):
 
     def test_read_species_annotation_table_uniprot(self):
         """
-        tests whether a uniprot annotation table is read correctly
-
+        Tests whether a uniprot annotation table is read correctly
         """
         annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH)
         annotation_data = read_species_annotation_table(annotation_file_uniprot)
@@ -311,8 +335,7 @@ def test_read_species_annotation_table_uniprot(self):
 
     def test_read_species_annotation_table_uniref(self):
         """
-        tests whether a uniref annotation table is read correctly
-
+        Tests whether a uniref annotation table is read correctly
         """
         _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format(TRAVIS_PATH)
         _annotation_data = read_species_annotation_table(_annotation_file_uniref)
@@ -322,16 +345,14 @@ def test_read_species_annotation_table_uniref(self):
 
     def test_most_similar_by_organism(self):
         """
-        tests whether most_similar_by_organism returns the correct dataframe
-
+        Tests whether most_similar_by_organism returns the correct dataframe
         """
         annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data)
         pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id)
 
     def test_find_paralogs(self):
         """
-        tests whether find_paralogs returns the correct dataframe
-
+        Tests whether find_paralogs returns the correct dataframe
         """
         target_id = "DINJ_ECOLI"
         paralog_table = find_paralogs(target_id, self.annotation_data, self.similarities, 0.9)
@@ -339,8 +360,7 @@ def test_find_paralogs(self):
 
     def test_filter_best_reciprocal(self):
         """
-        tests whether filter_best_reciprocal returns the correct dataframe
-
+        Tests whether filter_best_reciprocal returns the correct dataframe
         """
         alignment_file = "{}/align_1/test_new.a2m".format(TRAVIS_PATH)
         best_recip = pd.read_csv("{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), index_col=0)
@@ -349,8 +369,7 @@ def test_filter_best_reciprocal(self):
 
     def test_get_distance_overlap(self):
         """
-        tests whether get_distance returns 0 for overlapping genes
-
+        Tests whether get_distance returns 0 for overlapping genes
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1400, 1700)
@@ -359,8 +378,7 @@ def test_get_distance_overlap(self):
 
     def test_get_distance_reverse(self):
         """
-        tests whether get distance correctly measures distance of two genes with opposite strand
-
+        Tests whether get_distance correctly measures the distance between two genes on opposite strands
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1800, 1700)
@@ -369,8 +387,7 @@ def test_get_distance_reverse(self):
 
     def test_get_distance_increasing(self):
         """
-        tests whether get_distance correctly measures distance of two genes with same strand
-
+        Tests whether get_distance correctly measures the distance between two genes on the same strand
         """
         annotation_1 = (1000, 1500)
         annotation_2 = (1700, 1800)
@@ -379,8 +396,7 @@ def test_get_distance_increasing(self):
 
     def test_best_reciprocal_matching(self):
         """
-        tests whether best_reciprocal_matchin generates the correct pd.DataFrame
-
+        Tests whether best_reciprocal_matching generates the correct pd.DataFrame
         """
         id_pairing = best_reciprocal_matching(self.possible_partners)
         id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"])
@@ -389,8 +405,7 @@ def test_best_reciprocal_matching(self):
 
     def test_find_possible_partners(self):
         """
-        tests whether find_possible partners generates the correct pd.DataFrame
-
+        Tests whether find_possible_partners generates the correct pd.DataFrame
         """
 
         _possible_partners = find_possible_partners(
@@ -408,8 +423,7 @@ def test_find_possible_partners(self):
 
     def test_plot_distance_distribution(self):
         """
-        tests whether plot_distance_distribution generates a non-empty PDF
-
+        Tests whether plot_distance_distribution generates a non-empty PDF
         """
         outfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
         plot_distance_distribution(self.id_pairing, outfile.name)
@@ -418,5 +432,12 @@ def test_plot_distance_distribution(self):
         outfile.close()
         os.unlink(outfile.name)
 
+    def test_map_frequencies(self):
+        """
+        Tests map_frequencies_file
+        """
+        freqs = map_frequencies_file(self.uncorrected_frequency_file, self.outcfg, **self.incfg)
+        pd.testing.assert_frame_equal(freqs, self.freqs)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/TestCouplings.py b/test/TestCouplings.py
new file mode 100644
index 00000000..e2464618
--- /dev/null
+++ b/test/TestCouplings.py
@@ -0,0 +1,139 @@
+"""
+Test cases for the couplings stage of the EVCouplings pipeline
+
+Author:
+    Anna G. Green
+"""
+
+import unittest
+import os
+import tempfile
+import pandas as pd
+import ruamel.yaml as yaml
+from unittest import TestCase
+from evcouplings.couplings.pairs import *
+
+TRAVIS_PATH = "/home/travis/evcouplings_test_cases/complex_test"
+TRAVIS_PATH_MONOMER = "/home/travis/evcouplings_test_cases/monomer_test/couplings/RASH_HUMAN_b03"
+
+#TRAVIS_PATH = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/complex_test"
+#TRAVIS_PATH_MONOMER = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/monomer_test/couplings/RASH_HUMAN_b03"
+class TestCouplings(TestCase):
+
+
+    def __init__(self, *args, **kwargs):
+        super(TestCouplings, self).__init__(*args, **kwargs)
+        self.raw_ec_file = "{}/couplings/test_new_ECs.txt".format(TRAVIS_PATH)
+        self.couplings_file = "{}/couplings/test_new_CouplingScores.csv".format(TRAVIS_PATH)
+        self.ecs = pd.read_csv(self.couplings_file)
+        self.inter_ecs = self.ecs.query("segment_i != segment_j")
+
+        with open("{}/couplings/test_new_couplings.outcfg".format(TRAVIS_PATH)) as inf:
+            self.outcfg = yaml.safe_load(inf)
+
+        self.monomer_couplings = "{}_CouplingScores.csv".format(TRAVIS_PATH_MONOMER)
+        self.enrichment_file = "{}_enrichment.csv".format(TRAVIS_PATH_MONOMER)
+
+    def test_read_raw_ec_file(self):
+        """
+        Tests the sorted and unsorted modes of read_raw_ec_file
+        """
+        ecs = pd.read_csv(self.raw_ec_file, sep=" ", names=["i", "A_i", "j", "A_j", "fn", "cn"])
+        _test_ecs = read_raw_ec_file(self.raw_ec_file, sort=False, score="cn")
+
+        pd.testing.assert_frame_equal(ecs, _test_ecs)
+
+        sorted_ecs = ecs.sort_values(by="cn", ascending=False)
+        _sorted_test_ecs = read_raw_ec_file(self.raw_ec_file, sort=True, score="cn")
+
+        pd.testing.assert_frame_equal(sorted_ecs, _sorted_test_ecs)
+
+    def test_enrichment(self):
+        """
+        Tests the EC enrichment function when the monomer ECs have no segments
+        """
+        enrichment_scores = pd.read_csv(self.enrichment_file)
+        ecs = pd.read_csv(self.monomer_couplings)
+        ecs = ecs.drop(["segment_i", "segment_j"], axis=1)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_enrichment_monomer_segment(self):
+        """
+        Tests the EC enrichment function when the monomer ECs have segments
+        """
+        enrichment_scores = pd.read_csv(self.enrichment_file)
+        enrichment_scores["segment_i"] = "A"
+        enrichment_scores = enrichment_scores[['i', 'A_i', 'segment_i', 'enrichment']]
+
+        ecs = pd.read_csv(self.monomer_couplings)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_enrichment_complex_segment(self):
+        """
+        Tests the EC enrichment function with complex ECs
+        """
+
+        ecs = pd.read_csv(self.couplings_file)
+        _enrichment_scores = enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6).reset_index(drop=True)
+        enrichment_scores = pd.read_csv(("{0}/couplings/test_new_enrichment.csv".format(TRAVIS_PATH)))
+        pd.testing.assert_frame_equal(enrichment_scores, _enrichment_scores)
+
+    def test_add_mixture_probability_skewnormal(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="skewnormal"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_skewnormal.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_normal(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="normal"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_normal.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex_uncorrected(self):
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="evcomplex_uncorrected"
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_evc_raw.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex(self):
+        NeffL = self.outcfg["effective_sequences"] / self.outcfg["num_sites"]
+        ecs_with_prob = add_mixture_probability(
+            self.inter_ecs, model="evcomplex", N_effL=NeffL
+        )
+
+        _test_ecs = pd.read_csv(
+            "{}/couplings/test_new_CouplingScores_evc.csv".format(TRAVIS_PATH), index_col=0
+        )
+        pd.testing.assert_frame_equal(ecs_with_prob, _test_ecs)
+
+    def test_add_mixture_probability_evcomplex_error(self):
+        """
+        Tests that the EVcomplex score cannot be calculated without a user-supplied N_effL
+        :return:
+        """
+        with self.assertRaises(ValueError):
+            add_mixture_probability(self.ecs, model="evcomplex")
+
+    def test_add_mixture_probability_invalid_selection(self):
+        with self.assertRaises(ValueError):
+            add_mixture_probability(self.ecs, model="fake news")
+
+if __name__ == '__main__':
+    unittest.main()
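For reference, the pairing rule exercised by test_remove_overlap above can be sketched as follows. This is a minimal, hypothetical reimplementation for illustration only, assuming the "name/start-end" identifier format used in the test fixtures; the helper names here are not part of the patch, and the production logic lives in evcouplings' remove_overlapping_ids.

import pandas as pd

def _parse(full_id):
    # "a/1-100" -> ("a", 1, 100); format assumed from the test fixtures above
    seq_id, region = full_id.rsplit("/", 1)
    start, end = (int(x) for x in region.split("-"))
    return seq_id, start, end

def drop_overlapping_pairs(df):
    # Keep a row unless id_1 and id_2 name the same sequence AND their regions overlap
    def keep(row):
        id_1, start_1, end_1 = _parse(row["id_1"])
        id_2, start_2, end_2 = _parse(row["id_2"])
        if id_1 != id_2:
            return True
        return end_1 < start_2 or end_2 < start_1
    return df[df.apply(keep, axis=1)].reset_index(drop=True)

# Applied to the fixture in test_remove_overlap, only the "b/1-200" and "c/10-100"
# rows survive, matching desired_output in the test.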