Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mdpath dev #69

Merged
merged 16 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 57 additions & 3 deletions mdpath/mdpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def main():

-graphdist: Default distance for residues making up the graph (default: 5.0)

-digamma_correction: Use digamma correction for entropy calculation (default: False)

-numpath: Default number of top paths considered for clustering (default: 500)
"""
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -125,6 +127,14 @@ def main():
default=500,
)

parser.add_argument(
"-digamma_correction",
dest="digamma_correction",
help="Use digamma correction for entropy calculation.",
required=False,
default=False,
)

parser.add_argument(
"-GMM",
dest="GMM",
Expand All @@ -133,6 +143,22 @@ def main():
default=False,
)

parser.add_argument(
"-chain",
dest="chain",
help="Chain of the protein to be analyzed in the topology file. CAUTION: only one chain can be selected for analysis.",
required=False,
default=False,
)

parser.add_argument(
"-invert",
dest="invert",
help="Inverts NMI bei subtrackting each NMI from max NMI. Can be used to find Paths, that are the least correlated",
required=False,
default=False,
)

args = parser.parse_args()
if not args.topology or not args.trajectory:
print("Both trajectory and topology files are required!")
Expand All @@ -148,14 +174,40 @@ def main():
closedist = float(args.closedist)
graphdist = float(args.graphdist)
numpath = int(args.numpath)
digamma_correction = bool(args.digamma_correction)
GMM = bool(args.GMM)
invert = bool(args.invert)

# Prepare the trajectory for analysis
if os.path.exists("first_frame.pdb"):
os.remove("first_frame.pdb")
with mda.Writer("first_frame.pdb", multiframe=False) as pdb:
traj.trajectory[0]
pdb.write(traj.atoms)

# Chain selection
if args.chain:
chain = str(args.chain)
chain_atoms = traj.select_atoms(f"chainID {chain}")
if len(chain_atoms) == 0:
raise ValueError(f"No atoms found for chain {chain}")
exit()
chain_universe = mda.Merge(chain_atoms)
# Write new topology
chain_universe.atoms.write("selected_chain.pdb")

# Write trajectory
with mda.Writer("selected_chain.dcd", chain_atoms.n_atoms) as W:
for ts in traj.trajectory:
chain_universe.atoms.positions = chain_atoms.positions
W.write(chain_universe.atoms)
topology = "selected_chain.pdb"
trajectory = "selected_chain.dcd"
traj = mda.Universe(topology, trajectory)
print(
f"Chain {chain} selected and written to selected_chain.pdb and selected_chain.dcd and will now be analyzed."
)

structure_calc = StructureCalculations(topology)
df_distant_residues = structure_calc.calculate_residue_suroundings(fardist, "far")
df_close_res = structure_calc.calculate_residue_suroundings(closedist, "close")
Expand All @@ -171,11 +223,13 @@ def main():
print("\033[1mTrajectory is processed and ready for analysis.\033[0m")

# Calculate the mutual information and build the graph
nmi_calc = NMICalculator(df_all_residues, GMM=GMM)
nmi_calc = NMICalculator(
df_all_residues, digamma_correction=digamma_correction, GMM=GMM, invert=invert
)
nmi_calc.entropy_df.to_csv("entropy_df.csv", index=False)
nmi_calc.mi_diff_df.to_csv("mi_diff_df.csv", index=False)
nmi_calc.nmi_df.to_csv("nmi_df.csv", index=False)
graph_builder = GraphBuilder(
topology, structure_calc.last_res_num, nmi_calc.mi_diff_df, graphdist
topology, structure_calc.last_res_num, nmi_calc.nmi_df, graphdist
)
MDPathVisualize.visualise_graph(
graph_builder.graph
Expand Down
31 changes: 31 additions & 0 deletions mdpath/mdpath_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,34 @@ def gpcr_2D_vis():
"Topology and cluster pathways are required for creating a 2D visualization of GPCR paths."
)
exit(1)


def spline():
    """CLI entry point: build 3D spline meshes of paths through a protein.

    Reads the precomputed cluster-paths JSON file
    ("quick_precomputed_clusters_paths") produced by a previous MDPath
    analysis and recomputes the paths with accurate spline interpolation.
    The resulting mesh files can be imported directly into Blender.

    Command-line inputs:
        -json (str): Json file of the MDPath analysis -> "quick_precomputed_clusters_paths"

    Example usage:
        $ mdpath_spline -json <path_to_json>
    """
    cli = argparse.ArgumentParser(
        prog="mdpath_spline",
        description="Create a 3D Spline-Visualization of Paths through a protein.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    cli.add_argument(
        "-json",
        dest="json",
        required=True,
        help="quick_precomputed_clusters_paths file of your MDPath analysis",
    )

    parsed = cli.parse_args()
    # Delegate all mesh generation to the visualization helper.
    MDPathVisualize.create_splines(parsed.json)
2 changes: 1 addition & 1 deletion mdpath/src/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def process_bootstrap_sample(
"""
bootstrap_sample = self.create_bootstrap_sample(self.df_all_residues)
nmi_calculator = NMICalculator(bootstrap_sample, num_bins=self.num_bins)
bootstrap_mi_diff = nmi_calculator.mi_diff_df
bootstrap_mi_diff = nmi_calculator.nmi_df
graph = GraphBuilder(
self.pdb, self.last_residue, bootstrap_mi_diff, self.graphdist
)
Expand Down
153 changes: 104 additions & 49 deletions mdpath/src/mutual_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.metrics import mutual_info_score
from sklearn.mixture import GaussianMixture
from scipy.stats import entropy
from scipy.special import digamma


class NMICalculator:
Expand All @@ -29,30 +30,42 @@ class NMICalculator:

GMM (optional): Option to switch between histogram method and Gaussian Mixture Model for binning before NMI calculation. Default is False.

mi_diff_df (pd.DataFrame): DataFrame containing the mutual information differences. Is calculated using either GMM or histogram method.
nmi_df (pd.DataFrame): DataFrame containing the mutual information differences. Is calculated using either GMM or histogram method.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values. Is calculated using either GMM or histogram method.
"""

def __init__(
self, df_all_residues: pd.DataFrame, num_bins: int = 35, GMM=False
self,
df_all_residues: pd.DataFrame,
digamma_correction=False,
num_bins: int = 35,
GMM=False,
invert=False,
) -> None:
self.df_all_residues = df_all_residues
self.num_bins = num_bins
self.digamma_correction = digamma_correction
self.GMM = GMM
self.invert = invert
if GMM:
self.mi_diff_df, self.entropy_df = self.NMI_calcs_with_GMM()
self.nmi_df, self.entropy_df = self.NMI_calcs_with_GMM()
else:
self.mi_diff_df, self.entropy_df = self.NMI_calcs()

def NMI_calcs(self) -> pd.DataFrame:
"""Nornmalized Mutual Information and Entropy calculation for all residue pairs.

Returns:
mi_diff_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values.
"""
self.nmi_df, self.entropy_df = self.NMI_calcs()

def calculate_corrected_entropy(self, hist, total_points, num_bins):
    """Estimate the entropy of a binned distribution with finite-sample corrections.

    Combines the plug-in (maximum-likelihood) entropy of the normalized
    histogram with two bias-correction terms: the Miller-Madow term
    ``(num_bins - 1) / (2 * total_points)`` and a digamma-based term.

    Args:
        hist: 1-D array of integer bin counts.
        total_points: Total number of samples summed over the histogram.
        num_bins: Number of bins used to construct ``hist``.

    Returns:
        float: Corrected entropy estimate in natural-log units.
    """
    probs = hist / total_points
    nonzero = probs[probs > 0]
    # Plug-in estimator over occupied bins only (0 * log 0 := 0).
    plugin_entropy = -np.sum(nonzero * np.log(nonzero))
    miller_madow = (num_bins - 1) / (2 * total_points)
    digamma_term = 1 / total_points * np.sum(
        hist * digamma(hist + 1)
    ) - digamma(total_points)
    return plugin_entropy + miller_madow + digamma_term

def NMI_calcs(self):
"""Extended Normalized Mutual Information and Entropy calculation."""
entropys = {}
normalized_mutual_info = {}
total_iterations = len(self.df_all_residues.columns) ** 2
Expand All @@ -63,36 +76,79 @@ def NMI_calcs(self) -> pd.DataFrame:
for col1 in self.df_all_residues.columns:
for col2 in self.df_all_residues.columns:
if col1 != col2:
hist_col1, _ = np.histogram(
self.df_all_residues[col1], bins=self.num_bins
)
hist_col2, _ = np.histogram(
self.df_all_residues[col2], bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
self.df_all_residues[col1],
self.df_all_residues[col2],
bins=self.num_bins,
)
mi = mutual_info_score(
hist_col1, hist_col2, contingency=hist_joint
)
entropy_col1 = entropy(hist_col1)
entropy_col2 = entropy(hist_col2)
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi
progress_bar.update(1)
if self.digamma_correction:
# Adaptive binning
data_col1 = self.df_all_residues[col1].values
data_col2 = self.df_all_residues[col2].values
hist_col1, bin_edges1 = np.histogram(
data_col1, bins=self.num_bins
)
hist_col2, bin_edges2 = np.histogram(
data_col2, bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
data_col1,
data_col2,
bins=(self.num_bins, self.num_bins),
)

# Total data points
total_points = len(data_col1)

# Corrected entropy estimates
entropy_col1 = self.calculate_corrected_entropy(
hist_col1, total_points, self.num_bins
)
entropy_col2 = self.calculate_corrected_entropy(
hist_col2, total_points, self.num_bins
)
joint_entropy = self.calculate_corrected_entropy(
hist_joint.flatten(), total_points, self.num_bins**2
)

# Mutual Information
mi = entropy_col1 + entropy_col2 - joint_entropy
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2

# Normalized MI
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi

progress_bar.update(1)
else:
hist_col1, _ = np.histogram(
self.df_all_residues[col1], bins=self.num_bins
)
hist_col2, _ = np.histogram(
self.df_all_residues[col2], bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
self.df_all_residues[col1],
self.df_all_residues[col2],
bins=self.num_bins,
)
mi = mutual_info_score(
hist_col1, hist_col2, contingency=hist_joint
)
entropy_col1 = entropy(hist_col1)
entropy_col2 = entropy(hist_col2)
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi
progress_bar.update(1)

entropy_df = pd.DataFrame(entropys.items(), columns=["Residue", "Entropy"])
mi_diff_df = pd.DataFrame(
nmi_df = pd.DataFrame(
normalized_mutual_info.items(), columns=["Residue Pair", "MI Difference"]
)
max_mi_diff = mi_diff_df["MI Difference"].max()
mi_diff_df["MI Difference"] = (
max_mi_diff - mi_diff_df["MI Difference"]
) # Calculate the the weights
return mi_diff_df, entropy_df
if self.invert:
max_nmi_diff = nmi_df["MI Difference"].max()
nmi_df["MI Difference"] = (
max_nmi_diff - nmi_df["MI Difference"]
)
return nmi_df, entropy_df

def select_n_components(data: pd.DataFrame, max_components: int = 10) -> int:
"""Select the optimal number of GMM components using BIC
Expand Down Expand Up @@ -123,7 +179,7 @@ def NMI_calcs_with_GMM(self) -> pd.DataFrame:
"""Nornmalized Mutual Information and Entropy calculation for all residue pairs using Gaussian Mixture Models (GMM) for binning.

Returns:
mi_diff_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.
nmi_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values.
"""
Expand Down Expand Up @@ -168,13 +224,12 @@ def NMI_calcs_with_GMM(self) -> pd.DataFrame:
progress_bar.update(1)

entropy_df = pd.DataFrame(entropys.items(), columns=["Residue", "Entropy"])
mi_diff_df = pd.DataFrame(
nmi_df = pd.DataFrame(
normalized_mutual_info.items(), columns=["Residue Pair", "MI Difference"]
)

max_mi_diff = mi_diff_df["MI Difference"].max()
mi_diff_df["MI Difference"] = (
max_mi_diff - mi_diff_df["MI Difference"]
) # Calculate the weights

return mi_diff_df, entropy_df
if self.invert:
max_nmi_diff = nmi_df["MI Difference"].max()
nmi_df["MI Difference"] = (
max_nmi_diff - nmi_df["MI Difference"]
)
return nmi_df, entropy_df
Loading
Loading