Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mdpath dev #69

Merged
merged 16 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 57 additions & 3 deletions mdpath/mdpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def main():

-graphdist: Default distance for residues making up the graph (default: 5.0)

-digamma_correction: Use digamma correction for entropy calculation (default: False)

-numpath: Default number of top paths considered for clustering (default: 500)
"""
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -125,6 +127,14 @@ def main():
default=500,
)

parser.add_argument(
"-digamma_correction",
dest="digamma_correction",
help="Use digamma correction for entropy calculation.",
required=False,
default=False,
)

parser.add_argument(
"-GMM",
dest="GMM",
Expand All @@ -133,6 +143,22 @@ def main():
default=False,
)

parser.add_argument(
"-chain",
dest="chain",
help="Chain of the protein to be analyzed in the topology file. CAUTION: only one chain can be selected for analysis.",
required=False,
default=False,
)

parser.add_argument(
"-invert",
dest="invert",
help="Inverts NMI bei subtrackting each NMI from max NMI. Can be used to find Paths, that are the least correlated",
required=False,
default=False,
)

args = parser.parse_args()
if not args.topology or not args.trajectory:
print("Both trajectory and topology files are required!")
Expand All @@ -148,14 +174,40 @@ def main():
closedist = float(args.closedist)
graphdist = float(args.graphdist)
numpath = int(args.numpath)
digamma_correction = bool(args.digamma_correction)
GMM = bool(args.GMM)
invert = bool(args.invert)

# Prepare the trajectory for analysis
if os.path.exists("first_frame.pdb"):
os.remove("first_frame.pdb")
with mda.Writer("first_frame.pdb", multiframe=False) as pdb:
traj.trajectory[0]
pdb.write(traj.atoms)

# Chain selection
if args.chain:
chain = str(args.chain)
chain_atoms = traj.select_atoms(f"chainID {chain}")
if len(chain_atoms) == 0:
raise ValueError(f"No atoms found for chain {chain}")
exit()
chain_universe = mda.Merge(chain_atoms)
# Write new topology
chain_universe.atoms.write("selected_chain.pdb")

# Write trajectory
with mda.Writer("selected_chain.dcd", chain_atoms.n_atoms) as W:
for ts in traj.trajectory:
chain_universe.atoms.positions = chain_atoms.positions
W.write(chain_universe.atoms)
topology = "selected_chain.pdb"
trajectory = "selected_chain.dcd"
traj = mda.Universe(topology, trajectory)
print(
f"Chain {chain} selected and written to selected_chain.pdb and selected_chain.dcd and will now be analyzed."
)

structure_calc = StructureCalculations(topology)
df_distant_residues = structure_calc.calculate_residue_suroundings(fardist, "far")
df_close_res = structure_calc.calculate_residue_suroundings(closedist, "close")
Expand All @@ -171,11 +223,13 @@ def main():
print("\033[1mTrajectory is processed and ready for analysis.\033[0m")

# Calculate the mutual information and build the graph
nmi_calc = NMICalculator(df_all_residues, GMM=GMM)
nmi_calc = NMICalculator(
df_all_residues, digamma_correction=digamma_correction, GMM=GMM, invert=invert
)
nmi_calc.entropy_df.to_csv("entropy_df.csv", index=False)
nmi_calc.mi_diff_df.to_csv("mi_diff_df.csv", index=False)
nmi_calc.nmi_df.to_csv("nmi_df.csv", index=False)
graph_builder = GraphBuilder(
topology, structure_calc.last_res_num, nmi_calc.mi_diff_df, graphdist
topology, structure_calc.last_res_num, nmi_calc.nmi_df, graphdist
)
MDPathVisualize.visualise_graph(
graph_builder.graph
Expand Down
31 changes: 31 additions & 0 deletions mdpath/mdpath_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,34 @@ def gpcr_2D_vis():
"Topology and cluster pathways are required for creating a 2D visualization of GPCR paths."
)
exit(1)


def spline():
    """CLI entry point: build 3D spline meshes of paths through a protein.

    Reads the precomputed cluster-paths JSON file
    ("quick_precomputed_clusters_paths") produced by a previous MDPath
    analysis and recomputes the paths with accurate spline interpolation.
    The resulting mesh files can be imported directly into Blender.

    Command-line inputs:
        -json (str): Json file of the MDPath analysis -> "quick_precomputed_clusters_paths"

    Example usage:
        $ mdpath_spline -json <path_to_json>
    """
    cli = argparse.ArgumentParser(
        prog="mdpath_spline",
        description="Create a 3D Spline-Visualization of Paths through a protein.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    cli.add_argument(
        "-json",
        dest="json",
        required=True,
        help="quick_precomputed_clusters_paths file of your MDPath analysis",
    )

    parsed = cli.parse_args()
    # Delegate all mesh generation to the visualization helper.
    MDPathVisualize.create_splines(parsed.json)
2 changes: 1 addition & 1 deletion mdpath/src/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def process_bootstrap_sample(
"""
bootstrap_sample = self.create_bootstrap_sample(self.df_all_residues)
nmi_calculator = NMICalculator(bootstrap_sample, num_bins=self.num_bins)
bootstrap_mi_diff = nmi_calculator.mi_diff_df
bootstrap_mi_diff = nmi_calculator.nmi_df
graph = GraphBuilder(
self.pdb, self.last_residue, bootstrap_mi_diff, self.graphdist
)
Expand Down
153 changes: 104 additions & 49 deletions mdpath/src/mutual_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.metrics import mutual_info_score
from sklearn.mixture import GaussianMixture
from scipy.stats import entropy
from scipy.special import digamma


class NMICalculator:
Expand All @@ -29,30 +30,42 @@ class NMICalculator:

GMM (optional): Option to switch between histogram method and Gaussian Mixture Model for binning before NMI calculation. Default is False.

mi_diff_df (pd.DataFrame): DataFrame containing the mutual information differences. Is calculated using either GMM or histogram method.
nmi_df (pd.DataFrame): DataFrame containing the mutual information differences. Is calculated using either GMM or histogram method.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values. Is calculated using either GMM or histogram method.
"""

def __init__(
self, df_all_residues: pd.DataFrame, num_bins: int = 35, GMM=False
self,
df_all_residues: pd.DataFrame,
digamma_correction=False,
num_bins: int = 35,
GMM=False,
invert=False,
) -> None:
self.df_all_residues = df_all_residues
self.num_bins = num_bins
self.digamma_correction = digamma_correction
self.GMM = GMM
self.invert = invert
if GMM:
self.mi_diff_df, self.entropy_df = self.NMI_calcs_with_GMM()
self.nmi_df, self.entropy_df = self.NMI_calcs_with_GMM()
else:
self.mi_diff_df, self.entropy_df = self.NMI_calcs()

def NMI_calcs(self) -> pd.DataFrame:
"""Nornmalized Mutual Information and Entropy calculation for all residue pairs.

Returns:
mi_diff_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values.
"""
self.nmi_df, self.entropy_df = self.NMI_calcs()

def calculate_corrected_entropy(self, hist, total_points, num_bins):
    """Estimate the entropy of a binned distribution with finite-sample corrections.

    Combines the plug-in (maximum-likelihood) entropy of the normalized
    histogram with two bias-correction terms: the Miller-Madow term
    ``(num_bins - 1) / (2 * total_points)`` and a digamma-based term.

    Args:
        hist: 1-D array of integer bin counts.
        total_points: Total number of samples summed over the histogram.
        num_bins: Number of bins used to construct ``hist``.

    Returns:
        float: Corrected entropy estimate in natural-log units.
    """
    probs = hist / total_points
    nonzero = probs[probs > 0]
    # Plug-in estimator over occupied bins only (0 * log 0 := 0).
    plugin_entropy = -np.sum(nonzero * np.log(nonzero))
    miller_madow = (num_bins - 1) / (2 * total_points)
    digamma_term = 1 / total_points * np.sum(
        hist * digamma(hist + 1)
    ) - digamma(total_points)
    return plugin_entropy + miller_madow + digamma_term

def NMI_calcs(self):
"""Extended Normalized Mutual Information and Entropy calculation."""
entropys = {}
normalized_mutual_info = {}
total_iterations = len(self.df_all_residues.columns) ** 2
Expand All @@ -63,36 +76,79 @@ def NMI_calcs(self) -> pd.DataFrame:
for col1 in self.df_all_residues.columns:
for col2 in self.df_all_residues.columns:
if col1 != col2:
hist_col1, _ = np.histogram(
self.df_all_residues[col1], bins=self.num_bins
)
hist_col2, _ = np.histogram(
self.df_all_residues[col2], bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
self.df_all_residues[col1],
self.df_all_residues[col2],
bins=self.num_bins,
)
mi = mutual_info_score(
hist_col1, hist_col2, contingency=hist_joint
)
entropy_col1 = entropy(hist_col1)
entropy_col2 = entropy(hist_col2)
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi
progress_bar.update(1)
if self.digamma_correction:
# Adaptive binning
data_col1 = self.df_all_residues[col1].values
data_col2 = self.df_all_residues[col2].values
hist_col1, bin_edges1 = np.histogram(
data_col1, bins=self.num_bins
)
hist_col2, bin_edges2 = np.histogram(
data_col2, bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
data_col1,
data_col2,
bins=(self.num_bins, self.num_bins),
)

# Total data points
total_points = len(data_col1)

# Corrected entropy estimates
entropy_col1 = self.calculate_corrected_entropy(
hist_col1, total_points, self.num_bins
)
entropy_col2 = self.calculate_corrected_entropy(
hist_col2, total_points, self.num_bins
)
joint_entropy = self.calculate_corrected_entropy(
hist_joint.flatten(), total_points, self.num_bins**2
)

# Mutual Information
mi = entropy_col1 + entropy_col2 - joint_entropy
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2

# Normalized MI
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi

progress_bar.update(1)
else:
hist_col1, _ = np.histogram(
self.df_all_residues[col1], bins=self.num_bins
)
hist_col2, _ = np.histogram(
self.df_all_residues[col2], bins=self.num_bins
)
hist_joint, _, _ = np.histogram2d(
self.df_all_residues[col1],
self.df_all_residues[col2],
bins=self.num_bins,
)
mi = mutual_info_score(
hist_col1, hist_col2, contingency=hist_joint
)
entropy_col1 = entropy(hist_col1)
entropy_col2 = entropy(hist_col2)
entropys[col1] = entropy_col1
entropys[col2] = entropy_col2
nmi = mi / np.sqrt(entropy_col1 * entropy_col2)
normalized_mutual_info[(col1, col2)] = nmi
progress_bar.update(1)

entropy_df = pd.DataFrame(entropys.items(), columns=["Residue", "Entropy"])
mi_diff_df = pd.DataFrame(
nmi_df = pd.DataFrame(
normalized_mutual_info.items(), columns=["Residue Pair", "MI Difference"]
)
max_mi_diff = mi_diff_df["MI Difference"].max()
mi_diff_df["MI Difference"] = (
max_mi_diff - mi_diff_df["MI Difference"]
) # Calculate the the weights
return mi_diff_df, entropy_df
if self.invert:
max_nmi_diff = nmi_df["MI Difference"].max()
nmi_df["MI Difference"] = (
max_nmi_diff - nmi_df["MI Difference"]
)
return nmi_df, entropy_df

def select_n_components(data: pd.DataFrame, max_components: int = 10) -> int:
"""Select the optimal number of GMM components using BIC
Expand Down Expand Up @@ -123,7 +179,7 @@ def NMI_calcs_with_GMM(self) -> pd.DataFrame:
"""Nornmalized Mutual Information and Entropy calculation for all residue pairs using Gaussian Mixture Models (GMM) for binning.

Returns:
mi_diff_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.
nmi_df (pd.DataFrame): Pandas dataframe with residue pair and mutual information difference.

entropy_df (pd.DataFrame): Pandas dataframe with residue and entropy values.
"""
Expand Down Expand Up @@ -168,13 +224,12 @@ def NMI_calcs_with_GMM(self) -> pd.DataFrame:
progress_bar.update(1)

entropy_df = pd.DataFrame(entropys.items(), columns=["Residue", "Entropy"])
mi_diff_df = pd.DataFrame(
nmi_df = pd.DataFrame(
normalized_mutual_info.items(), columns=["Residue Pair", "MI Difference"]
)

max_mi_diff = mi_diff_df["MI Difference"].max()
mi_diff_df["MI Difference"] = (
max_mi_diff - mi_diff_df["MI Difference"]
) # Calculate the weights

return mi_diff_df, entropy_df
if self.invert:
max_nmi_diff = nmi_df["MI Difference"].max()
nmi_df["MI Difference"] = (
max_nmi_diff - nmi_df["MI Difference"]
)
return nmi_df, entropy_df
Loading
Loading