parisepigenetics · cbouyio · Jan 26, 2024 · Jan 22, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ obsolete_/
 __pycache__/
 *.py[cod]
 *$py.class
+.vscode
 
 # C extensions
 *.so

diff --git a/great3D b/great3D
@@ -7,12 +7,14 @@ Take a 3D gene positions file and a gene expression matrix, compute spatial corr
 
 import argparse
 import pandas as pd
-import dash
-from dash import dcc, html, dash_table
+import plotly.graph_objects as go
+from dash import Dash, dcc, html, dash_table
+
 
 # Import package specific py_modules
 import great_3d.genome_3D_integration as grtGI
-import great_3d.transcriptome_3d as gtr3d
+import great_3d.base_calulations as greatCalc
+import great_3d.plotly_graphs as greatPlot
 
 
 # Command line arguments parsing
@@ -53,63 +55,96 @@ parser.add_argument("-n", "--nb-genes",
                     default=10,
                     dest="nGenes",
                     metavar="noGenes")
-parser.add_argument("-r", "--correlation-method",
+parser.add_argument("-m", "--correlation-method",
                     help="Define correlation method. (DEFAULT = 'pearson')",
                     type=str,
                     default="pearson",
                     dest="mCorr",
                     metavar="correlation_method")
+parser.add_argument("-r", "--resolution",
+                    help="The Hi-C file bins size (resolution). (DEFAULT = 5000)",
+                    type=int,
+                    default=5000,
+                    dest="resolution",
+                    metavar="hicResolution")
 parser.add_argument("-t", "--title",
                     help="Title for the 3D plotly praph. (DEFAULT = '')",
                     type=str,
                     default="",
                     dest="title",
                     metavar="title")
 
-
 # Generate the options-arguments object
 args = parser.parse_args()
-# Calculate the gene 3D coordinates
+# Calculate if needed the gene 3D coordinates
 if args.genePos:
     genePos = pd.read_table(args.genePos)
 elif args.geneStarts:
-    genePos = grtGI.compute_gene_coordinates(args.geneStarts, args.genome)
+    genePos = grtGI.compute_gene_coordinates(args.geneStarts, args.genome, args.resolution)
 else:
-    raise IOError("One of --gene-3Dcoords or --gene-starts arguments MUST be declared.")
-
+    raise IOError("One of --gene-3Dcoords or --gene-starts arguments must be declared.")
+
+
+def visualise_3D(traces):
+    """Render figure layout for Plotly.
+    - Args:
+    - `traces`: The list of traces.
+    - Return:
+    - The Plotly figure.
+    """
+    # Set layout elements, size, margins, legend(s)
+    layout = go.Layout(plot_bgcolor="#FFF",
+                       autosize=False,
+                       width=1600,
+                       height=1200,
+                       margin=dict(l=1, r=1, b=1, t=50),
+                       legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.1),
+                       modebar={"orientation": "h", "bgcolor": "salmon", "color": "white", "activecolor": "#9ED3CD"},
+                       hoverlabel=dict(bgcolor='rgba(255,255,255,0.7)', font=dict(color='black')))
+    # Construct the figure
+    fig = go.Figure(data=traces, layout=layout)
+    # Render axes invisible - cleaner graph.
+    fig.update_scenes(xaxis_visible=False,
+                      yaxis_visible=False,
+                      zaxis_visible=False)
+    return fig
+
+
+# Calculations:
 # Calculate the distance matrix
-distanceMatrix = gtr3d.calculate_distance(genePos)
-
+distanceMatrix = greatCalc.calculate_distance(genePos)
 # Populate the dictionary of sorted distances.
-sortedDict = gtr3d.sorting_distances(distanceMatrix)
-
+sortedDict = greatCalc.sorting_distances(distanceMatrix)
 # Compute a dictionnary of the sum of correlations between each gene and its neighbours.
-sum_Corr = gtr3d.sum_correlation(sortedDict, args.geneExpr, args.nGenes, args.mCorr)
-
-# Load and visualise genome
-traceGenome = gtr3d.generate_genome_3D(args.genome, genePos, sum_Corr, args.userGenes)
+sum_Corr = greatCalc.sum_correlation(sortedDict, args.geneExpr, args.nGenes, args.mCorr)
 
+# Visualisations:
+# Compute all visualisation tracks (traces)
+traceGenome = greatPlot.visualise_genome_3D(args.genome)
+traceGenes = greatPlot.visualise_genes(genePos)
+# genePos, sum_Corr, args.userGenes)
 # 3D visualization with Plotly
-fig = gtr3d.visualise_3D_plotly(traceGenome)
+fig = visualise_3D([traceGenome, traceGenes])
 
-# FIXME remove that later perhaps
+
+# FIXME just to practive tables visualisation remove that later perhaps
 args.geneExpr.seek(0)
 with args.geneExpr as fh:
     df = pd.read_table(fh)
 
-# Simple Dash
+# Dash application
 # TODO Enrich the Dash application with, controls (radio buttons, sliders, etc.), input files and progress bars!!!
-app = dash.Dash("GREAT3D")
+app = Dash("GREAT3D")
 app.layout = html.Div([
     html.H1(children='GREAT 3D transcriptome map'),
     html.Hr(),
     html.H3(children=args.title),
     dcc.Graph(id='dash_GREAT', figure=fig),
-    html.Hr(),
-    html.H3(children="Expression Data"),
-    dash_table.DataTable(data=df.to_dict('records'), page_size=10)
+#    html.Hr(),
+#    # Just to practice with tables visualisation
+#    html.H3(children="Expression Data"),
+#    dash_table.DataTable(data=df.to_dict('records'), page_size=10)
 ])
 
 if __name__ == '__main__':
-    #app.run(debug=True)
-    app.run()
+    app.run()  # (debug=True)
diff --git a/great_3d/__init__.py b/great_3d/__init__.py
@@ -1,8 +1,7 @@
 """The GREAT 3D suite for integrative multi-omics 3D epigenomics."""
 
 # Add imports here
-from .transcriptome_3d import *
-from .great_pdb import *
+import time
 
 # Handle versioneer
 from ._version import get_versions
@@ -14,3 +13,17 @@
 from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
+
+
+def timing(f):
+    """Wrapper to time functions.py.
+
+    Works as a decorator. Taken from https://stackoverflow.com/questions/5478351/python-time-measure-function
+    """
+    def wrap(*args):
+        time1 = time.time()
+        ret = f(*args)
+        time2 = time.time()
+        print("{:s} function took {:.3f} ms".format(f.__name__, (time2 - time1) * 1000.0))
+        return ret
+    return wrap
diff --git a/great_3d/base_calulations.py b/great_3d/base_calulations.py
@@ -0,0 +1,85 @@
+"""Module with the main classes/functions for the base calculations of the 3D
+analysis of transcriptoms."""
+
+import numpy as np
+import pandas as pd
+from scipy import spatial
+
+# import pprint  # for testing only!
+from great_3d import timing
+
+
+@timing
+def calculate_distance(df, distance_metric="seuclidean"):
+    """Take a genes position dataFrame, return the distance matrix as a pandas data frame.
+    - Args:
+    - `df`: The genes position dataFrame.
+    - `distance_metric`: The numpy distance metric to use (default: "seuclidean").
+
+    - Return:
+    - The distance matrix as a pandas data frame.
+    """
+    geneNames = df.index
+    da = df[["X", "Y", "Z"]].to_numpy()
+    ndarray = spatial.distance.pdist(da, distance_metric, V=None)  # Look at other available implementations from numpy
+    matrix_uni = spatial.distance.squareform(ndarray)
+    return pd.DataFrame(matrix_uni, index=geneNames, columns=geneNames)
+
+
+@timing
+def sorting_distances(dist_df):
+    """Take a genes distance matrix (Pandas DataFrame), return a dictionary of
+    gene names sorted by distance with each gene as a key.
+    - Args:
+    - `dist_df`: The genes distance matrix (Pandas DataFrame).
+
+    - Return:
+    - A dictionary of gene names sorted by distance.
+    """
+    sorted_indices = np.argsort(dist_df.values, axis=1)
+    sorted_distances = np.sort(dist_df.values, axis=1)
+    gene_names = dist_df.index
+    return {
+        gene_names[i]: pd.Series(sorted_distances[i], index=gene_names[sorted_indices[i]])
+        for i in range(len(gene_names))
+    }
+    # The above implementation might be more complicated but it is 2x times faster
+    # return {
+    #     gene_name: dist_df.loc[gene_name, :].sort_values()
+    #     for gene_name in dist_df.index
+    # }
+
+
+@timing
+def sum_correlation(dists_sorted, ge_file, no_genes, correlation_type):
+    """Take the dictionnary of the closest genes, the gene expression file, the
+    number of genes we need to compute correlation and the correlation type.
+
+    - Args:
+    - `dists_sorted`: The dictionary of the closest genes.
+    - `ge_file`: The gene expression file.
+    - `no_genes`: The number of genes to compute correlation.
+    - `correlation_type`: The correlation type.
+
+    - Return:
+    - A complex dictionary containing the sum of correlations, the sum of
+    absolute correlations, and the neighboring genes for each gene.
+    """
+    # Read the GE file
+    geDF = pd.read_table(ge_file)
+    # FIXME If a gene has zero expression we give it zero correlation immediately
+    geNumpy = geDF.to_numpy()
+    corrMnp = np.corrcoef(geNumpy)
+    corrMat = pd.DataFrame(corrMnp, index=geDF.index, columns=geDF.index)
+    correlation_sums = {}
+    # Compute correlation matrix once
+    for gene_ref, sorted_genes in dists_sorted.items():
+        # TODO check if we gain time when we paralelise this for loop!
+        selected_genes = list(sorted_genes[1: no_genes+1].index)
+        # Select all proximal correlations from the coorelation matrix.
+        correlation = corrMat.loc[gene_ref, selected_genes]
+        # correlation = np.nan_to_num(correlation)  # Correction in case of NAN values
+        sum_correlationA = sum(abs(correlation))
+        sum_correlation = sum(correlation)
+        correlation_sums[gene_ref] = [sum_correlation, sum_correlationA, selected_genes]
+    return correlation_sums
diff --git a/great_3d/genome_3D_integration.py b/great_3d/genome_3D_integration.py
@@ -4,45 +4,61 @@
 integration to the GREAT framework analysis.
 """
 
+import math
 import pandas as pd
 
 
-def compute_gene_coordinates(pos_file_Genes, pos_file_Genome3D):
-    """Get two tab files, one with the genenames andtheir respective start sites and one with the 3D genome coordinates (as midpoints of fragments of fixed length i.e. resoluton).
-
-    Return a dataFrame with the 3D coordinates of genes.
+def compute_gene_coordinates(start_sites_Genes, pos_file_Genome3D, resolution):
+    """Get two tab files, one with the gene names and their respective start sites and one with the 3D genome coordinates (as midpoints of fragments of fixed length i.e. resolution).
+
+    - Args:
+    - `starts_sites_Genes`: The file path to the gene names and start sites tab file.
+    - `pos_file_Genome3D`: The file path to the 3D genome coordinates tab file.
+    - `resolution`: The fixed length of the fragments used for 3D genome coordinates.
+    - Return:
+    - A DataFrame with the 3D coordinates of genes.
     """
-    # TODO in the first run the closest known midpoint will be considered, however n later versions amore proper geometric clculation of the 3D position of a gene must be considered.
     dfGenome = pd.read_table(pos_file_Genome3D)
-    dfGenes = pd.read_table(pos_file_Genes)
-    pos_file_Genome3D.seek(0, 0)
-    pos_file_Genes.seek(0, 0)
+    dfGenes = pd.read_table(start_sites_Genes)
     # First simple approach, the gene is assigned to its closest bin median point.
     # Check chromosome consistancy
     chromsGenome = list(set(dfGenome.loc[:, "chr"].tolist()))
     chromsGenes = list(set(dfGenes.loc[:, "chr"].tolist()))
     if chromsGenes != chromsGenome:
-        raise Exception("Chromosome consistence between genes and the genome is not met. Check the input files.")
+        raise ValueError("Chromosome inconsistency between genes file and genome file. Check the input files.")
     # Prepare the data frame lists
     names = []
-    chr = []
+    chrs = []
+    starts = []
     Xs = []
     Ys = []
     Zs = []
     # Loop through all chromosomes in the files
     for ch in chromsGenes:
         dfChromGene = dfGenes[dfGenes["chr"] == ch]
         dfChromGenome = dfGenome[dfGenome["chr"] == ch]
-        for row1 in dfChromGene.itertuples():
-            for row2 in dfChromGenome.itertuples():
-                if abs(row1.start - row2.midpoint) < 5000:  # FIXME THIS MUST NOT BE HARDCODED!!!
+        for gene in dfChromGene.itertuples():
+            for point in dfChromGenome.itertuples():
+                if abs(gene.start - point.midpoint) < resolution:
+                    pointBefor = point
+                    pointAfter = dfChromGenome[dfChromGenome["midpoint"] == point.midpoint + resolution]
                     break
-            names.append(row1.Index)
-            chr.append(ch)
-            Xs.append(row2.X)
-            Ys.append(row2.Y)
-            Zs.append(row2.Z)
+            xb, yb, zb = pointBefor.X, pointBefor.Y, pointBefor.Z
+            xa, ya, za = pointAfter.iloc[0].X, pointAfter.iloc[0].Y, pointAfter.iloc[0].Z
+            distance = math.sqrt((xa - xb)**2 + (ya - yb)**2 + (za - zb)**2)
+            a_bNorm = [(xa - xb) / distance, (ya - yb) / distance, (za - zb) / distance]
+            geneCoef = ((gene.start - point.midpoint) / resolution) * distance
+            geneExtr = [e * geneCoef for e in a_bNorm]
+            names.append(gene.Index)
+            chrs.append(ch)
+            starts.append(gene.start)
+            Xs.append(point.X + geneExtr[0])
+            Ys.append(point.Y + geneExtr[1])
+            Zs.append(point.Z + geneExtr[2])
     # Generate the new data frame and reurn it
-    df = pd.DataFrame({"chr": chr, "X": Xs, "Y": Ys, "Z": Zs})
+    df = pd.DataFrame({"chr": chr, "start": starts, "X": Xs, "Y": Ys, "Z": Zs})
     df.index = names
+    # Reset the files as they needed for later computations
+    pos_file_Genome3D.seek(0)
+    start_sites_Genes.seek(0)
     return df
diff --git a/great_3d/great_pdb.py b/great_3d/great_pdb.py
@@ -1,4 +1,5 @@
-"""Module with the main classes/functions to interface PDB formt with GREAT."""
+"""Module to interface classes/functions of genome 3D PDB with GREAT."""
+
 
 import numpy as np
 import pandas as pd
@@ -9,25 +10,15 @@
 # Bottleneck can speed up SOME functions on numpy arrays https://bottleneck.readthedocs.io/en/latest/intro.html
 import bottleneck
 
-import time
+from great_3d import timing
 import pprint  # for testing only!
 
-def timing(f):
-    """Wrapper to time functions.py
-
-    Works as a decorator. Taken from https://stackoverflow.com/questions/5478351/python-time-measure-function
-    """
-    def wrap(*args):
-        time1 = time.time()
-        ret = f(*args)
-        time2 = time.time()
-        print("{:s} function took {:.3f} ms".format(f.__name__, (time2 - time1) * 1000.0))
-        return ret
-    return wrap
-
 
+@timing
 # TODO under developemnt for parsing (IFF REQUIRED) .pdb genome structure files.
-# parser = PDB.PDBParser()
+# TODO create a PDB genome class perhaps
+def parse_PDB():
+    parser = PDB.PDBParser()
 # io = PDB.PDBIO()
 # struct = parser.get_structure("chr3", "chr3.pdb")
 #