Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev restruct opt #22

Merged
merged 22 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
df17d5b
Correct some whitespaces.
cbouyio Jan 22, 2024
377d3c4
Remove uncessecary impots to the main and add thw timing decorator.
cbouyio Jan 22, 2024
f80a317
Change the imports for the new code break into two major great3d parts.
cbouyio Jan 22, 2024
b3e4d67
Remove the old monolithic file of transcriptome3d.
cbouyio Jan 22, 2024
79715ff
Introduce the new package structure with two seperate modules to cond…
cbouyio Jan 22, 2024
2cf58ad
Implementation of new correlation calculation function. 3times more e…
cbouyio Jan 22, 2024
edb98aa
Some syntactic fixes, comments, imports, etc.
cbouyio Jan 23, 2024
703beba
Some major performace improvements, fix imports, refactor functions. …
cbouyio Jan 23, 2024
a5e6a96
Remove canvas call at the end of the plotly graphs module.
cbouyio Jan 25, 2024
5ff9ce9
Code rearrangement to better handle tasks between the major app and t…
cbouyio Jan 25, 2024
1dbed66
Some prelim fixes on GREAT PDB under development, package. Not any re…
cbouyio Jan 25, 2024
7491ca8
Remove some extraneous seek() commands and remove a hardcoded varianb…
cbouyio Jan 25, 2024
8cb406b
Fix some typos on the integration file.
cbouyio Jan 25, 2024
5ce9071
Add vscode configuration file into gitignore.
cbouyio Jan 25, 2024
25ed692
Add an unimplemented function for parsing PDB files.
cbouyio Jan 25, 2024
7f9be07
Re-include the reseting of genome and positions files (seek) as they …
cbouyio Jan 25, 2024
b3ec4de
Fix some docstrings on the base_calc module.
cbouyio Jan 25, 2024
d944c64
First working implementation of the modular approach for plotting and…
cbouyio Jan 25, 2024
63ebd5c
Proper implementation of gene posiytions file based on geometric reco…
cbouyio Jan 25, 2024
82ddf62
Implementation of the visualise genes function with the right gene st…
cbouyio Jan 25, 2024
c0497a0
Add the genes visualisation on the main program and include a new com…
cbouyio Jan 25, 2024
8b5da68
Merge pull request #21 from parisepigenetics/dev-gene-positions
cbouyio Jan 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ obsolete_/
__pycache__/
*.py[cod]
*$py.class
.vscode

# C extensions
*.so
Expand Down
87 changes: 61 additions & 26 deletions great3D
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ Take a 3D gene positions file and a gene expression matrix, compute spatial corr

import argparse
import pandas as pd
import dash
from dash import dcc, html, dash_table
import plotly.graph_objects as go
from dash import Dash, dcc, html, dash_table


# Import package specific py_modules
import great_3d.genome_3D_integration as grtGI
import great_3d.transcriptome_3d as gtr3d
import great_3d.base_calulations as greatCalc
import great_3d.plotly_graphs as greatPlot


# Command line arguments parsing
Expand Down Expand Up @@ -53,63 +55,96 @@ parser.add_argument("-n", "--nb-genes",
default=10,
dest="nGenes",
metavar="noGenes")
parser.add_argument("-r", "--correlation-method",
parser.add_argument("-m", "--correlation-method",
help="Define correlation method. (DEFAULT = 'pearson')",
type=str,
default="pearson",
dest="mCorr",
metavar="correlation_method")
parser.add_argument("-r", "--resolution",
help="The Hi-C file bins size (resolution). (DEFAULT = 5000)",
type=int,
default=5000,
dest="resolution",
metavar="hicResolution")
parser.add_argument("-t", "--title",
help="Title for the 3D plotly praph. (DEFAULT = '')",
type=str,
default="",
dest="title",
metavar="title")


# Generate the options-arguments object
args = parser.parse_args()
# Calculate the gene 3D coordinates
# Calculate if needed the gene 3D coordinates
if args.genePos:
genePos = pd.read_table(args.genePos)
elif args.geneStarts:
genePos = grtGI.compute_gene_coordinates(args.geneStarts, args.genome)
genePos = grtGI.compute_gene_coordinates(args.geneStarts, args.genome, args.resolution)
else:
raise IOError("One of --gene-3Dcoords or --gene-starts arguments MUST be declared.")

raise IOError("One of --gene-3Dcoords or --gene-starts arguments must be declared.")


def visualise_3D(traces):
"""Render figure layout for Plotly.
- Args:
- `traces`: The list of traces.
- Return:
- The Plotly figure.
"""
# Set layout elements, size, margins, legend(s)
layout = go.Layout(plot_bgcolor="#FFF",
autosize=False,
width=1600,
height=1200,
margin=dict(l=1, r=1, b=1, t=50),
legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.1),
modebar={"orientation": "h", "bgcolor": "salmon", "color": "white", "activecolor": "#9ED3CD"},
hoverlabel=dict(bgcolor='rgba(255,255,255,0.7)', font=dict(color='black')))
# Construct the figure
fig = go.Figure(data=traces, layout=layout)
# Render axes invisible - cleaner graph.
fig.update_scenes(xaxis_visible=False,
yaxis_visible=False,
zaxis_visible=False)
return fig


# Calculations:
# Calculate the distance matrix
distanceMatrix = gtr3d.calculate_distance(genePos)

distanceMatrix = greatCalc.calculate_distance(genePos)
# Populate the dictionary of sorted distances.
sortedDict = gtr3d.sorting_distances(distanceMatrix)

sortedDict = greatCalc.sorting_distances(distanceMatrix)
# Compute a dictionnary of the sum of correlations between each gene and its neighbours.
sum_Corr = gtr3d.sum_correlation(sortedDict, args.geneExpr, args.nGenes, args.mCorr)

# Load and visualise genome
traceGenome = gtr3d.generate_genome_3D(args.genome, genePos, sum_Corr, args.userGenes)
sum_Corr = greatCalc.sum_correlation(sortedDict, args.geneExpr, args.nGenes, args.mCorr)

# Visualisations:
# Compute all visualisation tracks (traces)
traceGenome = greatPlot.visualise_genome_3D(args.genome)
traceGenes = greatPlot.visualise_genes(genePos)
# genePos, sum_Corr, args.userGenes)
# 3D visualization with Plotly
fig = gtr3d.visualise_3D_plotly(traceGenome)
fig = visualise_3D([traceGenome, traceGenes])

# FIXME remove that later perhaps

# FIXME just to practive tables visualisation remove that later perhaps
args.geneExpr.seek(0)
with args.geneExpr as fh:
df = pd.read_table(fh)

# Simple Dash
# Dash application
# TODO Enrich the Dash application with, controls (radio buttons, sliders, etc.), input files and progress bars!!!
app = dash.Dash("GREAT3D")
app = Dash("GREAT3D")
app.layout = html.Div([
html.H1(children='GREAT 3D transcriptome map'),
html.Hr(),
html.H3(children=args.title),
dcc.Graph(id='dash_GREAT', figure=fig),
html.Hr(),
html.H3(children="Expression Data"),
dash_table.DataTable(data=df.to_dict('records'), page_size=10)
# html.Hr(),
# # Just to practice with tables visualisation
# html.H3(children="Expression Data"),
# dash_table.DataTable(data=df.to_dict('records'), page_size=10)
])

if __name__ == '__main__':
#app.run(debug=True)
app.run()
app.run() # (debug=True)
17 changes: 15 additions & 2 deletions great_3d/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""The GREAT 3D suite for integrative multi-omics 3D epigenomics."""

# Add imports here
from .transcriptome_3d import *
from .great_pdb import *
import time

# Handle versioneer
from ._version import get_versions
Expand All @@ -14,3 +13,17 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions


def timing(f):
"""Wrapper to time functions.py.

Works as a decorator. Taken from https://stackoverflow.com/questions/5478351/python-time-measure-function
"""
def wrap(*args):
time1 = time.time()
ret = f(*args)
time2 = time.time()
print("{:s} function took {:.3f} ms".format(f.__name__, (time2 - time1) * 1000.0))
return ret
return wrap
85 changes: 85 additions & 0 deletions great_3d/base_calulations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Module with the main classes/functions for the base calculations of the 3D
analysis of transcriptoms."""

import numpy as np
import pandas as pd
from scipy import spatial

# import pprint # for testing only!
from great_3d import timing


@timing
def calculate_distance(df, distance_metric="seuclidean"):
"""Take a genes position dataFrame, return the distance matrix as a pandas data frame.
- Args:
- `df`: The genes position dataFrame.
- `distance_metric`: The numpy distance metric to use (default: "seuclidean").

- Return:
- The distance matrix as a pandas data frame.
"""
geneNames = df.index
da = df[["X", "Y", "Z"]].to_numpy()
ndarray = spatial.distance.pdist(da, distance_metric, V=None) # Look at other available implementations from numpy
matrix_uni = spatial.distance.squareform(ndarray)
return pd.DataFrame(matrix_uni, index=geneNames, columns=geneNames)


@timing
def sorting_distances(dist_df):
"""Take a genes distance matrix (Pandas DataFrame), return a dictionary of
gene names sorted by distance with each gene as a key.
- Args:
- `dist_df`: The genes distance matrix (Pandas DataFrame).

- Return:
- A dictionary of gene names sorted by distance.
"""
sorted_indices = np.argsort(dist_df.values, axis=1)
sorted_distances = np.sort(dist_df.values, axis=1)
gene_names = dist_df.index
return {
gene_names[i]: pd.Series(sorted_distances[i], index=gene_names[sorted_indices[i]])
for i in range(len(gene_names))
}
# The above implementation might be more complicated but it is 2x times faster
# return {
# gene_name: dist_df.loc[gene_name, :].sort_values()
# for gene_name in dist_df.index
# }


@timing
def sum_correlation(dists_sorted, ge_file, no_genes, correlation_type):
"""Take the dictionnary of the closest genes, the gene expression file, the
number of genes we need to compute correlation and the correlation type.

- Args:
- `dists_sorted`: The dictionary of the closest genes.
- `ge_file`: The gene expression file.
- `no_genes`: The number of genes to compute correlation.
- `correlation_type`: The correlation type.

- Return:
- A complex dictionary containing the sum of correlations, the sum of
absolute correlations, and the neighboring genes for each gene.
"""
# Read the GE file
geDF = pd.read_table(ge_file)
# FIXME If a gene has zero expression we give it zero correlation immediately
geNumpy = geDF.to_numpy()
corrMnp = np.corrcoef(geNumpy)
corrMat = pd.DataFrame(corrMnp, index=geDF.index, columns=geDF.index)
correlation_sums = {}
# Compute correlation matrix once
for gene_ref, sorted_genes in dists_sorted.items():
# TODO check if we gain time when we paralelise this for loop!
selected_genes = list(sorted_genes[1: no_genes+1].index)
# Select all proximal correlations from the coorelation matrix.
correlation = corrMat.loc[gene_ref, selected_genes]
# correlation = np.nan_to_num(correlation) # Correction in case of NAN values
sum_correlationA = sum(abs(correlation))
sum_correlation = sum(correlation)
correlation_sums[gene_ref] = [sum_correlation, sum_correlationA, selected_genes]
return correlation_sums
54 changes: 35 additions & 19 deletions great_3d/genome_3D_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,61 @@
integration to the GREAT framework analysis.
"""

import math
import pandas as pd


def compute_gene_coordinates(pos_file_Genes, pos_file_Genome3D):
"""Get two tab files, one with the genenames andtheir respective start sites and one with the 3D genome coordinates (as midpoints of fragments of fixed length i.e. resoluton).

Return a dataFrame with the 3D coordinates of genes.
def compute_gene_coordinates(start_sites_Genes, pos_file_Genome3D, resolution):
"""Get two tab files, one with the gene names and their respective start sites and one with the 3D genome coordinates (as midpoints of fragments of fixed length i.e. resolution).

- Args:
- `starts_sites_Genes`: The file path to the gene names and start sites tab file.
- `pos_file_Genome3D`: The file path to the 3D genome coordinates tab file.
- `resolution`: The fixed length of the fragments used for 3D genome coordinates.
- Return:
- A DataFrame with the 3D coordinates of genes.
"""
# TODO in the first run the closest known midpoint will be considered, however n later versions amore proper geometric clculation of the 3D position of a gene must be considered.
dfGenome = pd.read_table(pos_file_Genome3D)
dfGenes = pd.read_table(pos_file_Genes)
pos_file_Genome3D.seek(0, 0)
pos_file_Genes.seek(0, 0)
dfGenes = pd.read_table(start_sites_Genes)
# First simple approach, the gene is assigned to its closest bin median point.
# Check chromosome consistancy
chromsGenome = list(set(dfGenome.loc[:, "chr"].tolist()))
chromsGenes = list(set(dfGenes.loc[:, "chr"].tolist()))
if chromsGenes != chromsGenome:
raise Exception("Chromosome consistence between genes and the genome is not met. Check the input files.")
raise ValueError("Chromosome inconsistency between genes file and genome file. Check the input files.")
# Prepare the data frame lists
names = []
chr = []
chrs = []
starts = []
Xs = []
Ys = []
Zs = []
# Loop through all chromosomes in the files
for ch in chromsGenes:
dfChromGene = dfGenes[dfGenes["chr"] == ch]
dfChromGenome = dfGenome[dfGenome["chr"] == ch]
for row1 in dfChromGene.itertuples():
for row2 in dfChromGenome.itertuples():
if abs(row1.start - row2.midpoint) < 5000: # FIXME THIS MUST NOT BE HARDCODED!!!
for gene in dfChromGene.itertuples():
for point in dfChromGenome.itertuples():
if abs(gene.start - point.midpoint) < resolution:
pointBefor = point
cbouyio marked this conversation as resolved.
Show resolved Hide resolved
pointAfter = dfChromGenome[dfChromGenome["midpoint"] == point.midpoint + resolution]
break
names.append(row1.Index)
chr.append(ch)
Xs.append(row2.X)
Ys.append(row2.Y)
Zs.append(row2.Z)
xb, yb, zb = pointBefor.X, pointBefor.Y, pointBefor.Z
xa, ya, za = pointAfter.iloc[0].X, pointAfter.iloc[0].Y, pointAfter.iloc[0].Z
distance = math.sqrt((xa - xb)**2 + (ya - yb)**2 + (za - zb)**2)
a_bNorm = [(xa - xb) / distance, (ya - yb) / distance, (za - zb) / distance]
geneCoef = ((gene.start - point.midpoint) / resolution) * distance
geneExtr = [e * geneCoef for e in a_bNorm]
names.append(gene.Index)
chrs.append(ch)
starts.append(gene.start)
Xs.append(point.X + geneExtr[0])
Ys.append(point.Y + geneExtr[1])
Zs.append(point.Z + geneExtr[2])
# Generate the new data frame and reurn it
df = pd.DataFrame({"chr": chr, "X": Xs, "Y": Ys, "Z": Zs})
df = pd.DataFrame({"chr": chr, "start": starts, "X": Xs, "Y": Ys, "Z": Zs})
cbouyio marked this conversation as resolved.
Show resolved Hide resolved
df.index = names
# Reset the files as they needed for later computations
pos_file_Genome3D.seek(0)
start_sites_Genes.seek(0)
return df
23 changes: 7 additions & 16 deletions great_3d/great_pdb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Module with the main classes/functions to interface PDB formt with GREAT."""
"""Module to interface classes/functions of genome 3D PDB with GREAT."""


import numpy as np
import pandas as pd
Expand All @@ -9,25 +10,15 @@
# Bottleneck can speed up SOME functions on numpy arrays https://bottleneck.readthedocs.io/en/latest/intro.html
import bottleneck

import time
from great_3d import timing
import pprint # for testing only!

def timing(f):
"""Wrapper to time functions.py

Works as a decorator. Taken from https://stackoverflow.com/questions/5478351/python-time-measure-function
"""
def wrap(*args):
time1 = time.time()
ret = f(*args)
time2 = time.time()
print("{:s} function took {:.3f} ms".format(f.__name__, (time2 - time1) * 1000.0))
return ret
return wrap


@timing
# TODO under developemnt for parsing (IFF REQUIRED) .pdb genome structure files.
# parser = PDB.PDBParser()
# TODO create a PDB genome class perhaps
def parse_PDB():
parser = PDB.PDBParser()
# io = PDB.PDBIO()
# struct = parser.get_structure("chr3", "chr3.pdb")
#
Expand Down
Loading