Skip to content

Commit

Permalink
update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
monicadragan2 committed Nov 2, 2024
1 parent 9c09d26 commit c14d7ae
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 11 deletions.
62 changes: 53 additions & 9 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
import sys
from scipy.spatial import distance
from scipy.cluster import hierarchy
from anytree import RenderTree, PreOrderIter
from anytree.importer import JsonImporter

Expand All @@ -19,6 +20,8 @@
from utils_conipher import getConipherTrees
from utils_umap import computeUMAP
from utils_oncotreevis import createOncotreeVISInput
from utils_clustering import getSimilarityClusters
from utils_clustering import jaccard_distance

parser = argparse.ArgumentParser()
parser.add_argument("cancer_type", choices=["tupro_melanoma", "scatrex_melanoma", "tupro_melanoma_side_by_side",
Expand Down Expand Up @@ -116,19 +119,60 @@ def isNaN(num):

elif _CANCER_TYPE == "tupro_aml":
clusters = [["DOROBOF", "DYBEKIM"], ["DOBIFIK", "DYBAHAK", "DYBIDYF"], ["DADEDEM", "DEBEDIG"], ["DABIJUH", "UBADAFA"], ["UTAPYSO", "DUBEJIH"], ["DEJAFAB", "DOROFEG", "DUBIBEP", "DYVUHYB", "DEJIBEB", "DIBAHUC", "DEBEGUC", "DOBAFAM", "DYWYJUB"], ["UGABOLU"]]
neutral_clones_aml = ['DOROBOF_3','DOROFEG_0', 'DOROFEG_1','DOMIBEG_0', 'DOMIBEG_1', 'DOMIBEG_2','DOPIBOJ_0','DADEDEM_0','DEJAFAB_0','DEJIBEB_0','DYBEKIM_0', 'DYBEKIM_1','DYVUHYB_0','DYWYJUB_0','DOBIFIK_1','DYBAHAK_0','DYBIDYF_0','DEBEGUC_0', 'DEBEGUC_1', 'DEBEGUC_2','DIBAHUC_0','DUBIBEP_0','DOBAFAM_0','DIBADAG_0','DOBEKUF_0', 'DOBEKUF_1','DUBEJIH_0','UGABOLU_0', 'UGABOLU_1','DABIJUH_0','UTAPYSO_3','DEBEDIG_0']

anytrees = read_json_trees("data/tupro/samples_aml_v1.15_prioriry_genes.js")

# Populate `matching_labels`.
def gene_state_to_string(gene, cn, neutral_state):
    """Map a gene copy-number event to a display label.

    Gains keep the bare gene name; a loss of exactly the neutral copy
    number is tagged "_lost", any other deletion is tagged "_del".
    Anything after the first '.' in the gene name is dropped.
    """
    base_name = gene.split(".")[0]
    if cn > 0:
        return base_name
    return base_name + ("_lost" if cn == -neutral_state else "_del")

node_gene_map = {}
for sample_name in anytrees:
neutral_state = 2
anytrees[sample_name] = importer.import_(json.dumps(anytrees[sample_name]["event_tree"]))
for node in PreOrderIter(anytrees[sample_name]):
if hasattr(node,"node_label") and sample_name + "_" + str(node.node_label) in neutral_clones_aml:
node.is_neutral = True

del anytrees["DIBADAG"]
del anytrees["URAMOSE"]
del anytrees["DOMIBEG"]
del anytrees["DOBEKUF"]
del anytrees["DOPIBOJ"]
if hasattr(node,"gene_cn_events") and not node.is_neutral:
node_key = "_".join([sample_name, str(node.node_id)])
gene_set = set()
for gene in node.gene_cn_events:
gene_set.add(gene_state_to_string(gene, node.gene_cn_events[gene], neutral_state))
node_gene_map[node_key] = gene_set

nodes = list(node_gene_map.keys())
df_jaccard_distances = pd.DataFrame(0, columns=nodes, index=nodes).astype(float)
for node_1 in nodes:
for node_2 in nodes:
if node_1 < node_2:
dist = jaccard_distance(node_gene_map[node_1], node_gene_map[node_2])
df_jaccard_distances[node_1][node_2] = dist
df_jaccard_distances[node_2][node_1] = dist

# Cluster the clones.
clustering = hierarchy.linkage(
distance.pdist(df_jaccard_distances), metric="euclidean", method="ward")
clone_clusters = getSimilarityClusters(clustering, nodes, df_jaccard_distances, distance_threshold=0.5)
print(clone_clusters)

clone_labels = {}
clone_cluster_id = 2 # start the labeling of the malignant clones from 2 (0 is reserved for the root and 1 for the neutral clones).
for cluster in clone_clusters:
for clone in cluster:
clone_labels[clone] = clone_cluster_id
clone_cluster_id += 1

for sample_name, tree in anytrees.items():
for node in PreOrderIter(tree):
node_key = "_".join([sample_name, str(node.node_id)])
if node.parent and node_key in clone_labels:
if not hasattr(node, "is_neutral") or (hasattr(node, "is_neutral") and not node.is_neutral):
node.matching_label = clone_labels[node_key]

elif _CANCER_TYPE == "tupro_ovarian":
anytrees = read_json_trees("data/tupro_ovarian/samples_ovarian_v1.15_priority_genes_26032024.js")
Expand Down
1 change: 1 addition & 0 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ def create_dir(dir_name):
os.makedirs(dir_name)



102 changes: 102 additions & 0 deletions scripts/utils_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import numpy as np
from anytree import Node, PreOrderIter

def jaccard_distance(set_1, set_2, is_malignant=True):
    """Jaccard distance (1 - intersection-over-union) between two gene sets.

    By convention an empty-vs-empty comparison has distance 0, except for
    malignant nodes, where we report the maximal distance 1 instead so
    that event-free malignant clones are not considered identical.
    """
    if len(set_1) == 0 and len(set_2) == 0 and is_malignant:
        return 1
    union_size = len(set_1.union(set_2))
    if union_size == 0:
        # No genes affected whatsoever (neutral clone / event node):
        # similarity one between neutral clones / event nodes.
        return 0
    shared = len(set_1.intersection(set_2))
    return 1 - shared / union_size


def get_label_set(node_map, labels, value):
    """Resolve a linkage node index to the list of labels it covers."""
    if value < len(labels):
        # `value` indexes an original observation: a single label.
        return [labels[int(value)]]
    # Otherwise it names an internal cluster node: gather its leaf labels.
    return get_leaves(node_map[value], labels)


def get_leaves(root, labels):
    """Return the original labels of all leaf nodes under `root`, in pre-order.

    Leaf nodes are identified by `node.name < len(labels)` (linkage gives
    internal merge nodes ids >= the number of observations).
    """
    limit = len(labels)
    return [labels[n.name] for n in PreOrderIter(root) if n.name < limit]


# Get max over all elements except the diagonal.
def get_max_pairwise_distance(df_distances, label_set):
    """Largest off-diagonal pairwise distance among `label_set` rows/columns."""
    sub = df_distances.loc[label_set, label_set]
    # Blank out the diagonal (self-distances) so it cannot win the max.
    off_diagonal = sub.where(~np.eye(len(sub.index), dtype=bool))
    return off_diagonal.max().max()


def merge_children(node, threshold):
    """Decide whether `node` is the top of a cluster to be flattened.

    True when this node's max-pairwise-distance weight is within the
    threshold while its parent's is not (i.e. `node` is a maximal
    mergeable subtree). Leaves (no `weight` attribute) merge alone only
    when their parent is itself above the threshold.
    """
    if not hasattr(node, "weight"):
        # Leaf node: cluster it on its own iff its parent cannot be merged.
        return node.parent.weight > threshold
    if not node.parent:
        # Root within threshold: merge everything below it.
        return node.weight <= threshold
    return node.weight <= threshold and node.parent.weight > threshold


## Get the clusters from hierarchical clustering based on pairwise similarity.
def getSimilarityClusters(hierarchy_linkage, labels, df_distances, distance_threshold=1):
    """Cut a hierarchical clustering into flat clusters of similar items.

    Rebuilds the merge tree described by `hierarchy_linkage` (a scipy
    linkage matrix over `labels`) as an anytree structure whose internal
    nodes are weighted by the maximum pairwise distance among the labels
    they cover, then returns every maximal subtree whose weight is within
    `distance_threshold` as one cluster.

    Args:
      hierarchy_linkage: (n-1) x 4 scipy `hierarchy.linkage` matrix.
      labels: list of the n original item labels, in the order used to
        build the linkage.
      df_distances: square pandas DataFrame of pairwise distances,
        indexed by `labels` on both axes.
      distance_threshold: maximum pairwise distance allowed inside a
        returned cluster.

    Returns:
      A list of clusters, each a list of labels from `labels`.
    """
    num_labels = len(labels)
    #assert num_labels - 1 == len(hierarchy_linkage)

    # Maps linkage node id (>= num_labels, i.e. internal merges) to its Node.
    node_map = {}
    root = None

    for i, pair in enumerate(hierarchy_linkage):

        # Create tree with distance edge weights.
        distance = -1

        if pair[0] < num_labels and pair[1] < num_labels:
            # Both children are original observations: use their direct distance.
            distance = df_distances[labels[int(pair[0])]].loc[labels[int(pair[1])]]
        else:
            # At least one child is an internal cluster: weight this merge by
            # the maximum pairwise distance over all labels it covers.
            label_set_0 = get_label_set(node_map, labels, pair[0])
            label_set_1 = get_label_set(node_map, labels, pair[1])
            distance = get_max_pairwise_distance(df_distances, label_set_0 + label_set_1)

        parent = Node(i + num_labels, weight = distance)
        node_map[i + num_labels] = parent

        # Attach each child: leaves become fresh Nodes carrying their label;
        # internal nodes are re-parented from `node_map`.
        if pair[0] < num_labels:
            node_0 = Node(int(pair[0]), parent=parent, sample_name=labels[int(pair[0])])
        else:
            node_map[pair[0]].parent = parent

        if pair[1] < num_labels:
            node_1 = Node(int(pair[1]), parent=parent, sample_name=labels[int(pair[1])])
        else:
            node_map[pair[1]].parent = parent

        # The last merge in the linkage matrix is the root of the whole tree.
        if i == num_labels - 2:
            root = parent

    clusters = []
    for node in PreOrderIter(root):
        # if root and weight below threshold, merge everything.
        if not node.parent and node.weight <= distance_threshold:
            return [get_leaves(node, labels)]
        if node.parent: # if not root
            if merge_children(node, distance_threshold):
                cluster = get_leaves(node, labels)
                if len(cluster) != 1:
                    # Sanity check: every multi-item cluster must respect the threshold.
                    assert get_max_pairwise_distance(df_distances, cluster) <= distance_threshold
                clusters.append(cluster)

    return clusters
2 changes: 0 additions & 2 deletions scripts/utils_oncotreevis.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
from anytree.exporter import JsonExporter
from anytree import PreOrderIter, RenderTree
#from utils_anytree import *
import sys

def createOncotreeVISInput(anytrees, metadata, dataset_name):
Expand All @@ -23,7 +22,6 @@ def createOncotreeVISInput(anytrees, metadata, dataset_name):
gene_events[gene] = {}
gene_events[gene]["CNA"] = node.gene_cn_events[gene]
del node.gene_cn_events
node.matching_label = 0

elif "morita" in dataset_name:
gene = node.label[0]
Expand Down

0 comments on commit c14d7ae

Please sign in to comment.