From b32d72ad72bdfb5502fa9507d32fb2a4b012e258 Mon Sep 17 00:00:00 2001 From: jessicaw9910 Date: Wed, 16 Oct 2024 14:51:28 -0400 Subject: [PATCH] finalized alignment algorithm for b.l, multi-matching post-concatenation, and all other remaining partial alignments --- .../databases/klifs.py | 722 +++++------------- 1 file changed, 174 insertions(+), 548 deletions(-) diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py index 9c6cde5..b5a84bc 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py @@ -1,12 +1,12 @@ import logging import re -from dataclasses import dataclass +from dataclasses import dataclass, field import numpy as np from bravado.client import SwaggerClient from Bio import Align from missense_kinase_toolkit.databases.api_schema import SwaggerAPIClient -from missense_kinase_toolkit.databases.aligners import BL2UniProtAligner +from missense_kinase_toolkit.databases.aligners import BL2UniProtAligner, Kincore2UniProtAligner logger = logging.getLogger(__name__) @@ -132,348 +132,6 @@ boolean denoting if subsequent regions are contiguous, and colors.""" -def remove_gaps_from_klifs(klifs_string: str) -> str: - """Remove gaps from KLIFS pocket sequence. - - Parameters - ---------- - klifs_pocket : str - KLIFS pocket sequence; can be entire sequence or substring - - Returns - ------- - klifs_pocket_narm : str - KLIFS pocket sequence without gaps (i.e., "-" removed) - - """ - klifs_pocket_narm = "".join([i for i in klifs_string if i != "-"]) - return klifs_pocket_narm - - -def return_idx_of_substring_in_superstring( - superstring: str, substring: str -) -> list[int] | None: - """ - - Parameters - ---------- - superstring : str - String in which to find substring index - substring : str - String in which to find superstring index - - Returns - ------- - list_out : list[int] | None - Index where substring begins in superstring; None if substring not in superstring - - """ - list_out = [ - i for i in range(len(superstring)) if superstring.startswith(substring, i) - ] - return list_out - - -def align_klifs_pocket_to_uniprot_seq( - idx_start: int, - idx_end: int, - str_uniprot: str, - str_klifs: str, -) -> list[int] | None: - """Align KLIFS region to UniProt canonical Uniprot sequence. - - Parameters - ---------- - idx_start : int - Start index of KLIFS region - idx_end : int - End index of KLIFS region - str_uniprot : str - UniProt canonical sequence - str_klifs : str - KLIFS pocket sequence - - Returns - ------- - substring_klifs : str - Substring of KLIFS pocket that maps to indices for the region(s) provided - list_idx : list[int] | None - List of indices in UniProt sequence where KLIFS region starts - - """ - substring_klifs = str_klifs[idx_start:idx_end] - substring_klifs_narm = remove_gaps_from_klifs(substring_klifs) - if len(substring_klifs_narm) == 0: - list_idx = None - else: - list_idx = return_idx_of_substring_in_superstring( - str_uniprot, substring_klifs_narm - ) - return substring_klifs, list_idx - - -def find_start_or_end_idx_recursively( - idx_in: int, - list_idx: list[int], - list_substr: list[str], - idx_kd: tuple[int | None, int | None] = (None, None), - seq_uniprot: str | None = None, - bool_start: bool = True, -) -> int: - """Find the start or end indices in UniProt canonical sequence of flanking KLIFS regions recursively. - - Parameters - ---------- - idx_in : int - Index of KLIFS region (e.g., I is 0, g.l is 1, etc.) - list_idx : list[int] - list_substring_idxs - list_substr : list[str] - list_klifs_substr_actual - idx_kd : tuple[int | None, int | None] - Index of kinase domain in UniProt sequence (start, end); default is (None, None) - seq_uniprot : str | None - UniProt canonical sequence - bool_start : bool - If True, find start index (default); if False, find end index - """ - # if looking for preceding region, start at idx_in - 1 - if bool_start: - # if first region - if idx_in == 0: - # if KD start is None, return 0 - if idx_kd[0] is None: - return 0 - # if KD start is provided, return KD start - else: - return idx_kd[0] - try: - idx_out = list_idx[idx_in - 1][0] + \ - len(remove_gaps_from_klifs(list_substr[idx_in - 1])) - except IndexError or TypeError: - idx_out = find_start_or_end_idx_recursively( - idx_in - 1, - list_idx, - list_substr, - bool_start=True, - ) - # if looking for subsequent region, start at idx_in + 1 - else: - # if last region - if idx_in == len(DICT_POCKET_KLIFS_REGIONS) - 1: - # if KD end is None, return len(seq_uniprot) - 1 - if idx_kd[1] is None: - return len(seq_uniprot) - 1 - # if KD end is provided, return KD end - else: - return idx_kd[1] - try: - idx_out = int(list_idx[idx_in + 1][0]) - except IndexError or TypeError: - idx_out = find_start_or_end_idx_recursively( - idx_in + 1, - list_idx, - list_substr, - bool_start=False, - ) - - return idx_out - - -def select_correct_bl_alignment( - alignments: Align.PairwiseAlignments, -) -> list[int]: - """Select correct alignment for b.l region. - - Parameters - ---------- - alignments : Align.PairwiseAlignments - Pairwise alignments - - Returns - ------- - list[int] - List of indices for correct alignment - - """ - list_alignments = [re.findall(r"[A-Z]+", alignment[0, :]) \ - for alignment in alignments] - list_idx = [idx for idx, i in enumerate(list_alignments) \ - if len(i) == 2 and len(i[0]) == 2] - - if len(list_idx) > 1: - logging.error(f"{len(list_idx)} correct alignment found for b.l region\n{list_alignments}") - return None - else: - alignment = alignments[list_idx[0]] - # extract target (b.l) and query (UniProt) sequences - target = alignment.indices[0] - query = alignment.indices[1] - # where target is aligned, set to 1; where target is not aligned, set to np.nan - target[target >= 0] = 1 - target = np.where(target == -1, np.nan, target) - # keep only indices where target is aligned to query - output = target * query - output = output[~np.isnan(output)] - output = [int(i) for i in output.tolist()] - return output - -#TODO find_start_or_end_idx_recursively kwargs -def return_partial_alignments( - align_fn: Align.PairwiseAligner, - idx: int, - substring_idx_list: list[list[int] | None], - substring_actual_list: list[str], - uniprot_seq: str, - # kd_idx: tuple[int | None, int | None], -) -> tuple[int, int, Align.PairwiseAlignments]: - """Return partial alignments for b.l region. - - Parameters - ---------- - align_fn : Align.PairwiseAligner - Alignment function - idx : int - Index of region - substring_idx_list : list[list[int] | None] - List of indices in UniProt sequence where KLIFS region starts - - Returns - ------- - Align.PairwiseAlignments - Pairwise alignments - - """ - start_idx = find_start_or_end_idx_recursively( - idx, - substring_idx_list, - substring_actual_list, - bool_start=True - ) - end_idx = find_start_or_end_idx_recursively( - idx, - substring_idx_list, - substring_actual_list, - bool_start=False - ) - aligned = align_fn.align( - remove_gaps_from_klifs(substring_actual_list[idx]), - uniprot_seq[start_idx:end_idx] - ) - return start_idx, end_idx, aligned - - -def iterate_klifs_alignment( - string_uniprot: str, - string_klifs: str, -) -> dict[str, list[str | list[int] | None]]: - """Align KLIFS region to UniProt canonical Uniprot sequence. - - Parameters - ---------- - string_uniprot : str - UniProt canonical sequence - string_klifs : str - KLIFS pocket sequence - - Returns - ------- - dict_out : dict[str, list[str | list[int] | None]] - Dictionary with keys (match part of KLIFSPocket object): - list_klifs_region : list[str] - List of start and end regions of KLIFS pocket separated by ":"; end region will be the - same as start region if no concatenation necessary to find a single exact match - list_klifs_substr_actual : list[str] - List of substring of KLIFS pocket that maps to the *start region* of the KLIFS pocket - list_klifs_substr_match : list[str] - List of the actual substring used to match to the KLIFS pocket for the region(s) provided; - same as list_klifs_substr_actual if no concatenation necessary to find a single exact match - list_substring_idxs : list[list[int | None]] - List of indices in UniProt sequence where KLIFS region starts - - """ - list_klifs_region = [] - list_klifs_substr_actual = [] - list_klifs_substr_match = [] - list_substring_idxs = [] - - dict_klifs = DICT_POCKET_KLIFS_REGIONS - list_klifs = list(dict_klifs.keys()) - - for klifs_index, klifs_region in enumerate(list_klifs): - klifs_region_start, klifs_region_end = klifs_region, klifs_region - klifs_idx_start, klifs_idx_end = ( - dict_klifs[klifs_region_start]["start"] - 1, - dict_klifs[klifs_region_end]["end"], - ) - - str_klifs, list_substring_idx = align_klifs_pocket_to_uniprot_seq( - idx_start=klifs_idx_start, - idx_end=klifs_idx_end, - str_uniprot=string_uniprot, - str_klifs=string_klifs, - ) - list_klifs_substr_actual.append(str_klifs) - - # if None KLIFS all "-" so disregard; if multiple idxs returned, - # concatenate with contiguous regions to identify single match - if list_substring_idx is not None and len(list_substring_idx) > 1: - bool_cont = dict_klifs[klifs_region_start]["contiguous"] - # if contiguous with subsequent, concatenate with susbequent region - if bool_cont: - klifs_region_end = list_klifs[klifs_index + 1] - # if not contiguous with subsequent, concatenate with previous region - else: - klifs_region_start = list_klifs[klifs_index - 1] - # need for offset later - len_klifs = len(remove_gaps_from_klifs(str_klifs)) - klifs_idx_start, klifs_idx_end = ( - dict_klifs[klifs_region_start]["start"] - 1, - dict_klifs[klifs_region_end]["end"], - ) - str_klifs, list_substring_idx = align_klifs_pocket_to_uniprot_seq( - idx_start=klifs_idx_start, - idx_end=klifs_idx_end, - str_uniprot=string_uniprot, - str_klifs=string_klifs, - ) - # if concat with previous, offset by length of preceding KLIFS region with gaps removed - if ( - not bool_cont - and list_substring_idx is not None - and len(list_substring_idx) != 0 - ): - len_offset = len(remove_gaps_from_klifs(str_klifs)) - len_klifs - list_substring_idx = [i + len_offset for i in list_substring_idx] - - # b.l region alignment - idx_bl = [i for i, x in enumerate(list_klifs) if x == "b.l"][0] - start, end, bl_alignments = return_partial_alignments( - idx=idx_bl, - substring_idx_list=list_substring_idxs, - substring_actual_list=list_klifs_substr_actual, - uniprot_seq=string_uniprot, - align_fn=BL2UniProtAligner(), - ) - list_bl = select_correct_bl_alignment(bl_alignments) - list_bl = [i + start for i in list_bl] - - # final non-contiguous alignment algorithm - - list_klifs_region.append(klifs_region_start + ":" + klifs_region_end) - list_klifs_substr_match.append(str_klifs) - list_substring_idxs.append(list_substring_idx) - - dict_out = { - "list_klifs_region": list_klifs_region, - "list_klifs_substr_actual": list_klifs_substr_actual, - "list_klifs_substr_match": list_klifs_substr_match, - "list_substring_idxs": list_substring_idxs, - } - - return dict_out - - class KLIFS(SwaggerAPIClient): """Class to interact with the KLIFS API.""" @@ -646,9 +304,9 @@ class KLIFSPocket: list_klifs_region : list[str] List of start and end regions of KLIFS pocket separated by ":"; end region will be the same as start region if no concatenation necessary to find a single exact match - list_klifs_substr_actual : list[str] + list_klifs_substr_actual : list[str | None] List of substring of KLIFS pocket that maps to the *start region* of the KLIFS pocket - list_klifs_substr_match : list[str] + list_klifs_substr_match : list[str | None] List of the actual substring used to match to the KLIFS pocket for the region(s) provided; will be the same as list_klifs_substr_actual if no concatenation necessary to find a single exact match list_substring_idxs : list[list[int] | None] @@ -662,13 +320,15 @@ class KLIFSPocket: uniprotSeq: str klifsSeq: str idx_kd: tuple[int | None, int | None] - list_klifs_region: list[str] - list_klifs_substr_actual: list[str] - list_klifs_substr_match: list[str] - list_substring_idxs: list[list[int] | None] + list_klifs_region: list[str | None] = field(default_factory=list) + list_klifs_substr_actual: list[str | None] = field(default_factory=list) + list_klifs_substr_match: list[str | None] = field(default_factory=list) + list_substring_idxs: list[list[int | None] | None] = field(default_factory=list) + def __post_init__(self): - pass + self.iterate_klifs_alignment() + @staticmethod def remove_gaps_from_klifs(klifs_string: str) -> str: @@ -691,9 +351,10 @@ def remove_gaps_from_klifs(klifs_string: str) -> str: @staticmethod def return_idx_of_substring_in_superstring( - superstring: str, substring: str + superstring: str, + substring: str, ) -> list[int] | None: - """ + """Returns the index where substring begins in superstring (does not require -1 offset). Parameters ---------- @@ -715,8 +376,42 @@ def return_idx_of_substring_in_superstring( @staticmethod - def select_correct_bl_alignment( + def return_idx_of_alignment_match( + align: Align.PairwiseAlignments, + ) -> list[int]: + """Return indices of alignment match. + + Parameters + ---------- + align : Align.PairwiseAlignments + Pairwise alignments + + Returns + ------- + list[int] + List of indices for alignment match + + """ + # extract target (b.l) and query (UniProt) sequences + target = align.indices[0] + query = align.indices[1] + + # where target is aligned, set to 1; where target is not aligned, set to np.nan + target[target >= 0] = 1 + target = np.where(target == -1, np.nan, target) + + # keep only indices where target is aligned to query + output = target * query + output = output[~np.isnan(output)] + output = [int(i) for i in output.tolist()] + + return output + + + def select_correct_alignment( + self, alignments: Align.PairwiseAlignments, + bool_bl: bool = True, ) -> list[int]: """Select correct alignment for b.l region. @@ -724,6 +419,8 @@ def select_correct_bl_alignment( ---------- alignments : Align.PairwiseAlignments Pairwise alignments + bool_bl : bool + If True, select correct alignment for b.l region; if False, select correct alignment for linker region Returns ------- @@ -732,33 +429,53 @@ def select_correct_bl_alignment( """ list_alignments = [re.findall(r"[A-Z]+", alignment[0, :]) \ - for alignment in alignments] - list_idx = [idx for idx, i in enumerate(list_alignments) \ - if len(i) == 2 and len(i[0]) == 2] - + for alignment in alignments] + + if bool_bl: + # manual review showed 2 matches + gap + 5 matches + list_idx = [idx for idx, i in enumerate(list_alignments) \ + if len(i) == 2 and len(i[0]) == 2] + region = "b.l" + else: + # manual review showed 1 matches + gap + 3 matches + list_idx = [idx for idx, i in enumerate(list_alignments) \ + if len(i) == 2 and len(i[0]) == 1] + region = "linker" + if len(list_idx) > 1: - logging.error(f"{len(list_idx)} correct alignment found for b.l region\n{list_alignments}") + logging.error(f"{len(list_idx)} correct alignments found for {region} region\n{list_alignments}") return None + # BUB1B and PIK3R4 have "-" in b.l region so will not obey heuristic in list_idx + elif len(list_idx) == 0: + if len(alignments) == 1: + alignment = alignments[0] + else: + logging.error(f"{len(alignments)} non-heuristic alignments found for {region} region\n"\ + f"{[print(i) for i in alignments]}") + return None else: alignment = alignments[list_idx[0]] - # extract target (b.l) and query (UniProt) sequences - target = alignment.indices[0] - query = alignment.indices[1] - # where target is aligned, set to 1; where target is not aligned, set to np.nan - target[target >= 0] = 1 - target = np.where(target == -1, np.nan, target) - # keep only indices where target is aligned to query - output = target * query - output = output[~np.isnan(output)] - output = [int(i) for i in output.tolist()] - return output + + # # extract target (b.l) and query (UniProt) sequences + # target = alignment.indices[0] + # query = alignment.indices[1] + # # where target is aligned, set to 1; where target is not aligned, set to np.nan + # target[target >= 0] = 1 + # target = np.where(target == -1, np.nan, target) + # # keep only indices where target is aligned to query + # output = target * query + # output = output[~np.isnan(output)] + # output = [int(i) for i in output.tolist()] + # return output + + return self.return_idx_of_alignment_match(alignment) def align_klifs_pocket_to_uniprot_seq( self, idx_start: int, idx_end: int, - ) -> list[int] | None: + ) -> tuple[str, list[int] | None]: """Align KLIFS region to UniProt canonical Uniprot sequence. Parameters @@ -811,16 +528,12 @@ def find_start_or_end_idx_recursively( # if KD start is provided, return KD start else: return self.idx_kd[0] - try: - idx_out = self.list_substring_idxs[idx_in - 1][0] + \ - len(self.remove_gaps_from_klifs(self.list_klifs_substr_actual[idx_in - 1])) - except IndexError or TypeError: - idx_out = self.find_start_or_end_idx_recursively( - idx_in - 1, - self.list_substring_idxs, - self.list_klifs_substr_actual, - bool_start=True, - ) + idx_temp = self.list_substring_idxs[idx_in - 1] + str_temp = self.list_klifs_substr_actual[idx_in - 1] + if idx_temp is not None and len(idx_temp) == 1: + idx_out = idx_temp[0] + len(self.remove_gaps_from_klifs(str_temp)) + else: + idx_out = self.find_start_or_end_idx_recursively(idx_in - 1, bool_start=True) # if looking for subsequent region, start at idx_in + 1 else: # if last region @@ -831,15 +544,11 @@ def find_start_or_end_idx_recursively( # if KD end is provided, return KD end else: return self.idx_kd[1] - try: - idx_out = int(self.list_substring_idxs[idx_in + 1][0]) - except IndexError or TypeError: - idx_out = self.find_start_or_end_idx_recursively( - idx_in + 1, - self.list_substring_idxs, - self.list_klifs_substr_actual, - bool_start=False, - ) + idx_temp = self.list_substring_idxs[idx_in + 1] + if idx_temp is not None and len(idx_temp) == 1: + idx_out = idx_temp[0] + else: + idx_out = self.find_start_or_end_idx_recursively(idx_in + 1, bool_start=False) return idx_out @@ -847,76 +556,45 @@ def find_start_or_end_idx_recursively( #TODO find_start_or_end_idx_recursively kwargs def return_partial_alignments( self, - align_fn: Align.PairwiseAligner, idx: int, - ) -> tuple[int, int, Align.PairwiseAlignments]: + align_fn: Align.PairwiseAligner | None = None, + ) -> tuple[int, int, Align.PairwiseAlignments | list[int | None] | None]: """Return partial alignments for b.l region. Parameters ---------- - align_fn : Align.PairwiseAligner - Alignment function idx : int Index of region (e.g., I is 0, g.l is 1, etc.) + align_fn : Align.PairwiseAligner | None + Alignment function; if none provided will use exact match Returns ------- - Align.PairwiseAlignments - Pairwise alignments + tuple[int, int, Align.PairwiseAlignments | list[int | None] | None] + Start, end, and alignments (either indices or alignments or None) for region """ - start_idx = self.find_start_or_end_idx_recursively( - idx, - self.list_substring_idxs, - self.list_klifs_substr_actual, - bool_start=True - ) - end_idx = self.find_start_or_end_idx_recursively( - idx, - self.list_substring_idxs, - self.list_klifs_substr_actual, - bool_start=False - ) - aligned = align_fn.align( - self.remove_gaps_from_klifs(self.list_klifs_substr_actual[idx]), - self.uniprotSeq[start_idx:end_idx] - ) - return start_idx, end_idx, aligned + start_idx = self.find_start_or_end_idx_recursively(idx, bool_start=True) + end_idx = self.find_start_or_end_idx_recursively(idx, bool_start=False) + str_klifs = self.remove_gaps_from_klifs(self.list_klifs_substr_actual[idx]) + str_uniprot = self.uniprotSeq[start_idx:end_idx] - def iterate_klifs_alignment( - self, - ) -> dict[str, list[str | list[int] | None]]: - """Align KLIFS region to UniProt canonical Uniprot sequence. + if len(str_klifs) == 0: + return start_idx, end_idx, None + else: + if align_fn is not None: + aligned = align_fn.align(str_klifs, str_uniprot) + else: + aligned = self.return_idx_of_substring_in_superstring(str_uniprot, str_klifs) - Parameters - ---------- - string_uniprot : str - UniProt canonical sequence - string_klifs : str - KLIFS pocket sequence + return start_idx, end_idx, aligned - Returns - ------- - dict_out : dict[str, list[str | list[int] | None]] - Dictionary with keys (match part of KLIFSPocket object): - list_klifs_region : list[str] - List of start and end regions of KLIFS pocket separated by ":"; end region will be the - same as start region if no concatenation necessary to find a single exact match - list_klifs_substr_actual : list[str] - List of substring of KLIFS pocket that maps to the *start region* of the KLIFS pocket - list_klifs_substr_match : list[str] - List of the actual substring used to match to the KLIFS pocket for the region(s) provided; - same as list_klifs_substr_actual if no concatenation necessary to find a single exact match - list_substring_idxs : list[list[int | None]] - List of indices in UniProt sequence where KLIFS region starts - - """ - list_klifs_region = [] - list_klifs_substr_actual = [] - list_klifs_substr_match = [] - list_substring_idxs = [] + def iterate_klifs_alignment( + self, + ) -> None: + """Align KLIFS region to UniProt canonical Uniprot sequence.""" dict_klifs = DICT_POCKET_KLIFS_REGIONS list_klifs = list(dict_klifs.keys()) @@ -930,10 +608,8 @@ def iterate_klifs_alignment( str_klifs, list_substring_idx = self.align_klifs_pocket_to_uniprot_seq( idx_start=klifs_idx_start, idx_end=klifs_idx_end, - str_uniprot=self.uniprotSeq, - str_klifs=self.klifsSeq, ) - list_klifs_substr_actual.append(str_klifs) + self.list_klifs_substr_actual.append(str_klifs) # if None KLIFS all "-" so disregard; if multiple idxs returned, # concatenate with contiguous regions to identify single match @@ -946,16 +622,14 @@ def iterate_klifs_alignment( else: klifs_region_start = list_klifs[klifs_index - 1] # need for offset later - len_klifs = len(remove_gaps_from_klifs(str_klifs)) + len_klifs = len(self.remove_gaps_from_klifs(str_klifs)) klifs_idx_start, klifs_idx_end = ( dict_klifs[klifs_region_start]["start"] - 1, dict_klifs[klifs_region_end]["end"], ) - str_klifs, list_substring_idx = align_klifs_pocket_to_uniprot_seq( + str_klifs, list_substring_idx = self.align_klifs_pocket_to_uniprot_seq( idx_start=klifs_idx_start, idx_end=klifs_idx_end, - str_uniprot=self.uniprotSeq, - str_klifs=self.klifsSeq, ) # if concat with previous, offset by length of preceding KLIFS region with gaps removed if ( @@ -963,106 +637,58 @@ def iterate_klifs_alignment( and list_substring_idx is not None and len(list_substring_idx) != 0 ): - len_offset = len(remove_gaps_from_klifs(str_klifs)) - len_klifs + len_offset = len(self.remove_gaps_from_klifs(str_klifs)) - len_klifs list_substring_idx = [i + len_offset for i in list_substring_idx] + + self.list_klifs_region.append(klifs_region_start + ":" + klifs_region_end) + self.list_klifs_substr_match.append(str_klifs) + self.list_substring_idxs.append(list_substring_idx) - # b.l region alignment - idx_bl = [i for i, x in enumerate(list_klifs) if x == "b.l"][0] - start, end, bl_alignments = return_partial_alignments( + # post-hoc adjustments + + # b.l region non-contiguous alignment + idx_bl = [i for i, x in enumerate(list_klifs) if x == "b.l"][0] + # STK40 has no b.l region, so skip entirely + if self.list_substring_idxs[idx_bl] is None: + pass + else: + start, _, bl_alignments = self.return_partial_alignments( idx=idx_bl, - substring_idx_list=list_substring_idxs, - substring_actual_list=list_klifs_substr_actual, - uniprot_seq=self.uniprotSeq, align_fn=BL2UniProtAligner(), ) - list_bl = select_correct_bl_alignment(bl_alignments) - list_bl = [i + start for i in list_bl] - - # final non-contiguous alignment algorithm - - list_klifs_region.append(klifs_region_start + ":" + klifs_region_end) - list_klifs_substr_match.append(str_klifs) - list_substring_idxs.append(list_substring_idx) - - dict_out = { - "list_klifs_region": list_klifs_region, - "list_klifs_substr_actual": list_klifs_substr_actual, - "list_klifs_substr_match": list_klifs_substr_match, - "list_substring_idxs": list_substring_idxs, - } - - return dict_out - - # @staticmethod - # def remove_gaps_from_klifs(klifs_string: str) -> str: - # """Remove gaps from KLIFS pocket sequence. - - # Parameters - # ---------- - # klifs_pocket : str - # KLIFS pocket sequence; can be entire sequence or substring - - # Returns - # ------- - # klifs_pocket_narm : str - # KLIFS pocket sequence without gaps (i.e., "-" removed) - - # """ - # klifs_pocket_narm = "".join([i for i in klifs_string if i != "-"]) - # return klifs_pocket_narm - - # @staticmethod - # def return_idx_of_substring_in_superstring( - # superstring: str, substring: str - # ) -> list[int] | None: - # """ - - # Parameters - # ---------- - # superstring : str - # String in which to find substring index - # substring : str - # String in which to find superstring index - - # Returns - # ------- - # list_out : list[int] | None - # Index where substring begins in superstring; None if substring not in superstring - - # """ - # list_out = [ - # i for i in range(len(superstring)) if superstring.startswith(substring, i) - # ] - # return list_out - - # def align_klifs_pocket_to_uniprot_seq( - # self, - # idx_start: int, - # idx_end: int, - # ) -> list[int] | None: - # """Align KLIFS region to UniProt canonical Uniprot sequence. - - # Parameters - # ---------- - # idx_start : int - # Start index of KLIFS region - # idx_end : int - # End index of KLIFS region - - # Returns - # ------- - # substring_klifs : str - # Substring of KLIFS pocket that maps to indices for the region(s) provided - # list_idx : list[int] | None - # List of indices in UniProt sequence where KLIFS region starts - - # """ - # substring_klifs = self.klifsSeq[idx_start:idx_end] - # substring_klifs_narm = self.remove_gaps_from_klifs(substring_klifs) - # if len(substring_klifs_narm) == 0: - # list_idx = None - # else: - # list_idx = self.return_idx_of_substring_in_superstring( - # self.uniprotSeq, substring_klifs_narm - # ) - # return substring_klifs, list_idx + list_bl = self.select_correct_alignment(bl_alignments) + self.list_substring_idxs[idx_bl] = [i + start for i in list_bl] + + # interpolate multi-matching using previous and subsequent regions + for idx, substr_idx in enumerate(self.list_substring_idxs): + if idx != idx_bl and substr_idx is not None and len(substr_idx) > 1: + start = self.find_start_or_end_idx_recursively(idx, bool_start=True) + end = self.find_start_or_end_idx_recursively(idx, bool_start=False) + self.list_substring_idxs[idx] = [i for i in substr_idx if i >= start and i <= end] + + # TODO: final partial alignment algorithm + for idx, substr_idx in enumerate(self.list_substring_idxs): + if substr_idx == []: + # check exact match + start_exact, _, align_exact = self.return_partial_alignments(idx=idx) + if align_exact != [] and len(align_exact) == 1: + self.list_substring_idxs[idx] = [i + start_exact for i in align_exact] + # if no exact match, try local alignment + else: + start_local, _, align_local = self.return_partial_alignments( + idx=idx, + align_fn=Kincore2UniProtAligner(), + ) + if len(align_local) == 1 and \ + align_local[0].target == self.remove_gaps_from_klifs(align_local[0][0, :]): + list_local = self.return_idx_of_alignment_match(align_local[0]) + self.list_substring_idxs[idx] = [i + start_local for i in list_local] + # if no exact match, try global alignment + else: + start_global, _, align_global = self.return_partial_alignments( + idx=idx, + align_fn=BL2UniProtAligner(), + ) + # all that remains is linker region where some gaps (1 + 3) occur + list_global = self.select_correct_alignment(align_global, bool_bl=False) + self.list_substring_idxs[idx] = [i + start_global for i in list_global]