Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce front-end information and refactor abstract Assembly representation + minor fixes #8

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assemblyinfo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

_db: AssemblyInfo | None = None


def connect() -> AssemblyInfo:
global _db
if _db is None:
Expand Down
12 changes: 5 additions & 7 deletions assemblyinfo/build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,11 @@ def get_formatted_paths(paths: List[str]) -> List[Tuple]:
(
"_".join(x.split("_", 2)[:2]), # Accession
x.split("_", 2)[-1][:-1], # assembly complete
x.split("_", 2)[-1][
:-1
],
f"{NCBI}/{x.split('.', 1)[0].split('_')[0]}/" +
f"{x.split('.', 1)[0].split('_')[1][0:3]}/" +
f"{x.split('.', 1)[0].split('_')[1][3:6]}/" +
f"{x.split('.', 1)[0].split('_')[1][6:9]}/{x}",
x.split("_", 2)[-1][:-1],
f"{NCBI}/{x.split('.', 1)[0].split('_')[0]}/"
+ f"{x.split('.', 1)[0].split('_')[1][0:3]}/"
+ f"{x.split('.', 1)[0].split('_')[1][3:6]}/"
+ f"{x.split('.', 1)[0].split('_')[1][6:9]}/{x}",
)
for x in paths
]
Expand Down
4 changes: 0 additions & 4 deletions assemblyinfo/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from .acc import (
get_assembly_from_accession,
get_genbank_accession,
get_patch_from_accession,
get_refseq_accession,
)
from .assembly import (
Assembly,
Expand Down Expand Up @@ -43,8 +41,6 @@
"available_patches",
"available_species",
"available_accessions",
"get_genbank_accession",
"get_refseq_accession",
"get_patch_from_accession",
"get_assembly_from_accession",
"filter_chromosome_data",
Expand Down
92 changes: 14 additions & 78 deletions assemblyinfo/core/acc.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,11 @@
from typing import List

# Public names of this module: the accession lookup helpers exported by
# `from <module> import *`.
__all__ = [
"get_genbank_accession",
"get_refseq_accession",
"get_patch_from_accession",
"get_assembly_from_accession",
]


def get_genbank_accession(cls, patch: str) -> List[str]:
    """
    Return the GenBank accession(s) recorded for the specified patch.

    Parameters
    ----------
    cls
        Object exposing a ``_data`` table (used here as a pandas DataFrame)
        with ``patch`` and ``genbank_accession`` columns.
    patch : str
        The patch name to filter by.

    Returns
    -------
    List[str]
        The ``genbank_accession`` value of every row whose ``patch`` column
        equals ``patch``, in table order.

    Raises
    ------
    ValueError
        If the patch is empty/not provided, or if it does not occur in the
        ``patch`` column of the table.

    Examples
    --------
    >>> AssemblyInfo.get_genbank_accession("GRCh38.p14")  # doctest: +SKIP
    """
    if not patch:
        raise ValueError("ERROR: you must provide a patch!")

    # Compare the column directly instead of interpolating ``patch`` into a
    # ``DataFrame.query`` expression: a patch name containing a quote would
    # otherwise produce an invalid (or unintended) query string.
    matches = cls._data["patch"] == patch
    if not matches.any():
        raise ValueError("ERROR: patch not in database!")

    return cls._data.loc[matches, "genbank_accession"].tolist()


def get_refseq_accession(cls, patch: str) -> List[str]:
    """
    Return the RefSeq accession(s) recorded for the specified patch.

    Parameters
    ----------
    cls
        Object exposing a ``_data`` table (used here as a pandas DataFrame)
        with ``patch`` and ``refseq_accession`` columns.
    patch : str
        The patch name to filter by.

    Returns
    -------
    List[str]
        The ``refseq_accession`` value of every row whose ``patch`` column
        equals ``patch``, in table order.

    Raises
    ------
    ValueError
        If the patch is empty/not provided, or if it does not occur in the
        ``patch`` column of the table.

    Examples
    --------
    >>> AssemblyInfo.get_refseq_accession("GRCh38.p14")  # doctest: +SKIP
    """
    if not patch:
        raise ValueError("ERROR: you must provide a patch!")

    # Compare the column directly instead of interpolating ``patch`` into a
    # ``DataFrame.query`` expression: a patch name containing a quote would
    # otherwise produce an invalid (or unintended) query string.
    matches = cls._data["patch"] == patch
    if not matches.any():
        raise ValueError("ERROR: patch not in database!")

    return cls._data.loc[matches, "refseq_accession"].tolist()


def get_patch_from_accession(cls, accession: str) -> List[str]:
"""
Returns the patches for the specified accession.
Expand All @@ -96,15 +32,15 @@ def get_patch_from_accession(cls, accession: str) -> List[str]:
if not accession:
raise ValueError("ERROR: you must provide an accession!")
elif (
accession not in cls._data.genbank_accession.dropna().tolist()
and accession not in cls._data.refseq_accession.dropna().tolist()
accession not in cls._data.genbank.dropna().tolist()
and accession not in cls._data.refseq.dropna().tolist()
):
raise ValueError("ERROR: accession not in database!")

if accession in cls._data.genbank_accession.dropna().tolist():
return cls._data.query(f"genbank_accession=='{accession}'").patch.tolist()
elif accession in cls._data.refseq_accession.dropna().tolist():
return cls._data.query(f"refseq_accession=='{accession}'").patch.tolist()
if accession in cls._data.genbank.dropna().tolist():
return cls._data.query(f"genbank=='{accession}'").patch.tolist()
elif accession in cls._data.refseq.dropna().tolist():
return cls._data.query(f"refseq=='{accession}'").patch.tolist()
else:
raise ValueError("ERROR: accession not in database!")

Expand Down Expand Up @@ -135,23 +71,23 @@ def get_assembly_from_accession(cls, accession: str) -> List[str]:
if not accession:
raise ValueError("ERROR: you must provide an accession!")
elif (
accession not in cls._data.genbank_accession.dropna().tolist()
and accession not in cls._data.refseq_accession.dropna().tolist()
accession not in cls._data.genbank.dropna().tolist()
and accession not in cls._data.refseq.dropna().tolist()
):
raise ValueError("ERROR: accession not in database!")

if accession in cls._data.genbank_accession.dropna().tolist():
if accession in cls._data.genbank.dropna().tolist():
return (
cls._data.query(f"genbank_accession=='{accession}'")
cls._data.query(f"genbank=='{accession}'")
.reset_index()
.loc[0, ["assembly", "assembly_ucsc"]]
.loc[0, ["assembly", "ucsc_name"]]
.tolist()
)
elif accession in cls._data.refseq_accession.dropna().tolist():
elif accession in cls._data.refseq.dropna().tolist():
return (
cls._data.query(f"refseq_accession=='{accession}'")
cls._data.query(f"refseq=='{accession}'")
.reset_index()
.loc[0, ["assembly", "assembly_ucsc"]]
.loc[0, ["assembly", "ucsc_name"]]
.tolist()
)
else:
Expand Down
24 changes: 16 additions & 8 deletions assemblyinfo/core/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@ class Assembly:
A dataclass to store assembly information.
"""

assembly: str
name: str
species: str
common_name: str
seqinfo: pd.DataFrame
metadata: Dict[str, str]
aliases: Dict[str, Dict[str, str]]
genbank: str
refseq: str
patch: str

@property
def chromnames(self) -> List[str]:
Expand All @@ -35,16 +38,18 @@ def chromeq(self) -> Dict[str, Dict[str, str]]:
return pd.DataFrame(self.aliases).T

def __repr__(self):
return (f"Assembly(assembly={self.assembly}, "
f"species={self.species}, "
f"common_name={self.common_name})")
return (
f"Assembly(assembly={self.name}, "
f"species={self.species}, "
f"common_name={self.common_name})"
)


def assembly_info(
cls,
assembly: str,
provider: Optional[str] = None,
roles: Optional[List[str]] = None,
roles: Optional[List[str]] = ["assembled"],
units: Optional[List[str]] = None,
length: Optional[str] = None,
) -> Assembly:
Expand Down Expand Up @@ -82,7 +87,7 @@ def assembly_info(

seqinfo = filter_chromosome_data(
cls, assembly=assembly, roles=roles, units=units, length=length
)
).drop_duplicates(subset=["name"], keep="first")

aliases = (
seqinfo[["name", "ncbi", "genbank", "refseq"]]
Expand All @@ -93,10 +98,13 @@ def assembly_info(
metadata = get_assembly_metadata(cls, assembly=assembly)

return Assembly(
assembly=assembly,
name=assembly,
species=metadata["species"],
common_name=metadata["common_name"],
seqinfo=seqinfo.set_index(provider),
seqinfo=seqinfo.set_index(provider).dropna(axis=1, how="all"),
metadata=metadata,
aliases=aliases,
genbank=metadata["genbank"],
refseq=metadata["refseq"],
patch=metadata["patch"],
)
12 changes: 6 additions & 6 deletions assemblyinfo/core/chrom.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ def filter_chromosome_data(
"""
if assembly in cls._data["assembly"].tolist():
group = "assembly"
elif assembly in cls._data["assembly_ucsc"].dropna().tolist():
group = "assembly_ucsc"
elif assembly in cls._data["ucsc_name"].dropna().tolist():
group = "ucsc_name"
else:
raise ValueError(f"{assembly} not in database!")

q1 = f'{group} == "{assembly}" and version == "latest"'
q1 = f'{group} == "{assembly}" and version'
q2 = ""

if length:
Expand Down Expand Up @@ -261,16 +261,16 @@ def get_seqinfo(cls, assembly: str) -> pd.DataFrame:
"""
if assembly in cls._data["assembly"].tolist():
group = "assembly"
elif assembly in cls._data["assembly_ucsc"].dropna().tolist():
group = "assembly_ucsc"
elif assembly in cls._data["ucsc_name"].dropna().tolist():
group = "ucsc_name"
elif assembly in cls._data["patch"].dropna().tolist():
group = "patch"
else:
error_msg = (
f"{assembly} not in database!\n",
"Valid assemblies are:\n\n",
f"NCBI:\n{cls._data.assembly.unique().tolist()}\n\n",
f"UCSC:\n{cls._data.assembly_ucsc.dropna().unique().tolist()}\n\n",
f"UCSC:\n{cls._data.ucsc_name.dropna().unique().tolist()}\n\n",
f"Patch:\n{cls._data.patch.dropna().unique().tolist()}",
)
raise ValueError(error_msg)
Expand Down
Loading