Skip to content

Commit

Permalink
Merge pull request #492 from DeepRank/480_new
Browse files Browse the repository at this point in the history
refactor: `Query` classes and related code
  • Loading branch information
DaniBodor authored Nov 7, 2023
2 parents fe86f62 + c2151e7 commit 97db708
Show file tree
Hide file tree
Showing 20 changed files with 796 additions and 1,186 deletions.
35 changes: 17 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ DeepRank2 extensive documentation can be found [here](https://deeprank2.rtfd.io/

## Installation

The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows.
The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows.

### Dependencies

Expand All @@ -65,9 +65,9 @@ Before installing deeprank2 you need to install some dependencies. We advise to
* [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/)
* Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4:
* on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`.
* on older versions of ubuntu, on mac, or when lacking sudo privileges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread.
* on older versions of ubuntu, on mac, or when lacking sudo privileges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread.
* [GCC](https://gcc.gnu.org/install/)
* Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`.
* Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`.
* For MacOS users with an M1 chip only: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html).

### Deeprank2 Package
Expand Down Expand Up @@ -105,25 +105,24 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/).

### Data generation

For each protein-protein complex (or protein structure containing a SRV), a query can be created and added to the `QueryCollection` object, to be processed later on. Different types of queries exist:
- In a `ProteinProteinInterfaceResidueQuery` and `SingleResidueVariantResidueQuery`, each node represents one amino acid residue.
- In a `ProteinProteinInterfaceAtomicQuery` and `SingleResidueVariantAtomicQuery`, each node represents one atom within the amino acid residues.
For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`.

A query takes as inputs:
- a `.pdb` file, representing the protein-protein structure
A `Query` takes as inputs:
- a `.pdb` file, representing the protein-protein structure,
- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom,
- the ids of the chains composing the structure, and
- optionally, the corresponding position-specific scoring matrices (PSSMs), in the form of `.pssm` files.

```python
from deeprank2.query import QueryCollection, ProteinProteinInterfaceResidueQuery
from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery

queries = QueryCollection()

# Append data points
queries.add(ProteinProteinInterfaceResidueQuery(
queries.add(ProteinProteinInterfaceQuery(
pdb_path = "tests/data/pdb/1ATN/1ATN_1w.pdb",
chain_id1 = "A",
chain_id2 = "B",
resolution = "residue",
chain_ids = ["A", "B"],
targets = {
"binary": 0
},
Expand All @@ -132,10 +131,10 @@ queries.add(ProteinProteinInterfaceResidueQuery(
"B": "tests/data/pssm/1ATN/1ATN.B.pdb.pssm"
}
))
queries.add(ProteinProteinInterfaceResidueQuery(
queries.add(ProteinProteinInterfaceQuery(
pdb_path = "tests/data/pdb/1ATN/1ATN_2w.pdb",
chain_id1 = "A",
chain_id2 = "B",
resolution = "residue",
chain_ids = ["A", "B"],
targets = {
"binary": 1
},
Expand All @@ -144,10 +143,10 @@ queries.add(ProteinProteinInterfaceResidueQuery(
"B": "tests/data/pssm/1ATN/1ATN.B.pdb.pssm"
}
))
queries.add(ProteinProteinInterfaceResidueQuery(
queries.add(ProteinProteinInterfaceQuery(
pdb_path = "tests/data/pdb/1ATN/1ATN_3w.pdb",
chain_id1 = "A",
chain_id2 = "B",
resolution = "residue",
chain_ids = ["A", "B"],
targets = {
"binary": 0
},
Expand Down
8 changes: 4 additions & 4 deletions deeprank2/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,14 +346,14 @@ def save_hist( # pylint: disable=too-many-arguments, too-many-branches, useless-

for row, feat in enumerate(features_df):
if isinstance(self.df[feat].values[0], np.ndarray):
if(log):
if log:
log_data = np.log(np.concatenate(self.df[feat].values))
log_data[log_data == -np.inf] = 0
axs[row].hist(log_data, bins=bins)
else:
axs[row].hist(np.concatenate(self.df[feat].values), bins=bins)
else:
if(log):
if log:
log_data = np.log(self.df[feat].values)
log_data[log_data == -np.inf] = 0
axs[row].hist(log_data, bins=bins)
Expand All @@ -366,14 +366,14 @@ def save_hist( # pylint: disable=too-many-arguments, too-many-branches, useless-
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
if isinstance(self.df[features_df[0]].values[0], np.ndarray):
if(log):
if log:
log_data = np.log(np.concatenate(self.df[features_df[0]].values))
log_data[log_data == -np.inf] = 0
ax.hist(log_data, bins=bins)
else:
ax.hist(np.concatenate(self.df[features_df[0]].values), bins=bins)
else:
if(log):
if log:
log_data = np.log(self.df[features_df[0]].values)
log_data[log_data == -np.inf] = 0
ax.hist(log_data, bins=bins)
Expand Down
6 changes: 2 additions & 4 deletions deeprank2/domain/aminoacidlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,15 @@
]

def convert_aa_nomenclature(aa: str, output_type: Optional[int] = None):

# pylint: disable = raise-missing-from
try:
if len(aa) == 1:
aa: AminoAcid = [entry for entry in amino_acids if entry.one_letter_code.lower() == aa.lower()][0]
elif len(aa) == 3:
aa: AminoAcid = [entry for entry in amino_acids if entry.three_letter_code.lower() == aa.lower()][0]
else:
aa: AminoAcid = [entry for entry in amino_acids if entry.name.lower() == aa.lower()][0]
except IndexError:
raise ValueError(f'{aa} is not a valid amino acid.')
except IndexError as e:
raise ValueError(f'{aa} is not a valid amino acid.') from e

if not output_type:
return aa.name
Expand Down
43 changes: 21 additions & 22 deletions deeprank2/features/exposure.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,25 +52,24 @@ def add_features( # pylint: disable=unused-argument
signal.alarm(0)
except TimeoutError as e:
raise TimeoutError('Bio.PDB.ResidueDepth.get_surface timed out.') from e
else:
hse = HSExposureCA(bio_model)

# These can only be calculated per residue, not per atom.
# So for atomic graphs, every atom gets its residue's value.
for node in graph.nodes:
if isinstance(node.id, Residue):
residue = node.id
elif isinstance(node.id, Atom):
atom = node.id
residue = atom.residue
else:
raise TypeError(f"Unexpected node type: {type(node.id)}")

bio_residue = bio_model[residue.chain.id][residue.number]
node.features[Nfeat.RESDEPTH] = residue_depth(bio_residue, surface)
hse_key = (residue.chain.id, (" ", residue.number, space_if_none(residue.insertion_code)))

if hse_key in hse:
node.features[Nfeat.HSE] = np.array(hse[hse_key], dtype=np.float64)
else:
node.features[Nfeat.HSE] = np.array((0, 0, 0), dtype=np.float64)

# These can only be calculated per residue, not per atom.
# So for atomic graphs, every atom gets its residue's value.
hse = HSExposureCA(bio_model)
for node in graph.nodes:
if isinstance(node.id, Residue):
residue = node.id
elif isinstance(node.id, Atom):
atom = node.id
residue = atom.residue
else:
raise TypeError(f"Unexpected node type: {type(node.id)}")

bio_residue = bio_model[residue.chain.id][residue.number]
node.features[Nfeat.RESDEPTH] = residue_depth(bio_residue, surface)
hse_key = (residue.chain.id, (" ", residue.number, space_if_none(residue.insertion_code)))

if hse_key in hse:
node.features[Nfeat.HSE] = np.array(hse[hse_key], dtype=np.float64)
else:
node.features[Nfeat.HSE] = np.array((0, 0, 0), dtype=np.float64)
Loading

0 comments on commit 97db708

Please sign in to comment.