From 4df62d09536ea5c9214e296a1d542115cac69b30 Mon Sep 17 00:00:00 2001 From: Brent Westbrook Date: Fri, 8 Nov 2024 15:07:21 -0500 Subject: [PATCH 1/5] test that the number of molecules is the same as the input --- yammbs/_tests/unit_tests/test_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yammbs/_tests/unit_tests/test_store.py b/yammbs/_tests/unit_tests/test_store.py index 8305416..5536cee 100644 --- a/yammbs/_tests/unit_tests/test_store.py +++ b/yammbs/_tests/unit_tests/test_store.py @@ -57,6 +57,8 @@ def test_from_qcarchive_dataset(small_qcsubmit_collection): # Ensure a new object can be created from the same database assert len(MoleculeStore(db)) == len(store) + assert len(store.get_smiles()) == small_qcsubmit_collection.n_molecules + def test_from_qcarchive_dataset_undefined_stereo(): """Test loading from YAMMBS's QCArchive model with undefined stereochemistry""" From 26d721338b81437f75fc0c8207dc4e68146cb9f1 Mon Sep 17 00:00:00 2001 From: Brent Westbrook Date: Fri, 8 Nov 2024 15:13:02 -0500 Subject: [PATCH 2/5] follow `from_cached_result_collection` instead of `from_qcsubmit` --- yammbs/_store.py | 66 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/yammbs/_store.py b/yammbs/_store.py index e29bd56..99ba1ea 100644 --- a/yammbs/_store.py +++ b/yammbs/_store.py @@ -533,7 +533,7 @@ def from_qcarchive_dataset( """ Create a new MoleculeStore databset from YAMMBS's QCArchiveDataset model. - Largely adopted from `from_qcsubmit_collection`. + Largely adopted from `from_cached_result_collection`. """ from tqdm import tqdm @@ -542,24 +542,60 @@ def from_qcarchive_dataset( store = cls(database_name) - for qm_molecule in tqdm(dataset.qm_molecules, desc="Storing molecules"): - molecule = Molecule.from_mapped_smiles(qm_molecule.mapped_smiles, allow_undefined_stereo=True) - molecule.add_conformer(Quantity(qm_molecule.coordinates, "angstrom")) + # adapted from MoleculeRecord.from_molecule, MoleculeStore.store, and + # DBSessionManager.store_molecule_record + with store._get_session() as db: + # instead of DBSessionManager._smiles_already_exists + seen = set(db.db.query(DBMoleculeRecord.mapped_smiles)) + for qm_molecule in tqdm(dataset.qm_molecules, desc="Storing molecules"): + if qm_molecule.mapped_smiles in seen: + continue + seen.add(qm_molecule.mapped_smiles) + molecule = Molecule.from_mapped_smiles(qm_molecule.mapped_smiles, allow_undefined_stereo=True) + db_record = DBMoleculeRecord( + mapped_smiles=qm_molecule.mapped_smiles, + inchi_key=molecule.to_inchi(fixed_hydrogens=True), + ) + db.db.add(db_record) + db.db.commit() - molecule_record = MoleculeRecord.from_molecule(molecule) - store.store(molecule_record) + # close the session here and re-open to make sure all of the molecule + # IDs have been flushed to the db - store.store_qcarchive( - QMConformerRecord( - molecule_id=store.get_molecule_id_by_smiles( - molecule_record.mapped_smiles, - ), - qcarchive_id=qm_molecule.qcarchive_id, - mapped_smiles=qm_molecule.mapped_smiles, - coordinates=qm_molecule.coordinates, - energy=qm_molecule.final_energy, + # adapted from MoleculeStore.store_qcarchive, + # QMConformerRecord.from_qcarchive_record, and + # DBSessionManager.store_qm_conformer_record + with store._get_session() as db: + seen = set( + db.db.query( + DBQMConformerRecord.qcarchive_id, ), ) + # reversed so the first record encountered wins out. this matches + # the behavior of the version that queries the db each time + smiles_to_id = { + smi: id + for id, smi in reversed( + db.db.query( + DBMoleculeRecord.id, + DBMoleculeRecord.mapped_smiles, + ).all(), + ) + } + for record in tqdm(dataset.qm_molecules, desc="Storing Records"): + if record.qcarchive_id in seen: + continue + seen.add(record.qcarchive_id) + mol_id = smiles_to_id[record.mapped_smiles] + db.db.add( + DBQMConformerRecord( + parent_id=mol_id, + qcarchive_id=record.qcarchive_id, + mapped_smiles=record.mapped_smiles, + coordinates=record.coordinates, + energy=record.final_energy, + ), + ) return store From 79a7c26dd0da090df6e48309dcff7f92c5aa3b83 Mon Sep 17 00:00:00 2001 From: Brent Westbrook Date: Fri, 8 Nov 2024 16:19:49 -0500 Subject: [PATCH 3/5] run pre-commit --- yammbs/_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yammbs/_store.py b/yammbs/_store.py index 99ba1ea..1e4cbe0 100644 --- a/yammbs/_store.py +++ b/yammbs/_store.py @@ -8,7 +8,7 @@ import pandas from numpy.typing import NDArray from openff.qcsubmit.results import OptimizationResultCollection -from openff.toolkit import Molecule, Quantity +from openff.toolkit import Molecule from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from typing_extensions import Self From 7f8bc886ef36f57d80b522f06892d8d312b87a94 Mon Sep 17 00:00:00 2001 From: Brent Westbrook Date: Fri, 8 Nov 2024 17:25:19 -0500 Subject: [PATCH 4/5] delete unused `seen` check on `qcarchive_id` --- yammbs/_store.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/yammbs/_store.py b/yammbs/_store.py index 1e4cbe0..6361b7d 100644 --- a/yammbs/_store.py +++ b/yammbs/_store.py @@ -566,11 +566,6 @@ def from_qcarchive_dataset( # QMConformerRecord.from_qcarchive_record, and # DBSessionManager.store_qm_conformer_record with store._get_session() as db: - seen = set( - db.db.query( - DBQMConformerRecord.qcarchive_id, - ), - ) # reversed so the first record encountered wins out. this matches # the behavior of the version that queries the db each time smiles_to_id = { @@ -583,9 +578,6 @@ def from_qcarchive_dataset( ) } for record in tqdm(dataset.qm_molecules, desc="Storing Records"): - if record.qcarchive_id in seen: - continue - seen.add(record.qcarchive_id) mol_id = smiles_to_id[record.mapped_smiles] db.db.add( DBQMConformerRecord( From 000d91c9eafa09f6068bf307535419e889a3192d Mon Sep 17 00:00:00 2001 From: Brent Westbrook Date: Fri, 15 Nov 2024 16:04:54 -0500 Subject: [PATCH 5/5] delete unnecessary `db.commit()` call --- yammbs/_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yammbs/_store.py b/yammbs/_store.py index 6361b7d..53c07ed 100644 --- a/yammbs/_store.py +++ b/yammbs/_store.py @@ -557,7 +557,6 @@ def from_qcarchive_dataset( inchi_key=molecule.to_inchi(fixed_hydrogens=True), ) db.db.add(db_record) - db.db.commit() # close the session here and re-open to make sure all of the molecule # IDs have been flushed to the db