Low mem cache (#163)
* Lower memory cache generation
* Update README
k1o0 authored Nov 12, 2024
1 parent ae7a7cc commit 24f66d9
Showing 4 changed files with 54 additions and 34 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -13,13 +13,14 @@ Also adds a new ALFPath class to replace alf path functions.
- paths returned by One methods and functions in one.alf.io are now ALFPath instances
- bugfix: one.alf.path.full_path_parts didn't always raise when invalid path passed
- one.alf.path module containing ALFPath class
- one.alf.exceptions.InvalidALF exception
- ALF cache table generation has lower memory footprint

### Added

- one.alf.cache.remove_cache_table_files and One.\_remove_cache_table_files for deleting cache table files
- one.alf.cache.EMPTY_DATASETS_FRAME and EMPTY_SESSION_FRAME vars for table column, index, and dtype template
- pyproject.toml replaces deprecated setup file
- one.alf.exceptions.InvalidALF exception

### Removed

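A side note on the `one.alf.cache.EMPTY_DATASETS_FRAME` template listed under Added above: it acts as a column, index and dtype scaffold for new cache tables, as the pre-change line `df = EMPTY_DATASETS_FRAME.copy()` in one/alf/cache.py below suggests. A minimal sketch, assuming only that the constant is an empty pandas DataFrame:

```python
from one.alf.cache import EMPTY_DATASETS_FRAME

# Copy the template so a new table starts with the expected columns, index and dtypes.
df = EMPTY_DATASETS_FRAME.copy()
print(df.columns.tolist())
print(df.dtypes.to_dict())
```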
4 changes: 2 additions & 2 deletions README.md
@@ -6,10 +6,10 @@ The Open Neurophysiology Environment is a scheme for sharing neurophysiology dat

Please [Click here](https://int-brain-lab.github.io/ONE/) for the main documentation page. For a quick primer on the file naming convention we use, [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf).

**NB**: The API and backend database are still under active development; for the best experience please regularly update the package by running `pip install -U ONE-api`.

## Requirements
ONE runs on Python 3.8 or later, and is tested on the latest Ubuntu and Windows (3.8 and 3.11 only).
ONE runs on Python 3.10 or later, and is tested on the latest Ubuntu and Windows (3.10 and 3.12 only).

## Installing
Installing the package via pip typically takes a few seconds. To install, run
77 changes: 48 additions & 29 deletions one/alf/cache.py
@@ -86,14 +86,32 @@ def _ses_str_id(session_path):


def _get_session_info(rel_ses_path):
"""Parse a relative session path."""
out = session_path_parts(rel_ses_path, as_dict=True, assert_valid=True)
out['id'] = _ses_str_id(rel_ses_path)
out['date'] = pd.to_datetime(out['date']).date()
out['number'] = int(out['number'])
out['task_protocol'] = ''
out['projects'] = ''
return out
"""Parse a relative session path.
Parameters
----------
rel_ses_path : _type_
_description_
Returns
-------
str
Experiment ID expressed as a relative session posix path.
str
The lab name (empty str).
datetime.date
The session date.
int
The session number.
str
The task protocol (empty str).
str
The associated project (empty str).
"""
lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
eid = _ses_str_id(rel_ses_path)
s_date = pd.to_datetime(s_date).date()
return eid, lab or '', subject, s_date, int(num), '', ''
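For orientation, a sketch of calling the reworked helper with the relative session path used in the test fixture further down; the expected tuple is inferred from the return statement above and that fixture, not taken verbatim from library output:

```python
from one.alf.cache import _get_session_info  # private helper, used here purely for illustration

info = _get_session_info('mylab/Subjects/mysub/2021-02-28/001/')
print(info)
# Expected, per the code above and the test fixture below:
# ('mylab/mysub/2021-02-28/001', 'mylab', 'mysub', datetime.date(2021, 2, 28), 1, '', '')
```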


def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
@@ -110,22 +128,26 @@ def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
Returns
-------
dict
A dataset record.
TODO Return tuples for more memory-efficient cache generation.
str, uuid.UUID
The session uuid.
str
The dataset ID expressed as a posix path relative to the session.
str
The dataset posix path, relative to the session.
int
The dataset file size.
str
The file hash, or empty str if `compute_hash` is false.
bool
Whether the file exists.
str
The QC value for the dataset ('NOT_SET').
"""
rel_dset_path = get_alf_path(dset_path.relative_to_session())
ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
return {
'id': rel_dset_path,
'eid': ses_eid or pd.NA,
'rel_path': rel_dset_path,
'file_size': dset_path.stat().st_size,
'hash': md5(dset_path) if compute_hash else '',
'exists': True,
'qc': 'NOT_SET'
}
file_size = dset_path.stat().st_size
file_hash = md5(dset_path) if compute_hash else ''
return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'


def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
@@ -200,7 +222,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:
rel_path = get_alf_path(full_path)
# A dict of session info extracted from path
ses_info = _get_session_info(rel_path)
assert set(ses_info.keys()) <= set(SESSIONS_COLUMNS)
assert len(ses_info) == len(SESSIONS_COLUMNS)
rows.append(ses_info)
df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
return df
@@ -222,17 +244,14 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
pandas.DataFrame
A pandas DataFrame of dataset info.
"""
df = EMPTY_DATASETS_FRAME.copy()
# Go through sessions and append datasets
rows = []
for session_path in iter_sessions(root_dir):
rows = []
for dset_path in session_path.iter_datasets(recursive=True):
file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
assert len(file_info) == len(DATASETS_COLUMNS)
rows.append(file_info)
df = pd.concat((df, pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)),
ignore_index=True, verify_integrity=True)
return df.astype({'qc': QC_TYPE})
return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)
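This return is the crux of the lower-memory generation: dataset rows are accumulated as plain tuples across all sessions and the DataFrame is built exactly once, instead of seeding an empty frame and concatenating a per-session frame on every iteration. A minimal, self-contained sketch of the two patterns (illustrative names and data, not the library code):

```python
import pandas as pd

COLUMNS = {'rel_path': object, 'file_size': 'int64'}  # illustrative column -> dtype mapping

def build_by_concat(batches):
    """Old pattern: repeated concatenation copies the growing table on every step."""
    df = pd.DataFrame(columns=list(COLUMNS)).astype(COLUMNS)
    for rows in batches:
        batch = pd.DataFrame(rows, columns=list(COLUMNS)).astype(COLUMNS)
        df = pd.concat((df, batch), ignore_index=True)
    return df

def build_once(batches):
    """New pattern: keep lightweight tuples and construct the DataFrame a single time."""
    rows = [row for batch in batches for row in batch]
    return pd.DataFrame(rows, columns=list(COLUMNS)).astype(COLUMNS)

batches = [[('alf/spikes.clusters.npy', 512), ('alf/spikes.times.npy', 1024)],
           [('alf/spikes.depths.npy', 2048)]]
assert build_by_concat(batches).equals(build_once(batches))
```

With large repositories the concat pattern repeatedly reallocates and copies the accumulated table, which is what drives the higher memory footprint this commit removes.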


def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
@@ -265,7 +284,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab
"""
root_dir = Path(root_dir).resolve()

# Make the dataframes.
# Make the data frames.
df_ses = _make_sessions_df(root_dir)
df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)

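As a usage note rather than part of the diff: cache generation is driven through `make_parquet_db`, whose signature appears above. A hedged sketch of invoking it, assuming it returns the paths of the written sessions and datasets tables; the repository path is a placeholder:

```python
from one.alf.cache import make_parquet_db

# Build the sessions and datasets cache tables for a local ALF repository.
# hash_files=False skips per-file md5 hashing, the slowest part of generation.
ses_table, dsets_table = make_parquet_db('/data/example_repo', hash_ids=True, hash_files=False)
print(ses_table, dsets_table)
```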
4 changes: 2 additions & 2 deletions one/tests/alf/test_cache.py
@@ -17,13 +17,13 @@ class TestsONEParquet(unittest.TestCase):
"""Tests for the make_parquet_db function and its helpers"""
rel_ses_path = 'mylab/Subjects/mysub/2021-02-28/001/'
ses_info = {
'id': 'mylab/mysub/2021-02-28/001',
'lab': 'mylab',
'subject': 'mysub',
'date': datetime.date.fromisoformat('2021-02-28'),
'number': int('001'),
'projects': '',
'task_protocol': '',
'id': 'mylab/mysub/2021-02-28/001',
}
rel_ses_files = [Path('alf/spikes.clusters.npy'), Path('alf/spikes.times.npy')]

@@ -50,7 +50,7 @@ def setUp(self) -> None:
second_session.joinpath('.invalid').touch()

def test_parse(self):
self.assertEqual(apt._get_session_info(self.rel_ses_path), self.ses_info)
self.assertEqual(apt._get_session_info(self.rel_ses_path), tuple(self.ses_info.values()))
self.assertTrue(
self.full_ses_path.as_posix().endswith(self.rel_ses_path[:-1]))

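To exercise the updated helpers locally, the touched test class can be run on its own; a sketch using the module path shown above:

```python
import unittest

# Load and run only the parquet cache tests modified in this commit.
suite = unittest.defaultTestLoader.loadTestsFromName('one.tests.alf.test_cache.TestsONEParquet')
unittest.TextTestRunner(verbosity=2).run(suite)
```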
