diff --git a/CHANGELOG.md b/CHANGELOG.md
index 743a45dd..4f837857 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,13 +13,14 @@ Also adds a new ALFPath class to replace alf path functions.
 - paths returned by One methods and functions in one.alf.io are now ALFPath instances
 - bugfix: one.alf.path.full_path_parts didn't always raise when invalid path passed
 - one.alf.path module containing ALFPath class
-- one.alf.exceptions.InvalidALF exception
+- ALF cache table generation has lower memory footprint
 
 ### Added
 
 - one.alf.cache.remove_cache_table_files and One.\_remove_cache_table_files for deleting cache table files
 - one.alf.cache.EMPTY_DATASETS_FRAME and EMPTY_SESSION_FRAME vars for table column, index, and dtype template
 - pyproject.toml replaces deprecated setup file
+- one.alf.exceptions.InvalidALF exception
 
 ### Removed
 
diff --git a/README.md b/README.md
index 5c8f75aa..f8b6b4a8 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ The Open Neurophysiology Environment is a scheme for sharing neurophysiology dat
 
 Please [Click here](https://int-brain-lab.github.io/ONE/) for the main documentation page. For a quick primer on the file naming convention we use, [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf).
 
-**NB**: The API and backend database are still under active development, for the best experience please regularly update the package by running `pip install -U ONE-api`.
+**NB**: The API and backend database are still under active development, for the best experience please regularly update the package by running `pip install -U ONE-api`.
 
 ## Requirements
-ONE runs on Python 3.8 or later, and is tested on the latest Ubuntu and Windows (3.8 and 3.11 only).
+ONE runs on Python 3.10 or later, and is tested on the latest Ubuntu and Windows (3.10 and 3.12 only).
 
 ## Installing
 Installing the package via pip typically takes a few seconds. To install, run
diff --git a/one/alf/cache.py b/one/alf/cache.py
index 1143e9a0..895ef22d 100644
--- a/one/alf/cache.py
+++ b/one/alf/cache.py
@@ -86,14 +86,34 @@ def _ses_str_id(session_path):
 
 
 def _get_session_info(rel_ses_path):
-    """Parse a relative session path."""
-    out = session_path_parts(rel_ses_path, as_dict=True, assert_valid=True)
-    out['id'] = _ses_str_id(rel_ses_path)
-    out['date'] = pd.to_datetime(out['date']).date()
-    out['number'] = int(out['number'])
-    out['task_protocol'] = ''
-    out['projects'] = ''
-    return out
+    """Parse a relative session path.
+
+    Parameters
+    ----------
+    rel_ses_path : str
+        A session path, relative to the root directory.
+
+    Returns
+    -------
+    str
+        Experiment ID expressed as a relative session posix path.
+    str
+        The lab name, or an empty str if not present in the path.
+    str
+        The subject name.
+    datetime.date
+        The session date.
+    int
+        The session number.
+    str
+        The task protocol (empty str).
+    str
+        The associated project (empty str).
+    """
+    lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
+    eid = _ses_str_id(rel_ses_path)
+    s_date = pd.to_datetime(s_date).date()
+    return eid, lab or '', subject, s_date, int(num), '', ''
 
 
 def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
@@ -110,22 +130,26 @@
 
     Returns
     -------
-    dict
-        A dataset record.
-
-    TODO Return tuples for more memory-efficient cache generation.
+    str, uuid.UUID
+        The session uuid.
+    str
+        The dataset ID expressed as a posix path relative to the session.
+    str
+        The dataset posix path, relative to the session.
+    int
+        The dataset file size.
+    str
+        The file hash, or empty str if `compute_hash` is false.
+    bool
+        Whether the file exists.
+    str
+        The QC value for the dataset ('NOT_SET').
     """
     rel_dset_path = get_alf_path(dset_path.relative_to_session())
     ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
-    return {
-        'id': rel_dset_path,
-        'eid': ses_eid or pd.NA,
-        'rel_path': rel_dset_path,
-        'file_size': dset_path.stat().st_size,
-        'hash': md5(dset_path) if compute_hash else '',
-        'exists': True,
-        'qc': 'NOT_SET'
-    }
+    file_size = dset_path.stat().st_size
+    file_hash = md5(dset_path) if compute_hash else ''
+    return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'
 
 
 def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
@@ -200,7 +224,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:
         rel_path = get_alf_path(full_path)
         # A dict of session info extracted from path
         ses_info = _get_session_info(rel_path)
-        assert set(ses_info.keys()) <= set(SESSIONS_COLUMNS)
+        assert len(ses_info) == len(SESSIONS_COLUMNS)
         rows.append(ses_info)
     df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
     return df
@@ -222,17 +246,14 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
     pandas.DataFrame
         A pandas DataFrame of dataset info.
     """
-    df = EMPTY_DATASETS_FRAME.copy()
     # Go through sessions and append datasets
+    rows = []
     for session_path in iter_sessions(root_dir):
-        rows = []
         for dset_path in session_path.iter_datasets(recursive=True):
             file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
-            assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
+            assert len(file_info) == len(DATASETS_COLUMNS)
             rows.append(file_info)
-        df = pd.concat((df, pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)),
-                       ignore_index=True, verify_integrity=True)
-    return df.astype({'qc': QC_TYPE})
+    return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)
 
 
 def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
@@ -265,7 +286,7 @@
     """
     root_dir = Path(root_dir).resolve()
 
-    # Make the dataframes.
+    # Make the data frames.
     df_ses = _make_sessions_df(root_dir)
     df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)
 
diff --git a/one/tests/alf/test_cache.py b/one/tests/alf/test_cache.py
index 05c894c3..9a74a0f4 100644
--- a/one/tests/alf/test_cache.py
+++ b/one/tests/alf/test_cache.py
@@ -17,13 +17,13 @@ class TestsONEParquet(unittest.TestCase):
     """Tests for the make_parquet_db function and its helpers"""
     rel_ses_path = 'mylab/Subjects/mysub/2021-02-28/001/'
     ses_info = {
+        'id': 'mylab/mysub/2021-02-28/001',
         'lab': 'mylab',
         'subject': 'mysub',
         'date': datetime.date.fromisoformat('2021-02-28'),
         'number': int('001'),
         'projects': '',
         'task_protocol': '',
-        'id': 'mylab/mysub/2021-02-28/001',
     }
     rel_ses_files = [Path('alf/spikes.clusters.npy'), Path('alf/spikes.times.npy')]
 
@@ -50,7 +50,7 @@ def setUp(self) -> None:
         second_session.joinpath('.invalid').touch()
 
     def test_parse(self):
-        self.assertEqual(apt._get_session_info(self.rel_ses_path), self.ses_info)
+        self.assertEqual(apt._get_session_info(self.rel_ses_path), tuple(self.ses_info.values()))
         self.assertTrue(
            self.full_ses_path.as_posix().endswith(self.rel_ses_path[:-1]))
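The single-pass table construction in `_make_datasets_df` is what gives the cache generation its lower memory footprint: rows are collected as plain tuples and one DataFrame is built at the end, instead of concatenating a per-session DataFrame onto a growing table. Below is a minimal, self-contained sketch of that pattern; the `COLUMNS` mapping and the example rows are simplified placeholders, not the real `DATASETS_COLUMNS` template or `_get_dataset_info` output.

```python
# Illustrative sketch only: build a cache table from plain tuples in a single
# pass, rather than concatenating one DataFrame per session inside the loop.
# COLUMNS is a simplified stand-in for the DATASETS_COLUMNS name/dtype template.
import pandas as pd

COLUMNS = {'eid': object, 'id': object, 'rel_path': object,
           'file_size': 'uint64', 'hash': object, 'exists': bool, 'qc': object}

# Hypothetical rows, shaped like the tuples returned by _get_dataset_info.
rows = [
    ('mylab/mysub/2021-02-28/001', 'alf/spikes.times.npy', 'alf/spikes.times.npy',
     1024, '', True, 'NOT_SET'),
    ('mylab/mysub/2021-02-28/001', 'alf/spikes.clusters.npy', 'alf/spikes.clusters.npy',
     512, '', True, 'NOT_SET'),
]

# Constructing the DataFrame once from the collected rows avoids the
# intermediate copies that repeated pd.concat calls would make.
df = pd.DataFrame(rows, columns=list(COLUMNS)).astype(COLUMNS)
print(df.dtypes)
```

Because `pd.concat` returns a fresh copy of the accumulated table on every call, building the frame once from the collected tuples keeps peak memory roughly proportional to the number of rows rather than to the number of sessions times the table size.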