Low mem cache (#163)
* Lower memory cache generation
* Update README
k1o0 authored Nov 12, 2024
1 parent ae7a7cc commit 24f66d9
Showing 4 changed files with 54 additions and 34 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -13,13 +13,14 @@ Also adds a new ALFPath class to replace alf path functions.
- paths returned by One methods and functions in one.alf.io are now ALFPath instances
- bugfix: one.alf.path.full_path_parts didn't always raise when invalid path passed
- one.alf.path module containing ALFPath class
- one.alf.exceptions.InvalidALF exception
- ALF cache table generation has lower memory footprint

### Added

- one.alf.cache.remove_cache_table_files and One.\_remove_cache_table_files for deleting cache table files
- one.alf.cache.EMPTY_DATASETS_FRAME and EMPTY_SESSION_FRAME vars for table column, index, and dtype template
- pyproject.toml replaces deprecated setup file
- one.alf.exceptions.InvalidALF exception

### Removed

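A side note on the `one.alf.cache.EMPTY_DATASETS_FRAME` template listed under Added above: it acts as a column, index and dtype scaffold for new cache tables, as the pre-change line `df = EMPTY_DATASETS_FRAME.copy()` in one/alf/cache.py below suggests. A minimal sketch, assuming only that the constant is an empty pandas DataFrame:

```python
from one.alf.cache import EMPTY_DATASETS_FRAME

# Copy the template so a new table starts with the expected columns, index and dtypes.
df = EMPTY_DATASETS_FRAME.copy()
print(df.columns.tolist())
print(df.dtypes.to_dict())
```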
4 changes: 2 additions & 2 deletions README.md
@@ -6,10 +6,10 @@ The Open Neurophysiology Environment is a scheme for sharing neurophysiology dat

Please [Click here](https://int-brain-lab.github.io/ONE/) for the main documentation page. For a quick primer on the file naming convention we use, [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf).

**NB**: The API and backend database are still under active development; for the best experience please regularly update the package by running `pip install -U ONE-api`.

## Requirements
ONE runs on Python 3.8 or later, and is tested on the latest Ubuntu and Windows (3.8 and 3.11 only).
ONE runs on Python 3.10 or later, and is tested on the latest Ubuntu and Windows (3.10 and 3.12 only).

## Installing
Installing the package via pip typically takes a few seconds. To install, run
77 changes: 48 additions & 29 deletions one/alf/cache.py
@@ -86,14 +86,32 @@ def _ses_str_id(session_path):


def _get_session_info(rel_ses_path):
"""Parse a relative session path."""
out = session_path_parts(rel_ses_path, as_dict=True, assert_valid=True)
out['id'] = _ses_str_id(rel_ses_path)
out['date'] = pd.to_datetime(out['date']).date()
out['number'] = int(out['number'])
out['task_protocol'] = ''
out['projects'] = ''
return out
"""Parse a relative session path.
Parameters
----------
rel_ses_path : _type_
_description_
Returns
-------
str
Experiment ID expressed as a relative session posix path.
str
The lab name (empty str).
datetime.date
The session date.
int
The session number.
str
The task protocol (empty str).
str
The associated project (empty str).
"""
lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
eid = _ses_str_id(rel_ses_path)
s_date = pd.to_datetime(s_date).date()
return eid, lab or '', subject, s_date, int(num), '', ''
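For orientation, a sketch of calling the reworked helper with the relative session path used in the test fixture further down; the expected tuple is inferred from the return statement above and that fixture, not taken verbatim from library output:

```python
from one.alf.cache import _get_session_info  # private helper, used here purely for illustration

info = _get_session_info('mylab/Subjects/mysub/2021-02-28/001/')
print(info)
# Expected, per the code above and the test fixture below:
# ('mylab/mysub/2021-02-28/001', 'mylab', 'mysub', datetime.date(2021, 2, 28), 1, '', '')
```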


def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
@@ -110,22 +128,26 @@ def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
Returns
-------
dict
A dataset record.
TODO Return tuples for more memory-efficient cache generation.
str, uuid.UUID
The session uuid.
str
The dataset ID expressed as a posix path relative to the session.
str
The dataset posix path, relative to the session.
int
The dataset file size.
str
The file hash, or empty str if `compute_hash` is false.
bool
Whether the file exists.
str
The QC value for the dataset ('NOT_SET').
"""
rel_dset_path = get_alf_path(dset_path.relative_to_session())
ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
return {
'id': rel_dset_path,
'eid': ses_eid or pd.NA,
'rel_path': rel_dset_path,
'file_size': dset_path.stat().st_size,
'hash': md5(dset_path) if compute_hash else '',
'exists': True,
'qc': 'NOT_SET'
}
file_size = dset_path.stat().st_size
file_hash = md5(dset_path) if compute_hash else ''
return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'


def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
@@ -200,7 +222,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:
rel_path = get_alf_path(full_path)
# A dict of session info extracted from path
ses_info = _get_session_info(rel_path)
assert set(ses_info.keys()) <= set(SESSIONS_COLUMNS)
assert len(ses_info) == len(SESSIONS_COLUMNS)
rows.append(ses_info)
df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
return df
@@ -222,17 +244,14 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
pandas.DataFrame
A pandas DataFrame of dataset info.
"""
df = EMPTY_DATASETS_FRAME.copy()
# Go through sessions and append datasets
rows = []
for session_path in iter_sessions(root_dir):
rows = []
for dset_path in session_path.iter_datasets(recursive=True):
file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
assert len(file_info) == len(DATASETS_COLUMNS)
rows.append(file_info)
df = pd.concat((df, pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)),
ignore_index=True, verify_integrity=True)
return df.astype({'qc': QC_TYPE})
return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)
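This return is the crux of the lower-memory generation: dataset rows are accumulated as plain tuples across all sessions and the DataFrame is built exactly once, instead of seeding an empty frame and concatenating a per-session frame on every iteration. A minimal, self-contained sketch of the two patterns (illustrative names and data, not the library code):

```python
import pandas as pd

COLUMNS = {'rel_path': object, 'file_size': 'int64'}  # illustrative column -> dtype mapping

def build_by_concat(batches):
    """Old pattern: repeated concatenation copies the growing table on every step."""
    df = pd.DataFrame(columns=list(COLUMNS)).astype(COLUMNS)
    for rows in batches:
        batch = pd.DataFrame(rows, columns=list(COLUMNS)).astype(COLUMNS)
        df = pd.concat((df, batch), ignore_index=True)
    return df

def build_once(batches):
    """New pattern: keep lightweight tuples and construct the DataFrame a single time."""
    rows = [row for batch in batches for row in batch]
    return pd.DataFrame(rows, columns=list(COLUMNS)).astype(COLUMNS)

batches = [[('alf/spikes.clusters.npy', 512), ('alf/spikes.times.npy', 1024)],
           [('alf/spikes.depths.npy', 2048)]]
assert build_by_concat(batches).equals(build_once(batches))
```

With large repositories the concat pattern repeatedly reallocates and copies the accumulated table, which is what drives the higher memory footprint this commit removes.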


def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
@@ -265,7 +284,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab
"""
root_dir = Path(root_dir).resolve()

# Make the dataframes.
# Make the data frames.
df_ses = _make_sessions_df(root_dir)
df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)

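As a usage note rather than part of the diff: cache generation is driven through `make_parquet_db`, whose signature appears above. A hedged sketch of invoking it, assuming it returns the paths of the written sessions and datasets tables; the repository path is a placeholder:

```python
from one.alf.cache import make_parquet_db

# Build the sessions and datasets cache tables for a local ALF repository.
# hash_files=False skips per-file md5 hashing, the slowest part of generation.
ses_table, dsets_table = make_parquet_db('/data/example_repo', hash_ids=True, hash_files=False)
print(ses_table, dsets_table)
```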
4 changes: 2 additions & 2 deletions one/tests/alf/test_cache.py
@@ -17,13 +17,13 @@ class TestsONEParquet(unittest.TestCase):
"""Tests for the make_parquet_db function and its helpers"""
rel_ses_path = 'mylab/Subjects/mysub/2021-02-28/001/'
ses_info = {
'id': 'mylab/mysub/2021-02-28/001',
'lab': 'mylab',
'subject': 'mysub',
'date': datetime.date.fromisoformat('2021-02-28'),
'number': int('001'),
'projects': '',
'task_protocol': '',
'id': 'mylab/mysub/2021-02-28/001',
}
rel_ses_files = [Path('alf/spikes.clusters.npy'), Path('alf/spikes.times.npy')]

@@ -50,7 +50,7 @@ def setUp(self) -> None:
second_session.joinpath('.invalid').touch()

def test_parse(self):
self.assertEqual(apt._get_session_info(self.rel_ses_path), self.ses_info)
self.assertEqual(apt._get_session_info(self.rel_ses_path), tuple(self.ses_info.values()))
self.assertTrue(
self.full_ses_path.as_posix().endswith(self.rel_ses_path[:-1]))

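To exercise the updated helpers locally, the touched test class can be run on its own; a sketch using the module path shown above:

```python
import unittest

# Load and run only the parquet cache tests modified in this commit.
suite = unittest.defaultTestLoader.loadTestsFromName('one.tests.alf.test_cache.TestsONEParquet')
unittest.TextTestRunner(verbosity=2).run(suite)
```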
