Low mem cache #163

Merged · 2 commits · Nov 12, 2024
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -13,13 +13,14 @@ Also adds a new ALFPath class to replace alf path functions.
- paths returned by One methods and functions in one.alf.io are now ALFPath instances
- bugfix: one.alf.path.full_path_parts didn't always raise when invalid path passed
- one.alf.path module containing ALFPath class
-- one.alf.exceptions.InvalidALF exception
+- ALF cache table generation has lower memory footprint

### Added

- one.alf.cache.remove_cache_table_files and One.\_remove_cache_table_files for deleting cache table files
- one.alf.cache.EMPTY_DATASETS_FRAME and EMPTY_SESSION_FRAME vars for table column, index, and dtype template
- pyproject.toml replaces deprecated setup file
+- one.alf.exceptions.InvalidALF exception

### Removed

4 changes: 2 additions & 2 deletions README.md
@@ -6,10 +6,10 @@ The Open Neurophysiology Environment is a scheme for sharing neurophysiology dat

Please [Click here](https://int-brain-lab.github.io/ONE/) for the main documentation page. For a quick primer on the file naming convention we use, [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf).

**NB**: The API and backend database are still under active development; for the best experience please regularly update the package by running `pip install -U ONE-api`.

## Requirements
-ONE runs on Python 3.8 or later, and is tested on the latest Ubuntu and Windows (3.8 and 3.11 only).
+ONE runs on Python 3.10 or later, and is tested on the latest Ubuntu and Windows (3.10 and 3.12 only).

## Installing
Installing the package via pip typically takes a few seconds. To install, run
77 changes: 48 additions & 29 deletions one/alf/cache.py
@@ -86,14 +86,32 @@ def _ses_str_id(session_path):


def _get_session_info(rel_ses_path):
"""Parse a relative session path."""
out = session_path_parts(rel_ses_path, as_dict=True, assert_valid=True)
out['id'] = _ses_str_id(rel_ses_path)
out['date'] = pd.to_datetime(out['date']).date()
out['number'] = int(out['number'])
out['task_protocol'] = ''
out['projects'] = ''
return out
"""Parse a relative session path.

Parameters
----------
rel_ses_path : _type_
_description_

Returns
-------
str
Experiment ID expressed as a relative session posix path.
str
The lab name (empty str).
datetime.date
The session date.
int
The session number.
str
The task protocol (empty str).
str
The associated project (empty str).
"""
lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
eid = _ses_str_id(rel_ses_path)
s_date = pd.to_datetime(s_date).date()
return eid, lab or '', subject, s_date, int(num), '', ''


def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
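
The hunk above replaces the per-session dict record with a plain tuple. Here is a minimal sketch of why that lowers the memory footprint of cache generation; the record values are illustrative and not taken from this PR:

```python
import sys
import pandas as pd

# Illustrative record only; the field values are made up for the comparison.
row_as_dict = {'id': 'mylab/mysub/2021-02-28/001', 'lab': 'mylab', 'subject': 'mysub',
               'date': '2021-02-28', 'number': 1, 'task_protocol': '', 'projects': ''}
row_as_tuple = tuple(row_as_dict.values())

# Every dict row carries its own hash table alongside the values, while a
# tuple is a flat array of value references; over hundreds of thousands of
# rows the per-record overhead adds up. Exact sizes vary by interpreter.
print(sys.getsizeof(row_as_dict), sys.getsizeof(row_as_tuple))

# Either shape can seed a DataFrame, but with tuples the column names are
# supplied once for the whole frame rather than stored per record.
df = pd.DataFrame([row_as_tuple], columns=list(row_as_dict))
```
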
@@ -110,22 +110,26 @@ def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):

    Returns
    -------
-    dict
-        A dataset record.
-
-    TODO Return tuples for more memory-efficient cache generation.
+    str, uuid.UUID
+        The session uuid.
+    str
+        The dataset ID expressed as a posix path relative to the session.
+    str
+        The dataset posix path, relative to the session.
+    int
+        The dataset file size.
+    str
+        The file hash, or empty str if `compute_hash` is false.
+    bool
+        Whether the file exists (always True here).
+    str
+        The QC value for the dataset ('NOT_SET').
    """
    rel_dset_path = get_alf_path(dset_path.relative_to_session())
    ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
-    return {
-        'id': rel_dset_path,
-        'eid': ses_eid or pd.NA,
-        'rel_path': rel_dset_path,
-        'file_size': dset_path.stat().st_size,
-        'hash': md5(dset_path) if compute_hash else '',
-        'exists': True,
-        'qc': 'NOT_SET'
-    }
+    file_size = dset_path.stat().st_size
+    file_hash = md5(dset_path) if compute_hash else ''
+    return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'


def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
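
Because rows are now plain tuples, DataFrame columns are matched by position rather than by key, which is also why the schema checks in the hunks below change from key-set comparisons to length asserts. A short sketch of that contract, with a column tuple standing in for the real DATASETS_COLUMNS spec (which in the module also carries dtypes):

```python
import pandas as pd

# Stand-in for DATASETS_COLUMNS; only the ordering idea matters here.
COLS = ('eid', 'id', 'rel_path', 'file_size', 'hash', 'exists', 'qc')

row = ('mylab/mysub/2021-02-28/001', 'alf/spikes.times.npy',
       'alf/spikes.times.npy', 2048, '', True, 'NOT_SET')

# Tuples have no keys to validate against the schema, so the cheapest sanity
# check left is that each row's arity matches the column count.
assert len(row) == len(COLS)

df = pd.DataFrame([row], columns=COLS)  # values are assigned by position
```
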
@@ -200,7 +222,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:
        rel_path = get_alf_path(full_path)
        # A tuple of session info extracted from the path
        ses_info = _get_session_info(rel_path)
-        assert set(ses_info.keys()) <= set(SESSIONS_COLUMNS)
+        assert len(ses_info) == len(SESSIONS_COLUMNS)
        rows.append(ses_info)
    df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
    return df
@@ -222,17 +244,14 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
    pandas.DataFrame
        A pandas DataFrame of dataset info.
    """
-    df = EMPTY_DATASETS_FRAME.copy()
    # Go through sessions and append datasets
+    rows = []
    for session_path in iter_sessions(root_dir):
-        rows = []
        for dset_path in session_path.iter_datasets(recursive=True):
            file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
-            assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
+            assert len(file_info) == len(DATASETS_COLUMNS)
            rows.append(file_info)
-        df = pd.concat((df, pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)),
-                       ignore_index=True, verify_integrity=True)
-    return df.astype({'qc': QC_TYPE})
+    return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)


def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
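
The `_make_datasets_df` hunk above carries the main memory win: instead of growing a DataFrame with `pd.concat` once per session, rows accumulate as tuples and the frame is built exactly once. A toy comparison of the two patterns, with schema and values invented for the example:

```python
import pandas as pd

cols = ('id', 'file_size')  # toy schema, not the real DATASETS_COLUMNS
batches = [[(f'ses{s}/dset{d}.npy', 1024 * d) for d in range(2)] for s in range(3)]

# Old pattern: concat inside the loop. Each call copies every row accumulated
# so far into a brand-new frame, so the total copying work grows quadratically
# and peak memory briefly holds two copies of the table.
df_old = pd.DataFrame(columns=cols)
for batch in batches:
    df_old = pd.concat((df_old, pd.DataFrame(batch, columns=cols)), ignore_index=True)

# New pattern: collect lightweight tuples and build the frame exactly once.
rows = [row for batch in batches for row in batch]
df_new = pd.DataFrame(rows, columns=cols)

assert df_old.values.tolist() == df_new.values.tolist()
```

Building once also sidesteps the `verify_integrity` index check that the old code paid on every iteration.
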
@@ -265,7 +284,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab
"""
root_dir = Path(root_dir).resolve()

# Make the dataframes.
# Make the data frames.
df_ses = _make_sessions_df(root_dir)
df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)

4 changes: 2 additions & 2 deletions one/tests/alf/test_cache.py
@@ -17,13 +17,13 @@ class TestsONEParquet(unittest.TestCase):
"""Tests for the make_parquet_db function and its helpers"""
rel_ses_path = 'mylab/Subjects/mysub/2021-02-28/001/'
ses_info = {
'id': 'mylab/mysub/2021-02-28/001',
'lab': 'mylab',
'subject': 'mysub',
'date': datetime.date.fromisoformat('2021-02-28'),
'number': int('001'),
'projects': '',
'task_protocol': '',
'id': 'mylab/mysub/2021-02-28/001',
}
rel_ses_files = [Path('alf/spikes.clusters.npy'), Path('alf/spikes.times.npy')]

@@ -50,7 +50,7 @@ def setUp(self) -> None:
        second_session.joinpath('.invalid').touch()

    def test_parse(self):
-        self.assertEqual(apt._get_session_info(self.rel_ses_path), self.ses_info)
+        self.assertEqual(apt._get_session_info(self.rel_ses_path), tuple(self.ses_info.values()))
        self.assertTrue(
            self.full_ses_path.as_posix().endswith(self.rel_ses_path[:-1]))
