From 139e1b7c1a55d24c7b99c9abc1180aece4136ed8 Mon Sep 17 00:00:00 2001
From: olivier
Date: Sat, 6 Jan 2024 18:07:25 +0000
Subject: [PATCH 1/4] log server 500 error message if provided in request text

---
 one/webclient.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/one/webclient.py b/one/webclient.py
index cd6a5a12..465f8b67 100644
--- a/one/webclient.py
+++ b/one/webclient.py
@@ -615,6 +615,7 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None):
                 message = json.loads(r.text)
                 message.pop('status_code', None)  # Get status code from response object instead
                 message = message.get('detail') or message  # Get details if available
+                _logger.error(message)
             except json.decoder.JSONDecodeError:
                 message = r.text
             raise requests.HTTPError(r.status_code, rest_query, message, response=r)

From 78765f634b0b47ae0aa9709598054447d3726b06 Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 10 Jan 2024 14:32:40 +0000
Subject: [PATCH 2/4] _check_filesystem() first check size before hash, add a flag to skip hash

---
 one/api.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/one/api.py b/one/api.py
index 6c7340b6..290f3aae 100644
--- a/one/api.py
+++ b/one/api.py
@@ -523,7 +523,7 @@ def sort_fcn(itm):
         else:
             return eids
 
-    def _check_filesystem(self, datasets, offline=None, update_exists=True):
+    def _check_filesystem(self, datasets, offline=None, update_exists=True, check_hash=True):
         """Update the local filesystem for the given datasets.
 
         Given a set of datasets, check whether records correctly reflect the filesystem.
@@ -579,15 +579,15 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True):
             if file.exists():
                 # Check if there's a hash mismatch
                 # If so, add this index to list of datasets that need downloading
-                if rec['hash'] is not None:
+                if rec['file_size'] and file.stat().st_size != rec['file_size']:
+                    _logger.warning('local file size mismatch on dataset: %s',
+                                    PurePosixPath(rec.session_path, rec.rel_path))
+                    indices_to_download.append(i)
+                elif check_hash and rec['hash'] is not None:
                     if hashfile.md5(file) != rec['hash']:
                         _logger.warning('local md5 mismatch on dataset: %s',
                                         PurePosixPath(rec.session_path, rec.rel_path))
                         indices_to_download.append(i)
-                elif rec['file_size'] and file.stat().st_size != rec['file_size']:
-                    _logger.warning('local file size mismatch on dataset: %s',
-                                    PurePosixPath(rec.session_path, rec.rel_path))
-                    indices_to_download.append(i)
                 files.append(file)  # File exists so add to file list
             else:  # File doesn't exist so add None to output file list
                 files.append(None)

From 9b23f15231866279756a280aa8d22a7b5fdeef0c Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 10 Jan 2024 16:20:17 +0000
Subject: [PATCH 3/4] add changelog and allen s3 bucket convenience functions

---
 CHANGELOG.md                 | 16 ++++++++++++++--
 one/remote/aws.py            | 17 +++++++++++++++++
 one/tests/remote/test_aws.py | 14 ++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af4a10a7..d83f962a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.2]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0]
+
+### Modified
+
+- one.load_dataset
+  - add an option (`check_hash=False`) to skip computing the hash for existing files when loading datasets
+  - check the file size before computing the hash, for performance
+
+### Added
+
+- one.remote.aws.get_s3_allen() convenience function to interact with the Allen Institute S3 bucket for atlases
+
+## [2.5.2]
 
 ### Modified
 
@@ -11,7 +23,7 @@
 
 - exclude irrelevant s3 objects with source name in key, e.g. for foo/bar exclude foo/bar_baz/ key
 
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.0]
+## [2.5.0]
 
 ### Added
 
diff --git a/one/remote/aws.py b/one/remote/aws.py
index dc696c60..9ae2792a 100644
--- a/one/remote/aws.py
+++ b/one/remote/aws.py
@@ -168,6 +168,23 @@ def get_s3_public():
     return s3, S3_BUCKET_IBL
 
 
+def get_s3_allen():
+    """
+    Retrieve the Allen public S3 service resource.
+
+    Returns
+    -------
+    s3.ServiceResource
+        An S3 ServiceResource instance with anonymous access.
+    str
+        The name of the S3 bucket.
+    """
+    S3_BUCKET_ALLEN = 'allen-brain-cell-atlas'
+    session = boto3.Session(region_name='us-west-2')
+    s3 = session.resource('s3', config=Config(signature_version=UNSIGNED))
+    return s3, S3_BUCKET_ALLEN
+
+
 def get_s3_from_alyx(alyx, repo_name=REPO_DEFAULT):
     """
     Create an S3 resource instance using credentials from an Alyx data repository.
diff --git a/one/tests/remote/test_aws.py b/one/tests/remote/test_aws.py
index fc820c89..151a9a4c 100644
--- a/one/tests/remote/test_aws.py
+++ b/one/tests/remote/test_aws.py
@@ -143,6 +143,20 @@ def test_url2uri(self):
         uri, loc = aws.url2uri(url, return_location=True)
         self.assertEqual(loc, 'eu-east-1')
 
+    @mock.patch('boto3.Session')
+    def test_get_ibl_s3(self, session_mock):
+        s3, bucket = aws.get_s3_public()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'ibl-brain-wide-map-public')
+
+    @mock.patch('boto3.Session')
+    def test_get_allen_s3(self, session_mock):
+        s3, bucket = aws.get_s3_allen()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'allen-brain-cell-atlas')
+
 
 if __name__ == '__main__':
     unittest.main(exit=False)

From 647123f74381b13f593cb649331e32885bb15611 Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 24 Jan 2024 11:39:46 +0000
Subject: [PATCH 4/4] add review corrections

---
 CHANGELOG.md          |  9 ++++++++-
 one/__init__.py       |  2 +-
 one/tests/test_one.py | 10 ++++++++++
 one/webclient.py      |  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f01875ea..16b0b6e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0]
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.5]
+
+### Modified
+- `one.load_dataset`
+  - add an option (`check_hash=False`) to skip computing the hash for existing files when loading datasets
+  - check the file size before computing the hash, for performance
+
+
+## [2.5.5]
 
 ### Modified
 
diff --git a/one/__init__.py b/one/__init__.py
index d319c051..fddf4607 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.5.5'
+__version__ = '2.6.0'
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index 86ce63cd..0507d4e7 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -68,6 +68,16 @@ def setUp(self) -> None:
         self.one = ONE(mode='local', cache_dir=self.tempdir.name)
         # Create dset files from cache
         util.create_file_tree(self.one)
+        # here we create some variations to get coverage over more cases:
+        # the first 10 records will have the right file size (0) but the wrong hash,
+        # the next 10 records will have the right file size (0) and the correct empty-file md5,
+        # all remaining records have NaN in file_size and None in hash (default cache table)
+        cols = self.one._cache['datasets'].columns
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('file_size')] = 0
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('hash')]\
+            = 'd41d8cd98f00b204e9800998ecf8427e'  # the correct md5 of an empty file
+        self.one._cache['datasets'].iloc[:10, cols.get_loc('hash')]\
+            = 'd41d8cda454aaaa4e9800998ecf8497e'  # a deliberately wrong hash
 
     def tearDown(self) -> None:
         while Path(self.one.cache_dir).joinpath('.cache.lock').exists():
diff --git a/one/webclient.py b/one/webclient.py
index 465f8b67..3e57f1ad 100644
--- a/one/webclient.py
+++ b/one/webclient.py
@@ -610,12 +610,12 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None):
             self.authenticate(username=username, force=True)
             return self._generic_request(reqfunction, rest_query, data=data, files=files)
         else:
-            _logger.debug('Response text: ' + r.text)
+            _logger.debug('Response text raw: ' + r.text)
             try:
                 message = json.loads(r.text)
                 message.pop('status_code', None)  # Get status code from response object instead
                 message = message.get('detail') or message  # Get details if available
-                _logger.error(message)
+                _logger.debug(message)
             except json.decoder.JSONDecodeError:
                 message = r.text
             raise requests.HTTPError(r.status_code, rest_query, message, response=r)
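
Usage sketch for the features in this series (a minimal example, assuming a configured ONE instance with network access; the session eid and dataset name below are hypothetical placeholders, and `check_hash` is assumed to be forwarded by `load_dataset` as stated in the changelog entry):

    from one.api import ONE
    from one.remote.aws import get_s3_allen

    one = ONE()
    # Skip the md5 computation for datasets already on disk; only file sizes
    # are compared against the cache table (PATCH 2/4).
    trials = one.load_dataset('4a45c8ba-0000-0000-0000-000000000000',  # hypothetical eid
                              'trials.table.pqt', check_hash=False)

    # Anonymous access to the Allen Institute public bucket (PATCH 3/4);
    # no AWS credentials are required.
    s3, bucket_name = get_s3_allen()
    for obj in s3.Bucket(bucket_name).objects.limit(5):
        print(obj.key)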