From 139e1b7c1a55d24c7b99c9abc1180aece4136ed8 Mon Sep 17 00:00:00 2001
From: olivier
Date: Sat, 6 Jan 2024 18:07:25 +0000
Subject: [PATCH 1/4] log server 500 error message if provided in request text

---
 one/webclient.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/one/webclient.py b/one/webclient.py
index cd6a5a12..465f8b67 100644
--- a/one/webclient.py
+++ b/one/webclient.py
@@ -615,6 +615,7 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None):
                 message = json.loads(r.text)
                 message.pop('status_code', None)  # Get status code from response object instead
                 message = message.get('detail') or message  # Get details if available
+                _logger.error(message)
             except json.decoder.JSONDecodeError:
                 message = r.text
             raise requests.HTTPError(r.status_code, rest_query, message, response=r)

From 78765f634b0b47ae0aa9709598054447d3726b06 Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 10 Jan 2024 14:32:40 +0000
Subject: [PATCH 2/4] _check_filesystem() first check size before hash, add a flag to skip hash

---
 one/api.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/one/api.py b/one/api.py
index 6c7340b6..290f3aae 100644
--- a/one/api.py
+++ b/one/api.py
@@ -523,7 +523,7 @@ def sort_fcn(itm):
         else:
             return eids
 
-    def _check_filesystem(self, datasets, offline=None, update_exists=True):
+    def _check_filesystem(self, datasets, offline=None, update_exists=True, check_hash=True):
         """Update the local filesystem for the given datasets.
 
         Given a set of datasets, check whether records correctly reflect the filesystem.
@@ -579,15 +579,15 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True):
             if file.exists():
                 # Check if there's a hash mismatch
                 # If so, add this index to list of datasets that need downloading
-                if rec['hash'] is not None:
+                if rec['file_size'] and file.stat().st_size != rec['file_size']:
+                    _logger.warning('local file size mismatch on dataset: %s',
+                                    PurePosixPath(rec.session_path, rec.rel_path))
+                    indices_to_download.append(i)
+                elif check_hash and rec['hash'] is not None:
                     if hashfile.md5(file) != rec['hash']:
                         _logger.warning('local md5 mismatch on dataset: %s',
                                         PurePosixPath(rec.session_path, rec.rel_path))
                         indices_to_download.append(i)
-                elif rec['file_size'] and file.stat().st_size != rec['file_size']:
-                    _logger.warning('local file size mismatch on dataset: %s',
-                                    PurePosixPath(rec.session_path, rec.rel_path))
-                    indices_to_download.append(i)
                 files.append(file)  # File exists so add to file list
             else:  # File doesn't exist so add None to output file list
                 files.append(None)

From 9b23f15231866279756a280aa8d22a7b5fdeef0c Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 10 Jan 2024 16:20:17 +0000
Subject: [PATCH 3/4] add changelog and allen s3 bucket convenience functions

---
 CHANGELOG.md                 | 16 ++++++++++++++--
 one/remote/aws.py            | 17 +++++++++++++++++
 one/tests/remote/test_aws.py | 14 ++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af4a10a7..d83f962a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.2]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0]
+
+### Modified
+
+- one.load_dataset
+  - add an option (`check_hash=False`) to skip computing the hash for existing files when loading datasets
+  - check the file size before computing the hash, for performance
+
+### Added
+
+- one.remote.aws.get_s3_allen() convenience function to interact with the Allen Institute S3 bucket for atlases
+
+## [2.5.2]
 
 ### Modified
 
@@ -11,7 +23,7 @@
 
 - exclude irrelevant s3 objects with source name in key, e.g. for foo/bar exclude foo/bar_baz/ key
 
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.0]
+## [2.5.0]
 
 ### Added
 
diff --git a/one/remote/aws.py b/one/remote/aws.py
index dc696c60..9ae2792a 100644
--- a/one/remote/aws.py
+++ b/one/remote/aws.py
@@ -168,6 +168,23 @@ def get_s3_public():
     return s3, S3_BUCKET_IBL
 
 
+def get_s3_allen():
+    """
+    Retrieve the Allen public S3 service resource.
+
+    Returns
+    -------
+    s3.ServiceResource
+        An S3 ServiceResource instance with anonymous access.
+    str
+        The name of the S3 bucket.
+    """
+    S3_BUCKET_ALLEN = 'allen-brain-cell-atlas'
+    session = boto3.Session(region_name='us-west-2')
+    s3 = session.resource('s3', config=Config(signature_version=UNSIGNED))
+    return s3, S3_BUCKET_ALLEN
+
+
 def get_s3_from_alyx(alyx, repo_name=REPO_DEFAULT):
     """
     Create an S3 resource instance using credentials from an Alyx data repository.
diff --git a/one/tests/remote/test_aws.py b/one/tests/remote/test_aws.py
index fc820c89..151a9a4c 100644
--- a/one/tests/remote/test_aws.py
+++ b/one/tests/remote/test_aws.py
@@ -143,6 +143,20 @@ def test_url2uri(self):
         uri, loc = aws.url2uri(url, return_location=True)
         self.assertEqual(loc, 'eu-east-1')
 
+    @mock.patch('boto3.Session')
+    def test_get_ibl_s3(self, session_mock):
+        s3, bucket = aws.get_s3_public()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'ibl-brain-wide-map-public')
+
+    @mock.patch('boto3.Session')
+    def test_get_allen_s3(self, session_mock):
+        s3, bucket = aws.get_s3_allen()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'allen-brain-cell-atlas')
+
 
 if __name__ == '__main__':
     unittest.main(exit=False)

From 647123f74381b13f593cb649331e32885bb15611 Mon Sep 17 00:00:00 2001
From: olivier
Date: Wed, 24 Jan 2024 11:39:46 +0000
Subject: [PATCH 4/4] add review corrections

---
 CHANGELOG.md          |  9 ++++++++-
 one/__init__.py       |  2 +-
 one/tests/test_one.py | 10 ++++++++++
 one/webclient.py      |  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f01875ea..16b0b6e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0]
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.5]
+
+### Modified
+- `one.load_dataset`
+  - add an option (`check_hash=False`) to skip computing the hash for existing files when loading datasets
+  - check the file size before computing the hash, for performance
+
+
+## [2.5.5]
 
 ### Modified
 
diff --git a/one/__init__.py b/one/__init__.py
index d319c051..fddf4607 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.5.5'
+__version__ = '2.6.0'
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index 86ce63cd..0507d4e7 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -68,6 +68,16 @@ def setUp(self) -> None:
         self.one = ONE(mode='local', cache_dir=self.tempdir.name)
         # Create dset files from cache
         util.create_file_tree(self.one)
+        # here we create some variations to get coverage over more cases:
+        # the first 10 records will have the right file size (0) but the wrong hash,
+        # the next 10 records will have the right file size (0) and the correct empty-file md5,
+        # all remaining records have NaN in file_size and None in hash (default cache table)
+        cols = self.one._cache['datasets'].columns
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('file_size')] = 0
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('hash')]\
+            = 'd41d8cd98f00b204e9800998ecf8427e'  # the correct md5 of an empty file
+        self.one._cache['datasets'].iloc[:10, cols.get_loc('hash')]\
+            = 'd41d8cda454aaaa4e9800998ecf8497e'  # a deliberately wrong hash
 
     def tearDown(self) -> None:
         while Path(self.one.cache_dir).joinpath('.cache.lock').exists():
diff --git a/one/webclient.py b/one/webclient.py
index 465f8b67..3e57f1ad 100644
--- a/one/webclient.py
+++ b/one/webclient.py
@@ -610,12 +610,12 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None):
             self.authenticate(username=username, force=True)
             return self._generic_request(reqfunction, rest_query, data=data, files=files)
         else:
-            _logger.debug('Response text: ' + r.text)
+            _logger.debug('Response text raw: ' + r.text)
             try:
                 message = json.loads(r.text)
                 message.pop('status_code', None)  # Get status code from response object instead
                 message = message.get('detail') or message  # Get details if available
-                _logger.error(message)
+                _logger.debug(message)
             except json.decoder.JSONDecodeError:
                 message = r.text
             raise requests.HTTPError(r.status_code, rest_query, message, response=r)
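
Usage sketch for the features in this series (a minimal example, assuming a configured ONE instance with network access; the session eid and dataset name below are hypothetical placeholders, and `check_hash` is assumed to be forwarded by `load_dataset` as stated in the changelog entry):

    from one.api import ONE
    from one.remote.aws import get_s3_allen

    one = ONE()
    # Skip the md5 computation for datasets already on disk; only file sizes
    # are compared against the cache table (PATCH 2/4).
    trials = one.load_dataset('4a45c8ba-0000-0000-0000-000000000000',  # hypothetical eid
                              'trials.table.pqt', check_hash=False)

    # Anonymous access to the Allen Institute public bucket (PATCH 3/4);
    # no AWS credentials are required.
    s3, bucket_name = get_s3_allen()
    for obj in s3.Bucket(bucket_name).objects.limit(5):
        print(obj.key)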