Commit a9820a0

Merge pull request #305 from 4dn-dcic/misc-minor-updates-20240423

Added check for ES_HOST_LOCAL environment variable in ff_utils.

dmichaels-harvard authored May 24, 2024
2 parents e4bf61b + 16d107e commit a9820a0
Showing 14 changed files with 1,679 additions and 1,007 deletions.
23 changes: 23 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,29 @@ Change Log
----------


8.8.6
=====

* Added a check for the ES_HOST_LOCAL environment variable in ff_utils.get_es_metadata,
  to support running Foursight checks locally (with a local SSH tunnel to the ES proxy);
  came up in foursight/checks/audit_checks (2024-04-23).
* Allow Python 3.12 (pyproject.toml).
* Added a remove_empty_objects_from_lists option to structured_data.StructuredDataSet, defaulting
  to True, which deletes empty objects from lists, but only from the *end* of a list; if this flag
  is True and non-empty objects follow empty objects, an error is flagged (see the sketch after this list).
* A few general additions, initially related to and factored out of rclone support in smaht-submitr:
- Added extract_file_from_zip to zip_utils.
- Added http_utils with download function.
- Added get_app_specific_directory, get_os_name, get_cpu_architecture_name, short_uuid to misc_utils.
  - Added are_files_equal, create_random_file, compute_file_md5, compute_file_etag,
    normalize_path, get_file_size, get_file_modified_datetime to file_utils.
- Minor extra sanity check to search_for_file in file_utils.
- Added deterministic ordering to paths returned by search_for_file in file_utils.
  - Added create_temporary_file_name and remove_temporary_file to tmpfile_utils.
  - Minor fix to misc_utils.create_dict (omit a property only if its value is None).
  - Minor updates to the dcicutils.scripts.view_portal_object utility.

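A minimal standalone sketch of the list-pruning rule described above; this illustrates the rule only and is not the actual StructuredDataSet implementation (the function name is hypothetical):

def remove_empty_objects_from_end_of_list(items: list) -> list:
    # Empty objects ({}) are deleted only from the *end* of the list;
    # an empty object followed by a non-empty one is flagged as an error.
    pruned = list(items)
    while pruned and pruned[-1] == {}:
        pruned.pop()
    if any(item == {} for item in pruned):
        raise ValueError("empty object precedes non-empty objects")
    return pruned

For example, [{"a": 1}, {}, {}] prunes to [{"a": 1}], while [{}, {"a": 1}] is flagged as an error.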

8.8.5
=====

5 changes: 4 additions & 1 deletion dcicutils/ff_utils.py
@@ -895,9 +895,12 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
    used to create the generator.
    Should NOT be used directly
    """
    def get_es_host_local() -> Optional[str]:
        return os.environ.get("ES_HOST_LOCAL", None)
    health = get_health_page(key=auth)
    if es_client is None:
        if not (es_url := get_es_host_local()):
            es_url = health['elasticsearch']
        es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
    namespace_star = health.get('namespace', '') + '*'
    # match all given uuids to _id fields
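
A hedged usage note for the change above: when running Foursight checks locally, setting ES_HOST_LOCAL overrides the Elasticsearch URL from the portal health page (the tunnel endpoint, uuid, and auth key below are assumed examples):

import os
from dcicutils import ff_utils

# Assumed local SSH tunnel endpoint to the ES proxy; when set, this value
# is used in place of the Elasticsearch URL from the portal health page.
os.environ["ES_HOST_LOCAL"] = "http://localhost:9200"
items = ff_utils.get_es_metadata(["<some-uuid>"], key=auth_key)  # hypothetical uuid and auth key
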
291 changes: 250 additions & 41 deletions dcicutils/file_utils.py
@@ -1,13 +1,23 @@
import glob
import hashlib
import io
import os
import pathlib
from datetime import datetime
import random
import string
from tempfile import gettempdir as get_temporary_directory
from typing import List, Optional, Union
from uuid import uuid4 as uuid

HOME_DIRECTORY = str(pathlib.Path().home())


def search_for_file(file: str,
                    location: Union[str, pathlib.PosixPath, Optional[List[Union[str, pathlib.PosixPath]]]] = None,
                    recursive: bool = False,
                    single: bool = False,
                    order: bool = True) -> Union[List[str], Optional[str]]:
    """
    Searches for the existence of the given file name, first directly in the given directory or list
    of directories, if specified, and if not then just in the current (working) directory; if the
    recursive flag is True then also searches sub-directories of these directories. If the single flag
    is True then just the first file which is found is returned (as a string), or None if none; if the
    single flag is False, then all matched files are returned in a list, or an empty list if none.
    """
    def order_by_fewest_number_of_paths_and_then_alphabetically(paths: List[str]) -> List[str]:
        def order_by(path: str):
            return len(path.split(os.path.sep)), path
        return sorted(paths, key=order_by)

    if not (file and isinstance(file, (str, pathlib.PosixPath))):
        return None if single is True else []
    if os.path.isabs(file):
        if os.path.exists(file):
            return str(file) if single is True else [str(file)]
        return None if single is True else []
    files_found = []
    if not location:
        location = ["."]
    elif isinstance(location, (str, pathlib.PosixPath)):
        location = [location]
    elif not isinstance(location, list):
        location = []
    location_pruned = []
    for directory in location:
        if not isinstance(directory, str):
            if not isinstance(directory, pathlib.PosixPath):
                continue
            directory = str(directory)
        if not (directory := directory.strip()):
            continue
        if os.path.isfile(directory := os.path.abspath(os.path.normpath(directory))):
            # Actually, allow a file rather than a directory; assume its parent directory was intended.
            if not (directory := os.path.dirname(directory)):
                continue
        if directory not in location_pruned:
            location_pruned.append(directory)
    location = location_pruned
    for directory in location:
        if os.path.exists(os.path.join(directory, file)):
            file_found = os.path.abspath(os.path.normpath(os.path.join(directory, file)))
            if single is True:
                return file_found
            if file_found not in files_found:
                files_found.append(file_found)
    if recursive is True:
        for directory in location:
            if not directory.endswith("/**") and not file.startswith("**/"):
                path = f"{directory}/**/{file}"
            else:
                path = f"{directory}/{file}"
            files = glob.glob(path, recursive=True if recursive is True else False)
            if files:
                for file_found in files:
                    file_found = os.path.abspath(file_found)
                    if single is True:
                        return file_found
                    if file_found not in files_found:
                        files_found.append(file_found)
    if single is True:
        return files_found[0] if files_found else None
    elif order is True:
        return order_by_fewest_number_of_paths_and_then_alphabetically(files_found)
    else:
        return files_found
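
A hedged usage example of the updated search_for_file (the paths are hypothetical):

# Find every README.md under two directories, searching sub-directories,
# with the new deterministic ordering: fewest path components first, then
# alphabetically within the same depth.
paths = search_for_file("README.md",
                        location=["/tmp/projects", "/tmp/archive"],
                        recursive=True, single=False, order=True)

With single=True the first match is returned as a string (or None if none); with order=False matches come back in discovery order.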


def normalize_path(value: Union[str, pathlib.Path], absolute: bool = False, expand_home: Optional[bool] = None) -> str:
    """
    Normalizes the given path value and returns the result; does things like remove redundant
    consecutive directory separators and redundant parent paths. If the given absolute argument
    is True then converts the path to an absolute path. If the given expand_home argument is False
    and the path can reasonably be represented with a home directory indicator (i.e. "~"), then
    converts it to such. If the expand_home argument is True and the path starts with the home
    directory indicator (i.e. "~") then expands it to the actual (absolute) home path of the caller.
    If the given path value is not actually a string (or pathlib.Path) then returns an empty string.
    """
    if isinstance(value, pathlib.Path):
        value = str(value)
    elif not isinstance(value, str):
        return ""
    if not (value := value.strip()) or not (value := os.path.normpath(value)):
        return ""
    if expand_home is True:
        value = os.path.expanduser(value)
    elif (expand_home is False) and (os.name == "posix"):
        if value.startswith(home := HOME_DIRECTORY + os.sep):
            value = "~/" + value[len(home):]
        elif value == HOME_DIRECTORY:
            value = "~"
    if absolute is True:
        value = os.path.abspath(value)
    return value
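
Hedged examples of the behavior described in the docstring, assuming a POSIX system whose home directory is /home/alice (hypothetical):

normalize_path("/home/alice//data/./x")                # -> "/home/alice/data/x"
normalize_path("/home/alice/data", expand_home=False)  # -> "~/data"
normalize_path("~/data", expand_home=True)             # -> "/home/alice/data"
normalize_path(42)                                     # -> "" (not a str or pathlib.Path)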


def get_file_size(file: str, raise_exception: bool = True) -> Optional[int]:
    try:
        return os.path.getsize(file) if isinstance(file, str) else None
    except Exception:
        if raise_exception is True:
            raise
        return None


def get_file_modified_datetime(file: str, raise_exception: bool = True) -> Optional[datetime]:
    try:
        return datetime.fromtimestamp(os.path.getmtime(file)) if isinstance(file, str) else None
    except Exception:
        if raise_exception is True:
            raise
        return None
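
A hedged note on the raise_exception convention shared by these helpers: passing raise_exception=False converts failures into a None result rather than an exception.

size = get_file_size("/no/such/file", raise_exception=False)               # -> None
when = get_file_modified_datetime("/no/such/file", raise_exception=False)  # -> None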


def are_files_equal(filea: str, fileb: str, raise_exception: bool = True) -> bool:
    """
    Returns True iff the contents of the two given files are exactly the same.
    """
    try:
        with open(filea, "rb") as fa:
            with open(fileb, "rb") as fb:
                chunk_size = 4096
                while True:
                    chunka = fa.read(chunk_size)
                    chunkb = fb.read(chunk_size)
                    if chunka != chunkb:
                        return False
                    if not chunka:
                        break
        return True
    except Exception:
        if raise_exception is True:
            raise
        return False


def compute_file_md5(file: str, raise_exception: bool = True) -> str:
    """
    Returns the md5 checksum for the given file.
    """
    if not isinstance(file, str):
        return ""
    try:
        md5 = hashlib.md5()
        with open(file, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5.update(chunk)
        return md5.hexdigest()
    except Exception:
        if raise_exception is True:
            raise
        return ""
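
A hedged sanity check tying these helpers together (the file names are hypothetical and assumed to exist):

import shutil

shutil.copy("a.bin", "b.bin")  # copy of a hypothetical existing file
assert are_files_equal("a.bin", "b.bin")
assert compute_file_md5("a.bin") == compute_file_md5("b.bin")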


def compute_file_etag(file: str, raise_exception: bool = True) -> Optional[str]:
    """
    Returns the AWS S3 "etag" for the given file; this value is md5-like but
    not the same as a normal md5. We use this to check that a file in S3
    appears to be exactly the same file as a local file.
    """
    try:
        with io.open(file, "rb") as f:
            return _compute_file_etag(f)
    except Exception:
        if raise_exception is True:
            raise
        return None


def _compute_file_etag(f: io.BufferedReader) -> str:
    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
    MULTIPART_THRESHOLD = 8388608
    MULTIPART_CHUNKSIZE = 8388608
    # BUFFER_SIZE = 1048576
    # Verify some assumptions are correct
    # assert(MULTIPART_CHUNKSIZE >= MULTIPART_THRESHOLD)
    # assert((MULTIPART_THRESHOLD % BUFFER_SIZE) == 0)
    # assert((MULTIPART_CHUNKSIZE % BUFFER_SIZE) == 0)
    hash = hashlib.md5()
    read = 0
    chunks = None
    while True:
        # Read some from the stream; if we're at the end, stop reading.
        bits = f.read(1048576)
        if len(bits) == 0:
            break
        read += len(bits)
        hash.update(bits)
        if chunks is None:
            # We're handling a multi-part upload, so switch to calculating
            # hashes of each chunk.
            if read >= MULTIPART_THRESHOLD:
                chunks = b''
        if chunks is not None:
            if (read % MULTIPART_CHUNKSIZE) == 0:
                # Done with a chunk; add it to the list of hashes to hash later.
                chunks += hash.digest()
                hash = hashlib.md5()
    if chunks is None:
        # Normal upload, just output the MD5 hash.
        etag = hash.hexdigest()
    else:
        # Multipart upload, need to output the hash of the hashes.
        if (read % MULTIPART_CHUNKSIZE) != 0:
            # Add the last part if we have a partial chunk.
            chunks += hash.digest()
        etag = hashlib.md5(chunks).hexdigest() + "-" + str(len(chunks) // 16)
    return etag
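
For a hedged concreteness check of the multipart etag format: with the 8388608-byte (8 MiB) chunk size above, a 20 MiB stream spans two full chunks plus one partial chunk, so the computed etag ends in "-3":

import io
import os

blob = io.BytesIO(os.urandom(20 * 1024 * 1024))  # 20 MiB of random bytes
assert _compute_file_etag(blob).endswith("-3")   # hash-of-hashes plus chunk count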


def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
                       nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
    """
    Writes some random content to the given file (name/path). If the given file is None then writes
    to a temporary file. In either case, returns the file written to. The number of bytes written is
    1024 by default but can be specified with the nbytes argument; defaults to writing ASCII text but
    if the binary argument is True then writes binary data instead; if not binary the content is in
    lines of 80 characters each; use the line_length argument in this case to change the line length.
    """
    if not isinstance(nbytes, int) or nbytes < 0:
        nbytes = 0
    if not isinstance(file, str) or not file:
        if not isinstance(prefix, str):
            prefix = ""
        if not isinstance(suffix, str):
            suffix = ""
        file = f"{prefix}{datetime.utcnow().strftime('%Y%m%d%H%M%S')}{str(uuid()).replace('-', '')}{suffix}"
        file = os.path.join(get_temporary_directory(), file)
    with open(file, "wb" if binary is True else "w") as f:
        if binary is True:
            f.write(os.urandom(nbytes))
        else:
            if (not isinstance(line_length, int)) or (line_length < 1):
                line_length = 80
            line_length += 1
            nlines = nbytes // line_length
            nremainder = nbytes % line_length
            for n in range(nlines):
                f.write("".join(random.choices(string.ascii_letters + string.digits, k=line_length - 1)))
                f.write("\n")
            if nremainder > 1:
                f.write("".join(random.choices(string.ascii_letters + string.digits, k=nremainder - 1)))
            if nremainder > 0:
                f.write("\n")
    return file
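
A hedged usage example: write 2048 bytes of random text in 64-character lines to a temporary file (the arithmetic works out because each line contributes line_length characters plus a newline):

path = create_random_file(nbytes=2048, line_length=64)
print(path, get_file_size(path))  # the reported size should be 2048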
39 changes: 39 additions & 0 deletions dcicutils/http_utils.py
@@ -0,0 +1,39 @@
from contextlib import contextmanager
import requests
from typing import Callable, Optional
from dcicutils.tmpfile_utils import temporary_file


@contextmanager
def download(url: str, suffix: Optional[str] = None, binary: bool = True,
             progress: Optional[Callable] = None) -> Optional[str]:
    """
    Context manager to download the given URL into a temporary file and yield the
    file path to it. An optional file suffix may be specified for this temporary file name.
    Defaults to binary file mode; if not desired then pass False as the binary argument.
    """
    with temporary_file(suffix=suffix) as file:
        download_to(url, file, binary=binary, progress=progress)
        yield file


def download_to(url: str, file: str, binary: bool = True, progress: Optional[Callable] = None) -> None:
    """
    Downloads the given URL into the given file. Defaults to binary
    file mode; if not desired then pass False as the binary argument.
    """
    if not callable(progress):
        progress = None
    response = requests.get(url, stream=True)
    if progress:
        nbytes = 0
        nbytes_total = None
        if isinstance(content_length := response.headers.get("Content-Length"), str) and content_length.isdigit():
            nbytes_total = int(content_length)
    with open(file, "wb" if binary is True else "w") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                # In text mode the streamed chunks are bytes and must be decoded before writing.
                f.write(chunk if binary is True else chunk.decode())
                if progress:
                    nbytes += len(chunk)
                    progress(nbytes, nbytes_total)
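
A hedged usage sketch of the download context manager (the URL is hypothetical):

from typing import Optional
from dcicutils.http_utils import download

def report(nbytes: int, nbytes_total: Optional[int]) -> None:
    # Simple progress callback: bytes downloaded so far, and the total if known.
    print(f"downloaded {nbytes} of {nbytes_total or '?'} bytes")

with download("https://example.com/data.bin", suffix=".bin", progress=report) as file:
    print(file)  # temporary file path; the file is removed when the block exits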