From 0466e85f438c49144f6caa3eb29e0798f5675406 Mon Sep 17 00:00:00 2001
From: Graham Larue
Date: Sun, 15 Aug 2021 10:57:22 -0700
Subject: [PATCH] MD5 hash and file size checks, improve clean_exit() (#11)

* add sizeInBytes check if no md5 hash in xml

* exit with code 1 on download error

* change to defaultdict; use double-quotes; add filename instead of org_name

* rank order: md5 > sizeInBytes

* ' --> "

* Default file size to None if not present in XML

* ' --> "

* Fix MD5 and sizeInBytes logic within is_broken()

  - Messages to the user about broken/missing MD5/size information are only
    shown when downloading those files
  - MD5 match takes precedence over sizeInBytes (overrides a bad size)
  - sizeInBytes will be used if no MD5 is present, but won't override a bad MD5

* Streamline clean_exit()

  - Leave the choice to remove temp files to the user if interactive
  - Don't remove temp files if non-interactive and there are failed files

* Fix final clean_exit() call

Co-authored-by: orangeSi
---
 jgi-query.py | 208 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 128 insertions(+), 80 deletions(-)

diff --git a/jgi-query.py b/jgi-query.py
index ab816e6..dcc573e 100755
--- a/jgi-query.py
+++ b/jgi-query.py
@@ -263,16 +263,19 @@ def get_sizes(d, sizes_by_url=None):
     """
     for k, v in d.items():
         if isinstance(v, dict):
-            if 'url' in v:
-                address = v['url']
-                size = int(v['sizeInBytes'])
+            if "url" in v:
+                address = v["url"]
+                try:
+                    size = int(v["sizeInBytes"])
+                except (KeyError, TypeError, ValueError):
+                    size = None
                 sizes_by_url[address] = size
             else:
                 get_sizes(v, sizes_by_url)
     return sizes_by_url
 
 
-def clean_exit(exit_message=None, remove_temp=True):
+def clean_exit(exit_message=None, exit_code=0, remove_temp=True):
     """
     Perform a sys.exit() while removing temporary files and
     informing the user.
@@ -286,17 +289,19 @@ def clean_exit(exit_message=None, remove_temp=True):
     except NameError:
         pass
-    for f in to_remove:
-        try:
-            os.remove(f)
-        except OSError:
-            continue
-    if exit_message:
-        print_message = "{}\n".format(exit_message)
+    if remove_temp is True:
+        for f in to_remove:
+            try:
+                os.remove(f)
+            except OSError:
+                continue
+        base_message = "Removing temp files and exiting"
     else:
-        print_message = ""
+        base_message = "Keeping temp files and exiting"
+    if exit_message:
+        print(exit_message)
+    print(base_message)
 
-    print("{}Removing temp files and exiting".format(print_message))
-    sys.exit(0)
+    sys.exit(exit_code)
 
 
 def extract_file(file_path, keep_compressed=False):
@@ -306,8 +311,8 @@ def extract_file(file_path, keep_compressed=False):
     """
     Decompresses gzipped file, optionally removing compressed copy upon
     completion. TODO: implement .zip decompression
 
     """
-    tar_pattern = 'tar.gz$'  # matches tar.gz
-    gz_pattern = '(?<!tar)\.gz$'  # excludes tar.gz
+    tar_pattern = "tar.gz$"  # matches tar.gz
+    gz_pattern = "(?<!tar)\.gz$"  # excludes tar.gz

[... diff text lost in extraction: the tail of this hunk and the start of the
print_data() hunk, where url_to_validate (a defaultdict) records each URL's
XML-listed "md5" when one is present ...]

+            # remote sizes are recorded only as a fallback, per the
+            # md5 > sizeInBytes rank-order
+            # in downstream processing
+            elif "sizeInBytes" in i:
+                url_to_validate[url]["sizeInBytes"] = int(i["sizeInBytes"])
             print_index = " {}:[{}] ".format(str(catID), str(index))
             date = fmt_timestamp(i["timestamp"])
-            date_string = '{:02d}/{}'.format(date.tm_mon, date.tm_year)
+            date_string = "{:02d}/{}".format(date.tm_mon, date.tm_year)
             size_date = "[{}|{}]".format(i["size"], date_string)
             filename = i["filename"]
             margin = 80 - (len(size_date) + len(print_index))
@@ -411,7 +420,7 @@ def print_data(data, org_name, display=True):
     print('\n'.join(print_list))
     print()  # padding
 
-    return dict_to_get, url_to_md5
+    return dict_to_get, url_to_validate
 
 
 def get_user_choice():
@@ -572,7 +581,7 @@ def byte_convert(byte_size):
 
     return size_string
 
 
-def is_broken(filename, min_size_bytes=20, md5_hash=None):
+def is_broken(filename, min_size_bytes=20, md5_hash=None, sizeInBytes=None):
     """
     Rudimentary check to see if a file appears to be broken.
@@ -580,8 +589,9 @@ def is_broken(filename, min_size_bytes=20, md5_hash=None):
     if (
         not os.path.isfile(filename)
         or os.path.getsize(filename) < min_size_bytes
-        or (is_xml(filename) and not filename.lower().endswith('xml') or
-            not check_md5(filename, md5_hash))
+        or (is_xml(filename) and not filename.lower().endswith("xml")) or
+        ((not check_md5(filename, md5_hash)) or
+         (not check_sizeInBytes(filename, sizeInBytes)))
     ):
         return True
     else:
@@ -591,7 +601,7 @@ def is_broken(filename, min_size_bytes=20, md5_hash=None):
 def get_md5(*fns, buffer_size=65536):
     hash = md5()
     for fn in fns:
-        with open(fn, 'rb') as f:
+        with open(fn, "rb") as f:
             while True:
                 data = f.read(buffer_size)
                 if not data:
@@ -600,6 +610,13 @@ def get_md5(*fns, buffer_size=65536):
         hash.update(data)
     return hash.hexdigest()
 
+def get_sizeInBytes(filename):
+    try:
+        file_sizeInBytes = os.path.getsize(filename)
+    except OSError:
+        file_sizeInBytes = 0
+
+    return file_sizeInBytes
 
 def check_md5(filename, md5_hash, print_message=True):
     if not md5_hash:
@@ -621,8 +638,28 @@ def check_md5(filename, md5_hash, print_message=True):
     return ret_val
 
 
+def check_sizeInBytes(filename, sizeInBytes, print_message=True):
+    if not sizeInBytes:
+        message = "INFO: No sizeInBytes listed for {}; skipping check".format(filename)
+        ret_val = True
+    else:
+        file_sizeInBytes = get_sizeInBytes(filename)
+        if file_sizeInBytes == sizeInBytes:
+            message = (
+                "SUCCESS: sizeInBytes match for {} ({})".format(filename, sizeInBytes))
+            ret_val = True
+        else:
+            message = ("ERROR: sizeInBytes mismatch for {} (local: {}, remote: {})"
+                       .format(filename, file_sizeInBytes, sizeInBytes))
+            ret_val = False
+
+    if print_message is True:
+        print(message)
+
+    return ret_val
+
-def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_md5={}):
+def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_validate={}):
     """
     Attempts to download a file from JGI servers using cURL.
 
@@ -630,9 +667,10 @@ def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_md5={
 
     """
     success = True
-    md5_hash = url_to_md5.get(url, None)
+    md5_hash = url_to_validate.get(url, {}).get("md5", None)
+    sizeInBytes = url_to_validate.get(url, {}).get("sizeInBytes", None)
 
-    url = url.replace('&amp;', '&')
+    url = url.replace("&amp;", "&")
     filename = re.search('.+/(.+$)', url).group(1)
 
     url_prefix = "https://genome.jgi.doe.gov"
@@ -640,7 +678,7 @@ def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_md5={
         "curl -m {} '{}{}' -b cookies "
         "> {}".format(timeout, url_prefix, url, filename)
     )
-    if not is_broken(filename, md5_hash=md5_hash):
+    if not is_broken(filename, md5_hash=md5_hash, sizeInBytes=sizeInBytes):
         success = True
         print("Skipping existing file {}".format(filename))
     else:
@@ -649,13 +687,15 @@ def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_md5={
         # The next line doesn't appear to be needed to refresh the cookies.
         # subprocess.call(login, shell=True)
         status = subprocess.run(download_command, shell=True).returncode
-        if status != 0 or is_broken(filename, min_file_bytes, md5_hash=md5_hash):
+        if status != 0 or is_broken(
+            filename, min_file_bytes, md5_hash=md5_hash, sizeInBytes=sizeInBytes
+        ):
             success = False
 
     if retry > 0:
         # success = False  # this may be needed if initial download fails
         alt_cmd = download_command.replace(
-            'blocking=true', 'blocking=false')
+            "blocking=true", "blocking=false")
         current_retry = 1
         while current_retry <= retry:
             if current_retry % 2 == 1:
@@ -668,7 +708,7 @@ def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_md5={
             )
             status = subprocess.run(retry_cmd, shell=True).returncode
             if status == 0 and not is_broken(
-                filename, min_file_bytes, md5_hash=md5_hash
+                filename, min_file_bytes, md5_hash=md5_hash, sizeInBytes=sizeInBytes
             ):
                 success = True
                 break
@@ -687,12 +727,12 @@ def get_regex():
 
     # manage to get a working regex
     compile_success = False
     while compile_success is False:
-        pattern = input('Regex pattern: ')
+        pattern = input("Regex pattern: ")
         try:
             pattern = re.compile(pattern)
             compile_success = True
         except:
-            print('[!] ERROR: Regex pattern failed to compile.')
+            print("[!] ERROR: Regex pattern failed to compile.")
 
     return re.compile(pattern)
 
@@ -712,7 +752,7 @@ def retry_from_failed(login_cmd, fail_log, timeout=120, retries=3):
                    "connection and retry.")
 
     downloaded, failed = download_list(url_list)
-    print('Finished downloading {} files'.format(len(downloaded)))
+    print("Finished downloading {} files".format(len(downloaded)))
 
     if failed:
         log_failed(organism, failed)
@@ -724,15 +764,15 @@ def log_failed(organism, failed_urls):
     Write failed URLs to a local log file.
 
     """
-    fail_log = '{}.failed.log'.format(organism)
+    fail_log = "{}.failed.log".format(organism)
     print(
-        '{} failed downloads logged to {}'.format(len(failed_urls), fail_log))
+        "{} failed downloads logged to {}".format(len(failed_urls), fail_log))
     # write failed URLs to local file
     with open(fail_log, 'w') as f:
         f.write('\n'.join(failed_urls))
 
 
-def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
+def download_list(url_list, url_to_validate={}, timeout=120, retries=3):
     """
     Attempts download command on a list of partial file URLs
     (completed by download_from_url()).
@@ -755,7 +795,7 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
             subprocess.run(LOGIN_STRING, shell=True)
             start_time = time.time()
         fn, cmd, success = download_from_url(
-            url, timeout=timeout, retry=retries, url_to_md5=url_to_md5)
+            url, timeout=timeout, retry=retries, url_to_validate=url_to_validate)
         if not success:
             broken_urls.append(url)
         else:
@@ -889,19 +929,19 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
         help=("number of times to retry downloading files with "
               "errors (0 to skip such files)"))
     parser.add_argument(
-        "-l", "--load_failed", type=str, metavar='logfile',
+        "-l", "--load_failed", type=str, metavar="logfile",
         help="retry downloading from URLs listed in log file")
     parser.add_argument(
         "-r",
         "--regex",
         type=re.compile,  # convert to regex object
-        help='Regex pattern to use to auto-select and download '
-        'files (no interactive prompt)')
+        help="Regex pattern to use to auto-select and download "
+        "files (no interactive prompt)")
     parser.add_argument(
         "-a",
         "--all",
         action="store_true",
-        help='Auto-select and download all files for query (no interactive prompt)'
+        help="Auto-select and download all files for query (no interactive prompt)"
     )
 
 # /ARG PARSER
@@ -936,15 +976,15 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 CONFIG_FILEPATH = SCRIPT_HOME + "/{}".format(CONFIG_FILENAME)
 
 # Categories to store in default config file
-DEFAULT_CATEGORIES = ['ESTs',
-                      'EST Clusters',
-                      'Assembled scaffolds (unmasked)',
-                      'Assembled scaffolds (masked)',
-                      'Transcripts',
-                      'Genes',
-                      'CDS',
-                      'Proteins',
-                      'Additional Files']
+DEFAULT_CATEGORIES = ["ESTs",
+                      "EST Clusters",
+                      "Assembled scaffolds (unmasked)",
+                      "Assembled scaffolds (masked)",
+                      "Transcripts",
+                      "Genes",
+                      "CDS",
+                      "Proteins",
+                      "Additional Files"]
 
 # Does config file exist?
 if os.path.isfile(CONFIG_FILEPATH) and not args.configure:  # use config file
@@ -983,7 +1023,7 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 if args.load_failed:
     logfile = args.load_failed
     print("Reading URLs from \'{}\'".format(logfile))
-    retry_from_failed(LOGIN_STRING, logfile)
+    downloaded, failed = retry_from_failed(LOGIN_STRING, logfile)
     clean_exit("All files in log attempted.")
 
 # Get organism name for query
@@ -1020,8 +1060,8 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
     else:
         xml_index_filename = xml_arg
         print(
-            'Retrieving information from JGI for query '
-            '\'{}\' using local file \'{}\'\n'.format(organism, xml_index_filename))
+            "Retrieving information from JGI for query "
+            "'{}' using local file '{}'\n".format(organism, xml_index_filename))
 else:  # fetch XML file from JGI
     xml_index_filename = "{}_jgi_index.xml".format(organism)
@@ -1038,8 +1078,8 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 
         clean_exit("Couldn't connect with server. Please check Internet "
                   "connection and retry.")
     print(
-        'Retrieving information from JGI for query \'{}\' using command '
-        '\'{}\'\n'.format(organism, xml_address))
+        "Retrieving information from JGI for query '{}' using command "
+        "'{}'\n".format(organism, xml_address))
     subprocess.run(xml_address, shell=True)
     print()  # padding
@@ -1081,14 +1121,14 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 user_choice = None
 display_info = True
 if GET_ALL:
-    user_choice = 'a'
+    user_choice = "a"
     display_info = False
 elif DIRECT_REGEX:
-    user_choice = 'r'
+    user_choice = "r"
     regex_filter = DIRECT_REGEX
     display_info = False
 
-url_dict, url_to_md5 = print_data(file_list, organism, display=display_info)
+url_dict, url_to_validate = print_data(file_list, organism, display=display_info)
 
 if not user_choice:
     # Ask user which files to download from xml
@@ -1100,11 +1140,11 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 
 # special case for downloading all available files
 # or filtering with a regular expression
-if user_choice in ('a', 'r'):
+if user_choice in ("a", "r"):
     for k, v in sorted(url_dict.items()):
         for u in v.values():
             if regex_filter:
-                fn = re.search('.+/([^\/]+$)', u).group(1)
+                fn = re.search(".+/([^\/]+$)", u).group(1)
                 match = regex_filter.search(fn)
                 if not match:
                     continue
@@ -1119,9 +1159,9 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
 
 # Calculate and display total size of selected data
 urls_to_get = sorted(urls_to_get)
-filenames = [u.split('/')[-1] for u in urls_to_get]
+filenames = [u.split("/")[-1] for u in urls_to_get]
 file_sizes = get_sizes(file_list, sizes_by_url={})
-total_size = sum([file_sizes[url] for url in urls_to_get])
+total_size = sum(filter(None, [file_sizes[url] for url in urls_to_get]))
 size_string = byte_convert(total_size)
 num_files = len(urls_to_get)
 print(("Total download size for {} files: {}".format(num_files, size_string)))
@@ -1135,7 +1175,7 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
     clean_exit("ABORTING DOWNLOAD")
 
 downloaded_files, failed_urls = download_list(
-    urls_to_get, url_to_md5=url_to_md5, retries=args.retry_n)
+    urls_to_get, url_to_validate=url_to_validate, retries=args.retry_n)
 
 print("Finished downloading {} files.".format(len(downloaded_files)))
 
@@ -1143,12 +1183,9 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
     n_broken = len(failed_urls)
     retry_broken = input(
         "{} files failed to download; retry them? (y/n): ".format(n_broken))
-    if retry_broken.lower() in ('yes', 'y'):
+    if retry_broken.lower() in ("yes", "y"):
         downloaded_files, failed_urls = download_list(
-            failed_urls, url_to_md5=url_to_md5, retries=1)
-
-if failed_urls:
-    log_failed(organism, failed_urls)
+            failed_urls, url_to_validate=url_to_validate, retries=1)
 
 # Kindly offer to unpack files, if files remain after error check
 if downloaded_files and INTERACTIVE:
@@ -1160,18 +1197,29 @@ def download_list(url_list, url_to_md5={}, timeout=120, retries=3):
     else:
         keep_original = False
     decompress_files(downloaded_files, keep_original)
-    print('Finished decompressing all files.')
+    print("Finished decompressing all files.")
+
+# TODO: either offer to delete or append ".error" to local broken files
+if failed_urls:
+    log_failed(organism, failed_urls)
+    SOME_FAILED = True
+else:
+    SOME_FAILED = False
 
 # Clean up and exit
 # "cookies" file is always created
+exit_message = None
+remove_temp = True
 if INTERACTIVE:
     keep_temp = input("Keep temporary files ('{}' and 'cookies')? (y/n): "
                      .format(xml_index_filename))
-    if keep_temp.lower() not in "y, yes":
-        clean_exit()
-    else:
-        print("Leaving temporary files intact and exiting.")
-else:
-    clean_exit()
+    if keep_temp.lower() in ("y", "yes"):
+        remove_temp = False
+elif SOME_FAILED:  # failed files in non-interactive mode
+    exit_message = (
+        "Some files failed to download")
+    remove_temp = False
+
+exit_code = 1 if SOME_FAILED else 0
 
-sys.exit(0)
+clean_exit(exit_message=exit_message, exit_code=exit_code, remove_temp=remove_temp)
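
A note on the validation flow, since it is spread across print_data(),
check_md5(), check_sizeInBytes(), and is_broken(): below is a minimal,
self-contained sketch of the md5 > sizeInBytes rank-order this patch
implements. The names validate_download(), file_md5(), and the sample
records are hypothetical, invented for this note; they are not code from
jgi-query.py. The precedence logic mirrors the patch: an XML-listed md5 is
authoritative, sizeInBytes is a fallback used only when no hash is listed,
and a matching size never rescues a bad hash.

    # Hypothetical sketch of the rank-order; not part of jgi-query.py
    import os
    from hashlib import md5

    def file_md5(filename, buffer_size=65536):
        # stream in chunks so large downloads aren't read into memory at once
        hasher = md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(buffer_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    def validate_download(filename, record):
        # record mimics one url_to_validate entry built by print_data()
        if not os.path.isfile(filename):
            return False
        if "md5" in record:
            # a listed hash is authoritative; size is not consulted
            return file_md5(filename) == record["md5"]
        if "sizeInBytes" in record:
            # size is checked only as a fallback when no hash was listed
            return os.path.getsize(filename) == record["sizeInBytes"]
        # nothing listed to check against; accept the file
        return True

    if __name__ == "__main__":
        with open("example.bin", "wb") as f:
            f.write(b"JGI test payload\n")
        good_size = {"sizeInBytes": os.path.getsize("example.bin")}
        print(validate_download("example.bin", good_size))  # True
        bad_hash = {"md5": "0" * 32,
                    "sizeInBytes": os.path.getsize("example.bin")}
        print(validate_download("example.bin", bad_hash))   # False

Storing sizeInBytes only in the elif branch of print_data() is what makes
this precedence hold inside the script itself: a URL with a listed md5 never
carries a size, so check_sizeInBytes() auto-passes and a stale size cannot
mask a hash mismatch.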
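The exit-code change also matters to callers: failed downloads in
non-interactive mode now exit with code 1 and leave the temp files and the
<organism>.failed.log behind. Here is a hedged sketch of a batch wrapper
built on that contract; the invocation style and the organism abbreviation
are assumptions for illustration, while the -a and -l flags and the
failed-log name come from the patch itself.

    # Hypothetical batch wrapper; assumes jgi-query.py is in the working dir
    import subprocess

    organism = "Nemve1"  # illustrative organism abbreviation
    first_pass = subprocess.run(["python", "jgi-query.py", organism, "-a"])
    if first_pass.returncode != 0:
        # non-interactive failures now return nonzero and keep the log,
        # so a second pass can retry just the broken URLs
        subprocess.run(
            ["python", "jgi-query.py", "-l", "{}.failed.log".format(organism)])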