From f9f066f3851a9f42f3770d0b8e3c96f3abc2e99f Mon Sep 17 00:00:00 2001 From: Hongyang Peng Date: Thu, 5 Dec 2024 16:20:02 +0800 Subject: [PATCH] perf:reduce the number of requests to S3 during smart_sync (#438) --- README.md | 2 +- megfile/cli.py | 5 ++++- megfile/smart.py | 31 +++++++++++++++++-------------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 322f0acb..459c8f92 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ megfile - Megvii FILE library ## Quick Start -Path string in `megfile` almost is `protocol://path/to/file`, for example `s3://bucketA/key`. But sftp path is a little different, format is `sftp://[username[:password]@]hostname[:port]//file_path`, and relative path is replace `//file_path` to `/file_path`. +Path string in `megfile` almost is `protocol://path/to/file`, for example `s3://bucketA/key`. But sftp path is a little different, format is `sftp://[username[:password]@]hostname[:port]//absolute_file_path`. More details see [path format document](https://megvii-research.github.io/megfile/path_format.html). Here's an example of writing a file to s3 / fs, syncing to local, reading and finally deleting it. ### Functional Interface diff --git a/megfile/cli.py b/megfile/cli.py index 24db8471..3f24b14a 100644 --- a/megfile/cli.py +++ b/megfile/cli.py @@ -349,6 +349,9 @@ def sync( quiet: bool, skip: bool, ): + if not smart_exists(dst_path): + force = True + with ThreadPoolExecutor(max_workers=worker) as executor: if has_magic(src_path): src_root_path = get_non_glob_dir(src_path) @@ -411,7 +414,7 @@ def callback_after_copy_file(src_file_path, dst_file_path): dict( src_root_path=src_root_path, dst_root_path=dst_path, - src_file_path=file_entry.path, + src_file_entry=file_entry, callback=callback, followlinks=True, callback_after_copy_file=callback_after_copy_file, diff --git a/megfile/smart.py b/megfile/smart.py index 7ad3995f..9b08f91d 100644 --- a/megfile/smart.py +++ b/megfile/smart.py @@ -397,7 +397,9 @@ def smart_copy( def _smart_sync_single_file(items: dict): src_root_path = items["src_root_path"] dst_root_path = items["dst_root_path"] - src_file_path = items["src_file_path"] + src_file_entry = items["src_file_entry"] + src_file_path = src_file_entry.path + src_file_stat = src_file_entry.stat callback = items["callback"] followlinks = items["followlinks"] callback_after_copy_file = items["callback_after_copy_file"] @@ -417,17 +419,17 @@ def _smart_sync_single_file(items: dict): dst_protocol, _ = SmartPath._extract_protocol(dst_abs_file_path) should_sync = True try: - if force: - pass - elif not overwrite and smart_exists(dst_abs_file_path): - should_sync = False - elif smart_exists(dst_abs_file_path) and is_same_file( - smart_stat(src_file_path, follow_symlinks=followlinks), - smart_stat(dst_abs_file_path, follow_symlinks=followlinks), - get_sync_type(src_protocol, dst_protocol), - ): - should_sync = False - except NotImplementedError: + if not force: + dst_file_stat = smart_stat(dst_abs_file_path, follow_symlinks=followlinks) + if not overwrite: + should_sync = False + elif is_same_file( + src_file_stat, + dst_file_stat, + get_sync_type(src_protocol, dst_protocol), + ): + should_sync = False + except (NotImplementedError, FileNotFoundError): pass if should_sync: @@ -513,15 +515,16 @@ def smart_sync( src_path, dst_path = get_traditional_path(src_path), get_traditional_path(dst_path) if not src_file_stats: src_file_stats = smart_scan_stat(src_path, followlinks=followlinks) + if not smart_exists(dst_path): + force = True def create_generator(): for src_file_entry in src_file_stats: if src_file_entry.name: - src_file_path = src_file_entry.path yield dict( src_root_path=src_path, dst_root_path=dst_path, - src_file_path=src_file_path, + src_file_entry=src_file_entry, callback=callback, followlinks=followlinks, callback_after_copy_file=callback_after_copy_file,