Skip to content

Commit

Permalink
perf:reduce the number of requests to S3 during smart_sync (#438)
Browse files Browse the repository at this point in the history
  • Loading branch information
LoveEatCandy authored Dec 5, 2024
1 parent 47c2253 commit f9f066f
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ megfile - Megvii FILE library

## Quick Start

Path strings in `megfile` almost always take the form `protocol://path/to/file`, for example `s3://bucketA/key`. SFTP paths are slightly different: the format is `sftp://[username[:password]@]hostname[:port]//file_path`, and a relative path replaces `//file_path` with `/file_path`.
Path strings in `megfile` almost always take the form `protocol://path/to/file`, for example `s3://bucketA/key`. SFTP paths are slightly different: the format is `sftp://[username[:password]@]hostname[:port]//absolute_file_path`. For more details, see the [path format document](https://megvii-research.github.io/megfile/path_format.html).
Here's an example of writing a file to s3 / fs, syncing to local, reading and finally deleting it.

### Functional Interface
Expand Down
5 changes: 4 additions & 1 deletion megfile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,9 @@ def sync(
quiet: bool,
skip: bool,
):
if not smart_exists(dst_path):
force = True

with ThreadPoolExecutor(max_workers=worker) as executor:
if has_magic(src_path):
src_root_path = get_non_glob_dir(src_path)
Expand Down Expand Up @@ -411,7 +414,7 @@ def callback_after_copy_file(src_file_path, dst_file_path):
dict(
src_root_path=src_root_path,
dst_root_path=dst_path,
src_file_path=file_entry.path,
src_file_entry=file_entry,
callback=callback,
followlinks=True,
callback_after_copy_file=callback_after_copy_file,
Expand Down
31 changes: 17 additions & 14 deletions megfile/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,9 @@ def smart_copy(
def _smart_sync_single_file(items: dict):
src_root_path = items["src_root_path"]
dst_root_path = items["dst_root_path"]
src_file_path = items["src_file_path"]
src_file_entry = items["src_file_entry"]
src_file_path = src_file_entry.path
src_file_stat = src_file_entry.stat
callback = items["callback"]
followlinks = items["followlinks"]
callback_after_copy_file = items["callback_after_copy_file"]
Expand All @@ -417,17 +419,17 @@ def _smart_sync_single_file(items: dict):
dst_protocol, _ = SmartPath._extract_protocol(dst_abs_file_path)
should_sync = True
try:
if force:
pass
elif not overwrite and smart_exists(dst_abs_file_path):
should_sync = False
elif smart_exists(dst_abs_file_path) and is_same_file(
smart_stat(src_file_path, follow_symlinks=followlinks),
smart_stat(dst_abs_file_path, follow_symlinks=followlinks),
get_sync_type(src_protocol, dst_protocol),
):
should_sync = False
except NotImplementedError:
if not force:
dst_file_stat = smart_stat(dst_abs_file_path, follow_symlinks=followlinks)
if not overwrite:
should_sync = False
elif is_same_file(
src_file_stat,
dst_file_stat,
get_sync_type(src_protocol, dst_protocol),
):
should_sync = False
except (NotImplementedError, FileNotFoundError):
pass

if should_sync:
Expand Down Expand Up @@ -513,15 +515,16 @@ def smart_sync(
src_path, dst_path = get_traditional_path(src_path), get_traditional_path(dst_path)
if not src_file_stats:
src_file_stats = smart_scan_stat(src_path, followlinks=followlinks)
if not smart_exists(dst_path):
force = True

def create_generator():
for src_file_entry in src_file_stats:
if src_file_entry.name:
src_file_path = src_file_entry.path
yield dict(
src_root_path=src_path,
dst_root_path=dst_path,
src_file_path=src_file_path,
src_file_entry=src_file_entry,
callback=callback,
followlinks=followlinks,
callback_after_copy_file=callback_after_copy_file,
Expand Down

0 comments on commit f9f066f

Please sign in to comment.