Improved synchronization of cache with remote in retrieve* functions.
- Delete files that are no longer present in the remote directory.
- Always perform HEAD requests to check for updates to individual files.
  This allows us to simplify the code by removing the updateDelay
  for files, which was incorrect anyway as the delay was being computed
  from the file's modification time rather than its last HEAD check.
- Wrap the HEAD request in a try() to ensure we can still use the cache if the
  request fails, e.g., due to lack of connectivity (see the sketch below).
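
The fallback in the last bullet reduces to the pattern below: a minimal httr2 sketch, where remote_is_newer() is a hypothetical helper (not part of SewerRat) that approximates the logic now living in acquire_file_raw():

library(httr2)

# Hypothetical helper approximating the new acquire_file_raw() logic: returns
# TRUE if the remote copy at 'file_url' is newer than the cached file at
# 'cached_path', and FALSE if the HEAD request fails (e.g., no connectivity),
# so the caller can quietly keep using the cache.
remote_is_newer <- function(file_url, cached_path) {
    req <- request(file_url)
    req <- req_method(req, "HEAD")
    res <- try(req_perform(req), silent=TRUE)
    if (inherits(res, "try-error")) {
        return(FALSE) # fall back to the cache when the remote is unreachable.
    }
    stamp <- resp_header(res, "last-modified")
    if (is.null(stamp)) {
        return(FALSE)
    }
    # Parsing the HTTP date; this assumes an English locale for %a and %b.
    remote_mod <- as.POSIXct(stamp, format="%a, %d %b %Y %H:%M:%S", tz="GMT")
    isTRUE(remote_mod > file.info(cached_path)$mtime)
}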
LTLA committed Oct 31, 2024
1 parent ca389ce commit 163d05d
Showing 7 changed files with 86 additions and 40 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SewerRat
-Version: 0.2.11
-Date: 2024-10-27
+Version: 0.2.12
+Date: 2024-10-31
 Title: Client for the SewerRat API
 Description:
     Search metadata files across a shared filesystem via the SewerRat API.
4 changes: 3 additions & 1 deletion R/listFiles.R
@@ -38,11 +38,13 @@
 #' @import httr2
 listFiles <- function(path, url, forceRemote=FALSE) {
     if (!forceRemote && file.exists(path)) {
-        list.files(path, recursive=TRUE, all.files=TRUE)
+        .quick_list(path)
     } else {
         req <- request(paste0(url, "/list?path=", URLencode(path, reserved=TRUE), "&recursive=true"))
         req <- handle_error(req)
         res <- req_perform(req)
         unlist(resp_body_json(res))
     }
 }
+
+.quick_list <- function(path) list.files(path, recursive=TRUE, all.files=TRUE)
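
For example, assuming a running SewerRat instance from startSewerRat() and a directory mydir that has been registered with it (as in the package tests):

info <- startSewerRat()
listFiles(mydir, url=info$url)                   # local listing if 'mydir' is accessible
listFiles(mydir, url=info$url, forceRemote=TRUE) # always queries the /list endpoint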
51 changes: 31 additions & 20 deletions R/retrieveDirectory.R
@@ -15,14 +15,23 @@
 #' Only used for remote access.
 #' @param concurrent Integer scalar specifying the number of concurrent downloads.
 #' Only used for remote access.
-#' @param updateDelay Integer scalar specifying the maximum age of a cached file, in seconds.
-#' Older files will be automatically checked for updates.
+#' @param updateDelay Integer scalar specifying the interval before checking for updates in a cached directory, in seconds.
 #' Only used for remote access.
 #'
 #' @return Path to the subdirectory on the caller's filesystem.
 #' This is either a path to the registered (sub)directory if it is accessible,
 #' or a path to a local cache of the directory's contents otherwise.
 #'
+#' @details
+#' During remote access, this function exhibits the following behavior:
+#' \itemize{
+#' \item It will only check for updates to the directory contents after \code{updateDelay} seconds have passed since the previous check.
+#' This avoids unnecessarily frequent requests to the SewerRat API.
+#' \item If a file in \code{path} has already been locally cached, \code{retrieveDirectory} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
+#' \item Any cached files that are no longer in the remote \code{path} will be deleted from the cache.
+#' }
+#'
 #' @author Aaron Lun
 #'
 #' @examples
@@ -45,7 +54,7 @@
 #'
 #' @export
 #' @import httr2
-retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) {
+retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) {
     if (!forceRemote && file.exists(path)) {
         return(path)
     }
@@ -67,12 +76,16 @@ retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrit
     res <- req_perform(req)
     listing <- resp_body_json(res)
 
+    # Removing files that no longer exist (resolved against the cache root).
+    existing <- .quick_list(final)
+    unlink(file.path(final, setdiff(existing, listing)))
+
     if (concurrent == 1L) {
-        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     } else {
         cl <- parallel::makeCluster(concurrent)
         on.exit(parallel::stopCluster(cl), add=TRUE, after=FALSE)
-        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     }
 
     # We use a directory-level OK file to avoid having to scan through all
@@ -82,24 +95,22 @@ retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrit
     final
 }
 
-#' @importFrom utils URLencode
-full_file_url <- function(url, path) {
-    paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
-}
-
 #' @import httr2
-acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
+#' @importFrom utils URLencode
+acquire_file_raw <- function(cache, path, url, overwrite) {
     target <- file.path(cache, "LOCAL", path)
+    url <- paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
 
     if (!file.exists(target)) {
         overwrite <- TRUE
     } else if (!overwrite) {
-        last_mod <- file.info(target)$mtime
-        if (last_mod + updateDelay < Sys.time()) { # only check older files for updates, to avoid excessive queries.
-            req <- request(full_file_url(url, path))
-            req <- req_method(req, "HEAD")
-            req <- handle_error(req)
-            res <- req_perform(req)
+        req <- request(url)
+        req <- req_method(req, "HEAD")
+        req <- handle_error(req)
+        res <- try(req_perform(req), silent=TRUE)
+
+        if (!is(res, "try-error")) { # don't fail if the HEAD didn't work, e.g., no internet but we already have a cached file.
+            last_mod <- file.info(target)$mtime
             remote_mod <- parse_remote_last_modified(res)
             if (!is.null(remote_mod) && remote_mod > last_mod) {
                 overwrite <- TRUE
@@ -113,16 +124,16 @@ acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
         tempf <- tempfile(tmpdir=tempdir)
         on.exit(unlink(tempf), add=TRUE, after=FALSE)
 
-        download_file(full_file_url(url, path), tempf)
+        download_file(url, tempf)
         dir.create(dirname(target), recursive=TRUE, showWarnings=FALSE)
         file.rename(tempf, target) # this should be more or less atomic, so no need for locks.
     }
 
     target
 }
 
-acquire_file <- function(cache, path, name, url, overwrite, updateDelay) {
-    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite, updateDelay)
+acquire_file <- function(cache, path, name, url, overwrite) {
+    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite)
 }
 
 #' @importFrom utils URLencode
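
The new synchronization is transparent to callers. A usage sketch mirroring the package tests, assuming a registered mydir and an info object from startSewerRat():

cache <- tempfile()
dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE)

# After remote changes, calling again with updateDelay=0 forces an immediate
# re-listing: updated files are re-downloaded and stale ones are removed.
dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)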
8 changes: 6 additions & 2 deletions R/retrieveFile.R
@@ -10,6 +10,10 @@
 #' @inheritParams retrieveDirectory
 #'
 #' @author Aaron Lun
+#'
+#' @details
+#' During remote access, if a file in \code{path} has already been locally cached, \code{retrieveFile} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
 #'
 #' @return String containing the path to the file on the caller's filesystem.
 #'
@@ -32,11 +36,11 @@
 #' retrieveFile(paste0(mydir, "/diet/bar"), url=info$url, forceRemote=TRUE)
 #'
 #' @export
-retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, updateDelay=3600) {
+retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE) {
     if (!forceRemote && file.exists(path)) {
         path
     } else {
         cache <- local_root(cache, url)
-        acquire_file_raw(cache, path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        acquire_file_raw(cache, path, url=url, overwrite=overwrite)
     }
 }
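
As with retrieveDirectory(), a cached file is now revalidated with a HEAD request on every call rather than after a delay. A sketch reusing the setup from the roxygen example above:

# Returns the cached copy immediately if the remote is unchanged or unreachable,
# and re-downloads only when the remote reports a newer last-modified time.
retrieveFile(paste0(mydir, "/diet/bar"), url=info$url, forceRemote=TRUE)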
13 changes: 11 additions & 2 deletions man/retrieveDirectory.Rd

Some generated files are not rendered by default.

17 changes: 5 additions & 12 deletions man/retrieveFile.Rd

Some generated files are not rendered by default.

29 changes: 28 additions & 1 deletion tests/testthat/test-retrieve.R
@@ -2,7 +2,7 @@
 # library(testthat); library(SewerRat); source("test-retrieve.R")
 
 info <- startSewerRat()
 
 mydir <- tempfile()
 dir.create(mydir)
 write(file=file.path(mydir, "metadata.json"), '{ "first": "Aaron", "last": "Lun" }')
@@ -80,4 +80,31 @@ test_that("retrieveDirectory works as expected", {
     expect_identical(jsonlite::fromJSON(file.path(rdir2, "diet", "metadata.json"))$meal, "lunch")
 })
 
+test_that("retrieveDirectory works with remote updates", {
+    mydir2 <- tempfile()
+    dir.create(mydir2)
+    write(file=file.path(mydir2, "metadata.json"), '{ "first": "Kanon", "last": "Shibuya" }')
+    dir.create(file.path(mydir2, "2"))
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Kinako", "last": "Sakurakouji" }')
+    dir.create(file.path(mydir2, "3"))
+    write(file=file.path(mydir2, "3", "metadata.json"), '{ "first": "Margarete", "last": "Wien" }')
+
+    register(mydir2, "metadata.json", url=info$url)
+    on.exit(deregister(mydir2, url=info$url))
+
+    cache <- tempfile()
+    dir <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Kinako")
+    expect_true(file.exists(file.path(dir, "3", "metadata.json")))
+
+    # Checking if it responds correctly to remote updates; note that we reuse
+    # the same cache, so this exercises the synchronization logic directly.
+    unlink(file.path(mydir2, "3"), recursive=TRUE)
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Mei", "last": "Yoneme" }')
+
+    dir <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Mei")
+    expect_false(file.exists(file.path(dir, "3", "metadata.json")))
+})
+
 deregister(mydir, url=info$url)
