diff --git a/DESCRIPTION b/DESCRIPTION
index 50b118f..b0115ea 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SewerRat
-Version: 0.2.11
-Date: 2024-10-27
+Version: 0.2.12
+Date: 2024-10-31
 Title: Client for the SewerRat API
 Description: Search metadata files across a shared filesystem via the SewerRat API.
diff --git a/R/listFiles.R b/R/listFiles.R
index a015de8..79bdc73 100644
--- a/R/listFiles.R
+++ b/R/listFiles.R
@@ -38,7 +38,7 @@
 #' @import httr2
 listFiles <- function(path, url, forceRemote=FALSE) {
     if (!forceRemote && file.exists(path)) {
-        list.files(path, recursive=TRUE, all.files=TRUE)
+        .quick_list(path)
     } else {
         req <- request(paste0(url, "/list?path=", URLencode(path, reserved=TRUE), "&recursive=true"))
         req <- handle_error(req)
@@ -46,3 +46,5 @@ listFiles <- function(path, url, forceRemote=FALSE) {
         unlist(resp_body_json(res))
     }
 }
+
+.quick_list <- function(path) list.files(path, recursive=TRUE, all.files=TRUE)
diff --git a/R/retrieveDirectory.R b/R/retrieveDirectory.R
index 7714ae3..129f084 100644
--- a/R/retrieveDirectory.R
+++ b/R/retrieveDirectory.R
@@ -15,14 +15,23 @@
 #' Only used for remote access.
 #' @param concurrent Integer scalar specifying the number of concurrent downloads.
 #' Only used for remote access.
-#' @param updateDelay Integer scalar specifying the maximum age of a cached file, in seconds.
-#' Older files will be automatically checked for updates.
+#' @param updateDelay Integer scalar specifying the minimum interval between checks for updates to a cached directory, in seconds.
 #' Only used for remote access.
 #'
 #' @return Path to the subdirectory on the caller's filesystem.
 #' This is either a path to the registered (sub)directory if it is accessible,
 #' or a path to a local cache of the directory's contents otherwise.
 #'
+#' @details
+#' During remote access, this function exhibits the following behavior:
+#' \itemize{
+#' \item It will only check for updates to the directory contents after \code{updateDelay} seconds have passed since the previous check.
+#' This avoids unnecessarily frequent requests to the SewerRat API.
+#' \item If a file in \code{path} has already been locally cached, \code{retrieveDirectory} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
+#' \item Any cached files that are no longer in the remote \code{path} will be deleted from the cache.
+#' }
+#'
 #' @author Aaron Lun
 #'
 #' @examples
@@ -45,7 +54,7 @@
 #'
 #' @export
 #' @import httr2
-retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) {
+retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) {
     if (!forceRemote && file.exists(path)) {
         return(path)
     }
@@ -67,12 +76,16 @@ retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrit
     res <- req_perform(req)
     listing <- resp_body_json(res)
 
+    # Removing files that no longer exist.
+    existing <- .quick_list(final)
+    unlink(file.path(final, setdiff(existing, listing)))
+
     if (concurrent == 1L) {
-        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     } else {
         cl <- parallel::makeCluster(concurrent)
         on.exit(parallel::stopCluster(cl), add=TRUE, after=FALSE)
-        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     }
 
     # We use a directory-level OK file to avoid having to scan through all
@@ -82,24 +95,22 @@ retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrit
     final
 }
 
-#' @importFrom utils URLencode
-full_file_url <- function(url, path) {
-    paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
-}
-
 #' @import httr2
-acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
+#' @importFrom utils URLencode
+acquire_file_raw <- function(cache, path, url, overwrite) {
     target <- file.path(cache, "LOCAL", path)
+    url <- paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
 
     if (!file.exists(target)) {
         overwrite <- TRUE
     } else if (!overwrite) {
-        last_mod <- file.info(target)$mtime
-        if (last_mod + updateDelay < Sys.time()) { # only check older files for updates, to avoid excessive queries.
-            req <- request(full_file_url(url, path))
-            req <- req_method(req, "HEAD")
-            req <- handle_error(req)
-            res <- req_perform(req)
+        req <- request(url)
+        req <- req_method(req, "HEAD")
+        req <- handle_error(req)
+        res <- try(req_perform(req), silent=TRUE)
+
+        if (!is(res, "try-error")) { # don't fail if the HEAD didn't work, e.g., no internet but we already have a cached file.
+            last_mod <- file.info(target)$mtime
             remote_mod <- parse_remote_last_modified(res)
             if (!is.null(remote_mod) && remote_mod > last_mod) {
                 overwrite <- TRUE
@@ -113,7 +124,7 @@ acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
         tempf <- tempfile(tmpdir=tempdir)
         on.exit(unlink(tempf), add=TRUE, after=FALSE)
 
-        download_file(full_file_url(url, path), tempf)
+        download_file(url, tempf)
         dir.create(dirname(target), recursive=TRUE, showWarnings=FALSE)
         file.rename(tempf, target) # this should be more or less atomic, so no need for locks.
     }
@@ -121,8 +132,8 @@ acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
     target
 }
 
-acquire_file <- function(cache, path, name, url, overwrite, updateDelay) {
-    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite, updateDelay)
+acquire_file <- function(cache, path, name, url, overwrite) {
+    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite)
 }
 
 #' @importFrom utils URLencode
diff --git a/R/retrieveFile.R b/R/retrieveFile.R
index 9e046aa..4f44568 100644
--- a/R/retrieveFile.R
+++ b/R/retrieveFile.R
@@ -10,6 +10,10 @@
 #' @inheritParams retrieveDirectory
 #'
 #' @author Aaron Lun
+#'
+#' @details
+#' During remote access, if a file in \code{path} has already been locally cached, \code{retrieveFile} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
 #'
 #' @return String containing the path to the file on the caller's filesystem.
 #'
@@ -32,11 +36,11 @@
 #' retrieveFile(paste0(mydir, "/diet/bar"), url=info$url, forceRemote=TRUE)
 #'
 #' @export
-retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, updateDelay=3600) {
+retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE) {
     if (!forceRemote && file.exists(path)) {
         path
     } else {
         cache <- local_root(cache, url)
-        acquire_file_raw(cache, path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        acquire_file_raw(cache, path, url=url, overwrite=overwrite)
     }
 }
diff --git a/man/retrieveDirectory.Rd b/man/retrieveDirectory.Rd
index 9b06dc8..8be4555 100644
--- a/man/retrieveDirectory.Rd
+++ b/man/retrieveDirectory.Rd
@@ -33,8 +33,7 @@ Only used for remote access.}
 \item{concurrent}{Integer scalar specifying the number of concurrent downloads.
 Only used for remote access.}
 
-\item{updateDelay}{Integer scalar specifying the maximum age of a cached file, in seconds.
-Older files will be automatically checked for updates.
+\item{updateDelay}{Integer scalar specifying the minimum interval between checks for updates to a cached directory, in seconds.
 Only used for remote access.}
 }
 \value{
@@ -46,6 +45,16 @@ or a path to a local cache of the directory's contents otherwise.
 Retrieve the path to a registered directory (or its subdirectories),
 possibly creating a local copy of the directory's contents if the caller is not on the same filesystem.
 }
+\details{
+During remote access, this function exhibits the following behavior:
+\itemize{
+\item It will only check for updates to the directory contents after \code{updateDelay} seconds have passed since the previous check.
+This avoids unnecessarily frequent requests to the SewerRat API.
+\item If a file in \code{path} has already been locally cached, \code{retrieveDirectory} will automatically check the SewerRat API for updates.
+Any updates on the remote will cause the new file to be re-downloaded to the cache.
+\item Any cached files that are no longer in the remote \code{path} will be deleted from the cache.
+}
+}
 \examples{
 info <- startSewerRat()
 
diff --git a/man/retrieveFile.Rd b/man/retrieveFile.Rd
index 4089cb7..368c253 100644
--- a/man/retrieveFile.Rd
+++ b/man/retrieveFile.Rd
@@ -4,14 +4,7 @@
 \alias{retrieveFile}
 \title{Retrieve a single file}
 \usage{
-retrieveFile(
-  path,
-  url,
-  cache = NULL,
-  forceRemote = FALSE,
-  overwrite = FALSE,
-  updateDelay = 3600
-)
+retrieveFile(path, url, cache = NULL, forceRemote = FALSE, overwrite = FALSE)
 }
 \arguments{
 \item{path}{String containing the absolute path to a file in a registered directory.}
@@ -28,10 +21,6 @@ even if \code{path} is on the same filesystem as the caller.}
 
 \item{overwrite}{Logical scalar indicating whether to overwrite the existing cache.
 Only used for remote access.}
-
-\item{updateDelay}{Integer scalar specifying the maximum age of a cached file, in seconds.
-Older files will be automatically checked for updates.
-Only used for remote access.}
 }
 \value{
 String containing the path to the file on the caller's filesystem.
@@ -40,6 +29,10 @@ String containing the path to the file on the caller's filesystem.
 Retrieve the path to a single file in a registered directory.
 This will call the REST API to obtain and cache a copy of the file if the caller is not on the same filesystem.
 }
+\details{
+During remote access, if a file in \code{path} has already been locally cached, \code{retrieveFile} will automatically check the SewerRat API for updates.
+Any updates on the remote will cause the new file to be re-downloaded to the cache.
+}
 \examples{
 info <- startSewerRat()
 
diff --git a/tests/testthat/test-retrieve.R b/tests/testthat/test-retrieve.R
index b237183..0e0ceb2 100644
--- a/tests/testthat/test-retrieve.R
+++ b/tests/testthat/test-retrieve.R
@@ -2,7 +2,7 @@
 # library(testthat); library(SewerRat); source("test-retrieve.R")
 
 info <- startSewerRat()
-
+
 mydir <- tempfile()
 dir.create(mydir)
 write(file=file.path(mydir, "metadata.json"), '{ "first": "Aaron", "last": "Lun" }')
@@ -80,4 +80,31 @@ test_that("retrieveDirectory works as expected", {
     expect_identical(jsonlite::fromJSON(file.path(rdir2, "diet", "metadata.json"))$meal, "lunch")
 })
 
+test_that("retrieveDirectory works with remote updates", {
+    mydir2 <- tempfile()
+    dir.create(mydir2)
+    write(file=file.path(mydir2, "metadata.json"), '{ "first": "Kanon", "last": "Shibuya" }')
+    dir.create(file.path(mydir2, "2"))
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Kinako", "last": "Sakurakouji" }')
+    dir.create(file.path(mydir2, "3"))
+    write(file=file.path(mydir2, "3", "metadata.json"), '{ "first": "Margarete", "last": "Wien" }')
+
+    register(mydir2, "metadata.json", url=info$url)
+    on.exit(deregister(mydir2, url=info$url))
+
+    cache <- tempfile()
+    dir <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Kinako")
+    expect_true(file.exists(file.path(dir, "3", "metadata.json")))
+
+    # Checking if it responds correctly to remote updates; reuse the same cache
+    # so that the update and pruning paths are actually exercised.
+    Sys.sleep(1.5) # ensure the remote mtime exceeds the cached mtime at second-level resolution.
+    unlink(file.path(mydir2, "3"), recursive=TRUE)
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Mei", "last": "Yoneme" }')
+
+    dir <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Mei")
+    expect_false(file.exists(file.path(dir, "3", "metadata.json")))
+})
+
 deregister(mydir, url=info$url)
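---

Reviewer note: a minimal usage sketch of the new remote-update semantics, distilled from the test above. It assumes a running test instance from startSewerRat(), as in the package examples; every call shown here appears elsewhere in this patch.

library(SewerRat)
info <- startSewerRat()

mydir <- tempfile()
dir.create(mydir)
write(file=file.path(mydir, "metadata.json"), '{ "first": "Aaron", "last": "Lun" }')
register(mydir, "metadata.json", url=info$url)

# First remote retrieval populates the local cache.
cache <- tempfile()
dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE)

# Within 'updateDelay' seconds of the previous check (default 3600), the same
# call serves the cached copy without contacting the SewerRat API again.
dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE)

# updateDelay=0 forces a fresh listing: updated files are re-downloaded and
# files deleted on the remote are pruned from the cache.
dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)

deregister(mydir, url=info$url)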
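The per-file freshness check that acquire_file_raw() now performs on every cache hit boils down to the pattern below: a HEAD request whose Last-Modified header is compared against the cached file's mtime, wrapped in try() so that a failed request (e.g., no internet) falls back to the cached copy. This is a standalone sketch for review, not the package's exact implementation; needs_refresh() is a hypothetical name, and the inline header parsing stands in for the internal parse_remote_last_modified().

library(httr2)

# Sketch: should the cached file at 'target' be refreshed from 'url'?
needs_refresh <- function(target, url) {
    if (!file.exists(target)) {
        return(TRUE) # nothing cached yet, so we must download.
    }
    req <- request(url)
    req <- req_method(req, "HEAD")
    res <- try(req_perform(req), silent=TRUE)
    if (inherits(res, "try-error")) {
        return(FALSE) # HEAD failed; keep serving the cached copy.
    }
    last_modified <- resp_header(res, "last-modified")
    if (is.null(last_modified)) {
        return(FALSE) # no header to compare against.
    }
    # HTTP-date parsing; assumes English month/day names (C locale).
    remote_mod <- as.POSIXct(last_modified, format="%a, %d %b %Y %H:%M:%S", tz="GMT")
    isTRUE(remote_mod > file.info(target)$mtime)
}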