From 49c4e22607694a7391ba9f51ae10410309d7e066 Mon Sep 17 00:00:00 2001 From: Jon Harmon Date: Fri, 26 Apr 2024 14:07:24 -0500 Subject: [PATCH] Pagination vignette. --- vignettes/pagination.Rmd | 127 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 vignettes/pagination.Rmd diff --git a/vignettes/pagination.Rmd b/vignettes/pagination.Rmd new file mode 100644 index 0000000..f77a449 --- /dev/null +++ b/vignettes/pagination.Rmd @@ -0,0 +1,127 @@ +--- +title: "Pagination" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Pagination} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +Many APIs implement some form of pagination: they break up large datasets into "pages" of results, and return a single page at a time. +To get the full dataset, we need to make multiple requests, and combine the results. + +Unfortunately, there isn't a standard way to document API pagination. +Therefore, we cannot automatically generate pagination code. +You will need to edit your `010-call.R` file to implement pagination. + +## Finding pagination information + +Before you can implement pagination in your package, you will need to find out how the API implements pagination. +You can usually find this information in the API documentation. +Sometimes this information is in a separate "Pagination" section at the top of the documentation. +Often it is described in the individual endpoint documentation (even if it is separately described in its own section). +If it isn't clearly described, watch for pagination-related endpoint parameters, such as `page`, `pageSize`, `perPage`, `limit`, `offset`, or `cursor`. 
+ +For more tips on finding pagination information, see [How can I get a lot of data from an API?](https://r4ds.github.io/bookclub-wapir/slides/httr2/httr2-pagination.html) in [Web APIs with R](https://r4ds.github.io/bookclub-wapir/). + +## Implementing pagination + +The [req_perform_iterative() function from {httr2}](https://httr2.r-lib.org/reference/req_perform_iterative.html) helps to implement pagination. +It uses the request and some helper functions to create a new request to fetch the next page. +This family of functions is experimental, so be sure to check the latest documentation in case the functions have changed. + +To implement pagination in your package, you will need to edit the `010-call.R` file generated by {beekeeper}. +By default, the perform step is handled by `nectar::req_perform_opinionated()`: + +```{r default-perform} +resp <- nectar::req_perform_opinionated(req) +``` + +This function calls `httr2::req_perform()` if you only give it a `req` object, or `httr2::req_perform_iterative()` if you supply an iteration helper function in the `next_req` parameter. +For example, if every endpoint of your API uses a `page` parameter to paginate and returns an empty `data` field once you run out of pages, you could replace the line above with something like this: + +```{r pagination} +is_complete <- function(resp) { + length(httr2::resp_body_json(resp)$data) == 0 +} +resp <- nectar::req_perform_opinionated( + req, + next_req = httr2::iterate_with_offset("page", resp_complete = is_complete) +) +``` + +By default, `nectar::req_perform_opinionated()` only returns 2 responses (`max_reqs = 2`). +Once you have verified that your pagination strategy works, you will likely want to increase this limit, usually to `Inf`. +`nectar::req_perform_opinionated()` also implements a basic `httr2::req_retry()` to try each request up to 3 times, using the default `httr2::req_retry()` settings to decide if a failure is transient. 
+ +## More complicated pagination + +If you would like to implement more complex pagination, or apply other transformations to the `req` object such as `httr2::req_retry()` or `httr2::req_throttle()`, you can create your own `perform` function. +I name these functions `{api_abbr}_req_perform()`. +For example, this is the `perform` function for the {fecapi} package: + +```{r fecapi-perform} +.fec_req_perform <- function(req, + pagination, + per_page, + max_results, + max_reqs, + call = rlang::caller_env()) { + next_req <- .choose_pagination_fn(pagination, call = call) + max_reqs <- min(max_reqs, ceiling(max_results / per_page)) + nectar::req_perform_opinionated( + req, + next_req = next_req, + max_reqs = max_reqs + ) +} + +.choose_pagination_fn <- function(pagination, call = rlang::caller_env()) { + pagination <- .validate_pagination(pagination, call) + switch(pagination, + basic = .iterator_fn_basic(), + none = NULL + ) +} + +.validate_pagination <- function(pagination, call = rlang::caller_env()) { + rlang::arg_match0( + pagination, + c("none", "basic"), + error_call = call + ) +} + +.iterator_fn_basic <- function() { + httr2::iterate_with_offset( + "page", + resp_pages = function(resp) { + httr2::resp_body_json(resp)$pagination$pages + } + ) +} +``` + +Within `010-call.R`, I apply the function like this: + +```{r fecapi-call-with-pagination} + resp <- .fec_req_perform( + req, + pagination = pagination, + per_page = query$per_page, + max_results = max_results, + max_reqs = max_reqs + ) +``` + +## Help us improve + +If you find a pattern in pagination implementation from the API description and/or endpoint function parameters, please [submit an issue or a pull request](https://github.com/jonthegeek/beekeeper/issues) to help us improve the output of this package.