From e6ce8ef6417e2cb53a0e445861907cf5ac152d6b Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 10 Dec 2024 17:40:42 -0800 Subject: [PATCH] feature: make pagefind rust lib more sdk like --- pagefind/src/lib.rs | 1 + pagefind/src/service/api.rs | 177 ++++++++++++++++++++++++++++++ pagefind/src/service/mod.rs | 116 +++++++------------- pagefind/src/service/responses.rs | 13 ++- 4 files changed, 227 insertions(+), 80 deletions(-) create mode 100644 pagefind/src/service/api.rs diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs index 370a96cf..cd60dfb4 100644 --- a/pagefind/src/lib.rs +++ b/pagefind/src/lib.rs @@ -6,6 +6,7 @@ use hashbrown::HashMap; use index::PagefindIndexes; pub use options::{PagefindInboundConfig, SearchOptions}; use output::SyntheticFile; +pub use service::api; use wax::{Glob, WalkEntry}; use crate::index::build_indexes; diff --git a/pagefind/src/service/api.rs b/pagefind/src/service/api.rs new file mode 100644 index 00000000..fb44bd5a --- /dev/null +++ b/pagefind/src/service/api.rs @@ -0,0 +1,177 @@ +use hashbrown::HashMap; +use std::path::PathBuf; + +use crate::{ + fossick::{parser::DomParserResult, Fossicker}, + PagefindInboundConfig, SearchOptions, SearchState, +}; +use base64::{engine::general_purpose, Engine as _}; + +use super::{IndexedFileResponse, SyntheticFileResponse}; + +pub struct PagefindIndex { + search_index: SearchState, +} + +impl PagefindIndex { + /// Create a new PagefindIndex instance. + /// + /// # Arguments + /// * `config` - An optional PagefindServiceConfig to apply to the service. + /// + /// # Returns + /// An optional PagefindIndex instance. If the search options are invalid, it + /// will return None. + pub fn new(config: PagefindInboundConfig) -> Option { + match SearchOptions::load(config) { + Ok(opts) => Some(Self { + search_index: SearchState::new(opts), + }), + Err(_) => None, + } + } + + /// Add a file into this search index. + /// Either a filepath or a URL must be provided. + /// + /// # Arguments + /// * `file_path` - The path to the file to add. + /// * `url` - The URL to the file to add. + /// * `file_contents` - The contents of the file to add. + /// + /// # Returns + /// Either the PageFragmentData of the file added or an error message, if it fails to add the + /// file. + pub async fn add_file( + &mut self, + file_path: Option, + url: Option, + file_contents: String, + ) -> Result { + if file_path.is_none() && url.is_none() { + return Err("Either file_path or url must be provided".into()); + } + + let file = Fossicker::new_synthetic(file_path.map(PathBuf::from), url, file_contents); + let data = self.search_index.fossick_one(file).await; + + match data { + Ok(data) => Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta + }), + Err(_) => Err("Failed to add file".to_string()), + } + } + + /// Add a record to the search index. + /// This is a more manual way to add a record to the search index, allowing for more control + /// over the data. This is useful for adding records that are not files. + /// + /// # Arguments + /// * `url` - The URL of the record. + /// * `content` - The content of the record. + /// * `language` - The language of the record. + /// * `meta` - Optional metadata to add to the record. + /// * `filters` - Optional filters to apply to the record. + /// * `sort` - Optional sorting to apply to the record. + pub async fn add_record( + &mut self, + url: String, + content: String, + language: String, + meta: Option>, + filters: Option>>, + sort: Option>, + ) -> Result { + let data = DomParserResult { + digest: content, + filters: filters.unwrap_or_default(), + sort: sort.unwrap_or_default(), + meta: meta.unwrap_or_default(), + anchor_content: HashMap::new(), + has_custom_body: false, + force_inclusion: true, + has_html_element: true, + has_old_bundle_reference: false, + language: self + .search_index + .options + .force_language + .clone() + .unwrap_or(language), + }; + let file = Fossicker::new_with_data(url, data); + let data = self.search_index.fossick_one(file).await; + + match data { + Ok(data) => Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta + }), + Err(_) => Err("Failed to add file".to_string()), + } + } + + /// Add a directory to the search index with a glob pattern. + /// + /// # Arguments + /// * `path` - The path to the directory to index. + /// * `glob` - A glob pattern to match files in the directory. If not provided, the default glob pattern will be used. + /// + /// # Returns + /// Either the number of pages indexed or an error message, if it fails to index the directory. + pub async fn add_dir(&mut self, path: String, glob: Option) -> Result { + let defaults: PagefindInboundConfig = + serde_json::from_str("{}").expect("All fields have serde defaults"); + let glob = glob.unwrap_or(defaults.glob); + + let data = self + .search_index + .fossick_many(PathBuf::from(path), glob) + .await; + match data { + Ok(page_count) => Ok(page_count), + Err(_) => Err("Failed to index directory".to_string()), + } + } + + /// Build the search index for this instance and hold it in memory. + pub async fn build_indexes(&mut self) { + self.search_index.build_indexes().await; + } + + /// Build the search index for this instance and write the files to disk. + /// + /// # Arguments + /// * `output_path` - The path to write the files to. If not provided, the default output path will be used. + pub async fn write_files(&mut self, output_path: Option) -> String { + self.search_index.build_indexes().await; + let resolved_output_path = self + .search_index + .write_files(output_path.map(Into::into)) + .await; + + resolved_output_path.to_string_lossy().into() + } + + /// Build the search index for this instance and return the files as a list of + /// SyntheticFileResponse. + /// + /// # Returns + /// A list of SyntheticFileResponse containing the path and content of each file. + pub async fn get_files(&mut self) -> Vec { + self.search_index.build_indexes().await; + self.search_index + .get_files() + .await + .into_iter() + .map(|file| SyntheticFileResponse { + path: file.filename.to_string_lossy().into(), + content: general_purpose::STANDARD.encode(file.contents), + }) + .collect() + } +} diff --git a/pagefind/src/service/mod.rs b/pagefind/src/service/mod.rs index 56e9b23a..09b54296 100644 --- a/pagefind/src/service/mod.rs +++ b/pagefind/src/service/mod.rs @@ -1,21 +1,17 @@ -use std::{ - io::{BufRead, Write}, - path::PathBuf, -}; +use std::io::{BufRead, Write}; +use api::PagefindIndex; use base64::{engine::general_purpose, Engine as _}; -use hashbrown::HashMap; use rust_patch::Patch; use tokio::sync::mpsc; -use crate::{ - fossick::{parser::DomParserResult, Fossicker}, - PagefindInboundConfig, SearchOptions, SearchState, -}; +pub mod api; use requests::*; use responses::*; +use crate::PagefindInboundConfig; + mod requests; mod responses; @@ -37,19 +33,18 @@ pub async fn run_service() { std::process::exit(0); } - let Ok(decoded) = general_purpose::STANDARD - .decode(buf) else { - parse_error_outgoing_tx - .send(ServiceResponse { - message_id: None, - payload: ResponseAction::Error { - original_message: None, - message: "Unparseable message, not valid base64".into() - }, - }) - .expect("Channel is open"); - return; - }; + let Ok(decoded) = general_purpose::STANDARD.decode(buf) else { + parse_error_outgoing_tx + .send(ServiceResponse { + message_id: None, + payload: ResponseAction::Error { + original_message: None, + message: "Unparseable message, not valid base64".into(), + }, + }) + .expect("Channel is open"); + return; + }; match serde_json::from_slice::(&decoded) { Ok(msg) => { @@ -118,10 +113,10 @@ pub async fn run_service() { }; fn get_index<'a>( - indexes: &'a mut Vec>, + indexes: &'a mut Vec>, index_id: u32, err: impl FnOnce(&str), - ) -> Option<&'a mut SearchState> { + ) -> Option<&'a mut api::PagefindIndex> { match indexes.get_mut(index_id as usize) { Some(Some(index)) => Some(index), Some(None) => { @@ -138,22 +133,21 @@ pub async fn run_service() { match msg.payload { RequestAction::NewIndex { config } => { let index_id = indexes.len(); - let mut service_options: PagefindInboundConfig = serde_json::from_str("{}").expect("All fields have serde defaults"); - service_options.service = true; + if let Some(config) = config { service_options = config.apply(service_options); } - match SearchOptions::load(service_options) { - Ok(opts) => { - indexes.insert(index_id, Some(SearchState::new(opts))); + match PagefindIndex::new(service_options) { + Some(index) => { + indexes.insert(index_id, Some(index)); send(ResponseAction::NewIndex { index_id: index_id as u32, }); } - Err(_) => { + None => { err("Invalid config supplied"); } } @@ -165,22 +159,14 @@ pub async fn run_service() { file_contents, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - if file_path.is_none() && url.is_none() { - return err( - "Either a source path to the file, or an explicit URL must be provided", - ); - } - - let file = - Fossicker::new_synthetic(file_path.map(PathBuf::from), url, file_contents); - let data = index.fossick_one(file).await; - match data { + let page_fragment = index.add_file(file_path, url, file_contents).await; + match page_fragment { Ok(data) => send(ResponseAction::IndexedFile { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url.clone(), - page_meta: data.fragment.data.meta.clone(), + page_word_count: data.page_word_count, + page_url: data.page_url.clone(), + page_meta: data.page_meta.clone(), }), - Err(_) => err("Failed to add file"), + Err(message) => err(&message), } } } @@ -194,25 +180,14 @@ pub async fn run_service() { sort, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - let data = DomParserResult { - digest: content, - filters: filters.unwrap_or_default(), - sort: sort.unwrap_or_default(), - meta: meta.unwrap_or_default(), - anchor_content: HashMap::new(), - has_custom_body: false, - force_inclusion: true, - has_html_element: true, - has_old_bundle_reference: false, - language: index.options.force_language.clone().unwrap_or(language), - }; - let file = Fossicker::new_with_data(url, data); - let data = index.fossick_one(file).await; + let data = index + .add_record(url, content, language, meta, filters, sort) + .await; match data { Ok(data) => send(ResponseAction::IndexedFile { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url.clone(), - page_meta: data.fragment.data.meta.clone(), + page_word_count: data.page_word_count, + page_url: data.page_url.clone(), + page_meta: data.page_meta.clone(), }), Err(_) => err("Failed to add file"), } @@ -224,12 +199,7 @@ pub async fn run_service() { glob, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - let defaults: PagefindInboundConfig = - serde_json::from_str("{}").expect("All fields have serde defaults"); - let glob = glob.unwrap_or_else(|| defaults.glob); - - let data = index.fossick_many(PathBuf::from(path), glob).await; - match data { + match index.add_dir(path, glob).await { Ok(page_count) => send(ResponseAction::IndexedDir { page_count: page_count as u32, }), @@ -251,7 +221,7 @@ pub async fn run_service() { index.build_indexes().await; let resolved_output_path = index.write_files(output_path.map(Into::into)).await; send(ResponseAction::WriteFiles { - output_path: resolved_output_path.to_string_lossy().into(), + output_path: resolved_output_path, }); } } @@ -259,15 +229,7 @@ pub async fn run_service() { if let Some(index) = get_index(&mut indexes, index_id, err) { index.build_indexes().await; let files = index.get_files().await; - send(ResponseAction::GetFiles { - files: files - .into_iter() - .map(|file| SyntheticFileResponse { - path: file.filename.to_string_lossy().into(), - content: general_purpose::STANDARD.encode(file.contents), - }) - .collect(), - }); + send(ResponseAction::GetFiles { files }); } } RequestAction::DeleteIndex { index_id } => match indexes.get_mut(index_id as usize) { diff --git a/pagefind/src/service/responses.rs b/pagefind/src/service/responses.rs index 14e0875a..4ba2cf4f 100644 --- a/pagefind/src/service/responses.rs +++ b/pagefind/src/service/responses.rs @@ -36,7 +36,14 @@ pub(super) enum ResponseAction { } #[derive(Debug, Deserialize, Serialize)] -pub(super) struct SyntheticFileResponse { - pub(super) path: String, - pub(super) content: String, +pub struct IndexedFileResponse { + pub page_word_count: u32, + pub page_url: String, + pub page_meta: HashMap, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct SyntheticFileResponse { + pub path: String, + pub content: String, }