diff --git a/meta/bindings/python/papermuncher.py b/meta/bindings/python/papermuncher.py new file mode 100644 index 00000000..07b688f6 --- /dev/null +++ b/meta/bindings/python/papermuncher.py @@ -0,0 +1,141 @@ +import dataclasses as dc +from email.message import Message +from pathlib import Path +from email.parser import BytesParser +import subprocess +import tempfile +from typing import IO +import magic + + +class Loader: + def handleRequest( + self, url: str, headers: dict[str, str] + ) -> tuple[int, dict[str, str], bytes]: + return ( + 404, + { + "mime": "text/html", + }, + b"404 Not Found", + ) + + +@dc.dataclass +class StaticDir(Loader): + _path: Path + + def __init__(self, path: Path): + self._path = path + + def handleRequest( + self, url: str, headers: dict[str, str] + ) -> tuple[int, dict[str, str], bytes]: + path = self._path / url + if not path.exists(): + return ( + 404, + { + "mime": "text/html", + }, + b"404 Not Found", + ) + with open(path, "rb") as f: + return ( + 200, + { + "mime": magic.Magic(mime=True).from_file(path), + }, + f.read(), + ) + + +def _run( + args: list[str], + loader=Loader(), +) -> bytes: + def _readRequest(fd: IO) -> Message[str, str] | None: + # Read the request header from the file descriptor + parser = BytesParser() + return parser.parse(fd) + + def _sendResponse(fd: IO, status: int, headers: dict[str, str], body: bytes): + fd.write(f"HTTP/2 {status}\r\n".encode()) + for key, value in headers.items(): + fd.write(f"{key}: {value}\r\n".encode()) + fd.write(b"\r\n") + fd.write(body) + + with subprocess.Popen( + args, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as proc: + stdout = proc.stdout + if stdout is None: + raise ValueError("stdout is None") + + stderr = proc.stderr + if stderr is None: + raise ValueError("stderr is None") + + stdin = proc.stdin + if stdin is None: + raise ValueError("stdin is None") + + while True: + request = _readRequest(stdout) + if request is None: + raise 
ValueError("request is None") + + if request.preamble is None: + raise ValueError("request.preamble is None") + + preamble = request.preamble.split(" ") + if preamble[0] == b"GET": + _sendResponse(stdin, *loader.handleRequest(preamble[1], dict(request))) + elif preamble[0] == b"POST": + payload = request.get_payload() + if not isinstance(payload, bytes): + raise ValueError("payload is not bytes") + proc.terminate() + return payload + else: + raise ValueError("Invalid request") + + +def find() -> Path: + return Path(__file__).parent / "bin" + + +def print( + document: bytes | str | Path, + mime: str = "text/html", + loader: Loader = StaticDir(Path.cwd()), + bin: Path = find(), + **kwargs: str, +) -> bytes: + extraArgs = [] + for key, value in kwargs.items(): + extraArgs.append(f"--{key}") + extraArgs.append(str(value)) + + if isinstance(document, Path): + return _run( + [str(bin), "print", "-i", str(document), "-o", "out.pdf"] + extraArgs, + loader, + ) + else: + with tempfile.NamedTemporaryFile(delete=False) as f: + if isinstance(document, str): + document = document.encode() + f.write(document) + return _run( + [str(bin), "print", "-i", f.name, "-o", "out.pdf"] + extraArgs, + loader, + ) + return b"" + + +__all__ = ["Loader", "StaticDir", "print"] diff --git a/meta/bindings/python/sample.py b/meta/bindings/python/sample.py new file mode 100644 index 00000000..e1a59c49 --- /dev/null +++ b/meta/bindings/python/sample.py @@ -0,0 +1,12 @@ +import papermuncher + +with open("out.pdf", "wb") as f: + document = """ +

Hello, world!

+ """ + f.write( + papermuncher.print( + document, + paper="a4", + ) + ) diff --git a/meta/site/protocol.md b/meta/site/protocol.md new file mode 100644 index 00000000..d97e64ce --- /dev/null +++ b/meta/site/protocol.md @@ -0,0 +1,130 @@ +**HTTPipe Specification** + +This document describes the HTTPipe mode of PaperMuncher and the corresponding wire protocol interactions. It outlines how to start PaperMuncher in HTTPipe mode, as well as the format and flow of requests, responses, and result submissions. + +--- + +## 1. Overview + +HTTPipe mode allows PaperMuncher to fetch content, process it, and submit the processed result back via HTTP. The primary use case is converting web pages or other retrievable resources into PDF documents, though the mechanism can be extended for other transformations. + +--- + +## 2. Starting PaperMuncher in HTTPipe Mode + +To start PaperMuncher in HTTPipe mode, use the following command: + +```bash +paper-muncher print <input> -o <output> --httpipe [options] +``` + +where: + +- `<input>` + The resource to convert. This can be a file path, or a URL if you are using HTTPipe mode. + +- `-o <output>` + Specifies the output file path. + +- `--httpipe` + Activates HTTPipe mode, indicating the input should be fetched via HTTP, and the result optionally submitted via an HTTP POST. + +- `[options]` + Additional flags or configuration parameters relevant to your setup (e.g., SSL options, custom headers, etc.). + +--- + +## 3. Wire Protocol + +This section covers the HTTP messages exchanged when PaperMuncher operates in HTTPipe mode. 
+ +### 3.1 HTTPipe Request + +PaperMuncher issues an HTTP GET request to retrieve the resource specified by `<input>`: + +``` +GET <input> HTTPIPE/1 +User-Agent: PaperMuncher/0.1.0 Vaev/0.1.0 +``` + +- **Method**: `GET` +- **URL**: `<input>` (provided as `<input>` on the command line) +- **HTTP Version**: `HTTPIPE/1` +- **User-Agent**: `PaperMuncher/0.1.0 Vaev/0.1.0` + +### 3.2 HTTPipe Response + +The server responds to the GET request, typically returning an HTML page or other resource content. A valid HTTPipe response might look like: + +``` +HTTPIPE/1 200 +content-Type: text/html; charset=UTF-8 +content-Length: 14513 +``` + +- **HTTP Status**: `200 OK` + Indicates a successful retrieval of the resource. + +- **content-Type**: `text/html; charset=UTF-8` (or other valid MIME type) + Specifies the resource type. PaperMuncher will parse and process this content accordingly. + +- **content-Length**: `14513` + Size of the returned resource body (in bytes). + +--- + +### 3.3 HTTPipe Result + +After processing the retrieved resource (e.g., converting it to a PDF), PaperMuncher will post the result back to a specified URL via an HTTP POST: + +``` +POST <output> HTTPIPE/1 +User-Agent: PaperMuncher/0.1.0 Vaev/0.1.0 +Content-Type: application/pdf +Content-Length: 131345 +``` + +- **Method**: `POST` +- **URL**: `<output>` (derived from the `-o` argument provided to PaperMuncher) +- **HTTP Version**: `HTTPIPE/1` +- **User-Agent**: `PaperMuncher/0.1.0 Vaev/0.1.0` +- **Content-Type**: `application/pdf` + Indicates the resulting file is in PDF format. + +- **Content-Length**: `131345` + Size of the PDF data (in bytes). + +--- + +## 4. Example Flow + +1. **Client Invocation** + A user runs PaperMuncher to fetch `https://example.com` and produce `output.pdf`: + ```bash + paper-muncher print https://example.com -o output.pdf --httpipe + ``` + +2. 
**Initial GET** + PaperMuncher sends a GET request over HTTPIPE/1: + ``` + GET https://example.com HTTPIPE/1 + User-Agent: PaperMuncher/0.1.0 Vaev/0.1.0 + ``` + The server returns an HTML document. + +3. **Processing** + PaperMuncher converts the HTML into a PDF. + +4. **Result POST (Optional)** + PaperMuncher will then POST the PDF file back: + ``` + POST https://example.com/upload HTTPIPE/1 + User-Agent: PaperMuncher/0.1.0 Vaev/0.1.0 + Content-Type: application/pdf + Content-Length: 131345 + + [binary PDF data] + ``` + +5. **Completion** + PaperMuncher exits