From d3a62470baf2793ee55249c1d71b39deaadf0f11 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 6 Dec 2023 15:36:20 +0900
Subject: [PATCH] CdxFormat: Ensure spaces, nulls and newlines are percent encoded in all string fields

Unfortunately the CDX spec doesn't standardize how to handle this but
at least this should prevent input with these characters from
clobbering other fields.

Fixes nla/outbackcdx#121
---
Note: line breaks and indentation in this copy of the patch were
reconstructed and may differ from the upstream commit. If a hunk fails
to apply, retry with `git apply --ignore-whitespace`.

 src/org/netpreserve/jwarc/cdx/CdxFormat.java | 10 +++++---
 .../netpreserve/jwarc/cdx/CdxFormatTest.java | 23 ++++++++++++++++---
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/org/netpreserve/jwarc/cdx/CdxFormat.java b/src/org/netpreserve/jwarc/cdx/CdxFormat.java
index 5a5862a..4a54fd6 100644
--- a/src/org/netpreserve/jwarc/cdx/CdxFormat.java
+++ b/src/org/netpreserve/jwarc/cdx/CdxFormat.java
@@ -98,6 +98,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
             case CHECKSUM:
                 return record.payloadDigest()
                         .map(digestUnchanged ? WarcDigest::raw : WarcDigest::base32)
+                        .map(CdxFormat::escape)
                         .orElse("-");
             case COMPRESSED_ARC_FILE_OFFSET:
                 return position < 0 ? "-" : String.valueOf(position);
@@ -106,7 +107,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
             case DATE:
                 return CdxFields.DATE_FORMAT.format(record.date());
             case FILENAME:
-                return filename == null ? "-" : filename;
+                return filename == null ? "-" : escape(filename);
             case MIME_TYPE:
                 if (record instanceof WarcRevisit) {
                     return PYWB_REVISIT_MIMETYPE;
@@ -115,7 +116,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
                 }
             case NORMALIZED_SURT:
                 if (urlkey != null) {
-                    return urlkey;
+                    return escape(urlkey);
                 } else {
                     return escape(URIs.toNormalizedSurt(record.target()));
                 }
@@ -145,7 +146,10 @@ else if (record.contentType().base().equals(MediaType.HTTP)) {
     }

     private static String escape(String str) {
-        return str == null ? null : str.replace(" ", "%20");
+        if (str == null) return null;
+        return str.replace(" ", "%20")
+                .replace("\n", "%0A")
+                .replace("\0", "%00");
     }

     public static class Builder {
diff --git a/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java b/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java
index 8ac823e..1fa5571 100644
--- a/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java
+++ b/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java
@@ -30,6 +30,26 @@ public void test() throws IOException {
                 CdxFormat.CDX11.format(response, path.getFileName().toString(), 123, 456));
     }

+    @Test
+    public void testEscapingSpaces() throws IOException {
+        HttpResponse httpResponse = new HttpResponse.Builder(404, "Not Found")
+                .body(MediaType.HTML, new byte[0])
+                .build();
+        WarcResponse response2 = new WarcResponse.Builder("data:a b")
+                .date(Instant.parse("2022-03-02T21:44:34Z"))
+                .payloadDigest("sha1", "A LNJ7DOPHK477BWWC726H7Y5XBPBNF7")
+                .body(httpResponse)
+                .build();
+        assertEquals("Spaces should be escaped when formatting the urlkey", "a%20b)/",
+                CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response2, "foo", 456, 400, null));
+        assertEquals("Spaces should be escaped when formatting the urlkey", "a%20b",
+                CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response2, "foo", 456, 400, "a b"));
+        assertEquals("Spaces should be escaped in filename", "a%20b",
+                CdxFormat.CDX11.formatField(CdxFields.FILENAME, response2, "a b", 456, 400, "a b"));
+        assertEquals("Spaces should be escaped in checksum", "A%20LNJ7DOPHK477BWWC726H7Y5XBPBNF7",
+                CdxFormat.CDX11.formatField(CdxFields.CHECKSUM, response2, "a b", 456, 400, "a b"));
+    }
+
     @Test
     public void testDigestUnchanged() throws Exception {
         Path path=Paths.get("/home/jwarc/example.warc.gz");
@@ -94,7 +114,4 @@ public void testFullFilePath() throws Exception {
         assertEquals("org,example)/ 20220302214434 http://example.org/ text/html 404 "+payloadDigest+" - - 456 123 /home/jwarc/example.warc.gz",
                 cdxFormat.format(response, path.toAbsolutePath().toString(), 123, 456));
     }
-
-
-
 }
\ No newline at end of file