diff --git a/src/org/netpreserve/jwarc/cdx/CdxFormat.java b/src/org/netpreserve/jwarc/cdx/CdxFormat.java index 5a5862a..4a54fd6 100644 --- a/src/org/netpreserve/jwarc/cdx/CdxFormat.java +++ b/src/org/netpreserve/jwarc/cdx/CdxFormat.java @@ -98,6 +98,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo case CHECKSUM: return record.payloadDigest() .map(digestUnchanged ? WarcDigest::raw : WarcDigest::base32) + .map(CdxFormat::escape) .orElse("-"); case COMPRESSED_ARC_FILE_OFFSET: return position < 0 ? "-" : String.valueOf(position); @@ -106,7 +107,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo case DATE: return CdxFields.DATE_FORMAT.format(record.date()); case FILENAME: - return filename == null ? "-" : filename; + return filename == null ? "-" : escape(filename); case MIME_TYPE: if (record instanceof WarcRevisit) { return PYWB_REVISIT_MIMETYPE; @@ -115,7 +116,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo } case NORMALIZED_SURT: if (urlkey != null) { - return urlkey; + return escape(urlkey); } else { return escape(URIs.toNormalizedSurt(record.target())); } @@ -145,7 +146,10 @@ else if (record.contentType().base().equals(MediaType.HTTP)) { } private static String escape(String str) { - return str == null ? null : str.replace(" ", "%20"); + if (str == null) return null; + return str.replace(" ", "%20") + .replace("\n", "%0A") + .replace("\0", "%00"); } public static class Builder { diff --git a/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java b/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java index 8ac823e..1fa5571 100644 --- a/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java +++ b/test/org/netpreserve/jwarc/cdx/CdxFormatTest.java @@ -30,6 +30,26 @@ public void test() throws IOException { CdxFormat.CDX11.format(response, path.getFileName().toString(), 123, 456)); } + @Test + public void testEscapingSpaces() throws IOException { + HttpResponse httpResponse = new HttpResponse.Builder(404, "Not Found") + .body(MediaType.HTML, new byte[0]) + .build(); + WarcResponse response2 = new WarcResponse.Builder("data:a b") + .date(Instant.parse("2022-03-02T21:44:34Z")) + .payloadDigest("sha1", "A LNJ7DOPHK477BWWC726H7Y5XBPBNF7") + .body(httpResponse) + .build(); + assertEquals("Spaces should be escaped when formatting the urlkey", "a%20b)/", + CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response2, "foo", 456, 400, null)); + assertEquals("Spaces should be escaped when formatting the urlkey", "a%20b", + CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response2, "foo", 456, 400, "a b")); + assertEquals("Spaces should be escaped in filename", "a%20b", + CdxFormat.CDX11.formatField(CdxFields.FILENAME, response2, "a b", 456, 400, "a b")); + assertEquals("Spaces should be escaped in checksum", "A%20LNJ7DOPHK477BWWC726H7Y5XBPBNF7", + CdxFormat.CDX11.formatField(CdxFields.CHECKSUM, response2, "a b", 456, 400, "a b")); + } + @Test public void testDigestUnchanged() throws Exception { Path path=Paths.get("/home/jwarc/example.warc.gz"); @@ -94,7 +114,4 @@ public void testFullFilePath() throws Exception { assertEquals("org,example)/ 20220302214434 http://example.org/ text/html 404 "+payloadDigest+" - - 456 123 /home/jwarc/example.warc.gz", cdxFormat.format(response, path.toAbsolutePath().toString(), 123, 456)); } - - - } \ No newline at end of file