Skip to content

Commit

Permalink
CdxFormat: Ensure spaces, nulls and newlines are percent encoded in a…
Browse files Browse the repository at this point in the history
…ll string fields

Unfortunately, the CDX spec doesn't standardize how to handle this, but at least this should prevent input with these characters from clobbering other fields.

Fixes nla/outbackcdx#121
  • Loading branch information
ato committed Dec 6, 2023
1 parent d2d1135 commit d3a6247
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 6 deletions.
10 changes: 7 additions & 3 deletions src/org/netpreserve/jwarc/cdx/CdxFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
case CHECKSUM:
return record.payloadDigest()
.map(digestUnchanged ? WarcDigest::raw : WarcDigest::base32)
.map(CdxFormat::escape)
.orElse("-");
case COMPRESSED_ARC_FILE_OFFSET:
return position < 0 ? "-" : String.valueOf(position);
Expand All @@ -106,7 +107,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
case DATE:
return CdxFields.DATE_FORMAT.format(record.date());
case FILENAME:
return filename == null ? "-" : filename;
return filename == null ? "-" : escape(filename);
case MIME_TYPE:
if (record instanceof WarcRevisit) {
return PYWB_REVISIT_MIMETYPE;
Expand All @@ -115,7 +116,7 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
}
case NORMALIZED_SURT:
if (urlkey != null) {
return urlkey;
return escape(urlkey);
} else {
return escape(URIs.toNormalizedSurt(record.target()));
}
Expand Down Expand Up @@ -145,7 +146,10 @@ else if (record.contentType().base().equals(MediaType.HTTP)) {
}

/**
 * Percent-encodes characters that must not appear raw inside a CDX field value.
 *
 * <p>Fields in a formatted CDX line are separated by single spaces, so a raw space inside a
 * value would shift every following field. Line feed and NUL are encoded for the same reason:
 * to keep such input from clobbering other fields. A bare carriage return is encoded as well,
 * because common line readers (for example {@code BufferedReader.readLine}) treat it as a
 * line terminator just like a line feed.
 *
 * <p>The percent sign itself is deliberately left untouched, so values that are already
 * percent-encoded pass through unchanged.
 *
 * @param str the raw field value; may be {@code null}
 * @return the value with space, LF, CR and NUL percent-encoded, or {@code null} when
 *         {@code str} is {@code null}
 */
private static String escape(String str) {
    if (str == null) {
        return null;
    }
    return str.replace(" ", "%20")
            .replace("\n", "%0A")
            .replace("\r", "%0D")
            .replace("\0", "%00");
}

public static class Builder {
Expand Down
23 changes: 20 additions & 3 deletions test/org/netpreserve/jwarc/cdx/CdxFormatTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,26 @@ public void test() throws IOException {
CdxFormat.CDX11.format(response, path.getFileName().toString(), 123, 456));
}

/**
 * Checks that characters which would corrupt a CDX line are percent-encoded in the string
 * fields produced by {@code CdxFormat.formatField}: the urlkey (both derived from the record
 * target and explicitly supplied), the filename and the payload digest.
 */
@Test
public void testEscapingSpaces() throws IOException {
    HttpResponse httpResponse = new HttpResponse.Builder(404, "Not Found")
            .body(MediaType.HTML, new byte[0])
            .build();
    // Both the target URI and the digest value deliberately contain a space.
    WarcResponse response = new WarcResponse.Builder("data:a b")
            .date(Instant.parse("2022-03-02T21:44:34Z"))
            .payloadDigest("sha1", "A LNJ7DOPHK477BWWC726H7Y5XBPBNF7")
            .body(httpResponse)
            .build();

    // Spaces
    assertEquals("Spaces should be escaped in a urlkey derived from the record target", "a%20b)/",
            CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response, "foo", 456, 400, null));
    assertEquals("Spaces should be escaped in an explicitly supplied urlkey", "a%20b",
            CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response, "foo", 456, 400, "a b"));
    assertEquals("Spaces should be escaped in filename", "a%20b",
            CdxFormat.CDX11.formatField(CdxFields.FILENAME, response, "a b", 456, 400, "a b"));
    assertEquals("Spaces should be escaped in checksum", "A%20LNJ7DOPHK477BWWC726H7Y5XBPBNF7",
            CdxFormat.CDX11.formatField(CdxFields.CHECKSUM, response, "a b", 456, 400, "a b"));

    // Newlines and NULs go through the same escaping and must not leak into the output either.
    assertEquals("Newlines should be escaped in filename", "a%0Ab",
            CdxFormat.CDX11.formatField(CdxFields.FILENAME, response, "a\nb", 456, 400, null));
    assertEquals("Newlines should be escaped in an explicitly supplied urlkey", "a%0Ab",
            CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response, "foo", 456, 400, "a\nb"));
    assertEquals("NULs should be escaped in filename", "a%00b",
            CdxFormat.CDX11.formatField(CdxFields.FILENAME, response, "a\0b", 456, 400, null));
    assertEquals("NULs should be escaped in an explicitly supplied urlkey", "a%00b",
            CdxFormat.CDX11.formatField(CdxFields.NORMALIZED_SURT, response, "foo", 456, 400, "a\0b"));
}

@Test
public void testDigestUnchanged() throws Exception {
Path path=Paths.get("/home/jwarc/example.warc.gz");
Expand Down Expand Up @@ -94,7 +114,4 @@ public void testFullFilePath() throws Exception {
assertEquals("org,example)/ 20220302214434 http://example.org/ text/html 404 "+payloadDigest+" - - 456 123 /home/jwarc/example.warc.gz",
cdxFormat.format(response, path.toAbsolutePath().toString(), 123, 456));
}



}

0 comments on commit d3a6247

Please sign in to comment.