From 6a3cf1b317c87305d05faee73d2c3ee3f5ec08b0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 11 Dec 2024 21:14:06 +0100 Subject: [PATCH] WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" --- .../org/archive/resource/arc/ARCResource.java | 2 + .../http/HTTPHeadersResourceFactory.java | 11 +++-- .../archive/resource/warc/WARCResource.java | 14 ++++-- .../record/WARCMetaDataResourceFactory.java | 10 +++- .../archive/resource/arc/ARCResourceTest.java | 48 +++++++++++++++++++ .../resource/warc/WARCResourceTest.java | 46 ++++++++++++++++++ 6 files changed, 123 insertions(+), 8 deletions(-) create mode 100644 src/test/java/org/archive/resource/arc/ARCResourceTest.java create mode 100644 src/test/java/org/archive/resource/warc/WARCResourceTest.java diff --git a/src/main/java/org/archive/resource/arc/ARCResource.java b/src/main/java/org/archive/resource/arc/ARCResource.java index b6e0a1c1..b0195f08 100644 --- a/src/main/java/org/archive/resource/arc/ARCResource.java +++ b/src/main/java/org/archive/resource/arc/ARCResource.java @@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container, } } + @Override public InputStream getInputStream() { return new EOFNotifyingInputStream(digIS, this); } + @Override public void notifyEOF() throws IOException { metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); String digString = Base32.encode(digIS.getMessageDigest().digest()); diff --git a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java index 79805090..eb25d821 100644 --- a/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java +++ b/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java @@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) { parser = new HttpHeaderParser(); } + @Override public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { @@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true); } - parentMetaData.putLong(PAYLOAD_LENGTH, bytes); - - parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); + if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) { + parentMetaData.putLong(PAYLOAD_LENGTH, bytes); + } + long trailingSlopBytes = StreamCopy.readToEOF(is); + if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) { + parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes); + } if(type != null) { parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type); } diff --git a/src/main/java/org/archive/resource/warc/WARCResource.java b/src/main/java/org/archive/resource/warc/WARCResource.java index d538a25d..a9c3fcc3 100644 --- a/src/main/java/org/archive/resource/warc/WARCResource.java +++ b/src/main/java/org/archive/resource/warc/WARCResource.java @@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container, countingIS = new CountingInputStream( ByteStreams.limit(response, length)); } else { - throw new ResourceParseException(null); + throw new ResourceParseException(new Exception("Zero or negative length: " + length)); } try { digIS = new DigestInputStream(countingIS, @@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container, } } + @Override public InputStream getInputStream() { return new EOFNotifyingInputStream(digIS, this); } + @Override public void notifyEOF() throws IOException { String digString = Base32.encode(digIS.getMessageDigest().digest()); if(container.isCompressed()) { - metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); + if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) { + metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); + } metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response)); metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } else { @@ -81,13 +85,17 @@ public void notifyEOF() throws IOException { (PushBackOneByteInputStream) raw; long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS); if(numNewlines > 0) { - metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount()); + long payloadLength = countingIS.getCount(); + if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) { + metaData.putLong(PAYLOAD_LENGTH, payloadLength); + } metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines); metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString); } } } } + public MetaData getEnvelopeMetaData() { return envelope; } diff --git a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java index 0dfb2834..ba8a35da 100644 --- a/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java +++ b/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java @@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() { parser = new HttpHeaderParser(); } + @Override public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { @@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData, if(headers.isCorrupt()) { md.putBoolean(WARC_META_FIELDS_CORRUPT, true); } - parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is)); - parentMetaData.putLong(PAYLOAD_LENGTH, bytes); + long trailingSlopBytes = StreamCopy.readToEOF(is); + if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) { + parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes); + } + if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) { + parentMetaData.putLong(PAYLOAD_LENGTH, bytes); + } return new WARCMetaDataResource(md,container, headers); } catch (HttpParseException e) { diff --git a/src/test/java/org/archive/resource/arc/ARCResourceTest.java b/src/test/java/org/archive/resource/arc/ARCResourceTest.java new file mode 100644 index 00000000..43116af7 --- /dev/null +++ b/src/test/java/org/archive/resource/arc/ARCResourceTest.java @@ -0,0 +1,48 @@ +package org.archive.resource.arc; + + +import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH; +import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES; + +import java.io.IOException; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; +import org.archive.resource.Resource; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; +import org.archive.util.StreamCopy; + +import org.json.JSONObject; + +import junit.framework.TestCase; + +public class ARCResourceTest extends TestCase { + + public void testARCResource() throws ResourceParseException, IOException { + String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); + + Resource resource = extractor.getNext(); + + while (resource != null) { + JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope") + .getJSONObject("Payload-Metadata"); + System.err.println(payloadMD); + + if (payloadMD.has(PAYLOAD_LENGTH)) { + assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1); + } + if (payloadMD.has(PAYLOAD_SLOP_BYTES)) { + // does not occur with the tested ARC file + } + + StreamCopy.readToEOF(resource.getInputStream()); + resource = extractor.getNext(); + } + } +} diff --git a/src/test/java/org/archive/resource/warc/WARCResourceTest.java b/src/test/java/org/archive/resource/warc/WARCResourceTest.java new file mode 100644 index 00000000..1b935405 --- /dev/null +++ b/src/test/java/org/archive/resource/warc/WARCResourceTest.java @@ -0,0 +1,46 @@ +package org.archive.resource.warc; + +import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH; +import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES; + +import java.io.IOException; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; +import org.archive.resource.Resource; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; +import org.archive.util.StreamCopy; + +import org.json.JSONObject; + +import junit.framework.TestCase; + +public class WARCResourceTest extends TestCase { + + public void testWARCResource() throws ResourceParseException, IOException { + String testFileName = "../../format/warc/IAH-urls-wget.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); + + Resource resource = extractor.getNext(); + + while (resource != null) { + JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope") + .getJSONObject("Payload-Metadata"); + + if (payloadMD.has(PAYLOAD_LENGTH)) { + assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1); + } + if (payloadMD.has(PAYLOAD_SLOP_BYTES)) { + assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES)); + } + + StreamCopy.readToEOF(resource.getInputStream()); + resource = extractor.getNext(); + } + } +}