Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" #103

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/main/java/org/archive/resource/arc/ARCResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
}
}

/**
 * Returns a new {@link EOFNotifyingInputStream} wrapping {@code digIS}, with this
 * resource passed as the second constructor argument.
 * NOTE(review): presumably that argument is the listener whose {@code notifyEOF()}
 * is invoked once the stream is exhausted - confirm against EOFNotifyingInputStream.
 */
@Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}

@Override
public void notifyEOF() throws IOException {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
parser = new HttpHeaderParser();
}

@Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
Expand All @@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
}
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);

parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
}
long trailingSlopBytes = StreamCopy.readToEOF(is);
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
}
if(type != null) {
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
}
Expand Down
14 changes: 11 additions & 3 deletions src/main/java/org/archive/resource/warc/WARCResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
countingIS = new CountingInputStream(
ByteStreams.limit(response, length));
} else {
throw new ResourceParseException(null);
throw new ResourceParseException(new Exception("Zero or negative length: " + length));
}
try {
digIS = new DigestInputStream(countingIS,
Expand All @@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
}
}

/**
 * Returns a new {@link EOFNotifyingInputStream} wrapping {@code digIS}, with this
 * resource passed as the second constructor argument.
 * NOTE(review): presumably that argument is the listener whose {@code notifyEOF()}
 * is invoked once the stream is exhausted - confirm against EOFNotifyingInputStream.
 */
@Override
public InputStream getInputStream() {
return new EOFNotifyingInputStream(digIS, this);
}

@Override
public void notifyEOF() throws IOException {
String digString = Base32.encode(digIS.getMessageDigest().digest());
if(container.isCompressed()) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
}
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
Expand All @@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
long payloadLength = countingIS.getCount();
if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
metaData.putLong(PAYLOAD_LENGTH, payloadLength);
}
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
}

/**
 * Returns the {@code envelope} metadata object held by this resource.
 *
 * @return the value of the {@code envelope} field, as stored (no copy is made here)
 */
public MetaData getEnvelopeMetaData() {
return envelope;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
parser = new HttpHeaderParser();
}

@Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
Expand All @@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
long trailingSlopBytes = StreamCopy.readToEOF(is);
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
}
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
}
return new WARCMetaDataResource(md,container, headers);

} catch (HttpParseException e) {
Expand Down
48 changes: 48 additions & 0 deletions src/test/java/org/archive/resource/arc/ARCResourceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.archive.resource.arc;


import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.util.StreamCopy;

import org.json.JSONObject;

import junit.framework.TestCase;

/**
 * Walks every record of a truncated sample ARC file and sanity-checks the
 * payload metadata ("Envelope" / "Payload-Metadata") attached to each resource.
 */
public class ARCResourceTest extends TestCase {

    /**
     * Extracts all resources from the sample ARC file and, for each one, verifies
     * that the payload length and trailing-slop values (when present) can be read
     * as a single {@code long} and hold a plausible value.
     *
     * @throws ResourceParseException if a record cannot be parsed
     * @throws IOException if reading the ARC file fails
     */
    public void testARCResource() throws ResourceParseException, IOException {
        String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
        ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
        ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);

        Resource resource = extractor.getNext();

        while (resource != null) {
            JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
                    .getJSONObject("Payload-Metadata");

            if (payloadMD.has(PAYLOAD_LENGTH)) {
                // getLong() only succeeds for a single numeric value, so this also
                // guards against the field having been stored more than once.
                assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
            }
            if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
                // Does not occur with the tested ARC file. Should a future test file
                // produce it, the value must still be a single, non-negative count.
                assertTrue(payloadMD.getLong(PAYLOAD_SLOP_BYTES) >= 0);
            }

            // Drain the record before asking the extractor for the next one.
            StreamCopy.readToEOF(resource.getInputStream());
            resource = extractor.getNext();
        }
    }
}
46 changes: 46 additions & 0 deletions src/test/java/org/archive/resource/warc/WARCResourceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.archive.resource.warc;

import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.util.StreamCopy;

import org.json.JSONObject;

import junit.framework.TestCase;

/**
 * Reads every record of a sample WARC file and checks the payload metadata
 * ("Envelope" / "Payload-Metadata") reported for each extracted resource.
 */
public class WARCResourceTest extends TestCase {

    /**
     * Iterates over all resources of the sample WARC file; where the payload
     * length is present it must not be -1, and where the trailing-slop value is
     * present it must equal 4.
     *
     * @throws ResourceParseException if a record cannot be parsed
     * @throws IOException if reading the WARC file fails
     */
    public void testWARCResource() throws ResourceParseException, IOException {
        String warcPath = getClass().getResource("../../format/warc/IAH-urls-wget.warc").getPath();
        ResourceProducer source = ProducerUtils.getProducer(warcPath);
        ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
        ExtractingResourceProducer extractor = new ExtractingResourceProducer(source, factoryMapper);

        for (Resource current = extractor.getNext(); current != null; current = extractor.getNext()) {
            JSONObject envelope = current.getMetaData().getTopMetaData().getJSONObject("Envelope");
            JSONObject payloadMetaData = envelope.getJSONObject("Payload-Metadata");

            if (payloadMetaData.has(PAYLOAD_LENGTH)) {
                long payloadLength = payloadMetaData.getLong(PAYLOAD_LENGTH);
                assertTrue(payloadLength != -1);
            }
            if (payloadMetaData.has(PAYLOAD_SLOP_BYTES)) {
                long slopBytes = payloadMetaData.getLong(PAYLOAD_SLOP_BYTES);
                assertEquals(4, slopBytes);
            }

            // Consume the record fully before advancing to the next resource.
            StreamCopy.readToEOF(current.getInputStream());
        }
    }
}
Loading