diff --git a/src/org/netpreserve/jwarc/ConcurrentRecordSet.java b/src/org/netpreserve/jwarc/ConcurrentRecordSet.java
new file mode 100644
index 0000000..0493462
--- /dev/null
+++ b/src/org/netpreserve/jwarc/ConcurrentRecordSet.java
@@ -0,0 +1,42 @@
+package org.netpreserve.jwarc;
+
+import java.net.URI;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A set for testing whether WARC records are concurrent (i.e. part of the same capture event).
+ */
+public class ConcurrentRecordSet {
+    private final Set<URI> set = new HashSet<>();
+
+    /**
+     * Adds the record's ID and, for capture records, every ID it is declared concurrent to.
+     */
+    public void add(WarcRecord record) {
+        set.add(record.id());
+        if (record instanceof WarcCaptureRecord) {
+            set.addAll(((WarcCaptureRecord) record).concurrentTo());
+        }
+    }
+
+    /**
+     * Tests if the record's ID, or (for capture records) any ID it is concurrent to, was added before.
+     */
+    public boolean contains(WarcRecord record) {
+        if (set.contains(record.id())) return true;
+        if (record instanceof WarcCaptureRecord) {
+            for (URI id : ((WarcCaptureRecord) record).concurrentTo()) {
+                if (set.contains(id)) return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Removes all record IDs from the set.
+     */
+    public void clear() {
+        set.clear();
+    }
+}
diff --git a/src/org/netpreserve/jwarc/tools/ExtractTool.java b/src/org/netpreserve/jwarc/tools/ExtractTool.java
index 4926a64..ce53660 100644
--- a/src/org/netpreserve/jwarc/tools/ExtractTool.java
+++ b/src/org/netpreserve/jwarc/tools/ExtractTool.java
@@ -15,9 +15,7 @@
 import java.nio.channels.WritableByteChannel;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Optional;
+import java.util.*;
 
 import static java.nio.charset.StandardCharsets.US_ASCII;
 import static java.nio.charset.StandardCharsets.UTF_8;
@@ -85,6 +83,7 @@ private static void usage(int exitValue) {
         System.err.println();
         System.err.println("Options:");
         System.err.println();
+        System.err.println(" --concurrent\talso outputs any immediately following concurrent records");
         System.err.println(" --headers\toutput only record (and HTTP) headers");
         System.err.println(" --payload\toutput only record payload, if necessary");
         System.err.println(" \tdecode transfer and/or content encoding");
@@ -95,11 +94,16 @@ public static void main(String[] args) throws IOException {
         ExtractAction action = ExtractAction.RECORD;
         Path warcFile = null;
         List<Long> offsets = new ArrayList<>();
+        boolean extractConcurrent = false;
         for (String arg : args) {
             switch (arg) {
                 case "-h":
                 case "--help":
                     usage(0);
+                    break;
+                case "--concurrent":
+                    extractConcurrent = true;
+                    break;
                 case "--headers":
                     action = ExtractAction.HEADERS;
                     break;
@@ -128,7 +132,9 @@ public static void main(String[] args) throws IOException {
         }
         if (warcFile == null || offsets.isEmpty()) {
             usage(1);
+            return;
         }
+        WritableByteChannel out = Channels.newChannel(System.out);
         for (long offset : offsets) {
             try (FileChannel channel = FileChannel.open(warcFile);
                  WarcReader reader = new WarcReader(channel.position(offset))) {
@@ -137,22 +143,40 @@ public static void main(String[] args) throws IOException {
                     System.err.println("No record found at position " + offset);
                     System.exit(1);
                 }
-                WritableByteChannel out = Channels.newChannel(System.out);
-                switch (action) {
-                    case RECORD:
-                        writeWarcHeaders(out, record.get());
-                        writeBody(out, record.get().body());
-                        out.write(ByteBuffer.wrap("\r\n\r\n".getBytes(US_ASCII)));
-                        break;
-                    case HEADERS:
-                        writeWarcHeaders(out, record.get());
-                        writeHttpHeaders(out, record.get());
-                        break;
-                    case PAYLOAD:
-                        writePayload(out, record.get());
-                        break;
+
+                writeRecord(record.get(), out, action);
+
+                if (extractConcurrent) {
+                    ConcurrentRecordSet concurrentSet = new ConcurrentRecordSet();
+                    // Keep writing following records until one is not concurrent to any record seen so far.
+                    while (true) {
+                        concurrentSet.add(record.get());
+                        record = reader.next();
+                        if (!record.isPresent() || !concurrentSet.contains(record.get())) break;
+                        writeRecord(record.get(), out, action);
+                    }
                 }
             }
         }
     }
+
+    /**
+     * Writes one record to the channel in the form selected by the action.
+     */
+    private static void writeRecord(WarcRecord record, WritableByteChannel out, ExtractAction action) throws IOException {
+        switch (action) {
+            case RECORD:
+                writeWarcHeaders(out, record);
+                writeBody(out, record.body());
+                out.write(ByteBuffer.wrap("\r\n\r\n".getBytes(US_ASCII)));
+                break;
+            case HEADERS:
+                writeWarcHeaders(out, record);
+                writeHttpHeaders(out, record);
+                break;
+            case PAYLOAD:
+                writePayload(out, record);
+                break;
+        }
+    }
 }