bitextor · jelmervdl · Nov 2, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 6, 2023
diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc
@@ -5,37 +5,67 @@
 #include <boost/log/trivial.hpp>
 #include <boost/algorithm/string/predicate.hpp>
 
+namespace {
+    const std::string kRobotsTxtPath = "/robots.txt";
+
+    /// True when the record's URL is a site's top-level robots.txt: the path
+    /// right after the host is exactly "/robots.txt", with no further path,
+    /// query string or fragment. Accepts "scheme://host/...", "//host/..."
+    /// and scheme-less "host/..." URLs.
+    bool isRobotsTxt(const warc2text::Record &record) {
+        const auto &url = record.getURL();
+
+        // Skip the "scheme://" prefix. "://" only marks a scheme when no '/',
+        // '?' or '#' comes before it; otherwise it is part of the path or query
+        // (e.g. "host/go?u=http://x/robots.txt") and must not be mistaken for
+        // the start of the hostname.
+        auto host_offset = url.find("://");
+        if (host_offset != std::string::npos && url.find_first_of("/?#") == host_offset + 1) {
+            host_offset += 3; // len(://)
+        }
+        // maybe it is a relative url, i.e. //hostname?
+        else if (url.compare(0, 2, "//") == 0) {
+            host_offset = 2; // len(//)
+        }
+        // Just assume there is no protocol, and we start with the hostname.
+        else {
+            host_offset = 0;
+        }
+
+        // Find the beginning of the path
+        const auto path_offset = url.find('/', host_offset);
+        if (path_offset == std::string::npos)
+            return false;
+
+        // Everything from the path onwards must be exactly "/robots.txt".
+        return url.compare(path_offset, std::string::npos, kRobotsTxtPath) == 0;
+    }
+}
+
 namespace warc2text {
     const std::unordered_set<std::string> WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3",
                                                                                 ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" };
 
-    WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, 
-                                       const std::string& outputFolder, const std::unordered_set<std::string>& output_files,
-                                       const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert,
-                                       const std::string& urlFiltersFile, bool encodeURLs,
-                                       bool paragraph_identification) :
+    WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, WARCPreprocessorOptions const &options) :
         detector(detector),
-        writer(outputFolder, output_files),
+        options(options), // the member is a reference: the caller's options object must outlive this instance
+        writer(options.output, options.output_files),
         totalRecords(0),
         textRecords(0),
         langRecords(0),
         totalBytes(0),
         textBytes(0),
         langBytes(0),
-        tagFilters(),
-        pdf_warc_filename(pdf_warc_filename),
-        invert(invert),
-        encodeURLs(encodeURLs),
-        paragraph_identification(paragraph_identification) {
-            if (!tagFiltersFile.empty())
-                util::readTagFiltersRegex(tagFiltersFile, tagFilters);
-
-            if (!urlFiltersFile.empty())
-                util::readUrlFiltersRegex(urlFiltersFile, urlFilter);
+        tagFilters() {
+            if (!options.tag_filters_filename.empty()) // tag filters are optional: an empty file name loads none
+                util::readTagFiltersRegex(options.tag_filters_filename, tagFilters);
+
+            if (!options.url_filters_filename.empty()) // url filters are optional: an empty file name loads none
+                util::readUrlFiltersRegex(options.url_filters_filename, urlFilter);
         }
 
     // true if url is good
-    bool WARCPreprocessor::URLfilter(const std::string& url) {
+    bool WARCPreprocessor::URLfilter(const std::string& url) const {
         if (boost::algorithm::ends_with(url, "robots.txt"))
             return false;
 
@@ -51,7 +81,6 @@ namespace warc2text {
         return true;
     }
 
-
     void WARCPreprocessor::process(const std::string& filename) {
         BOOST_LOG_TRIVIAL(info) << "Processing " << filename;
         WARCReader reader(filename);
@@ -60,18 +89,31 @@ namespace warc2text {
         bool done = false;
         int n_langs = 0;
 
-        bool pdfpass = !pdf_warc_filename.empty();
         WARCWriter pdf_warc_writer;
+        if (!options.pdf_warc_filename.empty())
+            pdf_warc_writer.open(options.pdf_warc_filename);
 
+        WARCWriter robots_warc_writer;
+        if (!options.robots_warc_filename.empty())
+            robots_warc_writer.open(options.robots_warc_filename);
+
         while (!done) {
             done = !reader.getRecord(content);
+
+            // Note that content.empty() will also be true when len(record) > max_size (which is 20MB by default)
             if (done or content.empty())
                 continue;
 
             Record record(content);
             if (record.getPayload().empty())
                 continue;
 
+            // Pick out all robots.txt related records.
+            if (::isRobotsTxt(record)) {
+                robots_warc_writer.writeRecord(content); // no-op if robots_warc_writer is not opened.
+                continue;
+            }
+
             if (record.getRecordType() != "response" && record.getRecordType() != "resource")
                 continue;
 
@@ -82,19 +124,8 @@ namespace warc2text {
             // PDFs that have gone through bitextor-warc2htmlwarc.py will have URL ending in .pdf but text HTTP content type
             if (not record.isTextFormat() and (boost::algorithm::ends_with(record.getURL(), ".pdf") or record.getHTTPcontentType() == "application/pdf")) {
                 // found a PDF file, write record to disk and continue
-                if (pdfpass) {
-                    // Work-around for https://github.com/bitextor/warc2text/issues/16 for ParaCrawl
-                    // we do not really have a use case for massive PDFs at this moment. Skip em.
-                    if (content.size() >= static_cast<std::size_t>(std::numeric_limits<uInt>::max())) {
-                        BOOST_LOG_TRIVIAL(info) << "PDF too large to compress with util::GZCompress";
-                        continue;
-                    }
-
-                    if (!pdf_warc_writer.is_open())
-                        pdf_warc_writer.open(pdf_warc_filename);
-
-                    pdf_warc_writer.writeRecord(content);
-                }
+                // this is a no-op if pdf_warc_writer is not opened.
+                pdf_warc_writer.writeRecord(content);
                 continue;
             }
 
@@ -104,7 +135,7 @@ namespace warc2text {
             if (!URLfilter(record.getURL()))
                 continue;
 
-            if (encodeURLs)
+            if (options.encodeURLs)
                 record.encodeURL();
 
             BOOST_LOG_TRIVIAL(trace) << "Processing HTML document " << record.getURL() << "\n";
@@ -123,7 +154,7 @@ namespace warc2text {
                 continue;
             }
 
-            if ((clean_retval == util::FILTERED_DOCUMENT_ERROR) != invert) {
+            if ((clean_retval == util::FILTERED_DOCUMENT_ERROR) != options.tag_filters_invert) {
                 BOOST_LOG_TRIVIAL(info) << "Record " << record.getURL() << " discarded due to tag filters";
                 continue;
             } else if (clean_retval == util::HTML_PARSING_ERROR) {
@@ -169,9 +200,8 @@ namespace warc2text {
 
             langRecords += n_langs;
 
-            writer.write(record, paragraph_identification);
+            writer.write(record, options.paragraph_identification);
         }
-        pdf_warc_writer.close();
     }
 
     void WARCPreprocessor::printStatistics() const{
@@ -188,12 +218,19 @@ namespace warc2text {
         warc = nullptr;
     }
 
+    WARCWriter::~WARCWriter() {
+        close(); // closes on destruction. NOTE(review): no deleted copy operations are visible, so a copied writer would presumably close the same file twice; confirm
+    }
+
     void WARCWriter::open(const std::string& warc_filename) {
         filename = warc_filename;
         if (not boost::algorithm::ends_with(filename, ".warc.gz"))
             filename += ".warc.gz";
-        std::string folder = filename.substr(0, filename.find_last_of('/'));
-        util::createDirectories(folder);
+        auto filename_offset = filename.find_last_of('/');
+        if (filename_offset != std::string::npos) {
+            std::string folder = filename.substr(0, filename_offset);
+            util::createDirectories(folder);
+        }
         warc = std::fopen(filename.c_str(), "wb");
     }
 

diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh
@@ -17,15 +17,35 @@ namespace warc2text {
             std::string filename;
         public:
             WARCWriter();
+            ~WARCWriter();
             void open(const std::string& warc_filename);
             void close();
             bool is_open();
             void writeRecord(const std::string& content);
     };
 
+    struct WARCPreprocessorOptions { // settings handed to the WARCPreprocessor constructor
+        std::string pdf_warc_filename; // when non-empty, process() opens a WARC writer at this path for PDF records
+        std::string robots_warc_filename; // when non-empty, process() opens a WARC writer at this path for robots.txt records
+
+        bool paragraph_identification{}; // forwarded to BilangWriter::write() for every written record
+
+        std::string output; // first constructor argument of the BilangWriter (the CLI calls it the output folder)
+        std::unordered_set<std::string> output_files; // second constructor argument of the BilangWriter
+
+        std::string tag_filters_filename; // when non-empty, tag filters are read from this file
+        bool tag_filters_invert{}; // flips which outcome of the tag filters causes a record to be discarded
+
+        std::string url_filters_filename; // when non-empty, the URL filter regex is read from this file
+
+        bool multilang{}; // NOTE(review): not read in the visible part of the code; confirm where it is used
+        bool encodeURLs{}; // when set, process() calls record.encodeURL() before handling the document
+    };
+
     class WARCPreprocessor {
         private:
             LanguageDetector const &detector;
+            WARCPreprocessorOptions const &options;
             BilangWriter writer;
             unsigned int totalRecords;
             unsigned int textRecords;
@@ -35,20 +55,12 @@ namespace warc2text {
             unsigned int langBytes;
             util::umap_tag_filters_regex tagFilters;
             boost::regex urlFilter;
-            std::string pdf_warc_filename;
-            bool invert;
-            bool encodeURLs;
-            bool paragraph_identification;
-
+
             static const std::unordered_set<std::string> removeExtensions;
-            bool URLfilter(const std::string& url);
+            bool URLfilter(const std::string& url) const;
 
         public:
-            explicit WARCPreprocessor(LanguageDetector const &detector,
-                                      const std::string& outputFolder, const std::unordered_set<std::string>& output_files = {},
-                                      const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "",
-                                      bool invert = false, const std::string& urlFiltersFile = "",
-                                      bool encodeURLs = false, bool paragraph_identification = false);
+            explicit WARCPreprocessor(LanguageDetector const &detector, WARCPreprocessorOptions const &options);
             void process(const std::string &filename);
             void printStatistics() const;
     };

diff --git a/warc2text_main.cc b/warc2text_main.cc
@@ -13,21 +13,13 @@
 
 using namespace warc2text;
 
-struct Options {
+struct Options : WARCPreprocessorOptions { // command-line options; main() passes this object to WARCPreprocessor as its options
+    std::string file_list; // raw comma-separated value of --files; main() splits it into output_files
     std::vector<std::string> warcs;
-    std::string files;
-    std::string pdf_warc_filename;
-    bool paragraph_identification{};
-    bool verbose{};
-    bool silent{};
-    std::string output;
-    std::string tag_filters_filename;
-    bool tag_filters_invert{};
-    std::string url_filters_filename;
-    bool multilang{};
-    bool encodeURLs{};
     std::string classifier;
     std::string fasttext_model;
+    bool verbose{}; // bound to --verbose / -v
+    bool silent{}; // bound to --silent / -s
 };
 
 void parseArgs(int argc, char *argv[], Options& out) {
@@ -36,12 +28,13 @@ void parseArgs(int argc, char *argv[], Options& out) {
     desc.add_options()
         ("help,h", po::bool_switch(), "Show this help message")
         ("output,o", po::value(&out.output)->default_value("."), "Output folder")
-        ("files,f", po::value(&out.files)->default_value("url,token"), "List of output files separated by commas. Default (mandatory files): 'url,text'. Optional: 'mime,html'")
+        ("files,f", po::value(&out.file_list)->default_value("url,token"), "List of output files separated by commas. Default (mandatory files): 'url,text'. Optional: 'mime,html'")
         ("input,i", po::value(&out.warcs)->multitoken(), "Input WARC file name(s)")
         ("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters")
         ("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application")
         ("url-filters", po::value(&out.url_filters_filename), "Plain text file containing url filters")
         ("pdfpass", po::value(&out.pdf_warc_filename), "Write PDF records to WARC")
+        ("robotspass", po::value(&out.robots_warc_filename), "Write robots.txt records to WARC")
         ("paragraph-identification", po::bool_switch(&out.paragraph_identification)->default_value(false), "Add paragraph index in each b64encoded document as tab separated column")
         ("verbose,v", po::bool_switch(&out.verbose)->default_value(false), "Verbosity level")
         ("silent,s", po::bool_switch(&out.silent)->default_value(false))
@@ -73,6 +66,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
                 " --url-filters <filters_file>     File containing url filters\n"
                 "                                  Format: \"regexp\"\n"
                 " --pdfpass <output_warc>          Write PDF records to <output_warc>\n"
+                " --robotspass <output_warc>       Write Robots.txt records to <output_warc>\n"
                 " --encode-urls                    Encode URLs obtained from WARC records\n"
                 " --paragraph-identification       Add paragraph index for each sentence extracted from the html\n"
                 " -s                               Only output errors\n"
@@ -98,8 +92,8 @@ int main(int argc, char *argv[]) {
 
     // prepare list of output files
     std::vector<std::string> files_list;
-    boost::algorithm::split(files_list, options.files, [](char c) {return c == ',';});
-    std::unordered_set<std::string> output_files(files_list.begin(), files_list.end());
+    boost::algorithm::split(files_list, options.file_list, [](char c) {return c == ',';});
+    options.output_files.insert(files_list.begin(), files_list.end());
 
     std::unique_ptr<LanguageDetector> detector;
 
@@ -122,9 +116,7 @@ int main(int argc, char *argv[]) {
     }
 
     std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
-    WARCPreprocessor warcpproc(*detector, options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename,
-                               options.tag_filters_invert, options.url_filters_filename,
-                               options.encodeURLs, options.paragraph_identification);
+    WARCPreprocessor warcpproc(*detector, options);
     for (const std::string& file : options.warcs){
         warcpproc.process(file);
     }