Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --robotspass shunt for records related to robots.txt #43

Merged
merged 6 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 75 additions & 38 deletions src/warcpreprocessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,67 @@
#include <boost/log/trivial.hpp>
#include <boost/algorithm/string/predicate.hpp>

namespace {
    // Exact path (and nothing after it) that identifies a robots.txt resource.
    const std::string kRobotsTxtPath = "/robots.txt";

    /**
     * Tells whether a record's URL points at the site-level robots.txt file.
     *
     * The URL may be absolute ("https://host/robots.txt"), protocol-relative
     * ("//host/robots.txt") or schemeless ("host/robots.txt"). Only an exact
     * "/robots.txt" path with nothing after it (no sub-path, query or
     * fragment) is accepted.
     *
     * @param record record whose getURL() is inspected
     * @return true if the path component of the URL is exactly "/robots.txt"
     */
    bool isRobotsTxt(const warc2text::Record &record) {
        const auto &url = record.getURL();

        // Locate where the host starts. "://" only counts as the scheme
        // separator when it precedes every other '/', '?' and '#'; otherwise
        // it belongs to a path, query or fragment (e.g. "host/a?next=http://b/robots.txt").
        std::string::size_type host_offset = 0;
        const auto scheme_end = url.find("://");
        if (scheme_end != std::string::npos && url.find_first_of("/?#") == scheme_end + 1) {
            host_offset = scheme_end + 3; // len(://)
        }
        // maybe it is a protocol-relative url, i.e. //hostname?
        else if (url.compare(0, 2, "//") == 0) {
            host_offset = 2; // len(//)
        }
        // Otherwise assume there is no protocol and the URL starts with the hostname.

        // The path starts at the first '/' after the host. If a query or
        // fragment starts first, the URL has an empty path and any later '/'
        // is not a path separator.
        const auto path_offset = url.find_first_of("/?#", host_offset);
        if (path_offset == std::string::npos || url[path_offset] != '/')
            return false;

        // Everything from the path onwards must be exactly "/robots.txt".
        return url.compare(path_offset, std::string::npos, kRobotsTxtPath) == 0;
    }
}

namespace warc2text {
// File-name extensions (leading dot included) held in the static removeExtensions set.
// NOTE(review): presumably consulted by URLfilter to drop non-text resources — the
// lookup itself is not visible in this hunk, confirm against the full file.
const std::unordered_set<std::string> WARCPreprocessor::removeExtensions = {
    ".jpg",
    ".jpeg",
    ".gif",
    ".png",
    ".css",
    ".js",
    ".mp3",
    ".mp4",
    ".flv",
    ".wmv",
    ".gz",
    ".zip",
    ".rar"
};

WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector,
const std::string& outputFolder, const std::unordered_set<std::string>& output_files,
const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert,
const std::string& urlFiltersFile, bool encodeURLs,
bool paragraph_identification) :
WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, WARCPreprocessorOptions const &options) :
detector(detector),
writer(outputFolder, output_files),
options(options),
writer(options.output, options.output_files),
totalRecords(0),
textRecords(0),
langRecords(0),
totalBytes(0),
textBytes(0),
langBytes(0),
tagFilters(),
pdf_warc_filename(pdf_warc_filename),
invert(invert),
encodeURLs(encodeURLs),
paragraph_identification(paragraph_identification) {
if (!tagFiltersFile.empty())
util::readTagFiltersRegex(tagFiltersFile, tagFilters);

if (!urlFiltersFile.empty())
util::readUrlFiltersRegex(urlFiltersFile, urlFilter);
tagFilters() {
if (!options.tag_filters_filename.empty())
util::readTagFiltersRegex(options.tag_filters_filename, tagFilters);

if (!options.url_filters_filename.empty())
util::readUrlFiltersRegex(options.url_filters_filename, urlFilter);
}

// true if url is good
bool WARCPreprocessor::URLfilter(const std::string& url) {
bool WARCPreprocessor::URLfilter(const std::string& url) const {
if (boost::algorithm::ends_with(url, "robots.txt"))
return false;

Expand All @@ -51,7 +81,6 @@ namespace warc2text {
return true;
}


void WARCPreprocessor::process(const std::string& filename) {
BOOST_LOG_TRIVIAL(info) << "Processing " << filename;
WARCReader reader(filename);
Expand All @@ -60,18 +89,31 @@ namespace warc2text {
bool done = false;
int n_langs = 0;

bool pdfpass = !pdf_warc_filename.empty();
WARCWriter pdf_warc_writer;
if (!options.pdf_warc_filename.empty())
pdf_warc_writer.open(options.pdf_warc_filename);

WARCWriter robots_warc_writer;
if (!options.robots_warc_filename.empty())
robots_warc_writer.open(options.robots_warc_filename);

while (!done) {
done = !reader.getRecord(content);

// Note that content.empty() will also be true when len(record) > max_size (which is 20MB by default)
if (done or content.empty())
continue;

Record record(content);
if (record.getPayload().empty())
continue;

// Pick out all robots.txt related records.
if (::isRobotsTxt(record)) {
robots_warc_writer.writeRecord(content); // no-op if robots_warc_writer is not opened.
continue;
}

if (record.getRecordType() != "response" && record.getRecordType() != "resource")
continue;

Expand All @@ -82,19 +124,8 @@ namespace warc2text {
// PDFs that have gone through bitextor-warc2htmlwarc.py will have URL ending in .pdf but text HTTP content type
if (not record.isTextFormat() and (boost::algorithm::ends_with(record.getURL(), ".pdf") or record.getHTTPcontentType() == "application/pdf")) {
// found a PDF file, write record to disk and continue
if (pdfpass) {
// Work-around for https://github.com/bitextor/warc2text/issues/16 for ParaCrawl
// we do not really have a use case for massive PDFs at this moment. Skip em.
if (content.size() >= static_cast<std::size_t>(std::numeric_limits<uInt>::max())) {
BOOST_LOG_TRIVIAL(info) << "PDF too large to compress with util::GZCompress";
continue;
}

if (!pdf_warc_writer.is_open())
pdf_warc_writer.open(pdf_warc_filename);

pdf_warc_writer.writeRecord(content);
}
// this is a no-op if pdf_warc_writer is not opened.
pdf_warc_writer.writeRecord(content);
continue;
}

Expand All @@ -104,7 +135,7 @@ namespace warc2text {
if (!URLfilter(record.getURL()))
continue;

if (encodeURLs)
if (options.encodeURLs)
record.encodeURL();

BOOST_LOG_TRIVIAL(trace) << "Processing HTML document " << record.getURL() << "\n";
Expand All @@ -123,7 +154,7 @@ namespace warc2text {
continue;
}

if ((clean_retval == util::FILTERED_DOCUMENT_ERROR) != invert) {
if ((clean_retval == util::FILTERED_DOCUMENT_ERROR) != options.tag_filters_invert) {
BOOST_LOG_TRIVIAL(info) << "Record " << record.getURL() << " discarded due to tag filters";
continue;
} else if (clean_retval == util::HTML_PARSING_ERROR) {
Expand Down Expand Up @@ -169,9 +200,8 @@ namespace warc2text {

langRecords += n_langs;

writer.write(record, paragraph_identification);
writer.write(record, options.paragraph_identification);
}
pdf_warc_writer.close();
}

void WARCPreprocessor::printStatistics() const{
Expand All @@ -188,12 +218,19 @@ namespace warc2text {
warc = nullptr;
}

// Delegates to close() on destruction, so a writer that goes out of scope
// does not need an explicit close() call from its owner.
WARCWriter::~WARCWriter() {
    this->close();
}

void WARCWriter::open(const std::string& warc_filename) {
filename = warc_filename;
if (not boost::algorithm::ends_with(filename, ".warc.gz"))
filename += ".warc.gz";
std::string folder = filename.substr(0, filename.find_last_of('/'));
util::createDirectories(folder);
auto filename_offset = filename.find_last_of('/');
if (filename_offset != std::string::npos) {
std::string folder = filename.substr(0, filename_offset);
util::createDirectories(folder);
}
warc = std::fopen(filename.c_str(), "wb");
}

Expand Down
34 changes: 23 additions & 11 deletions src/warcpreprocessor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,35 @@ namespace warc2text {
std::string filename;
public:
WARCWriter();
~WARCWriter();
void open(const std::string& warc_filename);
void close();
bool is_open();
void writeRecord(const std::string& content);
};

// Plain bundle of settings handed to WARCPreprocessor. Strings default to
// empty and flags to false; an empty file name disables the matching feature.
struct WARCPreprocessorOptions {
    // WARC file that PDF records are written to; not opened when empty.
    std::string pdf_warc_filename;
    // WARC file that robots.txt records are written to; not opened when empty.
    std::string robots_warc_filename;

    // Forwarded to the writer's write() call for every written record.
    bool paragraph_identification = false;

    // Output location and the set of output file kinds, both passed to the writer.
    std::string output;
    std::unordered_set<std::string> output_files;

    // Tag filter definitions are loaded from this file when it is non-empty.
    std::string tag_filters_filename;
    // Inverts the outcome of the tag-filter check.
    bool tag_filters_invert = false;

    // URL filter definitions are loaded from this file when it is non-empty.
    std::string url_filters_filename;

    // NOTE(review): not referenced in the visible preprocessor code — confirm where it is consumed.
    bool multilang = false;
    // When set, record.encodeURL() is called before a record is processed.
    bool encodeURLs = false;
};

class WARCPreprocessor {
private:
LanguageDetector const &detector;
WARCPreprocessorOptions const &options;
BilangWriter writer;
unsigned int totalRecords;
unsigned int textRecords;
Expand All @@ -35,20 +55,12 @@ namespace warc2text {
unsigned int langBytes;
util::umap_tag_filters_regex tagFilters;
boost::regex urlFilter;
std::string pdf_warc_filename;
bool invert;
bool encodeURLs;
bool paragraph_identification;


static const std::unordered_set<std::string> removeExtensions;
bool URLfilter(const std::string& url);
bool URLfilter(const std::string& url) const;

public:
explicit WARCPreprocessor(LanguageDetector const &detector,
const std::string& outputFolder, const std::unordered_set<std::string>& output_files = {},
const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "",
bool invert = false, const std::string& urlFiltersFile = "",
bool encodeURLs = false, bool paragraph_identification = false);
explicit WARCPreprocessor(LanguageDetector const &detector, WARCPreprocessorOptions const &options);
void process(const std::string &filename);
void printStatistics() const;
};
Expand Down
28 changes: 10 additions & 18 deletions warc2text_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,13 @@

using namespace warc2text;

struct Options {
struct Options : WARCPreprocessorOptions {
std::string file_list;
std::vector<std::string> warcs;
std::string files;
std::string pdf_warc_filename;
bool paragraph_identification{};
bool verbose{};
bool silent{};
std::string output;
std::string tag_filters_filename;
bool tag_filters_invert{};
std::string url_filters_filename;
bool multilang{};
bool encodeURLs{};
std::string classifier;
std::string fasttext_model;
bool verbose{};
bool silent{};
};

void parseArgs(int argc, char *argv[], Options& out) {
Expand All @@ -36,12 +28,13 @@ void parseArgs(int argc, char *argv[], Options& out) {
desc.add_options()
("help,h", po::bool_switch(), "Show this help message")
("output,o", po::value(&out.output)->default_value("."), "Output folder")
("files,f", po::value(&out.files)->default_value("url,token"), "List of output files separated by commas. Default (mandatory files): 'url,text'. Optional: 'mime,html'")
("files,f", po::value(&out.file_list)->default_value("url,token"), "List of output files separated by commas. Default (mandatory files): 'url,text'. Optional: 'mime,html'")
("input,i", po::value(&out.warcs)->multitoken(), "Input WARC file name(s)")
("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters")
("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application")
("url-filters", po::value(&out.url_filters_filename), "Plain text file containing url filters")
("pdfpass", po::value(&out.pdf_warc_filename), "Write PDF records to WARC")
("robotspass", po::value(&out.robots_warc_filename), "Write robots.txt records to WARC")
("paragraph-identification", po::bool_switch(&out.paragraph_identification)->default_value(false), "Add paragraph index in each b64encoded document as tab separated column")
("verbose,v", po::bool_switch(&out.verbose)->default_value(false), "Verbosity level")
("silent,s", po::bool_switch(&out.silent)->default_value(false))
Expand Down Expand Up @@ -73,6 +66,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
" --url-filters <filters_file> File containing url filters\n"
" Format: \"regexp\"\n"
" --pdfpass <output_warc> Write PDF records to <output_warc>\n"
" --robotspass <output_warc> Write Robots.txt records to <output_warc>\n"
" --encode-urls Encode URLs obtained from WARC records\n"
" --paragraph-identification Add paragraph index for each sentence extracted from the html\n"
" -s Only output errors\n"
Expand All @@ -98,8 +92,8 @@ int main(int argc, char *argv[]) {

// prepare list of output files
std::vector<std::string> files_list;
boost::algorithm::split(files_list, options.files, [](char c) {return c == ',';});
std::unordered_set<std::string> output_files(files_list.begin(), files_list.end());
boost::algorithm::split(files_list, options.file_list, [](char c) {return c == ',';});
options.output_files.insert(files_list.begin(), files_list.end());

std::unique_ptr<LanguageDetector> detector;

Expand All @@ -122,9 +116,7 @@ int main(int argc, char *argv[]) {
}

std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
WARCPreprocessor warcpproc(*detector, options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename,
options.tag_filters_invert, options.url_filters_filename,
options.encodeURLs, options.paragraph_identification);
WARCPreprocessor warcpproc(*detector, options);
for (const std::string& file : options.warcs){
warcpproc.process(file);
}
Expand Down