forked from JoshData/pdf-redactor
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsmoketest.py
executable file
·77 lines (63 loc) · 1.73 KB
/
smoketest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
# Example script to check whether pdf-redactor crashes on a PDF.
from __future__ import print_function
import io
import os
import multiprocessing
import pdfrw
import re
import sys
import traceback
import xml.etree.ElementTree
import pdf_redactor
# Pick a progress-bar wrapper: tqdm's GUI variant if it can be imported,
# otherwise the plain tqdm bar, otherwise a pass-through.
try:
    from tqdm import tqdm_gui as tqdm
except ImportError:
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(it):
            """Fallback used when tqdm is unavailable: return the iterable unchanged."""
            return it
def metadata_filter(value):
    """Return *value* unchanged, or None when it is a list or a dict.

    Used as the metadata filter callback in ``smoke_test_file``.
    """
    return None if isinstance(value, (list, dict)) else value
def smoke_test_file(path):
    """Run pdf_redactor over one PDF and report known failure types on stderr.

    The redactor is configured with a content filter whose replacement
    function returns each matched run of word characters unchanged, and with
    ``metadata_filter`` registered under the "ALL" metadata key. Output is
    written to an in-memory buffer that is discarded afterwards.

    Args:
        path: Path of the file to open (in binary mode) and feed to the
            redactor.

    Raises:
        Exceptions listed in the ``except`` clause below are printed (class
        name, path and traceback) to stderr and swallowed. Anything else,
        including errors from opening the file, propagates to the caller.
    """
    options = pdf_redactor.RedactorOptions()
    options.output_stream = io.BytesIO()
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    options.content_filters = [(re.compile(r"\w+"), lambda match: match.group(0))]
    options.metadata_filters = {"ALL": [metadata_filter]}

    # The context manager guarantees the input file is closed on every path.
    with open(path, "rb") as input_stream:
        options.input_stream = input_stream
        try:
            pdf_redactor.redactor(options)
        except (pdfrw.errors.PdfParseError,
                IndexError,
                AssertionError,
                xml.etree.ElementTree.ParseError,
                TypeError,
                AttributeError,
                StopIteration,
                ValueError) as e:
            print("{0} while reading {1}".format(e.__class__.__name__, path), file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
def gen_filenames(paths):
    """Yield the files to test for each entry in *paths*.

    An entry that is a regular file is yielded as-is, whatever its extension.
    An entry that is a directory is walked recursively and every file whose
    name ends in ".pdf" (case-insensitive) is yielded with its directory
    joined on. Entries that are neither are skipped silently.
    """
    for entry in paths:
        if os.path.isfile(entry):
            yield entry
            continue
        if not os.path.isdir(entry):
            continue
        for root, _subdirs, files in os.walk(entry):
            for filename in files:
                if filename.lower().endswith(".pdf"):
                    yield os.path.join(root, filename)
def _report_worker_failure(path, exc):
    """Print to stderr an exception that escaped smoke_test_file in a worker."""
    print("Unexpected {0} while reading {1}".format(exc.__class__.__name__, path), file=sys.stderr)
    print("".join(traceback.format_exception(type(exc), exc, exc.__traceback__)), file=sys.stderr)


def main(paths, max_pending=20):
    """Smoke-test every file found under *paths* using a process pool.

    Args:
        paths: Files and/or directories, expanded with ``gen_filenames``.
        max_pending: Once more than this many submitted tasks are
            outstanding, the oldest one is waited on before submitting
            further work. Defaults to 20.

    Exceptions raised by a worker task are reported on stderr through an
    error callback; they do not stop the run.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    with multiprocessing.Pool() as pool:
        pending = deque()
        # The file list is materialised before iterating; presumably so the
        # progress bar can know the total up front.
        for fn in tqdm(list(gen_filenames(paths))):
            pending.append(pool.apply_async(
                smoke_test_file,
                [fn],
                # Results are only wait()ed on, never fetched with get(), so
                # without this callback a worker exception would vanish.
                # fn is bound as a default to avoid late-binding in the loop.
                error_callback=lambda exc, fn=fn: _report_worker_failure(fn, exc),
            ))
            if len(pending) > max_pending:
                pending.popleft().wait()
        # Let all remaining tasks finish before the context manager exits.
        pool.close()
        pool.join()
# Script entry point: treat every command-line argument as a file or
# directory to smoke-test.
if __name__ == "__main__":
    main(sys.argv[1:])