-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_to_mongo.py
112 lines (93 loc) · 3.44 KB
/
pdf_to_mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from PyPDF2 import PdfFileReader
import pdftitle
# import pdfx
import os
import datetime
import pprint
from bson.binary import Binary
from pdf2image import convert_from_path
from tesserocr import PyTessBaseAPI, iterate_level, RIL
import gridfs
# folderpath = r"./books"
# filepaths = [os.path.join(folderpath,name) for name in os.listdir(folderpath)]
# Scratch image used by orcTitle: the first PDF page is saved here as a JPEG
# (relative to the current working directory), OCR'd, and then deleted.
temp_file = ".temp_frontpage.jpg"
def extractTitle(fp, pdf, page):
    """Extract the document title from the PDF text layer, if it has one.

    The page is considered text-processable only when its resource
    dictionary declares at least one font; otherwise there is nothing for
    pdftitle to analyse and the caller is expected to fall back to OCR.

    Args:
        fp: Open binary file object of the PDF; handed to pdftitle.
        pdf: Reader object exposing ``getPage(index)``.
        page: Index of the page to inspect for fonts. It is only used for
            the font check; pdftitle is given the whole stream.

    Returns:
        The title reported by ``pdftitle.get_title_from_io``, or ``None``
        when the page has no ``/Resources`` entry or declares no ``/Font``.
    """
    page_data = pdf.getPage(page)

    # Index (rather than .get) so the reader's own item lookup is used, and
    # treat a page without a /Resources entry as "no text layer" instead of
    # letting the KeyError abort the whole import.
    try:
        resources = page_data['/Resources']
    except KeyError:
        return None

    # No fonts means no text to analyse on this page.
    if '/Font' not in resources:
        return None

    # Let pdftitle work out the title from the document stream.
    return pdftitle.get_title_from_io(fp)
def _largestTextWords(lines, tolerance=15):
    """Return the words of every line set in (nearly) the largest font.

    Args:
        lines: Iterable of ``(text, point_size)`` pairs, one per OCR line.
        tolerance: A line is kept when its point size is greater than the
            largest size seen minus this many points.

    Returns:
        A flat list of whitespace-separated words, in input order.
    """
    # Lines with at most one non-whitespace character are treated as noise
    # and take part in neither the maximum nor the result.
    candidates = []
    for text, size in lines:
        words = text.split()
        if len(''.join(words)) > 1:
            candidates.append((words, size))

    # The maximum starts at 0, so non-positive sizes never raise it.
    max_size = max([0] + [size for _, size in candidates])

    title_words = []
    for words, size in candidates:
        if size > max_size - tolerance:
            title_words.extend(words)
    return title_words


def orcTitle(path):
    """Guess a PDF's title by OCR-ing its first page.

    The first page is rendered to the scratch file named by ``temp_file``,
    recognised line by line, and the lines printed in the largest font
    (within 15 points of the maximum) are joined into the title.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The title words joined by single spaces; an empty string when no
        usable line was recognised.
    """
    # pdf2image page numbers are 1-based; render only the first page.
    page = convert_from_path(path, first_page=1, last_page=1)[0]

    lines = []
    try:
        page.save(temp_file, 'JPEG')
        with PyTessBaseAPI() as api:
            api.SetImageFile(temp_file)
            api.Recognize()  # required before the iterator yields results
            level = RIL.TEXTLINE
            for r in iterate_level(api.GetIterator(), level):
                text = r.GetUTF8Text(level)
                attributes = r.WordFontAttributes()
                # Skip lines for which the engine reports no font data
                # instead of failing on the subscript below.
                if attributes is None:
                    continue
                lines.append((text, attributes['pointsize']))
    finally:
        # Always clean up the scratch image, even if OCR failed.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return ' '.join(_largestTextWords(lines))
def pdfToMongo(path, collection):
    """Insert the PDF at ``path`` into ``collection`` together with a title.

    The title is taken from the document metadata when present; otherwise it
    is extracted from the first page's text layer via ``extractTitle``; and
    if that also yields nothing, it is recovered by OCR via ``orcTitle``.

    Args:
        path: Filesystem path of the file to import. Files whose extension
            is not ``.pdf`` (compared case-insensitively) are skipped.
        collection: Object exposing ``insert_one(document)`` whose result
            has an ``inserted_id`` attribute (presumably a pymongo
            collection - confirm against the caller).

    Returns:
        The ``inserted_id`` of the stored document, or ``None`` when the
        file was skipped because it is not a PDF.
    """
    # print filepath
    print(path)

    # make sure it's a pdf
    extension = os.path.splitext(path)[1]
    if extension.lower() != ".pdf":
        print("\tNOT A PDF")
        return None

    with open(path, 'rb') as fp:
        # Capture the raw bytes first, while the stream is still at offset 0.
        # Reading after the PDF parser has moved the file position would
        # store only part of the file.
        # NOTE(review): the whole file is embedded in a single document;
        # very large PDFs may exceed the store's document size limit.
        encoded = Binary(fp.read())
        fp.seek(0)

        # use pypdf2 to read metadata
        pdf = PdfFileReader(fp)
        info = pdf.getDocumentInfo()

        # use title from metadata; the info dictionary may be absent
        title = info.title if info is not None else None

        # if not present, extract the title from the first page's text;
        # this must happen while the file is still open
        if not title:
            title = extractTitle(fp, pdf, 0)

    # if it still cannot be extracted, attempt OCR on the rendered page
    if not title:
        title = orcTitle(path)

    post = {
        "title": title,
        "import_date": datetime.datetime.utcnow(),
        "file": encoded,
        "tags": ["pdf"]
    }

    # insert the document and hand its id back to the caller
    return collection.insert_one(post).inserted_id