# gbd.py
import os
import urllib.request
import traceback
import regex as re
from time import sleep
from seleniumwire import webdriver
from progressbar import progressbar as bar
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
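# Assumed environment (a sketch inferred from the imports and the main block below,
# not pinned by the script itself): Python 3.6+, the selenium 3.x API
# (find_element_by_* methods), selenium-wire, the regex package, a progressbar
# package exposing progressbar.progressbar (e.g. progressbar2), and a
# chromedriver.exe placed next to this script.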
print("""
Google Books Downloader by @aprikyan, 2020.
. . . . . . . . . . .
""")
def get_book_url():
    """
    Asks the user for the URL of the book preview,
    extracts the volume ID from it, and returns two
    URLs: one for the book metadata, one for the pages.
    """
    url = input("""
Step 1: Paste the URL of the book preview to be downloaded.
(e.g. https://books.google.com/books?id=buc0AAAAMAAJ&printsec=frontcover&sa=X&ved=2ahUKEwj-y8T4r5vrAhWKLewKHaIQBnYQ6AEwAXoECAQQAg#v=onepage&f=false)
Your input: """)
    if re.findall(r"id=[A-Za-z0-9_-]+", url):
        id_part = re.findall(r"id=[A-Za-z0-9_-]+", url)[-1]
    else:
        print("Invalid input. Please try again.")
        # retry and return the result, otherwise id_part would be undefined below
        return get_book_url()
    return (f"https://books.google.com/books?{id_part}&pg=1&hl=en#v=onepage&q&f=false",
            f"https://books.google.com/books?{id_part}&pg=1&hl=en&f=false&output=embed&source=gbs_embed")
def get_book_data(url):
    """
    Inspects the opened book and returns a
    `str` with its title and author.
    """
    driver.get(url)
    driver.refresh()
    sleep(3)
    title = driver.find_element_by_class_name("gb-volume-title").text
    author = driver.find_element_by_class_name("addmd").text
    # `author` reads like "By ..."; lowercase the leading "B" so the
    # result becomes "Title (by Author)"
    return f"{title} (b{author[1:]})"
def capture_requests(url):
    """
    Scrolls through the whole book and returns
    the requests the driver made, as a string.
    """
    driver.get(url)
    driver.refresh()
    sleep(5)
    checkpoint = ""
    # keep scrolling until the page image container stops changing,
    # i.e. no new pages are being loaded
    while checkpoint != driver.find_element_by_class_name("pageImageDisplay"):
        checkpoint = driver.find_element_by_class_name("pageImageDisplay")
        checkpoint.click()
        # scrolling ~25 pages
        for i in range(25):
            html = driver.find_element_by_tag_name("body")
            html.click()
            html.send_keys(Keys.SPACE)
            sleep(2)
    return str(driver.requests)
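# selenium-wire records every HTTP(S) request the browser makes in
# driver.requests; stringifying that list keeps the page image URLs,
# which extract_urls() below pulls out with a regular expression.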
def extract_urls(requests):
    """
    Takes the driver's requests as input and
    returns a `dict` of page image URLs.
    """
    urls = re.findall(r"url='(https:\/\/[^']+content[^']+pg=[A-Z]+([0-9]+)[^']+)(&w=[0-9]+)'", requests)
    # swap the requested width for an arbitrarily large one so that
    # Google serves the highest resolution available
    return {int(url[1]): url[0] + "&w=69420" for url in urls}
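# For illustration (a made-up sample of the shape the regex expects, not a real
# capture): a recorded request like
#   url='https://books.google.com/books/content?id=buc0AAAAMAAJ&pg=PA7&img=1&sig=xyz&w=1280'
# would produce the entry
#   {7: "https://books.google.com/books/content?id=buc0AAAAMAAJ&pg=PA7&img=1&sig=xyz&w=69420"}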
def save_backup():
    """
    Asks the user whether to back up the available
    image URLs for later use. Does so, if yes.
    """
    save = input("""
Would you like to save a backup file (type Yes or No)?
Your input: """).upper()
    if save == "YES":
        with open(f"Backup of {book_data}.txt", "w") as f:
            f.write(str(all_pages))
        print(f"Successfully backed up the book in \"Backup of {book_data}.txt\"!")
    elif save != "NO":
        print("Invalid input. Please try again.")
        save_backup()
def select_pages(user_input, all_pages):
    """
    Takes the range of pages the user specified
    and the image URLs of all available pages,
    returns a `dict` with the selected pages only.
    """
    ranges = user_input.replace(" ", "").split(",")
    page_numbers = []
    if "all" in ranges:
        return all_pages
    while "odd" in ranges:
        page_numbers.extend([i for i in all_pages.items() if i[0] % 2])
        ranges.remove("odd")
    while "even" in ranges:
        page_numbers.extend([i for i in all_pages.items() if i[0] % 2 == 0])
        ranges.remove("even")
    for segment in ranges:
        if "-" in segment:
            a, b = segment.split("-")
            page_numbers.extend([i for i in all_pages.items() if int(a) <= i[0] <= int(b)])
        elif int(segment) in all_pages.keys():
            page_numbers.append((int(segment), all_pages[int(segment)]))
    return dict(set(page_numbers))
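# For illustration: with user_input = "1, 10-12, even" and pages 1..13 available,
# the result contains pages 1, 10, 11, 12 plus every even page in 1..13;
# duplicates are collapsed by dict(set(...)).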
def get_cookie(url):
    """
    The driver needs to behave like a real
    user to GET the page images. This function
    returns a cookie to bribe Google with.
    """
    cookies = []
    driver.get(url)
    driver.refresh()
    for request in driver.requests:
        if request.headers:
            if "Cookie" in request.headers.keys():
                cookies.append(request.headers["Cookie"])
    return cookies[0]
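# The value returned here is the raw Cookie header of one of the browser's own
# requests (a "; "-separated string of name=value pairs); download_imgs() below
# replays it verbatim so the image requests look like they come from the same session.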
def download_imgs(pages, cookie, directory):
    """
    Takes the `dict` of pages to download,
    the cookie to use and the directory
    to save to, and then does the magic.
    """
    # an empty ProxyHandler bypasses any system-wide proxy settings
    proxy = urllib.request.ProxyHandler({})
    opener = urllib.request.build_opener(proxy)
    # send a browser-like User-Agent plus the captured cookie,
    # otherwise the image requests may be rejected
    opener.addheaders = [("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30"),
                         ("cookie", cookie)]
    urllib.request.install_opener(opener)
    for number, url in bar(pages.items()):
        urllib.request.urlretrieve(url, os.path.join(directory, f"page{number}.png"))
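# For illustration (hypothetical arguments): a call like
#   download_imgs({5: url5, 7: url7}, cookie, "Some Title (by Jane Doe)")
# saves page5.png and page7.png into the "Some Title (by Jane Doe)" folder.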
def step1():
    global book_data, all_pages
    from_url = input("""
Would you like to download a book from a URL? Type No if you have a backup, otherwise type Yes.
Your input: """).upper()
    if from_url == "YES":
        data_url, pages_url = get_book_url()
        book_data = get_book_data(data_url)
        print(f"\nWe will now process the pages of \"{book_data}\" one by one. Sit back and relax, as this may take some time, depending on the number of pages.\n")
        reqs = capture_requests(pages_url)
        all_pages = extract_urls(reqs)
        print("""Now that most of the job is done (yahoo!), it is highly recommended to back up the progress made so far, so that it is not lost if an error is thrown afterward.
Also, if you would like to download another segment of this book later, the backup will save you the waiting time.""")
        save_backup()
    elif from_url == "NO":
        backup = input("""
Enter the location of the backup file.
(e.g. C:/Users/User/Downloads/Backup_of_booktitle.txt)
Your input: """)
        try:
            book_data = os.path.basename(backup)[10:-4]
            all_pages = eval(open(backup).read())
        except Exception:
            print("Invalid input. Please try again.")
            step1()
    else:
        print("Invalid input. Please try again.")
        step1()
def step2():
    global selected_pages, cookie
    selection = input("""
Step 2: Specify the pages to be downloaded. You may use combinations of:
- exact numbers (e.g. 5, 3, 16)
- ranges (e.g. 11-13, 1-100)
- the keywords odd and/or even, to download odd or even pages respectively
- the keyword all, to download all available pages
- commas to separate the tokens
Your input may look like "1, 10-50, odd, 603".
Note that only pages available for preview will be downloaded.
Your input: """)
    try:
        selected_pages = select_pages(selection, all_pages)
    except Exception:
        print("Invalid input. Please try again.")
        # retry and stop here, so the work below is not repeated
        return step2()
    # it's a surprise tool that will help us later
    cookie = get_cookie(list(all_pages.items())[0][1])
def step3():
    main_directory = input("""
Step 3 (optional): Specify the location to download the book pages to (a new folder will be created in that directory).
Press ENTER to save them right here.
Your input: """)
    try:
        new_directory = os.path.join(main_directory, book_data)
        if not os.path.exists(new_directory):
            os.mkdir(new_directory)
    except Exception:
        print("Invalid input. Please try again.")
        # retry and stop here, otherwise new_directory would be undefined below
        return step3()
    print(f"\nWe will now download all {len(selected_pages)} pages you selected. This will take a minute or two.\n")
    download_imgs(selected_pages, cookie, new_directory)
if __name__ == "__main__":
    global driver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
    chrome_options.add_experimental_option("prefs", {"safebrowsing.enabled": True})
    driver = webdriver.Chrome("chromedriver.exe", options=chrome_options)
    try:
        step1()
        step2()
        step3()
    except Exception:
        with open("google-books-downloader crash.log", "w") as log:
            log.write(traceback.format_exc())
        print(f"""
Something went wrong :/
Please make sure that:
- you are connected to the Internet
- the book you are trying to download has a preview
- you entered a valid URL of a Google Books book
- your inputs follow the required formatting
- you have permission to save/create files in this and the download directories
If the problem persists and you think this is an error, please report it on github.com/aprikyan/google-books-downloader.
When reporting, do not forget to attach the following file to the issue:
{os.path.join(os.getcwd(), "google-books-downloader crash.log")}
""")
    else:
        print(f"""
The selected pages were successfully downloaded into the "{book_data}" folder!
Note that for your convenience the pages are saved as images. If you would like to combine them into a PDF (or another format), this can be done with specialized websites and apps.""")
        # combining into a PDF would involve asking about DPI, page size, etc.,
        # and would take a lot of time and RAM, so it is better left to the user