-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPoC.py
38 lines (36 loc) · 2.08 KB
/
PoC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# This is a PoC script for downloading PDF of pages of a single category.
# It works by directly parsing links info on a wiki category page.
# If the target wiki's API (api.php) isn't available,
# try modifying and running the following code instead.
# Requires BeautifulSoup besides the dependencies specified in `setup.cfg`.
import re
import os
import urllib.request
import urllib.parse
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import pdfkit
# All generated PDFs go into ./artifacts (relative to where the script was
# started). Create the directory if it is missing and make it the working
# directory, so the relative output paths used later land inside it.
dir_path = os.path.join(os.getcwd(), 'artifacts')
os.makedirs(dir_path, exist_ok=True)  # no error when the directory already exists
os.chdir(dir_path)
# Category listing pages to scrape.
# A large category is split by the wiki into several listings connected by
# 'previous page' / 'next page' links. Follow those links in a browser and add
# the URL of EVERY listing here (they differ in the 'pageuntil' / 'pagefrom'
# query parameters).
html_pages = [
    # listing that ends at the 'pageuntil' title
    "https://magireco.moe/index.php?title=Category:%E9%AD%94%E6%B3%95%E7%BA%AA%E5%BD%95%E4%B8%BB%E7%BA%BF%E5%89%A7%E6%83%85&pageuntil=%E4%B8%BB%E7%BA%BF%E5%89%A7%E6%83%85%2F%E7%AC%AC8%E7%AB%A0%2F43%E8%AF%9D#mw-pages",
    # listing that starts at the 'pagefrom' title
    "https://magireco.moe/index.php?title=Category:%E9%AD%94%E6%B3%95%E7%BA%AA%E5%BD%95%E4%B8%BB%E7%BA%BF%E5%89%A7%E6%83%85&pagefrom=%E4%B8%BB%E7%BA%BF%E5%89%A7%E6%83%85%2F%E7%AC%AC8%E7%AB%A0%2F43%E8%AF%9D#mw-pages",
]
# Walk every category listing page, collect the article links found on it and
# render each article's printable view to "<title>.pdf" in the current
# directory. Conversions are submitted to a thread pool so several pages are
# processed concurrently; leaving the `with` block waits for all of them.
with ThreadPoolExecutor(max_workers=10) as pool:
    jobs = {}  # future -> output file name, used to report failures below
    for page in html_pages:
        # Close the HTTP response as soon as the listing has been parsed.
        with urllib.request.urlopen(page) as response:
            soup = BeautifulSoup(response, "html.parser")
        # HTML element with links to category pages. May differ between wikis
        # and can be viewed via F12.
        for anchor in soup.select('div.mw-category li a'):
            href = anchor.get('href')
            if not href:
                continue  # anchor without a target: nothing to download
            link = urllib.parse.unquote(href)
            # Used to retrieve the title of the page; the format may differ
            # between wikis and may require manual adjustment. Only a leading
            # '/wiki/' is stripped so titles containing that text stay intact.
            title = link[len('/wiki/'):] if link.startswith('/wiki/') else link
            # Percent-encode the title so characters such as '&', '#', '?' or
            # spaces cannot break the query string.
            quoted_title = urllib.parse.quote(title)
            # Replace the site domain with your wiki's.
            url = f"https://magireco.moe/index.php?title={quoted_title}&printable=yes"
            # Sub-page separators would otherwise be read as directories.
            name = title.replace('/', '_')
            job = pool.submit(pdfkit.from_url, url, output_path=f'{name}.pdf')
            jobs[job] = name

# Every job has finished here. Report conversions that raised instead of
# dropping the errors silently; one bad page must not abort the whole run.
for job, name in jobs.items():
    error = job.exception()
    if error is not None:
        print(f"failed {name}.pdf: {error}")