parse_tululu_category.py
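
"""Scrape the tululu.org sci-fi category (l55): collect links to the listed
books, then download each book's text, cover image and parsed metadata."""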
from pathlib import Path
from urllib.parse import urljoin, urlsplit
import argparse
import json
import os
import time

from bs4 import BeautifulSoup
import requests

from main import (
    check_for_redirect,
    parse_book_page,
    download_image,
    download_txt,
)


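# Collect absolute links to the book pages listed on one category page.
# The selectors assume the tululu.org layout: each book sits in its own
# <table> inside div#content, and the first <a> in that table points to
# the book's page.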
def parse_books_by_page_link(base_url, link):
    response = requests.get(link)
    response.raise_for_status()
    check_for_redirect(response.url)
    soup = BeautifulSoup(response.text, 'lxml')
    tables = soup.select('div#content table')
    links = [
        urljoin(base_url, table.select_one('a')['href'].rstrip('/'))
        for table in tables
    ]
    return links


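# argparse "type" callable: accept a value only if it is an existing directory.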
def dir_path(path):
    if os.path.isdir(path):
        return path
    raise argparse.ArgumentTypeError(f'{path} is not a valid directory')


def create_parser():
    parser = argparse.ArgumentParser(
        description='Download books and their covers '
                    'from the tululu.org sci-fi category',
    )
    parser.add_argument(
        '--start_page',
        default=1,
        type=int,
        help='First category page to parse',
    )
    parser.add_argument(
        '--end_page',
        default=701,
        type=int,
        help='Last category page to parse (inclusive)',
    )
    base_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument(
        '--dest_folder',
        default=base_dir,
        type=dir_path,
        help='Destination folder for downloaded files',
    )
    parser.add_argument(
        '--skip_imgs',
        action='store_true',
        help='Skip downloading book cover images',
    )
    parser.add_argument(
        '--skip_txt',
        action='store_true',
        help='Skip downloading book texts',
    )
    parser.add_argument(
        '--json_path',
        default=base_dir,
        type=dir_path,
        help='Folder for the books_description.json file',
    )
    return parser


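# Walk the category pages, collect book links, then download each book's
# text and cover (unless skipped) and dump all parsed metadata to JSON.
# Connection errors are logged and followed by a 10-second pause.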
def main():
    parser = create_parser()
    args = parser.parse_args()
    base_url = 'https://tululu.org/'

    links = []
    for page in range(args.start_page, args.end_page + 1):
        try:
            # l55 is the science fiction category on tululu.org.
            sci_fi_page_address = urljoin(base_url, f'l55/{page}')
            links += parse_books_by_page_link(base_url, sci_fi_page_address)
        except requests.exceptions.HTTPError as e:
            print(e)
        except requests.exceptions.ConnectionError as e:
            print(e)
            time.sleep(10)

    book_descriptions = []
    for book_url in links:
        try:
            response = requests.get(book_url)
            response.raise_for_status()
            check_for_redirect(response.url)
            parsed_book_page = parse_book_page(response.text)
            # Book pages look like https://tululu.org/b239/, so the id is
            # the path with the slashes and the leading 'b' stripped out.
            book_id = urlsplit(book_url).path.replace('/', '').replace('b', '')
            if not args.skip_txt:
                books_folder = Path(args.dest_folder).joinpath('media/books')
                download_txt(
                    book_url,
                    book_id,
                    parsed_book_page['title'],
                    str(books_folder),
                )
            if not args.skip_imgs:
                images_folder = Path(args.dest_folder).joinpath('media/images')
                download_image(
                    book_url,
                    parsed_book_page['img_address'],
                    str(images_folder),
                )
            book_descriptions.append(parsed_book_page)
        except requests.exceptions.HTTPError as e:
            print(e)
        except requests.exceptions.ConnectionError as e:
            print(e)
            time.sleep(10)

    json_path = Path(args.json_path).joinpath('books_description.json')
    with open(json_path, 'w', encoding='utf8') as json_file:
        json.dump(book_descriptions, json_file, ensure_ascii=False)


if __name__ == '__main__':
    main()
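
# Example invocation (a sketch; the folders passed to --dest_folder and
# --json_path must already exist, since dir_path() rejects missing paths):
#   python parse_tululu_category.py --start_page 1 --end_page 5 \
#       --dest_folder ./downloads --json_path ./downloads --skip_imgs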