-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
119 lines (94 loc) · 3.39 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from os.path import join
from urllib.parse import urljoin, urlsplit, unquote
import argparse
import os
import time
from bs4 import BeautifulSoup
from pathvalidate import sanitize_filepath, sanitize_filename
import requests
def check_for_redirect(response_url):
if response_url == urljoin(response_url, '/'):
raise requests.exceptions.HTTPError('Книга не найдена')
def parse_title_author(soup):
post_title = soup.find('h1').text
title, author = [word.strip() for word in post_title.split('::')]
return title, author
def parse_book_image_link(soup):
return soup.find(class_='bookimage').find('a').find('img').attrs['src']
def download_image(book_url, img_address, path='images'):
img_url = urljoin(book_url, img_address)
response = requests.get(img_url)
response.raise_for_status()
check_for_redirect(response.url)
os.makedirs(path, exist_ok=True)
filename = urlsplit(unquote(img_url)).path.split('/')[-1]
with open(f'{path}/{filename}', 'wb') as file:
file.write(response.content)
def parse_comments(soup):
comments = [
x.find(class_='black').text for x in soup.find_all(class_='texts')
]
return comments
def parse_genres(soup):
genres = [x.text for x in soup.find('span', class_='d_book').find_all('a')]
return genres
def parse_book_page(response_text):
soup = BeautifulSoup(response_text, 'lxml')
title, author = parse_title_author(soup)
img_address = parse_book_image_link(soup)
comments = parse_comments(soup)
genres = parse_genres(soup)
return {
'title': title,
'author': author,
'img_address': img_address,
'comments': comments,
'genres': genres,
}
def download_txt(book_url, counter, filename, folder='books/'):
book_file_url = urljoin(book_url, 'txt.php')
params = {'id': counter}
response = requests.get(book_file_url, params=params)
response.raise_for_status()
check_for_redirect(response.url)
folder = sanitize_filepath(folder, platform='auto')
os.makedirs(folder, exist_ok=True)
filename = sanitize_filename(filename)
file_path = join(folder, f'{filename}.txt')
with open(file_path, 'wb') as file:
file.write(response.content)
def main():
parser = argparse.ArgumentParser(
description='This script is used for '
'downloading books and related materials'
)
parser.add_argument(
'--start_id',
default=1,
type=int,
help='Set first ID the script will work with',
)
parser.add_argument(
'--end_id',
default=10,
type=int,
help='Set second ID the script will work with',
)
args = parser.parse_args()
netloc = 'https://tululu.org/'
for counter in range(args.start_id, args.end_id + 1):
try:
book_url = urljoin(netloc, f'b{counter}')
response = requests.get(book_url)
response.raise_for_status()
check_for_redirect(response.url)
parsed_book_page = parse_book_page(response.text)
download_image(book_url, parsed_book_page['img_address'])
download_txt(book_url, counter, parsed_book_page['title'])
except requests.exceptions.HTTPError as e:
print(e)
except requests.exceptions.ConnectionError as e:
print(e)
time.sleep(10)
if __name__ == '__main__':
main()