img_crawler.py
# Image crawler for http://konachan.net/
"""
Copyright (C) 2018 by
urayoru <[email protected]>
All rights reserved.
BSD license.
github: http://github.com/urayoru113.
"""
import os
import time

import requests
from bs4 import BeautifulSoup
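
# This script walks the konachan.net listing pages
# (http://konachan.net/post?page=N&tags=), parses each page with
# BeautifulSoup, and saves every preview thumbnail (<img class="preview">)
# to ./image/<page>_<index>.jpg. Failed downloads are retried every 10 sec.
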

def downloader(url, filepath):
    """Download the image at url to filepath.

    Returns False when the file is saved (or already exists) and True when
    the download failed and should be retried.
    """
    if os.path.exists(filepath):
        print("File %s already exists. Trying the next image......" % filepath)
        return False
    try:
        r = requests.get(url, stream=True, timeout=10)
    except requests.exceptions.RequestException:
        print("%s connect failed!" % url)
        return True
    if r.status_code == 200:
        try:
            # Stream the response body to disk in 1 KB chunks.
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
            print("Download succeeded. Saved file to %s" % filepath)
            return False
        except KeyboardInterrupt:
            # Remove the partially written file before propagating the interrupt.
            if os.path.exists(filepath):
                os.remove(filepath)
            print("Keyboard interrupt")
            raise
        except Exception:
            if os.path.exists(filepath):
                os.remove(filepath)
            print("Unknown exception")
            return True
    else:
        print("Connection failed while downloading (HTTP %d)" % r.status_code)
        return True


def main():
    if not os.path.isdir('image'):
        os.makedirs('image')
    # Running image count and the range of listing pages to crawl.
    total_img = 104509
    for i in range(7133, 9945):
        filename = 1
        url = 'http://konachan.net/post?page=%d&tags=' % i
        request = None
        # Retry the listing page until it is fetched with HTTP 200.
        while request is None or request.status_code != 200:
            try:
                request = requests.get(url, timeout=10)
                if request.status_code != 200:
                    print("html code: %d" % request.status_code)
            except requests.exceptions.RequestException:
                print("Connection rejected by %s" % url)
                print("Waiting 10 sec to keep connecting......")
                time.sleep(10)
        soup = BeautifulSoup(request.text, 'html.parser')
        soup_post = soup.find(id='post-list-posts')
        if not soup_post:
            # No post list on this page means there is nothing left to crawl.
            print("Finished downloading all images")
            return
        print("Downloading from %s" % url)
        for img in soup.find_all('img', class_='preview'):
            filepath = './image/' + str(i) + '_' + str(filename) + '.jpg'
            if os.path.exists(filepath):
                print("File %s already exists. Trying the next image......" % filepath)
                total_img += 1
                filename += 1
                continue
            if downloader("http:" + img['src'], filepath):
                print("Download failed")
                print("Waiting 10 sec to keep downloading......")
                time.sleep(10)
                # Keep retrying until the image is downloaded.
                while downloader("http:" + img['src'], filepath):
                    print("Waiting 10 sec to keep downloading......")
                    time.sleep(10)
            total_img += 1
            filename += 1
        print("Total images: %d" % total_img)


if __name__ == '__main__':
    main()
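
# Usage sketch (assuming Python 3 with the requests and beautifulsoup4
# packages installed):
#   pip install requests beautifulsoup4
#   python img_crawler.py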