ps4_pkg_crawler.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from retry import retry
import re
host = "http://www.tvgame.org"
header_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
session = requests.session()
session.cookies.update({"token": "41e1425319cb6139651dda3049adb0b89f5708e4bbd6d18735c577f75594640d1532828244",
"username": "dpy1123",
# "addinfo": json.dumps({"chkadmin":1,"chkarticle":0,"levelname":"评论者","userid":"192","useralias":"dpy1123"})
}) # 30d过期
session.headers.update({"User-Agent": header_ua})
@retry(exceptions=(requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout), delay=2, backoff=2,
       jitter=(0, 5), max_delay=30)
def get_content_in_detail_page(id):
session.cookies.update({"commshow-{0}".format(id): "1"})
req = session.get(host + "/post/{0}.html".format(id), timeout=20)
if req.status_code == 200:
detail_page = BeautifulSoup(req.content, "lxml")
content = detail_page.select_one('div.post div.post-body')
# if content.find('a') is not None:
# a = content.find('a')
# a.string = a['href']
return content.get_text().strip().replace('\r', ' ').replace('\n', ' ').replace('-请支持正版-', 'http://pan.baidu.com/s/')
else:
return None
@retry(exceptions=(requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout), delay=2, backoff=2,
       jitter=(0, 5), max_delay=30)
def get_data(page, last_id=None):
    result = []
    request = session.get(host + "/page_{0}.html".format(page), timeout=10)
    home_page = BeautifulSoup(request.content, "lxml")
    if request.status_code != 200:
        print('get page {0} err'.format(page))
        print(home_page.prettify())
        return result
    post = home_page.select('div.article div.post')
    # each record is appended flat as: id, time, title, content, info
    for p in post:
        if len(p.select('i.fa-arrow-circle-up')) > 0:  # skip pinned (sticky) posts
            continue
        time = p.select('span.date')[0].get_text().strip()
        if len(time) == 11:  # raw format is e.g. "2018年06月05日"
            time = '{0}-{1}-{2}'.format(time[:4], time[5:7], time[-3:-1])
        t = p.select('div.div-title')[0]
        url = t.find('a')['href']
        p_id = int(re.compile(r'(\d+)\.html').findall(url)[0])
        if last_id is not None and p_id <= last_id:
            break
        title = t.find('a').get_text().strip()
        content = get_content_in_detail_page(p_id)
        info = p.select_one('div.more span.readmore a')
        if info is not None:
            info = info.get_text().strip()
        result += [p_id, time, title, content, info]
    return result
if __name__ == "__main__":
    data = []
    last_id = 440  # only collect posts newer than this id
    for p in range(1, 21):  # pages 1 through 20
        new_page = get_data(p, last_id)
        if len(new_page) <= 0:
            break
        data += new_page
    if len(data) > 0:
        data = np.array(data)
        data = data.reshape((-1, 5))
        df = pd.DataFrame(data, columns=['id', 'time', 'title', 'content', 'info'])
        print(df.head())
        df.to_csv('ps4_pkg_({0}-{1}].csv'.format(last_id, df.iloc[0]['id']), index=False)
# print(get_content_in_detail_page(418))
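# --- Illustrative addition (not part of the original script) ---
# A minimal sketch of how the exported CSV could be reloaded for a quick check.
# The 'path' argument is hypothetical: pass whatever filename to_csv() actually
# produced above, following the 'ps4_pkg_(<last_id>-<newest_id>].csv' pattern.
def load_exported_csv(path):
    df = pd.read_csv(path)
    print(df[['id', 'time', 'title']].head())
    return df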