-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_yahoo.py
115 lines (77 loc) · 3.48 KB
/
scrape_yahoo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Scrape player projections from Yahoo
"""
import argparse
import datetime
import lxml.html
import pandas
import requests
import time
from config import YAHOO_COOKIE_STRING, YAHOO_LEAGUE_ID, YAHOO_STATS_TRANSLATION
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--ros", action="store_true", help="only scrape rest of season projections")
parser.add_argument("--l14pt", action="store_true", help="only scrape recent stats in the past 14 days")
args = parser.parse_args()
today = datetime.datetime.today()
yahoo_session = requests.Session()
yahoo_session.headers.update({
'cookie': YAHOO_COOKIE_STRING,
})
if not args.l14pt:
# PSR = rest of season projections
projections = scrape_yahoo_player_list(yahoo_session, 'PSR')
projections.to_csv("yahoo_projections.csv", encoding='utf8', index=False)
projections.to_csv("historical/yahoo_projections_{:%Y-%m-%d}.csv".format(today), encoding='utf8', index=False)
if not args.ros:
# AL14 = last 14 days
playing_time = scrape_yahoo_player_list(yahoo_session, 'AL14')
playing_time.to_csv("yahoo_playing_time.csv", encoding='utf8', index=False)
playing_time.to_csv("historical/yahoo_playing_time_{:%Y-%m-%d}.csv".format(today), encoding='utf8', index=False)
def scrape_yahoo_player_list(session, list_code):
"""
On Yahoo, the player projections are paginated to 25 players at a time. Loop over the pages.
"""
# initialize loop
players = []
count = 0
get_next = True
while get_next:
print(count)
x = session.get(f"https://basketball.fantasysports.yahoo.com/nba/{YAHOO_LEAGUE_ID}/players?&sort=AR&sdir=1&status=ALL&pos=P&stat1=S_{list_code}&jsenabled=0&count={count}")
root = lxml.html.fromstring(x.content)
# maps from column number to stat
header_index = parse_table_header(YAHOO_STATS_TRANSLATION, root)
# take all the table rows that represent players
player_table_rows = root.cssselect('div.players tbody tr')
# loop over the player table rows to extract the projections player by player
for player_tr in player_table_rows:
player = {}
player['abbr_name'] = player_tr.cssselect('.ysf-player-name a')[0].text_content()
player['yahoo_id'] = player_tr.cssselect('.ysf-player-name a')[0].get('href').split('/')[-1]
for index, stat in header_index.items():
player[stat] = player_tr.cssselect('td')[index].text_content()
players.append(player)
# check if the "next page" text links to the next page
# if not, end while loop
if not root.cssselect('ul#playerspagenav1 li')[1].getchildren():
get_next = False
else:
count += 25
time.sleep(1) # be respectful of possible rate limits
projections = pandas.DataFrame(players)
return projections
def parse_table_header(stats_translation, root):
"""
Use table headings to map from index position to stat
This is finicky and could break when Yahoo changes their table headers.
"""
header_index = {}
header_row = root.cssselect('div.players thead tr.Last')[0]
for i, header_cell in enumerate(header_row.cssselect('th')):
text = header_cell.text_content()
if text in stats_translation.keys():
header_index[i] = stats_translation[text]
return header_index
if __name__ == '__main__':
main()