From ea30b7aecec7d585e9a97f5194ada9550171a0a9 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:28:07 -0600 Subject: [PATCH 01/53] gid page not found --- src/cbbpy/mens_scraper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 60a670a..b5daf0a 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -449,9 +449,13 @@ def get_game_ids(date: Union[str, datetime]) -> list: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - _log.error( - f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') - return [] + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + else: + _log.error( + f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') + return pd.DataFrame([]) else: # try again time.sleep(1.5) From a602468cb8a65844f5918fd4c927711a1274fe06 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:35:07 -0600 Subject: [PATCH 02/53] fix for not fully blank boxsc --- src/cbbpy/mens_scraper.py | 174 ++++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 75 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index b5daf0a..11f18ff 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -508,94 +508,118 @@ def _get_game_boxscore_helper(boxscore, game_id): 'athlts'], tm2_stats[1]['athlts'], tm2_stats[2]['ttls'] # starters' stats - tm1_st_dict = {labels[i].lower(): [tm1_starters[j]['stats'][i] - for j in range(len(tm1_starters))] - for i in range(len(labels))} - - tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, 'starter', True) - tm1_st_df.insert(0, 'position', [ - tm1_starters[i]['athlt']['pos'] for i in range(len(tm1_starters))]) - tm1_st_df.insert(0, 'player_id', [tm1_starters[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm1_starters))]) - tm1_st_df.insert(0, 'player', [ - tm1_starters[i]['athlt']['shrtNm'] for i in range(len(tm1_starters))]) - tm1_st_df.insert(0, 'team', tm1_name) - tm1_st_df.insert(0, 'game_id', game_id) + if len(tm1_starters) > 0: + tm1_st_dict = {labels[i].lower(): [tm1_starters[j]['stats'][i] + for j in range(len(tm1_starters))] + for i in range(len(labels))} + + tm1_st_df = pd.DataFrame(tm1_st_dict) + tm1_st_df.insert(0, 'starter', True) + tm1_st_df.insert(0, 'position', [ + tm1_starters[i]['athlt']['pos'] for i in range(len(tm1_starters))]) + tm1_st_df.insert(0, 'player_id', [tm1_starters[i]['athlt']['uid'].split(':')[-1] + for i in range(len(tm1_starters))]) + tm1_st_df.insert(0, 'player', [ + tm1_starters[i]['athlt']['shrtNm'] for i in range(len(tm1_starters))]) + tm1_st_df.insert(0, 'team', tm1_name) + tm1_st_df.insert(0, 'game_id', game_id) + + else: + tm1_st_df = pd.DataFrame([]) # bench players' stats - tm1_bn_dict = {labels[i].lower(): [tm1_bench[j]['stats'][i] - for j in range(len(tm1_bench))] - for i in range(len(labels))} - - tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, 'starter', False) - tm1_bn_df.insert( - 0, 'position', [tm1_bench[i]['athlt']['pos'] for i in range(len(tm1_bench))]) - tm1_bn_df.insert(0, 'player_id', [tm1_bench[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm1_bench))]) - tm1_bn_df.insert( - 0, 'player', [tm1_bench[i]['athlt']['shrtNm'] for i in range(len(tm1_bench))]) - tm1_bn_df.insert(0, 'team', tm1_name) - tm1_bn_df.insert(0, 'game_id', game_id) + if len(tm1_bench) > 0: + tm1_bn_dict = {labels[i].lower(): [tm1_bench[j]['stats'][i] + for j in range(len(tm1_bench))] + for i in range(len(labels))} + + tm1_bn_df = pd.DataFrame(tm1_bn_dict) + tm1_bn_df.insert(0, 'starter', False) + tm1_bn_df.insert( + 0, 'position', [tm1_bench[i]['athlt']['pos'] for i in range(len(tm1_bench))]) + tm1_bn_df.insert(0, 'player_id', [tm1_bench[i]['athlt']['uid'].split(':')[-1] + for i in range(len(tm1_bench))]) + tm1_bn_df.insert( + 0, 'player', [tm1_bench[i]['athlt']['shrtNm'] for i in range(len(tm1_bench))]) + tm1_bn_df.insert(0, 'team', tm1_name) + tm1_bn_df.insert(0, 'game_id', game_id) + + else: + tm1_bn_df = pd.DataFrame([]) # team totals - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] - for i in range(len(labels))} + if len(tm1_totals) > 0: + tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] + for i in range(len(labels))} + + tm1_tot_df = pd.DataFrame(tm1_tot_dict) + tm1_tot_df.insert(0, 'starter', False) + tm1_tot_df.insert(0, 'position', 'TOTAL') + tm1_tot_df.insert(0, 'player_id', 'TOTAL') + tm1_tot_df.insert(0, 'player', 'TEAM') + tm1_tot_df.insert(0, 'team', tm1_name) + tm1_tot_df.insert(0, 'game_id', game_id) - tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, 'starter', False) - tm1_tot_df.insert(0, 'position', 'TOTAL') - tm1_tot_df.insert(0, 'player_id', 'TOTAL') - tm1_tot_df.insert(0, 'player', 'TEAM') - tm1_tot_df.insert(0, 'team', tm1_name) - tm1_tot_df.insert(0, 'game_id', game_id) + else: + tm1_tot_df = pd.DataFrame([]) tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) # starters' stats - tm2_st_dict = {labels[i].lower(): [tm2_starters[j]['stats'][i] - for j in range(len(tm2_starters))] - for i in range(len(labels))} - - tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, 'starter', True) - tm2_st_df.insert(0, 'position', [ - tm2_starters[i]['athlt']['pos'] for i in range(len(tm2_starters))]) - tm2_st_df.insert(0, 'player_id', [tm2_starters[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm2_starters))]) - tm2_st_df.insert(0, 'player', [ - tm2_starters[i]['athlt']['shrtNm'] for i in range(len(tm2_starters))]) - tm2_st_df.insert(0, 'team', tm2_name) - tm2_st_df.insert(0, 'game_id', game_id) + if len(tm2_starters) > 0: + tm2_st_dict = {labels[i].lower(): [tm2_starters[j]['stats'][i] + for j in range(len(tm2_starters))] + for i in range(len(labels))} + + tm2_st_df = pd.DataFrame(tm2_st_dict) + tm2_st_df.insert(0, 'starter', True) + tm2_st_df.insert(0, 'position', [ + tm2_starters[i]['athlt']['pos'] for i in range(len(tm2_starters))]) + tm2_st_df.insert(0, 'player_id', [tm2_starters[i]['athlt']['uid'].split(':')[-1] + for i in range(len(tm2_starters))]) + tm2_st_df.insert(0, 'player', [ + tm2_starters[i]['athlt']['shrtNm'] for i in range(len(tm2_starters))]) + tm2_st_df.insert(0, 'team', tm2_name) + tm2_st_df.insert(0, 'game_id', game_id) + + else: + tm2_st_df = pd.DataFrame([]) # bench players' stats - tm2_bn_dict = {labels[i].lower(): [tm2_bench[j]['stats'][i] - for j in range(len(tm2_bench))] - for i in range(len(labels))} - - tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, 'starter', False) - tm2_bn_df.insert( - 0, 'position', [tm2_bench[i]['athlt']['pos'] for i in range(len(tm2_bench))]) - tm2_bn_df.insert(0, 'player_id', [tm2_bench[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm2_bench))]) - tm2_bn_df.insert( - 0, 'player', [tm2_bench[i]['athlt']['shrtNm'] for i in range(len(tm2_bench))]) - tm2_bn_df.insert(0, 'team', tm2_name) - tm2_bn_df.insert(0, 'game_id', game_id) + if len(tm2_bench) > 0: + tm2_bn_dict = {labels[i].lower(): [tm2_bench[j]['stats'][i] + for j in range(len(tm2_bench))] + for i in range(len(labels))} + + tm2_bn_df = pd.DataFrame(tm2_bn_dict) + tm2_bn_df.insert(0, 'starter', False) + tm2_bn_df.insert( + 0, 'position', [tm2_bench[i]['athlt']['pos'] for i in range(len(tm2_bench))]) + tm2_bn_df.insert(0, 'player_id', [tm2_bench[i]['athlt']['uid'].split(':')[-1] + for i in range(len(tm2_bench))]) + tm2_bn_df.insert( + 0, 'player', [tm2_bench[i]['athlt']['shrtNm'] for i in range(len(tm2_bench))]) + tm2_bn_df.insert(0, 'team', tm2_name) + tm2_bn_df.insert(0, 'game_id', game_id) + + else: + tm2_bn_df = pd.DataFrame([]) # team totals - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] - for i in range(len(labels))} - - tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, 'starter', False) - tm2_tot_df.insert(0, 'position', 'TOTAL') - tm2_tot_df.insert(0, 'player_id', 'TOTAL') - tm2_tot_df.insert(0, 'player', 'TEAM') - tm2_tot_df.insert(0, 'team', tm2_name) - tm2_tot_df.insert(0, 'game_id', game_id) + if len(tm2_totals) > 0: + tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] + for i in range(len(labels))} + + tm2_tot_df = pd.DataFrame(tm2_tot_dict) + tm2_tot_df.insert(0, 'starter', False) + tm2_tot_df.insert(0, 'position', 'TOTAL') + tm2_tot_df.insert(0, 'player_id', 'TOTAL') + tm2_tot_df.insert(0, 'player', 'TEAM') + tm2_tot_df.insert(0, 'team', tm2_name) + tm2_tot_df.insert(0, 'game_id', game_id) + + else: + tm2_tot_df = pd.DataFrame([]) tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) From bd31b7773660b0e75afe8b3511fe44239fa00961 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:57:27 -0600 Subject: [PATCH 03/53] fix for blank boxscores --- src/cbbpy/mens_scraper.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 11f18ff..70a91d1 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -525,7 +525,9 @@ def _get_game_boxscore_helper(boxscore, game_id): tm1_st_df.insert(0, 'game_id', game_id) else: - tm1_st_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm1_bench) > 0: @@ -545,7 +547,9 @@ def _get_game_boxscore_helper(boxscore, game_id): tm1_bn_df.insert(0, 'game_id', game_id) else: - tm1_bn_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm1_totals) > 0: @@ -561,7 +565,9 @@ def _get_game_boxscore_helper(boxscore, game_id): tm1_tot_df.insert(0, 'game_id', game_id) else: - tm1_tot_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_tot_df = pd.DataFrame(columns=cols) tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) @@ -583,7 +589,9 @@ def _get_game_boxscore_helper(boxscore, game_id): tm2_st_df.insert(0, 'game_id', game_id) else: - tm2_st_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm2_bench) > 0: @@ -603,7 +611,9 @@ def _get_game_boxscore_helper(boxscore, game_id): tm2_bn_df.insert(0, 'game_id', game_id) else: - tm2_bn_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm2_totals) > 0: @@ -619,12 +629,17 @@ def _get_game_boxscore_helper(boxscore, game_id): tm2_tot_df.insert(0, 'game_id', game_id) else: - tm2_tot_df = pd.DataFrame([]) + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_tot_df = pd.DataFrame(columns=cols) tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) df = pd.concat([tm1_df, tm2_df]) + if len(df) <= 0: + return pd.DataFrame([]) + # SPLIT UP THE FG FIELDS fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors='coerce') From 25e8472a62e27036183b5315e2dcc93e6d9233c8 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 22 Dec 2022 14:59:05 -0600 Subject: [PATCH 04/53] Update mens_scraper.py --- src/cbbpy/mens_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 70a91d1..9e1eec7 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -638,6 +638,7 @@ def _get_game_boxscore_helper(boxscore, game_id): df = pd.concat([tm1_df, tm2_df]) if len(df) <= 0: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') return pd.DataFrame([]) # SPLIT UP THE FG FIELDS From e2132502422ba90b3563963de873fb9c19061592 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 23 Dec 2022 23:58:00 -0600 Subject: [PATCH 05/53] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f6b619..adba227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "CBBpy" -version = "1.1.0" +version = "1.1.1" description = 'A Python-based web scraper for NCAA basketball.' readme = "README.md" authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }] From 4dea33f2b0cc757e7985b650debaf04dd60b5ebd Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 30 Dec 2022 12:42:09 -0600 Subject: [PATCH 06/53] blank player boxscore --- src/cbbpy/mens_scraper.py | 87 ++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 9e1eec7..3152689 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -513,14 +513,24 @@ def _get_game_boxscore_helper(boxscore, game_id): for j in range(len(tm1_starters))] for i in range(len(labels))} + tm1_st_pos = [tm1_starters[i]['athlt']['pos'] + if 'pos' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + tm1_st_id = [tm1_starters[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + tm1_st_nm = [tm1_starters[i]['athlt']['shrtNm'] + if 'shrtNm' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + tm1_st_df = pd.DataFrame(tm1_st_dict) tm1_st_df.insert(0, 'starter', True) - tm1_st_df.insert(0, 'position', [ - tm1_starters[i]['athlt']['pos'] for i in range(len(tm1_starters))]) - tm1_st_df.insert(0, 'player_id', [tm1_starters[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm1_starters))]) - tm1_st_df.insert(0, 'player', [ - tm1_starters[i]['athlt']['shrtNm'] for i in range(len(tm1_starters))]) + tm1_st_df.insert(0, 'position', tm1_st_pos) + tm1_st_df.insert(0, 'player_id', tm1_st_id) + tm1_st_df.insert(0, 'player', tm1_st_nm) tm1_st_df.insert(0, 'team', tm1_name) tm1_st_df.insert(0, 'game_id', game_id) @@ -535,14 +545,24 @@ def _get_game_boxscore_helper(boxscore, game_id): for j in range(len(tm1_bench))] for i in range(len(labels))} + tm1_bn_pos = [tm1_bench[i]['athlt']['pos'] + if 'pos' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + tm1_bn_id = [tm1_bench[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + tm1_bn_nm = [tm1_bench[i]['athlt']['shrtNm'] + if 'shrtNm' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + tm1_bn_df = pd.DataFrame(tm1_bn_dict) tm1_bn_df.insert(0, 'starter', False) - tm1_bn_df.insert( - 0, 'position', [tm1_bench[i]['athlt']['pos'] for i in range(len(tm1_bench))]) - tm1_bn_df.insert(0, 'player_id', [tm1_bench[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm1_bench))]) - tm1_bn_df.insert( - 0, 'player', [tm1_bench[i]['athlt']['shrtNm'] for i in range(len(tm1_bench))]) + tm1_bn_df.insert(0, 'position', tm1_bn_pos) + tm1_bn_df.insert(0, 'player_id', tm1_bn_id) + tm1_bn_df.insert(0, 'player', tm1_bn_nm) tm1_bn_df.insert(0, 'team', tm1_name) tm1_bn_df.insert(0, 'game_id', game_id) @@ -577,14 +597,23 @@ def _get_game_boxscore_helper(boxscore, game_id): for j in range(len(tm2_starters))] for i in range(len(labels))} + tm2_st_pos = [tm2_starters[i]['athlt']['pos'] + if 'pos' in tm2_starters[i]['athlt'].keys() + else '' + for i in range(len(tm2_starters))] + tm2_st_id = [tm2_starters[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm2_starters[i]['athlt'].keys() + else '' for i in range(len(tm2_starters))] + tm2_st_nm = [tm2_starters[i]['athlt']['shrtNm'] + if 'shrtNm' in tm2_starters[i]['athlt'].keys() + else '' + for i in range(len(tm2_starters))] + tm2_st_df = pd.DataFrame(tm2_st_dict) tm2_st_df.insert(0, 'starter', True) - tm2_st_df.insert(0, 'position', [ - tm2_starters[i]['athlt']['pos'] for i in range(len(tm2_starters))]) - tm2_st_df.insert(0, 'player_id', [tm2_starters[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm2_starters))]) - tm2_st_df.insert(0, 'player', [ - tm2_starters[i]['athlt']['shrtNm'] for i in range(len(tm2_starters))]) + tm2_st_df.insert(0, 'position', tm2_st_pos) + tm2_st_df.insert(0, 'player_id', tm2_st_id) + tm2_st_df.insert(0, 'player', tm2_st_nm) tm2_st_df.insert(0, 'team', tm2_name) tm2_st_df.insert(0, 'game_id', game_id) @@ -599,14 +628,24 @@ def _get_game_boxscore_helper(boxscore, game_id): for j in range(len(tm2_bench))] for i in range(len(labels))} + tm2_bn_pos = [tm2_bench[i]['athlt']['pos'] + if 'pos' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + tm2_bn_id = [tm2_bench[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + tm2_bn_nm = [tm2_bench[i]['athlt']['shrtNm'] + if 'shrtNm' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + tm2_bn_df = pd.DataFrame(tm2_bn_dict) tm2_bn_df.insert(0, 'starter', False) - tm2_bn_df.insert( - 0, 'position', [tm2_bench[i]['athlt']['pos'] for i in range(len(tm2_bench))]) - tm2_bn_df.insert(0, 'player_id', [tm2_bench[i]['athlt']['uid'].split(':')[-1] - for i in range(len(tm2_bench))]) - tm2_bn_df.insert( - 0, 'player', [tm2_bench[i]['athlt']['shrtNm'] for i in range(len(tm2_bench))]) + tm2_bn_df.insert(0, 'position', tm2_bn_pos) + tm2_bn_df.insert(0, 'player_id', tm2_bn_id) + tm2_bn_df.insert(0, 'player', tm2_bn_nm) tm2_bn_df.insert(0, 'team', tm2_name) tm2_bn_df.insert(0, 'game_id', game_id) From fb0ee5675879d3530fbac906b62ee311a46c2815 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 24 Mar 2023 21:51:25 -0500 Subject: [PATCH 07/53] womens scraper --- pyproject.toml | 2 +- src/cbbpy/womens_scraper.py | 980 ++++++++++++++++++++++++++++++++++++ 2 files changed, 981 insertions(+), 1 deletion(-) create mode 100644 src/cbbpy/womens_scraper.py diff --git a/pyproject.toml b/pyproject.toml index adba227..26093d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "CBBpy" -version = "1.1.1" +version = "2.0.0" description = 'A Python-based web scraper for NCAA basketball.' readme = "README.md" authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }] diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py new file mode 100644 index 0000000..d7ab669 --- /dev/null +++ b/src/cbbpy/womens_scraper.py @@ -0,0 +1,980 @@ +""" +A tool to scrape data for NCAA D1 Women's college basketball games. + +Author: Daniel Cowan +""" + + +from bs4 import BeautifulSoup as bs +import requests as r +import pandas as pd +import numpy as np +from datetime import datetime, timedelta, timezone +from dateutil.parser import parse +from pytz import timezone as tz +from tqdm import trange +import re +import time +import logging +import traceback +import json +from typing import Union + + +logging.basicConfig(filename='cbbpy.log') +_log = logging.getLogger(__name__) + +ATTEMPTS = 10 +DATE_PARSES = [ + '%Y-%m-%d', + '%Y/%m/%d', + '%m-%d-%Y', + '%m/%d/%Y', +] +USER_AGENTS = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', + 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ' + + '(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', + 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 ' + + '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36', +] +REFERERS = [ + 'https://google.com/', + 'https://youtube.com/', + 'https://facebook.com/', + 'https://twitter.com/', + 'https://nytimes.com/', + 'https://washingtonpost.com/', + 'https://linkedin.com/', + 'https://nhl.com/', + 'https://mlb.com/', + 'https://nfl.com/' +] +SCOREBOARD_URL = ( + "https://www.espn.com/womens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" +) +GAME_URL = "https://www.espn.com/womens-college-basketball/game/_/gameId/{}" +BOXSCORE_URL = "https://www.espn.com/womens-college-basketball/boxscore/_/gameId/{}" +PBP_URL = "https://www.espn.com/womens-college-basketball/playbyplay/_/gameId/{}" +NON_SHOT_TYPES = [ + 'TV Timeout', + 'Jump Ball', + 'Turnover', + 'Timeout', + 'Rebound', + 'Block', + 'Steal', + 'Foul', + 'End' +] +SHOT_TYPES = [ + 'Three Point Jumper', + 'Two Point Tip Shot', + 'Free Throw', + 'Jumper', + 'Layup', + 'Dunk' +] + + +class CouldNotParseError(Exception): + pass + + +class InvalidDateRangeError(Exception): + pass + + +def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: + """A function that scrapes all game info (metadata, boxscore, play-by-play). + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + if info: + game_info_df = get_game_info(game_id) + else: + game_info_df = pd.DataFrame([]) + + if box: + boxscore_df = get_game_boxscore(game_id) + else: + boxscore_df = pd.DataFrame([]) + + if pbp: + pbp_df = get_game_pbp(game_id) + else: + pbp_df = pd.DataFrame([]) + + return (game_info_df, boxscore_df, pbp_df) + + +def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: + """A function that scrapes a game information between a given range of dates. + + Parameters: + - start_date: a string representing the first day of games to scrape + - end_date: a string representing the last day of games to scrape + - info: a boolean denoting whether game metadata is to be scraped + - box: a boolean denoting whether game boxscore is to be scraped + - pbp: a boolean denoting whether game play-by-play is to be scraped + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + start_date = _parse_date(start_date) + end_date = _parse_date(end_date) + len_scrape = (end_date - start_date).days + 1 + date = start_date + all_data = [] + + if start_date > end_date: + raise InvalidDateRangeError( + "The start date must be sooner than the end date.") + + with trange(len_scrape) as t: + for i in t: + game_ids = get_game_ids(date) + + if len(game_ids) > 0: + games_info_day = [] + for j, gid in enumerate(game_ids): + t.set_description( + f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" + ) + games_info_day.append(get_game(gid, info, box, pbp)) + all_data.append(games_info_day) + + else: + t.set_description(f"No games on {date.strftime('%D')}") + + date += timedelta(days=1) + + game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( + drop=True + ) + game_boxscore_df = pd.concat( + [game[1] for day in all_data for game in day] + ).reset_index(drop=True) + game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( + drop=True + ) + + return (game_info_df, game_boxscore_df, game_pbp_df) + + +def get_game_boxscore(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's boxscore. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - the game boxscore as a DataFrame + """ + + for i in range(ATTEMPTS): + try: + header = { + 'User-Agent': np.random.choice(USER_AGENTS), + 'Referer': np.random.choice(REFERERS), + } + url = BOXSCORE_URL.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + js = soup.find_all('script')[3].text + js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + jsn = json.loads(js) + gamepackage = jsn['page']['content']['gamepackage'] + + # check if game was postponed + gm_status = gamepackage['gmStrp']['status']['desc'] + gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + boxscore = gamepackage['bxscr'] + + df = _get_game_boxscore_helper(boxscore, game_id) + + except Exception as ex: + if i+1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page not found error') + else: + _log.error( + f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + return pd.DataFrame([]) + else: + # try again + time.sleep(1.5) + continue + else: + # no exception thrown + break + + return df + + +def get_game_pbp(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's play-by-play information. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - the game's play-by-play information represented as a DataFrame + """ + + for i in range(ATTEMPTS): + try: + header = { + 'User-Agent': np.random.choice(USER_AGENTS), + 'Referer': np.random.choice(REFERERS), + } + url = PBP_URL.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + js = soup.find_all('script')[3].text + js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + jsn = json.loads(js) + gamepackage = jsn['page']['content']['gamepackage'] + + # check if game was postponed + gm_status = gamepackage['gmStrp']['status']['desc'] + gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + # num_halves = len(pbp['playGrps']) + + # if num_halves == 2: + # tot_seconds_in_game = (num_halves*20*60) + # else: + # tot_seconds_in_game = (2*20*60) + ((num_halves-2)*5*60) + + pbp = gamepackage['pbp'] + + df = _get_game_pbp_helper(pbp, game_id) + + except Exception as ex: + if i+1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page not found error') + else: + _log.error( + f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + return pd.DataFrame([]) + else: + # try again + time.sleep(1.5) + continue + else: + # no exception thrown + break + + return df + + +def get_game_info(game_id: str) -> pd.DataFrame: + """A function that scrapes game metadata. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - a DataFrame with one row and a column for each piece of metadata + """ + + for i in range(ATTEMPTS): + try: + header = { + 'User-Agent': np.random.choice(USER_AGENTS), + 'Referer': np.random.choice(REFERERS), + } + url = GAME_URL.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + + js = soup.find_all('script')[3].text + js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + jsn = json.loads(js) + gamepackage = jsn['page']['content']['gamepackage'] + + # check if game was postponed + gm_status = gamepackage['gmStrp']['status']['desc'] + gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + # get general game info + info = gamepackage['gmInfo'] + + # get team info + more_info = gamepackage['gmStrp'] + + df = _get_game_info_helper(info, more_info, game_id) + + except Exception as ex: + if i+1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page not found error') + else: + _log.error( + f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + return pd.DataFrame([]) + else: + # try again + time.sleep(1.5) + continue + else: + # no exception thrown + break + + return df + + +def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: + """A function that scrapes all game info (metadata, boxscore, play-by-play) for every game of + a given season. + + Parameters: + - season: an integer representing the season to be scraped. NOTE: season is takes the form + of the four-digit representation of the later year of the season. So, as an example, the + 2021-22 season is referred to by the integer 2022. + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + season_start_date = datetime(season - 1, 11, 1) + season_end_date = datetime(season, 5, 1) + len_season = (season_end_date - season_start_date).days + date = season_start_date + all_data = [] + + with trange(len_season) as t: + for i in t: + game_ids = get_game_ids(date) + + if len(game_ids) > 0: + games_info_day = [] + for j, gid in enumerate(game_ids): + t.set_description( + f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" + ) + games_info_day.append(get_game(gid, info, box, pbp)) + all_data.append(games_info_day) + + else: + t.set_description(f"No games on {date.strftime('%D')}") + + date += timedelta(days=1) + + game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( + drop=True + ) + game_boxscore_df = pd.concat( + [game[1] for day in all_data for game in day] + ).reset_index(drop=True) + game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( + drop=True + ) + + return (game_info_df, game_boxscore_df, game_pbp_df) + + +def get_game_ids(date: Union[str, datetime]) -> list: + """A function that scrapes all game IDs on a date. + + Parameters: + - date: a string/datetime object representing the date to be scraped + + Returns + - a list of ESPN all game IDs for games played on the date given + """ + if type(date) == str: + date = _parse_date(date) + + for i in range(ATTEMPTS): + try: + header = { + 'User-Agent': np.random.choice(USER_AGENTS), + 'Referer': np.random.choice(REFERERS), + } + d = date.strftime("%Y%m%d") + url = SCOREBOARD_URL.format(d) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + js = soup.find_all('script')[3].text + js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + jsn = json.loads(js) + + scoreboard = jsn['page']['content']['scoreboard']['evts'] + ids = [x['id'] for x in scoreboard] + + except Exception as ex: + if i+1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + else: + _log.error( + f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') + return pd.DataFrame([]) + else: + # try again + time.sleep(1.5) + continue + else: + # no exception thrown + break + + return ids + + +def _parse_date(date: str) -> datetime: + parsed = False + + for parse in DATE_PARSES: + try: + date = datetime.strptime(date, parse) + except: + continue + else: + parsed = True + break + + if not parsed: + raise CouldNotParseError('The given date could not be parsed. Try any of these formats:\n' + + 'Y-m-d\nY/m/d\nm-d-Y\nm/d/Y') + + return date + + +def _get_game_boxscore_helper(boxscore, game_id): + """A helper function that cleans a game's boxscore. + + Parameters: + - boxscore: a JSON object containing the boxscore + - game_id: a string representing the game's ESPN game ID + + Returns + - the game boxscore as a DataFrame + """ + tm1, tm2 = boxscore[0], boxscore[1] + tm1_name, tm2_name = tm1['tm']['dspNm'], tm2['tm']['dspNm'] + tm1_stats, tm2_stats = tm1['stats'], tm2['stats'] + + labels = tm1_stats[0]['lbls'] + + tm1_starters, tm1_bench, tm1_totals = tm1_stats[0][ + 'athlts'], tm1_stats[1]['athlts'], tm1_stats[2]['ttls'] + tm2_starters, tm2_bench, tm2_totals = tm2_stats[0][ + 'athlts'], tm2_stats[1]['athlts'], tm2_stats[2]['ttls'] + + # starters' stats + if len(tm1_starters) > 0: + tm1_st_dict = {labels[i].lower(): [tm1_starters[j]['stats'][i] + for j in range(len(tm1_starters))] + for i in range(len(labels))} + + tm1_st_pos = [tm1_starters[i]['athlt']['pos'] + if 'pos' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + tm1_st_id = [tm1_starters[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + tm1_st_nm = [tm1_starters[i]['athlt']['shrtNm'] + if 'shrtNm' in tm1_starters[i]['athlt'].keys() + else '' + for i in range(len(tm1_starters))] + + tm1_st_df = pd.DataFrame(tm1_st_dict) + tm1_st_df.insert(0, 'starter', True) + tm1_st_df.insert(0, 'position', tm1_st_pos) + tm1_st_df.insert(0, 'player_id', tm1_st_id) + tm1_st_df.insert(0, 'player', tm1_st_nm) + tm1_st_df.insert(0, 'team', tm1_name) + tm1_st_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_st_df = pd.DataFrame(columns=cols) + + # bench players' stats + if len(tm1_bench) > 0: + tm1_bn_dict = {labels[i].lower(): [tm1_bench[j]['stats'][i] + for j in range(len(tm1_bench))] + for i in range(len(labels))} + + tm1_bn_pos = [tm1_bench[i]['athlt']['pos'] + if 'pos' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + tm1_bn_id = [tm1_bench[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + tm1_bn_nm = [tm1_bench[i]['athlt']['shrtNm'] + if 'shrtNm' in tm1_bench[i]['athlt'].keys() + else '' + for i in range(len(tm1_bench))] + + tm1_bn_df = pd.DataFrame(tm1_bn_dict) + tm1_bn_df.insert(0, 'starter', False) + tm1_bn_df.insert(0, 'position', tm1_bn_pos) + tm1_bn_df.insert(0, 'player_id', tm1_bn_id) + tm1_bn_df.insert(0, 'player', tm1_bn_nm) + tm1_bn_df.insert(0, 'team', tm1_name) + tm1_bn_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_bn_df = pd.DataFrame(columns=cols) + + # team totals + if len(tm1_totals) > 0: + tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] + for i in range(len(labels))} + + tm1_tot_df = pd.DataFrame(tm1_tot_dict) + tm1_tot_df.insert(0, 'starter', False) + tm1_tot_df.insert(0, 'position', 'TOTAL') + tm1_tot_df.insert(0, 'player_id', 'TOTAL') + tm1_tot_df.insert(0, 'player', 'TEAM') + tm1_tot_df.insert(0, 'team', tm1_name) + tm1_tot_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm1_tot_df = pd.DataFrame(columns=cols) + + tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) + + # starters' stats + if len(tm2_starters) > 0: + tm2_st_dict = {labels[i].lower(): [tm2_starters[j]['stats'][i] + for j in range(len(tm2_starters))] + for i in range(len(labels))} + + tm2_st_pos = [tm2_starters[i]['athlt']['pos'] + if 'pos' in tm2_starters[i]['athlt'].keys() + else '' + for i in range(len(tm2_starters))] + tm2_st_id = [tm2_starters[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm2_starters[i]['athlt'].keys() + else '' for i in range(len(tm2_starters))] + tm2_st_nm = [tm2_starters[i]['athlt']['shrtNm'] + if 'shrtNm' in tm2_starters[i]['athlt'].keys() + else '' + for i in range(len(tm2_starters))] + + tm2_st_df = pd.DataFrame(tm2_st_dict) + tm2_st_df.insert(0, 'starter', True) + tm2_st_df.insert(0, 'position', tm2_st_pos) + tm2_st_df.insert(0, 'player_id', tm2_st_id) + tm2_st_df.insert(0, 'player', tm2_st_nm) + tm2_st_df.insert(0, 'team', tm2_name) + tm2_st_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_st_df = pd.DataFrame(columns=cols) + + # bench players' stats + if len(tm2_bench) > 0: + tm2_bn_dict = {labels[i].lower(): [tm2_bench[j]['stats'][i] + for j in range(len(tm2_bench))] + for i in range(len(labels))} + + tm2_bn_pos = [tm2_bench[i]['athlt']['pos'] + if 'pos' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + tm2_bn_id = [tm2_bench[i]['athlt']['uid'].split(':')[-1] + if 'uid' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + tm2_bn_nm = [tm2_bench[i]['athlt']['shrtNm'] + if 'shrtNm' in tm2_bench[i]['athlt'].keys() + else '' + for i in range(len(tm2_bench))] + + tm2_bn_df = pd.DataFrame(tm2_bn_dict) + tm2_bn_df.insert(0, 'starter', False) + tm2_bn_df.insert(0, 'position', tm2_bn_pos) + tm2_bn_df.insert(0, 'player_id', tm2_bn_id) + tm2_bn_df.insert(0, 'player', tm2_bn_nm) + tm2_bn_df.insert(0, 'team', tm2_name) + tm2_bn_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_bn_df = pd.DataFrame(columns=cols) + + # team totals + if len(tm2_totals) > 0: + tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] + for i in range(len(labels))} + + tm2_tot_df = pd.DataFrame(tm2_tot_dict) + tm2_tot_df.insert(0, 'starter', False) + tm2_tot_df.insert(0, 'position', 'TOTAL') + tm2_tot_df.insert(0, 'player_id', 'TOTAL') + tm2_tot_df.insert(0, 'player', 'TEAM') + tm2_tot_df.insert(0, 'team', tm2_name) + tm2_tot_df.insert(0, 'game_id', game_id) + + else: + cols = ['starter', 'position', 'player_id', 'player', + 'team', 'game_id'] + [x.lower() for x in labels] + tm2_tot_df = pd.DataFrame(columns=cols) + + tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) + + df = pd.concat([tm1_df, tm2_df]) + + if len(df) <= 0: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) + + # SPLIT UP THE FG FIELDS + fgm = pd.to_numeric([x.split("-")[0] + for x in df["fg"]], errors='coerce') + fga = pd.to_numeric([x.split("-")[1] + for x in df["fg"]], errors='coerce') + thpm = pd.to_numeric([x.split("-")[0] + for x in df["3pt"]], errors='coerce') + thpa = pd.to_numeric([x.split("-")[1] + for x in df["3pt"]], errors='coerce') + ftm = pd.to_numeric([x.split("-")[0] + for x in df["ft"]], errors='coerce') + fta = pd.to_numeric([x.split("-")[1] + for x in df["ft"]], errors='coerce') + + # GET RID OF UNWANTED COLUMNS + df = df.drop(columns=["fg", "3pt", "ft"]) + + # INSERT COLUMNS WHERE NECESSARY + df.insert(7, "fgm", fgm) + df.insert(8, "fga", fga) + df.insert(9, "2pm", fgm - thpm) + df.insert(10, "2pa", fga - thpa) + df.insert(11, "3pm", thpm) + df.insert(12, "3pa", thpa) + df.insert(13, "ftm", ftm) + df.insert(14, "fta", fta) + + # column type handling + df['min'] = pd.to_numeric(df['min'], errors='coerce') + df['oreb'] = pd.to_numeric(df['oreb'], errors='coerce') + df['dreb'] = pd.to_numeric(df['dreb'], errors='coerce') + df['reb'] = pd.to_numeric(df['reb'], errors='coerce') + df['ast'] = pd.to_numeric(df['ast'], errors='coerce') + df['stl'] = pd.to_numeric(df['stl'], errors='coerce') + df['blk'] = pd.to_numeric(df['blk'], errors='coerce') + df['to'] = pd.to_numeric(df['to'], errors='coerce') + df['pf'] = pd.to_numeric(df['pf'], errors='coerce') + df['pts'] = pd.to_numeric(df['pts'], errors='coerce') + + return df + + +def _get_game_pbp_helper(pbp, game_id): + """A helper function that cleans a game's PBP. + + Parameters: + - pbp: a JSON object containing the play-by-play + - game_id: a string representing the game's ESPN game ID + + Returns + - the game PBP as a DataFrame + """ + home_team = pbp['tms']['home']['displayName'] + away_team = pbp['tms']['away']['displayName'] + + all_plays = [play for half in pbp['playGrps'] for play in half] + + # check if PBP exists + if len(all_plays) <= 0: + _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') + return pd.DataFrame([]) + + descs = [x['text'] if 'text' in x.keys() else '' for x in all_plays] + teams = ['' if not 'homeAway' in x.keys() + else home_team if x['homeAway'] == 'home' else away_team for x in all_plays] + hscores = [int(x['homeScore']) if 'homeScore' in x.keys() + else np.nan for x in all_plays] + ascores = [int(x['awayScore']) if 'awayScore' in x.keys() + else np.nan for x in all_plays] + halves = [int(x['period']['number']) + if 'period' in x.keys() else np.nan for x in all_plays] + + time_splits = [x['clock']['displayValue'].split(':') if 'clock' in x.keys() + else '' for x in all_plays] + minutes = [int(x[0]) for x in time_splits] + seconds = [int(x[1]) for x in time_splits] + min_to_sec = [x*60 for x in minutes] + hf_secs_left = [x+y for x, y in zip(min_to_sec, seconds)] + reg_secs_left = [1200+x if half_num == 1 else x for x, + half_num in zip(hf_secs_left, halves)] + + sc_play = [True if 'scoringPlay' in x.keys() + else False for x in all_plays] + is_assisted = [True if ('text' in x.keys() and 'assisted' in x['text'].lower()) + else False for x in all_plays] + + # ASSIGN PLAY TYPES + p_types = [] + + for x in all_plays: + if not 'text' in x.keys(): + p_types.append('') + continue + + play = x['text'] + + if not type(play) == str: + play = '' + + added = False + for pt in NON_SHOT_TYPES: + if pt in play: + p_types.append(pt.lower()) + added = True + break + if not added: + for st in SHOT_TYPES: + if st in play: + p_types.append(st.lower()) + added = True + break + + if not added: + p_types.append('') + + # FIND SHOOTERS + shooting_play = [True if x in + (y.lower() for y in SHOT_TYPES) else False for x in p_types] + + scorers = [x[0].split(' made ')[0] if x[1] else '' for x in + zip(descs, sc_play)] + + non_scorers = [x[0].split(' missed ')[0] if x[1] in (y.lower() for y in SHOT_TYPES) + and not x[2] else '' for x in zip(descs, p_types, sc_play)] + + shooters = [x[0] if not x[0] == '' else x[1] + for x in zip(scorers, non_scorers)] + + assisted_pls = [x[0].split('Assisted by ')[-1].replace('.', '') if x[1] else '' for x in + zip(descs, is_assisted)] + + data = { + 'game_id': game_id, + 'home_team': home_team, + 'away_team': away_team, + 'play_desc': descs, + 'home_score': hscores, + 'away_score': ascores, + 'half': halves, + 'secs_left_half': hf_secs_left, + 'secs_left_reg': reg_secs_left, + 'play_team': teams, + 'play_type': p_types, + 'shooting_play': shooting_play, + 'scoring_play': sc_play, + 'shooter': shooters, + 'is_assisted': is_assisted, + 'assist_player': assisted_pls, + } + + return pd.DataFrame(data) + + +def _get_game_info_helper(info, more_info, game_id): + """A helper function that cleans a game's metadata. + + Parameters: + - info: a JSON object containing game metadata + - more_info: a JSON object containing game metadata + - game_id: a string representing the game's ESPN game ID + + Returns + - the game metadata as a DataFrame + """ + attendance = int(info['attnd']) if 'attnd' in info.keys() else np.nan + capacity = int(info['cpcty']) if 'cpcty' in info.keys() else np.nan + network = info['cvrg'] if 'cvrg' in info.keys() else '' + + gm_date = parse(info['dtTm']) + game_date = gm_date.replace( + tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) + game_day = game_date.strftime("%B %d, %Y") + game_time = game_date.strftime("%I:%M %p %Z") + + arena = info['loc'] if 'loc' in info.keys() else '' + loc = info['locAddr']['city'] + ', ' + \ + info['locAddr']['state'] if 'locAddr' in info.keys() else '' + + tot_refs = info['refs'] if 'refs' in info.keys() else {} + ref_1 = tot_refs[0]['dspNm'] if len(tot_refs) > 0 else '' + ref_2 = tot_refs[1]['dspNm'] if len(tot_refs) > 1 else '' + ref_3 = tot_refs[2]['dspNm'] if len(tot_refs) > 2 else '' + + teams = more_info['tms'] + ht_info, at_info = teams[0], teams[1] + + home_team, away_team = ht_info['displayName'], at_info['displayName'] + + home_id = ht_info['id'] + away_id = at_info['id'] + + if len(ht_info['links']) == 0: + ht = home_team.lower().replace(" ", "-") + home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) + elif len(ht_info['records']) == 0: + ht = home_team.lower().replace(" ", "-") + home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) + + if len(at_info['links']) == 0: + at = away_team.lower().replace(" ", "-") + away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) + elif len(at_info['records']) == 0: + at = away_team.lower().replace(" ", "-") + away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) + + home_rank = ht_info['rank'] if 'rank' in ht_info.keys() else np.nan + away_rank = at_info['rank'] if 'rank' in at_info.keys() else np.nan + + home_record = ht_info['records'][0]['displayValue'] if len( + ht_info['records']) > 0 else '' + away_record = at_info['records'][0]['displayValue'] if len( + at_info['records']) > 0 else '' + + home_score, away_score = int( + ht_info['score']), int(at_info['score']) + + home_win = True if home_score > away_score else False + + is_postseason = True if more_info['seasonType'] == 3 else False + is_conference = more_info['isConferenceGame'] + + if len(ht_info['records']) > 1 and ht_info['records'][1]['type'] == 'home': + is_neutral = False + + elif len(at_info['records']) > 1 and at_info['records'][1]['type'] == 'away': + is_neutral = False + + else: + is_neutral = True + + tournament = more_info['nte'] if 'nte' in more_info.keys() else '' + + h_ot, a_ot = len(ht_info['linescores']) - \ + 2, len(at_info['linescores']) - 2 + assert h_ot == a_ot + num_ots = h_ot + + game_info_list = [ + game_id, + home_team, + home_id, + home_rank, + home_record, + home_score, + away_team, + away_id, + away_rank, + away_record, + away_score, + home_win, + num_ots, + is_conference, + is_neutral, + is_postseason, + tournament, + game_day, + game_time, + loc, + arena, + capacity, + attendance, + network, + ref_1, + ref_2, + ref_3 + ] + + game_info_cols = [ + 'game_id', + 'home_team', + 'home_id', + 'home_rank', + 'home_record', + 'home_score', + 'away_team', + 'away_id', + 'away_rank', + 'away_record', + 'away_score', + 'home_win', + 'num_ots', + 'is_conference', + 'is_neutral', + 'is_postseason', + 'tournament', + 'game_day', + 'game_time', + 'game_loc', + 'arena', + 'arena_capacity', + 'attendance', + 'tv_network', + 'referee_1', + 'referee_2', + 'referee_3' + ] + + return pd.DataFrame([game_info_list], columns=game_info_cols) From cab15d2d2676458f214b7e050cdea329e0a04e18 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 31 Mar 2023 22:29:10 -0500 Subject: [PATCH 08/53] shot data scraping --- src/cbbpy/mens_scraper.py | 71 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 3152689..468cce4 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -279,9 +279,9 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # else: # tot_seconds_in_game = (2*20*60) + ((num_halves-2)*5*60) - pbp = gamepackage['pbp'] + # pbp = gamepackage['pbp'] - df = _get_game_pbp_helper(pbp, game_id) + df = _get_game_pbp_helper(gamepackage, game_id) except Exception as ex: if i+1 == ATTEMPTS: @@ -722,7 +722,7 @@ def _get_game_boxscore_helper(boxscore, game_id): return df -def _get_game_pbp_helper(pbp, game_id): +def _get_game_pbp_helper(gamepackage, game_id): """A helper function that cleans a game's PBP. Parameters: @@ -732,6 +732,7 @@ def _get_game_pbp_helper(pbp, game_id): Returns - the game PBP as a DataFrame """ + pbp = gamepackage['pbp'] home_team = pbp['tms']['home']['displayName'] away_team = pbp['tms']['away']['displayName'] @@ -811,6 +812,8 @@ def _get_game_pbp_helper(pbp, game_id): assisted_pls = [x[0].split('Assisted by ')[-1].replace('.', '') if x[1] else '' for x in zip(descs, is_assisted)] + is_three = ['three point' in x.lower() for x in descs] + data = { 'game_id': game_id, 'home_team': home_team, @@ -825,12 +828,72 @@ def _get_game_pbp_helper(pbp, game_id): 'play_type': p_types, 'shooting_play': shooting_play, 'scoring_play': sc_play, + 'is_three': is_three, 'shooter': shooters, 'is_assisted': is_assisted, 'assist_player': assisted_pls, } - return pd.DataFrame(data) + df = pd.DataFrame(data) + + # add shot data if it exists + is_shotchart = 'shtChrt' in gamepackage + + if is_shotchart: + chart = gamepackage['shtChrt']['plays'] + + shotteams = [x['homeAway'] for x in chart] + xs = [x['coordinate']['x'] for x in chart] + ys = [x['coordinate']['y'] for x in chart] + + shot_data = { + 'team': shotteams, + 'x': xs, + 'y': ys + } + + shot_df = pd.DataFrame(shot_data) + + # shot matching + shot_info = { + 'shot_x': [], + 'shot_y': [], + } + shot_count = 0 + + for play in df.play_desc: + if shot_count >= len(shot_df): + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + continue + + shot_play = shot_df.play_desc.iloc[shot_count] + + if play == shot_play: + shot_info['shot_x'].append(shot_df.x.iloc[shot_count]) + shot_info['shot_y'].append(shot_df.y.iloc[shot_count]) + shot_count += 1 + else: + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + + # make sure that length of shot data matches number of shots in PBP data + if (not (len(shot_info['shot_x']) == len(df))) or (not (len(shot_info['shot_y']) == len(df))): + _log.warning( + f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data') + df['shot_x'] = np.nan + df['shot_y'] = np.nan + return df + + df['shot_x'] = shot_info['shot_x'] + df['shot_y'] = shot_info['shot_y'] + + else: + df['shot_x'] = np.nan + df['shot_y'] = np.nan + return df + + return df def _get_game_info_helper(info, more_info, game_id): From 445ac7c3dc4171551a0a03a1f9e5c40b95af9f92 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 31 Mar 2023 22:34:06 -0500 Subject: [PATCH 09/53] shot descs bug --- src/cbbpy/mens_scraper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 468cce4..3116c23 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -843,11 +843,13 @@ def _get_game_pbp_helper(gamepackage, game_id): chart = gamepackage['shtChrt']['plays'] shotteams = [x['homeAway'] for x in chart] + shotdescs = [x['text'] for x in chart] xs = [x['coordinate']['x'] for x in chart] ys = [x['coordinate']['y'] for x in chart] shot_data = { 'team': shotteams, + 'play_desc': shotdescs, 'x': xs, 'y': ys } From bcca4aff12e058d6466d0f62768e19a9fc5cc23f Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 3 Apr 2023 21:18:28 -0500 Subject: [PATCH 10/53] consolidate get_games_season --- src/cbbpy/mens_scraper.py | 47 ++++++++++++--------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 3116c23..a579ce5 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -133,7 +133,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool Parameters: - start_date: a string representing the first day of games to scrape - - end_date: a string representing the last day of games to scrape + - end_date: a string representing the last day of games to scrape (inclusive) - info: a boolean denoting whether game metadata is to be scraped - box: a boolean denoting whether game boxscore is to be scraped - pbp: a boolean denoting whether game play-by-play is to be scraped @@ -382,39 +382,10 @@ def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool """ season_start_date = datetime(season - 1, 11, 1) season_end_date = datetime(season, 5, 1) - len_season = (season_end_date - season_start_date).days - date = season_start_date - all_data = [] - - with trange(len_season) as t: - for i in t: - game_ids = get_game_ids(date) - - if len(game_ids) > 0: - games_info_day = [] - for j, gid in enumerate(game_ids): - t.set_description( - f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" - ) - games_info_day.append(get_game(gid, info, box, pbp)) - all_data.append(games_info_day) - - else: - t.set_description(f"No games on {date.strftime('%D')}") - - date += timedelta(days=1) - game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( - drop=True - ) - game_boxscore_df = pd.concat( - [game[1] for day in all_data for game in day] - ).reset_index(drop=True) - game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( - drop=True - ) + info = get_games_range(season_start_date, season_end_date, info, box, pbp) - return (game_info_df, game_boxscore_df, game_pbp_df) + return info def get_game_ids(date: Union[str, datetime]) -> list: @@ -863,12 +834,22 @@ def _get_game_pbp_helper(gamepackage, game_id): } shot_count = 0 - for play in df.play_desc: + for play, isshot in zip(df.play_desc, df.shooting_play): if shot_count >= len(shot_df): shot_info['shot_x'].append(np.nan) shot_info['shot_y'].append(np.nan) continue + if not isshot: + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + continue + + if 'free throw' in play.lower(): + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + continue + shot_play = shot_df.play_desc.iloc[shot_count] if play == shot_play: From 733388d325d9d2bfb00df47c807b94300b3c1d3f Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 3 Apr 2023 22:10:32 -0500 Subject: [PATCH 11/53] tweaks --- src/cbbpy/mens_scraper.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index a579ce5..d7e5728 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -380,8 +380,8 @@ def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = datetime(season - 1, 11, 1) - season_end_date = datetime(season, 5, 1) + season_start_date = f'{season-1}-11-01' + season_end_date = f'{season}-05-01' info = get_games_range(season_start_date, season_end_date, info, box, pbp) @@ -815,8 +815,8 @@ def _get_game_pbp_helper(gamepackage, game_id): shotteams = [x['homeAway'] for x in chart] shotdescs = [x['text'] for x in chart] - xs = [x['coordinate']['x'] for x in chart] - ys = [x['coordinate']['y'] for x in chart] + xs = [50-int(x['coordinate']['x']) for x in chart] + ys = [int(x['coordinate']['y']) for x in chart] shot_data = { 'team': shotteams, @@ -848,6 +848,7 @@ def _get_game_pbp_helper(gamepackage, game_id): if 'free throw' in play.lower(): shot_info['shot_x'].append(np.nan) shot_info['shot_y'].append(np.nan) + shot_count += 1 continue shot_play = shot_df.play_desc.iloc[shot_count] From 280b536e520f56830dbf6a217884a8119db8f193 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:09:59 -0500 Subject: [PATCH 12/53] implemented multiprocessing --- pyproject.toml | 1 + src/cbbpy/mens_scraper.py | 35 +++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 26093d7..7aaa559 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ 'pytz>=2022.1', 'tqdm>=4.63.0', 'lxml>=4.9.0', + 'joblib>=1.1.0', ] requires-python = ">=3.7" diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index d7e5728..1d1604d 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -13,11 +13,13 @@ from dateutil.parser import parse from pytz import timezone as tz from tqdm import trange +from joblib import Parallel, delayed import re import time import logging import traceback import json +import os from typing import Union @@ -144,33 +146,34 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - start_date = _parse_date(start_date) - end_date = _parse_date(end_date) - len_scrape = (end_date - start_date).days + 1 - date = start_date + sd = _parse_date(start_date) + ed = _parse_date(end_date) + date_range = pd.date_range(sd, ed) + len_scrape = len(date_range) all_data = [] + cpus = os.cpu_count() - 1 - if start_date > end_date: + if len_scrape < 1: raise InvalidDateRangeError( "The start date must be sooner than the end date.") - with trange(len_scrape) as t: + bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' + + with trange(len_scrape, bar_format=bar_format) as t: for i in t: + date = date_range[i] + t.set_description(f"Scraping games on {str(date.date())}") game_ids = get_game_ids(date) + t.set_description( + f"Scraping {len(game_ids)} games on {str(date.date())}") if len(game_ids) > 0: - games_info_day = [] - for j, gid in enumerate(game_ids): - t.set_description( - f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" - ) - games_info_day.append(get_game(gid, info, box, pbp)) - all_data.append(games_info_day) + result = Parallel(n_jobs=cpus)( + delayed(get_game)(gid) for gid in game_ids) + all_data.append(result) else: - t.set_description(f"No games on {date.strftime('%D')}") - - date += timedelta(days=1) + t.set_description(f"No games on {str(date.date())}") game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( drop=True From 6cef3bce291952ed52bc74920686afd3bb0208d5 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:19:53 -0500 Subject: [PATCH 13/53] formatting + page error --- src/cbbpy/mens_scraper.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 1d1604d..efecb05 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -162,10 +162,10 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] - t.set_description(f"Scraping games on {str(date.date())}") + t.set_description(f"Scraping games on {date.strftime('%D')}") game_ids = get_game_ids(date) t.set_description( - f"Scraping {len(game_ids)} games on {str(date.date())}") + f"Scraping {len(game_ids)} games on {date.strftime('%D')}") if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( @@ -173,7 +173,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool all_data.append(result) else: - t.set_description(f"No games on {str(date.date())}") + t.set_description(f"No games on {date.strftime('%D')}") game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( drop=True @@ -229,6 +229,9 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') @@ -292,6 +295,9 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') @@ -353,6 +359,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') @@ -426,6 +435,9 @@ def get_game_ids(date: Union[str, datetime]) -> list: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') From 2228bb777a234ae27a615eecea9a726844be0109 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:26:48 -0500 Subject: [PATCH 14/53] retries --- src/cbbpy/mens_scraper.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index efecb05..18d02fa 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -26,7 +26,7 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) -ATTEMPTS = 10 +ATTEMPTS = 20 DATE_PARSES = [ '%Y-%m-%d', '%Y/%m/%d', @@ -238,7 +238,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -304,7 +304,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -368,7 +368,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -444,7 +444,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown From 0b676d7ce6d966da1a17c879da8618b6f4854cc1 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:27:56 -0500 Subject: [PATCH 15/53] women's scraper --- src/cbbpy/womens_scraper.py | 176 ++++++++++++++++++++++++------------ 1 file changed, 119 insertions(+), 57 deletions(-) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index d7ab669..e5f5ec1 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -13,11 +13,13 @@ from dateutil.parser import parse from pytz import timezone as tz from tqdm import trange +from joblib import Parallel, delayed import re import time import logging import traceback import json +import os from typing import Union @@ -133,7 +135,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool Parameters: - start_date: a string representing the first day of games to scrape - - end_date: a string representing the last day of games to scrape + - end_date: a string representing the last day of games to scrape (inclusive) - info: a boolean denoting whether game metadata is to be scraped - box: a boolean denoting whether game boxscore is to be scraped - pbp: a boolean denoting whether game play-by-play is to be scraped @@ -144,34 +146,35 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - start_date = _parse_date(start_date) - end_date = _parse_date(end_date) - len_scrape = (end_date - start_date).days + 1 - date = start_date + sd = _parse_date(start_date) + ed = _parse_date(end_date) + date_range = pd.date_range(sd, ed) + len_scrape = len(date_range) all_data = [] + cpus = os.cpu_count() - 1 - if start_date > end_date: + if len_scrape < 1: raise InvalidDateRangeError( "The start date must be sooner than the end date.") - with trange(len_scrape) as t: + bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' + + with trange(len_scrape, bar_format=bar_format) as t: for i in t: + date = date_range[i] + t.set_description(f"Scraping games on {date.strftime('%D')}") game_ids = get_game_ids(date) + t.set_description( + f"Scraping {len(game_ids)} games on {date.strftime('%D')}") if len(game_ids) > 0: - games_info_day = [] - for j, gid in enumerate(game_ids): - t.set_description( - f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" - ) - games_info_day.append(get_game(gid, info, box, pbp)) - all_data.append(games_info_day) + result = Parallel(n_jobs=cpus)( + delayed(get_game)(gid) for gid in game_ids) + all_data.append(result) else: t.set_description(f"No games on {date.strftime('%D')}") - date += timedelta(days=1) - game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( drop=True ) @@ -226,13 +229,16 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -279,9 +285,9 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # else: # tot_seconds_in_game = (2*20*60) + ((num_halves-2)*5*60) - pbp = gamepackage['pbp'] + # pbp = gamepackage['pbp'] - df = _get_game_pbp_helper(pbp, game_id) + df = _get_game_pbp_helper(gamepackage, game_id) except Exception as ex: if i+1 == ATTEMPTS: @@ -289,13 +295,16 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -350,13 +359,16 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -380,41 +392,12 @@ def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = datetime(season - 1, 11, 1) - season_end_date = datetime(season, 5, 1) - len_season = (season_end_date - season_start_date).days - date = season_start_date - all_data = [] - - with trange(len_season) as t: - for i in t: - game_ids = get_game_ids(date) - - if len(game_ids) > 0: - games_info_day = [] - for j, gid in enumerate(game_ids): - t.set_description( - f"Scraping {gid} ({j+1}/{len(game_ids)}) on {date.strftime('%D')}" - ) - games_info_day.append(get_game(gid, info, box, pbp)) - all_data.append(games_info_day) - - else: - t.set_description(f"No games on {date.strftime('%D')}") - - date += timedelta(days=1) + season_start_date = f'{season-1}-11-01' + season_end_date = f'{season}-05-01' - game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( - drop=True - ) - game_boxscore_df = pd.concat( - [game[1] for day in all_data for game in day] - ).reset_index(drop=True) - game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( - drop=True - ) + info = get_games_range(season_start_date, season_end_date, info, box, pbp) - return (game_info_df, game_boxscore_df, game_pbp_df) + return info def get_game_ids(date: Union[str, datetime]) -> list: @@ -452,13 +435,16 @@ def get_game_ids(date: Union[str, datetime]) -> list: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again - time.sleep(1.5) + time.sleep(2) continue else: # no exception thrown @@ -722,7 +708,7 @@ def _get_game_boxscore_helper(boxscore, game_id): return df -def _get_game_pbp_helper(pbp, game_id): +def _get_game_pbp_helper(gamepackage, game_id): """A helper function that cleans a game's PBP. Parameters: @@ -732,6 +718,7 @@ def _get_game_pbp_helper(pbp, game_id): Returns - the game PBP as a DataFrame """ + pbp = gamepackage['pbp'] home_team = pbp['tms']['home']['displayName'] away_team = pbp['tms']['away']['displayName'] @@ -811,6 +798,8 @@ def _get_game_pbp_helper(pbp, game_id): assisted_pls = [x[0].split('Assisted by ')[-1].replace('.', '') if x[1] else '' for x in zip(descs, is_assisted)] + is_three = ['three point' in x.lower() for x in descs] + data = { 'game_id': game_id, 'home_team': home_team, @@ -825,12 +814,85 @@ def _get_game_pbp_helper(pbp, game_id): 'play_type': p_types, 'shooting_play': shooting_play, 'scoring_play': sc_play, + 'is_three': is_three, 'shooter': shooters, 'is_assisted': is_assisted, 'assist_player': assisted_pls, } - return pd.DataFrame(data) + df = pd.DataFrame(data) + + # add shot data if it exists + is_shotchart = 'shtChrt' in gamepackage + + if is_shotchart: + chart = gamepackage['shtChrt']['plays'] + + shotteams = [x['homeAway'] for x in chart] + shotdescs = [x['text'] for x in chart] + xs = [50-int(x['coordinate']['x']) for x in chart] + ys = [int(x['coordinate']['y']) for x in chart] + + shot_data = { + 'team': shotteams, + 'play_desc': shotdescs, + 'x': xs, + 'y': ys + } + + shot_df = pd.DataFrame(shot_data) + + # shot matching + shot_info = { + 'shot_x': [], + 'shot_y': [], + } + shot_count = 0 + + for play, isshot in zip(df.play_desc, df.shooting_play): + if shot_count >= len(shot_df): + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + continue + + if not isshot: + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + continue + + if 'free throw' in play.lower(): + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + shot_count += 1 + continue + + shot_play = shot_df.play_desc.iloc[shot_count] + + if play == shot_play: + shot_info['shot_x'].append(shot_df.x.iloc[shot_count]) + shot_info['shot_y'].append(shot_df.y.iloc[shot_count]) + shot_count += 1 + else: + shot_info['shot_x'].append(np.nan) + shot_info['shot_y'].append(np.nan) + + # make sure that length of shot data matches number of shots in PBP data + if (not (len(shot_info['shot_x']) == len(df))) or (not (len(shot_info['shot_y']) == len(df))): + _log.warning( + f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data') + df['shot_x'] = np.nan + df['shot_y'] = np.nan + return df + + df['shot_x'] = shot_info['shot_x'] + df['shot_y'] = shot_info['shot_y'] + + else: + df['shot_x'] = np.nan + df['shot_y'] = np.nan + return df + + return df def _get_game_info_helper(info, more_info, game_id): From e828b3402d70be13af882ff3daf284b2979dc6c4 Mon Sep 17 00:00:00 2001 From: Daniel Cowan <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:36:40 -0500 Subject: [PATCH 16/53] Update README.md --- README.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 689d19a..2e2fa55 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # CBBpy: A Python-based web scraper for NCAA basketball ## Purpose -This package is designed to bridge the gap between data and analysis for NCAA D1 basketball. CBBpy can grab play-by-play, boxscore, and other game metadata for any NCAA D1 men's basketball game. +This package is designed to bridge the gap between data and analysis for NCAA D1 basketball. CBBpy can grab play-by-play, boxscore, and other game metadata for any NCAA D1 men's or women's basketball game. ## Installation and import CBBpy requires Python >= 3.7 as well as the following packages: @@ -13,6 +13,7 @@ CBBpy requires Python >= 3.7 as well as the following packages: * pytz>=2022.1 * tqdm>=4.63.0 * lxml>=4.9.0 +* joblib>=1.1.0 Install using pip: @@ -20,33 +21,34 @@ Install using pip: pip install cbbpy ``` -As of now, CBBpy only offers a men's basketball scraper, which can be imported as such: +The men's and women's scrapers can be imported as such: ``` -import cbbpy.mens_scraper as ms +import cbbpy.mens_scraper as s +import cbbpy.womens_scraper as s ``` ## Functions available in CBBpy NOTE: game ID, as far as CBBpy is concernced, is a valid **ESPN** game ID -`ms.get_game_info(game_id: str)` grabs all the metadata (game date, time, score, teams, referees, etc) for a particular game. +`s.get_game_info(game_id: str)` grabs all the metadata (game date, time, score, teams, referees, etc) for a particular game. -`ms.get_game_boxscore(game_id: str)` returns a pandas DataFrame with each player's stats for a particular game. +`s.get_game_boxscore(game_id: str)` returns a pandas DataFrame with each player's stats for a particular game. -`ms.get_game_pbp(game_id: str)` scrapes the play-by-play tables for a game and returns a pandas DataFrame, with each entry representing a play made during the game. +`s.get_game_pbp(game_id: str)` scrapes the play-by-play tables for a game and returns a pandas DataFrame, with each entry representing a play made during the game. -`ms.get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True)` gets *all* information about a game (game info, boxscore, PBP) and returns a tuple of results `(game_info, boxscore, pbp)`. `info, box, pbp` are booleans which users can set to `False` if there is any information they wish not to scrape. For example, `box = False` would return an empty DataFrame for the boxscore info, while scraping PBP and metadata info normally. +`s.get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True)` gets *all* information about a game (game info, boxscore, PBP) and returns a tuple of results `(game_info, boxscore, pbp)`. `info, box, pbp` are booleans which users can set to `False` if there is any information they wish not to scrape. For example, `box = False` would return an empty DataFrame for the boxscore info, while scraping PBP and metadata info normally. -`ms.get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True)` scrapes all game information for all games in a particular season. As an example, to scrape games for the 2020-21 season, call `get_games_season(2021)`. Returns a tuple of 3 DataFrames, similar to `get_game`. See `get_game` for an explanation of booleans `info, box, pbp`. +`s.get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True)` scrapes all game information for all games in a particular season. As an example, to scrape games for the 2020-21 season, call `get_games_season(2021)`. Returns a tuple of 3 DataFrames, similar to `get_game`. See `get_game` for an explanation of booleans `info, box, pbp`. -`ms.get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True)` scrapes all game information for all games between `start_date` and `end_date` (inclusive). As an example, to scrape games between November 30, 2022 and December 10, 2022, call `get_games_season('11-30-2022', '12-10-2022')`. Returns a tuple of 3 DataFrames, similar to `get_game`. See `get_game` for an explanation of booleans `info, box, pbp`. +`s.get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True)` scrapes all game information for all games between `start_date` and `end_date` (inclusive). As an example, to scrape games between November 30, 2022 and December 10, 2022, call `get_games_season('11-30-2022', '12-10-2022')`. Returns a tuple of 3 DataFrames, similar to `get_game`. See `get_game` for an explanation of booleans `info, box, pbp`. -`ms.get_game_ids(date: str)` returns a list of all game IDs for a particular date. +`s.get_game_ids(date: str)` returns a list of all game IDs for a particular date. ## Examples Function call: -`ms.get_game_info('401408636')` +`s.get_game_info('401408636')` Returns: | | game_id | home_team | home_id | home_rank | home_record | home_score | away_team | away_id | away_rank | away_record | away_score | home_win | num_ots | is_conference | is_neutral | is_postseason | tournament | game_day | game_time | game_loc | arena | arena_capacity | attendance | tv_network | referee_1 | referee_2 | referee_3 | @@ -55,7 +57,7 @@ Returns: Function call: -`ms.get_game_boxscore('401408636')` +`s.get_game_boxscore('401408636')` Returns (partially): | | game_id | team | player | player_id | position | starter | min | fgm | fga | 2pm | 2pa | 3pm | 3pa | ftm | fta | oreb | dreb | reb | ast | stl | blk | to | pf | pts | @@ -68,7 +70,7 @@ Returns (partially): Function call: -`ms.get_game_pbp('401408636')` +`s.get_game_pbp('401408636')` Returns (partially): | | game_id | home_team | away_team | play_team | home_score | away_score | half | secs_left_half | secs_left_reg | play_desc | play_type | scoring_play | shooter | is_assisted | assist_player | From e8d2a2bff449ba82ddb6f2454fc27969eb46043b Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 12 Apr 2023 22:50:23 -0500 Subject: [PATCH 17/53] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 4e91b11..b78f841 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 dacowan2 +Copyright (c) 2022 Daniel Cowan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From c8d077df7f6e3d8ea2cfbcd6eee9bb5d13f20442 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:16:57 -0500 Subject: [PATCH 18/53] no boxscore available --- src/cbbpy/mens_scraper.py | 5 +++++ src/cbbpy/womens_scraper.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 18d02fa..89dfb4e 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -224,6 +224,11 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: + if 'No Box Score Available' in soup.text: + _log.warning( + f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) + if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index e5f5ec1..8f0df40 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -224,6 +224,11 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: + if 'No Box Score Available' in soup.text: + _log.warning( + f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) + if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: From 3f22762ba415f3a40beb4463655370b9a3640f39 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 13 Apr 2023 15:46:13 -0500 Subject: [PATCH 19/53] attempts w --- src/cbbpy/womens_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 8f0df40..94eefd9 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -26,7 +26,7 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) -ATTEMPTS = 10 +ATTEMPTS = 20 DATE_PARSES = [ '%Y-%m-%d', '%Y/%m/%d', From dc5b1d555752fa91b7ad947cb6569f38e8e857f1 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:55:38 -0500 Subject: [PATCH 20/53] wbb quarters vs halves --- src/cbbpy/womens_scraper.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 94eefd9..4aa4efd 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -727,7 +727,7 @@ def _get_game_pbp_helper(gamepackage, game_id): home_team = pbp['tms']['home']['displayName'] away_team = pbp['tms']['away']['displayName'] - all_plays = [play for half in pbp['playGrps'] for play in half] + all_plays = [play for quart in pbp['playGrps'] for play in quart] # check if PBP exists if len(all_plays) <= 0: @@ -741,17 +741,20 @@ def _get_game_pbp_helper(gamepackage, game_id): else np.nan for x in all_plays] ascores = [int(x['awayScore']) if 'awayScore' in x.keys() else np.nan for x in all_plays] - halves = [int(x['period']['number']) - if 'period' in x.keys() else np.nan for x in all_plays] + quarters = [int(x['period']['number']) + if 'period' in x.keys() else np.nan for x in all_plays] time_splits = [x['clock']['displayValue'].split(':') if 'clock' in x.keys() else '' for x in all_plays] minutes = [int(x[0]) for x in time_splits] seconds = [int(x[1]) for x in time_splits] min_to_sec = [x*60 for x in minutes] - hf_secs_left = [x+y for x, y in zip(min_to_sec, seconds)] - reg_secs_left = [1200+x if half_num == 1 else x for x, - half_num in zip(hf_secs_left, halves)] + qt_secs_left = [x+y for x, y in zip(min_to_sec, seconds)] + reg_secs_left = [1800+x if qt_num == 1 + else 1200+x if qt_num == 2 + else 600+x if qt_num == 3 + else x + for x, qt_num in zip(qt_secs_left, quarters)] sc_play = [True if 'scoringPlay' in x.keys() else False for x in all_plays] @@ -812,8 +815,8 @@ def _get_game_pbp_helper(gamepackage, game_id): 'play_desc': descs, 'home_score': hscores, 'away_score': ascores, - 'half': halves, - 'secs_left_half': hf_secs_left, + 'quarter': quarters, + 'secs_left_qt': qt_secs_left, 'secs_left_reg': reg_secs_left, 'play_team': teams, 'play_type': p_types, @@ -979,8 +982,7 @@ def _get_game_info_helper(info, more_info, game_id): tournament = more_info['nte'] if 'nte' in more_info.keys() else '' - h_ot, a_ot = len(ht_info['linescores']) - \ - 2, len(at_info['linescores']) - 2 + h_ot, a_ot = len(ht_info['linescores']) - 4, len(at_info['linescores']) - 4 assert h_ot == a_ot num_ots = h_ot From a366ad5318d74603868be0fe5d5a7d124b8bce24 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 14 Apr 2023 12:41:18 -0500 Subject: [PATCH 21/53] fix for no scores available --- src/cbbpy/mens_scraper.py | 11 +++++++---- src/cbbpy/womens_scraper.py | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 89dfb4e..795ec02 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -979,10 +979,13 @@ def _get_game_info_helper(info, more_info, game_id): tournament = more_info['nte'] if 'nte' in more_info.keys() else '' - h_ot, a_ot = len(ht_info['linescores']) - \ - 2, len(at_info['linescores']) - 2 - assert h_ot == a_ot - num_ots = h_ot + if ('linescores' in ht_info) and ('linescores' in at_info): + h_ot, a_ot = len(ht_info['linescores']) - \ + 2, len(at_info['linescores']) - 2 + assert h_ot == a_ot + num_ots = h_ot + else: + num_ots = -1 game_info_list = [ game_id, diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 4aa4efd..a7221e4 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -982,9 +982,13 @@ def _get_game_info_helper(info, more_info, game_id): tournament = more_info['nte'] if 'nte' in more_info.keys() else '' - h_ot, a_ot = len(ht_info['linescores']) - 4, len(at_info['linescores']) - 4 - assert h_ot == a_ot - num_ots = h_ot + if ('linescores' in ht_info) and ('linescores' in at_info): + h_ot, a_ot = len(ht_info['linescores']) - \ + 4, len(at_info['linescores']) - 4 + assert h_ot == a_ot + num_ots = h_ot + else: + num_ots = -1 game_info_list = [ game_id, From 158a4b0f71f06b6fc503fdc200ceb173f7abe6d4 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:07:18 -0500 Subject: [PATCH 22/53] score info warning --- src/cbbpy/mens_scraper.py | 1 + src/cbbpy/womens_scraper.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 795ec02..de44776 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -985,6 +985,7 @@ def _get_game_info_helper(info, more_info, game_id): assert h_ot == a_ot num_ots = h_ot else: + _log.warning(f'"{time.ctime()}": {game_id} - No score info available') num_ots = -1 game_info_list = [ diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index a7221e4..20697b2 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -988,6 +988,7 @@ def _get_game_info_helper(info, more_info, game_id): assert h_ot == a_ot num_ots = h_ot else: + _log.warning(f'"{time.ctime()}": {game_id} - No score info available') num_ots = -1 game_info_list = [ From 0f8dcb298cdb12c27f9374a4e26cf5858c49a20c Mon Sep 17 00:00:00 2001 From: Daniel Cowan <56355242+dcstats@users.noreply.github.com> Date: Sat, 15 Apr 2023 20:26:49 -0500 Subject: [PATCH 23/53] Update README.md --- README.md | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 2e2fa55..1792f0c 100644 --- a/README.md +++ b/README.md @@ -48,38 +48,47 @@ NOTE: game ID, as far as CBBpy is concernced, is a valid **ESPN** game ID Function call: -`s.get_game_info('401408636')` +``` +import cbbpy.mens_scraper as s +s.get_game_info('401522202') +``` Returns: -| | game_id | home_team | home_id | home_rank | home_record | home_score | away_team | away_id | away_rank | away_record | away_score | home_win | num_ots | is_conference | is_neutral | is_postseason | tournament | game_day | game_time | game_loc | arena | arena_capacity | attendance | tv_network | referee_1 | referee_2 | referee_3 | -|---:|----------:|:----------------|----------:|------------:|:--------------|-------------:|:-------------------------|----------:|------------:|:--------------|-------------:|:-----------|----------:|:----------------|:-------------|:----------------|:------------------------------------------------------|:---------------|:-------------|:----------------|:------------------|-----------------:|:-------------|:-------------|:------------|:--------------|:--------------| -| 0 | 401408636 | Kansas Jayhawks | 2305 | 1 | 34-6 | 72 | North Carolina Tar Heels | 153 | 8 | 29-10 | 69 | True | 0 | False | True | True | Men's Basketball Championship - National Championship | April 04, 2022 | 06:20 PM PDT | New Orleans, LA | Caesars Superdome | nan | 69,423 | TBS | Ron Groover | Terry Oglesby | Jeff Anderson | +| | game_id | home_team | home_id | home_rank | home_record | home_score | away_team | away_id | away_rank | away_record | away_score | home_win | num_ots | is_conference | is_neutral | is_postseason | tournament | game_day | game_time | game_loc | arena | arena_capacity | attendance | tv_network | referee_1 | referee_2 | referee_3 | +|---:|----------:|:--------------|----------:|------------:|:--------------|-------------:|:-----------------------|----------:|------------:|:--------------|-------------:|:-----------|----------:|:----------------|:-------------|:----------------|:------------------------------------------------------|:---------------|:-------------|:------------|:------------|-----------------:|-------------:|:-------------|:------------|:--------------|:-------------| +| 0 | 401522202 | UConn Huskies | 41 | 4 | 31-8 | 76 | San Diego State Aztecs | 21 | 5 | 32-7 | 59 | True | 0 | False | True | True | Men's Basketball Championship - National Championship | April 03, 2023 | 06:20 PM PDT | Houston, TX | NRG Stadium | 0 | 72423 | CBS | Ron Groover | Terry Oglesby | Keith Kimble | Function call: -`s.get_game_boxscore('401408636')` +``` +import cbbpy.womens_scraper as s +s.get_game_boxscore('401408636') +``` Returns (partially): -| | game_id | team | player | player_id | position | starter | min | fgm | fga | 2pm | 2pa | 3pm | 3pa | ftm | fta | oreb | dreb | reb | ast | stl | blk | to | pf | pts | -|---:|----------:|:----------------|:-------------|------------:|:-----------|:----------|------:|------:|------:|------:|------:|------:|------:|------:|------:|-------:|-------:|------:|------:|------:|------:|-----:|-----:|------:| -| 0 | 401408636 | Kansas Jayhawks | J. Wilson | 4431714 | F | True | 34 | 5 | 13 | 4 | 8 | 1 | 5 | 4 | 4 | 1 | 3 | 4 | 2 | 0 | 1 | 0 | 1 | 15 | -| 1 | 401408636 | Kansas Jayhawks | D. McCormack | 4397019 | F | True | 29 | 7 | 15 | 7 | 15 | 0 | 0 | 1 | 2 | 3 | 7 | 10 | 0 | 1 | 1 | 1 | 4 | 15 | -| 2 | 401408636 | Kansas Jayhawks | D. Harris | 4431983 | G | True | 27 | 1 | 5 | 1 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 3 | 1 | 4 | 0 | 2 | -| 3 | 401408636 | Kansas Jayhawks | C. Braun | 4431767 | G | True | 40 | 6 | 14 | 6 | 13 | 0 | 1 | 0 | 0 | 1 | 11 | 12 | 3 | 0 | 0 | 1 | 3 | 12 | -| 4 | 401408636 | Kansas Jayhawks | O. Agbaji | 4397018 | G | True | 37 | 4 | 9 | 3 | 5 | 1 | 4 | 3 | 8 | 1 | 2 | 3 | 1 | 1 | 1 | 2 | 1 | 12 | +| | game_id | team | player | player_id | position | starter | min | fgm | fga | 2pm | 2pa | 3pm | 3pa | ftm | fta | oreb | dreb | reb | ast | stl | blk | to | pf | pts | +|---:|----------:|:-----------|:------------|------------:|:-----------|:----------|------:|------:|------:|------:|------:|------:|------:|------:|------:|-------:|-------:|------:|------:|------:|------:|-----:|-----:|------:| +| 0 | 401528028 | LSU Tigers | A. Reese | 4433402 | F | True | 29 | 5 | 12 | 5 | 12 | 0 | 0 | 5 | 8 | 6 | 4 | 10 | 5 | 3 | 1 | 0 | 3 | 15 | +| 1 | 401528028 | LSU Tigers | L. Williams | 4280886 | F | True | 37 | 9 | 16 | 9 | 16 | 0 | 0 | 2 | 2 | 1 | 4 | 5 | 0 | 3 | 0 | 3 | 4 | 20 | +| 2 | 401528028 | LSU Tigers | F. Johnson | 4698736 | G | True | 37 | 4 | 11 | 3 | 7 | 1 | 4 | 1 | 1 | 2 | 5 | 7 | 4 | 1 | 0 | 4 | 1 | 10 | +| 3 | 401528028 | LSU Tigers | K. Poole | 4433418 | G | True | 24 | 2 | 3 | 0 | 1 | 2 | 2 | 0 | 2 | 0 | 3 | 3 | 1 | 0 | 1 | 1 | 2 | 6 | +| 4 | 401528028 | LSU Tigers | A. Morris | 4281251 | G | True | 33 | 8 | 14 | 7 | 11 | 1 | 3 | 4 | 4 | 1 | 1 | 2 | 9 | 1 | 0 | 2 | 3 | 21 | Function call: -`s.get_game_pbp('401408636')` +``` +import cbbpy.mens_scraper as s +s.get_game_pbp('401522202') +``` Returns (partially): -| | game_id | home_team | away_team | play_team | home_score | away_score | half | secs_left_half | secs_left_reg | play_desc | play_type | scoring_play | shooter | is_assisted | assist_player | -|---:|----------:|:----------------|:-------------------------|:-------------------------|-------------:|-------------:|-------:|-----------------:|----------------:|:-------------------------------------------------------------------|:------------|:---------------|:----------------|:--------------|:-----------------| -| 0 | 401408636 | Kansas Jayhawks | North Carolina Tar Heels | Kansas Jayhawks | 0 | 0 | 1 | 1200 | 2400 | Jump Ball won by Kansas | jump ball | False | | False | | -| 1 | 401408636 | Kansas Jayhawks | North Carolina Tar Heels | Kansas Jayhawks | 3 | 0 | 1 | 1179 | 2379 | Ochai Agbaji made Three Point Jumper. Assisted by Christian Braun. | jumper | True | Ochai Agbaji | True | Christian Braun | -| 2 | 401408636 | Kansas Jayhawks | North Carolina Tar Heels | North Carolina Tar Heels | 3 | 0 | 1 | 1161 | 2361 | Armando Bacot missed Jumper. | jumper | False | | False | | -| 3 | 401408636 | Kansas Jayhawks | North Carolina Tar Heels | Kansas Jayhawks | 3 | 0 | 1 | 1161 | 2361 | Christian Braun Defensive Rebound. | rebound | False | | False | | -| 4 | 401408636 | Kansas Jayhawks | North Carolina Tar Heels | Kansas Jayhawks | 5 | 0 | 1 | 1144 | 2344 | David McCormack made Jumper. Assisted by Dajuan Harris Jr.. | jumper | True | David McCormack | True | Dajuan Harris Jr | +| | game_id | home_team | away_team | play_desc | home_score | away_score | half | secs_left_half | secs_left_reg | play_team | play_type | shooting_play | scoring_play | is_three | shooter | is_assisted | assist_player | shot_x | shot_y | +|---:|----------:|:--------------|:-----------------------|:----------------------------------------------------------------------|-------------:|-------------:|-------:|-----------------:|----------------:|:-----------------------|:-------------------|:----------------|:---------------|:-----------|:-----------------|:--------------|:----------------|---------:|---------:| +| 0 | 401522202 | UConn Huskies | San Diego State Aztecs | Jump Ball won by UConn | 0 | 0 | 1 | 1200 | 2400 | UConn Huskies | jump ball | False | False | False | | False | | nan | nan | +| 1 | 401522202 | UConn Huskies | San Diego State Aztecs | Jordan Hawkins made Jumper. Assisted by Adama Sanogo. | 2 | 0 | 1 | 1174 | 2374 | UConn Huskies | jumper | True | True | False | Jordan Hawkins | True | Adama Sanogo | 18 | 15 | +| 2 | 401522202 | UConn Huskies | San Diego State Aztecs | Lamont Butler made Three Point Jumper. Assisted by Matt Bradley. | 2 | 3 | 1 | 1152 | 2352 | San Diego State Aztecs | three point jumper | True | True | True | Lamont Butler | True | Matt Bradley | 39 | 22 | +| 3 | 401522202 | UConn Huskies | San Diego State Aztecs | Tristen Newton Turnover. | 2 | 3 | 1 | 1130 | 2330 | UConn Huskies | turnover | False | False | False | | False | | nan | nan | +| 4 | 401522202 | UConn Huskies | San Diego State Aztecs | Darrion Trammell made Three Point Jumper. Assisted by Keshad Johnson. | 2 | 6 | 1 | 1108 | 2308 | San Diego State Aztecs | three point jumper | True | True | True | Darrion Trammell | True | Keshad Johnson | 1 | 0 | ## Contact Feel free to reach out to me directly with any questions, requests, or suggestions at . From 7ddb1e43bc1258cf39699e4f6b41eaea4ff178ae Mon Sep 17 00:00:00 2001 From: Daniel Cowan <56355242+dcstats@users.noreply.github.com> Date: Sat, 15 Apr 2023 20:54:28 -0500 Subject: [PATCH 24/53] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1792f0c..cc80018 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ NOTE: game ID, as far as CBBpy is concernced, is a valid **ESPN** game ID `s.get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True)` scrapes all game information for all games between `start_date` and `end_date` (inclusive). As an example, to scrape games between November 30, 2022 and December 10, 2022, call `get_games_season('11-30-2022', '12-10-2022')`. Returns a tuple of 3 DataFrames, similar to `get_game`. See `get_game` for an explanation of booleans `info, box, pbp`. -`s.get_game_ids(date: str)` returns a list of all game IDs for a particular date. +`s.get_game_ids(date: Union[str, datetime])` returns a list of all game IDs for a particular date. ## Examples From ec0d8724eaf74bc7f92d0f81be49a953b003dd55 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 21 Apr 2023 16:48:25 -0500 Subject: [PATCH 25/53] joblib --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7aaa559..87655f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ 'pytz>=2022.1', 'tqdm>=4.63.0', 'lxml>=4.9.0', - 'joblib>=1.1.0', + 'joblib>=1.2.0', ] requires-python = ">=3.7" From 7f171246e08b635e2bf7ef8cb0c138393c2d9ee1 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 21 Apr 2023 16:50:11 -0500 Subject: [PATCH 26/53] Update pyproject.toml --- pyproject.toml | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 87655f0..b4dc0c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,16 +15,27 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", ] -keywords = ["college", "basketball", "scraper"] +keywords = [ + "college", + "basketball", + "scraper", + "scraping", + "web scraper", + "data", + "espn", + "analysis", + "science", + "analytics", +] dependencies = [ - 'pandas>=1.4.2', - 'numpy>=1.22.3', - 'python-dateutil>=2.8.2', - 'pytz>=2022.1', - 'tqdm>=4.63.0', - 'lxml>=4.9.0', - 'joblib>=1.2.0', - ] + 'pandas>=1.4.2', + 'numpy>=1.22.3', + 'python-dateutil>=2.8.2', + 'pytz>=2022.1', + 'tqdm>=4.63.0', + 'lxml>=4.9.0', + 'joblib>=1.2.0', +] requires-python = ">=3.7" [tool.setuptools.dynamic] From 04fd06f64e9f0e859c6312ff4a4cc0b8b5313695 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 29 Apr 2023 14:55:45 -0400 Subject: [PATCH 27/53] scrape status added flag to indicate if page not found; if not, box and pbp info will not be attempted --- src/cbbpy/mens_scraper.py | 44 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index de44776..e106977 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -112,18 +112,22 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - if info: - game_info_df = get_game_info(game_id) + # scrape status will check if the page exists + # if it doesn't exist, don't run the other scrape functions to save time + scrape_status = True + + if info and scrape_status: + scrape_status, game_info_df = get_game_info(game_id) else: game_info_df = pd.DataFrame([]) - if box: - boxscore_df = get_game_boxscore(game_id) + if box and scrape_status: + scrape_status, boxscore_df = get_game_boxscore(game_id) else: boxscore_df = pd.DataFrame([]) - if pbp: - pbp_df = get_game_pbp(game_id) + if pbp and scrape_status: + scrape_status, pbp_df = get_game_pbp(game_id) else: pbp_df = pd.DataFrame([]) @@ -197,6 +201,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: Returns - the game boxscore as a DataFrame """ + scrape_status = True for i in range(ATTEMPTS): try: @@ -217,7 +222,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) boxscore = gamepackage['bxscr'] @@ -227,20 +232,21 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'No Box Score Available' in soup.text: _log.warning( f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + scrape_status = False elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) else: # try again time.sleep(2) @@ -249,7 +255,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # no exception thrown break - return df + return (scrape_status, df) def get_game_pbp(game_id: str) -> pd.DataFrame: @@ -261,6 +267,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: Returns - the game's play-by-play information represented as a DataFrame """ + scrape_status = True for i in range(ATTEMPTS): try: @@ -281,7 +288,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) # num_halves = len(pbp['playGrps']) @@ -300,13 +307,14 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + scrape_status = False elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) else: # try again time.sleep(2) @@ -315,7 +323,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # no exception thrown break - return df + return (scrape_status, df) def get_game_info(game_id: str) -> pd.DataFrame: @@ -327,6 +335,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: Returns - a DataFrame with one row and a column for each piece of metadata """ + scrape_status = True for i in range(ATTEMPTS): try: @@ -348,7 +357,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) # get general game info info = gamepackage['gmInfo'] @@ -359,18 +368,19 @@ def get_game_info(game_id: str) -> pd.DataFrame: df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: + # max number of attempts reached, so return blank df if i+1 == ATTEMPTS: - # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + scrape_status = False elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return pd.DataFrame([]) + return (scrape_status, pd.DataFrame([])) else: # try again time.sleep(2) @@ -379,7 +389,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: # no exception thrown break - return df + return (scrape_status, df) def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: From 166bb31961cbceaf2536b78d622050066234150c Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 3 May 2023 22:00:31 -0500 Subject: [PATCH 28/53] page not found tweak --- src/cbbpy/mens_scraper.py | 69 ++++++++++++++++--------------------- src/cbbpy/womens_scraper.py | 30 +++++++++------- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index e106977..940e089 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -9,7 +9,7 @@ import requests as r import pandas as pd import numpy as np -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from dateutil.parser import parse from pytz import timezone as tz from tqdm import trange @@ -26,6 +26,10 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) +# PNF will check if the page exists +# if it doesn't exist, don't run the other scrape functions to save time +PNF = False + ATTEMPTS = 20 DATE_PARSES = [ '%Y-%m-%d', @@ -112,25 +116,24 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - # scrape status will check if the page exists - # if it doesn't exist, don't run the other scrape functions to save time - scrape_status = True - - if info and scrape_status: - scrape_status, game_info_df = get_game_info(game_id) + if info and not PNF: + game_info_df = get_game_info(game_id) else: game_info_df = pd.DataFrame([]) - if box and scrape_status: - scrape_status, boxscore_df = get_game_boxscore(game_id) + if box and not PNF: + boxscore_df = get_game_boxscore(game_id) else: boxscore_df = pd.DataFrame([]) - if pbp and scrape_status: - scrape_status, pbp_df = get_game_pbp(game_id) + if pbp and not PNF: + pbp_df = get_game_pbp(game_id) else: pbp_df = pd.DataFrame([]) + global PNF + PNF = False + return (game_info_df, boxscore_df, pbp_df) @@ -201,8 +204,6 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: Returns - the game boxscore as a DataFrame """ - scrape_status = True - for i in range(ATTEMPTS): try: header = { @@ -222,7 +223,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) boxscore = gamepackage['bxscr'] @@ -232,21 +233,22 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'No Box Score Available' in soup.text: _log.warning( f'"{time.ctime()}": {game_id} - No boxscore available') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - scrape_status = False + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) else: # try again time.sleep(2) @@ -255,7 +257,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # no exception thrown break - return (scrape_status, df) + return df def get_game_pbp(game_id: str) -> pd.DataFrame: @@ -267,8 +269,6 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: Returns - the game's play-by-play information represented as a DataFrame """ - scrape_status = True - for i in range(ATTEMPTS): try: header = { @@ -288,16 +288,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return (scrape_status, pd.DataFrame([])) - - # num_halves = len(pbp['playGrps']) - - # if num_halves == 2: - # tot_seconds_in_game = (num_halves*20*60) - # else: - # tot_seconds_in_game = (2*20*60) + ((num_halves-2)*5*60) - - # pbp = gamepackage['pbp'] + return pd.DataFrame([]) df = _get_game_pbp_helper(gamepackage, game_id) @@ -307,14 +298,15 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - scrape_status = False + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) else: # try again time.sleep(2) @@ -323,7 +315,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # no exception thrown break - return (scrape_status, df) + return df def get_game_info(game_id: str) -> pd.DataFrame: @@ -335,8 +327,6 @@ def get_game_info(game_id: str) -> pd.DataFrame: Returns - a DataFrame with one row and a column for each piece of metadata """ - scrape_status = True - for i in range(ATTEMPTS): try: header = { @@ -357,7 +347,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) # get general game info info = gamepackage['gmInfo'] @@ -373,14 +363,15 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - scrape_status = False + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') else: _log.error( f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') - return (scrape_status, pd.DataFrame([])) + return pd.DataFrame([]) else: # try again time.sleep(2) @@ -389,7 +380,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: # no exception thrown break - return (scrape_status, df) + return df def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 20697b2..f0d979b 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -9,7 +9,7 @@ import requests as r import pandas as pd import numpy as np -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from dateutil.parser import parse from pytz import timezone as tz from tqdm import trange @@ -26,6 +26,10 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) +# PNF will check if the page exists +# if it doesn't exist, don't run the other scrape functions to save time +PNF = False + ATTEMPTS = 20 DATE_PARSES = [ '%Y-%m-%d', @@ -112,21 +116,24 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - if info: + if info and not PNF: game_info_df = get_game_info(game_id) else: game_info_df = pd.DataFrame([]) - if box: + if box and not PNF: boxscore_df = get_game_boxscore(game_id) else: boxscore_df = pd.DataFrame([]) - if pbp: + if pbp and not PNF: pbp_df = get_game_pbp(game_id) else: pbp_df = pd.DataFrame([]) + global PNF + PNF = False + return (game_info_df, boxscore_df, pbp_df) @@ -234,6 +241,8 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -283,15 +292,6 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) - # num_halves = len(pbp['playGrps']) - - # if num_halves == 2: - # tot_seconds_in_game = (num_halves*20*60) - # else: - # tot_seconds_in_game = (2*20*60) + ((num_halves-2)*5*60) - - # pbp = gamepackage['pbp'] - df = _get_game_pbp_helper(gamepackage, game_id) except Exception as ex: @@ -300,6 +300,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -364,6 +366,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') + global PNF + PNF = True elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') From 8a58c2532d2ea3efba8d6e4c8e75c707df426bb6 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 3 May 2023 22:36:52 -0500 Subject: [PATCH 29/53] 15 attempts --- src/cbbpy/mens_scraper.py | 2 +- src/cbbpy/womens_scraper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 940e089..900571e 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -30,7 +30,7 @@ # if it doesn't exist, don't run the other scrape functions to save time PNF = False -ATTEMPTS = 20 +ATTEMPTS = 15 DATE_PARSES = [ '%Y-%m-%d', '%Y/%m/%d', diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index f0d979b..530725b 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -30,7 +30,7 @@ # if it doesn't exist, don't run the other scrape functions to save time PNF = False -ATTEMPTS = 20 +ATTEMPTS = 15 DATE_PARSES = [ '%Y-%m-%d', '%Y/%m/%d', From 825b4ed767fffd99a08138909d5a08cc8c1c8ed5 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 4 May 2023 21:45:28 -0500 Subject: [PATCH 30/53] global tweak --- src/cbbpy/mens_scraper.py | 3 ++- src/cbbpy/womens_scraper.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 900571e..9535447 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -116,6 +116,8 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ + global PNF + if info and not PNF: game_info_df = get_game_info(game_id) else: @@ -131,7 +133,6 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True else: pbp_df = pd.DataFrame([]) - global PNF PNF = False return (game_info_df, boxscore_df, pbp_df) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 530725b..5e0c30b 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -116,6 +116,8 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ + global PNF + if info and not PNF: game_info_df = get_game_info(game_id) else: @@ -131,7 +133,6 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True else: pbp_df = pd.DataFrame([]) - global PNF PNF = False return (game_info_df, boxscore_df, pbp_df) From 88c88d0e0db7956fc772c037bb9d879549447d1b Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 6 May 2023 13:25:02 -0500 Subject: [PATCH 31/53] pnf list --- src/cbbpy/mens_scraper.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 9535447..1088a75 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -26,10 +26,6 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) -# PNF will check if the page exists -# if it doesn't exist, don't run the other scrape functions to save time -PNF = False - ATTEMPTS = 15 DATE_PARSES = [ '%Y-%m-%d', @@ -104,6 +100,11 @@ class InvalidDateRangeError(Exception): pass +# pnf_ will keep track of games w/ page not found errors +# if game has error, don't run the other scrape functions to save time +pnf_ = [] + + def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play). @@ -116,25 +117,21 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - global PNF - - if info and not PNF: + if info and not game_id in pnf_: game_info_df = get_game_info(game_id) else: game_info_df = pd.DataFrame([]) - if box and not PNF: + if box and not game_id in pnf_: boxscore_df = get_game_boxscore(game_id) else: boxscore_df = pd.DataFrame([]) - if pbp and not PNF: + if pbp and not game_id in pnf_: pbp_df = get_game_pbp(game_id) else: pbp_df = pd.DataFrame([]) - PNF = False - return (game_info_df, boxscore_df, pbp_df) @@ -241,8 +238,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -299,8 +295,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -364,8 +359,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') From b728f78db422e27ff4cac946147ef59858d506ab Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 6 May 2023 13:29:57 -0500 Subject: [PATCH 32/53] w pnf --- src/cbbpy/mens_scraper.py | 1 + src/cbbpy/womens_scraper.py | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 1088a75..7545012 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -26,6 +26,7 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) + ATTEMPTS = 15 DATE_PARSES = [ '%Y-%m-%d', diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 5e0c30b..6884333 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -26,9 +26,6 @@ logging.basicConfig(filename='cbbpy.log') _log = logging.getLogger(__name__) -# PNF will check if the page exists -# if it doesn't exist, don't run the other scrape functions to save time -PNF = False ATTEMPTS = 15 DATE_PARSES = [ @@ -104,6 +101,11 @@ class InvalidDateRangeError(Exception): pass +# pnf_ will keep track of games w/ page not found errors +# if game has error, don't run the other scrape functions to save time +pnf_ = [] + + def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play). @@ -116,25 +118,21 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - global PNF - - if info and not PNF: + if info and not game_id in pnf_: game_info_df = get_game_info(game_id) else: game_info_df = pd.DataFrame([]) - if box and not PNF: + if box and not game_id in pnf_: boxscore_df = get_game_boxscore(game_id) else: boxscore_df = pd.DataFrame([]) - if pbp and not PNF: + if pbp and not game_id in pnf_: pbp_df = get_game_pbp(game_id) else: pbp_df = pd.DataFrame([]) - PNF = False - return (game_info_df, boxscore_df, pbp_df) @@ -242,8 +240,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -301,8 +298,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') @@ -367,8 +363,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page not found error') - global PNF - PNF = True + pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Page error') From 3c29a13bb3be4d150739d1859e754a4d15fa5ce3 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sun, 7 May 2023 20:40:28 -0500 Subject: [PATCH 33/53] Update pyproject.toml --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b4dc0c3..1aeb574 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,11 @@ keywords = [ "analysis", "science", "analytics", + "cbb", + "cbbpy", + "ncaa", + "ncaam", + "ncaaw", ] dependencies = [ 'pandas>=1.4.2', From af0780d332ad74b3038011392aeb13d40073afc0 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 8 May 2023 19:53:40 -0500 Subject: [PATCH 34/53] single day game range --- src/cbbpy/mens_scraper.py | 3 +++ src/cbbpy/womens_scraper.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 7545012..c079641 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -181,6 +181,9 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool else: t.set_description(f"No games on {date.strftime('%D')}") + if not len(all_data) > 0: + return () + game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( drop=True ) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 6884333..f1c9e8e 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -181,6 +181,9 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool else: t.set_description(f"No games on {date.strftime('%D')}") + if not len(all_data) > 0: + return () + game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( drop=True ) From 240404440bf520f1c9aa6582e39ab7fb0d06dcbf Mon Sep 17 00:00:00 2001 From: Daniel Cowan <56355242+dcstats@users.noreply.github.com> Date: Sun, 20 Aug 2023 21:23:18 -0500 Subject: [PATCH 35/53] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cc80018..1d392ed 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![PyPi Version](https://img.shields.io/pypi/v/cbbpy.svg)](https://pypi.org/project/cbbpy/) [![Downloads](https://pepy.tech/badge/cbbpy)](https://pepy.tech/project/cbbpy) +[![PyPi Version](https://img.shields.io/pypi/v/cbbpy.svg)](https://pypi.org/project/cbbpy/) [![Downloads](https://static.pepy.tech/badge/cbbpy)](https://pepy.tech/project/cbbpy) # CBBpy: A Python-based web scraper for NCAA basketball From 576666a5d7aaee16ddb63572c40752a355073bc6 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 19 Oct 2023 00:47:24 -0500 Subject: [PATCH 36/53] PBP fix #28 --- src/cbbpy/mens_scraper.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index c079641..ed5b88f 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -91,6 +91,7 @@ 'Layup', 'Dunk' ] +WINDOW_STRING = "window[\'__espnfitt__\']=" class CouldNotParseError(Exception): @@ -279,8 +280,17 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: url = PBP_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + script_string = _find_json_in_content(soup) + + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] @@ -1049,3 +1059,12 @@ def _get_game_info_helper(info, more_info, game_id): ] return pd.DataFrame([game_info_list], columns=game_info_cols) + + +def _find_json_in_content(soup): + script_string = '' + for x in soup.find_all('script'): + if WINDOW_STRING in x.text: + script_string = x.text + break + return script_string From 75c3cdeea8d85c67d0636ce561d829c54313834d Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 19 Oct 2023 00:59:11 -0500 Subject: [PATCH 37/53] wm PBP fix #28 --- src/cbbpy/womens_scraper.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index f1c9e8e..c85a565 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -91,6 +91,7 @@ 'Layup', 'Dunk' ] +WINDOW_STRING = "window[\'__espnfitt__\']=" class CouldNotParseError(Exception): @@ -281,8 +282,17 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: url = PBP_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + script_string = _find_json_in_content(soup) + + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] @@ -1055,3 +1065,12 @@ def _get_game_info_helper(info, more_info, game_id): ] return pd.DataFrame([game_info_list], columns=game_info_cols) + + +def _find_json_in_content(soup): + script_string = '' + for x in soup.find_all('script'): + if WINDOW_STRING in x.text: + script_string = x.text + break + return script_string From c7bcc0ee4b5b32cd2d4213ccc5bfb7ab0d1b1eb2 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 19 Oct 2023 01:04:54 -0500 Subject: [PATCH 38/53] fix info #28 --- src/cbbpy/mens_scraper.py | 12 ++++++++++-- src/cbbpy/womens_scraper.py | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index ed5b88f..939fec0 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -346,9 +346,17 @@ def get_game_info(game_id: str) -> pd.DataFrame: url = GAME_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") + script_string = _find_json_in_content(soup) - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index c85a565..0f52e22 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -349,9 +349,17 @@ def get_game_info(game_id: str) -> pd.DataFrame: url = GAME_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") + script_string = _find_json_in_content(soup) - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] From 0bd98d42e365b42c0e7dd3c6b0d60907c10683ae Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 19 Oct 2023 11:29:06 -0500 Subject: [PATCH 39/53] boxscore fix #28 --- src/cbbpy/mens_scraper.py | 13 +++++++++++-- src/cbbpy/womens_scraper.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 939fec0..8061423 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -216,8 +216,17 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + script_string = _find_json_in_content(soup) + + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 0f52e22..739ede7 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -217,8 +217,17 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] + script_string = _find_json_in_content(soup) + + if script_string == '': + _log.warning( + f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + return pd.DataFrame([]) + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' jsn = json.loads(js) gamepackage = jsn['page']['content']['gamepackage'] From fcbdb6ad7f7ece3652c8b868c3a927b037ad2ec2 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 20 Oct 2023 22:02:12 -0500 Subject: [PATCH 40/53] gamepackage helpers --- src/cbbpy/mens_scraper.py | 50 ++++++++++++++++--------------------- src/cbbpy/womens_scraper.py | 16 ++++++++++++ 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 8061423..822710e 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -216,20 +216,12 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) - - if script_string == '': + gamepackage = _get_gamepackage_from_soup(soup) + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -289,20 +281,13 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: url = PBP_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) + gamepackage = _get_gamepackage_from_soup(soup) - if script_string == '': + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -355,20 +340,13 @@ def get_game_info(game_id: str) -> pd.DataFrame: url = GAME_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) + gamepackage = _get_gamepackage_from_soup(soup) - if script_string == '': + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -1078,6 +1056,22 @@ def _get_game_info_helper(info, more_info, game_id): return pd.DataFrame([game_info_list], columns=game_info_cols) +def _get_gamepackage_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == '': + return None + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' + jsn = json.loads(js) + gamepackage = jsn['page']['content']['gamepackage'] + + return gamepackage + + def _find_json_in_content(soup): script_string = '' for x in soup.find_all('script'): diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 739ede7..5b62504 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -1084,6 +1084,22 @@ def _get_game_info_helper(info, more_info, game_id): return pd.DataFrame([game_info_list], columns=game_info_cols) +def _get_gamepackage_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == '': + return None + + regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" + pattern = re.compile(regex_match) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' + jsn = json.loads(js) + gamepackage = jsn['page']['content']['gamepackage'] + + return gamepackage + + def _find_json_in_content(soup): script_string = '' for x in soup.find_all('script'): From d090c5e3f9d1b494b4f60dbc75cebc31d72b2998 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Fri, 20 Oct 2023 23:49:06 -0500 Subject: [PATCH 41/53] use helper --- src/cbbpy/womens_scraper.py | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 5b62504..dc0c279 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -217,20 +217,13 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) + gamepackage = _get_gamepackage_from_soup(soup) - if script_string == '': + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -291,20 +284,13 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: url = PBP_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) + gamepackage = _get_gamepackage_from_soup(soup) - if script_string == '': + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -358,20 +344,13 @@ def get_game_info(game_id: str) -> pd.DataFrame: url = GAME_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - script_string = _find_json_in_content(soup) - - if script_string == '': + gamepackage = _get_gamepackage_from_soup(soup) + + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') return pd.DataFrame([]) - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) - found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' - jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') From 824c6c2bfbbecca0793a525efe933e04d3c39068 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 21 Oct 2023 00:04:21 -0500 Subject: [PATCH 42/53] future date checks --- src/cbbpy/mens_scraper.py | 9 +++++++++ src/cbbpy/womens_scraper.py | 10 +++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 822710e..cee1b9b 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -163,6 +163,14 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool if len_scrape < 1: raise InvalidDateRangeError( "The start date must be sooner than the end date.") + + if sd > datetime.today(): + raise InvalidDateRangeError( + "The start date must not be in the future.") + + if ed > datetime.today(): + raise InvalidDateRangeError( + "The end date must not be in the future.") bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' @@ -217,6 +225,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: page = r.get(url, headers=header) soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index dc0c279..8582b6f 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -163,6 +163,14 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool if len_scrape < 1: raise InvalidDateRangeError( "The start date must be sooner than the end date.") + + if sd > datetime.today(): + raise InvalidDateRangeError( + "The start date must not be in the future.") + + if ed > datetime.today(): + raise InvalidDateRangeError( + "The end date must not be in the future.") bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' @@ -345,7 +353,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: page = r.get(url, headers=header) soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - + if not gamepackage: _log.warning( f'"{time.ctime()}": {game_id} - Game JSON not found on page.') From 637380a50d406fb104a944f6036a122435ef14d1 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 21 Oct 2023 00:10:39 -0500 Subject: [PATCH 43/53] game ids fix #28 --- src/cbbpy/mens_scraper.py | 29 +++++++++++++++++++++++------ src/cbbpy/womens_scraper.py | 29 +++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index cee1b9b..0d49041 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -92,6 +92,7 @@ 'Dunk' ] WINDOW_STRING = "window[\'__espnfitt__\']=" +JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" class CouldNotParseError(Exception): @@ -441,11 +442,13 @@ def get_game_ids(date: Union[str, datetime]) -> list: url = SCOREBOARD_URL.format(d) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] - jsn = json.loads(js) + scoreboard = _get_scoreboard_from_soup(soup) - scoreboard = jsn['page']['content']['scoreboard']['evts'] + if not scoreboard: + _log.warning( + f'"{time.ctime()}": {date} - JSON not found on page.') + return pd.DataFrame([]) + ids = [x['id'] for x in scoreboard] except Exception as ex: @@ -1071,8 +1074,7 @@ def _get_gamepackage_from_soup(soup): if script_string == '': return None - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) + pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) js = '{' + found + '}' jsn = json.loads(js) @@ -1081,6 +1083,21 @@ def _get_gamepackage_from_soup(soup): return gamepackage +def _get_scoreboard_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == '': + return None + + pattern = re.compile(JSON_REGEX) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' + jsn = json.loads(js) + scoreboard = jsn['page']['content']['scoreboard']['evts'] + + return scoreboard + + def _find_json_in_content(soup): script_string = '' for x in soup.find_all('script'): diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 8582b6f..905782b 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -92,6 +92,7 @@ 'Dunk' ] WINDOW_STRING = "window[\'__espnfitt__\']=" +JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" class CouldNotParseError(Exception): @@ -444,11 +445,13 @@ def get_game_ids(date: Union[str, datetime]) -> list: url = SCOREBOARD_URL.format(d) page = r.get(url, headers=header) soup = bs(page.content, "lxml") - js = soup.find_all('script')[3].text - js = js.replace("window[\'__espnfitt__\']=", '')[:-1] - jsn = json.loads(js) + scoreboard = _get_scoreboard_from_soup(soup) - scoreboard = jsn['page']['content']['scoreboard']['evts'] + if not scoreboard: + _log.warning( + f'"{time.ctime()}": {date} - JSON not found on page.') + return pd.DataFrame([]) + ids = [x['id'] for x in scoreboard] except Exception as ex: @@ -1077,8 +1080,7 @@ def _get_gamepackage_from_soup(soup): if script_string == '': return None - regex_match = r"window\[\'__espnfitt__\'\]={(.*)};" - pattern = re.compile(regex_match) + pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) js = '{' + found + '}' jsn = json.loads(js) @@ -1087,6 +1089,21 @@ def _get_gamepackage_from_soup(soup): return gamepackage +def _get_scoreboard_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == '': + return None + + pattern = re.compile(JSON_REGEX) + found = re.search(pattern, script_string).group(1) + js = '{' + found + '}' + jsn = json.loads(js) + scoreboard = jsn['page']['content']['scoreboard']['evts'] + + return scoreboard + + def _find_json_in_content(soup): script_string = '' for x in soup.find_all('script'): From 501c639c2763adbd71c241639f087033f66f7c7e Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 21 Oct 2023 00:37:32 -0500 Subject: [PATCH 44/53] 'if not', logging --- src/cbbpy/mens_scraper.py | 14 +++++++------- src/cbbpy/womens_scraper.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 0d49041..b4f6c1f 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -227,9 +227,9 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -293,9 +293,9 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -352,9 +352,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -444,7 +444,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - if not scoreboard: + if scoreboard is None: _log.warning( f'"{time.ctime()}": {date} - JSON not found on page.') return pd.DataFrame([]) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 905782b..fa7f9cf 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -228,9 +228,9 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -295,9 +295,9 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -355,9 +355,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if not gamepackage: + if gamepackage is None: _log.warning( - f'"{time.ctime()}": {game_id} - Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') return pd.DataFrame([]) # check if game was postponed @@ -447,7 +447,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - if not scoreboard: + if scoreboard is None: _log.warning( f'"{time.ctime()}": {date} - JSON not found on page.') return pd.DataFrame([]) From 4ccc655e1b4c95cfca27ba7d125b82ff76be3078 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sun, 22 Oct 2023 21:46:42 -0500 Subject: [PATCH 45/53] error for none scoreboard/gamepackage --- src/cbbpy/mens_scraper.py | 58 ++++++++++++++-------------------- src/cbbpy/womens_scraper.py | 63 +++++++++++++++---------------------- 2 files changed, 49 insertions(+), 72 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index b4f6c1f..3b395bf 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -178,7 +178,6 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] - t.set_description(f"Scraping games on {date.strftime('%D')}") game_ids = get_game_ids(date) t.set_description( f"Scraping {len(game_ids)} games on {date.strftime('%D')}") @@ -227,11 +226,6 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -253,14 +247,17 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - Boxscore: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -293,11 +290,6 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -312,14 +304,17 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - PBP: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - PBP: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -352,11 +347,6 @@ def get_game_info(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -377,14 +367,17 @@ def get_game_info(game_id: str) -> pd.DataFrame: if i+1 == ATTEMPTS: if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - Game Info: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - Game Info: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -443,12 +436,6 @@ def get_game_ids(date: Union[str, datetime]) -> list: page = r.get(url, headers=header) soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - - if scoreboard is None: - _log.warning( - f'"{time.ctime()}": {date} - JSON not found on page.') - return pd.DataFrame([]) - ids = [x['id'] for x in scoreboard] except Exception as ex: @@ -456,13 +443,16 @@ def get_game_ids(date: Union[str, datetime]) -> list: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - Page error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + elif scoreboard is None: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index fa7f9cf..dbc300e 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -178,7 +178,6 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] - t.set_description(f"Scraping games on {date.strftime('%D')}") game_ids = get_game_ids(date) t.set_description( f"Scraping {len(game_ids)} games on {date.strftime('%D')}") @@ -216,7 +215,6 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: Returns - the game boxscore as a DataFrame """ - for i in range(ATTEMPTS): try: header = { @@ -228,11 +226,6 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -254,14 +247,17 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - Boxscore: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -283,7 +279,6 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: Returns - the game's play-by-play information represented as a DataFrame """ - for i in range(ATTEMPTS): try: header = { @@ -295,11 +290,6 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -314,14 +304,17 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - PBP: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - PBP: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -343,7 +336,6 @@ def get_game_info(game_id: str) -> pd.DataFrame: Returns - a DataFrame with one row and a column for each piece of metadata """ - for i in range(ATTEMPTS): try: header = { @@ -355,11 +347,6 @@ def get_game_info(game_id: str) -> pd.DataFrame: soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) - if gamepackage is None: - _log.warning( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') - return pd.DataFrame([]) - # check if game was postponed gm_status = gamepackage['gmStrp']['status']['desc'] gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') @@ -376,18 +363,21 @@ def get_game_info(game_id: str) -> pd.DataFrame: df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: + # max number of attempts reached, so return blank df if i+1 == ATTEMPTS: - # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page not found error') + f'"{time.ctime()}": {game_id} - Game Info: Page not found error') pnf_.append(game_id) elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Page error') + f'"{time.ctime()}": {game_id} - Game Info: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {game_id} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -446,12 +436,6 @@ def get_game_ids(date: Union[str, datetime]) -> list: page = r.get(url, headers=header) soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - - if scoreboard is None: - _log.warning( - f'"{time.ctime()}": {date} - JSON not found on page.') - return pd.DataFrame([]) - ids = [x['id'] for x in scoreboard] except Exception as ex: @@ -459,13 +443,16 @@ def get_game_ids(date: Union[str, datetime]) -> list: # max number of attempts reached, so return blank df if 'Page not found.' in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - Page not found error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') elif 'Page error' in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - Page error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + elif scoreboard is None: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') else: _log.error( - f'"{time.ctime()}" attempt {i+1}: {date.strftime("%D")} - {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again From ce241d7f9a682dcced56552fc669ce9e3e5c7214 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 28 Oct 2023 21:35:26 -0500 Subject: [PATCH 46/53] status 200 check --- src/cbbpy/mens_scraper.py | 114 +++++++++++++++++++++--------------- src/cbbpy/womens_scraper.py | 114 +++++++++++++++++++++--------------- 2 files changed, 132 insertions(+), 96 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 3b395bf..cf2d13c 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -93,6 +93,7 @@ ] WINDOW_STRING = "window[\'__espnfitt__\']=" JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" +STATUS_OK = 200 class CouldNotParseError(Exception): @@ -238,26 +239,31 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: - if 'No Box Score Available' in soup.text: - _log.warning( - f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) + if page.status_code == STATUS_OK: + if 'No Box Score Available' in soup.text: + _log.warning( + f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -302,19 +308,23 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -363,21 +373,25 @@ def get_game_info(game_id: str) -> pd.DataFrame: df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: - # max number of attempts reached, so return blank df if i+1 == ATTEMPTS: - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + # max number of attempts reached, so return blank df + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -441,18 +455,22 @@ def get_game_ids(date: Union[str, datetime]) -> list: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') - elif scoreboard is None: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + elif scoreboard is None: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index dbc300e..b848129 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -93,6 +93,7 @@ ] WINDOW_STRING = "window[\'__espnfitt__\']=" JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" +STATUS_OK = 200 class CouldNotParseError(Exception): @@ -238,26 +239,31 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: - if 'No Box Score Available' in soup.text: - _log.warning( - f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) + if page.status_code == STATUS_OK: + if 'No Box Score Available' in soup.text: + _log.warning( + f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -302,19 +308,23 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -363,21 +373,25 @@ def get_game_info(game_id: str) -> pd.DataFrame: df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: - # max number of attempts reached, so return blank df if i+1 == ATTEMPTS: - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error') - pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + # max number of attempts reached, so return blank df + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + pnf_.append(game_id) + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again @@ -441,18 +455,22 @@ def get_game_ids(date: Union[str, datetime]) -> list: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if 'Page not found.' in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') - elif scoreboard is None: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + if page.status_code == STATUS_OK: + if 'Page not found.' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') + elif 'Page error' in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + elif scoreboard is None: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + else: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}') return pd.DataFrame([]) else: # try again From 0c460822ab60dc9e31de5662e258801d77529f09 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 28 Oct 2023 22:57:10 -0500 Subject: [PATCH 47/53] bar print --- src/cbbpy/mens_scraper.py | 2 +- src/cbbpy/womens_scraper.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index cf2d13c..fb4c2ae 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -176,7 +176,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format) as t: + with trange(len_scrape, bar_format=bar_format, desc='Scraping games') as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index b848129..64623eb 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -176,7 +176,8 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format) as t: + with trange(len_scrape, bar_format=bar_format, desc='Scraping games', + position=0, leave=True) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) From ee4e0f31507cce3f2b9e8ca74a0f90d1f1bab303 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:38:05 -0500 Subject: [PATCH 48/53] pnf checks --- src/cbbpy/mens_scraper.py | 40 ++++++++++++++++++++++-------------- src/cbbpy/womens_scraper.py | 41 ++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index fb4c2ae..6fef61d 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -121,20 +121,22 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - if info and not game_id in pnf_: + game_info_df = boxscore_df = pbp_df = pd.DataFrame([]) + + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + elif info: game_info_df = get_game_info(game_id) - else: - game_info_df = pd.DataFrame([]) - if box and not game_id in pnf_: + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + elif box: boxscore_df = get_game_boxscore(game_id) - else: - boxscore_df = pd.DataFrame([]) - if pbp and not game_id in pnf_: + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page not found error') + elif pbp: pbp_df = get_game_pbp(game_id) - else: - pbp_df = pd.DataFrame([]) return (game_info_df, boxscore_df, pbp_df) @@ -176,7 +178,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format, desc='Scraping games') as t: + with trange(len_scrape, bar_format=bar_format, position=0, leave=True) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) @@ -216,6 +218,8 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: Returns - the game boxscore as a DataFrame """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -239,7 +243,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: - if page.status_code == STATUS_OK: + if soup is not None: if 'No Box Score Available' in soup.text: _log.warning( f'"{time.ctime()}": {game_id} - No boxscore available') @@ -247,7 +251,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') @@ -285,6 +289,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: Returns - the game's play-by-play information represented as a DataFrame """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -308,7 +314,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - PBP: Page not found error') @@ -346,6 +352,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: Returns - a DataFrame with one row and a column for each piece of metadata """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -375,7 +383,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Game Info: Page not found error') @@ -436,6 +444,8 @@ def get_game_ids(date: Union[str, datetime]) -> list: Returns - a list of ESPN all game IDs for games played on the date given """ + soup = None + if type(date) == str: date = _parse_date(date) @@ -455,7 +465,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 64623eb..d931697 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -121,20 +121,22 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - if info and not game_id in pnf_: + game_info_df = boxscore_df = pbp_df = pd.DataFrame([]) + + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + elif info: game_info_df = get_game_info(game_id) - else: - game_info_df = pd.DataFrame([]) - if box and not game_id in pnf_: + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + elif box: boxscore_df = get_game_boxscore(game_id) - else: - boxscore_df = pd.DataFrame([]) - if pbp and not game_id in pnf_: + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page not found error') + elif pbp: pbp_df = get_game_pbp(game_id) - else: - pbp_df = pd.DataFrame([]) return (game_info_df, boxscore_df, pbp_df) @@ -176,8 +178,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format, desc='Scraping games', - position=0, leave=True) as t: + with trange(len_scrape, bar_format=bar_format, position=0, leave=True) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) @@ -217,6 +218,8 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: Returns - the game boxscore as a DataFrame """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -240,7 +243,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: - if page.status_code == STATUS_OK: + if soup is not None: if 'No Box Score Available' in soup.text: _log.warning( f'"{time.ctime()}": {game_id} - No boxscore available') @@ -248,7 +251,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') @@ -286,6 +289,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: Returns - the game's play-by-play information represented as a DataFrame """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -309,7 +314,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - PBP: Page not found error') @@ -347,6 +352,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: Returns - a DataFrame with one row and a column for each piece of metadata """ + soup = None + for i in range(ATTEMPTS): try: header = { @@ -376,7 +383,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {game_id} - Game Info: Page not found error') @@ -437,6 +444,8 @@ def get_game_ids(date: Union[str, datetime]) -> list: Returns - a list of ESPN all game IDs for games played on the date given """ + soup = None + if type(date) == str: date = _parse_date(date) @@ -456,7 +465,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: except Exception as ex: if i+1 == ATTEMPTS: # max number of attempts reached, so return blank df - if page.status_code == STATUS_OK: + if soup is not None: if 'Page not found.' in soup.text: _log.error( f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') From 0140539bf1d92564b7edd05b4b8a789dda869340 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:52:31 -0500 Subject: [PATCH 49/53] description tqdm --- src/cbbpy/mens_scraper.py | 6 +++--- src/cbbpy/womens_scraper.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 6fef61d..17a00e9 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -182,8 +182,8 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool for i in t: date = date_range[i] game_ids = get_game_ids(date) - t.set_description( - f"Scraping {len(game_ids)} games on {date.strftime('%D')}") + t.set_description(f"Scraping {len(game_ids)} games on {date.strftime('%D')}", + refresh=False) if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( @@ -191,7 +191,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool all_data.append(result) else: - t.set_description(f"No games on {date.strftime('%D')}") + t.set_description(f"No games on {date.strftime('%D')}", refresh=False) if not len(all_data) > 0: return () diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index d931697..47104ed 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -182,8 +182,8 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool for i in t: date = date_range[i] game_ids = get_game_ids(date) - t.set_description( - f"Scraping {len(game_ids)} games on {date.strftime('%D')}") + t.set_description(f"Scraping {len(game_ids)} games on {date.strftime('%D')}", + refresh=False) if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( @@ -191,7 +191,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool all_data.append(result) else: - t.set_description(f"No games on {date.strftime('%D')}") + t.set_description(f"No games on {date.strftime('%D')}", refresh=False) if not len(all_data) > 0: return () From 4ecfbc87ca47aaf19a2931e547cbd7acd82eccc9 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 1 Nov 2023 00:36:20 -0500 Subject: [PATCH 50/53] tqdm --- src/cbbpy/mens_scraper.py | 2 +- src/cbbpy/womens_scraper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 17a00e9..a709ea1 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -178,7 +178,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format, position=0, leave=True) as t: + with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 47104ed..ff8f53e 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -178,7 +178,7 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' - with trange(len_scrape, bar_format=bar_format, position=0, leave=True) as t: + with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) From 65b5423cb7bfa1368afcaf4c9746a8bc6752483f Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Sat, 23 Dec 2023 23:45:20 -0600 Subject: [PATCH 51/53] scrape season if not complete --- .gitignore | 1 + src/cbbpy/mens_scraper.py | 957 +++++++++++++++++++---------------- src/cbbpy/womens_scraper.py | 969 ++++++++++++++++++++---------------- 3 files changed, 1055 insertions(+), 872 deletions(-) diff --git a/.gitignore b/.gitignore index 99e17dd..cb5895a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ __pycache__/ *.whl build/ dist/ +*.log \ No newline at end of file diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index a709ea1..df1a2cc 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -23,75 +23,73 @@ from typing import Union -logging.basicConfig(filename='cbbpy.log') +logging.basicConfig(filename="cbbpy.log") _log = logging.getLogger(__name__) ATTEMPTS = 15 DATE_PARSES = [ - '%Y-%m-%d', - '%Y/%m/%d', - '%m-%d-%Y', - '%m/%d/%Y', + "%Y-%m-%d", + "%Y/%m/%d", + "%m-%d-%Y", + "%m/%d/%Y", ] USER_AGENTS = [ - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ' + - '(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36', + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " + + "(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36", ] REFERERS = [ - 'https://google.com/', - 'https://youtube.com/', - 'https://facebook.com/', - 'https://twitter.com/', - 'https://nytimes.com/', - 'https://washingtonpost.com/', - 'https://linkedin.com/', - 'https://nhl.com/', - 'https://mlb.com/', - 'https://nfl.com/' + "https://google.com/", + "https://youtube.com/", + "https://facebook.com/", + "https://twitter.com/", + "https://nytimes.com/", + "https://washingtonpost.com/", + "https://linkedin.com/", + "https://nhl.com/", + "https://mlb.com/", + "https://nfl.com/", ] -SCOREBOARD_URL = ( - "https://www.espn.com/mens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" -) +SCOREBOARD_URL = "https://www.espn.com/mens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" GAME_URL = "https://www.espn.com/mens-college-basketball/game/_/gameId/{}" BOXSCORE_URL = "https://www.espn.com/mens-college-basketball/boxscore/_/gameId/{}" PBP_URL = "https://www.espn.com/mens-college-basketball/playbyplay/_/gameId/{}" NON_SHOT_TYPES = [ - 'TV Timeout', - 'Jump Ball', - 'Turnover', - 'Timeout', - 'Rebound', - 'Block', - 'Steal', - 'Foul', - 'End' + "TV Timeout", + "Jump Ball", + "Turnover", + "Timeout", + "Rebound", + "Block", + "Steal", + "Foul", + "End", ] SHOT_TYPES = [ - 'Three Point Jumper', - 'Two Point Tip Shot', - 'Free Throw', - 'Jumper', - 'Layup', - 'Dunk' + "Three Point Jumper", + "Two Point Tip Shot", + "Free Throw", + "Jumper", + "Layup", + "Dunk", ] -WINDOW_STRING = "window[\'__espnfitt__\']=" +WINDOW_STRING = "window['__espnfitt__']=" JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" STATUS_OK = 200 @@ -109,7 +107,9 @@ class InvalidDateRangeError(Exception): pnf_ = [] -def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_game( + game_id: str, info: bool = True, box: bool = True, pbp: bool = True +) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play). Parameters: @@ -141,7 +141,13 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True return (game_info_df, boxscore_df, pbp_df) -def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_games_range( + start_date: str, + end_date: str, + info: bool = True, + box: bool = True, + pbp: bool = True, +) -> tuple: """A function that scrapes a game information between a given range of dates. Parameters: @@ -165,29 +171,31 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool cpus = os.cpu_count() - 1 if len_scrape < 1: - raise InvalidDateRangeError( - "The start date must be sooner than the end date.") - + raise InvalidDateRangeError("The start date must be sooner than the end date.") + if sd > datetime.today(): - raise InvalidDateRangeError( - "The start date must not be in the future.") - + raise InvalidDateRangeError("The start date must not be in the future.") + if ed > datetime.today(): - raise InvalidDateRangeError( - "The end date must not be in the future.") + raise InvalidDateRangeError("The end date must not be in the future.") - bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' + bar_format = ( + "{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec" + ) with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) - t.set_description(f"Scraping {len(game_ids)} games on {date.strftime('%D')}", - refresh=False) + t.set_description( + f"Scraping {len(game_ids)} games on {date.strftime('%D')}", + refresh=False, + ) if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( - delayed(get_game)(gid) for gid in game_ids) + delayed(get_game)(gid) for gid in game_ids + ) all_data.append(result) else: @@ -223,8 +231,8 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) @@ -232,42 +240,46 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) - boxscore = gamepackage['bxscr'] + boxscore = gamepackage["bxscr"] df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: if soup is not None: - if 'No Box Score Available' in soup.text: - _log.warning( - f'"{time.ctime()}": {game_id} - No boxscore available') + if "No Box Score Available" in soup.text: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') return pd.DataFrame([]) - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error') + f'"{time.ctime()}": {game_id} - Boxscore: Page error' + ) elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -294,8 +306,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = PBP_URL.format(game_id) page = r.get(url, headers=header) @@ -303,8 +315,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) @@ -312,25 +324,28 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: df = _get_game_pbp_helper(gamepackage, game_id) except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error') + f'"{time.ctime()}": {game_id} - PBP: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page error') + elif "Page error" in soup.text: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page error') elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -357,8 +372,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = GAME_URL.format(game_id) page = r.get(url, headers=header) @@ -366,40 +381,45 @@ def get_game_info(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) # get general game info - info = gamepackage['gmInfo'] + info = gamepackage["gmInfo"] # get team info - more_info = gamepackage['gmStrp'] + more_info = gamepackage["gmStrp"] df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + f'"{time.ctime()}": {game_id} - Game Info: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error') + f'"{time.ctime()}": {game_id} - Game Info: Page error' + ) elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -412,7 +432,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: return df -def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_games_season( + season: int, info: bool = True, box: bool = True, pbp: bool = True +) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play) for every game of a given season. @@ -427,8 +449,12 @@ def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = f'{season-1}-11-01' - season_end_date = f'{season}-05-01' + season_start_date = f"{season-1}-11-01" + season_end_date = f"{season}-05-01" + + # if season has not ended yet, set end scrape date to today + if datetime.strptime(season_end_date, "%Y-%m-%d") > datetime.today(): + season_end_date = datetime.today().strftime("%Y-%m-%d") info = get_games_range(season_start_date, season_end_date, info, box, pbp) @@ -452,35 +478,40 @@ def get_game_ids(date: Union[str, datetime]) -> list: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } d = date.strftime("%Y%m%d") url = SCOREBOARD_URL.format(d) page = r.get(url, headers=header) soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - ids = [x['id'] for x in scoreboard] + ids = [x["id"] for x in scoreboard] except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') - elif 'Page error' in soup.text: + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error' + ) + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error' + ) elif scoreboard is None: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -506,8 +537,10 @@ def _parse_date(date: str) -> datetime: break if not parsed: - raise CouldNotParseError('The given date could not be parsed. Try any of these formats:\n' + - 'Y-m-d\nY/m/d\nm-d-Y\nm/d/Y') + raise CouldNotParseError( + "The given date could not be parsed. Try any of these formats:\n" + + "Y-m-d\nY/m/d\nm-d-Y\nm/d/Y" + ) return date @@ -523,179 +556,222 @@ def _get_game_boxscore_helper(boxscore, game_id): - the game boxscore as a DataFrame """ tm1, tm2 = boxscore[0], boxscore[1] - tm1_name, tm2_name = tm1['tm']['dspNm'], tm2['tm']['dspNm'] - tm1_stats, tm2_stats = tm1['stats'], tm2['stats'] + tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] + tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] - labels = tm1_stats[0]['lbls'] + labels = tm1_stats[0]["lbls"] - tm1_starters, tm1_bench, tm1_totals = tm1_stats[0][ - 'athlts'], tm1_stats[1]['athlts'], tm1_stats[2]['ttls'] - tm2_starters, tm2_bench, tm2_totals = tm2_stats[0][ - 'athlts'], tm2_stats[1]['athlts'], tm2_stats[2]['ttls'] + tm1_starters, tm1_bench, tm1_totals = ( + tm1_stats[0]["athlts"], + tm1_stats[1]["athlts"], + tm1_stats[2]["ttls"], + ) + tm2_starters, tm2_bench, tm2_totals = ( + tm2_stats[0]["athlts"], + tm2_stats[1]["athlts"], + tm2_stats[2]["ttls"], + ) # starters' stats if len(tm1_starters) > 0: - tm1_st_dict = {labels[i].lower(): [tm1_starters[j]['stats'][i] - for j in range(len(tm1_starters))] - for i in range(len(labels))} - - tm1_st_pos = [tm1_starters[i]['athlt']['pos'] - if 'pos' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] - tm1_st_id = [tm1_starters[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] - tm1_st_nm = [tm1_starters[i]['athlt']['shrtNm'] - if 'shrtNm' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] + tm1_st_dict = { + labels[i].lower(): [ + tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) + ] + for i in range(len(labels)) + } + + tm1_st_pos = [ + tm1_starters[i]["athlt"]["pos"] + if "pos" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_id = [ + tm1_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_nm = [ + tm1_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, 'starter', True) - tm1_st_df.insert(0, 'position', tm1_st_pos) - tm1_st_df.insert(0, 'player_id', tm1_st_id) - tm1_st_df.insert(0, 'player', tm1_st_nm) - tm1_st_df.insert(0, 'team', tm1_name) - tm1_st_df.insert(0, 'game_id', game_id) + tm1_st_df.insert(0, "starter", True) + tm1_st_df.insert(0, "position", tm1_st_pos) + tm1_st_df.insert(0, "player_id", tm1_st_id) + tm1_st_df.insert(0, "player", tm1_st_nm) + tm1_st_df.insert(0, "team", tm1_name) + tm1_st_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm1_bench) > 0: - tm1_bn_dict = {labels[i].lower(): [tm1_bench[j]['stats'][i] - for j in range(len(tm1_bench))] - for i in range(len(labels))} - - tm1_bn_pos = [tm1_bench[i]['athlt']['pos'] - if 'pos' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] - tm1_bn_id = [tm1_bench[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] - tm1_bn_nm = [tm1_bench[i]['athlt']['shrtNm'] - if 'shrtNm' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] + tm1_bn_dict = { + labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] + for i in range(len(labels)) + } + + tm1_bn_pos = [ + tm1_bench[i]["athlt"]["pos"] + if "pos" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_id = [ + tm1_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_nm = [ + tm1_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, 'starter', False) - tm1_bn_df.insert(0, 'position', tm1_bn_pos) - tm1_bn_df.insert(0, 'player_id', tm1_bn_id) - tm1_bn_df.insert(0, 'player', tm1_bn_nm) - tm1_bn_df.insert(0, 'team', tm1_name) - tm1_bn_df.insert(0, 'game_id', game_id) + tm1_bn_df.insert(0, "starter", False) + tm1_bn_df.insert(0, "position", tm1_bn_pos) + tm1_bn_df.insert(0, "player_id", tm1_bn_id) + tm1_bn_df.insert(0, "player", tm1_bn_nm) + tm1_bn_df.insert(0, "team", tm1_name) + tm1_bn_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm1_totals) > 0: - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] - for i in range(len(labels))} + tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, 'starter', False) - tm1_tot_df.insert(0, 'position', 'TOTAL') - tm1_tot_df.insert(0, 'player_id', 'TOTAL') - tm1_tot_df.insert(0, 'player', 'TEAM') - tm1_tot_df.insert(0, 'team', tm1_name) - tm1_tot_df.insert(0, 'game_id', game_id) + tm1_tot_df.insert(0, "starter", False) + tm1_tot_df.insert(0, "position", "TOTAL") + tm1_tot_df.insert(0, "player_id", "TOTAL") + tm1_tot_df.insert(0, "player", "TEAM") + tm1_tot_df.insert(0, "team", tm1_name) + tm1_tot_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_tot_df = pd.DataFrame(columns=cols) tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) # starters' stats if len(tm2_starters) > 0: - tm2_st_dict = {labels[i].lower(): [tm2_starters[j]['stats'][i] - for j in range(len(tm2_starters))] - for i in range(len(labels))} - - tm2_st_pos = [tm2_starters[i]['athlt']['pos'] - if 'pos' in tm2_starters[i]['athlt'].keys() - else '' - for i in range(len(tm2_starters))] - tm2_st_id = [tm2_starters[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm2_starters[i]['athlt'].keys() - else '' for i in range(len(tm2_starters))] - tm2_st_nm = [tm2_starters[i]['athlt']['shrtNm'] - if 'shrtNm' in tm2_starters[i]['athlt'].keys() - else '' - for i in range(len(tm2_starters))] + tm2_st_dict = { + labels[i].lower(): [ + tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) + ] + for i in range(len(labels)) + } + + tm2_st_pos = [ + tm2_starters[i]["athlt"]["pos"] + if "pos" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_id = [ + tm2_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_nm = [ + tm2_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, 'starter', True) - tm2_st_df.insert(0, 'position', tm2_st_pos) - tm2_st_df.insert(0, 'player_id', tm2_st_id) - tm2_st_df.insert(0, 'player', tm2_st_nm) - tm2_st_df.insert(0, 'team', tm2_name) - tm2_st_df.insert(0, 'game_id', game_id) + tm2_st_df.insert(0, "starter", True) + tm2_st_df.insert(0, "position", tm2_st_pos) + tm2_st_df.insert(0, "player_id", tm2_st_id) + tm2_st_df.insert(0, "player", tm2_st_nm) + tm2_st_df.insert(0, "team", tm2_name) + tm2_st_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm2_bench) > 0: - tm2_bn_dict = {labels[i].lower(): [tm2_bench[j]['stats'][i] - for j in range(len(tm2_bench))] - for i in range(len(labels))} - - tm2_bn_pos = [tm2_bench[i]['athlt']['pos'] - if 'pos' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] - tm2_bn_id = [tm2_bench[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] - tm2_bn_nm = [tm2_bench[i]['athlt']['shrtNm'] - if 'shrtNm' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] + tm2_bn_dict = { + labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] + for i in range(len(labels)) + } + + tm2_bn_pos = [ + tm2_bench[i]["athlt"]["pos"] + if "pos" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_id = [ + tm2_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_nm = [ + tm2_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, 'starter', False) - tm2_bn_df.insert(0, 'position', tm2_bn_pos) - tm2_bn_df.insert(0, 'player_id', tm2_bn_id) - tm2_bn_df.insert(0, 'player', tm2_bn_nm) - tm2_bn_df.insert(0, 'team', tm2_name) - tm2_bn_df.insert(0, 'game_id', game_id) + tm2_bn_df.insert(0, "starter", False) + tm2_bn_df.insert(0, "position", tm2_bn_pos) + tm2_bn_df.insert(0, "player_id", tm2_bn_id) + tm2_bn_df.insert(0, "player", tm2_bn_nm) + tm2_bn_df.insert(0, "team", tm2_name) + tm2_bn_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm2_totals) > 0: - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] - for i in range(len(labels))} + tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, 'starter', False) - tm2_tot_df.insert(0, 'position', 'TOTAL') - tm2_tot_df.insert(0, 'player_id', 'TOTAL') - tm2_tot_df.insert(0, 'player', 'TEAM') - tm2_tot_df.insert(0, 'team', tm2_name) - tm2_tot_df.insert(0, 'game_id', game_id) + tm2_tot_df.insert(0, "starter", False) + tm2_tot_df.insert(0, "position", "TOTAL") + tm2_tot_df.insert(0, "player_id", "TOTAL") + tm2_tot_df.insert(0, "player", "TEAM") + tm2_tot_df.insert(0, "team", tm2_name) + tm2_tot_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_tot_df = pd.DataFrame(columns=cols) tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) @@ -707,18 +783,12 @@ def _get_game_boxscore_helper(boxscore, game_id): return pd.DataFrame([]) # SPLIT UP THE FG FIELDS - fgm = pd.to_numeric([x.split("-")[0] - for x in df["fg"]], errors='coerce') - fga = pd.to_numeric([x.split("-")[1] - for x in df["fg"]], errors='coerce') - thpm = pd.to_numeric([x.split("-")[0] - for x in df["3pt"]], errors='coerce') - thpa = pd.to_numeric([x.split("-")[1] - for x in df["3pt"]], errors='coerce') - ftm = pd.to_numeric([x.split("-")[0] - for x in df["ft"]], errors='coerce') - fta = pd.to_numeric([x.split("-")[1] - for x in df["ft"]], errors='coerce') + fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") + fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") + thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") + thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") + ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") + fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") # GET RID OF UNWANTED COLUMNS df = df.drop(columns=["fg", "3pt", "ft"]) @@ -734,16 +804,16 @@ def _get_game_boxscore_helper(boxscore, game_id): df.insert(14, "fta", fta) # column type handling - df['min'] = pd.to_numeric(df['min'], errors='coerce') - df['oreb'] = pd.to_numeric(df['oreb'], errors='coerce') - df['dreb'] = pd.to_numeric(df['dreb'], errors='coerce') - df['reb'] = pd.to_numeric(df['reb'], errors='coerce') - df['ast'] = pd.to_numeric(df['ast'], errors='coerce') - df['stl'] = pd.to_numeric(df['stl'], errors='coerce') - df['blk'] = pd.to_numeric(df['blk'], errors='coerce') - df['to'] = pd.to_numeric(df['to'], errors='coerce') - df['pf'] = pd.to_numeric(df['pf'], errors='coerce') - df['pts'] = pd.to_numeric(df['pts'], errors='coerce') + df["min"] = pd.to_numeric(df["min"], errors="coerce") + df["oreb"] = pd.to_numeric(df["oreb"], errors="coerce") + df["dreb"] = pd.to_numeric(df["dreb"], errors="coerce") + df["reb"] = pd.to_numeric(df["reb"], errors="coerce") + df["ast"] = pd.to_numeric(df["ast"], errors="coerce") + df["stl"] = pd.to_numeric(df["stl"], errors="coerce") + df["blk"] = pd.to_numeric(df["blk"], errors="coerce") + df["to"] = pd.to_numeric(df["to"], errors="coerce") + df["pf"] = pd.to_numeric(df["pf"], errors="coerce") + df["pts"] = pd.to_numeric(df["pts"], errors="coerce") return df @@ -758,53 +828,67 @@ def _get_game_pbp_helper(gamepackage, game_id): Returns - the game PBP as a DataFrame """ - pbp = gamepackage['pbp'] - home_team = pbp['tms']['home']['displayName'] - away_team = pbp['tms']['away']['displayName'] + pbp = gamepackage["pbp"] + home_team = pbp["tms"]["home"]["displayName"] + away_team = pbp["tms"]["away"]["displayName"] - all_plays = [play for half in pbp['playGrps'] for play in half] + all_plays = [play for half in pbp["playGrps"] for play in half] # check if PBP exists if len(all_plays) <= 0: _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') return pd.DataFrame([]) - descs = [x['text'] if 'text' in x.keys() else '' for x in all_plays] - teams = ['' if not 'homeAway' in x.keys() - else home_team if x['homeAway'] == 'home' else away_team for x in all_plays] - hscores = [int(x['homeScore']) if 'homeScore' in x.keys() - else np.nan for x in all_plays] - ascores = [int(x['awayScore']) if 'awayScore' in x.keys() - else np.nan for x in all_plays] - halves = [int(x['period']['number']) - if 'period' in x.keys() else np.nan for x in all_plays] - - time_splits = [x['clock']['displayValue'].split(':') if 'clock' in x.keys() - else '' for x in all_plays] + descs = [x["text"] if "text" in x.keys() else "" for x in all_plays] + teams = [ + "" + if not "homeAway" in x.keys() + else home_team + if x["homeAway"] == "home" + else away_team + for x in all_plays + ] + hscores = [ + int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays + ] + ascores = [ + int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays + ] + halves = [ + int(x["period"]["number"]) if "period" in x.keys() else np.nan + for x in all_plays + ] + + time_splits = [ + x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" + for x in all_plays + ] minutes = [int(x[0]) for x in time_splits] seconds = [int(x[1]) for x in time_splits] - min_to_sec = [x*60 for x in minutes] - hf_secs_left = [x+y for x, y in zip(min_to_sec, seconds)] - reg_secs_left = [1200+x if half_num == 1 else x for x, - half_num in zip(hf_secs_left, halves)] + min_to_sec = [x * 60 for x in minutes] + hf_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] + reg_secs_left = [ + 1200 + x if half_num == 1 else x for x, half_num in zip(hf_secs_left, halves) + ] - sc_play = [True if 'scoringPlay' in x.keys() - else False for x in all_plays] - is_assisted = [True if ('text' in x.keys() and 'assisted' in x['text'].lower()) - else False for x in all_plays] + sc_play = [True if "scoringPlay" in x.keys() else False for x in all_plays] + is_assisted = [ + True if ("text" in x.keys() and "assisted" in x["text"].lower()) else False + for x in all_plays + ] # ASSIGN PLAY TYPES p_types = [] for x in all_plays: - if not 'text' in x.keys(): - p_types.append('') + if not "text" in x.keys(): + p_types.append("") continue - play = x['text'] + play = x["text"] if not type(play) == str: - play = '' + play = "" added = False for pt in NON_SHOT_TYPES: @@ -820,116 +904,119 @@ def _get_game_pbp_helper(gamepackage, game_id): break if not added: - p_types.append('') + p_types.append("") # FIND SHOOTERS - shooting_play = [True if x in - (y.lower() for y in SHOT_TYPES) else False for x in p_types] + shooting_play = [ + True if x in (y.lower() for y in SHOT_TYPES) else False for x in p_types + ] - scorers = [x[0].split(' made ')[0] if x[1] else '' for x in - zip(descs, sc_play)] + scorers = [x[0].split(" made ")[0] if x[1] else "" for x in zip(descs, sc_play)] - non_scorers = [x[0].split(' missed ')[0] if x[1] in (y.lower() for y in SHOT_TYPES) - and not x[2] else '' for x in zip(descs, p_types, sc_play)] + non_scorers = [ + x[0].split(" missed ")[0] + if x[1] in (y.lower() for y in SHOT_TYPES) and not x[2] + else "" + for x in zip(descs, p_types, sc_play) + ] - shooters = [x[0] if not x[0] == '' else x[1] - for x in zip(scorers, non_scorers)] + shooters = [x[0] if not x[0] == "" else x[1] for x in zip(scorers, non_scorers)] - assisted_pls = [x[0].split('Assisted by ')[-1].replace('.', '') if x[1] else '' for x in - zip(descs, is_assisted)] + assisted_pls = [ + x[0].split("Assisted by ")[-1].replace(".", "") if x[1] else "" + for x in zip(descs, is_assisted) + ] - is_three = ['three point' in x.lower() for x in descs] + is_three = ["three point" in x.lower() for x in descs] data = { - 'game_id': game_id, - 'home_team': home_team, - 'away_team': away_team, - 'play_desc': descs, - 'home_score': hscores, - 'away_score': ascores, - 'half': halves, - 'secs_left_half': hf_secs_left, - 'secs_left_reg': reg_secs_left, - 'play_team': teams, - 'play_type': p_types, - 'shooting_play': shooting_play, - 'scoring_play': sc_play, - 'is_three': is_three, - 'shooter': shooters, - 'is_assisted': is_assisted, - 'assist_player': assisted_pls, + "game_id": game_id, + "home_team": home_team, + "away_team": away_team, + "play_desc": descs, + "home_score": hscores, + "away_score": ascores, + "half": halves, + "secs_left_half": hf_secs_left, + "secs_left_reg": reg_secs_left, + "play_team": teams, + "play_type": p_types, + "shooting_play": shooting_play, + "scoring_play": sc_play, + "is_three": is_three, + "shooter": shooters, + "is_assisted": is_assisted, + "assist_player": assisted_pls, } df = pd.DataFrame(data) # add shot data if it exists - is_shotchart = 'shtChrt' in gamepackage + is_shotchart = "shtChrt" in gamepackage if is_shotchart: - chart = gamepackage['shtChrt']['plays'] - - shotteams = [x['homeAway'] for x in chart] - shotdescs = [x['text'] for x in chart] - xs = [50-int(x['coordinate']['x']) for x in chart] - ys = [int(x['coordinate']['y']) for x in chart] - - shot_data = { - 'team': shotteams, - 'play_desc': shotdescs, - 'x': xs, - 'y': ys - } + chart = gamepackage["shtChrt"]["plays"] + + shotteams = [x["homeAway"] for x in chart] + shotdescs = [x["text"] for x in chart] + xs = [50 - int(x["coordinate"]["x"]) for x in chart] + ys = [int(x["coordinate"]["y"]) for x in chart] + + shot_data = {"team": shotteams, "play_desc": shotdescs, "x": xs, "y": ys} shot_df = pd.DataFrame(shot_data) # shot matching shot_info = { - 'shot_x': [], - 'shot_y': [], + "shot_x": [], + "shot_y": [], } shot_count = 0 for play, isshot in zip(df.play_desc, df.shooting_play): if shot_count >= len(shot_df): - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) continue if not isshot: - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) continue - if 'free throw' in play.lower(): - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + if "free throw" in play.lower(): + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) shot_count += 1 continue shot_play = shot_df.play_desc.iloc[shot_count] if play == shot_play: - shot_info['shot_x'].append(shot_df.x.iloc[shot_count]) - shot_info['shot_y'].append(shot_df.y.iloc[shot_count]) + shot_info["shot_x"].append(shot_df.x.iloc[shot_count]) + shot_info["shot_y"].append(shot_df.y.iloc[shot_count]) shot_count += 1 else: - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) # make sure that length of shot data matches number of shots in PBP data - if (not (len(shot_info['shot_x']) == len(df))) or (not (len(shot_info['shot_y']) == len(df))): + if (not (len(shot_info["shot_x"]) == len(df))) or ( + not (len(shot_info["shot_y"]) == len(df)) + ): _log.warning( - f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data') - df['shot_x'] = np.nan - df['shot_y'] = np.nan + f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data' + ) + df["shot_x"] = np.nan + df["shot_y"] = np.nan return df - df['shot_x'] = shot_info['shot_x'] - df['shot_y'] = shot_info['shot_y'] + df["shot_x"] = shot_info["shot_x"] + df["shot_y"] = shot_info["shot_y"] else: - df['shot_x'] = np.nan - df['shot_y'] = np.nan + df["shot_x"] = np.nan + df["shot_y"] = np.nan return df return df @@ -946,77 +1033,79 @@ def _get_game_info_helper(info, more_info, game_id): Returns - the game metadata as a DataFrame """ - attendance = int(info['attnd']) if 'attnd' in info.keys() else np.nan - capacity = int(info['cpcty']) if 'cpcty' in info.keys() else np.nan - network = info['cvrg'] if 'cvrg' in info.keys() else '' + attendance = int(info["attnd"]) if "attnd" in info.keys() else np.nan + capacity = int(info["cpcty"]) if "cpcty" in info.keys() else np.nan + network = info["cvrg"] if "cvrg" in info.keys() else "" - gm_date = parse(info['dtTm']) - game_date = gm_date.replace( - tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) + gm_date = parse(info["dtTm"]) + game_date = gm_date.replace(tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) game_day = game_date.strftime("%B %d, %Y") game_time = game_date.strftime("%I:%M %p %Z") - arena = info['loc'] if 'loc' in info.keys() else '' - loc = info['locAddr']['city'] + ', ' + \ - info['locAddr']['state'] if 'locAddr' in info.keys() else '' + arena = info["loc"] if "loc" in info.keys() else "" + loc = ( + info["locAddr"]["city"] + ", " + info["locAddr"]["state"] + if "locAddr" in info.keys() + else "" + ) - tot_refs = info['refs'] if 'refs' in info.keys() else {} - ref_1 = tot_refs[0]['dspNm'] if len(tot_refs) > 0 else '' - ref_2 = tot_refs[1]['dspNm'] if len(tot_refs) > 1 else '' - ref_3 = tot_refs[2]['dspNm'] if len(tot_refs) > 2 else '' + tot_refs = info["refs"] if "refs" in info.keys() else {} + ref_1 = tot_refs[0]["dspNm"] if len(tot_refs) > 0 else "" + ref_2 = tot_refs[1]["dspNm"] if len(tot_refs) > 1 else "" + ref_3 = tot_refs[2]["dspNm"] if len(tot_refs) > 2 else "" - teams = more_info['tms'] + teams = more_info["tms"] ht_info, at_info = teams[0], teams[1] - home_team, away_team = ht_info['displayName'], at_info['displayName'] + home_team, away_team = ht_info["displayName"], at_info["displayName"] - home_id = ht_info['id'] - away_id = at_info['id'] + home_id = ht_info["id"] + away_id = at_info["id"] - if len(ht_info['links']) == 0: + if len(ht_info["links"]) == 0: ht = home_team.lower().replace(" ", "-") home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - elif len(ht_info['records']) == 0: + elif len(ht_info["records"]) == 0: ht = home_team.lower().replace(" ", "-") home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - if len(at_info['links']) == 0: + if len(at_info["links"]) == 0: at = away_team.lower().replace(" ", "-") away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - elif len(at_info['records']) == 0: + elif len(at_info["records"]) == 0: at = away_team.lower().replace(" ", "-") away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - home_rank = ht_info['rank'] if 'rank' in ht_info.keys() else np.nan - away_rank = at_info['rank'] if 'rank' in at_info.keys() else np.nan + home_rank = ht_info["rank"] if "rank" in ht_info.keys() else np.nan + away_rank = at_info["rank"] if "rank" in at_info.keys() else np.nan - home_record = ht_info['records'][0]['displayValue'] if len( - ht_info['records']) > 0 else '' - away_record = at_info['records'][0]['displayValue'] if len( - at_info['records']) > 0 else '' + home_record = ( + ht_info["records"][0]["displayValue"] if len(ht_info["records"]) > 0 else "" + ) + away_record = ( + at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" + ) - home_score, away_score = int( - ht_info['score']), int(at_info['score']) + home_score, away_score = int(ht_info["score"]), int(at_info["score"]) home_win = True if home_score > away_score else False - is_postseason = True if more_info['seasonType'] == 3 else False - is_conference = more_info['isConferenceGame'] + is_postseason = True if more_info["seasonType"] == 3 else False + is_conference = more_info["isConferenceGame"] - if len(ht_info['records']) > 1 and ht_info['records'][1]['type'] == 'home': + if len(ht_info["records"]) > 1 and ht_info["records"][1]["type"] == "home": is_neutral = False - elif len(at_info['records']) > 1 and at_info['records'][1]['type'] == 'away': + elif len(at_info["records"]) > 1 and at_info["records"][1]["type"] == "away": is_neutral = False else: is_neutral = True - tournament = more_info['nte'] if 'nte' in more_info.keys() else '' + tournament = more_info["nte"] if "nte" in more_info.keys() else "" - if ('linescores' in ht_info) and ('linescores' in at_info): - h_ot, a_ot = len(ht_info['linescores']) - \ - 2, len(at_info['linescores']) - 2 + if ("linescores" in ht_info) and ("linescores" in at_info): + h_ot, a_ot = len(ht_info["linescores"]) - 2, len(at_info["linescores"]) - 2 assert h_ot == a_ot num_ots = h_ot else: @@ -1050,37 +1139,37 @@ def _get_game_info_helper(info, more_info, game_id): network, ref_1, ref_2, - ref_3 + ref_3, ] game_info_cols = [ - 'game_id', - 'home_team', - 'home_id', - 'home_rank', - 'home_record', - 'home_score', - 'away_team', - 'away_id', - 'away_rank', - 'away_record', - 'away_score', - 'home_win', - 'num_ots', - 'is_conference', - 'is_neutral', - 'is_postseason', - 'tournament', - 'game_day', - 'game_time', - 'game_loc', - 'arena', - 'arena_capacity', - 'attendance', - 'tv_network', - 'referee_1', - 'referee_2', - 'referee_3' + "game_id", + "home_team", + "home_id", + "home_rank", + "home_record", + "home_score", + "away_team", + "away_id", + "away_rank", + "away_record", + "away_score", + "home_win", + "num_ots", + "is_conference", + "is_neutral", + "is_postseason", + "tournament", + "game_day", + "game_time", + "game_loc", + "arena", + "arena_capacity", + "attendance", + "tv_network", + "referee_1", + "referee_2", + "referee_3", ] return pd.DataFrame([game_info_list], columns=game_info_cols) @@ -1089,14 +1178,14 @@ def _get_game_info_helper(info, more_info, game_id): def _get_gamepackage_from_soup(soup): script_string = _find_json_in_content(soup) - if script_string == '': + if script_string == "": return None pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' + js = "{" + found + "}" jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] + gamepackage = jsn["page"]["content"]["gamepackage"] return gamepackage @@ -1104,21 +1193,21 @@ def _get_gamepackage_from_soup(soup): def _get_scoreboard_from_soup(soup): script_string = _find_json_in_content(soup) - if script_string == '': + if script_string == "": return None pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' + js = "{" + found + "}" jsn = json.loads(js) - scoreboard = jsn['page']['content']['scoreboard']['evts'] + scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] return scoreboard def _find_json_in_content(soup): - script_string = '' - for x in soup.find_all('script'): + script_string = "" + for x in soup.find_all("script"): if WINDOW_STRING in x.text: script_string = x.text break diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index ff8f53e..1654cc3 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -23,75 +23,73 @@ from typing import Union -logging.basicConfig(filename='cbbpy.log') +logging.basicConfig(filename="cbbpy.log") _log = logging.getLogger(__name__) ATTEMPTS = 15 DATE_PARSES = [ - '%Y-%m-%d', - '%Y/%m/%d', - '%m-%d-%Y', - '%m/%d/%Y', + "%Y-%m-%d", + "%Y/%m/%d", + "%m-%d-%Y", + "%m/%d/%Y", ] USER_AGENTS = [ - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ' + - '(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 ' + - '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36', + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " + + "(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36", ] REFERERS = [ - 'https://google.com/', - 'https://youtube.com/', - 'https://facebook.com/', - 'https://twitter.com/', - 'https://nytimes.com/', - 'https://washingtonpost.com/', - 'https://linkedin.com/', - 'https://nhl.com/', - 'https://mlb.com/', - 'https://nfl.com/' + "https://google.com/", + "https://youtube.com/", + "https://facebook.com/", + "https://twitter.com/", + "https://nytimes.com/", + "https://washingtonpost.com/", + "https://linkedin.com/", + "https://nhl.com/", + "https://mlb.com/", + "https://nfl.com/", ] -SCOREBOARD_URL = ( - "https://www.espn.com/womens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" -) +SCOREBOARD_URL = "https://www.espn.com/womens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" GAME_URL = "https://www.espn.com/womens-college-basketball/game/_/gameId/{}" BOXSCORE_URL = "https://www.espn.com/womens-college-basketball/boxscore/_/gameId/{}" PBP_URL = "https://www.espn.com/womens-college-basketball/playbyplay/_/gameId/{}" NON_SHOT_TYPES = [ - 'TV Timeout', - 'Jump Ball', - 'Turnover', - 'Timeout', - 'Rebound', - 'Block', - 'Steal', - 'Foul', - 'End' + "TV Timeout", + "Jump Ball", + "Turnover", + "Timeout", + "Rebound", + "Block", + "Steal", + "Foul", + "End", ] SHOT_TYPES = [ - 'Three Point Jumper', - 'Two Point Tip Shot', - 'Free Throw', - 'Jumper', - 'Layup', - 'Dunk' + "Three Point Jumper", + "Two Point Tip Shot", + "Free Throw", + "Jumper", + "Layup", + "Dunk", ] -WINDOW_STRING = "window[\'__espnfitt__\']=" +WINDOW_STRING = "window['__espnfitt__']=" JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" STATUS_OK = 200 @@ -109,7 +107,9 @@ class InvalidDateRangeError(Exception): pnf_ = [] -def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_game( + game_id: str, info: bool = True, box: bool = True, pbp: bool = True +) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play). Parameters: @@ -141,7 +141,13 @@ def get_game(game_id: str, info: bool = True, box: bool = True, pbp: bool = True return (game_info_df, boxscore_df, pbp_df) -def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_games_range( + start_date: str, + end_date: str, + info: bool = True, + box: bool = True, + pbp: bool = True, +) -> tuple: """A function that scrapes a game information between a given range of dates. Parameters: @@ -165,29 +171,31 @@ def get_games_range(start_date: str, end_date: str, info: bool = True, box: bool cpus = os.cpu_count() - 1 if len_scrape < 1: - raise InvalidDateRangeError( - "The start date must be sooner than the end date.") - + raise InvalidDateRangeError("The start date must be sooner than the end date.") + if sd > datetime.today(): - raise InvalidDateRangeError( - "The start date must not be in the future.") - + raise InvalidDateRangeError("The start date must not be in the future.") + if ed > datetime.today(): - raise InvalidDateRangeError( - "The end date must not be in the future.") + raise InvalidDateRangeError("The end date must not be in the future.") - bar_format = '{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec' + bar_format = ( + "{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec" + ) with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] game_ids = get_game_ids(date) - t.set_description(f"Scraping {len(game_ids)} games on {date.strftime('%D')}", - refresh=False) + t.set_description( + f"Scraping {len(game_ids)} games on {date.strftime('%D')}", + refresh=False, + ) if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( - delayed(get_game)(gid) for gid in game_ids) + delayed(get_game)(gid) for gid in game_ids + ) all_data.append(result) else: @@ -223,8 +231,8 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) @@ -232,42 +240,46 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) - boxscore = gamepackage['bxscr'] + boxscore = gamepackage["bxscr"] df = _get_game_boxscore_helper(boxscore, game_id) except Exception as ex: if soup is not None: - if 'No Box Score Available' in soup.text: - _log.warning( - f'"{time.ctime()}": {game_id} - No boxscore available') + if "No Box Score Available" in soup.text: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') return pd.DataFrame([]) - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error') + f'"{time.ctime()}": {game_id} - Boxscore: Page error' + ) elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -294,8 +306,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = PBP_URL.format(game_id) page = r.get(url, headers=header) @@ -303,8 +315,8 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) @@ -312,25 +324,28 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: df = _get_game_pbp_helper(gamepackage, game_id) except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error') + f'"{time.ctime()}": {game_id} - PBP: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page error') + elif "Page error" in soup.text: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page error') elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -357,8 +372,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } url = GAME_URL.format(game_id) page = r.get(url, headers=header) @@ -366,40 +381,45 @@ def get_game_info(game_id: str) -> pd.DataFrame: gamepackage = _get_gamepackage_from_soup(soup) # check if game was postponed - gm_status = gamepackage['gmStrp']['status']['desc'] - gsbool = (gm_status == 'Final') # or (gm_status == 'In Progress') + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') if not gsbool: _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') return pd.DataFrame([]) # get general game info - info = gamepackage['gmInfo'] + info = gamepackage["gmInfo"] # get team info - more_info = gamepackage['gmStrp'] + more_info = gamepackage["gmStrp"] df = _get_game_info_helper(info, more_info, game_id) except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + f'"{time.ctime()}": {game_id} - Game Info: Page not found error' + ) pnf_.append(game_id) - elif 'Page error' in soup.text: + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error') + f'"{time.ctime()}": {game_id} - Game Info: Page error' + ) elif gamepackage is None: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.') + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -412,7 +432,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: return df -def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool = True) -> tuple: +def get_games_season( + season: int, info: bool = True, box: bool = True, pbp: bool = True +) -> tuple: """A function that scrapes all game info (metadata, boxscore, play-by-play) for every game of a given season. @@ -427,8 +449,12 @@ def get_games_season(season: int, info: bool = True, box: bool = True, pbp: bool -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = f'{season-1}-11-01' - season_end_date = f'{season}-05-01' + season_start_date = f"{season-1}-11-01" + season_end_date = f"{season}-05-01" + + # if season has not ended yet, set end scrape date to today + if datetime.strptime(season_end_date, "%Y-%m-%d") > datetime.today(): + season_end_date = datetime.today().strftime("%Y-%m-%d") info = get_games_range(season_start_date, season_end_date, info, box, pbp) @@ -452,35 +478,40 @@ def get_game_ids(date: Union[str, datetime]) -> list: for i in range(ATTEMPTS): try: header = { - 'User-Agent': np.random.choice(USER_AGENTS), - 'Referer': np.random.choice(REFERERS), + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), } d = date.strftime("%Y%m%d") url = SCOREBOARD_URL.format(d) page = r.get(url, headers=header) soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) - ids = [x['id'] for x in scoreboard] + ids = [x["id"] for x in scoreboard] except Exception as ex: - if i+1 == ATTEMPTS: + if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df if soup is not None: - if 'Page not found.' in soup.text: + if "Page not found." in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error') - elif 'Page error' in soup.text: + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error' + ) + elif "Page error" in soup.text: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error' + ) elif scoreboard is None: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.' + ) else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}' + ) else: _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}') + f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}' + ) return pd.DataFrame([]) else: # try again @@ -506,8 +537,10 @@ def _parse_date(date: str) -> datetime: break if not parsed: - raise CouldNotParseError('The given date could not be parsed. Try any of these formats:\n' + - 'Y-m-d\nY/m/d\nm-d-Y\nm/d/Y') + raise CouldNotParseError( + "The given date could not be parsed. Try any of these formats:\n" + + "Y-m-d\nY/m/d\nm-d-Y\nm/d/Y" + ) return date @@ -523,179 +556,222 @@ def _get_game_boxscore_helper(boxscore, game_id): - the game boxscore as a DataFrame """ tm1, tm2 = boxscore[0], boxscore[1] - tm1_name, tm2_name = tm1['tm']['dspNm'], tm2['tm']['dspNm'] - tm1_stats, tm2_stats = tm1['stats'], tm2['stats'] + tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] + tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] - labels = tm1_stats[0]['lbls'] + labels = tm1_stats[0]["lbls"] - tm1_starters, tm1_bench, tm1_totals = tm1_stats[0][ - 'athlts'], tm1_stats[1]['athlts'], tm1_stats[2]['ttls'] - tm2_starters, tm2_bench, tm2_totals = tm2_stats[0][ - 'athlts'], tm2_stats[1]['athlts'], tm2_stats[2]['ttls'] + tm1_starters, tm1_bench, tm1_totals = ( + tm1_stats[0]["athlts"], + tm1_stats[1]["athlts"], + tm1_stats[2]["ttls"], + ) + tm2_starters, tm2_bench, tm2_totals = ( + tm2_stats[0]["athlts"], + tm2_stats[1]["athlts"], + tm2_stats[2]["ttls"], + ) # starters' stats if len(tm1_starters) > 0: - tm1_st_dict = {labels[i].lower(): [tm1_starters[j]['stats'][i] - for j in range(len(tm1_starters))] - for i in range(len(labels))} - - tm1_st_pos = [tm1_starters[i]['athlt']['pos'] - if 'pos' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] - tm1_st_id = [tm1_starters[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] - tm1_st_nm = [tm1_starters[i]['athlt']['shrtNm'] - if 'shrtNm' in tm1_starters[i]['athlt'].keys() - else '' - for i in range(len(tm1_starters))] + tm1_st_dict = { + labels[i].lower(): [ + tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) + ] + for i in range(len(labels)) + } + + tm1_st_pos = [ + tm1_starters[i]["athlt"]["pos"] + if "pos" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_id = [ + tm1_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_nm = [ + tm1_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, 'starter', True) - tm1_st_df.insert(0, 'position', tm1_st_pos) - tm1_st_df.insert(0, 'player_id', tm1_st_id) - tm1_st_df.insert(0, 'player', tm1_st_nm) - tm1_st_df.insert(0, 'team', tm1_name) - tm1_st_df.insert(0, 'game_id', game_id) + tm1_st_df.insert(0, "starter", True) + tm1_st_df.insert(0, "position", tm1_st_pos) + tm1_st_df.insert(0, "player_id", tm1_st_id) + tm1_st_df.insert(0, "player", tm1_st_nm) + tm1_st_df.insert(0, "team", tm1_name) + tm1_st_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm1_bench) > 0: - tm1_bn_dict = {labels[i].lower(): [tm1_bench[j]['stats'][i] - for j in range(len(tm1_bench))] - for i in range(len(labels))} - - tm1_bn_pos = [tm1_bench[i]['athlt']['pos'] - if 'pos' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] - tm1_bn_id = [tm1_bench[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] - tm1_bn_nm = [tm1_bench[i]['athlt']['shrtNm'] - if 'shrtNm' in tm1_bench[i]['athlt'].keys() - else '' - for i in range(len(tm1_bench))] + tm1_bn_dict = { + labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] + for i in range(len(labels)) + } + + tm1_bn_pos = [ + tm1_bench[i]["athlt"]["pos"] + if "pos" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_id = [ + tm1_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_nm = [ + tm1_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, 'starter', False) - tm1_bn_df.insert(0, 'position', tm1_bn_pos) - tm1_bn_df.insert(0, 'player_id', tm1_bn_id) - tm1_bn_df.insert(0, 'player', tm1_bn_nm) - tm1_bn_df.insert(0, 'team', tm1_name) - tm1_bn_df.insert(0, 'game_id', game_id) + tm1_bn_df.insert(0, "starter", False) + tm1_bn_df.insert(0, "position", tm1_bn_pos) + tm1_bn_df.insert(0, "player_id", tm1_bn_id) + tm1_bn_df.insert(0, "player", tm1_bn_nm) + tm1_bn_df.insert(0, "team", tm1_name) + tm1_bn_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm1_totals) > 0: - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] - for i in range(len(labels))} + tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, 'starter', False) - tm1_tot_df.insert(0, 'position', 'TOTAL') - tm1_tot_df.insert(0, 'player_id', 'TOTAL') - tm1_tot_df.insert(0, 'player', 'TEAM') - tm1_tot_df.insert(0, 'team', tm1_name) - tm1_tot_df.insert(0, 'game_id', game_id) + tm1_tot_df.insert(0, "starter", False) + tm1_tot_df.insert(0, "position", "TOTAL") + tm1_tot_df.insert(0, "player_id", "TOTAL") + tm1_tot_df.insert(0, "player", "TEAM") + tm1_tot_df.insert(0, "team", tm1_name) + tm1_tot_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm1_tot_df = pd.DataFrame(columns=cols) tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) # starters' stats if len(tm2_starters) > 0: - tm2_st_dict = {labels[i].lower(): [tm2_starters[j]['stats'][i] - for j in range(len(tm2_starters))] - for i in range(len(labels))} - - tm2_st_pos = [tm2_starters[i]['athlt']['pos'] - if 'pos' in tm2_starters[i]['athlt'].keys() - else '' - for i in range(len(tm2_starters))] - tm2_st_id = [tm2_starters[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm2_starters[i]['athlt'].keys() - else '' for i in range(len(tm2_starters))] - tm2_st_nm = [tm2_starters[i]['athlt']['shrtNm'] - if 'shrtNm' in tm2_starters[i]['athlt'].keys() - else '' - for i in range(len(tm2_starters))] + tm2_st_dict = { + labels[i].lower(): [ + tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) + ] + for i in range(len(labels)) + } + + tm2_st_pos = [ + tm2_starters[i]["athlt"]["pos"] + if "pos" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_id = [ + tm2_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_nm = [ + tm2_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, 'starter', True) - tm2_st_df.insert(0, 'position', tm2_st_pos) - tm2_st_df.insert(0, 'player_id', tm2_st_id) - tm2_st_df.insert(0, 'player', tm2_st_nm) - tm2_st_df.insert(0, 'team', tm2_name) - tm2_st_df.insert(0, 'game_id', game_id) + tm2_st_df.insert(0, "starter", True) + tm2_st_df.insert(0, "position", tm2_st_pos) + tm2_st_df.insert(0, "player_id", tm2_st_id) + tm2_st_df.insert(0, "player", tm2_st_nm) + tm2_st_df.insert(0, "team", tm2_name) + tm2_st_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_st_df = pd.DataFrame(columns=cols) # bench players' stats if len(tm2_bench) > 0: - tm2_bn_dict = {labels[i].lower(): [tm2_bench[j]['stats'][i] - for j in range(len(tm2_bench))] - for i in range(len(labels))} - - tm2_bn_pos = [tm2_bench[i]['athlt']['pos'] - if 'pos' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] - tm2_bn_id = [tm2_bench[i]['athlt']['uid'].split(':')[-1] - if 'uid' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] - tm2_bn_nm = [tm2_bench[i]['athlt']['shrtNm'] - if 'shrtNm' in tm2_bench[i]['athlt'].keys() - else '' - for i in range(len(tm2_bench))] + tm2_bn_dict = { + labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] + for i in range(len(labels)) + } + + tm2_bn_pos = [ + tm2_bench[i]["athlt"]["pos"] + if "pos" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_id = [ + tm2_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_nm = [ + tm2_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, 'starter', False) - tm2_bn_df.insert(0, 'position', tm2_bn_pos) - tm2_bn_df.insert(0, 'player_id', tm2_bn_id) - tm2_bn_df.insert(0, 'player', tm2_bn_nm) - tm2_bn_df.insert(0, 'team', tm2_name) - tm2_bn_df.insert(0, 'game_id', game_id) + tm2_bn_df.insert(0, "starter", False) + tm2_bn_df.insert(0, "position", tm2_bn_pos) + tm2_bn_df.insert(0, "player_id", tm2_bn_id) + tm2_bn_df.insert(0, "player", tm2_bn_nm) + tm2_bn_df.insert(0, "team", tm2_name) + tm2_bn_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_bn_df = pd.DataFrame(columns=cols) # team totals if len(tm2_totals) > 0: - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] - for i in range(len(labels))} + tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, 'starter', False) - tm2_tot_df.insert(0, 'position', 'TOTAL') - tm2_tot_df.insert(0, 'player_id', 'TOTAL') - tm2_tot_df.insert(0, 'player', 'TEAM') - tm2_tot_df.insert(0, 'team', tm2_name) - tm2_tot_df.insert(0, 'game_id', game_id) + tm2_tot_df.insert(0, "starter", False) + tm2_tot_df.insert(0, "position", "TOTAL") + tm2_tot_df.insert(0, "player_id", "TOTAL") + tm2_tot_df.insert(0, "player", "TEAM") + tm2_tot_df.insert(0, "team", tm2_name) + tm2_tot_df.insert(0, "game_id", game_id) else: - cols = ['starter', 'position', 'player_id', 'player', - 'team', 'game_id'] + [x.lower() for x in labels] + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] tm2_tot_df = pd.DataFrame(columns=cols) tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) @@ -707,18 +783,12 @@ def _get_game_boxscore_helper(boxscore, game_id): return pd.DataFrame([]) # SPLIT UP THE FG FIELDS - fgm = pd.to_numeric([x.split("-")[0] - for x in df["fg"]], errors='coerce') - fga = pd.to_numeric([x.split("-")[1] - for x in df["fg"]], errors='coerce') - thpm = pd.to_numeric([x.split("-")[0] - for x in df["3pt"]], errors='coerce') - thpa = pd.to_numeric([x.split("-")[1] - for x in df["3pt"]], errors='coerce') - ftm = pd.to_numeric([x.split("-")[0] - for x in df["ft"]], errors='coerce') - fta = pd.to_numeric([x.split("-")[1] - for x in df["ft"]], errors='coerce') + fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") + fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") + thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") + thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") + ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") + fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") # GET RID OF UNWANTED COLUMNS df = df.drop(columns=["fg", "3pt", "ft"]) @@ -734,16 +804,16 @@ def _get_game_boxscore_helper(boxscore, game_id): df.insert(14, "fta", fta) # column type handling - df['min'] = pd.to_numeric(df['min'], errors='coerce') - df['oreb'] = pd.to_numeric(df['oreb'], errors='coerce') - df['dreb'] = pd.to_numeric(df['dreb'], errors='coerce') - df['reb'] = pd.to_numeric(df['reb'], errors='coerce') - df['ast'] = pd.to_numeric(df['ast'], errors='coerce') - df['stl'] = pd.to_numeric(df['stl'], errors='coerce') - df['blk'] = pd.to_numeric(df['blk'], errors='coerce') - df['to'] = pd.to_numeric(df['to'], errors='coerce') - df['pf'] = pd.to_numeric(df['pf'], errors='coerce') - df['pts'] = pd.to_numeric(df['pts'], errors='coerce') + df["min"] = pd.to_numeric(df["min"], errors="coerce") + df["oreb"] = pd.to_numeric(df["oreb"], errors="coerce") + df["dreb"] = pd.to_numeric(df["dreb"], errors="coerce") + df["reb"] = pd.to_numeric(df["reb"], errors="coerce") + df["ast"] = pd.to_numeric(df["ast"], errors="coerce") + df["stl"] = pd.to_numeric(df["stl"], errors="coerce") + df["blk"] = pd.to_numeric(df["blk"], errors="coerce") + df["to"] = pd.to_numeric(df["to"], errors="coerce") + df["pf"] = pd.to_numeric(df["pf"], errors="coerce") + df["pts"] = pd.to_numeric(df["pts"], errors="coerce") return df @@ -758,56 +828,74 @@ def _get_game_pbp_helper(gamepackage, game_id): Returns - the game PBP as a DataFrame """ - pbp = gamepackage['pbp'] - home_team = pbp['tms']['home']['displayName'] - away_team = pbp['tms']['away']['displayName'] + pbp = gamepackage["pbp"] + home_team = pbp["tms"]["home"]["displayName"] + away_team = pbp["tms"]["away"]["displayName"] - all_plays = [play for quart in pbp['playGrps'] for play in quart] + all_plays = [play for quart in pbp["playGrps"] for play in quart] # check if PBP exists if len(all_plays) <= 0: _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') return pd.DataFrame([]) - descs = [x['text'] if 'text' in x.keys() else '' for x in all_plays] - teams = ['' if not 'homeAway' in x.keys() - else home_team if x['homeAway'] == 'home' else away_team for x in all_plays] - hscores = [int(x['homeScore']) if 'homeScore' in x.keys() - else np.nan for x in all_plays] - ascores = [int(x['awayScore']) if 'awayScore' in x.keys() - else np.nan for x in all_plays] - quarters = [int(x['period']['number']) - if 'period' in x.keys() else np.nan for x in all_plays] - - time_splits = [x['clock']['displayValue'].split(':') if 'clock' in x.keys() - else '' for x in all_plays] + descs = [x["text"] if "text" in x.keys() else "" for x in all_plays] + teams = [ + "" + if not "homeAway" in x.keys() + else home_team + if x["homeAway"] == "home" + else away_team + for x in all_plays + ] + hscores = [ + int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays + ] + ascores = [ + int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays + ] + quarters = [ + int(x["period"]["number"]) if "period" in x.keys() else np.nan + for x in all_plays + ] + + time_splits = [ + x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" + for x in all_plays + ] minutes = [int(x[0]) for x in time_splits] seconds = [int(x[1]) for x in time_splits] - min_to_sec = [x*60 for x in minutes] - qt_secs_left = [x+y for x, y in zip(min_to_sec, seconds)] - reg_secs_left = [1800+x if qt_num == 1 - else 1200+x if qt_num == 2 - else 600+x if qt_num == 3 - else x - for x, qt_num in zip(qt_secs_left, quarters)] - - sc_play = [True if 'scoringPlay' in x.keys() - else False for x in all_plays] - is_assisted = [True if ('text' in x.keys() and 'assisted' in x['text'].lower()) - else False for x in all_plays] + min_to_sec = [x * 60 for x in minutes] + qt_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] + reg_secs_left = [ + 1800 + x + if qt_num == 1 + else 1200 + x + if qt_num == 2 + else 600 + x + if qt_num == 3 + else x + for x, qt_num in zip(qt_secs_left, quarters) + ] + + sc_play = [True if "scoringPlay" in x.keys() else False for x in all_plays] + is_assisted = [ + True if ("text" in x.keys() and "assisted" in x["text"].lower()) else False + for x in all_plays + ] # ASSIGN PLAY TYPES p_types = [] for x in all_plays: - if not 'text' in x.keys(): - p_types.append('') + if not "text" in x.keys(): + p_types.append("") continue - play = x['text'] + play = x["text"] if not type(play) == str: - play = '' + play = "" added = False for pt in NON_SHOT_TYPES: @@ -823,116 +911,119 @@ def _get_game_pbp_helper(gamepackage, game_id): break if not added: - p_types.append('') + p_types.append("") # FIND SHOOTERS - shooting_play = [True if x in - (y.lower() for y in SHOT_TYPES) else False for x in p_types] + shooting_play = [ + True if x in (y.lower() for y in SHOT_TYPES) else False for x in p_types + ] - scorers = [x[0].split(' made ')[0] if x[1] else '' for x in - zip(descs, sc_play)] + scorers = [x[0].split(" made ")[0] if x[1] else "" for x in zip(descs, sc_play)] - non_scorers = [x[0].split(' missed ')[0] if x[1] in (y.lower() for y in SHOT_TYPES) - and not x[2] else '' for x in zip(descs, p_types, sc_play)] + non_scorers = [ + x[0].split(" missed ")[0] + if x[1] in (y.lower() for y in SHOT_TYPES) and not x[2] + else "" + for x in zip(descs, p_types, sc_play) + ] - shooters = [x[0] if not x[0] == '' else x[1] - for x in zip(scorers, non_scorers)] + shooters = [x[0] if not x[0] == "" else x[1] for x in zip(scorers, non_scorers)] - assisted_pls = [x[0].split('Assisted by ')[-1].replace('.', '') if x[1] else '' for x in - zip(descs, is_assisted)] + assisted_pls = [ + x[0].split("Assisted by ")[-1].replace(".", "") if x[1] else "" + for x in zip(descs, is_assisted) + ] - is_three = ['three point' in x.lower() for x in descs] + is_three = ["three point" in x.lower() for x in descs] data = { - 'game_id': game_id, - 'home_team': home_team, - 'away_team': away_team, - 'play_desc': descs, - 'home_score': hscores, - 'away_score': ascores, - 'quarter': quarters, - 'secs_left_qt': qt_secs_left, - 'secs_left_reg': reg_secs_left, - 'play_team': teams, - 'play_type': p_types, - 'shooting_play': shooting_play, - 'scoring_play': sc_play, - 'is_three': is_three, - 'shooter': shooters, - 'is_assisted': is_assisted, - 'assist_player': assisted_pls, + "game_id": game_id, + "home_team": home_team, + "away_team": away_team, + "play_desc": descs, + "home_score": hscores, + "away_score": ascores, + "quarter": quarters, + "secs_left_qt": qt_secs_left, + "secs_left_reg": reg_secs_left, + "play_team": teams, + "play_type": p_types, + "shooting_play": shooting_play, + "scoring_play": sc_play, + "is_three": is_three, + "shooter": shooters, + "is_assisted": is_assisted, + "assist_player": assisted_pls, } df = pd.DataFrame(data) # add shot data if it exists - is_shotchart = 'shtChrt' in gamepackage + is_shotchart = "shtChrt" in gamepackage if is_shotchart: - chart = gamepackage['shtChrt']['plays'] - - shotteams = [x['homeAway'] for x in chart] - shotdescs = [x['text'] for x in chart] - xs = [50-int(x['coordinate']['x']) for x in chart] - ys = [int(x['coordinate']['y']) for x in chart] - - shot_data = { - 'team': shotteams, - 'play_desc': shotdescs, - 'x': xs, - 'y': ys - } + chart = gamepackage["shtChrt"]["plays"] + + shotteams = [x["homeAway"] for x in chart] + shotdescs = [x["text"] for x in chart] + xs = [50 - int(x["coordinate"]["x"]) for x in chart] + ys = [int(x["coordinate"]["y"]) for x in chart] + + shot_data = {"team": shotteams, "play_desc": shotdescs, "x": xs, "y": ys} shot_df = pd.DataFrame(shot_data) # shot matching shot_info = { - 'shot_x': [], - 'shot_y': [], + "shot_x": [], + "shot_y": [], } shot_count = 0 for play, isshot in zip(df.play_desc, df.shooting_play): if shot_count >= len(shot_df): - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) continue if not isshot: - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) continue - if 'free throw' in play.lower(): - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + if "free throw" in play.lower(): + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) shot_count += 1 continue shot_play = shot_df.play_desc.iloc[shot_count] if play == shot_play: - shot_info['shot_x'].append(shot_df.x.iloc[shot_count]) - shot_info['shot_y'].append(shot_df.y.iloc[shot_count]) + shot_info["shot_x"].append(shot_df.x.iloc[shot_count]) + shot_info["shot_y"].append(shot_df.y.iloc[shot_count]) shot_count += 1 else: - shot_info['shot_x'].append(np.nan) - shot_info['shot_y'].append(np.nan) + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) # make sure that length of shot data matches number of shots in PBP data - if (not (len(shot_info['shot_x']) == len(df))) or (not (len(shot_info['shot_y']) == len(df))): + if (not (len(shot_info["shot_x"]) == len(df))) or ( + not (len(shot_info["shot_y"]) == len(df)) + ): _log.warning( - f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data') - df['shot_x'] = np.nan - df['shot_y'] = np.nan + f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data' + ) + df["shot_x"] = np.nan + df["shot_y"] = np.nan return df - df['shot_x'] = shot_info['shot_x'] - df['shot_y'] = shot_info['shot_y'] + df["shot_x"] = shot_info["shot_x"] + df["shot_y"] = shot_info["shot_y"] else: - df['shot_x'] = np.nan - df['shot_y'] = np.nan + df["shot_x"] = np.nan + df["shot_y"] = np.nan return df return df @@ -949,77 +1040,79 @@ def _get_game_info_helper(info, more_info, game_id): Returns - the game metadata as a DataFrame """ - attendance = int(info['attnd']) if 'attnd' in info.keys() else np.nan - capacity = int(info['cpcty']) if 'cpcty' in info.keys() else np.nan - network = info['cvrg'] if 'cvrg' in info.keys() else '' + attendance = int(info["attnd"]) if "attnd" in info.keys() else np.nan + capacity = int(info["cpcty"]) if "cpcty" in info.keys() else np.nan + network = info["cvrg"] if "cvrg" in info.keys() else "" - gm_date = parse(info['dtTm']) - game_date = gm_date.replace( - tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) + gm_date = parse(info["dtTm"]) + game_date = gm_date.replace(tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) game_day = game_date.strftime("%B %d, %Y") game_time = game_date.strftime("%I:%M %p %Z") - arena = info['loc'] if 'loc' in info.keys() else '' - loc = info['locAddr']['city'] + ', ' + \ - info['locAddr']['state'] if 'locAddr' in info.keys() else '' + arena = info["loc"] if "loc" in info.keys() else "" + loc = ( + info["locAddr"]["city"] + ", " + info["locAddr"]["state"] + if "locAddr" in info.keys() + else "" + ) - tot_refs = info['refs'] if 'refs' in info.keys() else {} - ref_1 = tot_refs[0]['dspNm'] if len(tot_refs) > 0 else '' - ref_2 = tot_refs[1]['dspNm'] if len(tot_refs) > 1 else '' - ref_3 = tot_refs[2]['dspNm'] if len(tot_refs) > 2 else '' + tot_refs = info["refs"] if "refs" in info.keys() else {} + ref_1 = tot_refs[0]["dspNm"] if len(tot_refs) > 0 else "" + ref_2 = tot_refs[1]["dspNm"] if len(tot_refs) > 1 else "" + ref_3 = tot_refs[2]["dspNm"] if len(tot_refs) > 2 else "" - teams = more_info['tms'] + teams = more_info["tms"] ht_info, at_info = teams[0], teams[1] - home_team, away_team = ht_info['displayName'], at_info['displayName'] + home_team, away_team = ht_info["displayName"], at_info["displayName"] - home_id = ht_info['id'] - away_id = at_info['id'] + home_id = ht_info["id"] + away_id = at_info["id"] - if len(ht_info['links']) == 0: + if len(ht_info["links"]) == 0: ht = home_team.lower().replace(" ", "-") home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - elif len(ht_info['records']) == 0: + elif len(ht_info["records"]) == 0: ht = home_team.lower().replace(" ", "-") home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - if len(at_info['links']) == 0: + if len(at_info["links"]) == 0: at = away_team.lower().replace(" ", "-") away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - elif len(at_info['records']) == 0: + elif len(at_info["records"]) == 0: at = away_team.lower().replace(" ", "-") away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - home_rank = ht_info['rank'] if 'rank' in ht_info.keys() else np.nan - away_rank = at_info['rank'] if 'rank' in at_info.keys() else np.nan + home_rank = ht_info["rank"] if "rank" in ht_info.keys() else np.nan + away_rank = at_info["rank"] if "rank" in at_info.keys() else np.nan - home_record = ht_info['records'][0]['displayValue'] if len( - ht_info['records']) > 0 else '' - away_record = at_info['records'][0]['displayValue'] if len( - at_info['records']) > 0 else '' + home_record = ( + ht_info["records"][0]["displayValue"] if len(ht_info["records"]) > 0 else "" + ) + away_record = ( + at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" + ) - home_score, away_score = int( - ht_info['score']), int(at_info['score']) + home_score, away_score = int(ht_info["score"]), int(at_info["score"]) home_win = True if home_score > away_score else False - is_postseason = True if more_info['seasonType'] == 3 else False - is_conference = more_info['isConferenceGame'] + is_postseason = True if more_info["seasonType"] == 3 else False + is_conference = more_info["isConferenceGame"] - if len(ht_info['records']) > 1 and ht_info['records'][1]['type'] == 'home': + if len(ht_info["records"]) > 1 and ht_info["records"][1]["type"] == "home": is_neutral = False - elif len(at_info['records']) > 1 and at_info['records'][1]['type'] == 'away': + elif len(at_info["records"]) > 1 and at_info["records"][1]["type"] == "away": is_neutral = False else: is_neutral = True - tournament = more_info['nte'] if 'nte' in more_info.keys() else '' + tournament = more_info["nte"] if "nte" in more_info.keys() else "" - if ('linescores' in ht_info) and ('linescores' in at_info): - h_ot, a_ot = len(ht_info['linescores']) - \ - 4, len(at_info['linescores']) - 4 + if ("linescores" in ht_info) and ("linescores" in at_info): + h_ot, a_ot = len(ht_info["linescores"]) - 4, len(at_info["linescores"]) - 4 assert h_ot == a_ot num_ots = h_ot else: @@ -1053,37 +1146,37 @@ def _get_game_info_helper(info, more_info, game_id): network, ref_1, ref_2, - ref_3 + ref_3, ] game_info_cols = [ - 'game_id', - 'home_team', - 'home_id', - 'home_rank', - 'home_record', - 'home_score', - 'away_team', - 'away_id', - 'away_rank', - 'away_record', - 'away_score', - 'home_win', - 'num_ots', - 'is_conference', - 'is_neutral', - 'is_postseason', - 'tournament', - 'game_day', - 'game_time', - 'game_loc', - 'arena', - 'arena_capacity', - 'attendance', - 'tv_network', - 'referee_1', - 'referee_2', - 'referee_3' + "game_id", + "home_team", + "home_id", + "home_rank", + "home_record", + "home_score", + "away_team", + "away_id", + "away_rank", + "away_record", + "away_score", + "home_win", + "num_ots", + "is_conference", + "is_neutral", + "is_postseason", + "tournament", + "game_day", + "game_time", + "game_loc", + "arena", + "arena_capacity", + "attendance", + "tv_network", + "referee_1", + "referee_2", + "referee_3", ] return pd.DataFrame([game_info_list], columns=game_info_cols) @@ -1092,14 +1185,14 @@ def _get_game_info_helper(info, more_info, game_id): def _get_gamepackage_from_soup(soup): script_string = _find_json_in_content(soup) - if script_string == '': + if script_string == "": return None pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' + js = "{" + found + "}" jsn = json.loads(js) - gamepackage = jsn['page']['content']['gamepackage'] + gamepackage = jsn["page"]["content"]["gamepackage"] return gamepackage @@ -1107,21 +1200,21 @@ def _get_gamepackage_from_soup(soup): def _get_scoreboard_from_soup(soup): script_string = _find_json_in_content(soup) - if script_string == '': + if script_string == "": return None pattern = re.compile(JSON_REGEX) found = re.search(pattern, script_string).group(1) - js = '{' + found + '}' + js = "{" + found + "}" jsn = json.loads(js) - scoreboard = jsn['page']['content']['scoreboard']['evts'] + scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] return scoreboard def _find_json_in_content(soup): - script_string = '' - for x in soup.find_all('script'): + script_string = "" + for x in soup.find_all("script"): if WINDOW_STRING in x.text: script_string = x.text break From 3d174c42166e2f0b00b39baadf9ff0a8ce562b48 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 27 Dec 2023 00:43:24 -0600 Subject: [PATCH 52/53] organization; boxscore, pbp --- src/cbbpy/cbbpy_utils.py | 1248 +++++++++++++++++++++++++++++++++++ src/cbbpy/mens_scraper.py | 1136 +------------------------------ src/cbbpy/womens_scraper.py | 84 +-- 3 files changed, 1278 insertions(+), 1190 deletions(-) create mode 100644 src/cbbpy/cbbpy_utils.py diff --git a/src/cbbpy/cbbpy_utils.py b/src/cbbpy/cbbpy_utils.py new file mode 100644 index 0000000..c4b0013 --- /dev/null +++ b/src/cbbpy/cbbpy_utils.py @@ -0,0 +1,1248 @@ +from bs4 import BeautifulSoup as bs +import requests as r +import pandas as pd +import numpy as np +from datetime import datetime, timezone +from dateutil.parser import parse +from pytz import timezone as tz +from tqdm import trange +from joblib import Parallel, delayed +import re +import time +import traceback +import json +import os +import logging + + +ATTEMPTS = 15 +DATE_PARSES = [ + "%Y-%m-%d", + "%Y/%m/%d", + "%m-%d-%Y", + "%m/%d/%Y", +] +USER_AGENTS = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " + + "(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36", +] +REFERERS = [ + "https://google.com/", + "https://youtube.com/", + "https://facebook.com/", + "https://twitter.com/", + "https://nytimes.com/", + "https://washingtonpost.com/", + "https://linkedin.com/", + "https://nhl.com/", + "https://mlb.com/", + "https://nfl.com/", +] +MENS_SCOREBOARD_URL = "https://www.espn.com/mens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" +MENS_GAME_URL = "https://www.espn.com/mens-college-basketball/game/_/gameId/{}" +MENS_BOXSCORE_URL = "https://www.espn.com/mens-college-basketball/boxscore/_/gameId/{}" +MENS_PBP_URL = "https://www.espn.com/mens-college-basketball/playbyplay/_/gameId/{}" +WOMENS_SCOREBOARD_URL = "https://www.espn.com/womens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" +WOMENS_GAME_URL = "https://www.espn.com/womens-college-basketball/game/_/gameId/{}" +WOMENS_BOXSCORE_URL = ( + "https://www.espn.com/womens-college-basketball/boxscore/_/gameId/{}" +) +WOMENS_PBP_URL = "https://www.espn.com/womens-college-basketball/playbyplay/_/gameId/{}" +NON_SHOT_TYPES = [ + "TV Timeout", + "Jump Ball", + "Turnover", + "Timeout", + "Rebound", + "Block", + "Steal", + "Foul", + "End", +] +SHOT_TYPES = [ + "Three Point Jumper", + "Two Point Tip Shot", + "Free Throw", + "Jumper", + "Layup", + "Dunk", +] +WINDOW_STRING = "window['__espnfitt__']=" +JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" +STATUS_OK = 200 + + +logging.basicConfig(filename="cbbpy.log") +_log = logging.getLogger(__name__) + + +# pnf_ will keep track of games w/ page not found errors +# if game has error, don't run the other scrape functions to save time +pnf_ = [] + + +class CouldNotParseError(Exception): + pass + + +class InvalidDateRangeError(Exception): + pass + + +def _get_game(game_id, game_type, info, box, pbp): + """A function that scrapes all game info (metadata, boxscore, play-by-play). + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + game_info_df = boxscore_df = pbp_df = pd.DataFrame([]) + + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Game Info: Page not found error') + elif info: + game_info_df = _get_game_info(game_id, game_type) + + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') + elif box: + boxscore_df = _get_game_boxscore(game_id, game_type) + + if game_id in pnf_: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page not found error') + elif pbp: + pbp_df = _get_game_pbp(game_id, game_type) + + return (game_info_df, boxscore_df, pbp_df) + + +def _get_games_range(start_date, end_date, game_type, info, box, pbp): + """A function that scrapes a game information between a given range of dates. + + Parameters: + - start_date: a string representing the first day of games to scrape + - end_date: a string representing the last day of games to scrape (inclusive) + - info: a boolean denoting whether game metadata is to be scraped + - box: a boolean denoting whether game boxscore is to be scraped + - pbp: a boolean denoting whether game play-by-play is to be scraped + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + sd = _parse_date(start_date) + ed = _parse_date(end_date) + date_range = pd.date_range(sd, ed) + len_scrape = len(date_range) + all_data = [] + cpus = os.cpu_count() - 1 + + if len_scrape < 1: + raise InvalidDateRangeError("The start date must be sooner than the end date.") + + if sd > datetime.today(): + raise InvalidDateRangeError("The start date must not be in the future.") + + if ed > datetime.today(): + raise InvalidDateRangeError("The end date must not be in the future.") + + bar_format = ( + "{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec" + ) + + with trange(len_scrape, bar_format=bar_format) as t: + for i in t: + date = date_range[i] + game_ids = _get_game_ids(date) + t.set_description( + f"Scraping {len(game_ids)} games on {date.strftime('%D')}", + refresh=False, + ) + + if len(game_ids) > 0: + result = Parallel(n_jobs=cpus)( + delayed(_get_game)(gid, game_type, info=info, box=box, pbp=pbp) + for gid in game_ids + ) + all_data.append(result) + + else: + t.set_description(f"No games on {date.strftime('%D')}", refresh=False) + + if not len(all_data) > 0: + return () + + game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( + drop=True + ) + game_boxscore_df = pd.concat( + [game[1] for day in all_data for game in day] + ).reset_index(drop=True) + game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( + drop=True + ) + + return (game_info_df, game_boxscore_df, game_pbp_df) + + +def _get_games_season(season, game_type, info, box, pbp): + """A function that scrapes all game info (metadata, boxscore, play-by-play) for every game of + a given season. + + Parameters: + - season: an integer representing the season to be scraped. NOTE: season is takes the form + of the four-digit representation of the later year of the season. So, as an example, the + 2021-22 season is referred to by the integer 2022. + + Returns + - (game_info_df, boxscore_df, pbp_df), a tuple consisting of: + -- game_info_df: a DataFrame of the game's metadata + -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) + -- pbp_df: a DataFrame of the game's play-by-play + """ + season_start_date = f"{season-1}-11-01" + season_end_date = f"{season}-05-01" + + # if season has not ended yet, set end scrape date to today + if datetime.strptime(season_end_date, "%Y-%m-%d") > datetime.today(): + season_end_date = datetime.today().strftime("%Y-%m-%d") + + info = _get_games_range( + season_start_date, season_end_date, game_type, info, box, pbp + ) + + return info + + +def _get_game_ids(date, game_type): + """A function that scrapes all game IDs on a date. + + Parameters: + - date: a string/datetime object representing the date to be scraped + + Returns + - a list of ESPN all game IDs for games played on the date given + """ + soup = None + + if game_type == "mens": + pre_url = MENS_BOXSCORE_URL + else: + pre_url = WOMENS_BOXSCORE_URL + + if type(date) == str: + date = _parse_date(date) + + for i in range(ATTEMPTS): + try: + header = { + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), + } + d = date.strftime("%Y%m%d") + url = pre_url.format(d) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + scoreboard = _get_scoreboard_from_soup(soup) + ids = [x["id"] for x in scoreboard] + + except Exception as ex: + if i + 1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if soup is not None: + if "Page not found." in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error' + ) + elif "Page error" in soup.text: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error' + ) + elif scoreboard is None: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.' + ) + else: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}' + ) + else: + _log.error( + f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}' + ) + return pd.DataFrame([]) + else: + # try again + time.sleep(2) + continue + else: + # no exception thrown + break + + return ids + + +def _get_game_boxscore(game_id, game_type): + """A function that scrapes a game's boxscore. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - the game boxscore as a DataFrame + """ + soup = None + + if game_type == "mens": + pre_url = MENS_BOXSCORE_URL + else: + pre_url = WOMENS_BOXSCORE_URL + + for i in range(ATTEMPTS): + try: + header = { + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), + } + url = pre_url.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + gamepackage = _get_gamepackage_from_soup(soup) + + # check if game was postponed + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + boxscore = gamepackage["bxscr"] + + df = _get_game_boxscore_helper(boxscore, game_id) + + except Exception as ex: + if soup is not None: + if "No Box Score Available" in soup.text: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) + + if i + 1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if soup is not None: + if "Page not found." in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page not found error' + ) + pnf_.append(game_id) + elif "Page error" in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Page error' + ) + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}' + ) + return pd.DataFrame([]) + else: + # try again + time.sleep(2) + continue + else: + # no exception thrown + break + + return df + + +def _get_game_pbp(game_id, game_type): + """A function that scrapes a game's play-by-play information. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - the game's play-by-play information represented as a DataFrame + """ + soup = None + + if game_type == "mens": + pre_url = MENS_PBP_URL + else: + pre_url = WOMENS_PBP_URL + + for i in range(ATTEMPTS): + try: + header = { + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), + } + url = pre_url.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + gamepackage = _get_gamepackage_from_soup(soup) + + # check if game was postponed + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + df = _get_game_pbp_helper(gamepackage, game_id, game_type) + + except Exception as ex: + if i + 1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if soup is not None: + if "Page not found." in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Page not found error' + ) + pnf_.append(game_id) + elif "Page error" in soup.text: + _log.error(f'"{time.ctime()}": {game_id} - PBP: Page error') + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}' + ) + return pd.DataFrame([]) + else: + # try again + time.sleep(2) + continue + else: + # no exception thrown + break + + return df + + +def _get_game_info(game_id, game_type): + """A function that scrapes game metadata. + + Parameters: + - game_id: a string representing the game's ESPN game ID + + Returns + - a DataFrame with one row and a column for each piece of metadata + """ + soup = None + + if game_type == "mens": + pre_url = MENS_GAME_URL + else: + pre_url = WOMENS_GAME_URL + + for i in range(ATTEMPTS): + try: + header = { + "User-Agent": np.random.choice(USER_AGENTS), + "Referer": np.random.choice(REFERERS), + } + url = pre_url.format(game_id) + page = r.get(url, headers=header) + soup = bs(page.content, "lxml") + gamepackage = _get_gamepackage_from_soup(soup) + + # check if game was postponed + gm_status = gamepackage["gmStrp"]["status"]["desc"] + gsbool = gm_status == "Final" # or (gm_status == 'In Progress') + if not gsbool: + _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') + return pd.DataFrame([]) + + # get general game info + info = gamepackage["gmInfo"] + + # get team info + more_info = gamepackage["gmStrp"] + + df = _get_game_info_helper(info, more_info, game_id) + + except Exception as ex: + if i + 1 == ATTEMPTS: + # max number of attempts reached, so return blank df + if soup is not None: + if "Page not found." in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page not found error' + ) + pnf_.append(game_id) + elif "Page error" in soup.text: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Page error' + ) + elif gamepackage is None: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}' + ) + else: + _log.error( + f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}' + ) + return pd.DataFrame([]) + else: + # try again + time.sleep(2) + continue + else: + # no exception thrown + break + + return df + + +def _parse_date(date): + parsed = False + + for parse in DATE_PARSES: + try: + date = datetime.strptime(date, parse) + except: + continue + else: + parsed = True + break + + if not parsed: + raise CouldNotParseError( + "The given date could not be parsed. Try any of these formats:\n" + + "Y-m-d\nY/m/d\nm-d-Y\nm/d/Y" + ) + + return date + + +def _get_game_boxscore_helper(boxscore, game_id): + """A helper function that cleans a game's boxscore. + + Parameters: + - boxscore: a JSON object containing the boxscore + - game_id: a string representing the game's ESPN game ID + + Returns + - the game boxscore as a DataFrame + """ + tm1, tm2 = boxscore[0], boxscore[1] + tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] + tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] + + labels = tm1_stats[0]["lbls"] + + tm1_starters, tm1_bench, tm1_totals = ( + tm1_stats[0]["athlts"], + tm1_stats[1]["athlts"], + tm1_stats[2]["ttls"], + ) + tm2_starters, tm2_bench, tm2_totals = ( + tm2_stats[0]["athlts"], + tm2_stats[1]["athlts"], + tm2_stats[2]["ttls"], + ) + + # starters' stats + if len(tm1_starters) > 0: + tm1_st_dict = { + labels[i].lower(): [ + tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) + ] + for i in range(len(labels)) + } + + tm1_st_pos = [ + tm1_starters[i]["athlt"]["pos"] + if "pos" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_id = [ + tm1_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + tm1_st_nm = [ + tm1_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_starters[i]["athlt"].keys() + else "" + for i in range(len(tm1_starters)) + ] + + tm1_st_df = pd.DataFrame(tm1_st_dict) + tm1_st_df.insert(0, "starter", True) + tm1_st_df.insert(0, "position", tm1_st_pos) + tm1_st_df.insert(0, "player_id", tm1_st_id) + tm1_st_df.insert(0, "player", tm1_st_nm) + tm1_st_df.insert(0, "team", tm1_name) + tm1_st_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm1_st_df = pd.DataFrame(columns=cols) + + # bench players' stats + if len(tm1_bench) > 0: + tm1_bn_dict = { + labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] + for i in range(len(labels)) + } + + tm1_bn_pos = [ + tm1_bench[i]["athlt"]["pos"] + if "pos" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_id = [ + tm1_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + tm1_bn_nm = [ + tm1_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm1_bench[i]["athlt"].keys() + else "" + for i in range(len(tm1_bench)) + ] + + tm1_bn_df = pd.DataFrame(tm1_bn_dict) + tm1_bn_df.insert(0, "starter", False) + tm1_bn_df.insert(0, "position", tm1_bn_pos) + tm1_bn_df.insert(0, "player_id", tm1_bn_id) + tm1_bn_df.insert(0, "player", tm1_bn_nm) + tm1_bn_df.insert(0, "team", tm1_name) + tm1_bn_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm1_bn_df = pd.DataFrame(columns=cols) + + # team totals + if len(tm1_totals) > 0: + tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} + + tm1_tot_df = pd.DataFrame(tm1_tot_dict) + tm1_tot_df.insert(0, "starter", False) + tm1_tot_df.insert(0, "position", "TOTAL") + tm1_tot_df.insert(0, "player_id", "TOTAL") + tm1_tot_df.insert(0, "player", "TEAM") + tm1_tot_df.insert(0, "team", tm1_name) + tm1_tot_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm1_tot_df = pd.DataFrame(columns=cols) + + tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) + + # starters' stats + if len(tm2_starters) > 0: + tm2_st_dict = { + labels[i].lower(): [ + tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) + ] + for i in range(len(labels)) + } + + tm2_st_pos = [ + tm2_starters[i]["athlt"]["pos"] + if "pos" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_id = [ + tm2_starters[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + tm2_st_nm = [ + tm2_starters[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_starters[i]["athlt"].keys() + else "" + for i in range(len(tm2_starters)) + ] + + tm2_st_df = pd.DataFrame(tm2_st_dict) + tm2_st_df.insert(0, "starter", True) + tm2_st_df.insert(0, "position", tm2_st_pos) + tm2_st_df.insert(0, "player_id", tm2_st_id) + tm2_st_df.insert(0, "player", tm2_st_nm) + tm2_st_df.insert(0, "team", tm2_name) + tm2_st_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm2_st_df = pd.DataFrame(columns=cols) + + # bench players' stats + if len(tm2_bench) > 0: + tm2_bn_dict = { + labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] + for i in range(len(labels)) + } + + tm2_bn_pos = [ + tm2_bench[i]["athlt"]["pos"] + if "pos" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_id = [ + tm2_bench[i]["athlt"]["uid"].split(":")[-1] + if "uid" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + tm2_bn_nm = [ + tm2_bench[i]["athlt"]["shrtNm"] + if "shrtNm" in tm2_bench[i]["athlt"].keys() + else "" + for i in range(len(tm2_bench)) + ] + + tm2_bn_df = pd.DataFrame(tm2_bn_dict) + tm2_bn_df.insert(0, "starter", False) + tm2_bn_df.insert(0, "position", tm2_bn_pos) + tm2_bn_df.insert(0, "player_id", tm2_bn_id) + tm2_bn_df.insert(0, "player", tm2_bn_nm) + tm2_bn_df.insert(0, "team", tm2_name) + tm2_bn_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm2_bn_df = pd.DataFrame(columns=cols) + + # team totals + if len(tm2_totals) > 0: + tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} + + tm2_tot_df = pd.DataFrame(tm2_tot_dict) + tm2_tot_df.insert(0, "starter", False) + tm2_tot_df.insert(0, "position", "TOTAL") + tm2_tot_df.insert(0, "player_id", "TOTAL") + tm2_tot_df.insert(0, "player", "TEAM") + tm2_tot_df.insert(0, "team", tm2_name) + tm2_tot_df.insert(0, "game_id", game_id) + + else: + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + tm2_tot_df = pd.DataFrame(columns=cols) + + tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) + + df = pd.concat([tm1_df, tm2_df]) + + if len(df) <= 0: + _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') + return pd.DataFrame([]) + + # SPLIT UP THE FG FIELDS + fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") + fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") + thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") + thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") + ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") + fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") + + # GET RID OF UNWANTED COLUMNS + df = df.drop(columns=["fg", "3pt", "ft"]) + + # INSERT COLUMNS WHERE NECESSARY + df.insert(7, "fgm", fgm) + df.insert(8, "fga", fga) + df.insert(9, "2pm", fgm - thpm) + df.insert(10, "2pa", fga - thpa) + df.insert(11, "3pm", thpm) + df.insert(12, "3pa", thpa) + df.insert(13, "ftm", ftm) + df.insert(14, "fta", fta) + + # column type handling + df["min"] = pd.to_numeric(df["min"], errors="coerce") + df["oreb"] = pd.to_numeric(df["oreb"], errors="coerce") + df["dreb"] = pd.to_numeric(df["dreb"], errors="coerce") + df["reb"] = pd.to_numeric(df["reb"], errors="coerce") + df["ast"] = pd.to_numeric(df["ast"], errors="coerce") + df["stl"] = pd.to_numeric(df["stl"], errors="coerce") + df["blk"] = pd.to_numeric(df["blk"], errors="coerce") + df["to"] = pd.to_numeric(df["to"], errors="coerce") + df["pf"] = pd.to_numeric(df["pf"], errors="coerce") + df["pts"] = pd.to_numeric(df["pts"], errors="coerce") + + return df + + +def _get_game_pbp_helper(gamepackage, game_id, game_type): + """A helper function that cleans a game's PBP. + + Parameters: + - pbp: a JSON object containing the play-by-play + - game_id: a string representing the game's ESPN game ID + - game_type: a string representing whether men's or women's basketball is being scraped + + Returns + - the game PBP as a DataFrame + """ + pbp = gamepackage["pbp"] + home_team = pbp["tms"]["home"]["displayName"] + away_team = pbp["tms"]["away"]["displayName"] + + all_plays = [play for period in pbp["playGrps"] for play in period] + + # check if PBP exists + if len(all_plays) <= 0: + _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') + return pd.DataFrame([]) + + descs = [x["text"] if "text" in x.keys() else "" for x in all_plays] + teams = [ + "" + if not "homeAway" in x.keys() + else home_team + if x["homeAway"] == "home" + else away_team + for x in all_plays + ] + hscores = [ + int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays + ] + ascores = [ + int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays + ] + periods = [ + int(x["period"]["number"]) if "period" in x.keys() else np.nan + for x in all_plays + ] + + time_splits = [ + x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" + for x in all_plays + ] + minutes = [int(x[0]) for x in time_splits] + seconds = [int(x[1]) for x in time_splits] + min_to_sec = [x * 60 for x in minutes] + pd_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] + + if game_type == "mens": + reg_secs_left = [ + 1200 + x if half_num == 1 else x + for x, half_num in zip(pd_secs_left, periods) + ] + else: + reg_secs_left = [ + 1800 + x + if qt_num == 1 + else 1200 + x + if qt_num == 2 + else 600 + x + if qt_num == 3 + else x + for x, qt_num in zip(pd_secs_left, periods) + ] + + sc_play = [True if "scoringPlay" in x.keys() else False for x in all_plays] + is_assisted = [ + True if ("text" in x.keys() and "assisted" in x["text"].lower()) else False + for x in all_plays + ] + + # ASSIGN PLAY TYPES + p_types = [] + + for x in all_plays: + if not "text" in x.keys(): + p_types.append("") + continue + + play = x["text"] + + if not type(play) == str: + play = "" + + added = False + for pt in NON_SHOT_TYPES: + if pt in play: + p_types.append(pt.lower()) + added = True + break + if not added: + for st in SHOT_TYPES: + if st in play: + p_types.append(st.lower()) + added = True + break + + if not added: + p_types.append("") + + # FIND SHOOTERS + shooting_play = [ + True if x in (y.lower() for y in SHOT_TYPES) else False for x in p_types + ] + + scorers = [x[0].split(" made ")[0] if x[1] else "" for x in zip(descs, sc_play)] + + non_scorers = [ + x[0].split(" missed ")[0] + if x[1] in (y.lower() for y in SHOT_TYPES) and not x[2] + else "" + for x in zip(descs, p_types, sc_play) + ] + + shooters = [x[0] if not x[0] == "" else x[1] for x in zip(scorers, non_scorers)] + + assisted_pls = [ + x[0].split("Assisted by ")[-1].replace(".", "") if x[1] else "" + for x in zip(descs, is_assisted) + ] + + is_three = ["three point" in x.lower() for x in descs] + + if game_type == "mens": + pd_type = "half" + pd_type_sec = "secs_left_half" + else: + pd_type = "quarter" + pd_type_sec = "secs_left_qt" + + data = { + "game_id": game_id, + "home_team": home_team, + "away_team": away_team, + "play_desc": descs, + "home_score": hscores, + "away_score": ascores, + pd_type: periods, + pd_type_sec: pd_secs_left, + "secs_left_reg": reg_secs_left, + "play_team": teams, + "play_type": p_types, + "shooting_play": shooting_play, + "scoring_play": sc_play, + "is_three": is_three, + "shooter": shooters, + "is_assisted": is_assisted, + "assist_player": assisted_pls, + } + + df = pd.DataFrame(data) + + # add shot data if it exists + is_shotchart = "shtChrt" in gamepackage + + if is_shotchart: + chart = gamepackage["shtChrt"]["plays"] + + shotteams = [x["homeAway"] for x in chart] + shotdescs = [x["text"] for x in chart] + xs = [50 - int(x["coordinate"]["x"]) for x in chart] + ys = [int(x["coordinate"]["y"]) for x in chart] + + shot_data = {"team": shotteams, "play_desc": shotdescs, "x": xs, "y": ys} + + shot_df = pd.DataFrame(shot_data) + + # shot matching + shot_info = { + "shot_x": [], + "shot_y": [], + } + shot_count = 0 + + for play, isshot in zip(df.play_desc, df.shooting_play): + if shot_count >= len(shot_df): + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) + continue + + if not isshot: + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) + continue + + if "free throw" in play.lower(): + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) + shot_count += 1 + continue + + shot_play = shot_df.play_desc.iloc[shot_count] + + if play == shot_play: + shot_info["shot_x"].append(shot_df.x.iloc[shot_count]) + shot_info["shot_y"].append(shot_df.y.iloc[shot_count]) + shot_count += 1 + else: + shot_info["shot_x"].append(np.nan) + shot_info["shot_y"].append(np.nan) + + # make sure that length of shot data matches number of shots in PBP data + if (not (len(shot_info["shot_x"]) == len(df))) or ( + not (len(shot_info["shot_y"]) == len(df)) + ): + _log.warning( + f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data' + ) + df["shot_x"] = np.nan + df["shot_y"] = np.nan + return df + + df["shot_x"] = shot_info["shot_x"] + df["shot_y"] = shot_info["shot_y"] + + else: + df["shot_x"] = np.nan + df["shot_y"] = np.nan + return df + + return df + + +def _get_game_info_helper(info, more_info, game_id, game_type): + """A helper function that cleans a game's metadata. + + Parameters: + - info: a JSON object containing game metadata + - more_info: a JSON object containing game metadata + - game_id: a string representing the game's ESPN game ID + - game_type: a string representing whether men's or women's basketball is being scraped + + Returns + - the game metadata as a DataFrame + """ + attendance = int(info["attnd"]) if "attnd" in info.keys() else np.nan + capacity = int(info["cpcty"]) if "cpcty" in info.keys() else np.nan + network = info["cvrg"] if "cvrg" in info.keys() else "" + + gm_date = parse(info["dtTm"]) + game_date = gm_date.replace(tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) + game_day = game_date.strftime("%B %d, %Y") + game_time = game_date.strftime("%I:%M %p %Z") + + arena = info["loc"] if "loc" in info.keys() else "" + loc = ( + info["locAddr"]["city"] + ", " + info["locAddr"]["state"] + if "locAddr" in info.keys() + else "" + ) + + tot_refs = info["refs"] if "refs" in info.keys() else {} + ref_1 = tot_refs[0]["dspNm"] if len(tot_refs) > 0 else "" + ref_2 = tot_refs[1]["dspNm"] if len(tot_refs) > 1 else "" + ref_3 = tot_refs[2]["dspNm"] if len(tot_refs) > 2 else "" + + teams = more_info["tms"] + ht_info, at_info = teams[0], teams[1] + + home_team, away_team = ht_info["displayName"], at_info["displayName"] + + home_id = ht_info["id"] + away_id = at_info["id"] + + if len(ht_info["links"]) == 0: + ht = home_team.lower().replace(" ", "-") + home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) + elif len(ht_info["records"]) == 0: + ht = home_team.lower().replace(" ", "-") + home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) + + if len(at_info["links"]) == 0: + at = away_team.lower().replace(" ", "-") + away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) + elif len(at_info["records"]) == 0: + at = away_team.lower().replace(" ", "-") + away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) + + home_rank = ht_info["rank"] if "rank" in ht_info.keys() else np.nan + away_rank = at_info["rank"] if "rank" in at_info.keys() else np.nan + + home_record = ( + ht_info["records"][0]["displayValue"] if len(ht_info["records"]) > 0 else "" + ) + away_record = ( + at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" + ) + + home_score, away_score = int(ht_info["score"]), int(at_info["score"]) + + home_win = True if home_score > away_score else False + + is_postseason = True if more_info["seasonType"] == 3 else False + is_conference = more_info["isConferenceGame"] + + if len(ht_info["records"]) > 1 and ht_info["records"][1]["type"] == "home": + is_neutral = False + + elif len(at_info["records"]) > 1 and at_info["records"][1]["type"] == "away": + is_neutral = False + + else: + is_neutral = True + + tournament = more_info["nte"] if "nte" in more_info.keys() else "" + + if ("linescores" in ht_info) and ("linescores" in at_info): + h_ot, a_ot = len(ht_info["linescores"]) - 2, len(at_info["linescores"]) - 2 + assert h_ot == a_ot + num_ots = h_ot + else: + _log.warning(f'"{time.ctime()}": {game_id} - No score info available') + num_ots = -1 + + game_info_list = [ + game_id, + home_team, + home_id, + home_rank, + home_record, + home_score, + away_team, + away_id, + away_rank, + away_record, + away_score, + home_win, + num_ots, + is_conference, + is_neutral, + is_postseason, + tournament, + game_day, + game_time, + loc, + arena, + capacity, + attendance, + network, + ref_1, + ref_2, + ref_3, + ] + + game_info_cols = [ + "game_id", + "home_team", + "home_id", + "home_rank", + "home_record", + "home_score", + "away_team", + "away_id", + "away_rank", + "away_record", + "away_score", + "home_win", + "num_ots", + "is_conference", + "is_neutral", + "is_postseason", + "tournament", + "game_day", + "game_time", + "game_loc", + "arena", + "arena_capacity", + "attendance", + "tv_network", + "referee_1", + "referee_2", + "referee_3", + ] + + return pd.DataFrame([game_info_list], columns=game_info_cols) + + +def _get_gamepackage_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == "": + return None + + pattern = re.compile(JSON_REGEX) + found = re.search(pattern, script_string).group(1) + js = "{" + found + "}" + jsn = json.loads(js) + gamepackage = jsn["page"]["content"]["gamepackage"] + + return gamepackage + + +def _get_scoreboard_from_soup(soup): + script_string = _find_json_in_content(soup) + + if script_string == "": + return None + + pattern = re.compile(JSON_REGEX) + found = re.search(pattern, script_string).group(1) + js = "{" + found + "}" + jsn = json.loads(js) + scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] + + return scoreboard + + +def _find_json_in_content(soup): + script_string = "" + for x in soup.find_all("script"): + if WINDOW_STRING in x.text: + script_string = x.text + break + return script_string diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index df1a2cc..20c0a7a 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -4,107 +4,18 @@ Author: Daniel Cowan """ - -from bs4 import BeautifulSoup as bs -import requests as r +from datetime import datetime import pandas as pd -import numpy as np -from datetime import datetime, timezone -from dateutil.parser import parse -from pytz import timezone as tz -from tqdm import trange -from joblib import Parallel, delayed -import re -import time -import logging -import traceback -import json -import os from typing import Union - - -logging.basicConfig(filename="cbbpy.log") -_log = logging.getLogger(__name__) - - -ATTEMPTS = 15 -DATE_PARSES = [ - "%Y-%m-%d", - "%Y/%m/%d", - "%m-%d-%Y", - "%m/%d/%Y", -] -USER_AGENTS = [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " - + "(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36", -] -REFERERS = [ - "https://google.com/", - "https://youtube.com/", - "https://facebook.com/", - "https://twitter.com/", - "https://nytimes.com/", - "https://washingtonpost.com/", - "https://linkedin.com/", - "https://nhl.com/", - "https://mlb.com/", - "https://nfl.com/", -] -SCOREBOARD_URL = "https://www.espn.com/mens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" -GAME_URL = "https://www.espn.com/mens-college-basketball/game/_/gameId/{}" -BOXSCORE_URL = "https://www.espn.com/mens-college-basketball/boxscore/_/gameId/{}" -PBP_URL = "https://www.espn.com/mens-college-basketball/playbyplay/_/gameId/{}" -NON_SHOT_TYPES = [ - "TV Timeout", - "Jump Ball", - "Turnover", - "Timeout", - "Rebound", - "Block", - "Steal", - "Foul", - "End", -] -SHOT_TYPES = [ - "Three Point Jumper", - "Two Point Tip Shot", - "Free Throw", - "Jumper", - "Layup", - "Dunk", -] -WINDOW_STRING = "window['__espnfitt__']=" -JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" -STATUS_OK = 200 - - -class CouldNotParseError(Exception): - pass - - -class InvalidDateRangeError(Exception): - pass - - -# pnf_ will keep track of games w/ page not found errors -# if game has error, don't run the other scrape functions to save time -pnf_ = [] +from cbbpy_utils import ( + _get_game, + _get_games_range, + _get_games_season, + _get_game_ids, + _get_game_boxscore, + _get_game_pbp, + _get_game_info, +) def get_game( @@ -121,24 +32,7 @@ def get_game( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - game_info_df = boxscore_df = pbp_df = pd.DataFrame([]) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - Game Info: Page not found error') - elif info: - game_info_df = get_game_info(game_id) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') - elif box: - boxscore_df = get_game_boxscore(game_id) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - PBP: Page not found error') - elif pbp: - pbp_df = get_game_pbp(game_id) - - return (game_info_df, boxscore_df, pbp_df) + return _get_game(game_id, info, box, pbp) def get_games_range( @@ -163,273 +57,7 @@ def get_games_range( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - sd = _parse_date(start_date) - ed = _parse_date(end_date) - date_range = pd.date_range(sd, ed) - len_scrape = len(date_range) - all_data = [] - cpus = os.cpu_count() - 1 - - if len_scrape < 1: - raise InvalidDateRangeError("The start date must be sooner than the end date.") - - if sd > datetime.today(): - raise InvalidDateRangeError("The start date must not be in the future.") - - if ed > datetime.today(): - raise InvalidDateRangeError("The end date must not be in the future.") - - bar_format = ( - "{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec" - ) - - with trange(len_scrape, bar_format=bar_format) as t: - for i in t: - date = date_range[i] - game_ids = get_game_ids(date) - t.set_description( - f"Scraping {len(game_ids)} games on {date.strftime('%D')}", - refresh=False, - ) - - if len(game_ids) > 0: - result = Parallel(n_jobs=cpus)( - delayed(get_game)(gid) for gid in game_ids - ) - all_data.append(result) - - else: - t.set_description(f"No games on {date.strftime('%D')}", refresh=False) - - if not len(all_data) > 0: - return () - - game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( - drop=True - ) - game_boxscore_df = pd.concat( - [game[1] for day in all_data for game in day] - ).reset_index(drop=True) - game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( - drop=True - ) - - return (game_info_df, game_boxscore_df, game_pbp_df) - - -def get_game_boxscore(game_id: str) -> pd.DataFrame: - """A function that scrapes a game's boxscore. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - the game boxscore as a DataFrame - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = BOXSCORE_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - boxscore = gamepackage["bxscr"] - - df = _get_game_boxscore_helper(boxscore, game_id) - - except Exception as ex: - if soup is not None: - if "No Box Score Available" in soup.text: - _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) - - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error' - ) - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df - - -def get_game_pbp(game_id: str) -> pd.DataFrame: - """A function that scrapes a game's play-by-play information. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - the game's play-by-play information represented as a DataFrame - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = PBP_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - df = _get_game_pbp_helper(gamepackage, game_id) - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error(f'"{time.ctime()}": {game_id} - PBP: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df - - -def get_game_info(game_id: str) -> pd.DataFrame: - """A function that scrapes game metadata. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - a DataFrame with one row and a column for each piece of metadata - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = GAME_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - # get general game info - info = gamepackage["gmInfo"] - - # get team info - more_info = gamepackage["gmStrp"] - - df = _get_game_info_helper(info, more_info, game_id) - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error' - ) - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df + return _get_games_range(start_date, end_date, info, box, pbp) def get_games_season( @@ -449,16 +77,7 @@ def get_games_season( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = f"{season-1}-11-01" - season_end_date = f"{season}-05-01" - - # if season has not ended yet, set end scrape date to today - if datetime.strptime(season_end_date, "%Y-%m-%d") > datetime.today(): - season_end_date = datetime.today().strftime("%Y-%m-%d") - - info = get_games_range(season_start_date, season_end_date, info, box, pbp) - - return info + return _get_games_season(season, "mens", info, box, pbp) def get_game_ids(date: Union[str, datetime]) -> list: @@ -470,745 +89,40 @@ def get_game_ids(date: Union[str, datetime]) -> list: Returns - a list of ESPN all game IDs for games played on the date given """ - soup = None - - if type(date) == str: - date = _parse_date(date) - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - d = date.strftime("%Y%m%d") - url = SCOREBOARD_URL.format(d) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - scoreboard = _get_scoreboard_from_soup(soup) - ids = [x["id"] for x in scoreboard] - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error' - ) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error' - ) - elif scoreboard is None: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return ids - - -def _parse_date(date: str) -> datetime: - parsed = False - - for parse in DATE_PARSES: - try: - date = datetime.strptime(date, parse) - except: - continue - else: - parsed = True - break + return _get_game_ids(date, "mens") - if not parsed: - raise CouldNotParseError( - "The given date could not be parsed. Try any of these formats:\n" - + "Y-m-d\nY/m/d\nm-d-Y\nm/d/Y" - ) - return date - - -def _get_game_boxscore_helper(boxscore, game_id): - """A helper function that cleans a game's boxscore. +def get_game_boxscore(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's boxscore. Parameters: - - boxscore: a JSON object containing the boxscore - game_id: a string representing the game's ESPN game ID Returns - the game boxscore as a DataFrame """ - tm1, tm2 = boxscore[0], boxscore[1] - tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] - tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] - - labels = tm1_stats[0]["lbls"] - - tm1_starters, tm1_bench, tm1_totals = ( - tm1_stats[0]["athlts"], - tm1_stats[1]["athlts"], - tm1_stats[2]["ttls"], - ) - tm2_starters, tm2_bench, tm2_totals = ( - tm2_stats[0]["athlts"], - tm2_stats[1]["athlts"], - tm2_stats[2]["ttls"], - ) - - # starters' stats - if len(tm1_starters) > 0: - tm1_st_dict = { - labels[i].lower(): [ - tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) - ] - for i in range(len(labels)) - } - - tm1_st_pos = [ - tm1_starters[i]["athlt"]["pos"] - if "pos" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - tm1_st_id = [ - tm1_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - tm1_st_nm = [ - tm1_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - - tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, "starter", True) - tm1_st_df.insert(0, "position", tm1_st_pos) - tm1_st_df.insert(0, "player_id", tm1_st_id) - tm1_st_df.insert(0, "player", tm1_st_nm) - tm1_st_df.insert(0, "team", tm1_name) - tm1_st_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_st_df = pd.DataFrame(columns=cols) - - # bench players' stats - if len(tm1_bench) > 0: - tm1_bn_dict = { - labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] - for i in range(len(labels)) - } - - tm1_bn_pos = [ - tm1_bench[i]["athlt"]["pos"] - if "pos" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] - tm1_bn_id = [ - tm1_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] - tm1_bn_nm = [ - tm1_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] + return _get_game_boxscore(game_id, "mens") - tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, "starter", False) - tm1_bn_df.insert(0, "position", tm1_bn_pos) - tm1_bn_df.insert(0, "player_id", tm1_bn_id) - tm1_bn_df.insert(0, "player", tm1_bn_nm) - tm1_bn_df.insert(0, "team", tm1_name) - tm1_bn_df.insert(0, "game_id", game_id) - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_bn_df = pd.DataFrame(columns=cols) - - # team totals - if len(tm1_totals) > 0: - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} - - tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, "starter", False) - tm1_tot_df.insert(0, "position", "TOTAL") - tm1_tot_df.insert(0, "player_id", "TOTAL") - tm1_tot_df.insert(0, "player", "TEAM") - tm1_tot_df.insert(0, "team", tm1_name) - tm1_tot_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_tot_df = pd.DataFrame(columns=cols) - - tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) - - # starters' stats - if len(tm2_starters) > 0: - tm2_st_dict = { - labels[i].lower(): [ - tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) - ] - for i in range(len(labels)) - } - - tm2_st_pos = [ - tm2_starters[i]["athlt"]["pos"] - if "pos" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - tm2_st_id = [ - tm2_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - tm2_st_nm = [ - tm2_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - - tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, "starter", True) - tm2_st_df.insert(0, "position", tm2_st_pos) - tm2_st_df.insert(0, "player_id", tm2_st_id) - tm2_st_df.insert(0, "player", tm2_st_nm) - tm2_st_df.insert(0, "team", tm2_name) - tm2_st_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_st_df = pd.DataFrame(columns=cols) - - # bench players' stats - if len(tm2_bench) > 0: - tm2_bn_dict = { - labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] - for i in range(len(labels)) - } - - tm2_bn_pos = [ - tm2_bench[i]["athlt"]["pos"] - if "pos" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - tm2_bn_id = [ - tm2_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - tm2_bn_nm = [ - tm2_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - - tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, "starter", False) - tm2_bn_df.insert(0, "position", tm2_bn_pos) - tm2_bn_df.insert(0, "player_id", tm2_bn_id) - tm2_bn_df.insert(0, "player", tm2_bn_nm) - tm2_bn_df.insert(0, "team", tm2_name) - tm2_bn_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_bn_df = pd.DataFrame(columns=cols) - - # team totals - if len(tm2_totals) > 0: - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} - - tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, "starter", False) - tm2_tot_df.insert(0, "position", "TOTAL") - tm2_tot_df.insert(0, "player_id", "TOTAL") - tm2_tot_df.insert(0, "player", "TEAM") - tm2_tot_df.insert(0, "team", tm2_name) - tm2_tot_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_tot_df = pd.DataFrame(columns=cols) - - tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) - - df = pd.concat([tm1_df, tm2_df]) - - if len(df) <= 0: - _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) - - # SPLIT UP THE FG FIELDS - fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") - fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") - thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") - thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") - ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") - fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") - - # GET RID OF UNWANTED COLUMNS - df = df.drop(columns=["fg", "3pt", "ft"]) - - # INSERT COLUMNS WHERE NECESSARY - df.insert(7, "fgm", fgm) - df.insert(8, "fga", fga) - df.insert(9, "2pm", fgm - thpm) - df.insert(10, "2pa", fga - thpa) - df.insert(11, "3pm", thpm) - df.insert(12, "3pa", thpa) - df.insert(13, "ftm", ftm) - df.insert(14, "fta", fta) - - # column type handling - df["min"] = pd.to_numeric(df["min"], errors="coerce") - df["oreb"] = pd.to_numeric(df["oreb"], errors="coerce") - df["dreb"] = pd.to_numeric(df["dreb"], errors="coerce") - df["reb"] = pd.to_numeric(df["reb"], errors="coerce") - df["ast"] = pd.to_numeric(df["ast"], errors="coerce") - df["stl"] = pd.to_numeric(df["stl"], errors="coerce") - df["blk"] = pd.to_numeric(df["blk"], errors="coerce") - df["to"] = pd.to_numeric(df["to"], errors="coerce") - df["pf"] = pd.to_numeric(df["pf"], errors="coerce") - df["pts"] = pd.to_numeric(df["pts"], errors="coerce") - - return df - - -def _get_game_pbp_helper(gamepackage, game_id): - """A helper function that cleans a game's PBP. +def get_game_pbp(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's play-by-play information. Parameters: - - pbp: a JSON object containing the play-by-play - game_id: a string representing the game's ESPN game ID Returns - - the game PBP as a DataFrame + - the game's play-by-play information represented as a DataFrame """ - pbp = gamepackage["pbp"] - home_team = pbp["tms"]["home"]["displayName"] - away_team = pbp["tms"]["away"]["displayName"] - - all_plays = [play for half in pbp["playGrps"] for play in half] - - # check if PBP exists - if len(all_plays) <= 0: - _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') - return pd.DataFrame([]) - - descs = [x["text"] if "text" in x.keys() else "" for x in all_plays] - teams = [ - "" - if not "homeAway" in x.keys() - else home_team - if x["homeAway"] == "home" - else away_team - for x in all_plays - ] - hscores = [ - int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays - ] - ascores = [ - int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays - ] - halves = [ - int(x["period"]["number"]) if "period" in x.keys() else np.nan - for x in all_plays - ] - - time_splits = [ - x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" - for x in all_plays - ] - minutes = [int(x[0]) for x in time_splits] - seconds = [int(x[1]) for x in time_splits] - min_to_sec = [x * 60 for x in minutes] - hf_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] - reg_secs_left = [ - 1200 + x if half_num == 1 else x for x, half_num in zip(hf_secs_left, halves) - ] - - sc_play = [True if "scoringPlay" in x.keys() else False for x in all_plays] - is_assisted = [ - True if ("text" in x.keys() and "assisted" in x["text"].lower()) else False - for x in all_plays - ] - - # ASSIGN PLAY TYPES - p_types = [] - - for x in all_plays: - if not "text" in x.keys(): - p_types.append("") - continue - - play = x["text"] - - if not type(play) == str: - play = "" - - added = False - for pt in NON_SHOT_TYPES: - if pt in play: - p_types.append(pt.lower()) - added = True - break - if not added: - for st in SHOT_TYPES: - if st in play: - p_types.append(st.lower()) - added = True - break - - if not added: - p_types.append("") - - # FIND SHOOTERS - shooting_play = [ - True if x in (y.lower() for y in SHOT_TYPES) else False for x in p_types - ] - - scorers = [x[0].split(" made ")[0] if x[1] else "" for x in zip(descs, sc_play)] - - non_scorers = [ - x[0].split(" missed ")[0] - if x[1] in (y.lower() for y in SHOT_TYPES) and not x[2] - else "" - for x in zip(descs, p_types, sc_play) - ] - - shooters = [x[0] if not x[0] == "" else x[1] for x in zip(scorers, non_scorers)] - - assisted_pls = [ - x[0].split("Assisted by ")[-1].replace(".", "") if x[1] else "" - for x in zip(descs, is_assisted) - ] - - is_three = ["three point" in x.lower() for x in descs] - - data = { - "game_id": game_id, - "home_team": home_team, - "away_team": away_team, - "play_desc": descs, - "home_score": hscores, - "away_score": ascores, - "half": halves, - "secs_left_half": hf_secs_left, - "secs_left_reg": reg_secs_left, - "play_team": teams, - "play_type": p_types, - "shooting_play": shooting_play, - "scoring_play": sc_play, - "is_three": is_three, - "shooter": shooters, - "is_assisted": is_assisted, - "assist_player": assisted_pls, - } - - df = pd.DataFrame(data) - - # add shot data if it exists - is_shotchart = "shtChrt" in gamepackage - - if is_shotchart: - chart = gamepackage["shtChrt"]["plays"] + return _get_game_pbp(game_id, "mens") - shotteams = [x["homeAway"] for x in chart] - shotdescs = [x["text"] for x in chart] - xs = [50 - int(x["coordinate"]["x"]) for x in chart] - ys = [int(x["coordinate"]["y"]) for x in chart] - shot_data = {"team": shotteams, "play_desc": shotdescs, "x": xs, "y": ys} - - shot_df = pd.DataFrame(shot_data) - - # shot matching - shot_info = { - "shot_x": [], - "shot_y": [], - } - shot_count = 0 - - for play, isshot in zip(df.play_desc, df.shooting_play): - if shot_count >= len(shot_df): - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - continue - - if not isshot: - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - continue - - if "free throw" in play.lower(): - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - shot_count += 1 - continue - - shot_play = shot_df.play_desc.iloc[shot_count] - - if play == shot_play: - shot_info["shot_x"].append(shot_df.x.iloc[shot_count]) - shot_info["shot_y"].append(shot_df.y.iloc[shot_count]) - shot_count += 1 - else: - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - - # make sure that length of shot data matches number of shots in PBP data - if (not (len(shot_info["shot_x"]) == len(df))) or ( - not (len(shot_info["shot_y"]) == len(df)) - ): - _log.warning( - f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data' - ) - df["shot_x"] = np.nan - df["shot_y"] = np.nan - return df - - df["shot_x"] = shot_info["shot_x"] - df["shot_y"] = shot_info["shot_y"] - - else: - df["shot_x"] = np.nan - df["shot_y"] = np.nan - return df - - return df - - -def _get_game_info_helper(info, more_info, game_id): - """A helper function that cleans a game's metadata. +def get_game_info(game_id: str) -> pd.DataFrame: + """A function that scrapes game metadata. Parameters: - - info: a JSON object containing game metadata - - more_info: a JSON object containing game metadata - game_id: a string representing the game's ESPN game ID Returns - - the game metadata as a DataFrame + - a DataFrame with one row and a column for each piece of metadata """ - attendance = int(info["attnd"]) if "attnd" in info.keys() else np.nan - capacity = int(info["cpcty"]) if "cpcty" in info.keys() else np.nan - network = info["cvrg"] if "cvrg" in info.keys() else "" - - gm_date = parse(info["dtTm"]) - game_date = gm_date.replace(tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) - game_day = game_date.strftime("%B %d, %Y") - game_time = game_date.strftime("%I:%M %p %Z") - - arena = info["loc"] if "loc" in info.keys() else "" - loc = ( - info["locAddr"]["city"] + ", " + info["locAddr"]["state"] - if "locAddr" in info.keys() - else "" - ) - - tot_refs = info["refs"] if "refs" in info.keys() else {} - ref_1 = tot_refs[0]["dspNm"] if len(tot_refs) > 0 else "" - ref_2 = tot_refs[1]["dspNm"] if len(tot_refs) > 1 else "" - ref_3 = tot_refs[2]["dspNm"] if len(tot_refs) > 2 else "" - - teams = more_info["tms"] - ht_info, at_info = teams[0], teams[1] - - home_team, away_team = ht_info["displayName"], at_info["displayName"] - - home_id = ht_info["id"] - away_id = at_info["id"] - - if len(ht_info["links"]) == 0: - ht = home_team.lower().replace(" ", "-") - home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - elif len(ht_info["records"]) == 0: - ht = home_team.lower().replace(" ", "-") - home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - - if len(at_info["links"]) == 0: - at = away_team.lower().replace(" ", "-") - away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - elif len(at_info["records"]) == 0: - at = away_team.lower().replace(" ", "-") - away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - - home_rank = ht_info["rank"] if "rank" in ht_info.keys() else np.nan - away_rank = at_info["rank"] if "rank" in at_info.keys() else np.nan - - home_record = ( - ht_info["records"][0]["displayValue"] if len(ht_info["records"]) > 0 else "" - ) - away_record = ( - at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" - ) - - home_score, away_score = int(ht_info["score"]), int(at_info["score"]) - - home_win = True if home_score > away_score else False - - is_postseason = True if more_info["seasonType"] == 3 else False - is_conference = more_info["isConferenceGame"] - - if len(ht_info["records"]) > 1 and ht_info["records"][1]["type"] == "home": - is_neutral = False - - elif len(at_info["records"]) > 1 and at_info["records"][1]["type"] == "away": - is_neutral = False - - else: - is_neutral = True - - tournament = more_info["nte"] if "nte" in more_info.keys() else "" - - if ("linescores" in ht_info) and ("linescores" in at_info): - h_ot, a_ot = len(ht_info["linescores"]) - 2, len(at_info["linescores"]) - 2 - assert h_ot == a_ot - num_ots = h_ot - else: - _log.warning(f'"{time.ctime()}": {game_id} - No score info available') - num_ots = -1 - - game_info_list = [ - game_id, - home_team, - home_id, - home_rank, - home_record, - home_score, - away_team, - away_id, - away_rank, - away_record, - away_score, - home_win, - num_ots, - is_conference, - is_neutral, - is_postseason, - tournament, - game_day, - game_time, - loc, - arena, - capacity, - attendance, - network, - ref_1, - ref_2, - ref_3, - ] - - game_info_cols = [ - "game_id", - "home_team", - "home_id", - "home_rank", - "home_record", - "home_score", - "away_team", - "away_id", - "away_rank", - "away_record", - "away_score", - "home_win", - "num_ots", - "is_conference", - "is_neutral", - "is_postseason", - "tournament", - "game_day", - "game_time", - "game_loc", - "arena", - "arena_capacity", - "attendance", - "tv_network", - "referee_1", - "referee_2", - "referee_3", - ] - - return pd.DataFrame([game_info_list], columns=game_info_cols) - - -def _get_gamepackage_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": - return None - - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - gamepackage = jsn["page"]["content"]["gamepackage"] - - return gamepackage - - -def _get_scoreboard_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": - return None - - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] - - return scoreboard - - -def _find_json_in_content(soup): - script_string = "" - for x in soup.find_all("script"): - if WINDOW_STRING in x.text: - script_string = x.text - break - return script_string + return _get_game_info(game_id, "mens") diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index 1654cc3..c2a2ea4 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -21,87 +21,13 @@ import json import os from typing import Union +from cbbpy_utils import * logging.basicConfig(filename="cbbpy.log") _log = logging.getLogger(__name__) -ATTEMPTS = 15 -DATE_PARSES = [ - "%Y-%m-%d", - "%Y/%m/%d", - "%m-%d-%Y", - "%m/%d/%Y", -] -USER_AGENTS = [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " - + "(KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " - + "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36", -] -REFERERS = [ - "https://google.com/", - "https://youtube.com/", - "https://facebook.com/", - "https://twitter.com/", - "https://nytimes.com/", - "https://washingtonpost.com/", - "https://linkedin.com/", - "https://nhl.com/", - "https://mlb.com/", - "https://nfl.com/", -] -SCOREBOARD_URL = "https://www.espn.com/womens-college-basketball/scoreboard/_/date/{}/seasontype/2/group/50" -GAME_URL = "https://www.espn.com/womens-college-basketball/game/_/gameId/{}" -BOXSCORE_URL = "https://www.espn.com/womens-college-basketball/boxscore/_/gameId/{}" -PBP_URL = "https://www.espn.com/womens-college-basketball/playbyplay/_/gameId/{}" -NON_SHOT_TYPES = [ - "TV Timeout", - "Jump Ball", - "Turnover", - "Timeout", - "Rebound", - "Block", - "Steal", - "Foul", - "End", -] -SHOT_TYPES = [ - "Three Point Jumper", - "Two Point Tip Shot", - "Free Throw", - "Jumper", - "Layup", - "Dunk", -] -WINDOW_STRING = "window['__espnfitt__']=" -JSON_REGEX = r"window\[\'__espnfitt__\'\]={(.*)};" -STATUS_OK = 200 - - -class CouldNotParseError(Exception): - pass - - -class InvalidDateRangeError(Exception): - pass - - # pnf_ will keep track of games w/ page not found errors # if game has error, don't run the other scrape functions to save time pnf_ = [] @@ -234,7 +160,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: "User-Agent": np.random.choice(USER_AGENTS), "Referer": np.random.choice(REFERERS), } - url = BOXSCORE_URL.format(game_id) + url = WOMENS_BOXSCORE_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) @@ -309,7 +235,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: "User-Agent": np.random.choice(USER_AGENTS), "Referer": np.random.choice(REFERERS), } - url = PBP_URL.format(game_id) + url = WOMENS_PBP_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) @@ -375,7 +301,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: "User-Agent": np.random.choice(USER_AGENTS), "Referer": np.random.choice(REFERERS), } - url = GAME_URL.format(game_id) + url = WOMENS_GAME_URL.format(game_id) page = r.get(url, headers=header) soup = bs(page.content, "lxml") gamepackage = _get_gamepackage_from_soup(soup) @@ -482,7 +408,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: "Referer": np.random.choice(REFERERS), } d = date.strftime("%Y%m%d") - url = SCOREBOARD_URL.format(d) + url = WOMENS_SCOREBOARD_URL.format(d) page = r.get(url, headers=header) soup = bs(page.content, "lxml") scoreboard = _get_scoreboard_from_soup(soup) From 02c6a61d559ee7cd52289c1cdd56fa55bd577d2d Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 27 Dec 2023 00:59:22 -0600 Subject: [PATCH 53/53] game info, game ids, bug fixes --- src/cbbpy/cbbpy_utils.py | 16 +- src/cbbpy/mens_scraper.py | 4 +- src/cbbpy/womens_scraper.py | 1069 +---------------------------------- 3 files changed, 37 insertions(+), 1052 deletions(-) diff --git a/src/cbbpy/cbbpy_utils.py b/src/cbbpy/cbbpy_utils.py index c4b0013..1649452 100644 --- a/src/cbbpy/cbbpy_utils.py +++ b/src/cbbpy/cbbpy_utils.py @@ -176,7 +176,7 @@ def _get_games_range(start_date, end_date, game_type, info, box, pbp): with trange(len_scrape, bar_format=bar_format) as t: for i in t: date = date_range[i] - game_ids = _get_game_ids(date) + game_ids = _get_game_ids(date, game_type) t.set_description( f"Scraping {len(game_ids)} games on {date.strftime('%D')}", refresh=False, @@ -184,7 +184,7 @@ def _get_games_range(start_date, end_date, game_type, info, box, pbp): if len(game_ids) > 0: result = Parallel(n_jobs=cpus)( - delayed(_get_game)(gid, game_type, info=info, box=box, pbp=pbp) + delayed(_get_game)(gid, game_type, info, box, pbp) for gid in game_ids ) all_data.append(result) @@ -249,9 +249,9 @@ def _get_game_ids(date, game_type): soup = None if game_type == "mens": - pre_url = MENS_BOXSCORE_URL + pre_url = MENS_SCOREBOARD_URL else: - pre_url = WOMENS_BOXSCORE_URL + pre_url = WOMENS_SCOREBOARD_URL if type(date) == str: date = _parse_date(date) @@ -496,7 +496,7 @@ def _get_game_info(game_id, game_type): # get team info more_info = gamepackage["gmStrp"] - df = _get_game_info_helper(info, more_info, game_id) + df = _get_game_info_helper(info, more_info, game_id, game_type) except Exception as ex: if i + 1 == ATTEMPTS: @@ -1139,7 +1139,11 @@ def _get_game_info_helper(info, more_info, game_id, game_type): tournament = more_info["nte"] if "nte" in more_info.keys() else "" if ("linescores" in ht_info) and ("linescores" in at_info): - h_ot, a_ot = len(ht_info["linescores"]) - 2, len(at_info["linescores"]) - 2 + if game_type == "mens": + h_ot, a_ot = len(ht_info["linescores"]) - 2, len(at_info["linescores"]) - 2 + else: + h_ot, a_ot = len(ht_info["linescores"]) - 4, len(at_info["linescores"]) - 4 + assert h_ot == a_ot num_ots = h_ot else: diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 20c0a7a..15dc842 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -32,7 +32,7 @@ def get_game( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - return _get_game(game_id, info, box, pbp) + return _get_game(game_id, "mens", info, box, pbp) def get_games_range( @@ -57,7 +57,7 @@ def get_games_range( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - return _get_games_range(start_date, end_date, info, box, pbp) + return _get_games_range(start_date, end_date, "mens", info, box, pbp) def get_games_season( diff --git a/src/cbbpy/womens_scraper.py b/src/cbbpy/womens_scraper.py index c2a2ea4..fceb250 100644 --- a/src/cbbpy/womens_scraper.py +++ b/src/cbbpy/womens_scraper.py @@ -4,33 +4,18 @@ Author: Daniel Cowan """ - -from bs4 import BeautifulSoup as bs -import requests as r +from datetime import datetime import pandas as pd -import numpy as np -from datetime import datetime, timezone -from dateutil.parser import parse -from pytz import timezone as tz -from tqdm import trange -from joblib import Parallel, delayed -import re -import time -import logging -import traceback -import json -import os from typing import Union -from cbbpy_utils import * - - -logging.basicConfig(filename="cbbpy.log") -_log = logging.getLogger(__name__) - - -# pnf_ will keep track of games w/ page not found errors -# if game has error, don't run the other scrape functions to save time -pnf_ = [] +from cbbpy_utils import ( + _get_game, + _get_games_range, + _get_games_season, + _get_game_ids, + _get_game_boxscore, + _get_game_pbp, + _get_game_info, +) def get_game( @@ -47,24 +32,7 @@ def get_game( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - game_info_df = boxscore_df = pbp_df = pd.DataFrame([]) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - Game Info: Page not found error') - elif info: - game_info_df = get_game_info(game_id) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - Boxscore: Page not found error') - elif box: - boxscore_df = get_game_boxscore(game_id) - - if game_id in pnf_: - _log.error(f'"{time.ctime()}": {game_id} - PBP: Page not found error') - elif pbp: - pbp_df = get_game_pbp(game_id) - - return (game_info_df, boxscore_df, pbp_df) + return _get_game(game_id, "womens", info, box, pbp) def get_games_range( @@ -89,273 +57,7 @@ def get_games_range( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - sd = _parse_date(start_date) - ed = _parse_date(end_date) - date_range = pd.date_range(sd, ed) - len_scrape = len(date_range) - all_data = [] - cpus = os.cpu_count() - 1 - - if len_scrape < 1: - raise InvalidDateRangeError("The start date must be sooner than the end date.") - - if sd > datetime.today(): - raise InvalidDateRangeError("The start date must not be in the future.") - - if ed > datetime.today(): - raise InvalidDateRangeError("The end date must not be in the future.") - - bar_format = ( - "{l_bar}{bar}| {n_fmt} of {total_fmt} days scraped in {elapsed_s:.1f} sec" - ) - - with trange(len_scrape, bar_format=bar_format) as t: - for i in t: - date = date_range[i] - game_ids = get_game_ids(date) - t.set_description( - f"Scraping {len(game_ids)} games on {date.strftime('%D')}", - refresh=False, - ) - - if len(game_ids) > 0: - result = Parallel(n_jobs=cpus)( - delayed(get_game)(gid) for gid in game_ids - ) - all_data.append(result) - - else: - t.set_description(f"No games on {date.strftime('%D')}", refresh=False) - - if not len(all_data) > 0: - return () - - game_info_df = pd.concat([game[0] for day in all_data for game in day]).reset_index( - drop=True - ) - game_boxscore_df = pd.concat( - [game[1] for day in all_data for game in day] - ).reset_index(drop=True) - game_pbp_df = pd.concat([game[2] for day in all_data for game in day]).reset_index( - drop=True - ) - - return (game_info_df, game_boxscore_df, game_pbp_df) - - -def get_game_boxscore(game_id: str) -> pd.DataFrame: - """A function that scrapes a game's boxscore. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - the game boxscore as a DataFrame - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = WOMENS_BOXSCORE_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - boxscore = gamepackage["bxscr"] - - df = _get_game_boxscore_helper(boxscore, game_id) - - except Exception as ex: - if soup is not None: - if "No Box Score Available" in soup.text: - _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) - - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Page error' - ) - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Boxscore: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df - - -def get_game_pbp(game_id: str) -> pd.DataFrame: - """A function that scrapes a game's play-by-play information. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - the game's play-by-play information represented as a DataFrame - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = WOMENS_PBP_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - df = _get_game_pbp_helper(gamepackage, game_id) - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error(f'"{time.ctime()}": {game_id} - PBP: Page error') - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - PBP: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df - - -def get_game_info(game_id: str) -> pd.DataFrame: - """A function that scrapes game metadata. - - Parameters: - - game_id: a string representing the game's ESPN game ID - - Returns - - a DataFrame with one row and a column for each piece of metadata - """ - soup = None - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - url = WOMENS_GAME_URL.format(game_id) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - gamepackage = _get_gamepackage_from_soup(soup) - - # check if game was postponed - gm_status = gamepackage["gmStrp"]["status"]["desc"] - gsbool = gm_status == "Final" # or (gm_status == 'In Progress') - if not gsbool: - _log.warning(f'"{time.ctime()}": {game_id} - {gm_status}') - return pd.DataFrame([]) - - # get general game info - info = gamepackage["gmInfo"] - - # get team info - more_info = gamepackage["gmStrp"] - - df = _get_game_info_helper(info, more_info, game_id) - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page not found error' - ) - pnf_.append(game_id) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Page error' - ) - elif gamepackage is None: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: Game JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {game_id} - Game Info: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return df + return _get_games_range(start_date, end_date, "womens", info, box, pbp) def get_games_season( @@ -375,16 +77,7 @@ def get_games_season( -- boxscore_df: a DataFrame of the game's boxscore (both teams combined) -- pbp_df: a DataFrame of the game's play-by-play """ - season_start_date = f"{season-1}-11-01" - season_end_date = f"{season}-05-01" - - # if season has not ended yet, set end scrape date to today - if datetime.strptime(season_end_date, "%Y-%m-%d") > datetime.today(): - season_end_date = datetime.today().strftime("%Y-%m-%d") - - info = get_games_range(season_start_date, season_end_date, info, box, pbp) - - return info + return _get_games_season(season, "womens", info, box, pbp) def get_game_ids(date: Union[str, datetime]) -> list: @@ -396,752 +89,40 @@ def get_game_ids(date: Union[str, datetime]) -> list: Returns - a list of ESPN all game IDs for games played on the date given """ - soup = None - - if type(date) == str: - date = _parse_date(date) - - for i in range(ATTEMPTS): - try: - header = { - "User-Agent": np.random.choice(USER_AGENTS), - "Referer": np.random.choice(REFERERS), - } - d = date.strftime("%Y%m%d") - url = WOMENS_SCOREBOARD_URL.format(d) - page = r.get(url, headers=header) - soup = bs(page.content, "lxml") - scoreboard = _get_scoreboard_from_soup(soup) - ids = [x["id"] for x in scoreboard] - - except Exception as ex: - if i + 1 == ATTEMPTS: - # max number of attempts reached, so return blank df - if soup is not None: - if "Page not found." in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page not found error' - ) - elif "Page error" in soup.text: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: Page error' - ) - elif scoreboard is None: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: JSON not found on page.' - ) - else: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: {ex}\n{traceback.format_exc()}' - ) - else: - _log.error( - f'"{time.ctime()}": {date.strftime("%D")} - IDs: GET error\n{ex}\n{traceback.format_exc()}' - ) - return pd.DataFrame([]) - else: - # try again - time.sleep(2) - continue - else: - # no exception thrown - break - - return ids + return _get_game_ids(date, "womens") -def _parse_date(date: str) -> datetime: - parsed = False - - for parse in DATE_PARSES: - try: - date = datetime.strptime(date, parse) - except: - continue - else: - parsed = True - break - - if not parsed: - raise CouldNotParseError( - "The given date could not be parsed. Try any of these formats:\n" - + "Y-m-d\nY/m/d\nm-d-Y\nm/d/Y" - ) - - return date - - -def _get_game_boxscore_helper(boxscore, game_id): - """A helper function that cleans a game's boxscore. +def get_game_boxscore(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's boxscore. Parameters: - - boxscore: a JSON object containing the boxscore - game_id: a string representing the game's ESPN game ID Returns - the game boxscore as a DataFrame """ - tm1, tm2 = boxscore[0], boxscore[1] - tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] - tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] - - labels = tm1_stats[0]["lbls"] - - tm1_starters, tm1_bench, tm1_totals = ( - tm1_stats[0]["athlts"], - tm1_stats[1]["athlts"], - tm1_stats[2]["ttls"], - ) - tm2_starters, tm2_bench, tm2_totals = ( - tm2_stats[0]["athlts"], - tm2_stats[1]["athlts"], - tm2_stats[2]["ttls"], - ) - - # starters' stats - if len(tm1_starters) > 0: - tm1_st_dict = { - labels[i].lower(): [ - tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) - ] - for i in range(len(labels)) - } - - tm1_st_pos = [ - tm1_starters[i]["athlt"]["pos"] - if "pos" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - tm1_st_id = [ - tm1_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - tm1_st_nm = [ - tm1_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_starters[i]["athlt"].keys() - else "" - for i in range(len(tm1_starters)) - ] - - tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, "starter", True) - tm1_st_df.insert(0, "position", tm1_st_pos) - tm1_st_df.insert(0, "player_id", tm1_st_id) - tm1_st_df.insert(0, "player", tm1_st_nm) - tm1_st_df.insert(0, "team", tm1_name) - tm1_st_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_st_df = pd.DataFrame(columns=cols) - - # bench players' stats - if len(tm1_bench) > 0: - tm1_bn_dict = { - labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] - for i in range(len(labels)) - } - - tm1_bn_pos = [ - tm1_bench[i]["athlt"]["pos"] - if "pos" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] - tm1_bn_id = [ - tm1_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] - tm1_bn_nm = [ - tm1_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_bench[i]["athlt"].keys() - else "" - for i in range(len(tm1_bench)) - ] - - tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, "starter", False) - tm1_bn_df.insert(0, "position", tm1_bn_pos) - tm1_bn_df.insert(0, "player_id", tm1_bn_id) - tm1_bn_df.insert(0, "player", tm1_bn_nm) - tm1_bn_df.insert(0, "team", tm1_name) - tm1_bn_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_bn_df = pd.DataFrame(columns=cols) - - # team totals - if len(tm1_totals) > 0: - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} + return _get_game_boxscore(game_id, "womens") - tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, "starter", False) - tm1_tot_df.insert(0, "position", "TOTAL") - tm1_tot_df.insert(0, "player_id", "TOTAL") - tm1_tot_df.insert(0, "player", "TEAM") - tm1_tot_df.insert(0, "team", tm1_name) - tm1_tot_df.insert(0, "game_id", game_id) - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_tot_df = pd.DataFrame(columns=cols) - - tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) - - # starters' stats - if len(tm2_starters) > 0: - tm2_st_dict = { - labels[i].lower(): [ - tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) - ] - for i in range(len(labels)) - } - - tm2_st_pos = [ - tm2_starters[i]["athlt"]["pos"] - if "pos" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - tm2_st_id = [ - tm2_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - tm2_st_nm = [ - tm2_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_starters[i]["athlt"].keys() - else "" - for i in range(len(tm2_starters)) - ] - - tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, "starter", True) - tm2_st_df.insert(0, "position", tm2_st_pos) - tm2_st_df.insert(0, "player_id", tm2_st_id) - tm2_st_df.insert(0, "player", tm2_st_nm) - tm2_st_df.insert(0, "team", tm2_name) - tm2_st_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_st_df = pd.DataFrame(columns=cols) - - # bench players' stats - if len(tm2_bench) > 0: - tm2_bn_dict = { - labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] - for i in range(len(labels)) - } - - tm2_bn_pos = [ - tm2_bench[i]["athlt"]["pos"] - if "pos" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - tm2_bn_id = [ - tm2_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - tm2_bn_nm = [ - tm2_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_bench[i]["athlt"].keys() - else "" - for i in range(len(tm2_bench)) - ] - - tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, "starter", False) - tm2_bn_df.insert(0, "position", tm2_bn_pos) - tm2_bn_df.insert(0, "player_id", tm2_bn_id) - tm2_bn_df.insert(0, "player", tm2_bn_nm) - tm2_bn_df.insert(0, "team", tm2_name) - tm2_bn_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_bn_df = pd.DataFrame(columns=cols) - - # team totals - if len(tm2_totals) > 0: - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} - - tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, "starter", False) - tm2_tot_df.insert(0, "position", "TOTAL") - tm2_tot_df.insert(0, "player_id", "TOTAL") - tm2_tot_df.insert(0, "player", "TEAM") - tm2_tot_df.insert(0, "team", tm2_name) - tm2_tot_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_tot_df = pd.DataFrame(columns=cols) - - tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) - - df = pd.concat([tm1_df, tm2_df]) - - if len(df) <= 0: - _log.warning(f'"{time.ctime()}": {game_id} - No boxscore available') - return pd.DataFrame([]) - - # SPLIT UP THE FG FIELDS - fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") - fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") - thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") - thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") - ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") - fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") - - # GET RID OF UNWANTED COLUMNS - df = df.drop(columns=["fg", "3pt", "ft"]) - - # INSERT COLUMNS WHERE NECESSARY - df.insert(7, "fgm", fgm) - df.insert(8, "fga", fga) - df.insert(9, "2pm", fgm - thpm) - df.insert(10, "2pa", fga - thpa) - df.insert(11, "3pm", thpm) - df.insert(12, "3pa", thpa) - df.insert(13, "ftm", ftm) - df.insert(14, "fta", fta) - - # column type handling - df["min"] = pd.to_numeric(df["min"], errors="coerce") - df["oreb"] = pd.to_numeric(df["oreb"], errors="coerce") - df["dreb"] = pd.to_numeric(df["dreb"], errors="coerce") - df["reb"] = pd.to_numeric(df["reb"], errors="coerce") - df["ast"] = pd.to_numeric(df["ast"], errors="coerce") - df["stl"] = pd.to_numeric(df["stl"], errors="coerce") - df["blk"] = pd.to_numeric(df["blk"], errors="coerce") - df["to"] = pd.to_numeric(df["to"], errors="coerce") - df["pf"] = pd.to_numeric(df["pf"], errors="coerce") - df["pts"] = pd.to_numeric(df["pts"], errors="coerce") - - return df - - -def _get_game_pbp_helper(gamepackage, game_id): - """A helper function that cleans a game's PBP. +def get_game_pbp(game_id: str) -> pd.DataFrame: + """A function that scrapes a game's play-by-play information. Parameters: - - pbp: a JSON object containing the play-by-play - game_id: a string representing the game's ESPN game ID Returns - - the game PBP as a DataFrame + - the game's play-by-play information represented as a DataFrame """ - pbp = gamepackage["pbp"] - home_team = pbp["tms"]["home"]["displayName"] - away_team = pbp["tms"]["away"]["displayName"] - - all_plays = [play for quart in pbp["playGrps"] for play in quart] - - # check if PBP exists - if len(all_plays) <= 0: - _log.warning(f'"{time.ctime()}": {game_id} - No PBP available') - return pd.DataFrame([]) - - descs = [x["text"] if "text" in x.keys() else "" for x in all_plays] - teams = [ - "" - if not "homeAway" in x.keys() - else home_team - if x["homeAway"] == "home" - else away_team - for x in all_plays - ] - hscores = [ - int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays - ] - ascores = [ - int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays - ] - quarters = [ - int(x["period"]["number"]) if "period" in x.keys() else np.nan - for x in all_plays - ] - - time_splits = [ - x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" - for x in all_plays - ] - minutes = [int(x[0]) for x in time_splits] - seconds = [int(x[1]) for x in time_splits] - min_to_sec = [x * 60 for x in minutes] - qt_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] - reg_secs_left = [ - 1800 + x - if qt_num == 1 - else 1200 + x - if qt_num == 2 - else 600 + x - if qt_num == 3 - else x - for x, qt_num in zip(qt_secs_left, quarters) - ] - - sc_play = [True if "scoringPlay" in x.keys() else False for x in all_plays] - is_assisted = [ - True if ("text" in x.keys() and "assisted" in x["text"].lower()) else False - for x in all_plays - ] - - # ASSIGN PLAY TYPES - p_types = [] - - for x in all_plays: - if not "text" in x.keys(): - p_types.append("") - continue - - play = x["text"] - - if not type(play) == str: - play = "" - - added = False - for pt in NON_SHOT_TYPES: - if pt in play: - p_types.append(pt.lower()) - added = True - break - if not added: - for st in SHOT_TYPES: - if st in play: - p_types.append(st.lower()) - added = True - break - - if not added: - p_types.append("") - - # FIND SHOOTERS - shooting_play = [ - True if x in (y.lower() for y in SHOT_TYPES) else False for x in p_types - ] - - scorers = [x[0].split(" made ")[0] if x[1] else "" for x in zip(descs, sc_play)] - - non_scorers = [ - x[0].split(" missed ")[0] - if x[1] in (y.lower() for y in SHOT_TYPES) and not x[2] - else "" - for x in zip(descs, p_types, sc_play) - ] - - shooters = [x[0] if not x[0] == "" else x[1] for x in zip(scorers, non_scorers)] - - assisted_pls = [ - x[0].split("Assisted by ")[-1].replace(".", "") if x[1] else "" - for x in zip(descs, is_assisted) - ] - - is_three = ["three point" in x.lower() for x in descs] - - data = { - "game_id": game_id, - "home_team": home_team, - "away_team": away_team, - "play_desc": descs, - "home_score": hscores, - "away_score": ascores, - "quarter": quarters, - "secs_left_qt": qt_secs_left, - "secs_left_reg": reg_secs_left, - "play_team": teams, - "play_type": p_types, - "shooting_play": shooting_play, - "scoring_play": sc_play, - "is_three": is_three, - "shooter": shooters, - "is_assisted": is_assisted, - "assist_player": assisted_pls, - } - - df = pd.DataFrame(data) - - # add shot data if it exists - is_shotchart = "shtChrt" in gamepackage - - if is_shotchart: - chart = gamepackage["shtChrt"]["plays"] - - shotteams = [x["homeAway"] for x in chart] - shotdescs = [x["text"] for x in chart] - xs = [50 - int(x["coordinate"]["x"]) for x in chart] - ys = [int(x["coordinate"]["y"]) for x in chart] - - shot_data = {"team": shotteams, "play_desc": shotdescs, "x": xs, "y": ys} + return _get_game_pbp(game_id, "womens") - shot_df = pd.DataFrame(shot_data) - # shot matching - shot_info = { - "shot_x": [], - "shot_y": [], - } - shot_count = 0 - - for play, isshot in zip(df.play_desc, df.shooting_play): - if shot_count >= len(shot_df): - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - continue - - if not isshot: - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - continue - - if "free throw" in play.lower(): - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - shot_count += 1 - continue - - shot_play = shot_df.play_desc.iloc[shot_count] - - if play == shot_play: - shot_info["shot_x"].append(shot_df.x.iloc[shot_count]) - shot_info["shot_y"].append(shot_df.y.iloc[shot_count]) - shot_count += 1 - else: - shot_info["shot_x"].append(np.nan) - shot_info["shot_y"].append(np.nan) - - # make sure that length of shot data matches number of shots in PBP data - if (not (len(shot_info["shot_x"]) == len(df))) or ( - not (len(shot_info["shot_y"]) == len(df)) - ): - _log.warning( - f'"{time.ctime()}": {game_id} - Shot data length does not match PBP data' - ) - df["shot_x"] = np.nan - df["shot_y"] = np.nan - return df - - df["shot_x"] = shot_info["shot_x"] - df["shot_y"] = shot_info["shot_y"] - - else: - df["shot_x"] = np.nan - df["shot_y"] = np.nan - return df - - return df - - -def _get_game_info_helper(info, more_info, game_id): - """A helper function that cleans a game's metadata. +def get_game_info(game_id: str) -> pd.DataFrame: + """A function that scrapes game metadata. Parameters: - - info: a JSON object containing game metadata - - more_info: a JSON object containing game metadata - game_id: a string representing the game's ESPN game ID Returns - - the game metadata as a DataFrame + - a DataFrame with one row and a column for each piece of metadata """ - attendance = int(info["attnd"]) if "attnd" in info.keys() else np.nan - capacity = int(info["cpcty"]) if "cpcty" in info.keys() else np.nan - network = info["cvrg"] if "cvrg" in info.keys() else "" - - gm_date = parse(info["dtTm"]) - game_date = gm_date.replace(tzinfo=timezone.utc).astimezone(tz=tz("US/Pacific")) - game_day = game_date.strftime("%B %d, %Y") - game_time = game_date.strftime("%I:%M %p %Z") - - arena = info["loc"] if "loc" in info.keys() else "" - loc = ( - info["locAddr"]["city"] + ", " + info["locAddr"]["state"] - if "locAddr" in info.keys() - else "" - ) - - tot_refs = info["refs"] if "refs" in info.keys() else {} - ref_1 = tot_refs[0]["dspNm"] if len(tot_refs) > 0 else "" - ref_2 = tot_refs[1]["dspNm"] if len(tot_refs) > 1 else "" - ref_3 = tot_refs[2]["dspNm"] if len(tot_refs) > 2 else "" - - teams = more_info["tms"] - ht_info, at_info = teams[0], teams[1] - - home_team, away_team = ht_info["displayName"], at_info["displayName"] - - home_id = ht_info["id"] - away_id = at_info["id"] - - if len(ht_info["links"]) == 0: - ht = home_team.lower().replace(" ", "-") - home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - elif len(ht_info["records"]) == 0: - ht = home_team.lower().replace(" ", "-") - home_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", ht) - - if len(at_info["links"]) == 0: - at = away_team.lower().replace(" ", "-") - away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - elif len(at_info["records"]) == 0: - at = away_team.lower().replace(" ", "-") - away_id = "nd-" + re.sub("[^0-9a-zA-Z\-]", "", at) - - home_rank = ht_info["rank"] if "rank" in ht_info.keys() else np.nan - away_rank = at_info["rank"] if "rank" in at_info.keys() else np.nan - - home_record = ( - ht_info["records"][0]["displayValue"] if len(ht_info["records"]) > 0 else "" - ) - away_record = ( - at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" - ) - - home_score, away_score = int(ht_info["score"]), int(at_info["score"]) - - home_win = True if home_score > away_score else False - - is_postseason = True if more_info["seasonType"] == 3 else False - is_conference = more_info["isConferenceGame"] - - if len(ht_info["records"]) > 1 and ht_info["records"][1]["type"] == "home": - is_neutral = False - - elif len(at_info["records"]) > 1 and at_info["records"][1]["type"] == "away": - is_neutral = False - - else: - is_neutral = True - - tournament = more_info["nte"] if "nte" in more_info.keys() else "" - - if ("linescores" in ht_info) and ("linescores" in at_info): - h_ot, a_ot = len(ht_info["linescores"]) - 4, len(at_info["linescores"]) - 4 - assert h_ot == a_ot - num_ots = h_ot - else: - _log.warning(f'"{time.ctime()}": {game_id} - No score info available') - num_ots = -1 - - game_info_list = [ - game_id, - home_team, - home_id, - home_rank, - home_record, - home_score, - away_team, - away_id, - away_rank, - away_record, - away_score, - home_win, - num_ots, - is_conference, - is_neutral, - is_postseason, - tournament, - game_day, - game_time, - loc, - arena, - capacity, - attendance, - network, - ref_1, - ref_2, - ref_3, - ] - - game_info_cols = [ - "game_id", - "home_team", - "home_id", - "home_rank", - "home_record", - "home_score", - "away_team", - "away_id", - "away_rank", - "away_record", - "away_score", - "home_win", - "num_ots", - "is_conference", - "is_neutral", - "is_postseason", - "tournament", - "game_day", - "game_time", - "game_loc", - "arena", - "arena_capacity", - "attendance", - "tv_network", - "referee_1", - "referee_2", - "referee_3", - ] - - return pd.DataFrame([game_info_list], columns=game_info_cols) - - -def _get_gamepackage_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": - return None - - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - gamepackage = jsn["page"]["content"]["gamepackage"] - - return gamepackage - - -def _get_scoreboard_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": - return None - - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] - - return scoreboard - - -def _find_json_in_content(soup): - script_string = "" - for x in soup.find_all("script"): - if WINDOW_STRING in x.text: - script_string = x.text - break - return script_string + return _get_game_info(game_id, "womens")