-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeEntries.py
223 lines (182 loc) · 7.94 KB
/
scrapeEntries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import json
import urllib.request
from bs4 import BeautifulSoup
import csv
import xlsxwriter
import pandas
import time
def clean(string):
    """Return *string* with every tab and newline character removed.

    Other whitespace (spaces, carriage returns) is left untouched.
    """
    return string.replace('\t', '').replace('\n', '')
def getRoundNumber(roundHTML):
    """Return the round name/number shown for one round.

    Looks up the first ``<span>`` matched by class ``tenth semibold``
    inside *roundHTML* and returns its text with tabs and newlines removed
    (via ``clean``).
    """
    labelSpan = roundHTML.find('span', attrs={'class': 'tenth semibold'})
    return clean(labelSpan.text)
def getSide(roundHTML):
    """Return the side (aff/neg) text for one round.

    Scans every ``<span>`` matched by class ``tenth`` and returns the
    cleaned text of the first one that does not also carry the
    ``semibold`` class. Returns None when no such span exists.
    """
    for span in roundHTML.find_all('span', attrs={'class': 'tenth'}):
        if 'semibold' not in span['class']:
            return clean(span.text)
    return None
def getOpponent(roundHTML):
    """Return the opponent's team code for one round, or None.

    The code is read from the first ``<a>`` matched by class
    ``white padtop padbottom``; tabs/newlines are removed and every
    occurrence of ``'vs '`` is dropped. None is returned when the link
    is absent.
    """
    opponentLink = roundHTML.find('a', attrs={'class': 'white padtop padbottom'})
    if not opponentLink:
        return None
    return clean(opponentLink.text).replace('vs ', '')
def getJudgeDecisions(roundHTML):
    """Return one ``(judge, decision, speakers)`` tuple per judge ballot.

    Each ``<div>`` matched by class ``padless full marno borderbottom`` is
    treated as one judge's ballot, so the same code handles rounds with a
    single judge and rounds with several. The list is empty when the
    round contains no such div.
    """
    ballots = roundHTML.find_all(
        'div', attrs={'class': 'padless full marno borderbottom'})
    return [
        (getJudge(ballot), getDecisions(ballot), getSpeakers(ballot))
        for ballot in ballots
    ]
def getJudge(judgeDecisionHTML):
    """Return the judge name found in one ballot, or None.

    Among the ``<a>`` tags matched by class ``white padtop padbottom``,
    the first whose text contains no tab character is taken as the judge
    link; its text is returned with leading/trailing spaces stripped.
    Returns None when every candidate link contains a tab (or none exist).
    """
    links = judgeDecisionHTML.find_all(
        'a', attrs={'class': 'white padtop padbottom'})
    judgeLink = next((link for link in links if '\t' not in link.text), None)
    if judgeLink is None:
        return None
    return judgeLink.text.strip(" ")
def getDecisions(judgeDecisionHTML):
    """Return the decision text (W or L) recorded on one judge's ballot.

    Reads the first ``<span>`` matched by class
    ``tenth centeralign semibold`` and returns its text with tabs and
    newlines removed.
    """
    resultSpan = judgeDecisionHTML.find(
        'span', attrs={'class': 'tenth centeralign semibold'})
    return clean(resultSpan.text)
def getSpeakers(judgeDecisionHTML):
    """Return debater names and speaker points from one judge's ballot.

    Returns:
        None when the ballot has no ``full nospace smallish`` divs (the
        original comment notes this happens for out rounds); otherwise a
        list of ``(name, points)`` tuples, one per debater div. ``points``
        is the string "N/A" whenever the points span is missing.

    With exactly one debater div, the name is not read from the ballot:
    it is taken from the ``<h4 class="nospace semibold">`` heading of the
    page held in the module-level ``soup``, so that global must refer to
    the page currently being processed.
    """
    debaterDivs = judgeDecisionHTML.find_all(
        'div', attrs={'class': 'full nospace smallish'})
    if not debaterDivs:
        return None

    if len(debaterDivs) == 1:
        # Single-debater entry: name comes from the page heading (global soup).
        nameHeading = soup.find('h4', attrs={'class': 'nospace semibold'})
        name = clean(nameHeading.text)
        pointsSpan = judgeDecisionHTML.find('span', attrs={'class': 'fifth marno'})
        # Fall back to "N/A" like the multi-debater branch instead of
        # raising AttributeError when no points were recorded.
        points = clean(pointsSpan.text) if pointsSpan else "N/A"
        return [(name, points)]

    speakersToReturn = []
    for debaterDiv in debaterDivs:
        nameSpan = debaterDiv.find(
            'span', attrs={'class': 'threefifths nowrap marvertno'})
        pointsSpan = debaterDiv.find('span', attrs={'class': 'fifth marno'})
        name = clean(nameSpan.text)
        points = clean(pointsSpan.text) if pointsSpan else "N/A"
        speakersToReturn.append((name, points))
    return speakersToReturn
def getResults(roundHTML):
    """Collect every scraped field for one round.

    Returns a 4-tuple ``(round number, side, opponent, judge decisions)``
    built by calling the individual extractor functions, in that order.
    """
    roundNumber = getRoundNumber(roundHTML)
    side = getSide(roundHTML)
    opponent = getOpponent(roundHTML)
    judgeDecisions = getJudgeDecisions(roundHTML)
    return roundNumber, side, opponent, judgeDecisions
def addURL(team):
    """Append the absolute Tabroom URL for *team* to the global ``urls`` list.

    *team* must support ``team['href']``; the value is appended to
    ``https://www.tabroom.com``. URLs already present in ``urls`` are
    not added again.
    """
    fullURL = 'https://www.tabroom.com' + team['href']
    if fullURL in urls:
        return
    urls.append(fullURL)
# Tournament to scrape -- needs results page for any random team and tournament name
# NOTE(review): `tournaments` is not read anywhere in the visible script.
tournaments = [('https://www.tabroom.com/index/tourn/postings/entry_record.mhtml?tourn_id=10550&entry_id=1953803','UMW')]

# Initial URL: the ranked results list whose entry links are followed
start_url = 'http://www.tabroom.com/index/tourn/results/ranked_list.mhtml?event_id=109288&tourn_id=12828'

# urllib variables
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

# url list (filled by addURL)
urls = []

# Column headers, written once as the first worksheet row
headers = ['Round','Side', 'Opponent', 'Judge','Result','DebaterA','SpeaksA','DebaterB','SpeaksB','Team Code']

# XlsxWriter rejects worksheet names longer than 31 characters or
# containing any of these characters.
INVALID_SHEET_CHARS = '[]:*?/\\'
MAX_SHEET_NAME_LENGTH = 31


def _fetchHTML(url):
    """Download *url* with the scraper's User-Agent and return the raw body."""
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    # Context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


# Load results page
soup = BeautifulSoup(_fetchHTML(start_url), 'html.parser')

# Get all entries on the results page and get their results URLs
for team in soup.find_all('a', attrs={'class': 'white'}):
    addURL(team)

# Get the tournament name and derive a worksheet name XlsxWriter accepts
tournamentName = clean(soup.find('h2').text)
sheetName = tournamentName.translate(str.maketrans('', '', INVALID_SHEET_CHARS))
sheetName = sheetName[:MAX_SHEET_NAME_LENGTH].strip("'")

with xlsxwriter.Workbook('speaks.xlsx', {'strings_to_numbers': True}) as workbook:
    worksheet = workbook.add_worksheet(sheetName)
    worksheet.write_row(0, 0, headers)
    rowCount = 1

    for url in urls:
        print(url)

        # `soup` must stay a module-level name: getSpeakers reads it to find
        # the debater name for single-debater entries on the current page.
        soup = BeautifulSoup(_fetchHTML(url), 'html.parser')

        # Grab team code
        teamCode = clean(soup.find('h2').text)

        # Dummy variables to store debater names if they are not available later
        debaterA = ""
        debaterB = ""

        # Grab each div which contains a single round of data and iterate
        # through them in reverse document order
        allRoundHTML = soup.find_all('div', {"class": "row"})
        for i, roundHTML in enumerate(reversed(allRoundHTML)):
            roundNumber, side, opponent, judgesDecisions = getResults(roundHTML)

            # When there is no judge it is a BYE
            if not judgesDecisions:
                entry = (roundNumber, "BYE", opponent or "N/A",
                         "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", teamCode)
                worksheet.write_row(rowCount, 0, entry)
                rowCount += 1
                continue

            # Store debater names in case not present later. The speakers
            # value may be None, so check it before taking len().
            firstSpeakers = judgesDecisions[0][2]
            if i == 0 and firstSpeakers and len(firstSpeakers) == 2:
                debaterA = firstSpeakers[0][0]
                debaterB = firstSpeakers[1][0]

            # One worksheet row per judge
            for judge, decision, speakers in judgesDecisions:
                if not speakers:
                    speakers = [(debaterA, "N/A"), (debaterB, "N/A")]
                # Pad/truncate to exactly two speakers so `entry` is always
                # built fresh and fits the fixed DebaterA/DebaterB columns.
                (nameA, speaksA), (nameB, speaksB) = (
                    list(speakers) + [("N/A", "N/A")] * 2)[:2]
                entry = (roundNumber, side, opponent, judge, decision,
                         nameA, speaksA, nameB, speaksB, teamCode)
                worksheet.write_row(rowCount, 0, entry)
                rowCount += 1
                print(entry)