-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCB10.py
211 lines (159 loc) · 5.72 KB
/
CB10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python
# coding: utf-8
# In[11]:
import urllib.request
import calendar
import pandas as pd
from tabula import wrapper
import re
from sqlalchemy import create_engine, Column, String, Integer, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# In[12]:
#init connection to sqlite
engine = create_engine("sqlite:///events.db")
#create session to cache commands for sqlite engine instance
Session = sessionmaker(bind = engine)
session = Session()
#provide table definition
Base = declarative_base()
class Event(Base):
__tablename__ = 'event'
id = Column('id',Integer, primary_key = True)
district = Column(Integer)
title = Column(String(100))
date = Column(String(50))
details = Column(String(1000))
time = Column(String(50))
def __init__(self, title, date, details,time,district):
self.title = title
self.date = date
self.details = details
self.time = time
self.district = district
#for print
def __repr__(self):
return f'{self.title} - {self.date}: {self.time}\n {self.details}'
#call to metadata to generate schema
Base.metadata.create_all(engine)
# In[13]:
def dataPrep(url):
#download file
fileName = 'cb10.pdf'
pdf = urllib.request.urlretrieve(url, fileName)
tables = wrapper.read_pdf(fileName, lattice = True, multiple_tables = True)
#select largest table
largest = pd.DataFrame()
for i in tables:
if i.size >= largest.size:
largest = i
#get rid of the top columns names
table = largest.copy()
dayOfWeekRow = None
for index, row in table.iterrows():
dayChars = [d[:2] for d in calendar.day_name[:]]
rowString = [str(i).strip('\r').title()[:2] for i in list(row) if not pd.isna(i)]
if len(set(dayChars) & set(rowString)) >= 7 :
dayOfWeekRow = index
table.columns = table.iloc[dayOfWeekRow]
table = table[dayOfWeekRow + 1:]
#drop nan column
table = table.loc[:,table.columns.notnull()]
#drop nan with blank string
table = table.fillna('')
events = []
for column in table:
dayOfWeek = column
values = table[column].values
event = {}
#if day in first row has value init weeknumber to 0 otherwise init to 1
if table[:1].get(column).values != '':
weekNumber = 0
else:
weekNumber = 1
#go though each cell of the column and check for an integer,
#if there is an integer start day otherwise just append the text
for cell in values:
date = re.search(r"^(\d*)",cell).group(0)
if(date):
weekNumber += 1
event[column] = {
'text' : cell,
'weekNumber' : weekNumber
}
if event[column].items not in events:
events.append(event)
event = {}
return events
# In[14]:
def getDayOfMonth(year,month,weekNumber,dayOfWeek):
if month > 12:
print("Month is invalid")
return 0
#Print message when week number is out of bound
weeksInMonth = calendar.monthcalendar(year,month)
if weekNumber > len(weeksInMonth):
print("Week Number out of bound")
return 0
daysInWeek = weeksInMonth[weekNumber-1]
dayChars = [d[:2] for d in calendar.day_name[:]]
dayCharsOfWeek = dayOfWeek[:2].title()
if dayCharsOfWeek not in dayChars:
print("Day of week invalid")
return 0
indexInWeek = dayChars.index(dayCharsOfWeek)
return daysInWeek[indexInWeek]
# In[15]:
def eventSearch(text, weekNumber, dayOfWeek,month):
events_dict = {}
day = getDayOfMonth(2021,month,weekNumber,dayOfWeek)
search = re.search(r"((?:1[0-2]:\d\d.?)[AaPp]?[Mm]?|(?:\d:\d\d.?)[AaPp]?[Mm]?)([\s\S]+)", text)
try:
time = search.group(1)
details = search.group(2)
title = ""
except:
time = ""
details = text
title = ""
if "http" in details:
linkSplit = details.split('http')
title = linkSplit[0].replace('Via ZOOM', '')
details = 'http' + linkSplit[1]
details=details.replace(" ","")
events_dict['title'] = title.strip()
events_dict['date'] = f'{month}/{day}'
events_dict['time'] = time
events_dict['details'] = details
return events_dict
# In[16]:
def CB10(month,events):
#add events found from eventSearch to events_arr
events_arr = []
for event in events:
for k,v in event.items():
text = v['text'].replace('\r', ' ')
if len(text.strip())>2:
#two events in one cell
manyEventsSearch = re.search(r"([^_]*)_{3,}([^_]*)", text)
if manyEventsSearch != None:
for e in manyEventsSearch.groups():
events_arr.append(eventSearch(e, v['weekNumber'], k,month))
else:
events_arr.append(eventSearch(text, v['weekNumber'], k,month))
district = 110
#remove previous entries
session.query(Event).filter(Event.district == district).delete()
session.commit()
#add items to database
for event in events_arr:
row = Event(title=event['title'], date=event['date'],
details=event['details'], time=event['time'], district= district)
session.add(row)
session.commit()
# In[17]:
CB10(2,dataPrep('https://www1.nyc.gov/html/mancb10/downloads/pdf/february_2021_calendar.pdf'))
# In[18]:
#print all users
for event in session.query(Event).filter(Event.district == 110):
print(event,"\n-------------------------------")