-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwikipediaAnalysis.py
85 lines (64 loc) · 2.46 KB
/
wikipediaAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from io import StringIO

import bokeh
import nltk
import numpy as np
import pandas as pd
import requests
import wikipedia
from bokeh.plotting import figure
from bokeh.plotting import figure, show, output_file
from bokeh.sampledata.us_states import data as statesData
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
# Download the rendered Wikipedia article and locate its first table.
page = wikipedia.page("List_of_school_shootings_in_the_United_States")
html = page.html().encode("UTF-8")
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        "No <table> element found in the Wikipedia page; cannot build the dataset")
print(str(table))

# Remove the "sortkey" spans from the table before it is parsed, so their
# text is not picked up as part of the cell values.
for span in table.select("span.sortkey"):
    span.decompose()

# Passing literal HTML to read_html is deprecated in recent pandas releases;
# a StringIO wrapper is accepted by old and new versions alike.
df = pd.read_html(StringIO(str(table)), header=0)[0]

# Tokenize every description. Missing descriptions are replaced by an empty
# string first so word_tokenize is never handed a NaN.
df['tokenized'] = df['Description'].fillna('').apply(nltk.word_tokenize)
print(df.head(n=3))
def return_coordinate(local):
    """Geocode a place description into a latitude/longitude pair.

    The lookup goes through the module-level ``geolocator``.

    Args:
        local: Free-form location text, e.g. ``"New York, New York"``.

    Returns:
        A ``pd.Series`` with the entries ``latitude`` and ``longitude``.
        Both are NaN when ``local`` is blank/None or when the geocoder
        returns no match, so one unresolved row does not abort a whole
        ``DataFrame.apply`` run.
    """
    not_found = pd.Series({'latitude': float('nan'), "longitude": float('nan')})

    # Nothing to look up: skip the geocoder call entirely.
    if local is None or not str(local).strip():
        return not_found

    location = geolocator.geocode(local)
    # geocode() yields None when the query has no result.
    if location is None:
        return not_found

    return pd.Series({'latitude': location.latitude, "longitude": location.longitude})
# Nominatim must be given an application-specific user agent; geopy 2.x
# refuses to construct the geocoder with the library default.
# NOTE(review): the public Nominatim service is rate limited and the apply
# below issues one request per row - consider throttling for large tables.
geolocator = Nominatim(user_agent="wikipediaAnalysis")

# Geocode every row's Location and store the result in two new columns.
df[['latitude', 'longitude']] = df.apply(
    lambda row: return_coordinate(str(row['Location'])), axis=1)
print(df.head(n=10))
def split_and_strip(row):
    """Return the second comma-separated component of ``row["Location"]``.

    The location text is split on commas and each piece is stripped of
    surrounding whitespace; for a ``"City, State"`` value this yields the
    state part.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            ``"Location"`` entry.

    Returns:
        The stripped second component, or ``None`` when the location is not
        a string (such as a NaN cell) or contains no comma.
    """
    location = row["Location"]
    # Missing cells arrive as float NaN, which has no .split().
    if not isinstance(location, str):
        return None

    parts = [part.strip() for part in location.split(',')]
    # Without a comma there is no second component to return.
    return parts[1] if len(parts) > 1 else None
# Derive the state name from each row's Location text.
df['state'] = df.apply(split_and_strip, axis=1)
print(df.head(n=10))

# Total casualties per incident. The original expression stringified the
# whole Deaths Series instead of adding numbers. Here the first run of digits
# in each cell is taken, so values carrying extra text still count, and
# cells without any digits count as 0.
casualty_parts = [
    pd.to_numeric(
        df[column].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    ).fillna(0)
    for column in ('Deaths', 'Injuries')
]
df['deaths_and_injuries'] = (casualty_parts[0] + casualty_parts[1]).astype(int)
print(df.describe())

# now it is the time to do the mapping
state_df = df.groupby('state')['deaths_and_injuries'].sum()
print(state_df.head(n=5))
# OrRd5 palette in reversed order, fed to a linear color mapper.
colors = bokeh.palettes.OrRd5[::-1]
color_mapper = bokeh.models.mappers.LinearColorMapper(palette=colors)
state_dict = state_df.to_dict()

# State-level information: outline coordinates, name and aggregated count
# for every entry of the bokeh sample data (0 when a state has no total).
new_state_xs = [shape['lons'] for shape in statesData.values()]
new_state_ys = [shape['lats'] for shape in statesData.values()]
state_name = [shape['name'] for shape in statesData.values()]
state_count = [state_dict.get(shape['name'], 0) for shape in statesData.values()]

incident_data_source = bokeh.models.sources.ColumnDataSource(df)
state_data_source = bokeh.models.sources.ColumnDataSource(
    data={'x': new_state_xs, 'y': new_state_ys, 'color': state_count})

plot = figure(title="School shooting", plot_width=800, plot_height=500)

# State polygons, filled according to the per-state count.
state_fill = {'field': 'color', 'transform': color_mapper}
plot.patches(
    'x', 'y',
    source=state_data_source,
    color=state_fill,
    line_color="white",
    line_width=0.5,
)

# One circle per incident, sized by its deaths_and_injuries value.
circle = bokeh.models.markers.Circle(
    x='longitude', y='latitude', size='deaths_and_injuries')
plot.add_glyph(incident_data_source, circle)

show(plot)