Skip to content

Commit

Permalink
new: plots and minor stuff
Browse files Browse the repository at this point in the history
- Plot by zip code
- Plot by state
- Catch more errors and handle them better
- Do some data sanitizing to get better data
  • Loading branch information
nickumia-reisys committed Oct 28, 2022
1 parent d32021b commit 06eaf7c
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 39 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,10 @@ dmypy.json
ips
*.html
*.pkl
*.cpg
*.dbf
*.prj
*.shp
*.shp.ea.iso.xml
*.shp.iso.xml
*.shx
7 changes: 4 additions & 3 deletions data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(self):
self.ip_objs = {}
self.country_dict = {}
self.postal_code = {}
self.states = {}

def save_data(self):
with open(save_file, 'wb') as f:
Expand All @@ -21,14 +22,14 @@ def save_data(self):
def load_data(self):
try:
with open(save_file, 'rb') as f:
self.bad_ips, self.ip_objs, self.country_dict, self.postal_code = pickle.load(f)
self.bad_ips, self.ip_objs, self.country_dict, self.postal_code, self.states = pickle.load(f)
except (ValueError, FileNotFoundError):
return [], {}, {}, {}

g = Data()

if __name__ == "__main__":
# Inspect the order of variables on save/load
# g.load_data()
g.load_data()
print(list(vars(g).keys()))
print(list(vars(g).values()))
# print(list(vars(g).values()))
97 changes: 65 additions & 32 deletions location.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,84 @@
import pycountry

from data import g
from states import abbrev_to_us_state

from shapely.errors import ShapelyDeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)


def tally_countries(ip):
try:
country = pycountry.countries.get(alpha_2=g.ip_objs[ip]['data']['asn_country_code']).name
pc = g.ip_objs[ip]['data']['nets'][0]['postal_code']

if country in g.country_dict.keys():
g.country_dict[country] += 1
else:
g.country_dict[country] = 1

if country == 'United States':
if pc in g.postal_code.keys():
g.postal_code[pc] += 1
if ip not in g.bad_ips:
try:
country = pycountry.countries.get(alpha_2=g.ip_objs[ip]['data']['asn_country_code']).name
pc = g.ip_objs[ip]['data']['nets'][0]['postal_code']
state = abbrev_to_us_state[g.ip_objs[ip]['data']['nets'][0]['state']]

if country in g.country_dict.keys():
g.country_dict[country] += 1
else:
g.postal_code[pc] = 1
except KeyError:
pass
g.country_dict[country] = 1

if country == 'United States':
# Tally Zip Codes
if pc in g.postal_code.keys():
g.postal_code[pc] += 1
else:
g.postal_code[pc] = 1
# Tally States
if state in g.states.keys():
g.states[state] += 1
else:
g.states[state] = 1
except KeyError:
pass


def plot_postal_codes():
def plot_location(map_type='state'):
pandas_bokeh.output_notebook()
us = geopandas.read_file("./zip_map/cb_2018_us_zcta510_500k.shp")

# Set all other Zip Codes to 0
import uszipcode
all_codes = [i.zipcode for i in uszipcode.SearchEngine().by_pattern('', returns=1000000)]
for i, code in enumerate(all_codes):
if code not in g.postal_code:
g.postal_code[code] = 0
if map_type == 'state':
us = geopandas.read_file("./state_map/cb_2018_us_state_5m.shp")

# Ignore extremely high frequency states
# g.states['Washington'] = 0
# g.states['California'] = 0
# g.states['North Carolina'] = 0

# Relative to the highest frequency state
sf = {}
highest = max(g.states.values())
for state in g.states:
sf[state] = g.states[state]/highest

# Plot State Frequencies
df=pd.DataFrame({'PCODE': list(sf.keys()), 'A':list(sf.values()) })
# Join State Plot to US Plot
new_df=us.join(df.set_index('PCODE'), on='NAME')

# Plot Zip Code Frequencies
df=pd.DataFrame({'PCODE': list(g.postal_code.keys()), 'A':list(g.postal_code.values()) })
new_df.plot_bokeh(simplify_shapes=20000,
category="A",
colormap="Viridis",
# marker="inverted_triangle",
hovertool_columns=["NAME","A"])

# Join Zip Code Plot to US Plot
new_df=us.join(df.set_index('PCODE'), on='ZCTA5CE10')
elif map_type == 'zip':
us = geopandas.read_file("./zip_map/cb_2018_us_zcta510_500k.shp")
# Set all other Zip Codes to 0
import uszipcode
all_codes = [i.zipcode for i in uszipcode.SearchEngine().by_pattern('', returns=1000000)]
for i, code in enumerate(all_codes):
if code not in g.postal_code:
g.postal_code[code] = 0

# Plot Zip Code Frequencies
df=pd.DataFrame({'PCODE': list(g.postal_code.keys()), 'A':list(g.postal_code.values()) })
# Join Zip Code Plot to US Plot
new_df=us.join(df.set_index('PCODE'), on='ZCTA5CE10')

new_df.plot_bokeh(simplify_shapes=20000,
category="A",
colormap="Spectral",
# marker="inverted_triangle",
hovertool_columns=["ZCTA5CE10","A"])
new_df.plot_bokeh(simplify_shapes=20000,
category="A",
colormap="Inferno",
# marker="inverted_triangle",
hovertool_columns=["ZCTA5CE10","A"])
14 changes: 10 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from ip import lookup_ips
from data import g, file_to_analyze
from location import tally_countries, plot_postal_codes
from location import tally_countries, plot_location



Expand Down Expand Up @@ -36,10 +36,16 @@ def iterate_input(input_file, work_todo, starting_no=0):
g.save_data()
print(g.country_dict)
print(g.postal_code)
print(g.states)
if sys.argv[1] == 'plot':
plot_postal_codes()
print(g.postal_code)
if sys.argv[1] == 'print':
if len(sys.argv) == 3:
if sys.argv[2] == 'zip':
plot_location(map_type='zip')
print(g.postal_code)
else:
plot_location(map_type='state')
print(g.states)
else:
print(len(g.bad_ips), g.bad_ips)
print(g.ip_objs.keys())
except BaseException as e:
Expand Down
Empty file added state_map/.gitignore
Empty file.
67 changes: 67 additions & 0 deletions states.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Credits to https://gist.github.com/rogerallen/1583593

# Full name -> two-letter postal abbreviation.
us_state_to_abbrev = {
    # The 50 states
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # Federal district
    "District of Columbia": "DC",
    # Territories and outlying areas
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Reverse lookup: two-letter abbreviation -> full name, in the same order
# as the forward table above.
abbrev_to_us_state = {
    abbrev: full_name for full_name, abbrev in us_state_to_abbrev.items()
}

if __name__ == "__main__":
    # Manual smoke check when the module is run directly: look up the
    # full name for the "NY" abbreviation and print it.
    ny_full_name = abbrev_to_us_state['NY']
    print(ny_full_name)

0 comments on commit 06eaf7c

Please sign in to comment.