new: plots and minor stuff

- Plot by zip code
- Plot by state
- Improve error capturing
- Sanitize the data to get better results
nickumia-reisys · Oct 28, 2022 · 06eaf7c · 06eaf7c
1 parent d32021b
commit 06eaf7c
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,10 @@ dmypy.json
 ips
 *.html
 *.pkl
+*.cpg
+*.dbf
+*.prj
+*.shp
+*.shp.ea.iso.xml
+*.shp.iso.xml
+*.shx
diff --git a/data.py b/data.py
@@ -13,6 +13,7 @@ def __init__(self):
         self.ip_objs = {}
         self.country_dict = {}
         self.postal_code = {}
+        self.states = {}
 
     def save_data(self):
         with open(save_file, 'wb') as f:
@@ -21,14 +22,14 @@ def save_data(self):
     def load_data(self):
         try:
             with open(save_file, 'rb') as f:
-                self.bad_ips, self.ip_objs, self.country_dict, self.postal_code = pickle.load(f)
+                self.bad_ips, self.ip_objs, self.country_dict, self.postal_code, self.states = pickle.load(f)
         except (ValueError, FileNotFoundError):
             return [], {}, {}, {}
 
 g = Data()
 
 if __name__ == "__main__":
     # Inspect the order of variables on save/load
-    # g.load_data()
+    g.load_data()
     print(list(vars(g).keys()))
-    print(list(vars(g).values()))
+    # print(list(vars(g).values()))
diff --git a/location.py b/location.py
@@ -5,51 +5,84 @@
 import pycountry
 
 from data import g
+from states import abbrev_to_us_state
 
 from shapely.errors import ShapelyDeprecationWarning
 import warnings
 warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)
 
 
 def tally_countries(ip):
-    try:
-        country = pycountry.countries.get(alpha_2=g.ip_objs[ip]['data']['asn_country_code']).name
-        pc = g.ip_objs[ip]['data']['nets'][0]['postal_code']
-
-        if country in g.country_dict.keys():
-            g.country_dict[country] += 1
-        else:
-            g.country_dict[country] = 1
-
-        if country == 'United States':
-            if pc in g.postal_code.keys():
-                g.postal_code[pc] += 1
+    if ip not in g.bad_ips:
+        try:
+            country = pycountry.countries.get(alpha_2=g.ip_objs[ip]['data']['asn_country_code']).name
+            pc = g.ip_objs[ip]['data']['nets'][0]['postal_code']
+            state = abbrev_to_us_state[g.ip_objs[ip]['data']['nets'][0]['state']]
+
+            if country in g.country_dict.keys():
+                g.country_dict[country] += 1
             else:
-                g.postal_code[pc] = 1
-    except KeyError:
-        pass
+                g.country_dict[country] = 1
+
+            if country == 'United States':
+                # Tally Zip Codes
+                if pc in g.postal_code.keys():
+                    g.postal_code[pc] += 1
+                else:
+                    g.postal_code[pc] = 1
+                # Tally States
+                if state in g.states.keys():
+                    g.states[state] += 1
+                else:
+                    g.states[state] = 1
+        except KeyError:
+            pass
 
 
-def plot_postal_codes():
+def plot_location(map_type='state'):
     pandas_bokeh.output_notebook()
-    us = geopandas.read_file("./zip_map/cb_2018_us_zcta510_500k.shp")
 
-    # Set all other Zip Codes to 0
-    import uszipcode
-    all_codes = [i.zipcode for i in uszipcode.SearchEngine().by_pattern('', returns=1000000)]
-    for i, code in enumerate(all_codes):
-        if code not in g.postal_code:
-            g.postal_code[code] = 0
+    if map_type == 'state':
+        us = geopandas.read_file("./state_map/cb_2018_us_state_5m.shp")
+
+        # Ignore extremely high frequency states
+        # g.states['Washington'] = 0
+        # g.states['California'] = 0
+        # g.states['North Carolina'] = 0
+
+        # Relative to the highest frequency state
+        sf = {}
+        highest = max(g.states.values())
+        for state in g.states:
+            sf[state] = g.states[state]/highest
+
+        # Plot State Frequencies
+        df=pd.DataFrame({'PCODE': list(sf.keys()), 'A':list(sf.values()) })
+        # Join State Plot to US Plot
+        new_df=us.join(df.set_index('PCODE'), on='NAME')
 
-    # Plot Zip Code Frequencies
-    df=pd.DataFrame({'PCODE': list(g.postal_code.keys()), 'A':list(g.postal_code.values()) })
+        new_df.plot_bokeh(simplify_shapes=20000,
+                          category="A",
+                          colormap="Viridis",
+                          # marker="inverted_triangle",
+                          hovertool_columns=["NAME","A"])
 
-    # Join Zip Code Plot to US Plot
-    new_df=us.join(df.set_index('PCODE'), on='ZCTA5CE10')
+    elif map_type == 'zip':
+        us = geopandas.read_file("./zip_map/cb_2018_us_zcta510_500k.shp")
+        # Set all other Zip Codes to 0
+        import uszipcode
+        all_codes = [i.zipcode for i in uszipcode.SearchEngine().by_pattern('', returns=1000000)]
+        for i, code in enumerate(all_codes):
+            if code not in g.postal_code:
+                g.postal_code[code] = 0
 
+        # Plot Zip Code Frequencies
+        df=pd.DataFrame({'PCODE': list(g.postal_code.keys()), 'A':list(g.postal_code.values()) })
+        # Join Zip Code Plot to US Plot
+        new_df=us.join(df.set_index('PCODE'), on='ZCTA5CE10')
 
-    new_df.plot_bokeh(simplify_shapes=20000,
-                      category="A",
-                      colormap="Spectral",
-                      # marker="inverted_triangle",
-                      hovertool_columns=["ZCTA5CE10","A"])
+        new_df.plot_bokeh(simplify_shapes=20000,
+                          category="A",
+                          colormap="Inferno",
+                          # marker="inverted_triangle",
+                          hovertool_columns=["ZCTA5CE10","A"])
diff --git a/main.py b/main.py
@@ -7,7 +7,7 @@
 
 from ip import lookup_ips
 from data import g, file_to_analyze
-from location import tally_countries, plot_postal_codes
+from location import tally_countries, plot_location
 
 
 
@@ -36,10 +36,16 @@ def iterate_input(input_file, work_todo, starting_no=0):
             g.save_data()
             print(g.country_dict)
             print(g.postal_code)
+            print(g.states)
         if sys.argv[1] == 'plot':
-            plot_postal_codes()
-            print(g.postal_code)
-        if sys.argv[1] == 'print':
+            if len(sys.argv) == 3:
+                if sys.argv[2] == 'zip':
+                    plot_location(map_type='zip')
+                    print(g.postal_code)
+            else:
+                plot_location(map_type='state')
+                print(g.states)
+        else:
             print(len(g.bad_ips), g.bad_ips)
             print(g.ip_objs.keys())
     except BaseException as e:

diff --git a/state_map/.gitignore b/state_map/.gitignore
diff --git a/states.py b/states.py
@@ -0,0 +1,67 @@
+# Credits to https://gist.github.com/rogerallen/1583593
+
+us_state_to_abbrev = {
+    "Alabama": "AL",
+    "Alaska": "AK",
+    "Arizona": "AZ",
+    "Arkansas": "AR",
+    "California": "CA",
+    "Colorado": "CO",
+    "Connecticut": "CT",
+    "Delaware": "DE",
+    "Florida": "FL",
+    "Georgia": "GA",
+    "Hawaii": "HI",
+    "Idaho": "ID",
+    "Illinois": "IL",
+    "Indiana": "IN",
+    "Iowa": "IA",
+    "Kansas": "KS",
+    "Kentucky": "KY",
+    "Louisiana": "LA",
+    "Maine": "ME",
+    "Maryland": "MD",
+    "Massachusetts": "MA",
+    "Michigan": "MI",
+    "Minnesota": "MN",
+    "Mississippi": "MS",
+    "Missouri": "MO",
+    "Montana": "MT",
+    "Nebraska": "NE",
+    "Nevada": "NV",
+    "New Hampshire": "NH",
+    "New Jersey": "NJ",
+    "New Mexico": "NM",
+    "New York": "NY",
+    "North Carolina": "NC",
+    "North Dakota": "ND",
+    "Ohio": "OH",
+    "Oklahoma": "OK",
+    "Oregon": "OR",
+    "Pennsylvania": "PA",
+    "Rhode Island": "RI",
+    "South Carolina": "SC",
+    "South Dakota": "SD",
+    "Tennessee": "TN",
+    "Texas": "TX",
+    "Utah": "UT",
+    "Vermont": "VT",
+    "Virginia": "VA",
+    "Washington": "WA",
+    "West Virginia": "WV",
+    "Wisconsin": "WI",
+    "Wyoming": "WY",
+    "District of Columbia": "DC",
+    "American Samoa": "AS",
+    "Guam": "GU",
+    "Northern Mariana Islands": "MP",
+    "Puerto Rico": "PR",
+    "United States Minor Outlying Islands": "UM",
+    "U.S. Virgin Islands": "VI",
+}
+
+# invert the dictionary
+abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))
+
+if __name__ == "__main__":
+    print(abbrev_to_us_state['NY'])
-Original file line number
+Diff line change
@@ -131,3 +131,10 @@ dmypy.json
     ips
     *.html
     *.pkl
+    *.cpg
+    *.dbf
+    *.prj
+    *.shp
+    *.shp.ea.iso.xml
+    *.shp.iso.xml
+    *.shx