-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeohelpers.py
229 lines (212 loc) · 9.36 KB
/
geohelpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import numpy as np
import pandas as pd
import geopandas as gpd
import os
def _check_chunksize(chunksize, df_size):
'''Check that the chunksize is reasonable given the size of the data'''
try:
chunksize = int(chunksize)
except TypeError:
chunksize = min(max(df_size//50,5000), 60000)
return chunksize
def cartesian_product(*arrays):
    '''
    Build the cartesian product of the provided arrays.

    arrays: one or more numpy arrays or lists
    Returns: 2-D numpy array with one row per combination and one
             column per input array (meshgrid 'xy' ordering)
    '''
    grids = np.meshgrid(*arrays)
    stacked = np.stack(grids, axis=-1)
    return stacked.reshape(-1, len(arrays))
def generate_key(df, accuracy_m=1000, lats='latitude', longs='longitude'):
    '''
    Generate a 'geokey' column based on the selected level of accuracy

    df(dataframe): pandas-like dataframe containing the latitudes and longitudes to be converted to key
    accuracy_m(int): desired accuracy (in meters) - should be consistent with rest of the data
    lats(str): (optional) specify the name of the column containing the latitudes
    longs(str): (optional) specify the name of the column containing the longitudes
    Returns: copy of df with generated 'geokey' column attached (will replace an
             existing 'geokey' column if present); the coordinate columns in the
             returned copy are rounded to the selected accuracy
    '''
    print('Generating the geokey')
    # ensure that the lats and longs correspond with the selected accuracy
    # (accuracy_m of 1000 m -> 2 decimal places, 1 m -> 5 decimal places)
    round_level = int(5 - np.log10(accuracy_m))
    df = df.copy()
    df[lats] = df[lats].round(round_level)
    df[longs] = df[longs].round(round_level)
    # Scale to integers via .round() before astype(int): bare truncation is
    # wrong whenever float error lands just below the integer
    # (e.g. 0.29 * 100000 == 28999.999... -> key '28999' instead of '29000').
    lat_part = (df[lats] * 100000).round().astype(int).astype(str)
    long_part = (df[longs] * 100000).round().astype(int).astype(str)
    df['geokey'] = lat_part + ';' + long_part
    return df
def generate_points_from_coordinates(df, chunksize=None, lats='latitude', longs='longitude'):
    '''
    Generate a Point geometries column from the latitudes and longitudes

    df(dataframe): pandas-like dataframe containing the latitudes and longitudes to be converted to Points
    chunksize(int): (optional) process the frame in chunks of this many rows
    lats(str): (optional) name of the column containing the latitudes
    longs(str): (optional) name of the column containing the longitudes
    Returns: GeoDataFrame with a 'geometry' column of Points in EPSG:4326 (WGS84)
    '''
    print('Generating the geometry points from the coordinates')
    chunksize = _check_chunksize(chunksize, df.shape[0])
    print(f'selected chunksize {chunksize}', flush=True, end='')
    dfs = []
    for df_chunk in _chunker(df, chunksize):
        # Pass crs to the constructor instead of assigning .crs afterwards:
        # the {'init': 'epsg:4326'} dict form is deprecated in modern
        # geopandas/pyproj in favour of the 'EPSG:4326' authority string.
        df_ret = gpd.GeoDataFrame(
            df_chunk,
            geometry=gpd.points_from_xy(df_chunk[longs], df_chunk[lats]),
            crs='EPSG:4326',
        )
        dfs.append(df_ret)
        print('.', flush=True, end='')
    print('Done!')
    return pd.concat(dfs)
def _generate_smaller_grid(lats, longs, accuracy_m, verbose):
    '''For use with generate grid'''
    n_lat, n_long = len(lats), len(longs)
    # Small enough to materialise the full product directly.
    if max(n_lat, n_long) < 1500:
        if verbose: print('.', end='', flush=True)
        return pd.DataFrame(cartesian_product(lats, longs),
                            columns=['latitude', 'longitude'])
    # Otherwise divide and conquer: recurse on ~100-element slices of
    # whichever axis is longer.
    pieces = []
    if n_lat > n_long:
        for lat_slice in np.array_split(lats, n_lat // 100):
            pieces.append(_generate_smaller_grid(lat_slice, longs, accuracy_m, verbose))
    else:
        for long_slice in np.array_split(longs, n_long // 100):
            pieces.append(_generate_smaller_grid(lats, long_slice, accuracy_m, verbose))
    grid = pd.concat(pieces)
    if verbose: print('collecting results')
    return grid
def generate_grid(lats, longs, accuracy_m=1000, verbose=False):
    '''Create accuracy_m spaced grid using (min,max) pairs provided in lats, longs
    lats(array-like): min,max values for latitude range
    longs(array-like): min,max values for longitude range
    accuracy_m(int): desired approximate accuracy in meters from (1,10,100,1000,10000,100000)
    verbose(bool): set verbosity level
    Returns: DataFrame with latitude/longitude columns covering the grid
             (upper bounds are exclusive, as with np.arange)
    '''
    # Validate explicitly instead of the original catch-all, which swallowed
    # its own ValueError and discarded the conversion error's cause chain.
    try:
        accuracy_m = int(accuracy_m)
    except (TypeError, ValueError) as err:
        raise ValueError('accuracy_m must be one of these values: (1,10,100,1000,10000,100000)') from err
    if accuracy_m not in {10**x for x in range(6)}:
        raise ValueError('accuracy_m must be one of these values: (1,10,100,1000,10000,100000)')
    # ~100000 m per degree, so the grid step in degrees is accuracy_m / 100000.
    steps = accuracy_m / 100000
    print('Generating point grid')
    lats = np.arange(np.min(lats), np.max(lats), steps)
    longs = np.arange(np.min(longs), np.max(longs), steps)
    ret = _generate_smaller_grid(lats, longs, accuracy_m, verbose).reset_index(drop=True)
    print(f'\nGrid of size {ret.shape} generated!')
    return ret
def _chunker(df, chunksize=None):
'''
df(dataframe): Pandas-like df to be split into chunks
chunksize(int): desired size of smaller dfs
returns: generator of dfs with at most chunksize rows or at most chunks entries
'''
if not chunksize:
return [df]
df_in = df.copy()
dfs = []
for (g,df) in df_in.groupby(np.arange(df_in.shape[0]) // chunksize):
dfs+=[df]
return dfs
def do_join(points, geometries):
    '''
    Helper to do the geometry join

    points(geodataframe): GeoDataFrame of Point geometries to locate
    geometries(geodataframe): GeoDataFrame of geometries to join against
    Returns: inner spatial join - only points falling within one of the
             geometries are kept; clashing column names from `geometries`
             get the '_geometries' suffix.
    NOTE(review): geopandas renamed `op=` to `predicate=` in 0.10 and later
    removed the old keyword entirely - confirm the pinned geopandas version
    still accepts `op='within'` before upgrading.
    '''
    joined = gpd.sjoin(points, geometries, how='inner', op='within', rsuffix='geometries')
    return joined
def locate_points(points, geometries, chunksize=None, verbose=False):
    '''
    Locate the points within the provided geometries

    points(geopandasdf): GeoPandasDF containing a single column of Points to locate in geometries
    geometries(geopandasdf): GeoPandasDF containing a single column of Geometries
    chunksize(int): (optional) specify the size of the chunks in which to process the points
    verbose(bool): Do you want all the information? (Default False)
    Returns: inner-joined GeoDataFrame (points outside every geometry are
             dropped), with a fresh RangeIndex
    '''
    print('Locating the points')
    total = points.shape[0]  # hoisted: used repeatedly below
    chunksize = _check_chunksize(chunksize, total)
    if total > 500000:
        print('There are many points to locate - this is going to take a while!')
    results = []
    n, steps = 0, 0
    for small_points in _chunker(points, chunksize):
        if verbose: print('.', flush=True, end='')
        results += [do_join(small_points, geometries)]
        steps += 1
        n += small_points.shape[0]
        # Periodic progress report so long runs show signs of life.
        if steps % 10 == 0:
            print(f' ({n} of {total} [{n*100//total}%] processed)')
    print('Combining results')
    ret = pd.concat(results)
    print('Done!')  # plain string: the original f-string had no placeholders
    if verbose:
        print(f'{ret.shape[0]} of {total} located within geometries')
        print(f'{total - ret.shape[0]} points were not found within provided geometries!')
    return ret.reset_index(drop=True)
def save_data(df, filename='filename.json.gz', directory='processed_data', columns=None, skip_checks=True):
    '''
    Save the data in json records format, optionally only keeping certain columns

    df(dataframe): dataframe to save
    filename(str): the name of the destination filename, including extension and compression
    directory(str): the directory in which to save the files (created if missing)
    columns(list/dict): list of columns to keep in the saved file.
        If dictionary is passed in columns, the keys will be used to filter the df
        while the (key:value) pairs will be used to rename the columns
    skip_checks(bool): when False, run check_first_col on the result before returning
    Returns: the (possibly filtered/renamed) dataframe that was written
    '''
    if columns:
        if isinstance(columns, dict):
            # Keys select the columns to keep; the mapping renames them.
            df = df[list(columns)]
            df = df.rename(columns=columns)
        else:
            # Any other iterable of names (list, tuple, Index, ...) filters.
            # The original's inverted isinstance checks fell through here
            # and silently skipped filtering for plain lists.
            df = df[list(columns)]
    df = pd.DataFrame(df)
    try:
        # Create target Directory
        os.mkdir(directory)
        print(f"Directory {directory} created")
    except FileExistsError:
        print(f"Directory {directory} already exists")
    full_pathname = os.path.join(directory, filename)
    print(f'Writing file at {full_pathname}')
    df.to_json(full_pathname, orient='records')
    if not skip_checks:
        check_first_col(df)
    return df
def process_dataframe(df, geometries, accuracy_m=1000, chunksize=None, verbose=False):
    '''
    Full pipeline: generate geokeys for the coordinates, build Point
    geometries, then locate the points within the provided geometries.

    df(dataframe): pandas-like dataframe with 'latitude'/'longitude' columns
    geometries(geopandasdf): geometries to locate the points within
    accuracy_m(int): desired accuracy in meters (see generate_key)
    chunksize(int): (optional) chunk size used for the point-generation and locate steps
    verbose(bool): verbosity for the locate step
    Returns: the located GeoDataFrame produced by locate_points
    '''
    chunksize = _check_chunksize(chunksize, df.shape[0])
    df = generate_key(df, accuracy_m)
    df = generate_points_from_coordinates(df, chunksize)
    df = locate_points(df, geometries, chunksize, verbose)
    return df
def check_table_key(df, key):
    '''
    Check that the key in the table is unique and not null. Print warning if duplicates or nulls detected

    df(dataframe): pandas-like dataframe to be checked
    key(str/list): the column(s) in the df to be used as primary key of table
    Returns: True when every key column passed both checks, False otherwise.
             (The original returned True unconditionally, which made the
             failure branch in check_first_col unreachable.)
    '''
    if isinstance(key, str):
        key = [key]
    passed = True
    for key_col in key:
        key_data = df[key_col]
        if key_data.isnull().sum() != 0:
            print(f'WARNING! Nulls discovered in {key_col}! This column is intended to be used as a key.')
            passed = False
        elif key_data.nunique() != df.shape[0]:
            print(f'WARNING! Duplicates detected in {key_col}! This column is intended to be used as a key')
            passed = False
        else:
            print(f'Column {key_col} passed notnull and unique checks!')
    return passed
def check_grid(grid):
    '''
    Verify that the generated grid is not empty.
    Prints a warning when it has no rows, a success message otherwise.
    '''
    if grid.shape[0] > 0:
        print('Generated grid passed quality check!')
        return
    print('WARNING! The grid contains no rows!')
    print('This is likely caused by incorrect latitude/longitude ranges or')
    print('by an incorrect accuracy_m setting. Please double-check')
def check_first_col(df):
    '''
    Check the first column of the dataset, assuming it is intended as the
    key of the table; print a reminder to review column ordering on failure.
    '''
    key_column = list(df)[0]
    if not check_table_key(df, key_column):
        print(f'First column ({key_column}) of dataset did not pass notnull and unique checks!')
        print('-- double check column ordering')