-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcgiar-csi-srtm.py
148 lines (118 loc) · 4.49 KB
/
cgiar-csi-srtm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import multiprocessing
import os
from urlparse import urlparse
import boto3
import rasterio
from rasterio import crs
from rasterio.warp import (reproject, RESAMPLING, calculate_default_transform)
from rasterio._io import virtual_file_to_buffer
from pyspark import SparkConf, SparkContext
APP_NAME = "SRTM 90m conversion"
"""
spark-submit --master local[*] app.py
"""
def convert(input, output):
    """
    Convert a zipped source GeoTIFF stored on S3 into a tiled,
    deflate-compressed EPSG:4326 GeoTIFF, written either back to S3 or to a
    local path.

    input: "s3://..." URI of a .zip assumed to contain a .tif sharing the
        zip's basename (TODO confirm against the CGIAR-CSI archive layout).
    output: "s3://..." URI or local filesystem path for the converted file.

    Returns `output` on success, or None if processing fails — errors are
    logged and swallowed so one bad tile does not abort the whole Spark job.
    """
    resampling = "bilinear"
    driver = "GTiff"
    dst_crs = "EPSG:4326"
    # Leave half the cores for other work, but never request 0 threads on a
    # single-core machine; `//` keeps integer division under both py2 and py3.
    threads = max(1, multiprocessing.cpu_count() // 2)
    creation_options = {
        "tiled": True,
        "compress": "deflate",
        "predictor": 2,
        "sparse_ok": True,
        "blockxsize": 256,
        "blockysize": 256,
    }
    # Read the zip straight off S3 over HTTP using GDAL's virtual
    # filesystems: /vsizip/ layered on /vsicurl/.
    input = input.replace("s3://", "/vsicurl/http://s3.amazonaws.com/")
    uri = urlparse(output)
    input_uri = urlparse(input)
    input = "/vsizip%s/%s" % (input, os.path.basename(input_uri.path).replace("zip", "tif"))
    print(input)
    resampling = getattr(RESAMPLING, resampling)
    with rasterio.drivers():
        try:
            with rasterio.open(input) as src:
                out_kwargs = src.meta.copy()
                out_kwargs["driver"] = driver
                dst_crs = crs.from_string(dst_crs)
                dst_transform, dst_width, dst_height = calculate_default_transform(
                    src.crs, dst_crs, src.width, src.height, *src.bounds,
                    resolution=None)
                out_kwargs.update({
                    "crs": dst_crs,
                    "transform": dst_transform,
                    "affine": dst_transform,
                    "width": dst_width,
                    "height": dst_height
                })
                out_kwargs.update(**creation_options)
                # Reproject band-by-band into an in-memory GDAL file, then
                # read the finished bytes out of /vsimem below.
                with rasterio.open("/vsimem/img", "w", **out_kwargs) as dst:
                    for i in range(1, src.count + 1):
                        reproject(
                            source=rasterio.band(src, i),
                            destination=rasterio.band(dst, i),
                            src_transform=src.affine,
                            src_crs=src.crs,
                            dst_transform=out_kwargs["transform"],
                            dst_crs=out_kwargs["crs"],
                            resampling=resampling,
                            num_threads=threads)
            contents = bytearray(virtual_file_to_buffer("/vsimem/img"))
            if uri.scheme == "s3":
                client = boto3.client("s3")
                client.put_object(
                    ACL="public-read",
                    Body=bytes(contents),
                    Bucket=uri.netloc,
                    # CacheControl="TODO",
                    ContentType="image/tiff",
                    Key=uri.path[1:]
                )
            else:
                # Binary payload: "wb" (text mode would corrupt the TIFF on
                # Windows) and a context manager so the handle always closes.
                with open(output, "wb") as f:
                    f.write(contents)
            return output
        except Exception:
            # Best-effort per-tile processing: log and continue. A targeted
            # `except Exception` (not a bare `except:`) no longer masks
            # KeyboardInterrupt/SystemExit.
            print("%s failed to process" % (input,))
def main(sc):
    """
    Enumerate source zips under source/ in the bucket, skip any whose
    converted 4326/*.tif counterpart already exists, and convert the rest in
    parallel with Spark.

    sc: an initialized pyspark.SparkContext.
    """
    client = boto3.client("s3")
    bucket = "cgiar-csi-srtm.openterrain.org"
    paginator = client.get_paginator("list_objects")

    def _list_uris(prefix):
        # Flatten every listing page under `prefix` into s3:// URIs.
        # .get("Contents", []) tolerates an empty page (no KeyError).
        return ["s3://%s/%s" % (bucket, obj["Key"])
                for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
                for obj in page.get("Contents", [])]

    # A set makes the already-converted check O(1) per source instead of a
    # linear scan of the whole target list for every candidate.
    existing_targets = set(_list_uris("4326/"))
    sources = [
        src for src in _list_uris("source/")
        if "zip" in src
        and src.replace("source", "4326").replace(".zip", ".tif") not in existing_targets
    ]
    # sources = sources[:1]
    # manually set the number of partitions to avoid large variance in loads
    # (create more partitions than Spark would otherwise)
    sc.parallelize(sources).map(
        lambda src: convert(src, src.replace("source", "4326").replace(".zip", ".tif"))
    ).collect()
if __name__ == "__main__":
conf = SparkConf().setAppName(APP_NAME)
sc = SparkContext(conf=conf)
main(sc)