"""common_data.py (forked from ourresearch/jump-api)

Gathers shared journal-level lookup data from the database and uploads it
to S3 as a single gzipped JSON package.
"""
import os
import argparse
import json
import gzip
from collections import defaultdict

from app import get_db_cursor
from app import s3_client


def get_embargo_data_from_db():
    """Map each ISSN-L to its rounded embargo value from journal_delayed_oa_active."""
    command = "select issn_l, embargo from journal_delayed_oa_active"
    with get_db_cursor() as cursor:
        cursor.execute(command)
        embargo_rows = cursor.fetchall()
    return {row["issn_l"]: round(row["embargo"]) for row in embargo_rows}
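
# A minimal usage sketch (the ISSN-L below is hypothetical; real keys come
# from the journal_delayed_oa_active table):
#
#   embargo_dict = get_embargo_data_from_db()
#   embargo_dict.get("1234-5678")  # -> rounded embargo as an int, or None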


def get_unpaywall_downloads_from_db():
    """Fetch Unpaywall download rows for journals that appear in jump_counter."""
    command = "select * from jump_unpaywall_downloads where issn_l in (select distinct issn_l from jump_counter)"
    with get_db_cursor() as cursor:
        cursor.execute(command)
        big_view_rows = cursor.fetchall()
    return {row["issn_l"]: dict(row) for row in big_view_rows}
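
# Each value is the full row as a plain dict, keyed by column name; issn_l is
# the only column guaranteed by this query, the rest depend on the table schema:
#
#   downloads = get_unpaywall_downloads_from_db()
#   downloads["1234-5678"]["issn_l"]  # hypothetical ISSN-L key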


def get_num_papers_from_db():
    """Build a nested lookup of paper counts: issn_l -> year -> num_papers (2014 onward)."""
    command = "select issn_l, year, num_papers from jump_num_papers_oa where year >= 2014"
    with get_db_cursor() as cursor:
        cursor.execute(command)
        rows = cursor.fetchall()
    lookup_dict = defaultdict(dict)
    for row in rows:
        lookup_dict[row["issn_l"]][row["year"]] = row["num_papers"]
    return lookup_dict
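
# Because the lookup is a defaultdict(dict), a missing journal yields an empty
# dict rather than a KeyError (ISSN-L and year below are hypothetical):
#
#   num_papers = get_num_papers_from_db()
#   num_papers["1234-5678"].get(2018, 0)  # 0 when no row exists for that year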


def get_oa_data_from_db():
    """Fetch OA rows for each of the four submitted/bronze table variants.

    Returns a dict keyed by variant name (e.g. "with_submitted_no_bronze"),
    each mapping issn_l to a list of row dicts from the matching jump_oa_* table.
    """
    oa_dict = {}
    for submitted in ["with_submitted", "no_submitted"]:
        for bronze in ["with_bronze", "no_bronze"]:
            key = "{}_{}".format(submitted, bronze)
            command = """select * from jump_oa_{}
                where year_int >= 2015
                """.format(key)
            with get_db_cursor() as cursor:
                cursor.execute(command)
                rows = cursor.fetchall()
            for x in rows:
                x["year_int"] = int(x["year_int"])
            lookup_dict = defaultdict(list)
            for row in rows:
                lookup_dict[row["issn_l"]].append(dict(row))
            oa_dict[key] = lookup_dict
    return oa_dict
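
# A sketch of reading one variant (the variant keys mirror the jump_oa_*
# table suffixes; the ISSN-L is hypothetical):
#
#   oa = get_oa_data_from_db()
#   rows = oa["with_submitted_no_bronze"]["1234-5678"]
#   years = [r["year_int"] for r in rows]  # ints >= 2015 per the query above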


def get_society_data_from_db():
    """Map each ISSN-L to its is_society_journal flag (NULL rows are excluded)."""
    command = "select issn_l, is_society_journal from jump_society_journals_input where is_society_journal is not null"
    with get_db_cursor() as cursor:
        cursor.execute(command)
        rows = cursor.fetchall()
    lookup_dict = {}
    for row in rows:
        lookup_dict[row["issn_l"]] = row["is_society_journal"]
    return lookup_dict


def get_social_networks_data_from_db():
    """Map each ISSN-L to its asn_only_rate, cast to float, from jump_mturk_asn_rates."""
    command = "select issn_l, asn_only_rate::float from jump_mturk_asn_rates"
    with get_db_cursor() as cursor:
        cursor.execute(command)
        rows = cursor.fetchall()
    return {row["issn_l"]: row["asn_only_rate"] for row in rows}


def write_json(title, data):
    """Dump data as JSON to the file at the given path."""
    print("dumping")
    with open(title, "w") as f:
        json.dump(data, f)
    print("done dumping")


def gather_common_data():
    """Collect all of the shared lookup dicts into a single package."""
    my_data = {}
    my_data["embargo_dict"] = get_embargo_data_from_db()
    my_data["unpaywall_downloads_dict_raw"] = get_unpaywall_downloads_from_db()
    my_data["social_networks"] = get_social_networks_data_from_db()
    my_data["oa"] = get_oa_data_from_db()
    my_data["society"] = get_society_data_from_db()
    my_data["num_papers"] = get_num_papers_from_db()
    return my_data
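
# The package serializes to JSON along these lines (values hypothetical;
# defaultdicts serialize as plain objects and int year keys become strings):
#
#   {
#     "embargo_dict": {"1234-5678": 12},
#     "unpaywall_downloads_dict_raw": {"1234-5678": {...}},
#     "social_networks": {"1234-5678": 0.1},
#     "oa": {"with_submitted_with_bronze": {"1234-5678": [{...}]}},
#     "society": {"1234-5678": true},
#     "num_papers": {"1234-5678": {"2018": 42}}
#   }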


def upload_common_data():
    """Gather the common package data, write it as gzipped JSON, and upload to S3."""
    print("gathering data from database")
    data = gather_common_data()

    # remove any stale local copy before writing a fresh one
    try:
        os.remove("data/common_package_data_for_all.json.gz")
    except OSError:
        pass

    # default=str covers values json can't serialize natively (dates, Decimals, etc.)
    with gzip.open("data/common_package_data_for_all.json.gz", "w") as f:
        f.write(json.dumps(data, default=str).encode("utf-8"))

    print("uploading to S3")
    s3_client.upload_file(
        Filename="data/common_package_data_for_all.json.gz",
        Bucket="unsub-cache",
        Key="common_package_data_for_all.json.gz")
    print("done!")


# heroku local:run python common_data.py --run
# heroku run --size=performance-l python common_data.py --run -r heroku
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--run", help="Prepare common data and upload to S3", action="store_true", default=False)
    parsed_args = parser.parse_args()
    if parsed_args.run:
        upload_common_data()