forked from ourresearch/jump-api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprices_public_custom_join.py
140 lines (121 loc) · 5.01 KB
/
prices_public_custom_join.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import datetime
from app import get_db_cursor
from package import Package
from psycopg2 import sql
from psycopg2.extras import execute_values
# Internal publisher codes -> display names as stored in the
# openalex_computed.publisher column. Codes absent from this map
# (e.g. "Elsevier", "Wiley") pass through unchanged.
_PUBLISHER_DISPLAY_NAMES = {
    "SpringerNature": "Springer Nature",
    "Sage": "SAGE",
    "TaylorFrancis": "Taylor & Francis",
}

def publisher_switch(publisher):
    """Translate an internal publisher code to its public display name.

    Returns the input unchanged when no mapping exists (including None).
    """
    return _PUBLISHER_DISPLAY_NAMES.get(publisher, publisher)
# One-time setup in the database (run manually before first use):
# create table jump_journal_prices_pre_public_price_add as (select * from jump_journal_prices limit 1);
# truncate jump_journal_prices_pre_public_price_add;

# Candidate packages: active (not deleted), with a currency set, belonging
# to institutions present in the debug combo view.
with get_db_cursor() as cursor:
    qry = """
    select package_id from jump_account_package
    where not is_deleted
    and currency is not null
    and institution_id in (select distinct(institution_id) from jump_debug_combo_view)
    """
    cursor.execute(qry)
    pkg_ids = [row[0] for row in cursor.fetchall()]

# Skip packages whose public prices were already merged in a previous run.
with get_db_cursor() as cursor:
    qry = "select DISTINCT(package_id) from jump_journal_prices where public_price;"
    cursor.execute(qry)
    pkg_already_done = [row[0] for row in cursor.fetchall()]

print(f"{len(pkg_already_done)} packages already done")
pkg_ids = list(set(pkg_ids) - set(pkg_already_done))
print(f"Getting to work on {len(pkg_ids)} packages\n")
# For each package: fetch public list prices in the package currency, merge
# them with the institution's custom prices (custom wins per ISSN), back up
# and replace the package's rows in jump_journal_prices, and keep
# jump_raw_file_upload_object in sync so the app shows a price file.
for package_id in pkg_ids:
    print(f"({package_id})")
    pkg = Package.query.get(package_id)
    publisher = pkg.publisher
    currency = pkg.currency
    # CRKN packages are treated as Springer Nature when publisher is unset.
    if not publisher and 'CRKN' in pkg.package_name:
        publisher = 'SpringerNature'
    if not publisher:
        continue
    # get public price data
    print(f" ({package_id}) getting public price data")
    with get_db_cursor() as cursor:
        # The currency column suffix is interpolated with .format(); currency
        # comes from our own jump_account_package table (not user input), so
        # this is acceptable, but all values go through %s placeholders.
        qry = """
        select %s as package_id,publisher,title,issn_l,null as subject,subscription_price_{} as price,null as year,sysdate as created,true as public_price
        from openalex_computed
        where publisher ilike %s
        and subscription_price_{} is not null
        """.format(currency.lower(), currency.lower())
        cursor.execute(qry, (package_id, publisher_switch(publisher),))
        public_price_rows = cursor.fetchall()
    # get custom (institution-uploaded) price data for this package
    print(f" ({package_id}) getting custom price data")
    with get_db_cursor() as cursor:
        qry = "select * from jump_journal_prices where package_id = %s"
        cursor.execute(qry, (package_id,))
        price_rows = cursor.fetchall()
    # keep a public price only for journals with no custom price
    if price_rows:
        # FIX: build the custom-ISSN set once instead of rebuilding a list
        # comprehension per public row (was accidentally O(n*m))
        custom_issns = {x['issn_l'] for x in price_rows}
        public_price_rows_to_add = [w for w in public_price_rows if w['issn_l'] not in custom_issns]
    else:
        print(f" ({package_id}) no custom prices found")
        public_price_rows_to_add = public_price_rows
    # combine custom and public prices
    price_all_rows = price_rows + public_price_rows_to_add
    # FIX: price_all_rows[0] below raised IndexError when a package had
    # neither custom nor public prices; skip such packages instead
    if not price_all_rows:
        print(f" ({package_id}) no prices found at all - skipping")
        continue
    # put current prices into a backup table to retrieve later if needed
    print(f" ({package_id}) backing up prices")
    with get_db_cursor() as cursor:
        qry_backup = "insert into jump_journal_prices_pre_public_price_add (select * from jump_journal_prices where package_id = %s)"
        cursor.execute(qry_backup, (package_id,))
    # delete current prices for pkg
    print(f" ({package_id}) deleting old prices from jump_journal_prices")
    with get_db_cursor() as cursor:
        qry_delete = "delete from jump_journal_prices where package_id = %s"
        cursor.execute(qry_delete, (package_id,))
    # insert merged prices for pkg; column names taken from the first row
    print(f" ({package_id}) inserting new prices {len(price_all_rows)} into jump_journal_prices")
    cols = list(price_all_rows[0].keys())
    with get_db_cursor() as cursor:
        qry = sql.SQL(
            "INSERT INTO jump_journal_prices ({}) VALUES %s"
        ).format(sql.SQL(", ").join(map(sql.Identifier, cols)))
        execute_values(cursor, qry, price_all_rows, page_size=500)
    # create or update the jump_raw_file_upload_object row for the price file
    with get_db_cursor() as cursor:
        qry_exists = "select count(*) from jump_raw_file_upload_object where package_id = %s and file = 'price'"
        cursor.execute(qry_exists, (package_id,))
        file_upload = cursor.fetchone()
    if not file_upload[0]:
        print(f" ({package_id}) no price entry in jump_raw_file_upload_object - creating")
        with get_db_cursor() as cursor:
            file_upload_cols = ['package_id','file','bucket_name','object_name','created','num_rows']
            qry = sql.SQL(
                "INSERT INTO jump_raw_file_upload_object ({}) VALUES %s"
            ).format(sql.SQL(", ").join(map(sql.Identifier, file_upload_cols)))
            cursor.execute(qry, [(package_id, "price", "unsub-file-uploads", f"{package_id}_price.csv", datetime.datetime.utcnow(), len(price_all_rows),)])
    else:
        print(f" ({package_id}) price entry found in jump_raw_file_upload_object - updating")
        with get_db_cursor() as cursor:
            qry = """
            UPDATE jump_raw_file_upload_object
            SET num_rows=%s, created=sysdate
            where package_id = %s
            and file = 'price'
            """
            cursor.execute(qry, (len(price_all_rows), package_id,))
    print("\n")