forked from ourresearch/jump-api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcitation_authorship_update.py
215 lines (177 loc) · 6.74 KB
/
citation_authorship_update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# coding: utf-8
import click
from psycopg2 import sql
import datetime
from app import get_db_cursor
# 7 days, as seconds
UPDATE_AFTER_DEFAULT = 604800
# update_after = 600
def grid_id_delete(grid_id, table):
with get_db_cursor() as cursor:
cmd = "delete from {} where grid_id=%s".format(table)
click.echo(cursor.mogrify(cmd, (grid_id,)))
cursor.execute(cmd, (grid_id,))
def grid_id_insert(grid_id, table, view):
with get_db_cursor() as cursor:
cmd = "insert into {} (select * from {} where grid_id = %s)".format(table, view)
click.echo(cursor.mogrify(cmd, (grid_id,)))
cursor.execute(cmd, (grid_id,))
def update_citing(grid_id):
grid_id_delete(grid_id, "jump_citing")
grid_id_insert(grid_id, "jump_citing", "jump_citing_view")
def update_authorship(grid_id):
grid_id_delete(grid_id, "jump_authorship")
grid_id_insert(grid_id, "jump_authorship", "jump_authorship_view")
def update_apc_authorships(package_id):
with get_db_cursor() as cursor:
cmd = "delete from jump_apc_authorships where package_id=%s"
click.echo(cursor.mogrify(cmd, (package_id,)))
cursor.execute(cmd, (package_id,))
with get_db_cursor() as cursor:
cmd = """insert into jump_apc_authorships (
select * from jump_apc_authorships_view
where package_id = %s and issn_l in
(select issn_l from openalex_computed))
"""
click.echo(cursor.mogrify(cmd, (package_id,)))
cursor.execute(cmd, (package_id,))
def check_updated(grid_id, table):
with get_db_cursor() as cursor:
cmd = "select * from {} where grid_id=%s order by updated desc limit 1".format(table)
click.echo(cursor.mogrify(cmd, (grid_id,)))
cursor.execute(cmd, (grid_id,))
rows = cursor.fetchall()
return rows
def check_updated_apc(pkg_id):
with get_db_cursor() as cursor:
cmd = "select * from jump_apc_authorships_updates where package_id=%s order by updated desc limit 1"
click.echo(cursor.mogrify(cmd, (pkg_id,)))
cursor.execute(cmd, (pkg_id,))
rows = cursor.fetchall()
return rows
def record_update(grid_id, table, err, institution_id = None, package_id = None):
if table in ['jump_citing_updates','jump_authorship_updates',]:
cols = ['updated', 'grid_id', 'error']
values = (datetime.datetime.utcnow(), grid_id, err, )
else:
cols = ['updated', 'institution_id', 'package_id', 'error',]
values = (datetime.datetime.utcnow(), institution_id, package_id, err, )
with get_db_cursor() as cursor:
qry = sql.SQL("insert into {} ({}) values ({})").format(
sql.Identifier(table),
sql.SQL(', ').join(map(sql.Identifier, cols)),
sql.SQL(', ').join(sql.Placeholder() * len(cols)))
click.echo(cursor.mogrify(qry, values))
cursor.execute(qry, values)
@click.group()
def cli():
"""
Update jump_citing, jump_authorship, and jump_apc_authorships
Examples:
python citation_authorship_update.py --help
python citation_authorship_update.py citing --help
python citation_authorship_update.py citing
python citation_authorship_update.py authorship
python citation_authorship_update.py apc
"""
@cli.command(short_help='Update jump_citing table for each grid_id')
@click.option("--update_after", help="update after (seconds)", type=int)
def citing(update_after=None):
click.echo("Updating jump_citing")
update_after = update_after or UPDATE_AFTER_DEFAULT
with get_db_cursor() as cursor:
cmd = "select DISTINCT(grid_id) from jump_citing"
cursor.execute(cmd)
rows = cursor.fetchall()
jc_grid_ids_uniq = [w[0] for w in rows]
# len(jc_grid_ids_uniq)
# grid_id = jc_grid_ids_uniq[3]
for grid_id in jc_grid_ids_uniq:
click.echo(f"working on (update_citing, {grid_id})")
res = check_updated(grid_id, "jump_citing_updates")
update = False
if not res:
update = True
else:
if (datetime.datetime.utcnow() - res[0]['updated']).seconds > update_after:
update = True
if not update:
click.echo(f"(update_citing, {grid_id}) already updated recently")
else:
mssg = None
try:
update_citing(grid_id)
except Exception as err:
mssg = str(err)
click.echo(f"(update_citing, {grid_id}) failed: {err}")
record_update(grid_id, 'jump_citing_updates', mssg)
@cli.command(short_help='Update jump_authorship table for each grid_id')
@click.option("--update_after", help="update after (seconds)", type=int)
def authorship(update_after=None):
click.echo("Updating jump_authorship")
update_after = update_after or UPDATE_AFTER_DEFAULT
with get_db_cursor() as cursor:
cmd = "select distinct(grid_id) from jump_grid_id where grid_id not ilike '%example%'"
cursor.execute(cmd)
authrows = cursor.fetchall()
ja_grid_ids_uniq = [w[0] for w in authrows]
# len(ja_grid_ids_uniq)
# grid_id = ja_grid_ids_uniq[3]
for grid_id in ja_grid_ids_uniq:
click.echo(f"working on (update_authorship, {grid_id})")
res = check_updated(grid_id, "jump_authorship_updates")
update = False
if not res:
update = True
else:
if (datetime.datetime.utcnow() - res[0]['updated']).seconds > update_after:
update = True
if not update:
click.echo(f"(update_authorship, {grid_id}) already updated recently")
else:
mssg = None
try:
update_authorship(grid_id)
except Exception as err:
mssg = str(err)
click.echo(f"(update_authorship, {grid_id}) failed: {err}")
record_update(grid_id, 'jump_authorship_updates', mssg)
@cli.command(short_help='Update jump_apc_authorships table for each package_id')
@click.option("--update_after", help="update after (seconds)", type=int)
def apc(update_after=None):
click.echo("Updating jump_apc_authorships")
update_after = update_after or UPDATE_AFTER_DEFAULT
# exclude packages: deleted, demo, and consortial feeder (-f for some jisc pkgs, and others)
with get_db_cursor() as cursor:
cmd = """
select package_id, institution_id from jump_account_package
where package_id in (select DISTINCT(package_id) from jump_apc_authorships)
and not is_deleted
and not is_demo
and package_id not ilike '%-f'
and package_id not in (select DISTINCT(member_package_id) from jump_consortium_members)
"""
cursor.execute(cmd)
pkgrows = cursor.fetchall()
for row in pkgrows:
click.echo(f"working on (update_apc_authorships, {row['package_id']})")
res = check_updated_apc(row['package_id'])
update = False
if not res:
update = True
else:
if (datetime.datetime.utcnow() - res[0]['updated']).seconds > update_after:
update = True
if not update:
click.echo(f"(update_apc_authorships, {row['package_id']}) already updated recently")
else:
mssg = None
try:
update_apc_authorships(row['package_id'])
except Exception as err:
mssg = str(err)
click.echo(f"(update_apc_authorships, {row['package_id']}) failed: {err}")
record_update(None, 'jump_apc_authorships_updates', mssg,
institution_id = row['institution_id'], package_id = row['package_id'])
if __name__ == "__main__":
cli()