forked from ourresearch/jump-api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_accounts.py
232 lines (185 loc) · 9.07 KB
/
import_accounts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# coding: utf-8
import openpyxl
import unicodecsv as csv
import argparse
import datetime
from dateutil import parser
import glob
import codecs
from app import db
from app import get_db_cursor
from package import Package
from util import read_csv_file
from util import safe_commit
from util import is_issn
from util import write_to_tempfile
def convert_from_utf16_to_utf8(filename):
# https://stackoverflow.com/a/10300007/596939
BLOCKSIZE = 1048576 # or some other, desired size in bytes
with codecs.open(filename, "r", "utf-16") as sourceFile:
new_filename = "{}_utf8".format(filename)
with codecs.open(new_filename, "w", "utf-8") as targetFile:
while True:
contents = sourceFile.read(BLOCKSIZE)
if not contents:
break
targetFile.write(contents)
return new_filename
def convert_date_spaces_to_dashes(filename):
# https://stackoverflow.com/a/10300007/596939
BLOCKSIZE = 1048576 # or some other, desired size in bytes
with codecs.open(filename, "r", "utf-8") as sourceFile:
new_filename = "{}_dashes".format(filename)
with codecs.open(new_filename, "w", "utf-8") as targetFile:
while True:
contents = sourceFile.read(BLOCKSIZE)
if not contents:
break
contents = contents.replace(" 2018,", "-2018,")
targetFile.write(contents)
return new_filename
def import_consortium_counter_xls(xls_filename):
results = []
xlsx_file = open(xls_filename, "rb")
workbook = openpyxl.load_workbook(xlsx_file, read_only=True)
sheetnames = list(workbook.sheetnames)
for sheetname in sheetnames:
university = sheetname.replace("JR1 ", "")
print(university)
sheet = workbook[sheetname]
column_names = {}
for i, column in enumerate(list(sheet.iter_rows(min_row=1, max_row=1))[0]):
column_names[column.value] = i
for row_cells in sheet.iter_rows(min_row=1):
issn = row_cells[column_names["Print ISSN"]].value
total = row_cells[column_names["Reporting period total"]].value
if is_issn(issn):
results.append({
"university": university,
"issn": issn,
"total": total
})
# print ".",
# print university, issn, total
with open("data/countercleaned.csv", "w") as csv_file:
csv_writer = csv.writer(csv_file, encoding="utf-8")
header = ["university", "issn", "total"]
csv_writer.writerow(header)
for my_dict in results:
# doing this hacky thing so excel doesn't format the issn as a date :(
csv_writer.writerow([my_dict[k] for k in header])
def import_perpetual_access_files():
results = []
my_files = glob.glob("/Users/hpiwowar/Downloads/wvu_perpetual_access.csv")
my_files.reverse()
for my_file in my_files:
print(my_file)
if False:
xlsx_file = open(my_file, "rb")
workbook = openpyxl.load_workbook(xlsx_file, read_only=True)
sheetnames = list(workbook.sheetnames)
for sheetname in sheetnames:
sheet = workbook[sheetname]
column_names = {}
for i, column in enumerate(list(sheet.iter_rows(min_row=1, max_row=1))[0]):
column_names[column.value] = i
for row_cells in sheet.iter_rows(min_row=1):
username = row_cells[column_names["Account Name"]].value
issn = row_cells[column_names["ISSN (FS split)"]].value
start_date = row_cells[column_names["Content Start Date"]].value
end_date = row_cells[column_names["Content End Date"]].value
if is_issn(issn):
new_dict = {
"username": username,
"issn": issn,
"start_date": start_date,
"end_date": end_date
}
results.append(new_dict)
# print new_dict
print(".", end=' ')
else:
rows = read_csv_file(my_file)
for row in rows:
print(row)
new_dict = {
"username": "wvu",
"issn": row["issn"],
"start_date": row["start_date"],
"end_date": row["end_date"]
}
results.append(new_dict)
# print new_dict
print(".", end=' ')
with open("/Users/hpiwowar/Downloads/perpetual_access_cleaned.csv", "w") as csv_file:
csv_writer = csv.writer(csv_file, encoding="utf-8")
header = ["username", "issn", "start_date", "end_date"]
csv_writer.writerow(header)
for my_dict in results:
csv_writer.writerow([my_dict[k] for k in header])
print("/Users/hpiwowar/Downloads/perpetual_access_cleaned.csv")
# python import_accounts.py --filename=~/Downloads/new_accounts.csv
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run stuff.")
parser.add_argument("--filename", type=str, default=None, help="input file to parse")
parser.add_argument("--username", type=str, default=None, help="username to input")
parsed_args = parser.parse_args()
parsed_vars = vars(parsed_args)
# create_accounts(parsed_vars["filename"])
# build_counter_import_file(filename=parsed_vars["filename"], username=parsed_vars["username"])
crkn_ids = read_csv_file("/Users/hpiwowar/Documents/Projects/tiv2/jump-api/data/crkn_lookup.csv")
institution_for_all_these_packages = "institution-fVnPvXK9iBYA"
# report_name = "trj2"
# all_in_one_data_rows = read_csv_file("/Users/hpiwowar/Documents/Projects/tiv2/jump-api/data/counter5_crkn/TR_J2 SUSHI Harvester CRKN_Wiley-2019_incl-non-participants.csv")
# report_name = "trj3"
# all_in_one_data_rows = read_csv_file("/Users/hpiwowar/Documents/Projects/tiv2/jump-api/data/counter5_crkn/TR_J3 SUSHI Harvester CRKN_Wiley-2019.csv")
# report_name = "trj4"
# all_in_one_data_rows = read_csv_file("/Users/hpiwowar/Documents/Projects/tiv2/jump-api/data/counter5_crkn/1 TR_J4 SUSHI Harvester CRKN_Wiley-2019.csv")
# all_in_one_data_rows = read_csv_file("/Users/hpiwowar/Documents/Projects/tiv2/jump-api/data/counter5_crkn/2 TR_J4 SUSHI Harvester CRKN_Wiley-2019.csv")
report_name = "trj4"
all_in_one_data_rows = read_csv_file(" /Users/hpiwowar/Dropbox/companywide/unsub_customer_data/learn_counter5/raw_from_unis/crkn/elsevier/COP5_TR_J4_2019_UIR.xlsx")
for row in crkn_ids:
print("row: {}".format(list(row.values())))
publisher_name = "Elseiver"
crkn_institution_id = row["crkn_elsevier"]
institution_number = row["institution_id"].replace("institution-", "")
package_id = "package-CRKNcounter5{}{}".format(publisher_name, institution_number)
package_name = "CRKN COUNTER5 {} {}".format(publisher_name, row["name"])
from counter import CounterInput
my_counter_test = CounterInput.query.filter(CounterInput.package_id == package_id,
CounterInput.report_name == report_name).first()
has_counter_data_to_load = False
if my_counter_test:
print("counter already loaded for {}".format(report_name))
else:
temp_filename = "data/temp.csv"
with open(temp_filename, "w") as csv_file:
csv_writer = csv.writer(csv_file, encoding="utf-8")
header = list(all_in_one_data_rows[0].keys())
csv_writer.writerow(header)
for row in all_in_one_data_rows:
if row["Customer ID"] == crkn_institution_id:
# doing this hacky thing so excel doesn't format the issn as a date :(
csv_writer.writerow([row[k] for k in header])
has_counter_data_to_load = True
print("now loading in counter data")
if not has_counter_data_to_load:
print("has no counter data to load")
else:
# see if package exists:
my_package = Package.query.get(package_id)
if not my_package:
my_package = Package(
package_id=package_id,
publisher=publisher_name,
package_name=package_name,
created=datetime.datetime.utcnow().isoformat(),
institution_id=institution_for_all_these_packages,
is_demo=False
)
db.session.add(my_package)
safe_commit(db)
else:
print("package already created")
print("{}".format(my_package))
CounterInput().load(package_id, temp_filename, commit=True)