-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjson2csv.py
68 lines (55 loc) · 2.71 KB
/
json2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
# Attribute names found in the records; kept for reference only -- the export
# below is driven by `head`.
fields = ["key", "title", "subtitle", "authors", "translated_titles", "subjects", "subject_places", "subject_times", "subject_people", "description", "dewey_number", "lc_classifications", "first_sentence", "original_languages ", "other_titles", "first_publish_date", "links", "notes", "cover_edition", "covers"]

# Columns written to the CSV, in output order.  The subject_* attributes have
# no column of their own: they are merged into the "subjects" column.
# NOTE(review): "original_languages " carries a trailing space, so it never
# matches a record key and that column is always empty.  Left untouched so the
# emitted header stays byte-identical -- confirm before renaming.
head = ["key", "title", "subtitle", "authors", "translated_titles", "subjects", "description", "dewey_number", "lc_classifications", "first_sentence", "original_languages ", "other_titles", "first_publish_date", "links", "notes", "cover_edition", "covers"]

# Record attributes that are folded together into the "subjects" column.
_SUBJECT_KEYS = ("subjects", "subject_places", "subject_times", "subject_people")

# Tabs and line breaks are dropped from every row so that one record always
# occupies exactly one physical line of the output.
_ROW_BREAKERS = str.maketrans("", "", "\t\r\n")


def _author_ids(entries):
    """Return the third '/'-separated segment of each entry's author key.

    Entries that do not have the ``{"author": {"key": "a/b/c"}}`` shape are
    skipped rather than aborting the whole record.
    """
    ids = []
    for entry in entries:
        try:
            ids.append(entry["author"]["key"].split("/")[2].replace(";", ","))
        except (KeyError, TypeError, IndexError, AttributeError):
            # Malformed author entry: best-effort, leave it out.
            continue
    return ids


def _subject_terms(record):
    """Collect the terms of all subject-like attributes of *record*.

    A missing or malformed attribute stops the collection for that attribute
    only; terms gathered before the problem are kept.
    """
    terms = []
    for name in _SUBJECT_KEYS:
        try:
            for term in record[name]:
                terms.append(term.replace(";", ","))
        except (KeyError, TypeError, AttributeError):
            continue
    return terms


def record_to_row(json_obj, columns=head):
    """Render one decoded record as a ';'-separated line ending in a newline.

    Every column contributes its value followed by ';' (so the row ends with
    a trailing ';'); columns absent from *json_obj* contribute an empty cell.
    Semicolons inside values become commas, and tabs, carriage returns and
    newlines are removed from the finished row.

    Args:
        json_obj: dict decoded from the JSON part of an input line.
        columns: column names to emit, in order (defaults to ``head``).

    Returns:
        The row as a string, terminated by a newline.

    Raises:
        KeyError, IndexError, AttributeError: if "key" or "description" do
            not have the expected shape (a 'a/b/c'-style string, and a string
            or a dict with a "value" entry, respectively).
    """
    cells = []
    for column in columns:
        if column not in json_obj:
            cells.append("")
            continue
        value = json_obj[column]
        if column == "authors":
            # Lists are written using Python's list repr, e.g. ['OL1A'].
            cell = str(_author_ids(value))
        elif column == "key":
            cell = value.split("/")[2].replace(";", ",")
        elif column == "description":
            # A description is either plain text or a dict wrapping the text
            # under "value".
            text = value["value"] if isinstance(value, dict) else value
            cell = text.replace(";", ",")
        elif column == "subjects":
            cell = str(_subject_terms(json_obj))
        else:
            cell = str(value).replace(";", ",")
        cells.append(cell)
    row = "".join(cell + ";" for cell in cells)
    return row.translate(_ROW_BREAKERS) + "\n"


def convert(input_path="books_json.txt", output_path="books_long.csv", limit=None):
    """Convert a tab-separated dump with JSON in its fifth column to CSV.

    Args:
        input_path: UTF-8 text file; the fifth tab-separated field of each
            line must be a JSON object.
        output_path: UTF-8 file to (over)write with the header and one row
            per record.
        limit: optional maximum number of input lines to read (handy for a
            quick trial run); ``None`` reads the whole file.

    Raises:
        IndexError: if a non-blank line has fewer than five columns.
        ValueError: if the fifth column is not valid JSON.
    """
    # Context managers guarantee both files are closed (and the output
    # flushed) even when a malformed line raises half-way through.
    with open(input_path, "r", encoding="utf-8") as source:
        with open(output_path, "w", encoding="utf-8") as target:
            target.write("{}\n".format(";".join(head)))
            for count, line in enumerate(source):
                if limit is not None and count >= limit:
                    break
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing one) instead of
                    # crashing on the column lookup below.
                    continue
                record = json.loads(line.split("\t")[4])
                target.write(record_to_row(record))


if __name__ == "__main__":
    convert()