-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathformat_books_from_xml.py
39 lines (34 loc) · 1.33 KB
/
format_books_from_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import xml.etree.ElementTree as ET
import goodreads_scrape
import goodreads_author_scrape
import json
# Convert the book export in data/raw_books.xml into a flat list of
# per-book records and write it to data/parsed_book_data.json.
#
# Notes on the output fields:
#   original_pub_year - the original publication year
#   my_rating         - 1-5; if it's 0 then I haven't read the book yet
#   average_rating    - the public rating (1-5)
#   author, author_birthplace - None (JSON null) when the entry carries
#                               no usable author data
books = []
tree = ET.parse('data/raw_books.xml')
root = tree.getroot()
# The entries iterated below live under the second child of the root.
reviews = root[1]

for i, child in enumerate(reviews):
    # Fields are addressed by position, so this relies on the exact child
    # order of the export file.
    book = child[1]
    book_id = book[0].text

    year = goodreads_scrape.get_original_publication_year(book_id)
    place = goodreads_scrape.get_setting(book_id)

    # Reset for every book. Without this, an entry lacking author data
    # would either raise NameError (first iteration) or silently reuse
    # the previous book's author and birthplace.
    author = None
    author_birthplace = None

    # NOTE(review): position 19 appears to be the list of authors and
    # only the first one is used -- confirm against the export schema.
    authors = book[19]
    # len() is used on purpose: truth-testing an Element is deprecated.
    if len(authors) > 0:
        first_author = authors[0]
        if len(first_author) > 0:
            # The first child's text is what the author scraper is keyed on.
            author_birthplace = goodreads_author_scrape.get_author_birthplace(
                first_author[0].text)
        # The name is read from index 1, so more than one child is needed;
        # its text can also be None for an empty element.
        if len(first_author) > 1 and first_author[1].text is not None:
            # Strip periods from the name (e.g. initials).
            author = first_author[1].text.replace(".", "")

    books.append({
        'title': book[4].text,
        'id': book_id,
        'original_pub_year': year,
        'my_rating': child[2].text,
        'average_rating': book[16].text,
        'setting': place,
        'author_birthplace': author_birthplace,
        'author': author,
    })
    # Progress indicator: index of the entry just processed.
    print(i)

# Debugging aid, left disabled:
#for book in books:
    #print(book['title'], ": ", book['original_pub_year'], book['setting'])

with open('data/parsed_book_data.json', 'w') as outfile:
    json.dump(books, outfile)