-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
executable file
·67 lines (50 loc) · 2.03 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
"""
This is a template for a Python scraper on morph.io (https://morph.io)
including some code snippets below that you should find helpful
Based on https://github.com/planningalerts-scrapers/noosa_council
"""
import os
import sys
# import scraperwiki
import lxml.html
import requests
from morph_planningalerts import DevelopmentApplication, MorphDatabase
# Listing page the scraper starts from when the MORPH_START_URL environment
# variable is not set (it is the fallback passed to main() by the entry point).
DEFAULT_START_URL = "http://engage.bayswater.wa.gov.au/planning-applications-public-advertising"
# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
def main(url, timeout=60):
    """Fetch the listing page at *url* and (eventually) store its applications.

    The function currently stops right after downloading and parsing the
    listing page: it prints the parsed lxml tree and exits with status 0.
    The record-saving loop below that point is kept as the intended pipeline
    but is not reachable yet.

    Args:
        url: Address of the page listing the advertised planning applications.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
        SystemExit: Always, with code 0, via the debugging short-circuit.
    """
    MorphDatabase.init()

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as if it
    # were the application listing.
    response.raise_for_status()
    tree = lxml.html.fromstring(response.content)

    # TODO: debugging short-circuit. Everything below is unreachable until it
    # is removed. It is left in place on purpose: get_application_links() and
    # extract_application_details() are not defined in this file yet, so
    # dropping the exit would only trade a clean stop for a NameError.
    print(tree)
    sys.exit(0)

    count_new = total = 0
    for application_url in get_application_links(url):
        # Every entry counts as processed, including the skipped ones.
        total += 1
        if not application_url:
            # Skipped entry...
            continue

        data = extract_application_details(application_url)
        # get_or_create() reports whether the record was newly inserted.
        application, created = DevelopmentApplication.get_or_create(**data)
        if not created:
            print("* Skipping {0.council_reference}".format(application))
        else:
            print("Saved {0.council_reference}".format(application))
            count_new += 1

    print("Added {0} records out of {1} processed.".format(count_new, total))
# Script entry point: the start page can be overridden through the
# MORPH_START_URL environment variable; otherwise the built-in default is used.
if __name__ == "__main__":
    start_url = os.environ.get('MORPH_START_URL', DEFAULT_START_URL)
    main(start_url)