-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping.py
32 lines (27 loc) · 985 Bytes
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from bs4 import BeautifulSoup as bs
import urllib3
def GetArticleText(art_url):
    """Fetch an article page and return the text of its ``storytext`` div.

    Args:
        art_url: Absolute URL of the article page to download.

    Returns:
        The text content of the ``<div id="storytext">`` element with all
        newline characters removed, or an empty string when the page has no
        such element.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', art_url)
    # The file imports BeautifulSoup under the alias ``bs``; the bare name
    # ``BeautifulSoup`` is not bound and would raise NameError.
    soup = bs(response.data.decode('utf-8'), 'lxml')
    story = soup.find('div', {'id': 'storytext'})
    if story is None:
        # find() returns None when nothing matches; avoid AttributeError.
        return ""
    return story.get_text().replace("\n", "")
def ScrapeNPR():
    """Download the index page, then print and analyse each linked article.

    For every ``<h2 class="title">`` heading on the index page that contains
    a link, the link text and target are printed, the article text is
    fetched with ``GetArticleText``, and the text is passed to
    ``SentimentAnalysis`` and ``Summary``. A dashed separator line is printed
    after each article.
    """
    http = urllib3.PoolManager()
    # NOTE(review): despite the variable and function names, this URL points
    # at a Wikipedia page rather than NPR -- confirm the intended index page.
    npr_url = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States"
    response = http.request('GET', npr_url)
    # The file imports BeautifulSoup under the alias ``bs``; the parser name
    # is 'lxml' (the original 'lmxl' is not a parser bs4 recognises).
    soup = bs(response.data.decode('utf-8'), 'lxml')
    headings = soup.find_all('h2', attrs={'class': 'title'})
    for heading in headings:
        link = heading.find('a')
        if link is None or not link.has_attr('href'):
            # Heading without a usable link: nothing to fetch, skip it.
            continue
        title = link.string
        art_url = link['href']
        article = GetArticleText(art_url)
        print(title)
        print(art_url)
        # NOTE(review): SentimentAnalysis and Summary are not defined in the
        # visible part of this file -- confirm they are provided elsewhere.
        SentimentAnalysis(article)
        Summary(article)
        print("-------------------------")
def main():
    """Script entry point: run the scrape via ``ScrapeNPR``."""
    ScrapeNPR()


# Only run when executed as a script, so importing this module does not
# immediately start network requests.
if __name__ == "__main__":
    main()