forked from dataquestio/twitter-scrape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
69 lines (59 loc) · 2.07 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import settings
import tweepy
import dataset
from textblob import TextBlob
from sqlalchemy.exc import ProgrammingError
import json
db = dataset.connect(settings.CONNECTION_STRING)
class StreamListener(tweepy.StreamListener):
def on_status(self, status):
if status.retweeted:
return
description = status.user.description
loc = status.user.location
text = status.text
coords = status.coordinates
geo = status.geo
name = status.user.screen_name
user_created = status.user.created_at
followers = status.user.followers_count
id_str = status.id_str
created = status.created_at
retweets = status.retweet_count
bg_color = status.user.profile_background_color
blob = TextBlob(text)
sent = blob.sentiment
if geo is not None:
geo = json.dumps(geo)
if coords is not None:
coords = json.dumps(coords)
table = db[settings.TABLE_NAME]
try:
table.insert(dict(
user_description=description,
user_location=loc,
coordinates=coords,
text=text,
geo=geo,
user_name=name,
user_created=user_created,
user_followers=followers,
id_str=id_str,
created=created,
retweet_count=retweets,
user_bg_color=bg_color,
polarity=sent.polarity,
subjectivity=sent.subjectivity,
))
except ProgrammingError as err:
print(err)
def on_error(self, status_code):
if status_code == 420:
#returning False in on_data disconnects the stream
return False
auth = tweepy.OAuthHandler(settings.TWITTER_APP_KEY, settings.TWITTER_APP_SECRET)
auth.set_access_token(settings.TWITTER_KEY, settings.TWITTER_SECRET)
api = tweepy.API(auth)
stream_listener = StreamListener()
stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
stream.filter(track=settings.TRACK_TERMS)