Skip to content

Commit

Permalink
Fix all Flake8 errors, fixes Charcoal-SE#43.
Browse files Browse the repository at this point in the history
  • Loading branch information
thomas-daniels committed Jan 25, 2015
1 parent 127d2f7 commit 2c3b9fd
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 101 deletions.
3 changes: 1 addition & 2 deletions bayesianfuncs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from bayesian.classify import Classify
from bayesian.learn import Learn
from parsing import fetch_title_from_msg_content


def bayesian_score(title):
Expand All @@ -24,4 +23,4 @@ def bayesian_learn_title(title, doctype):
bayesian_learn.execute()
return True
except:
return False
return False
19 changes: 13 additions & 6 deletions bodyfetcher.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
from spamhandling import *
from chatcommunicate import *
from datahandling import *
from spamhandling import handle_spam, check_if_spam
from globalvars import GlobalVars
import json
import time
import requests


class BodyFetcher:
queue = {}

specialCases = {"stackoverflow.com" : 5, "serverfault.com" : 5, "superuser.com" : 5, "drupal.stackexchange.com" : 1, "meta.stackexchange.com" : 1}
specialCases = {"stackoverflow.com": 5,
"serverfault.com": 5,
"superuser.com": 5,
"drupal.stackexchange.com": 1,
"meta.stackexchange.com": 1}

threshold = 2

def add_to_queue(self, post):
d=json.loads(json.loads(post)["data"])
d = json.loads(json.loads(post)["data"])
sitebase = d["siteBaseHostAddress"]
postid = d["id"]
if sitebase in self.queue:
Expand Down Expand Up @@ -44,7 +51,7 @@ def print_queue(self):

def make_api_call_for_site(self, site):
posts = self.queue.pop(site)
url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
# wait to make sure API has/updates post data
time.sleep(30)
response = requests.get(url).json()
Expand Down
3 changes: 2 additions & 1 deletion datahandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def is_auto_ignored_post(postid_site_tuple):
return True
return False


def is_privileged(room_id_str, user_id_str):
    """Return True if the user is listed as privileged for the given room.

    Looks up ``room_id_str`` in ``GlobalVars.privileged_users`` and, when the
    room is present, checks whether ``user_id_str`` is among that room's
    entries. Returns False for rooms that have no entry at all.
    """
    # Guard first so an unknown room never triggers a KeyError below.
    if room_id_str not in GlobalVars.privileged_users:
        return False
    return user_id_str in GlobalVars.privileged_users[room_id_str]

Expand Down Expand Up @@ -123,4 +124,4 @@ def has_already_been_posted(host, post_id, title):
for post in GlobalVars.latest_questions:
if post[0] == host and post[1] == str(post_id) and post[2] == title:
return True
return False
return False
2 changes: 2 additions & 0 deletions excepthook.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ def install_thread_excepthook():
since this replaces a new-style class method.
"""
init_old = threading.Thread.__init__

def init(self, *args, **kwargs):
init_old(self, *args, **kwargs)
run_old = self.run

def run_with_except_hook(*args, **kw):
try:
run_old(*args, **kw)
Expand Down
36 changes: 18 additions & 18 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,24 @@

class FindSpam:
rules = [
{'regex': u"(?i)\\b(baba(ji)?|fifa.*coins?|nike|tosterone|bajotz|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|rejuvenated skin|LifeForce|swtor2credits|me2.do|black magic|bam2u|Neuro(3X|flexyn)|Nutra|TesteroneXL|Bowtrol|Slim ?Genix|Cleanse EFX|Babyliss ?Pro|Forskolin|Blackline Elite|TestCore Pro|Xtreme Antler|Maxx Test 3000|Cheap Wigs?|jivam|(Improve )?Brain Power|aging skin|acne( prone)? skin|(skin )?eye serum|skin (serum|eye)|(fake|original) (passports?|driver'?s? licen[cs]e|ID cards?)|bagprada)\\b|ಌ|(support|service|helpline)( phone)? number|1[ -]?866[ -]?978[ -]?6819|>>>>(?s).*http", 'all': True,
'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'body': True, 'username': True},
{'regex': u"(?i)\\b(fifabay)\\b", 'all': True, 'reason': "Bad keyword in {}", 'sites': [], 'title': True, 'body': True, 'username': True},
{'regex': u"(?i)\\b(weight (body ?builder|loo?s[es]|reduction)|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"(?i)^(?:(?=.*?\\b(?:online|hd)\\b)(?=.*?(?:free|full|unlimited)).*?movies?\\b|(?=.*?\\b(?:acai|kisn)\\b)(?=.*?care).*products?\\b|(?=.*?packer).*mover)", 'all': True,
'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}|\\d[ -]?\\d{3}[ -]?\\d{3}[ -]?\\d{4}", 'all': True,
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'check_phone_numbers', 'title': True, 'body': False, 'username': False},
{'regex': u"(?i)\\b(nigg(a|er)|asshole|fag|fuck(ing?)?|shit|whore)s?\\b", 'all': True,
'sites': [], 'reason': "Offensive {} detected", 'insensitive':True, 'title': True, 'body': True, 'username': False},
{'regex': u"(?i)\\b(crap)\\b", 'all': True, 'sites': [], 'reason': "Offensive {} detected", 'insensitive': True, 'title': True, 'body': False, 'username': False},
{'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
{'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'body': False, 'username': False},
{'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'body': False, 'username': False},
{'regex': u"(?i)(online ?kelas|wowtoes|ipubsoft|orabank|powerigfaustralia|cfpchampionship2015playofflive|optimalstackfacts|maletestosteronebooster|x4facts|tripleeffectseyeserum|healthcaresup|garciniacambogiaprofacts|filerepairforum|lxwpro-t|casque-beatsbydre|tenderpublish|elliskinantiaging|funmac|lovebiscuits)", 'sites': [], 'all': True, 'reason': "Blacklisted website", 'title': True, 'body': True, 'username': True},
{'regex': u"([a-zA-Z])\\1{10,}", 'all': True, 'sites': [], 'reason': "Repeating characters in {}", 'title': True, 'body': True, 'username': False}
{'regex': u"(?i)\\b(baba(ji)?|fifa.*coins?|nike|tosterone|bajotz|vashi?k[ae]r[ae]n|sumer|kolcak|porn|molvi|judi bola|ituBola.com|lost lover|11s|acai|skin care|rejuvenated skin|LifeForce|swtor2credits|me2.do|black magic|bam2u|Neuro(3X|flexyn)|Nutra|TesteroneXL|Bowtrol|Slim ?Genix|Cleanse EFX|Babyliss ?Pro|Forskolin|Blackline Elite|TestCore Pro|Xtreme Antler|Maxx Test 3000|Cheap Wigs?|jivam|(Improve )?Brain Power|aging skin|acne( prone)? skin|(skin )?eye serum|skin (serum|eye)|(fake|original) (passports?|driver'?s? licen[cs]e|ID cards?)|bagprada)\\b|ಌ|(support|service|helpline)( phone)? number|1[ -]?866[ -]?978[ -]?6819|>>>>(?s).*http", 'all': True,
'sites': [], 'reason': "Bad keyword in {}", 'title': True, 'body': True, 'username': True},
{'regex': u"(?i)\\b(fifabay)\\b", 'all': True, 'reason': "Bad keyword in {}", 'sites': [], 'title': True, 'body': True, 'username': True},
{'regex': u"(?i)\\b(weight (body ?builder|loo?s[es]|reduction)|muscles? build(ing)?|muscles?( (grow(th)?|diets?))?|anti aging|SkinCentric|loo?s[es] weight|wrinkles?)\\b", 'all': True,
'sites': ["fitness.stackexchange.com"], 'reason': "Bad keyword in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"(?i)^(?:(?=.*?\\b(?:online|hd)\\b)(?=.*?(?:free|full|unlimited)).*?movies?\\b|(?=.*?\\b(?:acai|kisn)\\b)(?=.*?care).*products?\\b|(?=.*?packer).*mover)", 'all': True,
'sites': [], 'reason': "Bad keywords in {}", 'title': True, 'body': False, 'username': True},
{'regex': u"\\d(?:_*\\d){9}|\\+?\\d_*\\d[\\s\\-]?(?:_*\\d){8,10}|\\d[ -]?\\d{3}[ -]?\\d{3}[ -]?\\d{4}", 'all': True,
'sites': ["patents.stackexchange.com"], 'reason': "Phone number detected", 'validation_method': 'check_phone_numbers', 'title': True, 'body': False, 'username': False},
{'regex': u"(?i)\\b(nigg(a|er)|asshole|fag|fuck(ing?)?|shit|whore)s?\\b", 'all': True,
'sites': [], 'reason': "Offensive {} detected", 'insensitive':True, 'title': True, 'body': True, 'username': False},
{'regex': u"(?i)\\b(crap)\\b", 'all': True, 'sites': [], 'reason': "Offensive {} detected", 'insensitive': True, 'title': True, 'body': False, 'username': False},
{'regex': u"^(?=.*[A-Z])[^a-z]*$", 'all': True, 'sites': [], 'reason': "All-caps title", 'title': True, 'username': False},
{'regex': u"^(?=.*[0-9])[^a-zA-Z]*$", 'all': True, 'sites': [], 'reason': "Numbers-only title", 'title': True, 'body': False, 'username': False},
{'regex': u"https?://[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,4}(/[a-zA-Z0-9_/?=.-])?", 'all': True,
'sites': ["stackoverflow.com", "superuser.com", "askubuntu.com"], 'reason': "URL in title", 'title': True, 'body': False, 'username': False},
{'regex': u"(?i)(online ?kelas|wowtoes|ipubsoft|orabank|powerigfaustralia|cfpchampionship2015playofflive|optimalstackfacts|maletestosteronebooster|x4facts|tripleeffectseyeserum|healthcaresup|garciniacambogiaprofacts|filerepairforum|lxwpro-t|casque-beatsbydre|tenderpublish|elliskinantiaging|funmac|lovebiscuits)", 'sites': [], 'all': True, 'reason': "Blacklisted website", 'title': True, 'body': True, 'username': True},
{'regex': u"([a-zA-Z])\\1{10,}", 'all': True, 'sites': [], 'reason': "Repeating characters in {}", 'title': True, 'body': True, 'username': False}
]

@staticmethod
Expand Down
18 changes: 9 additions & 9 deletions globalvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,18 @@ class GlobalVars:
blockedTime = 0
charcoal_room_id = "11540"
meta_tavern_room_id = "89"
site_filename = { "electronics.stackexchange.com" : "ElectronicsGood.txt",
"gaming.stackexchange.com" : "GamingGood.txt", "german.stackexchange.com" : "GermanGood.txt",
"italian.stackexchange.com" : "ItalianGood.txt", "math.stackexchange.com" : "MathematicsGood.txt",
"spanish.stackexchange.com" : "SpanishGood.txt", "stats.stackexchange.com" : "StatsGood.txt" }
site_filename = {"electronics.stackexchange.com": "ElectronicsGood.txt",
"gaming.stackexchange.com": "GamingGood.txt", "german.stackexchange.com": "GermanGood.txt",
"italian.stackexchange.com": "ItalianGood.txt", "math.stackexchange.com": "MathematicsGood.txt",
"spanish.stackexchange.com": "SpanishGood.txt", "stats.stackexchange.com": "StatsGood.txt"}
parser = HTMLParser.HTMLParser()
wrap = Client("stackexchange.com")
wrapm = Client("meta.stackexchange.com")
privileged_users = { charcoal_room_id: ["117490", "66258", "31768","103081","73046","88521","59776", "31465"],
meta_tavern_room_id: ["259867", "244519", "244382", "194047", "158100", "178438", "237685",
"215468", "229438", "180276", "161974", "244382", "186281", "266094",
"245167", "230261", "213575", "241919", "203389", "202832"] }
smokeDetector_user_id = { charcoal_room_id: "120914", meta_tavern_room_id: "266345" }
privileged_users = {charcoal_room_id: ["117490", "66258", "31768", "103081", "73046", "88521", "59776", "31465"],
meta_tavern_room_id: ["259867", "244519", "244382", "194047", "158100", "178438", "237685",
"215468", "229438", "180276", "161974", "244382", "186281", "266094",
"245167", "230261", "213575", "241919", "203389", "202832"]}
smokeDetector_user_id = {charcoal_room_id: "120914", meta_tavern_room_id: "266345"}
commit = os.popen("git log --pretty=format:'%h' -n 1").read()
commit_with_author = os.popen("git log --pretty=format:'%h (%cn: *%s*)' -n 1").read()
on_master = os.popen("git rev-parse --abbrev-ref HEAD").read().strip() == "master"
Expand Down
30 changes: 18 additions & 12 deletions spamhandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
import sys
import time
from findspam import FindSpam
from datahandling import *
from parsing import get_user_from_url, unescape_title, escape_special_chars_in_title
from bayesianfuncs import *
from datahandling import add_auto_ignored_post, is_blacklisted_user, \
is_whitelisted_user, has_already_been_posted, is_false_positive, \
is_auto_ignored_post, is_ignored_post, append_to_latest_questions
from parsing import get_user_from_url, unescape_title,\
escape_special_chars_in_title
from bayesianfuncs import bayesian_score
from globalvars import GlobalVars
from datetime import datetime


def check_if_spam(title, body, user_name, user_url, post_site, post_id, is_answer):
Expand All @@ -18,30 +23,31 @@ def check_if_spam(title, body, user_name, user_url, post_site, post_id, is_answe
or is_whitelisted_user(get_user_from_url(user_url)) \
or is_ignored_post((post_id, post_site)) \
or is_auto_ignored_post((post_id, post_site)):
return False, None # Don't repost. Reddit will hate you.
return False, None # Don't repost. Reddit will hate you.
return True, test
return False, None


def check_if_spam_json(data):
d = json.loads(json.loads(data)["data"])
try:
_ = d["ownerUrl"]
_ = d["ownerUrl"] # noqa
except:
return False, None # owner's account doesn't exist anymore, no need to post it in chat:
# http://chat.stackexchange.com/transcript/message/18380776#18380776
# owner's account doesn't exist anymore, no need to post it in chat:
# http://chat.stackexchange.com/transcript/message/18380776#18380776
return False, None
title = d["titleEncodedFancy"]
title = unescape_title(title)
poster = d["ownerDisplayName"]
url = d["url"]
post_id = str(d["id"])
print time.strftime("%Y-%m-%d %H:%M:%S"),title.encode("ascii",errors="replace")
print time.strftime("%Y-%m-%d %H:%M:%S"), title.encode("ascii", errors="replace")
quality_score = bayesian_score(title)
print quality_score
if quality_score < 0.3 and d["siteBaseHostAddress"] == "stackoverflow.com":
print GlobalVars.bayesian_testroom.send_message("[ SmokeDetector | BayesianBeta ] Quality score " + str(quality_score * 100) + ": [" + title + "](" + url + ")")
site = d["siteBaseHostAddress"]
site = site.encode("ascii",errors="replace")
site = site.encode("ascii", errors="replace")
sys.stdout.flush()
is_spam, reason = check_if_spam(title, None, poster, url, site, post_id, False)
return is_spam, reason
Expand All @@ -62,8 +68,8 @@ def handle_spam(title, poster, site, post_url, poster_url, post_id, reasons, is_
try:
title = escape_special_chars_in_title(title)
s = "[ [SmokeDetector](https://github.com/Charcoal-SE/SmokeDetector) ] %s: [%s](%s) by [%s](%s) on `%s`" % \
(reason, title.strip(), post_url, poster.strip(), poster_url, site)
print GlobalVars.parser.unescape(s).encode('ascii',errors='replace')
(reason, title.strip(), post_url, poster.strip(), poster_url, site)
print GlobalVars.parser.unescape(s).encode('ascii', errors='replace')
if time.time() >= GlobalVars.blockedTime:
append_to_latest_questions(site, post_id, title)
GlobalVars.charcoal_hq.send_message(s)
Expand All @@ -78,7 +84,7 @@ def handle_spam(title, poster, site, post_url, poster_url, post_id, reasons, is_

def handle_spam_json(data, reason):
try:
d=json.loads(json.loads(data)["data"])
d = json.loads(json.loads(data)["data"])
title = unescape_title(d["titleEncodedFancy"])
poster = d["ownerDisplayName"]
site = d["siteBaseHostAddress"]
Expand Down
2 changes: 1 addition & 1 deletion test/srccheckbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ def get_smokedetector_root():
smokedetector_root = "../"
else:
return False, ""
return True, smokedetector_root
return True, smokedetector_root
6 changes: 4 additions & 2 deletions test/test_parsing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from parsing import *
from parsing import fetch_post_id_and_site_from_msg_content, \
escape_special_chars_in_title, unescape_title, get_user_from_url, \
fetch_owner_url_from_msg_content, fetch_title_from_msg_content, \
fetch_post_url_from_msg_content
import pytest

test_data_inputs = []
Expand Down Expand Up @@ -33,6 +36,5 @@
(test_data_inputs[3], fetch_owner_url_from_msg_content, 'http://stackoverflow.com/users/3754535/user3754535'),
(test_data_inputs[3], fetch_title_from_msg_content, "Why I can't insert data in a model from a custom controller?")
])

def test_parsing(input_data, parse_method, expected):
assert parse_method(input_data) == expected
Loading

0 comments on commit 2c3b9fd

Please sign in to comment.