diff --git a/WASEHTMLParser.py b/WASEHTMLParser.py
index 59226b7..c7cd481 100644
--- a/WASEHTMLParser.py
+++ b/WASEHTMLParser.py
@@ -8,6 +8,21 @@ def add_attrs(attrNames, attrList):
     return [a[1] for a in filter(lambda attr: attr[0] in attrNames, attrList)]
 
 
+def has_attr(attrs, attr):
+    """Return True if any (name, value) pair in attrs is named attr."""
+    return any(kv[0] == attr for kv in attrs)
+
+def attr_val_is(attrs, attr, val):
+    """Return True if the first attribute named attr has the value val.
+
+    attrs is the sequence of (name, value) pairs handed to
+    handle_starttag; a missing attribute yields False.
+    """
+    for kv in attrs:
+        if kv[0] == attr:
+            return kv[1] == val
+    return False
+
 class WASEHTMLParser(HTMLParser, object):
     def reset(self):
         self.doctype = set()
@@ -28,33 +43,36 @@ def handle_decl(self, decl):
 
     def handle_starttag(self, tag, attrs):
         if tag == "iframe":
-            self.frames = self.frames.union(add_attrs(["src"], attrs))
+            self.frames.update(add_attrs(["src"], attrs))
         elif tag == "base":
-            self.base = self.base.union(add_attrs(["href"], attrs))
-        elif tag == "link" and "rel" in attrs and attrs["rel"] == "stylesheet":
-            self.stylesheets = self.stylesheets.union(add_attrs(["href"], attrs))
+            self.base.update(add_attrs(["href"], attrs))
+        elif tag == "link" and attr_val_is(attrs, "rel", "stylesheet"):
+            self.stylesheets.update(add_attrs(["href"], attrs))
         elif tag == "script":
-            self.scripts = self.scripts.union(add_attrs(["src"], attrs))
+            self.scripts.update(add_attrs(["src"], attrs))
         elif tag == "a" or tag == "area":
-            self.links = self.links.union(add_attrs(["href"], attrs))
+            self.links.update(add_attrs(["href"], attrs))
         elif tag == "img" or tag == "input":
-            self.images = self.images.union(add_attrs(["src"], attrs))
+            self.images.update(add_attrs(["src"], attrs))
+            # "input" matches here first, so its formaction must be collected in this branch.
+            if tag == "input":
+                self.formactions.update(add_attrs(["formaction"], attrs))
         elif tag == "svg" or tag == "image":
-            self.images = self.images.union(add_attrs(["href", "xlink:href"], attrs))
+            self.images.update(add_attrs(["href", "xlink:href"], attrs))
         elif tag == "audio":
-            self.audio = self.audio.union(add_attrs(["src"], attrs))
+            self.audio.update(add_attrs(["src"], attrs))
         elif tag == "video":
-            self.video = self.video.union(add_attrs(["src"], attrs))
+            self.video.update(add_attrs(["src"], attrs))
         elif tag == "object":
-            self.objects = self.objects.union(add_attrs(["data"], attrs))
+            self.objects.update(add_attrs(["data"], attrs))
         elif tag == "embed":
-            self.objects = self.objects.union(add_attrs(["src"], attrs))
+            self.objects.update(add_attrs(["src"], attrs))
         elif tag == "applet":
-            self.objects = self.objects.union(add_attrs(["code"], attrs))
+            self.objects.update(add_attrs(["code"], attrs))
         elif tag == "form":
-            self.formactions = self.formactions.union(add_attrs(["action"], attrs))
-        elif tag == "input" or tag == "button":
-            self.formactions = self.formactions.union(add_attrs(["formaction"], attrs))
+            self.formactions.update(add_attrs(["action"], attrs))
+        elif tag == "button":
+            self.formactions.update(add_attrs(["formaction"], attrs))
         else:
             return
 
diff --git a/elasticsearch-py b/elasticsearch-py
index 1c7b23e..51defea 160000
--- a/elasticsearch-py
+++ b/elasticsearch-py
@@ -1 +1 @@
-Subproject commit 1c7b23e2141c02e24a670fd93b033faf62943c8f
+Subproject commit 51defea8c9e1f5d664c879071d027cd5761630ab