getURL.py
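
# Python 2 script: reads seed URLs (one per line) from inURLs.txt, crawls each
# seed's site within the same root domain, and appends up to ten links per seed
# that look like event pages to outURLs.txt.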
import urllib
from urlparse import urlparse
from urlparse import urljoin
from collections import Counter
import re

# relative-looking paths: lowercase letters, digits, slashes, hyphens, underscores
reg = re.compile(r'[a-z0-9/\-_]+$')

# seed URLs, one per line
with open('inURLs.txt') as f:
    content = f.readlines()
URLs = [line.strip() for line in content]
## isEvent is tuned mainly for the workshopsf.org link. For other sites I
## would check word counts for several keywords instead; trying to make this
## function more general produces too many false positives.
def isEvent(body):
    c = Counter(w.lower() for w in re.findall('(workshop)', body))
    return sum(c.values()) > 3
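
# getnewLinks fetches one page and returns the links found on it that stay on
# the same root domain and have not already been seen (prevLinks).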
def getnewLinks(page, prevLinks, rootdomain):
    try:
        # strip a trailing slash before fetching
        response = urllib.urlopen(page.rstrip('/'))
    except IOError, e:
        print e
        return []
    html = response.read()
    links, pos, allFound = [], 0, False
    # scan the raw HTML for <a href="..."> anchors
    while not allFound:
        aTag = html.find("<a href=", pos)
        if aTag > -1:
            href = html.find('"', aTag + 1)
            endHref = html.find('"', href + 1)
            url = html[href + 1:endHref]
            if len(url) > 0:
                # resolve relative links against the current page
                if url[:7] != "http://" and bool(reg.match(url)):
                    url = urljoin(page, url)
                if url[:7] == "http://":
                    if url[-1] == "/":
                        url = url[:-1]
                    # keep only unseen links on the same root domain
                    if url not in links and url not in prevLinks and rootdomain in url:
                        links.append(url)
            closeTag = html.find("</a>", aTag)
            pos = closeTag + 1
        else:
            allFound = True
    return links
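
# For each seed URL: crawl within its root domain, breadth first, and collect
# up to ten links that look like event pages, either because the URL contains
# "event" or because the page body passes the isEvent heuristic.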
for seed in URLs:
    print 'looking at ' + seed
    # root domain is the last two labels of the host, e.g. "workshopsf.org"
    rootdomain = '.'.join(urlparse(seed).netloc.split('.')[-2:])
    if seed[-1] == "/":
        seed = seed[:-1]
    # queue the seed plus each of its ancestor directories; 11 is len("http://www.")
    toCrawl = [seed]
    prior = seed
    while len(prior) > len(seed[0:11 + len(rootdomain)]):
        prior = prior[0:prior.rfind('/')]
        toCrawl.append(prior + '/')
    otherevents = []
    crawled = []
    while toCrawl:
        if len(otherevents) >= 10:
            break
        url = toCrawl.pop(0)
        #print url
        crawled.append(url)
        newLinks = getnewLinks(url, crawled, rootdomain)
        toCrawl = toCrawl + newLinks
        for link in newLinks:
            if link not in otherevents and seed not in link:
                if "event" in link:
                    otherevents.append(link)
                else:
                    try:
                        response = urllib.urlopen(link)
                    except IOError, e:
                        print e
                        continue
                    body = response.read()
                    if isEvent(body):
                        otherevents.append(link)
    with open("outURLs.txt", "a") as out:
        for event in otherevents:
            out.write(event + '\n')