-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
102 lines (65 loc) · 2.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: UTF-8 -*-
'''
Created on Apr 27, 2012
@author: Rabih Kodeih
'''
from utils.decorators import measureExecutionTime
from htmlcrawler.hc import Site
@measureExecutionTime
def crawl(site):
    '''
    Purge ``site`` and then crawl it.

    Calls ``site.purge()`` followed by ``site.crawl()``, in that order.
    The function is wrapped by ``measureExecutionTime`` (imported from
    ``utils.decorators``) -- presumably to time the run; confirm against
    the decorator's definition.

    site -- object exposing ``purge()`` and ``crawl()`` methods.
    '''
    # NOTE(review): purge presumably clears output left by an earlier
    # run before crawling again -- verify in the Site implementation.
    site.purge()
    site.crawl()
if __name__ == '__main__':
    # Script entry point: build one Site from a fixed set of keyword
    # options and hand it to crawl().
    site_options = dict(
        home_page_url='http://www.ibm.com',
        site_name='snoak',
        www_path='./../output/www',
        depth=1,
        fetch_resources=True,
        # The following switches are all set to False for this run.
        process_inline_js=False,
        process_embedded_css=False,
        remove_comments=False,
        remove_ns_tags=False,
        randomize_text=False,
        open_home_page_in_browser=True,
    )
    s = Site(**site_options)
    crawl(s)
#===============================================================================
# Pipeline
#===============================================================================
#markup ='''
#<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
#<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US">
# <head>
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1256"></meta>
# <title>testing</title>
# <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
# <script type="text/javascript" src="some_source.js"></script>
# <script type="text/javascript">
# <!--
# alert('hello world!');
# // -->
# </script>
#
# <style type="text/css">
#
# body {
# background-color: #FF00FF;
# color: blue; }</style>
# <div><p><!-- this is comment 1 -->some tail</p></div>
# </head>
#<body>
#<p>blabla & </p>
# <div><script src='rabih.js'>var x=99; alert(x);</script></div>
# <div class='clear-fix'>
# some div text
# <p>some <i>paragraph</i> text 1 <b>and</b> text 2</p>
#<p>some paragraph text 2</p>
#end of div
#<!-- this is comment 2 -->
# <style type="text/css">div { color: red; }</style>
# <style type="text/css"></style>
# <link rel="alternate" type="application/rss+xml" title="Flex.org RSS Feed" href="http://flex.org/feed/" />
# </div>
# </body>
#</html>
#'''
#markup = read_file('../stealer/output/markup.html')