forked from phonedude/cs532-s17
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrabBlogs.sh
executable file
·46 lines (35 loc) · 1.02 KB
/
grabBlogs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
# grabBlogs.sh - download a set of Blogger front pages into data/blogs/ and
# record a "<file name> <uri>" index of them in data/blogList.txt.
# All paths are relative to the current working directory.

# Index of downloaded blogs, one "<file name> <uri>" pair per line.
blogList=data/blogList.txt
# Create the parent directory first: on a fresh checkout data/ does not exist
# yet and neither the index nor the blog directory could be created in it.
mkdir -p -- "${blogList%/*}"
# Start every run from an empty index.
rm -f -- "$blogList"
: > "$blogList"

# Directory that holds the downloaded HTML pages.
# The trailing slash matters: file names are appended directly to this value.
blogDir=data/blogs/
# Discard pages left over from a previous run. -f tolerates a missing
# directory; ${blogDir:?} aborts instead of expanding to an empty path.
rm -rf -- "${blogDir:?}"
mkdir -p -- "$blogDir"
# Seed blogs: fetched straight into $blogDir as b001.html and b002.html.
# A page is indexed only when its download succeeded, so the index never
# points at a missing or truncated file.
seedNum=0
for seedUri in "http://f-measure.blogspot.com/" "http://ws-dl.blogspot.com/"; do
  seedNum=$(( seedNum + 1 ))
  printf -v num '%03d' "$seedNum"
  if curl -s "$seedUri" > "${blogDir}b${num}.html"; then
    echo "b${num}.html ${seedUri}" >> "$blogList"
  else
    echo "warning: could not download ${seedUri}; not indexed" >&2
    # The redirection created the file even though curl failed.
    rm -f -- "${blogDir}b${num}.html"
  fi
done

# Remaining blogs (b003 .. b200): request Blogger's "next blog" redirect.
# -L follows the redirect, -o stores the page under data/, and
# -w '%{url_effective}' prints the final URI, which is all that reaches stdout.
nextBlogUri="http://www.blogger.com/next-blog?navBar=true&blogID=3471633091411211117"
for (( i = 3; i <= 200; i++ )); do
  # Zero-pad to three digits with the printf builtin (no external process).
  printf -v num '%03d' "$i"
  if uri=$(curl -Ls -o "data/b${num}.html" -w '%{url_effective}' "$nextBlogUri") \
    && [[ -n "$uri" ]]; then
    echo "b${num}.html ${uri}" >> "$blogList"
  else
    # Skip the index entry: a line with an empty URI would corrupt the
    # URI-based de-duplication that runs on this list afterwards.
    echo "warning: download for b${num}.html failed; not indexed" >&2
  fi
done
# Remove duplicate URIs from the index, then move the surviving pages.
# Pass 1 keeps one line per distinct URI (sort key = field 2 onward);
# pass 2 puts the index back into file-name order.
tempList=$(mktemp "${blogList}.XXXXXX") || exit 1
sort -u -k2 "$blogList" > "$tempList"
sort -k1 -o "$blogList" "$tempList"
rm -f -- "$tempList"

# Move every page that is still listed from data/ into $blogDir.
while read -r file _; do
  # b001.html and b002.html are downloaded directly into $blogDir, so they
  # are not present under data/; only move files that are actually there.
  if [[ -f "data/${file}" ]]; then
    mv -- "data/${file}" "$blogDir"
  fi
done < "$blogList"
# Clean up duplicate downloads: whatever *b*.html files are still sitting
# directly in ./data (not in subdirectories) were not moved and are deleted.
# Letting find invoke rm avoids word-splitting the result list, so paths
# containing whitespace or glob characters are handled correctly.
find ./data -maxdepth 1 -name "*b*.html" ! -type d -exec rm -f -- {} +