#!/usr/bin/python
# -*- coding: utf-8 -*-

# Import of the software.o.o downloads log into PIWIK
import argparse
import cPickle
import datetime
import hashlib
import os.path
import re

import bsddb3

import dblist

# Used to filter only the lines that actually reference a download
RE_ISO = re.compile(r'/distribution/[^/]*/iso/[^ "]')


def week_dbname(date_):
    """Name the weekly db. Return a YYYYWW string."""
    # Group the date at the end of the week, so some dates can appear
    # in a different year.  For example, 31/12/2012 will appear as
    # 201301, the first week of 2013.
    delta = datetime.timedelta(days=6 - date_.weekday())
    week_date = date_ + delta
    # isocalendar() returns (ISO year, ISO week number, ISO weekday)
    iso_year, iso_week, _ = week_date.isocalendar()
    return '%04d%02d' % (iso_year, iso_week)


def month_dbname(date_):
    """Name the monthly db. Return a YYYYMM string."""
    return '%04d%02d' % (date_.year, date_.month)
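
# Illustrative sanity check of the naming scheme (these calls are an
# example added for clarity, not part of the original pipeline); the
# 31/12/2012 case is the one described in the comment above:
#   week_dbname(datetime.date(2012, 12, 31))   -> '201301'
#   month_dbname(datetime.date(2012, 12, 31))  -> '201212'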


def count_analysis(key, dics):
    """Increment the counter stored under `key` in each accumulator dict."""
    for d in dics:
        d[key] = d.get(key, 0) + 1


def set_analysis(key, item, dics):
    """Add `item` to the set stored under `key` in each accumulator dict."""
    for d in dics:
        items = d.get(key, set())
        items.add(item)
        d[key] = items
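
# Illustrative usage (example added for clarity, not in the original
# script): one call updates every period accumulator at once.
#   counts = {}
#   count_analysis('foo.iso', [counts])         # counts['foo.iso'] == 1
#   count_analysis('foo.iso', [counts])         # counts['foo.iso'] == 2
#   ips = {}
#   set_analysis('foo.iso', '10.0.0.1', [ips])
#   set_analysis('foo.iso', '10.0.0.1', [ips])  # duplicates collapse
#   # ips['foo.iso'] == set(['10.0.0.1'])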


class PDict(dict):
    """Dict that persists itself to disk as a pickle file."""

    def __init__(self, path, *args, **kwargs):
        self.path = path
        super(PDict, self).__init__(*args, **kwargs)
        try:
            with open(path, 'rb') as pkl:
                _dict = cPickle.load(pkl)
        except (IOError, EOFError, cPickle.UnpicklingError):
            # No previous accumulator on disk (or unreadable): start empty
            _dict = {}
        self.update(_dict)

    def save(self):
        with open(self.path, 'wb') as pkl:
            cPickle.dump(self, pkl, cPickle.HIGHEST_PROTOCOL)
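
# Minimal usage sketch (the file name below is hypothetical).  A PDict
# transparently reloads whatever a previous run saved at the same path:
#   d = PDict('/tmp/example.pkl')
#   d['key'] = d.get('key', 0) + 1
#   d.save()   # the next PDict('/tmp/example.pkl') starts from this state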


def analyze(dbenv, dbname, day):
    """Read every line and analyze the data (D/W/M)."""
    # Create / open the accumulators.  Use pickle objects for performance
    results = os.path.join(dbenv, 'results')
    # download_dicts = [PDict(os.path.join(results, '%s_download_%s.pkl' % (dbname, period)))
    #                   for period in ('day',)]  # 'week', 'month')]
    download_ip_dicts = [PDict(os.path.join(results, '%s_download_ip_%s.pkl' % (dbname, period)))
                         for period in ('day',)]  # 'week', 'month')]
    # uuid_dicts = [PDict(os.path.join(results, '%s_uuid_%s.pkl' % (dbname, period)))
    #               for period in ('day',)]  # 'week', 'month')]
    # ip_dicts = [PDict(os.path.join(results, '%s_ip_%s.pkl' % (dbname, period)))
    #             for period in ('day',)]  # 'week', 'month')]
    # medium_dicts = [PDict(os.path.join(results, '%s_medium_%s.pkl' % (dbname, period)))
    #                 for period in ('day',)]  # 'week', 'month')]
    # arch_dicts = [PDict(os.path.join(results, '%s_arch_%s.pkl' % (dbname, period)))
    #               for period in ('day',)]  # 'week', 'month')]

    # Open the lines databases (read-only)
    lines = dblist.open(None, os.path.join(dbenv, dbname), flags=bsddb3.db.DB_RDONLY)
    lines_path = dblist.open(None, os.path.join(dbenv, dbname + '_paths'), flags=bsddb3.db.DB_RDONLY)
    # lines_uuid = dblist.open(None, os.path.join(dbenv, dbname + '_uuids'), flags=bsddb3.db.DB_RDONLY)

    # Read the bots file (one user agent per line)
    with open('bots.txt') as bots_file:
        bots = set(l.strip() for l in bots_file)

    # Recover the path information: md5 digest -> original path
    paths = {hashlib.md5(path).digest(): path for path in lines_path}

    for line in lines:
        # A line is a tuple with this schema:
        #   ip, hour, minute, second, md5_path, status, size,
        #   referrer, user_agent, md5_uuid, medium, version, arch
        (ip, _, _, _, md5_path, status, _,
         _, user_agent, md5_uuid, medium, version, arch) = line
        if user_agent in bots:
            continue
        path = paths[md5_path]
        if RE_ISO.match(path) and status != 404:
            # count_analysis(path, download_dicts)
            set_analysis(path, ip, download_ip_dicts)
            # if medium:
            #     count_analysis(medium, medium_dicts)
            # if arch:
            #     count_analysis(arch, arch_dicts)
            # if version:
            #     set_analysis(version, md5_uuid, uuid_dicts)
            #     set_analysis(version, ip, ip_dicts)

    # Persist the accumulators
    for dicts in (download_ip_dicts,):  # (download_dicts, download_ip_dicts, uuid_dicts, ip_dicts, medium_dicts, arch_dicts):
        for d in dicts:
            d.save()

    lines.close()
    lines_path.close()
    # lines_uuid.close()
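
# Reading an accumulator back (illustrative sketch; the file name is
# hypothetical and depends on the db name passed on the command line):
#   d = PDict(os.path.join('dbenv', 'results', '20130101_download_ip_day.pkl'))
#   for path, ips in d.iteritems():
#       print path, len(ips)   # unique downloader IPs per ISO path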


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Analyze a single bdb log file into bdb')
    parser.add_argument('--dbenv', default='dbenv', help='Database environment')
    parser.add_argument('--db', help='Name of the database to read the information from')
    args = parser.parse_args()

    # dbenv = bsddb3.db.DBEnv()
    # dbenv.open(args.dbenv,
    #            bsddb3.db.DB_INIT_MPOOL
    #            | bsddb3.db.DB_CREATE)

    # The database name is expected to encode the date as YYYYMMDD
    year, month, day = (int(x) for x in (args.db[:4], args.db[4:6], args.db[6:]))
    day = datetime.datetime(year=year, month=month, day=day)
    analyze(args.dbenv, args.db, day)
    # dbenv.close()
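
# Example invocation (assumes the db name encodes the date as YYYYMMDD,
# as the parsing above implies):
#   python analyze.py --dbenv dbenv --db 20130101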