-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbob.py
executable file
·116 lines (93 loc) · 4.38 KB
/
bob.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python
import os
import re
from glob import glob
# Patterns are compiled once; each captures the text that follows the tag.
# The u'' prefix keeps them text patterns on both Python 2 and Python 3.
_HEAD_RE = re.compile(u'<div class="g_mot">([^<]+)')
_DATE_RE = re.compile(
    u'class="g_connexion">date :</span> <span class="cap">([^<]+)')
_GRAM_RE = re.compile(u'<span class="g_gram">([^<]+)')
_GRAM_CASE_RE = re.compile(u'</span> <span class="g_gram">([^<]+)')
_DEF_RE = re.compile(u'<div class="g_def">([^<]+)')
_ETYMO_RE = re.compile(u'<span class="g_etymo">([^<]+)')
# U+25CA (lozenge) cannot occur in text decoded from ISO-8859-15, so the
# marker is also accepted in its HTML entity form; the etymology handling
# below shows the pages use the "&loz" entity.
_CIT_RE = re.compile(u'<span class="g_cit">(?:&loz;|\u25ca)([^<]+)')
_CIT_DATE_RE = re.compile(u'<span class="g_cit_date">([^<]+)')
_SYNONYM_RE = re.compile(
    u'<span class="g_connexion">synonyme : <span class="cap">([^(]+)')


def _captured(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, or None."""
    match = pattern.search(line)
    return match.group(1) if match else None


def parser(dico_path):
    """Build a dictionary of formatted entries from the files in ``bob/``.

    Every regular file matching ``<dico_path>/bob/*`` is read line by line
    as ISO-8859-15 text.  Recognised HTML fragments (headword, date,
    grammatical tags, definition, etymology, citations, synonyms) are
    appended, in the order they are encountered, to a single entry string
    for that file.

    Args:
        dico_path: Directory containing the ``bob`` sub-directory.  A
            trailing path separator is accepted but not required.

    Returns:
        A dict mapping each headword to a list of entry strings.  A file's
        entry is stored under the last headword found in that file; a
        headword seen earlier in the same file still gets a (possibly
        empty) list.  Files without any headword are skipped, because
        there is no key to store their entry under.
    """
    dico = {}
    # Sorted so the order of entries per headword does not depend on the
    # arbitrary order in which glob() lists the directory.
    for path in sorted(glob(os.path.join(dico_path, 'bob', '*'))):
        if not os.path.isfile(path):
            continue
        word = None
        parts = []
        # Read bytes and decode explicitly: identical on Python 2 and 3,
        # and the handle is closed even if decoding fails.
        with open(path, 'rb') as handle:
            for raw in handle:
                line = raw.decode('iso8859_15')

                head = _captured(_HEAD_RE, line)
                if head is not None:
                    head = head.replace(' ', '')
                    # NOTE(review): with every space already removed above,
                    # this first-token extraction is a no-op as written.
                    # Kept in case the removal was meant to target another
                    # character (e.g. a non-breaking space) - confirm.
                    word = re.sub(r'([^ ]+).*', r'\1', head)
                    dico.setdefault(word, [])
                    parts.append(head.upper() + ',\n')

                date = _captured(_DATE_RE, line)
                if date is not None:
                    parts.append(date + '.\n')

                # Part of speech first, then the tag following a closing
                # span; both are stripped of spaces and '|' separators.
                for gram_re in (_GRAM_RE, _GRAM_CASE_RE):
                    gram = _captured(gram_re, line)
                    if gram is not None:
                        gram = gram.replace(' ', '').replace('|', '')
                        parts.append(gram + '\n')

                defn = _captured(_DEF_RE, line)
                if defn is not None:
                    # u'\xb6' is the pilcrow sign.
                    defn = defn.replace(u'\xb6', '').rstrip()
                    parts.append(defn + '.<br>\n')

                etymo = _captured(_ETYMO_RE, line)
                if etymo is not None:
                    etymo = etymo.replace('&loz', '').rstrip()
                    parts.append(etymo + '<br>\n')

                # Citations and their dates are paired positionally; any
                # unpaired leftovers are dropped by zip().
                quotes = _CIT_RE.findall(line)
                quote_dates = _CIT_DATE_RE.findall(line)
                for quote, quote_date in zip(quotes, quote_dates):
                    parts.append(
                        '<i>' + quote + ' (' + quote_date + ').</i><br>\n')

                for synonym in _SYNONYM_RE.findall(line):
                    parts.append(synonym.rstrip() + '.<br>\n')

        if word is None:
            # No headword in this file: nothing to key the entry on.
            continue
        dico[word].append(''.join(parts))
    return dico