-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsrilm.pyx
156 lines (134 loc) · 5.3 KB
/
srilm.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Copyright (C) 2010, Nathaniel J. Smith <[email protected]>
# This code released under a 2-clause BSD license, see COPYING.
# SRILM itself is under an ad hoc "non-free" license though.
# See:
# http://wiki.cython.org/WrappingCPlusPlus
# for discussion of the crazy hacks we use to trick pyrex into generating
# valid c++.
#
# These hacks should no longer be necessary, because Cython has since learned
# a lot more about C++. But this is old code...
# compile me with cython --cplus, or Extension(..., language="c++", ...)
# C heap allocation primitives.
# They are declared by <stdlib.h> (not <unistd.h>), and malloc takes a size_t;
# declaring the argument as int would truncate large sizes on 64-bit builds.
cdef extern from "stdlib.h":
    void * malloc(size_t)
    void free(void *)
# SRILM's Boolean type, declared to Cython as a plain int.
# NOTE(review): the original author was not sure this matches the real C++
# typedef -- confirm against the SRILM headers.
cdef extern from *:
    ctypedef int Boolean
cdef extern from "File.h":
    # Opaque handle for the C++ class "File"; no members are accessed here.
    ctypedef struct c_File "File":
        pass

    # The quoted C names make Cython emit the C++ expressions
    # "new File(path, mode)" and "delete ptr" for these pseudo-functions.
    # "except +" asks Cython to translate a C++ exception thrown by the
    # constructor into a Python exception.
    void del_File "delete" (c_File *)
    c_File * new_File "new File" (char * path, char * mode) except +
cdef extern from "Vocab.h":
    ctypedef int VocabIndex
    ctypedef char * VocabString

    # Sentinel index; this file uses it to terminate context arrays.
    VocabIndex Vocab_None

    # The C++ class "Vocab". Its methods are declared as function-pointer
    # members so that Cython generates ordinary "ptr->method(...)" calls.
    ctypedef struct c_Vocab "Vocab":
        # Word <-> index mapping.
        VocabIndex (*getIndex)(VocabString, VocabIndex unkIndex)
        VocabString (*getWord)(VocabIndex)
        VocabIndex (*highIndex)()

        # Indices of the special entries.
        VocabIndex (*unkIndex)()
        VocabIndex (*ssIndex)()
        VocabIndex (*seIndex)()
        Boolean (*isNonEvent)(VocabIndex word)

        # These two actually return Boolean&'s, but if we just ignore that
        # then things work out okay...
        Boolean (*unkIsWord)()
        Boolean (*toLower)()

    # Pseudo-functions that expand to "new Vocab()" and "delete ptr".
    void del_Vocab "delete" (c_Vocab *)
    c_Vocab * new_Vocab "new Vocab" ()
cdef extern from "Ngram.h":
    ctypedef double LogP

    # The C++ class "Ngram"; methods are declared as function-pointer members
    # so that Cython generates ordinary "ptr->method(...)" calls.
    ctypedef struct c_Ngram "Ngram":
        # Load the model from an already opened File.
        void (*read)(c_File)
        # The context array must be terminated with Vocab_None
        # (see LM.logprob, which builds it).
        LogP (*wordProb)(VocabIndex word, VocabIndex * context)
        Boolean (*debugme)(int)

    # Pseudo-functions that expand to "new Ngram(vocab, n)" and "delete ptr".
    void del_Ngram "delete" (c_Ngram *)
    c_Ngram * new_Ngram "new Ngram" (c_Vocab, int)
cdef extern from "srilm-c++-hacks.hh":
    # Instantiates the helper template take_address_of<Boolean>. _Vocab uses
    # the returned pointer to assign through the Boolean& values that
    # Vocab::unkIsWord() and Vocab::toLower() hand back.
    Boolean * take_address_of_bool "take_address_of<Boolean>"(Boolean value)
###########################################################################
cdef class _Vocab:
    """Owner of an SRILM ``Vocab`` object, mapping words to integer indices.

    The wrapped C++ object is created in ``__cinit__`` and deleted in
    ``__dealloc__``.
    """

    cdef c_Vocab * _vocab

    def __cinit__(self, lower):
        # lower: truthy to switch on the vocabulary's toLower flag.
        self._vocab = new_Vocab()
        # unkIsWord() and toLower() are declared as returning Boolean but
        # really return Boolean&; take_address_of_bool gives us a pointer we
        # can assign through to set the underlying flags.
        take_address_of_bool(self._vocab.unkIsWord())[0] = 1
        take_address_of_bool(self._vocab.toLower())[0] = bool(lower)

    def __dealloc__(self):
        del_Vocab(self._vocab)

    def intern(self, word):
        """Return the vocabulary index for ``word``.

        ``word`` may be a ``str`` (encoded with ``str.encode``) or an already
        encoded ``bytes`` object, such as a value returned by ``extern``.
        The vocabulary's unknown-word index is passed to ``getIndex`` as the
        fallback index.

        Raises ``TypeError`` if ``word`` is neither ``str`` nor ``bytes``.
        """
        # SRILM wants a C string, so hand it the binary representation. The
        # typed local keeps the bytes object alive for the whole call.
        cdef bytes encoded
        if isinstance(word, bytes):
            encoded = word
        else:
            encoded = str.encode(word)
        return self._vocab.getIndex(encoded, self._vocab.unkIndex())

    def extern(self, idx):
        """Return the word stored at index ``idx``, or ``None`` if there is none.

        The word comes back as the raw C string converted by Cython (``bytes``
        under Cython's default string settings); it is not decoded here.
        """
        cdef VocabString s = self._vocab.getWord(idx)
        if s == NULL:
            return None
        return s

    def max_interned(self):
        """Return the highest index currently in use.

        To iterate over all words, use ``range(max_interned() + 1)`` and be
        prepared for ``extern`` to return ``None``.
        """
        return self._vocab.highIndex()

    def is_non_word(self, idx):
        """Return a true value for weirdo entities like <unk>, <s>, etc.

        Vocab defines an isNonEvent() call, but it doesn't really work right
        on its own, so the unknown-word, sentence-start and sentence-end
        indices are checked explicitly first.
        """
        cdef VocabIndex c_idx = idx
        return (c_idx == self._vocab.unkIndex()
                or c_idx == self._vocab.ssIndex()
                or c_idx == self._vocab.seIndex()
                or self._vocab.isNonEvent(c_idx))
cdef class LM:
    """An SRILM n-gram language model loaded from a file.

    Attributes:
        vocab: the ``_Vocab`` used to map words to indices.
        path: the model path as passed to SRILM (encoded ``bytes``).
    """

    cdef public _Vocab vocab
    cdef c_Ngram * _ngram
    cdef public object path

    def __cinit__(self, path, debug=False, lower=False, vocab=None):
        # path:  model file name; ``str`` is encoded, ``bytes`` is used as is.
        # debug: truthy to call debugme(10) on the model before reading.
        # lower: forwarded to _Vocab when no ``vocab`` is supplied.
        # vocab: an existing _Vocab to use; a fresh one is created if None.
        cdef c_File * fp
        if not isinstance(path, bytes):
            path = str.encode(path)  # binary representation of str
        if vocab is None:
            vocab = _Vocab(lower)
        self.vocab = vocab
        # 20 matches the size of the context buffer used in logprob().
        self._ngram = new_Ngram(self.vocab._vocab[0], 20)
        if debug:
            self._ngram.debugme(10)
        fp = new_File(path, "r")
        self._ngram.read(fp[0])
        del_File(fp)
        self.path = path

    def __dealloc__(self):
        del_Ngram(self._ngram)

    def logprob_strings(self, word, context):
        """Return the log-probability of ``word`` given ``context`` (strings).

        ``context`` lists the preceding words, most recent first.

        Usage: log P(brown | the quick)
            -> logprob_strings("brown", ["quick", "the"])
        """
        word_i = self.vocab.intern(word)
        context_i = [self.vocab.intern(w) for w in context]
        return self.logprob(word_i, context_i)

    def logprob(self, word, context):
        """Like ``logprob_strings``, but takes interned words (indices).

        Only the first 19 entries of ``context`` are used; the rest are
        silently ignored. Note that this may return -inf.
        """
        cdef VocabIndex c_context[20]
        cdef int i
        cdef int length = len(context)
        # Leave one slot free for the Vocab_None terminator.
        if length >= 20:
            length = 19
        for i in range(length):
            c_context[i] = context[i]
        c_context[length] = Vocab_None
        return self._ngram.wordProb(word, c_context)

    def total_logprob_strings(self, ngram):
        """Return the total log-probability of the sentence ``ngram``.

        Takes a list like ["The", "man", "who"] and sums
            logP(The | <s>) + logP(man | <s> The) + logP(who | <s> The man)
        Because "</s>" is appended before scoring, the sum also includes the
        end-of-sentence term logP(</s> | <s> The man who).
        """
        ngram_i = [self.vocab.intern(w) for w in ngram]
        ngram_i.append(self.vocab.intern("</s>"))
        # Reverse so that each word is followed by its history, most recent
        # first, which is the context order logprob() expects.
        ngram_i.reverse()
        ngram_i.append(self.vocab.intern("<s>"))
        lp = 0
        for i in range(len(ngram_i) - 1):
            lp = lp + self.logprob(ngram_i[i], ngram_i[i + 1:])
        return lp
# FIXME: add a wrapper for Ngram::contextID(), whose &length argument
# returns the order of the ngram that was actually used (so we can query
# about backoff)