-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkovgen.py
173 lines (159 loc) · 7.76 KB
/
markovgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python
# Written by Levi Schuck: https://github.com/LeviSchuck
# Modified and added to by Nick Kanel: https://github.com/SnowySailor
import random
class Markov(object):
    """Word-level Markov chain text generator built from chat-style lines.

    ``cache`` maps a ``(w1, w2)`` word pair to the list of words seen
    directly after that pair; ``trippleCache`` does the same for
    ``(w1, w2, w3)`` triples.  The string ``"\\n"`` is stored as a follower
    to mark the end of a line.

    ``trippleCache`` is built lazily, the first time
    ``generate_tripple_markov_text`` needs it, so instances that only use
    the pair-based generator never pay for it.
    """

    # Random draws tried before falling back to a full scan for a seed line.
    _SEED_ATTEMPTS = 64

    def __init__(self, open_file=None, max_size=5000000, initEmpty=False, min_length=None):
        """Create a generator, optionally loading a corpus from *open_file*.

        open_file  -- readable, seekable text file object, or None.
        max_size   -- maximum number of UTF-8 bytes of lines to keep.
        initEmpty  -- when true, skip loading and start with no data.
        min_length -- minimum words per kept line (3 when not set).
        """
        self.cache = {}
        self.trippleCache = {}
        self.open_file = open_file
        if initEmpty or open_file is None:
            # Nothing to read: start empty instead of failing on None.seek().
            self.lines = []
            self.line_size = 0
        else:
            self.lines = self.file_to_lines(max_size, min_length)
            self.line_size = len(self.lines)
            self.database()

    def file_to_lines(self, max_size, min_length):
        """Return the newest lines of ``self.open_file`` within *max_size* bytes.

        The file is walked from its last line to its first, so the returned
        list is newest-first.  Lines with fewer than *min_length* words
        (3 when *min_length* is not set) are skipped and do not count
        towards the byte budget.
        """
        self.open_file.seek(0)
        data = self.open_file.read()
        threshold = min_length if min_length else 3
        lines = []
        # Number of UTF-8 bytes accepted so far.
        size_read = 0
        for line in reversed(data.split('\n')):
            line_size = len(line.encode('utf-8'))
            # Stop as soon as the next line would overshoot the limit.
            if size_read + line_size > max_size:
                break
            if len(line.split()) >= threshold:
                lines.append(line)
                size_read += line_size
        return lines

    @staticmethod
    def _ngrams(words, n):
        """Yield every run of *n* consecutive words, then an end-of-line tuple.

        For ``["What", "a", "lovely", "day"]`` and ``n == 3`` this yields
        ``(What, a, lovely)``, ``(a, lovely, day)`` and finally
        ``(lovely, day, "\\n")``.  Nothing is yielded for fewer than *n*
        words.
        """
        if len(words) < n:
            return
        for start in range(len(words) - n + 1):
            yield tuple(words[start:start + n])
        yield tuple(words[-(n - 1):]) + ("\n",)

    def triples(self):
        """Yield the word triples of every stored line (see ``_ngrams``)."""
        for line in self.lines:
            for gram in self._ngrams(line.split(), 3):
                yield gram

    def quadruples(self):
        """Yield the word quadruples of every stored line (see ``_ngrams``)."""
        for line in self.lines:
            for gram in self._ngrams(line.split(), 4):
                yield gram

    def database(self):
        """Fill ``cache`` with the follower lists for every stored line."""
        for w1, w2, w3 in self.triples():
            self.cache.setdefault((w1, w2), []).append(w3)

    def _build_tripple_cache(self):
        """Rebuild ``trippleCache`` from every stored line."""
        self.trippleCache = {}
        for w1, w2, w3, w4 in self.quadruples():
            self.trippleCache.setdefault((w1, w2, w3), []).append(w4)

    def _pick_seed(self, min_words):
        """Return the words of a random line with at least *min_words* words.

        Returns None when no stored line is long enough.
        """
        if not self.lines:
            return None
        # Cheap path: most corpora are dominated by usable lines.
        for _ in range(self._SEED_ATTEMPTS):
            words = random.choice(self.lines).split()
            if len(words) >= min_words:
                return words
        # Unlucky or sparse corpus: scan once so we can never spin forever.
        candidates = [line.split() for line in self.lines]
        candidates = [words for words in candidates if len(words) >= min_words]
        if not candidates:
            return None
        return random.choice(candidates)

    def generate_tripple_markov_text(self, size=25):
        """Generate up to *size* words using three words of context.

        Returns an empty string when no stored line has at least four
        words.
        """
        seed = self._pick_seed(4)
        if seed is None:
            return ''
        if not self.trippleCache:
            self._build_tripple_cache()
        w1, w2, w3 = seed[0], seed[1], seed[2]
        gen_words = []
        for _ in range(size):
            gen_words.append(w1)
            if w2 == "\n":
                break
            if w3 == "\n":
                # End of line reached: w2 is the last real word and still
                # has to be emitted on the next pass.
                w1, w2 = w2, "\n"
                continue
            followers = self.trippleCache.get((w1, w2, w3))
            if not followers:
                break
            w1, w2, w3 = w2, w3, random.choice(followers)
        return ' '.join(gen_words)

    def generate_markov_text(self, size=25):
        """Generate up to *size* words using two words of context.

        Returns an empty string when no stored line has at least three
        words.
        """
        seed = self._pick_seed(3)
        if seed is None:
            return ''
        w1, w2 = seed[0], seed[1]
        gen_words = []
        for _ in range(size):
            gen_words.append(w1)
            if w2 == "\n":
                break
            followers = self.cache.get((w1, w2))
            if not followers:
                break
            w1, w2 = w2, random.choice(followers)
        return ' '.join(gen_words)

    def digest_single_message(self, message):
        """Add one new message to the stored lines and to the caches."""
        self.lines.append(message)
        self.line_size += 1
        for w1, w2, w3 in self.tipple_one_word(message):
            self.cache.setdefault((w1, w2), []).append(w3)
        # An empty trippleCache has not been built yet; the lazy build will
        # pick this message up from self.lines, so only update a live one.
        if self.trippleCache:
            for w1, w2, w3, w4 in self._ngrams(message.split(), 4):
                self.trippleCache.setdefault((w1, w2, w3), []).append(w4)

    def tipple_one_word(self, message):
        """Yield the word triples of a single message (see ``_ngrams``)."""
        for gram in self._ngrams(message.split(), 3):
            yield gram