-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcharacter.py
46 lines (32 loc) · 1.42 KB
/
character.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import math
from textblob import TextBlob as tb
class Character:
    """A named speaker together with a TF-IDF profile of their lines.

    The instance keeps the raw ``lines`` and a single blob built from all of
    them joined with spaces.  ``gen_tf_idf_vec`` must be called before
    ``cosine_sim``, because the latter reads ``self.tf_idf_vec``.
    """

    def __init__(self, name, lines):
        """Store ``name`` and ``lines`` and build one blob from all lines.

        Args:
            name: Identifier for the character.
            lines: Iterable of strings; they are joined with single spaces
                and passed to ``tb`` (TextBlob) to build ``self.blob``.
        """
        self.name = name
        self.lines = lines
        self.blob = tb(' '.join(lines))

    def gen_tf_idf_vec(self, bloblist, all_words):
        """Compute and store this character's TF-IDF vector.

        The result is saved on ``self.tf_idf_vec`` as a list with one score
        per entry of ``all_words``, in the same order.

        Args:
            bloblist: Corpus of blobs (objects exposing ``.words``) used for
                the inverse-document-frequency term.
            all_words: Iterable of words defining the vector's dimensions.
        """
        self.tf_idf_vec = [self.tf_idf(word, bloblist) for word in all_words]

    def tf(self, word, blob):
        """Return the term frequency of ``word`` in ``blob``.

        This is ``blob.words.count(word)`` divided by the number of words.
        A blob without any words yields ``0.0`` instead of dividing by zero.
        """
        words = blob.words
        if not words:
            # An empty document contains no occurrences of any word.
            return 0.0
        return words.count(word) / len(words)

    def n_containing(self, word, bloblist):
        """Return how many blobs in ``bloblist`` contain ``word``."""
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(self, word, bloblist):
        """Return the smoothed inverse document frequency of ``word``.

        Computed as ``log(N / (1 + df))`` where ``N`` is ``len(bloblist)``
        and ``df`` is the number of blobs containing ``word``.  Because of
        the ``+ 1`` in the denominator the value is ``0`` when ``df == N - 1``
        and negative when the word occurs in every blob.

        Raises:
            ValueError: If ``bloblist`` is empty (``math.log(0.0)``).
        """
        return math.log(len(bloblist) / (1 + self.n_containing(word, bloblist)))

    def tf_idf(self, word, bloblist):
        """Return the TF-IDF score of ``word`` for this character's blob."""
        return self.tf(word, self.blob) * self.idf(word, bloblist)

    def cosine_sim(self, other):
        """Return the cosine similarity of the two stored TF-IDF vectors.

        Both ``self`` and ``other`` must already have ``tf_idf_vec`` set.
        The dot product pairs components with ``zip``, so it stops at the
        shorter vector; each norm is taken over its full vector.

        If either vector has zero magnitude the similarity is undefined;
        ``0.0`` is returned in that case instead of raising
        ``ZeroDivisionError``.
        """
        dot_product = sum(
            a * b for a, b in zip(self.tf_idf_vec, other.tf_idf_vec)
        )
        norm_self = math.sqrt(sum(a * a for a in self.tf_idf_vec))
        norm_other = math.sqrt(sum(b * b for b in other.tf_idf_vec))
        denominator = norm_self * norm_other
        if denominator == 0:
            # At least one vector is all zeros (or empty): no direction to
            # compare, so report "no similarity".
            return 0.0
        return dot_product / denominator
# Disabled code: the triple-quoted string below is never executed.  It holds
# free-function versions of the TF-IDF helpers (tf, n_containing, idf, tfidf)
# that use the same formulas as the corresponding Character methods.
'''
def tf(word, blob):
return blob.words.count(word) / len(blob.words)
def n_containing(word, bloblist):
return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))
def tfidf(word, blob, bloblist):
return tf(word, blob) * idf(word, bloblist)
'''