-
Notifications
You must be signed in to change notification settings - Fork 106
/
Copy pathword_frequency.py
45 lines (39 loc) · 1.47 KB
/
word_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import graphlab as gl
def get_word_frequency(docs):
"""
Returns the frequency of occurrence of words in an SArray of documents
Args:
docs: An SArray (of dtype str) of documents
Returns:
An SFrame with the following columns:
'word' : Word used
'count' : Number of times the word occured in all documents.
'frequency' : Relative frequency of the word in the set of input documents.
"""
# Use the count_words function to count the number of words.
docs_sf = gl.SFrame()
docs_sf['words'] = gl.text_analytics.count_words(docs)
# Stack the dictionary into individual word-count pairs.
docs_sf = docs_sf.stack('words',
new_column_name=['word', 'count'])
# Count the number of unique words (remove None values)
docs_sf = docs_sf.groupby('word', {'count': gl.aggregate.SUM('count')})
docs_sf['frequency'] = docs_sf['count'] / docs_sf["count"].sum()
return docs_sf
# Sample SArray
docs = gl.SArray(['The quick', 'brown fox',
'jumps over the', 'lazy dog'])
docs_count = get_word_frequency(docs)
print docs_count
# +-------+-------+-----------+
# | word | count | frequency |
# +-------+-------+-----------+
# | brown | 1 | 0.25 |
# | lazy | 1 | 0.25 |
# | dog | 1 | 0.25 |
# | quick | 1 | 0.25 |
# | jumps | 1 | 0.25 |
# | the | 2 | 0.5 |
# | fox | 1 | 0.25 |
# | over | 1 | 0.25 |
# +-------+-------+-----------+