-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathparse_word.py
executable file
·76 lines (68 loc) · 2.13 KB
/
parse_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
def ratings(word, start=0):
    """Fetch one page of associations for *word* from wordassociations.net.

    Downloads the "words associated with" page for ``word`` with ``curl``,
    passing ``start`` as the ``start`` query parameter, parses the HTML with
    ``lxml`` and collects the text of every ``<a>`` element found inside the
    elements whose class is ``NOUN-SECTION``, ``VERB-SECTION`` and
    ``ADJECTIVE-SECTION``.

    Args:
        word: The word to look up. It is percent-encoded before being placed
            in the URL path so that spaces, ``?``, ``#`` or ``/`` in the word
            cannot alter the request.
        start: Value sent as the ``start`` query parameter.

    Returns:
        A 3-tuple ``(nouns, verbs, adjectives)``; each item is a list of the
        link texts found in the corresponding section, in document order.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status.
    """
    import subprocess
    from urllib.parse import quote

    import lxml.html

    url = 'https://wordassociations.net/en/words-associated-with/{0}?start={1}'.format(
        quote(str(word), safe=''), start)
    # NOTE(review): '-k' turns off TLS certificate verification in curl.
    # Kept as-is to preserve existing behaviour; confirm it is still needed.
    out = subprocess.check_output(['curl', '-qk', url])
    html = lxml.html.fromstring(out)

    def geta(cat):
        """Return the text of every link inside the '<cat>-SECTION' elements."""
        texts = []
        for section in html.find_class('{0}-SECTION'.format(cat)):
            # iter('a') walks the section and all of its descendants in
            # document order, so links are found at any nesting depth.
            texts.extend(link.text_content() for link in section.iter('a'))
        return texts

    return geta('NOUN'), geta('VERB'), geta('ADJECTIVE')
def increased_ratings(word):
    """Collect the associations for *word* across successive result pages.

    Calls ``ratings(word, start=...)`` with ``start`` values 0, 100, 200, ...
    and accumulates each page's results. Paging stops after the first page
    that comes back with no nouns, no verbs and no adjectives.

    Args:
        word: The word passed through to ``ratings``.

    Returns:
        A 3-tuple ``(nouns, verbs, adjectives)`` of lists holding the
        concatenated results of every page fetched.
    """
    nouns, verbs, adjectives = [], [], []
    offset = 0
    while True:
        page_nouns, page_verbs, page_adjectives = ratings(word, start=offset)
        nouns.extend(page_nouns)
        verbs.extend(page_verbs)
        adjectives.extend(page_adjectives)
        # An entirely empty page marks the end of the results.
        if not (page_nouns or page_verbs or page_adjectives):
            break
        offset += 100
    return nouns, verbs, adjectives
if __name__ == '__main__':
    # Command-line entry point: for every WORD given, fetch its associations
    # and store them in ratings/<word>.txt as three lines (nouns, verbs,
    # adjectives), each a space-separated list. Words that already have a
    # file are skipped.
    import argparse
    import os

    parser = argparse.ArgumentParser(
        description='Fetch word associations for each WORD and save them '
                    'to ratings/WORD.txt.')
    parser.add_argument('words', metavar='WORD', nargs='+',
                        help='a word to look up')
    args = parser.parse_args()

    # Make sure the output directory exists before writing into it.
    os.makedirs('ratings', exist_ok=True)

    for word in args.words:
        word = word.lower()
        pathname = 'ratings/{0}.txt'.format(word)
        if os.path.exists(pathname):
            continue
        # Fetch BEFORE opening the file: if the download fails, no empty
        # file is left behind that would make later runs skip this word.
        nouns, verbs, adjectives = increased_ratings(word)
        with open(pathname, 'w', encoding='utf-8') as out_file:
            for group in (nouns, verbs, adjectives):
                out_file.write(' '.join(group))
                out_file.write("\n")
#mce print(increased_ratings('apple'))