-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrim_quotes.py
69 lines (58 loc) · 2.17 KB
/
trim_quotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import re
from argparse import ArgumentParser, FileType
from collections import Counter
""" Trim usable quotes from a large source file.
This expects a source of quotes like the one at
https://archive.org/details/quotes_20230625
It grabs the first 100 quotes at each letter count from 10 to 50.
"""
def parse_args(argv=None):
    """Parse the command-line arguments for the quote trimmer.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the process command line is parsed, which is
            what happens when the function is called with no arguments.

    Returns:
        The parsed namespace with two attributes: ``csv_in``, a text file
        opened for reading, and ``csv_out``, a text file opened for writing.
    """
    parser = ArgumentParser(description='Trim quotes from big CSV file.')
    parser.add_argument('csv_in',
                        type=FileType(),
                        help='CSV file of quotes to read')
    parser.add_argument('csv_out',
                        type=FileType('w'),
                        help='CSV file to write the selected quotes to')
    return parser.parse_args(argv)
def main():
    """Copy a capped selection of short quotes from one CSV file to another.

    Reads rows from the ``csv_in`` file given on the command line and writes
    the selected rows, unchanged and under the same header, to ``csv_out``.
    Each row must provide ``quote``, ``author`` and ``category`` columns.

    A row is selected when all of the following hold:

    * it has exactly as many fields as the header;
    * its category text has one comma fewer than it has
      whitespace-separated words;
    * its quote contains between 10 and 50 letters A-Z (after upper-casing);
    * fewer than 100 quotes with that letter count were already selected;
    * its author does not contain one of the excluded names.

    After the copy, three summaries are printed: the number of quotes kept
    per letter count, then the 100 most common categories and the 100 most
    common authors among the kept quotes.
    """
    min_letters = 10
    max_letters = 50
    max_per_length = 100
    excluded_authors = ('Lailah Gifty Akita', 'Sunday Adelaja')
    # Compiled once here instead of being looked up again for every row.
    non_letters = re.compile('[^A-Z]')
    category_separator = re.compile(r'\s*,\s*')

    args = parse_args()
    reader = csv.DictReader(args.csv_in)
    writer = csv.DictWriter(args.csv_out, fieldnames=reader.fieldnames)
    writer.writeheader()
    length_counts = Counter()
    category_counts = Counter()
    author_counts = Counter()
    for i, row in enumerate(reader):
        # Under the '__live_coding__' module name only the rows with index
        # 0 through 10 are processed (presumably to keep a live-coding
        # session fast -- confirm with the tool in use).
        if __name__ == '__live_coding__' and i > 10:
            break

        category_text = row['category']
        quote = row['quote']
        author = row['author']
        # DictReader stores surplus fields under the key None and fills
        # missing fields with the value None. Either way the row does not
        # line up with the header: a surplus field would make the writer
        # raise ValueError, a missing one would break the string handling
        # below. Skip such rows like any other malformed row.
        if None in row or None in (category_text, quote, author):
            continue

        comma_count = category_text.count(',')
        if comma_count != len(category_text.split()) - 1:
            # Probably missing quotes around text with commas. Skip it.
            continue

        letter_count = len(non_letters.sub('', quote.upper()))
        if not min_letters <= letter_count <= max_letters:
            continue
        if length_counts[letter_count] >= max_per_length:
            continue
        if any(name in author for name in excluded_authors):
            continue

        writer.writerow(row)
        length_counts[letter_count] += 1
        author_counts[author] += 1
        category_counts.update(category_separator.split(category_text))

    for length, count in sorted(length_counts.items()):
        print(f'{length}: {count}')
    for category, count in category_counts.most_common(100):
        print(f'{category}: {count}')
    for author, count in author_counts.most_common(100):
        print(f'{author}: {count}')
# Run the trimmer when executed as a script, and also when the module is
# loaded under the '__live_coding__' name.
if __name__ == '__main__' or __name__ == '__live_coding__':
    main()