-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean-whatsapp.py
100 lines (87 loc) · 3.45 KB
/
clean-whatsapp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import nltk
import re
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Link lines harvested across every processed chat file (written out by main()).
links = []

# Locations the script reads from and writes to.
CORPUS_DIR = "whatsapp-corpus/"
LINKS_FILE = "links/whatsapp-links.txt"

# Pattern used to harvest links from the odd-indexed messages.
SENT_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
    r'|(?:%[\w.,@?^=%&:/~+#-][\w@?^=%&/~+#-]))+'
)
# Pattern used to blank out links in every message.
ANY_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
    r'|([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-]))+'
)
# Attachment reference: file name, a literal dot, a known extension, then
# everything up to the next '>'.  The dot is escaped so that ordinary text
# such as "my jpg>" is no longer mistaken for a file name.
ATTACHMENT_RE = re.compile(r"[-\w]+\.(?:jpg|gif|png|mp4|pdf|opus)+(.*?)\>")
# Everything from '[' up to and including the first ': ' that follows it.
TIMESTAMP_RE = re.compile(r"\[(.*?)\: ")


def collect_links(messages):
    """Return one output line per odd-indexed message that contains a URL.

    The original comments describe the odd-indexed lines as the messages the
    author sent.  Each returned entry is the printed list of URLs found in
    that message followed by a newline, the format the links file has
    always used.
    """
    found_lines = []
    for message in messages[1::2]:
        found = SENT_URL_RE.findall(message)
        if found:
            found_lines.append(str(found) + '\n')
    return found_lines


def scrub_message(message):
    """Return *message* with URLs, attachment references and stamps removed.

    URLs are replaced by a single '.', attachment references and the
    '[...: ' prefix are deleted.  A message that matches nothing is returned
    unchanged.
    """
    message = ANY_URL_RE.sub('.', message)
    message = ATTACHMENT_RE.sub('', message)
    return TIMESTAMP_RE.sub('', message)


def keep_questions(message, sent_tokenize=None, word_tokenize=None):
    """Reduce *message* to the sentences that contain a '?' token.

    Each kept sentence is followed by one space and the result ends with a
    newline.  A message without any question is returned unchanged.  A
    sentence is kept exactly once no matter how many '?' tokens it holds.

    The tokenizers default to NLTK's ``sent_tokenize`` / ``word_tokenize``;
    they can be overridden with any callables of the same shape.
    """
    if sent_tokenize is None:
        sent_tokenize = nltk.sent_tokenize
    if word_tokenize is None:
        word_tokenize = nltk.word_tokenize

    questions = [
        sentence
        for sentence in sent_tokenize(message)
        if '?' in word_tokenize(sentence)
    ]
    if not questions:
        return message
    return "".join(question + " " for question in questions) + '\n'


def clean_messages(messages, sent_tokenize=None, word_tokenize=None):
    """Clean a list of chat lines and harvest the links they contain.

    Returns a ``(cleaned_messages, link_lines)`` tuple; the input list is not
    modified.  Links are collected before scrubbing, while the URLs are still
    present.  Even-indexed lines (described in the original comments as the
    questions received) are then reduced to their question sentences.
    """
    link_lines = collect_links(messages)
    cleaned = [scrub_message(message) for message in messages]
    # Walk positions rather than looking lines up by value, so repeated
    # lines are each handled at their own index.
    for position in range(0, len(cleaned), 2):
        cleaned[position] = keep_questions(
            cleaned[position], sent_tokenize, word_tokenize
        )
    return cleaned, link_lines


def clean_file(path, sent_tokenize=None, word_tokenize=None):
    """Rewrite the chat file at *path* in place and return its link lines.

    The file is read and written as UTF-8 explicitly; bytes that are not
    valid UTF-8 are dropped on read.
    """
    with open(path, 'rb') as source:
        messages = [raw.decode('utf-8', 'ignore') for raw in source]
    cleaned, link_lines = clean_messages(messages, sent_tokenize, word_tokenize)
    with open(path, 'wb') as target:
        target.write("".join(cleaned).encode('utf-8'))
    return link_lines


def main():
    """Clean every file in the corpus directory, then save the links."""
    for name in os.listdir(CORPUS_DIR):
        links.extend(clean_file(CORPUS_DIR + name))
    # NOTE(review): the source's indentation was lost; the message and the
    # links file are assumed to be produced once, after all files are done.
    print("finished cleaning up")
    with open(LINKS_FILE, 'w') as out:
        out.writelines(links)


if __name__ == "__main__":
    main()