Skip to content

Commit

Permalink
Edited the NER and text processing back in. Also changed get_text to …
Browse files Browse the repository at this point in the history
…delete control characters
  • Loading branch information
Olthoff231381 committed Sep 24, 2024
1 parent b09e019 commit 70e5497
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 27 deletions.
10 changes: 1 addition & 9 deletions data/out/dict.out
Original file line number Diff line number Diff line change
@@ -1,9 +1 @@
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!
[Inline-Bild]

[Inline-Bild]

À bientôt,

Pierre
</content></email>
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre</content></email>
6 changes: 5 additions & 1 deletion mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import eml_parser
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml
import unicodedata

class InoutHandler:
def __init__(self, directory_name: str):
Expand Down Expand Up @@ -52,13 +53,16 @@ def get_text(self, file: Path) -> str:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
# content = parsed_eml["body"][0]["content"]
mapping = dict.fromkeys(range(32))
# res = content.translate(mapping)
attachmenttypes = []
# check whether there are any attachments and, if so, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of the attachments
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
self.email_content = {"content": parsed_eml["body"][0]["content"].translate(mapping),
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
Expand Down
34 changes: 17 additions & 17 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,8 @@ def init_spacy(lang):


def init_transformers():
# ner_recognizer = pipeline("token-classification")
ner_recognizer = pipeline(
"token-classification", model="xlm-roberta-large-finetuned-conll03-english"
"token-classification", model="xlm-roberta-large-finetuned-conll03-english", device_map = 'cuda'
)
return ner_recognizer

Expand All @@ -105,8 +104,8 @@ def make_dir(path: str):


if __name__ == "__main__":
# nlp_spacy = init_spacy(lang)
# nlp_transformers = init_transformers()
nlp_spacy = init_spacy(lang)
nlp_transformers = init_transformers()

# check that input dir is there
if not check_dir(path_input):
Expand All @@ -132,21 +131,22 @@ def make_dir(path: str):
# skip this text if email could not be parsed
if not text:
continue
### nlp = init_spacy(sprache)
# doc_spacy = nlp_spacy(text) ### missing - old version
# text = get_sentences(doc_spacy)
### nlp = init_spacy(sprache) done l.108
doc_spacy = nlp_spacy(text) ### missing - old version
text = get_sentences(doc_spacy)
# start with first line
# here you can limit the number of sentences to parse
# newlist = []
# max_i = len(text) ### remove
newlist = []
max_i = len(text) ### remove
### init transformers
# for i in range(0, max_i):
for i in range(0, max_i):
# if tool == "transformers": ### there is only one
# nlps = nlp_transformers(text[i]) ### missing, or rather process_doc
# doc = nlps
# newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
# newlist[i] = " ".join(newlist[i])
nlps = nlp_transformers(text[i]) ### missing, or rather process_doc
doc = nlps
newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
newlist[i] = " ".join(newlist[i])
# join the new and old lines for comparison
# printout = "New: " + " ".join(newlist) + "\n"
# printout = printout + "Old: " + " ".join(text[0:max_i])
# write_file(printout, path_output + "/" + file)
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
print(printout)
# write_file(printout, path_output + "/" + file)

0 comments on commit 70e5497

Please sign in to comment.