diff --git a/data/out/dict.out b/data/out/dict.out
index 97ba2b3..cdd27f9 100644
--- a/data/out/dict.out
+++ b/data/out/dict.out
@@ -1,9 +1 @@
-J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!
-[Inline-Bild]
-
-[Inline-Bild]
-
-À bientôt,
-
-Pierre
-
\ No newline at end of file
+J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre
\ No newline at end of file
diff --git a/mailcom/inout.py b/mailcom/inout.py
index 15c7252..7155f93 100644
--- a/mailcom/inout.py
+++ b/mailcom/inout.py
@@ -3,6 +3,7 @@
import eml_parser
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml
+import unicodedata
class InoutHandler:
def __init__(self, directory_name: str):
@@ -52,13 +53,16 @@ def get_text(self, file: Path) -> str:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
+ # content = parsed_eml["body"][0]["content"]
+ mapping = dict.fromkeys(range(32))
+ # res = content.translate(mapping)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
- self.email_content = {"content": parsed_eml["body"][0]["content"],
+ self.email_content = {"content": parsed_eml["body"][0]["content"].translate(mapping),
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
diff --git a/mailcom/parse.py b/mailcom/parse.py
index a21f881..392c52f 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -85,9 +85,8 @@ def init_spacy(lang):
def init_transformers():
- # ner_recognizer = pipeline("token-classification")
ner_recognizer = pipeline(
- "token-classification", model="xlm-roberta-large-finetuned-conll03-english"
+ "token-classification", model="xlm-roberta-large-finetuned-conll03-english", device_map = 'cuda'
)
return ner_recognizer
@@ -105,8 +104,8 @@ def make_dir(path: str):
if __name__ == "__main__":
- # nlp_spacy = init_spacy(lang)
- # nlp_transformers = init_transformers()
+ nlp_spacy = init_spacy(lang)
+ nlp_transformers = init_transformers()
# check that input dir is there
if not check_dir(path_input):
@@ -132,21 +131,22 @@ def make_dir(path: str):
# skip this text if email could not be parsed
if not text:
continue
- ### nlp = init_spacy(sprache)
- # doc_spacy = nlp_spacy(text) ### fehlt - alte version
- # text = get_sentences(doc_spacy)
+    ### spaCy pipeline already initialized above (see l.108)
+    doc_spacy = nlp_spacy(text) ### was missing in the old version
+ text = get_sentences(doc_spacy)
# start with first line
# here you can limit the number of sentences to parse
- # newlist = []
- # max_i = len(text) ### weg
+ newlist = []
+    max_i = len(text) ### TODO: remove this limit later
### init transformers
- # for i in range(0, max_i):
+ for i in range(0, max_i):
# if tool == "transformers": ### gibt nur eins
- # nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc
- # doc = nlps
- # newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
- # newlist[i] = " ".join(newlist[i])
+    nlps = nlp_transformers(text[i]) ### was missing; cf. process_doc
+ doc = nlps
+ newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
+ newlist[i] = " ".join(newlist[i])
# join the new and old lines for comparison
- # printout = "New: " + " ".join(newlist) + "\n"
- # printout = printout + "Old: " + " ".join(text[0:max_i])
- # write_file(printout, path_output + "/" + file)
+ printout = "New: " + " ".join(newlist) + "\n"
+ printout = printout + "Old: " + " ".join(text[0:max_i])
+ print(printout)
+ # write_file(printout, path_output + "/" + file)
\ No newline at end of file