Edited the NER and text processing back in. Also changed get_text to delete control characters.
ssciwr · Sep 24, 2024 · 70e5497 · 70e5497
1 parent b09e019
commit 70e5497
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 27 deletions.
diff --git a/data/out/dict.out b/data/out/dict.out
@@ -1,9 +1 @@
-<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!
-[Inline-Bild]
-
-[Inline-Bild]
-
-À bientôt,
-
-Pierre
-</content></email>
+<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre</content></email>
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -3,6 +3,7 @@
 import eml_parser
 from bs4 import BeautifulSoup
 from dicttoxml import dicttoxml
+import unicodedata
 
 class InoutHandler:
     def __init__(self, directory_name: str):
@@ -52,13 +53,16 @@ def get_text(self, file: Path) -> str:
             raw_email = fhdl.read()
         ep = eml_parser.EmlParser(include_raw_body=True)
         parsed_eml = ep.decode_email_bytes(raw_email)
+        # content = parsed_eml["body"][0]["content"]
+        mapping =  dict.fromkeys(range(32))
+        # res = content.translate(mapping)
         attachmenttypes = []
         # find if there are any attachements, and if yes, how many
         attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
         # find the types of attachements
         if attachments > 0:
             attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-        self.email_content = {"content": parsed_eml["body"][0]["content"], 
+        self.email_content = {"content": parsed_eml["body"][0]["content"].translate(mapping), 
                     "date": parsed_eml["header"]["date"], 
                     "attachment": attachments, 
                     "attachement type": attachmenttypes

diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -85,9 +85,8 @@ def init_spacy(lang):
 
 
 def init_transformers():
-    # ner_recognizer = pipeline("token-classification")
     ner_recognizer = pipeline(
-        "token-classification", model="xlm-roberta-large-finetuned-conll03-english"
+        "token-classification", model="xlm-roberta-large-finetuned-conll03-english", device_map = 'cuda'
     )
     return ner_recognizer
 
@@ -105,8 +104,8 @@ def make_dir(path: str):
 
 
 if __name__ == "__main__":
-    # nlp_spacy = init_spacy(lang)
-    # nlp_transformers = init_transformers()
+    nlp_spacy = init_spacy(lang)
+    nlp_transformers = init_transformers()
 
     # check that input dir is there
     if not check_dir(path_input):
@@ -132,21 +131,22 @@ def make_dir(path: str):
         # skip this text if email could not be parsed
         if not text:
             continue 
-        ### nlp = init_spacy(sprache)   
-        # doc_spacy = nlp_spacy(text) ### fehlt - alte version
-        # text = get_sentences(doc_spacy)
+        ### nlp = init_spacy(sprache) done l.108
+        doc_spacy = nlp_spacy(text) ### fehlt - alte version
+        text = get_sentences(doc_spacy)
         # start with first line
         # here you can limit the number of sentences to parse
-        # newlist = []
-        # max_i = len(text) ### weg
+        newlist = []
+        max_i = len(text) ### weg
         ### init transformers
-        # for i in range(0, max_i):
+        for i in range(0, max_i):
         #     if tool == "transformers": ### gibt nur eins
-        #         nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc
-        #         doc = nlps
-        #     newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
-        #     newlist[i] = " ".join(newlist[i])
+            nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc
+            doc = nlps
+            newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
+            newlist[i] = " ".join(newlist[i])
         # join the new and old lines for comparison
-        # printout = "New: " + " ".join(newlist) + "\n"
-        # printout = printout + "Old: " + " ".join(text[0:max_i])
-        # write_file(printout, path_output + "/" + file)
+        printout = "New: " + " ".join(newlist) + "\n"
+        printout = printout + "Old: " + " ".join(text[0:max_i])
+        print(printout)
+        # write_file(printout, path_output + "/" + file)