diff --git a/data/out/dict.out b/data/out/dict.out index 97ba2b3..cdd27f9 100644 --- a/data/out/dict.out +++ b/data/out/dict.out @@ -1,9 +1 @@ -J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage! -[Inline-Bild] - -[Inline-Bild] - -À bientôt, - -Pierre - \ No newline at end of file +J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre \ No newline at end of file diff --git a/mailcom/inout.py b/mailcom/inout.py index 15c7252..7155f93 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -3,6 +3,7 @@ import eml_parser from bs4 import BeautifulSoup from dicttoxml import dicttoxml +import unicodedata class InoutHandler: def __init__(self, directory_name: str): @@ -52,13 +53,16 @@ def get_text(self, file: Path) -> str: raw_email = fhdl.read() ep = eml_parser.EmlParser(include_raw_body=True) parsed_eml = ep.decode_email_bytes(raw_email) + # content = parsed_eml["body"][0]["content"] + mapping = dict.fromkeys(range(32)) + # res = content.translate(mapping) attachmenttypes = [] # find if there are any attachements, and if yes, how many attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 # find the types of attachements if attachments > 0: attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - self.email_content = {"content": parsed_eml["body"][0]["content"], + self.email_content = {"content": parsed_eml["body"][0]["content"].translate(mapping), "date": parsed_eml["header"]["date"], "attachment": attachments, "attachement type": attachmenttypes diff --git a/mailcom/parse.py b/mailcom/parse.py index a21f881..392c52f 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -85,9 +85,8 @@ def init_spacy(lang): def init_transformers(): - # ner_recognizer = pipeline("token-classification") ner_recognizer = pipeline( - "token-classification", model="xlm-roberta-large-finetuned-conll03-english" 
+ "token-classification", model="xlm-roberta-large-finetuned-conll03-english", device_map = 'cuda' ) return ner_recognizer @@ -105,8 +104,8 @@ def make_dir(path: str): if __name__ == "__main__": - # nlp_spacy = init_spacy(lang) - # nlp_transformers = init_transformers() + nlp_spacy = init_spacy(lang) + nlp_transformers = init_transformers() # check that input dir is there if not check_dir(path_input): @@ -132,21 +131,22 @@ def make_dir(path: str): # skip this text if email could not be parsed if not text: continue - ### nlp = init_spacy(sprache) - # doc_spacy = nlp_spacy(text) ### fehlt - alte version - # text = get_sentences(doc_spacy) + ### nlp = init_spacy(sprache) -- already done at l. 108 + doc_spacy = nlp_spacy(text) ### was missing in the old version + text = get_sentences(doc_spacy) # start with first line # here you can limit the number of sentences to parse - # newlist = [] - # max_i = len(text) ### weg + newlist = [] + max_i = len(text) ### TODO: remove ### init transformers - # for i in range(0, max_i): + for i in range(0, max_i): # if tool == "transformers": ### gibt nur eins - # nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc - # doc = nlps - # newlist.append(process_doc(doc, ner_tool=tool, text=text[i])) - # newlist[i] = " ".join(newlist[i]) + nlps = nlp_transformers(text[i]) ### was missing; cf. process_doc + doc = nlps + newlist.append(process_doc(doc, ner_tool=tool, text=text[i])) + newlist[i] = " ".join(newlist[i]) # join the new and old lines for comparison - # printout = "New: " + " ".join(newlist) + "\n" - # printout = printout + "Old: " + " ".join(text[0:max_i]) - # write_file(printout, path_output + "/" + file) + printout = "New: " + " ".join(newlist) + "\n" + printout = printout + "Old: " + " ".join(text[0:max_i]) + print(printout) + # write_file(printout, path_output + "/" + file) \ No newline at end of file