From 4a37487fec0ff7171df19eb86c41f470b896b61d Mon Sep 17 00:00:00 2001 From: ThoreOlthoff Date: Tue, 10 Sep 2024 09:22:56 +0200 Subject: [PATCH 1/5] changed inout to a classstructure --- mailcom/inout.py | 115 +++++++++++++++++++++++++---------------------- mailcom/parse.py | 9 ++-- 2 files changed, 67 insertions(+), 57 deletions(-) diff --git a/mailcom/inout.py b/mailcom/inout.py index 00a6706..21f4162 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -3,63 +3,72 @@ import eml_parser from bs4 import BeautifulSoup -def list_of_files(directory_name: str) -> list[Path]: - """Function to create a list of files that are present in a directory as path objects. - - Args: - directory_name (str): The directory where the files are located. - - Returns: - list[Path]: A list of Path objects that represent the files in the directory.""" - if not os.path.exists(directory_name): # check if given dir exists raises error otherwise - raise OSError("Path {} does not exist".format(directory_name)) - mypath = Path(directory_name) - pattern = [".eml", ".html"] # we would not change the file type through user input - email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern] - if len(email_list) == 0: - raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) - return email_list - -def get_html_text(text_check: str) -> str: - """Clean up a string if it contains html content. - Args: - text_check (str): The string that may contain html content. +class InoutHandler: + @staticmethod + def list_of_files(directory_name: str) -> list[Path]: + """Function to create a list of files that are present in a directory as path objects. + + Args: + directory_name (str): The directory where the files are located. - Returns: - str: The (potentially) cleaned up string.""" - soup = BeautifulSoup(text_check , 'html.parser') - if soup.find(): - text_check = soup.get_text() - return text_check + Returns: + list[Path]: A list of Path objects that represent the files in the directory.""" + if not os.path.exists(directory_name): # check if given dir exists raises error otherwise + raise OSError("Path {} does not exist".format(directory_name)) + mypath = Path(directory_name) + pattern = [".eml", ".html"] # we would not change the file type through user input + email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern] + if len(email_list) == 0: + raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) + return email_list -def get_text(file: Path) -> str: - """Function to extract the textual content and other metadata from an email file. - - Args: - file (Path): The path to the email file. + @staticmethod + def get_html_text(text_check: str) -> str: + """Clean up a string if it contains html content. + Args: + text_check (str): The string that may contain html content. + + Returns: + str: The (potentially) cleaned up string.""" + soup = BeautifulSoup(text_check , 'html.parser') + if soup.find(): + text_check = soup.get_text() + return text_check + + @staticmethod + def get_text(file: Path) -> str: + """Function to extract the textual content and other metadata from an email file. - Returns: - str: The textual content of the email. In the future, this will return the - complete dictionary with the metadata.""" - if not file.is_file(): # check if given file exists raises error otherwise - raise OSError("File {} does not exist".format(file)) - with open(file, 'rb') as fhdl: - raw_email = fhdl.read() - ep = eml_parser.EmlParser(include_raw_body=True) - parsed_eml = ep.decode_email_bytes(raw_email) - attachmenttypes = [] - # find if there are any attachements, and if yes, how many - attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 - # find the types of attachements - if attachments > 0: - attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - email_content = {"content": parsed_eml["body"][0]["content"], - "date": parsed_eml["header"]["date"], - "attachment": attachments, - "attachement type": attachmenttypes - } - return(email_content["content"]) + Args: + file (Path): The path to the email file. + + Returns: + str: The textual content of the email. In the future, this will return the + complete dictionary with the metadata.""" + if not file.is_file(): # check if given file exists raises error otherwise + raise OSError("File {} does not exist".format(file)) + with open(file, 'rb') as fhdl: + raw_email = fhdl.read() + ep = eml_parser.EmlParser(include_raw_body=True) + parsed_eml = ep.decode_email_bytes(raw_email) + attachmenttypes = [] + # find if there are any attachements, and if yes, how many + attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 + # find the types of attachements + if attachments > 0: + attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] + email_content = {"content": parsed_eml["body"][0]["content"], + "date": parsed_eml["header"]["date"], + "attachment": attachments, + "attachement type": attachmenttypes + } + return(email_content["content"]) + def validate_data(): + return + + def data_to_xml(): + return def write_file(text: str, name: str)-> None: """Write the extracted string to a text file. diff --git a/mailcom/parse.py b/mailcom/parse.py index 0e7388c..77546f7 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -2,7 +2,7 @@ import spacy as sp from transformers import pipeline from pathlib import Path -from mailcom.inout import get_text, list_of_files, get_html_text +from mailcom import inout # please modify this section depending on your setup # input language - either "es" or "fr" @@ -116,11 +116,12 @@ def make_dir(path: str): print("Generating output directory/ies.") make_dir(path_output) # process the text - eml_files = list_of_files(path_input) + io = inout.InoutHandler() + eml_files = io.list_of_files(path_input) # html_files = list_of_files(path_input, "html") for file in eml_files: - text = get_text(file) - text = get_html_text(text) + text = io.get_text(file) + text = io.get_html_text(text) print(text) # skip this text if email could not be parsed if not text: From ec1e25db168561e1bceeebf070e82cca49fd853f Mon Sep 17 00:00:00 2001 From: ThoreOlthoff Date: Tue, 10 Sep 2024 09:25:59 +0200 Subject: [PATCH 2/5] edited tests --- mailcom/test/test_inout.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 727c94f..e21d6ed 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -1,9 +1,10 @@ -from mailcom.inout import list_of_files, get_text, get_html_text +from mailcom import inout import pytest from pathlib import Path from importlib import resources pkg = resources.files("mailcom") +io = inout.InoutHandler() FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml") TEXT_REF = "J'espère que tu vas bien!" @@ -11,15 +12,15 @@ def test_list_of_files_found(tmp_path): p = tmp_path / "test.eml" p.write_text("test") - assert len(list_of_files(tmp_path)) != 0 + assert len(io.list_of_files(tmp_path)) != 0 def test_list_of_files_empty(tmp_path): with pytest.raises(ValueError): - list_of_files(tmp_path) + io.list_of_files(tmp_path) def test_list_of_files_dir_not_existing(): with pytest.raises(OSError): - list_of_files("nonexistingDir") + io.list_of_files("nonexistingDir") def test_list_of_files_correct_format(tmp_path): p = tmp_path / "test.eml" @@ -28,29 +29,29 @@ def test_list_of_files_correct_format(tmp_path): p.write_text("test2") p = tmp_path / "test3.xml" p.write_text("test3") - assert tmp_path / "test3.xml" not in list_of_files(tmp_path) + assert tmp_path / "test3.xml" not in io.list_of_files(tmp_path) def test_get_text(tmp_path): p = tmp_path / "test.eml" p.write_text("test") - assert get_text(p) == 'test' - text = get_text(FILE_PATH) + assert io.get_text(p) == 'test' + text = io.get_text(FILE_PATH) print(text[0:25]) assert text[0:25] == TEXT_REF def test_get_text_err(): with pytest.raises(OSError): - list_of_files("nonexistingDir") + io.list_of_files("nonexistingDir") def test_get_html_text(): html = """Test""" - assert get_html_text(html) == 'Test' + assert io.get_html_text(html) == 'Test' def test_get_html_text_noHtml(): noHtml = """Test""" - assert get_html_text(noHtml) == 'Test' + assert io.get_html_text(noHtml) == 'Test' def test_get_text_no_file(tmp_path): p = tmp_path / "test.eml" with pytest.raises(OSError): - get_text(p) \ No newline at end of file + io.get_text(p) \ No newline at end of file From 70e96f39d051ed1f69fb95329daa58d0bf51677e Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 13 Sep 2024 11:07:57 +0200 Subject: [PATCH 3/5] use instant of the class for better interface --- mailcom/inout.py | 62 ++++++++++++++++++------------------ mailcom/test/test_inout.py | 64 +++++++++++++++++--------------------- 2 files changed, 60 insertions(+), 66 deletions(-) diff --git a/mailcom/inout.py b/mailcom/inout.py index 21f4162..4cc14ba 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -4,26 +4,27 @@ from bs4 import BeautifulSoup class InoutHandler: - @staticmethod - def list_of_files(directory_name: str) -> list[Path]: - """Function to create a list of files that are present in a directory as path objects. + def __init__(self, directory_name: str): + """Constructor for the InoutHandler class. Args: directory_name (str): The directory where the files are located. - - Returns: - list[Path]: A list of Path objects that represent the files in the directory.""" - if not os.path.exists(directory_name): # check if given dir exists raises error otherwise - raise OSError("Path {} does not exist".format(directory_name)) - mypath = Path(directory_name) - pattern = [".eml", ".html"] # we would not change the file type through user input - email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern] - if len(email_list) == 0: + """ + self.directory_name = directory_name + # presets + self.pattern = [".eml", ".html"] + + def list_of_files(self): + """Method to create a list of Path objects (files) that are present + in a directory.""" + if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise + raise OSError("Path {} does not exist".format(self.directory_name)) + mypath = Path(self.directory_name) + self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern] + if len(self.email_list) == 0: raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) - return email_list - @staticmethod - def get_html_text(text_check: str) -> str: + def get_html_text(self, text_check: str) -> str: """Clean up a string if it contains html content. Args: text_check (str): The string that may contain html content. @@ -35,8 +36,7 @@ def get_html_text(text_check: str) -> str: text_check = soup.get_text() return text_check - @staticmethod - def get_text(file: Path) -> str: + def get_text(self, file: Path) -> str: """Function to extract the textual content and other metadata from an email file. Args: @@ -57,24 +57,24 @@ def get_text(file: Path) -> str: # find the types of attachements if attachments > 0: attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - email_content = {"content": parsed_eml["body"][0]["content"], + self.email_content = {"content": parsed_eml["body"][0]["content"], "date": parsed_eml["header"]["date"], "attachment": attachments, "attachement type": attachmenttypes } - return(email_content["content"]) + return(self.email_content["content"]) - def validate_data(): - return + def validate_data(self): + pass - def data_to_xml(): - return + def data_to_xml(self): + pass -def write_file(text: str, name: str)-> None: - """Write the extracted string to a text file. - - Args: - text (str): The string to be written to the file. - name (str): The name of the file to be written.""" - with open("{}.out".format(name), "w") as file: - file.write(text) + def write_file(self, text: str, name: str)-> None: + """Write the extracted string to a text file. + + Args: + text (str): The string to be written to the file. + name (str): The name of the file to be written.""" + with open("{}.out".format(name), "w") as file: + file.write(text) diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index e21d6ed..a369c20 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -2,56 +2,50 @@ import pytest from pathlib import Path from importlib import resources +import datetime pkg = resources.files("mailcom") -io = inout.InoutHandler() FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml") + TEXT_REF = "J'espère que tu vas bien!" -def test_list_of_files_found(tmp_path): - p = tmp_path / "test.eml" - p.write_text("test") - assert len(io.list_of_files(tmp_path)) != 0 +@pytest.fixture() +def get_instant(tmp_path): + return inout.InoutHandler(tmp_path) -def test_list_of_files_empty(tmp_path): +def test_list_of_files(get_instant): with pytest.raises(ValueError): - io.list_of_files(tmp_path) - -def test_list_of_files_dir_not_existing(): - with pytest.raises(OSError): - io.list_of_files("nonexistingDir") - -def test_list_of_files_correct_format(tmp_path): - p = tmp_path / "test.eml" + get_instant.list_of_files() + p = get_instant.directory_name / "test.eml" p.write_text("test") - p = tmp_path / "test2.html" + get_instant.list_of_files() + assert len(get_instant.email_list) != 0 + get_instant2 = inout.InoutHandler("nonexistingDir") + with pytest.raises(OSError): + get_instant2.list_of_files() + p = get_instant.directory_name / "test2.html" p.write_text("test2") - p = tmp_path / "test3.xml" + p = get_instant.directory_name / "test3.xml" p.write_text("test3") - assert tmp_path / "test3.xml" not in io.list_of_files(tmp_path) + get_instant.list_of_files() + assert get_instant.directory_name / "test3.xml" not in get_instant.email_list -def test_get_text(tmp_path): - p = tmp_path / "test.eml" +def test_get_text(get_instant): + p = get_instant.directory_name / "test.eml" p.write_text("test") - assert io.get_text(p) == 'test' - text = io.get_text(FILE_PATH) - print(text[0:25]) + extracted_text = get_instant.get_text(p) + assert extracted_text == 'test' + text = get_instant.get_text(FILE_PATH) assert text[0:25] == TEXT_REF - -def test_get_text_err(): + assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc) + assert get_instant.email_content["attachment"] == 2 + assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] with pytest.raises(OSError): - io.list_of_files("nonexistingDir") + get_instant.get_text(get_instant.directory_name / "nonexisting.eml") -def test_get_html_text(): +def test_get_html_text(get_instant): html = """Test""" - assert io.get_html_text(html) == 'Test' - -def test_get_html_text_noHtml(): + assert get_instant.get_html_text(html) == 'Test' noHtml = """Test""" - assert io.get_html_text(noHtml) == 'Test' - -def test_get_text_no_file(tmp_path): - p = tmp_path / "test.eml" - with pytest.raises(OSError): - io.get_text(p) \ No newline at end of file + assert get_instant.get_html_text(noHtml) == 'Test' \ No newline at end of file From 1b8ec764c5099d356a8201ce5e6ab46d34eef8bd Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 20 Sep 2024 11:29:28 +0200 Subject: [PATCH 4/5] update parse to run with inouthandler class --- mailcom/parse.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mailcom/parse.py b/mailcom/parse.py index 77546f7..4226978 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -2,7 +2,7 @@ import spacy as sp from transformers import pipeline from pathlib import Path -from mailcom import inout +from mailcom.inout import InoutHandler # please modify this section depending on your setup # input language - either "es" or "fr" @@ -116,13 +116,16 @@ def make_dir(path: str): print("Generating output directory/ies.") make_dir(path_output) # process the text - io = inout.InoutHandler() - eml_files = io.list_of_files(path_input) + io = InoutHandler(path_input) + io.list_of_files() # html_files = list_of_files(path_input, "html") - for file in eml_files: + for file in io.email_list: text = io.get_text(file) text = io.get_html_text(text) print(text) + print(io.email_content["date"]) + print(io.email_content["attachment"]) + print(io.email_content["attachement type"]) # skip this text if email could not be parsed if not text: continue From 822131cf7e88fdcfee0190b45ffeb55b94372257 Mon Sep 17 00:00:00 2001 From: Thore Schoeller Date: Mon, 23 Sep 2024 15:25:29 +0200 Subject: [PATCH 5/5] Write XML-File to Output --- mailcom/parse.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mailcom/parse.py b/mailcom/parse.py index 4226978..050390b 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -3,6 +3,8 @@ from transformers import pipeline from pathlib import Path from mailcom.inout import InoutHandler +from dicttoxml import dicttoxml +from xml.dom.minidom import parseString # please modify this section depending on your setup # input language - either "es" or "fr" @@ -122,13 +124,21 @@ def make_dir(path: str): for file in io.email_list: text = io.get_text(file) text = io.get_html_text(text) - print(text) - print(io.email_content["date"]) - print(io.email_content["attachment"]) - print(io.email_content["attachement type"]) + # print(text) + # print(io.email_content["date"]) + # print(io.email_content["attachment"]) + # print(io.email_content["attachement type"]) # skip this text if email could not be parsed if not text: continue + xml = dicttoxml(io.email_content["content"]) + # xml = dicttoxml(io.email_content) Different options for review + xml_decode = xml.decode() + xmlfile = open(path_output / "dict.xml", "w") + xmlfile.write(xml_decode) + xmlfile.close() + print(parseString(xml).toprettyxml()) + # doc_spacy = nlp_spacy(text) # text = get_sentences(doc_spacy) # start with first line