From 95c4add121ac708977357deb5d9281980920be48 Mon Sep 17 00:00:00 2001 From: Olthoff231381 <147524551+Olthoff231381@users.noreply.github.com> Date: Mon, 23 Sep 2024 15:26:51 +0200 Subject: [PATCH] changed inout to a class structure (#33) * changed inout to a class structure * edited tests * use instance of the class for better interface * update parse to run with InoutHandler class --------- Co-authored-by: Inga Ulusoy --- mailcom/inout.py | 129 ++++++++++++++++++++----------------- mailcom/parse.py | 14 ++-- mailcom/test/test_inout.py | 65 +++++++++---------- 3 files changed, 108 insertions(+), 100 deletions(-) diff --git a/mailcom/inout.py b/mailcom/inout.py index 00a6706..4cc14ba 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -3,69 +3,78 @@ import eml_parser from bs4 import BeautifulSoup -def list_of_files(directory_name: str) -> list[Path]: - """Function to create a list of files that are present in a directory as path objects. - - Args: - directory_name (str): The directory where the files are located. - - Returns: - list[Path]: A list of Path objects that represent the files in the directory.""" - if not os.path.exists(directory_name): # check if given dir exists raises error otherwise - raise OSError("Path {} does not exist".format(directory_name)) - mypath = Path(directory_name) - pattern = [".eml", ".html"] # we would not change the file type through user input - email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern] - if len(email_list) == 0: - raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) - return email_list +class InoutHandler: + def __init__(self, directory_name: str): + """Constructor for the InoutHandler class. + + Args: + directory_name (str): The directory where the files are located. 
+ """ + self.directory_name = directory_name + # presets + self.pattern = [".eml", ".html"] + + def list_of_files(self): + """Method to create a list of Path objects (files) that are present + in a directory.""" + if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise + raise OSError("Path {} does not exist".format(self.directory_name)) + mypath = Path(self.directory_name) + self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern] + if len(self.email_list) == 0: + raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) -def get_html_text(text_check: str) -> str: - """Clean up a string if it contains html content. - Args: - text_check (str): The string that may contain html content. + def get_html_text(self, text_check: str) -> str: + """Clean up a string if it contains html content. + Args: + text_check (str): The string that may contain html content. + + Returns: + str: The (potentially) cleaned up string.""" + soup = BeautifulSoup(text_check , 'html.parser') + if soup.find(): + text_check = soup.get_text() + return text_check + + def get_text(self, file: Path) -> str: + """Function to extract the textual content and other metadata from an email file. - Returns: - str: The (potentially) cleaned up string.""" - soup = BeautifulSoup(text_check , 'html.parser') - if soup.find(): - text_check = soup.get_text() - return text_check + Args: + file (Path): The path to the email file. + + Returns: + str: The textual content of the email. 
In the future, this will return the + complete dictionary with the metadata.""" + if not file.is_file(): # check if given file exists raises error otherwise + raise OSError("File {} does not exist".format(file)) + with open(file, 'rb') as fhdl: + raw_email = fhdl.read() + ep = eml_parser.EmlParser(include_raw_body=True) + parsed_eml = ep.decode_email_bytes(raw_email) + attachmenttypes = [] + # find if there are any attachements, and if yes, how many + attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 + # find the types of attachements + if attachments > 0: + attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] + self.email_content = {"content": parsed_eml["body"][0]["content"], + "date": parsed_eml["header"]["date"], + "attachment": attachments, + "attachement type": attachmenttypes + } + return(self.email_content["content"]) -def get_text(file: Path) -> str: - """Function to extract the textual content and other metadata from an email file. + def validate_data(self): + pass - Args: - file (Path): The path to the email file. - - Returns: - str: The textual content of the email. 
In the future, this will return the - complete dictionary with the metadata.""" - if not file.is_file(): # check if given file exists raises error otherwise - raise OSError("File {} does not exist".format(file)) - with open(file, 'rb') as fhdl: - raw_email = fhdl.read() - ep = eml_parser.EmlParser(include_raw_body=True) - parsed_eml = ep.decode_email_bytes(raw_email) - attachmenttypes = [] - # find if there are any attachements, and if yes, how many - attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 - # find the types of attachements - if attachments > 0: - attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - email_content = {"content": parsed_eml["body"][0]["content"], - "date": parsed_eml["header"]["date"], - "attachment": attachments, - "attachement type": attachmenttypes - } - return(email_content["content"]) + def data_to_xml(self): + pass + def write_file(self, text: str, name: str)-> None: + """Write the extracted string to a text file. -def write_file(text: str, name: str)-> None: - """Write the extracted string to a text file. - - Args: - text (str): The string to be written to the file. - name (str): The name of the file to be written.""" - with open("{}.out".format(name), "w") as file: - file.write(text) + Args: + text (str): The string to be written to the file. 
+ name (str): The name of the file to be written.""" + with open("{}.out".format(name), "w") as file: + file.write(text) diff --git a/mailcom/parse.py b/mailcom/parse.py index 0e7388c..4226978 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -2,7 +2,7 @@ import spacy as sp from transformers import pipeline from pathlib import Path -from mailcom.inout import get_text, list_of_files, get_html_text +from mailcom.inout import InoutHandler # please modify this section depending on your setup # input language - either "es" or "fr" @@ -116,12 +116,16 @@ def make_dir(path: str): print("Generating output directory/ies.") make_dir(path_output) # process the text - eml_files = list_of_files(path_input) + io = InoutHandler(path_input) + io.list_of_files() # html_files = list_of_files(path_input, "html") - for file in eml_files: - text = get_text(file) - text = get_html_text(text) + for file in io.email_list: + text = io.get_text(file) + text = io.get_html_text(text) print(text) + print(io.email_content["date"]) + print(io.email_content["attachment"]) + print(io.email_content["attachement type"]) # skip this text if email could not be parsed if not text: continue diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 727c94f..a369c20 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -1,56 +1,51 @@ -from mailcom.inout import list_of_files, get_text, get_html_text +from mailcom import inout import pytest from pathlib import Path from importlib import resources +import datetime pkg = resources.files("mailcom") FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml") + TEXT_REF = "J'espère que tu vas bien!" 
-def test_list_of_files_found(tmp_path): - p = tmp_path / "test.eml" - p.write_text("test") - assert len(list_of_files(tmp_path)) != 0 +@pytest.fixture() +def get_instant(tmp_path): + return inout.InoutHandler(tmp_path) -def test_list_of_files_empty(tmp_path): +def test_list_of_files(get_instant): with pytest.raises(ValueError): - list_of_files(tmp_path) - -def test_list_of_files_dir_not_existing(): - with pytest.raises(OSError): - list_of_files("nonexistingDir") - -def test_list_of_files_correct_format(tmp_path): - p = tmp_path / "test.eml" + get_instant.list_of_files() + p = get_instant.directory_name / "test.eml" p.write_text("test") - p = tmp_path / "test2.html" + get_instant.list_of_files() + assert len(get_instant.email_list) != 0 + get_instant2 = inout.InoutHandler("nonexistingDir") + with pytest.raises(OSError): + get_instant2.list_of_files() + p = get_instant.directory_name / "test2.html" p.write_text("test2") - p = tmp_path / "test3.xml" + p = get_instant.directory_name / "test3.xml" p.write_text("test3") - assert tmp_path / "test3.xml" not in list_of_files(tmp_path) + get_instant.list_of_files() + assert get_instant.directory_name / "test3.xml" not in get_instant.email_list -def test_get_text(tmp_path): - p = tmp_path / "test.eml" +def test_get_text(get_instant): + p = get_instant.directory_name / "test.eml" p.write_text("test") - assert get_text(p) == 'test' - text = get_text(FILE_PATH) - print(text[0:25]) + extracted_text = get_instant.get_text(p) + assert extracted_text == 'test' + text = get_instant.get_text(FILE_PATH) assert text[0:25] == TEXT_REF - -def test_get_text_err(): + assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc) + assert get_instant.email_content["attachment"] == 2 + assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] with pytest.raises(OSError): - list_of_files("nonexistingDir") + get_instant.get_text(get_instant.directory_name / "nonexisting.eml") 
-def test_get_html_text(): +def test_get_html_text(get_instant): html = """Test""" - assert get_html_text(html) == 'Test' - -def test_get_html_text_noHtml(): + assert get_instant.get_html_text(html) == 'Test' noHtml = """Test""" - assert get_html_text(noHtml) == 'Test' - -def test_get_text_no_file(tmp_path): - p = tmp_path / "test.eml" - with pytest.raises(OSError): - get_text(p) \ No newline at end of file + assert get_instant.get_html_text(noHtml) == 'Test' \ No newline at end of file