From 4a37487fec0ff7171df19eb86c41f470b896b61d Mon Sep 17 00:00:00 2001
From: ThoreOlthoff <thore.olthoff@iwr.uni-heidelberg.de>
Date: Tue, 10 Sep 2024 09:22:56 +0200
Subject: [PATCH 1/5] changed inout to a class structure

---
 mailcom/inout.py | 115 +++++++++++++++++++++++++----------------------
 mailcom/parse.py |   9 ++--
 2 files changed, 67 insertions(+), 57 deletions(-)

diff --git a/mailcom/inout.py b/mailcom/inout.py
index 00a6706..21f4162 100644
--- a/mailcom/inout.py
+++ b/mailcom/inout.py
@@ -3,63 +3,72 @@
 import eml_parser
 from bs4 import BeautifulSoup
 
-def list_of_files(directory_name: str) -> list[Path]:
-    """Function to create a list of files that are present in a directory as path objects.
-    
-    Args: 
-        directory_name (str): The directory where the files are located.
-    
-    Returns:
-        list[Path]: A list of Path objects that represent the files in the directory."""
-    if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
-        raise OSError("Path {} does not exist".format(directory_name))
-    mypath = Path(directory_name)
-    pattern = [".eml", ".html"]  # we would not change the file type through user input
-    email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
-    if len(email_list) == 0:
-        raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
-    return email_list
-
-def get_html_text(text_check: str) -> str:
-    """Clean up a string if it contains html content.
-    Args:
-        text_check (str): The string that may contain html content.
+class InoutHandler:
+    @staticmethod
+    def list_of_files(directory_name: str) -> list[Path]:
+        """Function to create a list of files that are present in a directory as path objects.
+        
+        Args: 
+            directory_name (str): The directory where the files are located.
         
-    Returns:
-        str: The (potentially) cleaned up string."""
-    soup = BeautifulSoup(text_check , 'html.parser')
-    if soup.find():
-        text_check = soup.get_text()
-    return text_check
+        Returns:
+            list[Path]: A list of Path objects that represent the files in the directory."""
+        if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
+            raise OSError("Path {} does not exist".format(directory_name))
+        mypath = Path(directory_name)
+        pattern = [".eml", ".html"]  # we would not change the file type through user input
+        email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
+        if len(email_list) == 0:
+            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
+        return email_list
 
-def get_text(file: Path) -> str:
-    """Function to extract the textual content and other metadata from an email file.
-    
-    Args:
-        file (Path): The path to the email file.
+    @staticmethod
+    def get_html_text(text_check: str) -> str:
+        """Clean up a string if it contains html content.
+        Args:
+            text_check (str): The string that may contain html content.
+            
+        Returns:
+            str: The (potentially) cleaned up string."""
+        soup = BeautifulSoup(text_check , 'html.parser')
+        if soup.find():
+            text_check = soup.get_text()
+        return text_check
+
+    @staticmethod
+    def get_text(file: Path) -> str:
+        """Function to extract the textual content and other metadata from an email file.
         
-    Returns:
-        str: The textual content of the email. In the future, this will return the 
-        complete dictionary with the metadata."""
-    if not file.is_file(): # check if given file exists raises error otherwise
-        raise OSError("File {} does not exist".format(file))
-    with open(file, 'rb') as fhdl:
-        raw_email = fhdl.read()
-    ep = eml_parser.EmlParser(include_raw_body=True)
-    parsed_eml = ep.decode_email_bytes(raw_email)
-    attachmenttypes = []
-    # find if there are any attachements, and if yes, how many
-    attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
-    # find the types of attachements
-    if attachments > 0:
-        attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-    email_content = {"content": parsed_eml["body"][0]["content"], 
-                 "date": parsed_eml["header"]["date"], 
-                 "attachment": attachments, 
-                 "attachement type": attachmenttypes
-                 }
-    return(email_content["content"])
+        Args:
+            file (Path): The path to the email file.
+            
+        Returns:
+            str: The textual content of the email. In the future, this will return the 
+            complete dictionary with the metadata."""
+        if not file.is_file(): # check if given file exists raises error otherwise
+            raise OSError("File {} does not exist".format(file))
+        with open(file, 'rb') as fhdl:
+            raw_email = fhdl.read()
+        ep = eml_parser.EmlParser(include_raw_body=True)
+        parsed_eml = ep.decode_email_bytes(raw_email)
+        attachmenttypes = []
+        # find if there are any attachements, and if yes, how many
+        attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
+        # find the types of attachements
+        if attachments > 0:
+            attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
+        email_content = {"content": parsed_eml["body"][0]["content"], 
+                    "date": parsed_eml["header"]["date"], 
+                    "attachment": attachments, 
+                    "attachement type": attachmenttypes
+                    }
+        return(email_content["content"])
 
+    def validate_data():
+        return
+    
+    def data_to_xml():
+        return
 
 def write_file(text: str, name: str)-> None:
     """Write the extracted string to a text file.
diff --git a/mailcom/parse.py b/mailcom/parse.py
index 0e7388c..77546f7 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -2,7 +2,7 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from mailcom.inout import get_text, list_of_files, get_html_text
+from mailcom import inout
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -116,11 +116,12 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    eml_files = list_of_files(path_input)
+    io = inout.InoutHandler()
+    eml_files = io.list_of_files(path_input)
     # html_files = list_of_files(path_input, "html")
     for file in eml_files:
-        text = get_text(file)
-        text = get_html_text(text)
+        text = io.get_text(file)
+        text = io.get_html_text(text)
         print(text)
         # skip this text if email could not be parsed
         if not text:

From ec1e25db168561e1bceeebf070e82cca49fd853f Mon Sep 17 00:00:00 2001
From: ThoreOlthoff <thore.olthoff@iwr.uni-heidelberg.de>
Date: Tue, 10 Sep 2024 09:25:59 +0200
Subject: [PATCH 2/5] edited tests

---
 mailcom/test/test_inout.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py
index 727c94f..e21d6ed 100644
--- a/mailcom/test/test_inout.py
+++ b/mailcom/test/test_inout.py
@@ -1,9 +1,10 @@
-from mailcom.inout import list_of_files, get_text, get_html_text
+from mailcom import inout
 import pytest
 from pathlib import Path
 from importlib import resources
 
 pkg = resources.files("mailcom")
+io = inout.InoutHandler()
 
 FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")
 TEXT_REF = "J'espère que tu vas bien!"
@@ -11,15 +12,15 @@
 def test_list_of_files_found(tmp_path):
     p = tmp_path / "test.eml"
     p.write_text("test")
-    assert len(list_of_files(tmp_path)) != 0
+    assert len(io.list_of_files(tmp_path)) != 0
 
 def test_list_of_files_empty(tmp_path):
     with pytest.raises(ValueError):
-        list_of_files(tmp_path)
+        io.list_of_files(tmp_path)
 
 def test_list_of_files_dir_not_existing():
     with pytest.raises(OSError):
-        list_of_files("nonexistingDir")
+        io.list_of_files("nonexistingDir")
 
 def test_list_of_files_correct_format(tmp_path):
     p = tmp_path / "test.eml"
@@ -28,29 +29,29 @@ def test_list_of_files_correct_format(tmp_path):
     p.write_text("test2")
     p = tmp_path / "test3.xml"
     p.write_text("test3")
-    assert tmp_path / "test3.xml" not in list_of_files(tmp_path)
+    assert tmp_path / "test3.xml" not in io.list_of_files(tmp_path)
 
 def test_get_text(tmp_path):
     p = tmp_path / "test.eml"
     p.write_text("test")
-    assert get_text(p) == 'test'
-    text = get_text(FILE_PATH)
+    assert io.get_text(p) == 'test'
+    text = io.get_text(FILE_PATH)
     print(text[0:25])
     assert text[0:25] == TEXT_REF
 
 def test_get_text_err():
     with pytest.raises(OSError):
-        list_of_files("nonexistingDir")
+        io.list_of_files("nonexistingDir")
 
 def test_get_html_text():
     html = """<html><head><title>Test</title></head></html>"""
-    assert get_html_text(html) == 'Test'
+    assert io.get_html_text(html) == 'Test'
 
 def test_get_html_text_noHtml():
     noHtml = """Test"""
-    assert get_html_text(noHtml) == 'Test'
+    assert io.get_html_text(noHtml) == 'Test'
 
 def test_get_text_no_file(tmp_path):
     p = tmp_path / "test.eml"
     with pytest.raises(OSError):
-        get_text(p)
\ No newline at end of file
+        io.get_text(p)
\ No newline at end of file

From 70e96f39d051ed1f69fb95329daa58d0bf51677e Mon Sep 17 00:00:00 2001
From: Inga Ulusoy <inga.ulusoy@uni-heidelberg.de>
Date: Fri, 13 Sep 2024 11:07:57 +0200
Subject: [PATCH 3/5] use instance of the class for better interface

---
 mailcom/inout.py           | 62 ++++++++++++++++++------------------
 mailcom/test/test_inout.py | 64 +++++++++++++++++---------------------
 2 files changed, 60 insertions(+), 66 deletions(-)

diff --git a/mailcom/inout.py b/mailcom/inout.py
index 21f4162..4cc14ba 100644
--- a/mailcom/inout.py
+++ b/mailcom/inout.py
@@ -4,26 +4,27 @@
 from bs4 import BeautifulSoup
 
 class InoutHandler:
-    @staticmethod
-    def list_of_files(directory_name: str) -> list[Path]:
-        """Function to create a list of files that are present in a directory as path objects.
+    def __init__(self, directory_name: str):
+        """Constructor for the InoutHandler class.
         
         Args: 
             directory_name (str): The directory where the files are located.
-        
-        Returns:
-            list[Path]: A list of Path objects that represent the files in the directory."""
-        if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
-            raise OSError("Path {} does not exist".format(directory_name))
-        mypath = Path(directory_name)
-        pattern = [".eml", ".html"]  # we would not change the file type through user input
-        email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
-        if len(email_list) == 0:
+        """        
+        self.directory_name = directory_name
+        # presets
+        self.pattern = [".eml", ".html"]
+
+    def list_of_files(self):
+        """Method to create a list of Path objects (files) that are present 
+        in a directory."""
+        if not os.path.exists(self.directory_name):  # check if given dir exists raises error otherwise
+            raise OSError("Path {} does not exist".format(self.directory_name))
+        mypath = Path(self.directory_name)
+        self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
+        if len(self.email_list) == 0:
             raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
-        return email_list
 
-    @staticmethod
-    def get_html_text(text_check: str) -> str:
+    def get_html_text(self, text_check: str) -> str:
         """Clean up a string if it contains html content.
         Args:
             text_check (str): The string that may contain html content.
@@ -35,8 +36,7 @@ def get_html_text(text_check: str) -> str:
             text_check = soup.get_text()
         return text_check
 
-    @staticmethod
-    def get_text(file: Path) -> str:
+    def get_text(self, file: Path) -> str:
         """Function to extract the textual content and other metadata from an email file.
         
         Args:
@@ -57,24 +57,24 @@ def get_text(file: Path) -> str:
         # find the types of attachements
         if attachments > 0:
             attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-        email_content = {"content": parsed_eml["body"][0]["content"], 
+        self.email_content = {"content": parsed_eml["body"][0]["content"], 
                     "date": parsed_eml["header"]["date"], 
                     "attachment": attachments, 
                     "attachement type": attachmenttypes
                     }
-        return(email_content["content"])
+        return(self.email_content["content"])
 
-    def validate_data():
-        return
+    def validate_data(self):
+        pass
     
-    def data_to_xml():
-        return
+    def data_to_xml(self):
+        pass
 
-def write_file(text: str, name: str)-> None:
-    """Write the extracted string to a text file.
-    
-    Args:
-        text (str): The string to be written to the file.
-        name (str): The name of the file to be written."""
-    with open("{}.out".format(name), "w") as file:
-        file.write(text)
+    def write_file(self, text: str, name: str)-> None:
+        """Write the extracted string to a text file.
+
+        Args:
+            text (str): The string to be written to the file.
+            name (str): The name of the file to be written."""
+        with open("{}.out".format(name), "w") as file:
+            file.write(text)
diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py
index e21d6ed..a369c20 100644
--- a/mailcom/test/test_inout.py
+++ b/mailcom/test/test_inout.py
@@ -2,56 +2,50 @@
 import pytest
 from pathlib import Path
 from importlib import resources
+import datetime
 
 pkg = resources.files("mailcom")
-io = inout.InoutHandler()
 
 FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")
+
 TEXT_REF = "J'espère que tu vas bien!"
 
-def test_list_of_files_found(tmp_path):
-    p = tmp_path / "test.eml"
-    p.write_text("test")
-    assert len(io.list_of_files(tmp_path)) != 0
+@pytest.fixture()
+def get_instant(tmp_path):
+    return inout.InoutHandler(tmp_path)
 
-def test_list_of_files_empty(tmp_path):
+def test_list_of_files(get_instant):
     with pytest.raises(ValueError):
-        io.list_of_files(tmp_path)
-
-def test_list_of_files_dir_not_existing():
-    with pytest.raises(OSError):
-        io.list_of_files("nonexistingDir")
-
-def test_list_of_files_correct_format(tmp_path):
-    p = tmp_path / "test.eml"
+        get_instant.list_of_files()
+    p = get_instant.directory_name / "test.eml"
     p.write_text("test")
-    p = tmp_path / "test2.html"
+    get_instant.list_of_files()
+    assert len(get_instant.email_list) != 0
+    get_instant2 = inout.InoutHandler("nonexistingDir")
+    with pytest.raises(OSError):
+        get_instant2.list_of_files()
+    p = get_instant.directory_name / "test2.html"
     p.write_text("test2")
-    p = tmp_path / "test3.xml"
+    p = get_instant.directory_name / "test3.xml"
     p.write_text("test3")
-    assert tmp_path / "test3.xml" not in io.list_of_files(tmp_path)
+    get_instant.list_of_files()
+    assert get_instant.directory_name / "test3.xml" not in get_instant.email_list
 
-def test_get_text(tmp_path):
-    p = tmp_path / "test.eml"
+def test_get_text(get_instant):
+    p = get_instant.directory_name / "test.eml"
     p.write_text("test")
-    assert io.get_text(p) == 'test'
-    text = io.get_text(FILE_PATH)
-    print(text[0:25])
+    extracted_text = get_instant.get_text(p)
+    assert extracted_text == 'test'
+    text = get_instant.get_text(FILE_PATH)
     assert text[0:25] == TEXT_REF
-
-def test_get_text_err():
+    assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
+    assert get_instant.email_content["attachment"] == 2
+    assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
     with pytest.raises(OSError):
-        io.list_of_files("nonexistingDir")
+        get_instant.get_text(get_instant.directory_name / "nonexisting.eml")
 
-def test_get_html_text():
+def test_get_html_text(get_instant):
     html = """<html><head><title>Test</title></head></html>"""
-    assert io.get_html_text(html) == 'Test'
-
-def test_get_html_text_noHtml():
+    assert get_instant.get_html_text(html) == 'Test'
     noHtml = """Test"""
-    assert io.get_html_text(noHtml) == 'Test'
-
-def test_get_text_no_file(tmp_path):
-    p = tmp_path / "test.eml"
-    with pytest.raises(OSError):
-        io.get_text(p)
\ No newline at end of file
+    assert get_instant.get_html_text(noHtml) == 'Test'
\ No newline at end of file

From 1b8ec764c5099d356a8201ce5e6ab46d34eef8bd Mon Sep 17 00:00:00 2001
From: Inga Ulusoy <inga.ulusoy@uni-heidelberg.de>
Date: Fri, 20 Sep 2024 11:29:28 +0200
Subject: [PATCH 4/5] update parse to run with inouthandler class

---
 mailcom/parse.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mailcom/parse.py b/mailcom/parse.py
index 77546f7..4226978 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -2,7 +2,7 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from mailcom import inout
+from mailcom.inout import InoutHandler
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -116,13 +116,16 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    io = inout.InoutHandler()
-    eml_files = io.list_of_files(path_input)
+    io = InoutHandler(path_input)
+    io.list_of_files()
     # html_files = list_of_files(path_input, "html")
-    for file in eml_files:
+    for file in io.email_list:
         text = io.get_text(file)
         text = io.get_html_text(text)
         print(text)
+        print(io.email_content["date"])
+        print(io.email_content["attachment"])
+        print(io.email_content["attachement type"])
         # skip this text if email could not be parsed
         if not text:
             continue

From 822131cf7e88fdcfee0190b45ffeb55b94372257 Mon Sep 17 00:00:00 2001
From: Thore Schoeller <s231381@student.dhbw-mannheim.de>
Date: Mon, 23 Sep 2024 15:25:29 +0200
Subject: [PATCH 5/5] Write XML-File to Output

---
 mailcom/parse.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/mailcom/parse.py b/mailcom/parse.py
index 4226978..050390b 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -3,6 +3,8 @@
 from transformers import pipeline
 from pathlib import Path
 from mailcom.inout import InoutHandler
+from dicttoxml import dicttoxml
+from xml.dom.minidom import parseString
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -122,13 +124,21 @@ def make_dir(path: str):
     for file in io.email_list:
         text = io.get_text(file)
         text = io.get_html_text(text)
-        print(text)
-        print(io.email_content["date"])
-        print(io.email_content["attachment"])
-        print(io.email_content["attachement type"])
+        # print(text)
+        # print(io.email_content["date"])
+        # print(io.email_content["attachment"])
+        # print(io.email_content["attachement type"])
         # skip this text if email could not be parsed
         if not text:
             continue
+    xml = dicttoxml(io.email_content["content"])
+    # xml = dicttoxml(io.email_content)  Different options for review
+    xml_decode = xml.decode()
+    # context manager closes the file even if the write raises
+    with open(path_output / "dict.xml", "w") as xmlfile:
+        xmlfile.write(xml_decode)
+    print(parseString(xml).toprettyxml())
+    
         # doc_spacy = nlp_spacy(text)
         # text = get_sentences(doc_spacy)
         # start with first line