Skip to content

Commit

Permalink
xml functions sorted to appropriate class and function. Test cases exte…
Browse files Browse the repository at this point in the history
…nded
  • Loading branch information
Olthoff231381 committed Sep 24, 2024
1 parent 1731f97 commit e4e3e56
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
9 changes: 9 additions & 0 deletions data/out/dict.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!
[Inline-Bild]

[Inline-Bild]

À bientôt,

Pierre
</content></email>
8 changes: 6 additions & 2 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import eml_parser
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml

class InoutHandler:
def __init__(self, directory_name: str):
Expand Down Expand Up @@ -67,8 +68,10 @@ def get_text(self, file: Path) -> str:
def validate_data(self):
pass

def data_to_xml(self):
pass
def data_to_xml(self, text):
    """Serialize the given data to an XML string via ``dicttoxml``.

    The document root element is named ``email`` and the item-name
    callback handed to ``dicttoxml`` always answers ``content``.

    Args:
        text: The data to serialize; it is passed to ``dicttoxml`` unchanged.

    Returns:
        str: The XML produced by ``dicttoxml``, decoded to a string.
    """
    def _content_tag(parent):
        # The parent name is ignored: the tag is 'content' in every case.
        return 'content'

    # NOTE(review): the chosen dicttoxml options are still open for review.
    encoded = dicttoxml(text, custom_root='email', item_func=_content_tag)
    return encoded.decode()

def write_file(self, text: str, name: str)-> None:
"""Write the extracted string to a text file.
Expand All @@ -78,3 +81,4 @@ def write_file(self, text: str, name: str)-> None:
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)

17 changes: 4 additions & 13 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from transformers import pipeline
from pathlib import Path
from mailcom.inout import InoutHandler
from dicttoxml import dicttoxml
from xml.dom.minidom import parseString

# please modify this section depending on your setup
# input language - either "es" or "fr"
Expand All @@ -16,6 +14,7 @@
# path where the output files should be written to
# this is generated if not present yet
path_output = Path("../data/out/")
output_filename = "dict"
# the ner tool - currently only "transformers"
tool = "transformers"
# please do not modify below this section unless you know what you are doing
Expand Down Expand Up @@ -126,23 +125,15 @@ def make_dir(path: str):
for file in io.email_list:
text = io.get_text(file)
text = io.get_html_text(text)
xml = io.data_to_xml(text)
io.write_file(xml, path_output / output_filename)
# print(text)
# print(io.email_content["date"])
# print(io.email_content["attachment"])
# print(io.email_content["attachement type"])
# skip this text if email could not be parsed
if not text:
continue
xml = dicttoxml(io.email_content["content"])
# xml = dicttoxml(io.email_content) Different options for review
xml_decode = xml.decode()

if check_dir(path_output):
xmlfile = open(path_output / "dict.xml", "w")
xmlfile.write(xml_decode)
xmlfile.close()
print(parseString(xml).toprettyxml())

continue
# doc_spacy = nlp_spacy(text)
# text = get_sentences(doc_spacy)
# start with first line
Expand Down
7 changes: 6 additions & 1 deletion mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")

TEXT_REF = "J'espère que tu vas bien!"
XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"

@pytest.fixture()
def get_instant(tmp_path):
Expand Down Expand Up @@ -43,9 +44,13 @@ def test_get_text(get_instant):
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
with pytest.raises(OSError):
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")
return text

def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_instant.get_html_text(html) == 'Test'
noHtml = """Test"""
assert get_instant.get_html_text(noHtml) == 'Test'
assert get_instant.get_html_text(noHtml) == 'Test'

def test_data_to_xml(get_instant):
    """Check that data_to_xml emits the expected XML prologue for a string.

    The reference text is passed in directly. The previous version handed
    over the ``test_get_text`` function object itself, which is not email
    text, so the conversion was never exercised with real data.
    """
    xml = get_instant.data_to_xml(TEXT_REF)
    # Compare against the whole reference prefix rather than a hard-coded
    # slice length that would have to be kept in sync with XML_REF.
    assert xml.startswith(XML_REF)

0 comments on commit e4e3e56

Please sign in to comment.