diff --git a/README.md b/README.md index fdcf111..d90a1b1 100644 --- a/README.md +++ b/README.md @@ -34,10 +34,11 @@ Valentine can be used to find matches among columns of a given pair of pandas Da ### Matching methods In order to do so, the user can choose one of the following 5 matching methods: -1. `Coma(int: max_n str: strategy)` is a python wrapper around [COMA 3.0 Comunity edition](https://sourceforge.net/projects/coma-ce/) +1. `Coma(int: max_n, bool: use_instances, str: java_xmx)` is a python wrapper around [COMA 3.0 Community edition](https://sourceforge.net/projects/coma-ce/) * **Parameters**: - * **max_n**(*int*) - Accept similarity threshold, default is 0. - * **strategy**(*str*) - Choice of "COMA\_OPT" (schema based matching - default) or "COMA\_OPT\_INST" (schema and instance based matching) + * **max_n**(*int*) - Accept similarity threshold, (default: 0). + * **use_instances**(*bool*) - Whether Coma will make use of the data instances or just the schema information, (default: False). + * **java_xmx**(*str*) - The amount of RAM that Coma is allowed to use, (default: "1024m"). 2. 
`Cupid(float: w_struct, float: leaf_w_struct, float: th_accept)` is the python implementation of the paper [Generic Schema Matching with Cupid](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.79.4079&rep=rep1&type=pdf) * **Parameters**: diff --git a/valentine/algorithms/coma/coma.py b/valentine/algorithms/coma/coma.py index f5d4422..4840f63 100644 --- a/valentine/algorithms/coma/coma.py +++ b/valentine/algorithms/coma/coma.py @@ -10,6 +10,10 @@ from ...utils.utils import get_project_root +class JavaException(Exception): + pass + + class Coma(BaseMatcher): def __init__(self, @@ -45,15 +49,18 @@ def __run_coma_jar(self, source_data = os.path.join(tmp_folder_path, source_table_f_name) target_data = os.path.join(tmp_folder_path, target_table_f_name) coma_output_path = os.path.join(tmp_folder_path, coma_output_path) - with open(os.path.join(tmp_folder_path, "NUL"), "w") as fh: - subprocess.call(['java', f'-Xmx{self.__java_XmX}', - '-cp', jar_path, - '-DinputFile1=' + source_data, - '-DinputFile2=' + target_data, - '-DoutputFile=' + coma_output_path, - '-DmaxN=' + str(self.__max_n), - '-Dstrategy=' + self.__strategy, - 'Main'], stdout=fh, stderr=fh) + try: + subprocess.check_output(['java', f'-Xmx{self.__java_XmX}', + '-cp', jar_path, + '-DinputFile1=' + source_data, + '-DinputFile2=' + target_data, + '-DoutputFile=' + coma_output_path, + '-DmaxN=' + str(self.__max_n), + '-Dstrategy=' + self.__strategy, + 'Main'], stderr=subprocess.DEVNULL) + except subprocess.CalledProcessError: + raise JavaException("Either Java (JRE) is not installed or Java does not have enough memory to operate. " + "Try raising the java_xmx parameter of the Coma class") def __write_schema_csv_files(self, table1: BaseTable,