From ceaa8d8e01cfde06f06b3da14346fdb52f50e459 Mon Sep 17 00:00:00 2001 From: Motoki Saito Date: Thu, 26 Sep 2024 14:25:47 +0200 Subject: [PATCH 1/4] Refactors functions in performance.py - accuracy now has keyword-only arguments. - predict_df now has keyword-only arguments. - `accuracy (*, pred, gold, method='correlation')` (old: `accuracy (hat, mat, distance=False)`) - `predict_df (*, pred, gold, n=1, method='correlation')` (old: `predict_df (hat, mat, max_guess=1, distance=False, method='cosine')`) - Adds docstrings to `predict_df`. --- discriminative_lexicon_model/performance.py | 104 ++++++++++++-------- 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/discriminative_lexicon_model/performance.py b/discriminative_lexicon_model/performance.py index 954d771..d91a36d 100644 --- a/discriminative_lexicon_model/performance.py +++ b/discriminative_lexicon_model/performance.py @@ -3,40 +3,72 @@ import xarray as xr import scipy.spatial.distance as spd -def accuracy (hat, mat, distance=False): - pred = predict_df(hat, mat, max_guess=1, distance=distance) +def accuracy (*, pred, gold, method='correlation'): + pred = predict_df(pred=pred, gold=gold, n=1, method=method) acc = pred.acc.sum() / len(pred) return acc -def predict_df (hat, mat, max_guess=1, distance=False, method='cosine'): - if not isinstance(max_guess, int): raise TypeError('"max_guess" must be integer') - coss = distance_matrix(pred=hat, gold=mat, method=method).values - if distance: - pos1 = [np.argmin(coss, axis=1)] - sign = 1 - else: - coss = 1 - coss - pos1 = [np.argmax(coss, axis=1)] - sign = -1 +def predict_df (*, pred, gold, n=1, method='correlation'): + """ + Constructs a dataframe of predictions. 
- if max_guess>1: - pos = [ np.apply_along_axis(lambda x: np.argsort(x)[(sign*i)], 1, coss) for i in range(2,max_guess+1) ] - else: - pos = [] - pos = pos1 + pos - prds = [ [ mat.word.values[j] for j in i ] for i in pos ] - hits = [ [ j==k for j,k in zip(i,hat.word.values) ] for i in prds ] - if len(prds)==1: - prds = [ pd.DataFrame({'pred':j}) for j in prds ] - hits = [ pd.DataFrame({'acc':j}) for j in hits ] - else: - prds = [ pd.DataFrame({'pred{:d}'.format(i+1):j}) for i,j in enumerate(prds) ] - hits = [ pd.DataFrame({'acc{:d}'.format(i+1):j}) for i,j in enumerate(hits) ] - prds = pd.concat(prds, axis=1) - hits = pd.concat(hits, axis=1) - wrds = pd.DataFrame({'Word':hat.word.values}) - dddd = pd.concat([wrds,prds,hits], axis=1) - return dddd + Parameters + ---------- + pred : xarray.core.dataarray.DataArray + A matrix of predictions. It is usually a C-hat or S-hat matrix. + gold : xarray.core.dataarray.DataArray + A matrix of gold-standard vectors. It is usually a C or S matrix. + n : int or None + The number of predictions to make for each word. When n=1, the first prediction for each word will be produced. When n=2, the first and second predictions for each word will be included in the output dataframe. When n=None, as many predictions as possible will be produced. + method : str + Which method to use to calculate distance/similarity. It must be "correlation", "cosine" (for cosine similarity), or "euclidean" (for euclidean distance). + + Returns + ------- + df : pandas.core.frame.DataFrame + A dataframe of a model's predictions. 
+ + Examples + -------- + >>> import discriminative_lexicon_model as dlm + >>> import pandas as pd + >>> words = ['cat','rat','hat'] + >>> sems = pd.DataFrame({'':[1,1,0], '':[0,0,1], '':[1,0,0]}, index=words) + >>> mdl = dlm.ldl.LDL() + >>> mdl.gen_cmat(words) + >>> mdl.gen_smat(sems) + >>> mdl.gen_gmat() + >>> mdl.gen_chat() + >>> dlm.performance.predict_df(pred=mdl.chat, gold=mdl.cmat, n=2, method='correlation') + Word Pred1 Pred2 Correct1 Correct2 + 0 cat cat hat True False + 1 rat rat hat True False + 2 hat hat cat True False + """ + if not (method in ['correlation', 'cosine', 'euclidean']): + raise ValueError('"method" must be "correlation", "cosine", or "euclidean".') + if not (n is None): + if not isinstance(n, int): + raise TypeError('"n" must be integer or None.') + if not (n>0): + raise ValueError('"n" must be a positive integer.') + n = pred.shape[0] if n is None else n + + dist = distance_matrix(pred=pred, gold=gold, method=method).values + dist = dist if method=='euclidean' else 1-dist + inds = dist.argsort(axis=1) if method=='euclidean' else (-dist).argsort(axis=1) + inds = inds[:,:n] + + prds = np.apply_along_axis(lambda x: gold.word.values[x], 1, inds) + hits = np.array([ prds[i,:]==j for i,j in zip(range(prds.shape[0]), gold.word.values) ]) + + clms = ['Pred'] if prds.shape[1]==1 else [ 'Pred{:d}'.format(i) for i in range(1, prds.shape[1]+1) ] + prds = pd.DataFrame(prds, columns=clms) + clms = ['Correct'] if hits.shape[1]==1 else [ 'Correct{:d}'.format(i) for i in range(1, hits.shape[1]+1) ] + hits = pd.DataFrame(hits, columns=clms) + wrds = pd.DataFrame({'Word':gold.word.values}) + df = pd.concat([wrds, prds, hits], axis=1) + return df def distance_matrix (*, pred, gold, method='cosine'): """ @@ -70,15 +102,3 @@ def distance_matrix (*, pred, gold, method='cosine'): dist = xr.DataArray(dist, dims=('pred','gold'), coords=new_coords) return dist -def predict (word, hat, mat, distance=False): - hat = np.tile(hat.loc[word,:], (1,1)) - coss = 
spd.cdist(np.array(hat), np.array(mat), 'cosine') - if distance: - sign = 1 - else: - coss = 1 - coss - sign = -1 - coss = coss[0,:] - pred = mat.word.values[np.argsort(sign*coss)] - return pd.Series(pred) - From 4cbf0256652dcab5dcd022d4e6faf7bb160409c2 Mon Sep 17 00:00:00 2001 From: Motoki Saito Date: Thu, 26 Sep 2024 14:34:34 +0200 Subject: [PATCH 2/4] Increments the version number & cleans pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 59746fa..d9679a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "discriminative_lexicon_model" -version = "1.4.3" +version = "2.0,0" description = "Python-implementation of Discriminative Lexicon Model / Linear Discriminative Learning" license = "MIT" @@ -40,5 +40,5 @@ sphinx = ">=7.3" sphinx_rtd_theme = ">=2.0" [build-system] -requires = ["poetry-core", "setuptools", "Cython", "numpy"] +requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From cc81856d8cdb3c488988fcc7bff258e53b596f91 Mon Sep 17 00:00:00 2001 From: Motoki Saito Date: Thu, 26 Sep 2024 15:11:12 +0200 Subject: [PATCH 3/4] Fixes a small bug in performance.py; fixes tests --- discriminative_lexicon_model/performance.py | 2 +- tests/resources/predict_df_00.csv | 2 +- tests/resources/predict_df_01.csv | 14 ++++----- tests/resources/predict_df_02.csv | 14 ++++----- tests/resources/predict_df_03.csv | 14 ++++----- tests/resources/predict_df_04.csv | 7 ----- tests/resources/predict_df_05.csv | 7 ----- tests/resources/predict_df_06.csv | 7 ----- tests/resources/predict_df_07.csv | 7 ----- tests/test_ldl.py | 4 +-- tests/test_mapping.py | 2 +- tests/test_measures.py | 4 +-- tests/test_performance.py | 35 ++++++--------------- 13 files changed, 38 insertions(+), 81 deletions(-) delete mode 100644 tests/resources/predict_df_04.csv delete mode 100644 tests/resources/predict_df_05.csv delete mode 100644 
tests/resources/predict_df_06.csv delete mode 100644 tests/resources/predict_df_07.csv diff --git a/discriminative_lexicon_model/performance.py b/discriminative_lexicon_model/performance.py index d91a36d..65dc33c 100644 --- a/discriminative_lexicon_model/performance.py +++ b/discriminative_lexicon_model/performance.py @@ -5,7 +5,7 @@ def accuracy (*, pred, gold, method='correlation'): pred = predict_df(pred=pred, gold=gold, n=1, method=method) - acc = pred.acc.sum() / len(pred) + acc = pred.Correct.sum() / len(pred) return acc def predict_df (*, pred, gold, n=1, method='correlation'): diff --git a/tests/resources/predict_df_00.csv b/tests/resources/predict_df_00.csv index f598cc2..58a0517 100644 --- a/tests/resources/predict_df_00.csv +++ b/tests/resources/predict_df_00.csv @@ -1,4 +1,4 @@ -Word pred acc +Word Pred Correct walk0 walk0 True walk1 walk0 False walks walks True diff --git a/tests/resources/predict_df_01.csv b/tests/resources/predict_df_01.csv index f598cc2..3e12b37 100644 --- a/tests/resources/predict_df_01.csv +++ b/tests/resources/predict_df_01.csv @@ -1,7 +1,7 @@ -Word pred acc -walk0 walk0 True -walk1 walk0 False -walks walks True -walked0 walked0 True -walked1 walked0 False -walked2 walked0 False +Word Pred1 Pred2 Correct1 Correct2 +walk0 walk0 walk1 True False +walk1 walk0 walk1 False True +walks walks walk0 True False +walked0 walked0 walked1 True False +walked1 walked0 walked1 False True +walked2 walked0 walked1 False False diff --git a/tests/resources/predict_df_02.csv b/tests/resources/predict_df_02.csv index 18d16ce..9919ac6 100644 --- a/tests/resources/predict_df_02.csv +++ b/tests/resources/predict_df_02.csv @@ -1,7 +1,7 @@ -Word pred1 pred2 acc1 acc2 -walk0 walk0 walks True False -walk1 walk0 walks False False -walks walks walk1 True False -walked0 walked0 walked2 True False -walked1 walked0 walked2 False False -walked2 walked0 walked2 False True +Word Pred Correct +walk0 walk1 False +walk1 walk1 True +walks walks True +walked0 walked2 
False +walked1 walked2 False +walked2 walked2 True diff --git a/tests/resources/predict_df_03.csv b/tests/resources/predict_df_03.csv index d925566..72ab833 100644 --- a/tests/resources/predict_df_03.csv +++ b/tests/resources/predict_df_03.csv @@ -1,7 +1,7 @@ -Word pred1 pred2 acc1 acc2 -walk0 walk0 walk0 True True -walk1 walk0 walk0 False False -walks walks walk1 True False -walked0 walked0 walked1 True False -walked1 walked0 walked1 False True -walked2 walked0 walked1 False False +Word Pred1 Pred2 Correct1 Correct2 +walk0 walk1 walk0 False True +walk1 walk1 walk0 True False +walks walks walked2 True False +walked0 walked2 walked0 False True +walked1 walked2 walked0 False False +walked2 walked2 walked0 True False diff --git a/tests/resources/predict_df_04.csv b/tests/resources/predict_df_04.csv deleted file mode 100644 index 36125e4..0000000 --- a/tests/resources/predict_df_04.csv +++ /dev/null @@ -1,7 +0,0 @@ -Word pred acc -walk0 walk1 False -walk1 walk1 True -walks walks True -walked0 walked2 False -walked1 walked2 False -walked2 walked2 True diff --git a/tests/resources/predict_df_05.csv b/tests/resources/predict_df_05.csv deleted file mode 100644 index 36125e4..0000000 --- a/tests/resources/predict_df_05.csv +++ /dev/null @@ -1,7 +0,0 @@ -Word pred acc -walk0 walk1 False -walk1 walk1 True -walks walks True -walked0 walked2 False -walked1 walked2 False -walked2 walked2 True diff --git a/tests/resources/predict_df_06.csv b/tests/resources/predict_df_06.csv deleted file mode 100644 index a02e187..0000000 --- a/tests/resources/predict_df_06.csv +++ /dev/null @@ -1,7 +0,0 @@ -Word pred1 pred2 acc1 acc2 -walk0 walk1 walks False False -walk1 walk1 walks True False -walks walks walk1 True False -walked0 walked2 walked1 False False -walked1 walked2 walked1 False True -walked2 walked2 walked1 True False diff --git a/tests/resources/predict_df_07.csv b/tests/resources/predict_df_07.csv deleted file mode 100644 index a668b9b..0000000 --- 
a/tests/resources/predict_df_07.csv +++ /dev/null @@ -1,7 +0,0 @@ -Word pred1 pred2 acc1 acc2 -walk0 walk1 walk0 False True -walk1 walk1 walk0 True False -walks walks walked2 True False -walked0 walked2 walked0 False True -walked1 walked2 walked0 False False -walked2 walked2 walked0 True False diff --git a/tests/test_ldl.py b/tests/test_ldl.py index c74c201..2fff6f0 100644 --- a/tests/test_ldl.py +++ b/tests/test_ldl.py @@ -3,9 +3,9 @@ import os import pandas as pd from pathlib import Path -import pyldl.mapping as pm +import discriminative_lexicon_model.mapping as pm import xarray as xr -from pyldl.ldl import LDL +from discriminative_lexicon_model.ldl import LDL TEST_ROOT = Path('.') #TEST_ROOT = Path(__file__).parent diff --git a/tests/test_mapping.py b/tests/test_mapping.py index cec3d53..0eb0ad0 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -3,7 +3,7 @@ import os import pandas as pd from pathlib import Path -import pyldl.mapping as pm +import discriminative_lexicon_model.mapping as pm import xarray as xr # TEST_ROOT = Path('.') diff --git a/tests/test_measures.py b/tests/test_measures.py index 2a105dd..6cc927c 100644 --- a/tests/test_measures.py +++ b/tests/test_measures.py @@ -3,8 +3,8 @@ import os import pandas as pd from pathlib import Path -import pyldl.mapping as pm -import pyldl.measures as lmea +import discriminative_lexicon_model.mapping as pm +import discriminative_lexicon_model.measures as lmea import xarray as xr TEST_ROOT = Path(__file__).parent diff --git a/tests/test_performance.py b/tests/test_performance.py index 4e49801..f3bf120 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -3,8 +3,8 @@ import os import pandas as pd from pathlib import Path -import pyldl.mapping as pm -import pyldl.performance as lper +import discriminative_lexicon_model.mapping as pm +import discriminative_lexicon_model.performance as lper import xarray as xr TEST_ROOT = Path(__file__).parent @@ -29,37 +29,22 @@ def expand_to_mats 
(x): return ret mats = ['c', 's'] -dist = [True, False] -pars = [ (i,j) for i in mats for j in dist ] -pars = [ tuple(expand_to_mats(i[0]) + [i[1]]) for i in pars ] -@pytest.mark.parametrize('hat, mat, dist', pars) -def test_accuracy (hat, mat, dist): - assert lper.accuracy(hat, mat, dist) == 0.5 +pars = [ tuple(expand_to_mats(i[0])) for i in mats ] +@pytest.mark.parametrize('hat, mat', pars) +def test_accuracy (hat, mat): + assert lper.accuracy(pred=hat, gold=mat) == 0.5 mats = ['c', 's'] gues = [1, 2] -dist = [True, False] -pars = [ [i,j,k] for i in mats for j in gues for k in dist ] +pars = [ [i,j] for i in mats for j in gues ] pars = [ tuple(expand_to_mats(i[0]) + i[1:]) for i in pars ] pars = [ [i,*j] for i,j in enumerate(pars) ] -@pytest.mark.parametrize('ind, hat, mat, gues, dist', pars) -def test_accuracy (ind, hat, mat, gues, dist): - pred = lper.predict_df(hat, mat, gues, dist) +@pytest.mark.parametrize('ind, hat, mat, gues', pars) +def test_accuracy_df (ind, hat, mat, gues): + pred = lper.predict_df(pred=hat, gold=mat, n=gues) _prd = '{}/predict_df_{:02d}.csv'.format(RESOURCES, ind) _prd = pd.read_csv(_prd, sep='\t', header=0) assert pred.equals(_prd) -wrds = ['walk0', 'walks'] -mats = ['c', 's'] -dist = [True, False] -pars = [ [i,j,k] for i in wrds for j in mats for k in dist ] -pars = [ [i[0]] + expand_to_mats(i[1]) + i[2:] for i in pars ] -pars = [ [i,*j] for i,j in enumerate(pars) ] -@pytest.mark.parametrize('ind, wrd, hat, mat, dist', pars) -def test_predict (ind, wrd, hat, mat, dist): - pred = lper.predict(wrd, hat, mat, dist) - _prd = '{}/predict_{:02d}.csv'.format(RESOURCES, ind) - _prd = pd.read_csv(_prd, sep='\t', header=None).squeeze('columns') - assert pred.equals(_prd) From ed2ea9715225ec0e2d3bf1844d56ebf3d92e8a99 Mon Sep 17 00:00:00 2001 From: Motoki Saito Date: Thu, 26 Sep 2024 15:20:16 +0200 Subject: [PATCH 4/4] Fixes a typo in the version number --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index d9679a7..6e63433 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "discriminative_lexicon_model" -version = "2.0,0" +version = "2.0.1" description = "Python-implementation of Discriminative Lexicon Model / Linear Discriminative Learning" license = "MIT"