From d8f50879459e8b3824a5ddaec2b4c41e23cef4eb Mon Sep 17 00:00:00 2001 From: mugdhapolimera <35502000+mugdhapolimera@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:25:40 -0500 Subject: [PATCH] Computing doctype boost on the fly (#185) * Computing doctype boost on the fly * updated default value for doctype_boost field in solr object * updated solr checksum for test case * renamed gpn field to planetary feature * updating planetary feature test cases * Bump adsputils to v1.5.5 --------- Co-authored-by: Mugdha Polimera Co-authored-by: tjacovich --- adsmp/solr_updater.py | 43 ++++++++++++++++++++++---------- adsmp/tests/test_solr_updater.py | 39 ++++++++++++++++------------- adsmp/tests/test_tasks.py | 7 +++--- config.py | 24 ++++++++++++++++++ requirements.txt | 2 +- 5 files changed, 79 insertions(+), 36 deletions(-) diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py index 9499cf8..45a6f9d 100644 --- a/adsmp/solr_updater.py +++ b/adsmp/solr_updater.py @@ -34,10 +34,10 @@ def extract_data_pipeline(data, solrdoc): grant.append(grant_no) grant_facet_hier.extend(generate_hier_facet(agency, grant_no)) - gpn = [] - gpn_id = [] - gpn_facet_hier_2level = [] - gpn_facet_hier_3level = [] + planetary_feature = [] + planetary_feature_id = [] + planetary_feature_facet_hier_2level = [] + planetary_feature_facet_hier_3level = [] featurelist = [ "albedo feature", @@ -50,14 +50,14 @@ def extract_data_pipeline(data, solrdoc): "satellite feature", ] - for x in data.get("gpn", []): + for x in data.get("planetary_feature", []): planet, feature, feature_name, id_no = x.split("/", 3) - gpn.append("/".join([planet, feature, feature_name])) - gpn_id.append(id_no) - gpn_facet_hier_3level.extend(generate_hier_facet(planet, feature, feature_name)) + planetary_feature.append("/".join([planet, feature, feature_name])) + planetary_feature_id.append(id_no) + planetary_feature_facet_hier_3level.extend(generate_hier_facet(planet, feature, feature_name)) if feature.lower() in featurelist: feature_name = " ".join([feature, feature_name]) - gpn_facet_hier_2level.extend(generate_hier_facet(planet, feature_name)) + planetary_feature_facet_hier_2level.extend(generate_hier_facet(planet, feature_name)) uat = [] uat_id = [] @@ -119,10 +119,10 @@ def extract_data_pipeline(data, solrdoc): data_facet=[x.split(":")[0] for x in data.get("data", [])], esources=data.get("esource", []), property=data.get("property", []), - gpn=gpn, - gpn_id=gpn_id, - gpn_facet_hier_2level=gpn_facet_hier_2level, - gpn_facet_hier_3level=gpn_facet_hier_3level, + planetary_feature=planetary_feature, + planetary_feature_id=planetary_feature_id, + planetary_feature_facet_hier_2level=planetary_feature_facet_hier_2level, + planetary_feature_facet_hier_3level=planetary_feature_facet_hier_3level, uat=uat, uat_id=uat_id, uat_facet_hier=uat_facet_hier, @@ -466,6 +466,23 @@ def transform_json_record(db_record): db_record["bibcode"], type(links_data), links_data ) ) + + # Compute doctype scores on the fly + out["doctype_boost"] = None + + if config.get("DOCTYPE_RANKING", False): + doctype_rank = config.get("DOCTYPE_RANKING") + unique_ranks = sorted(set(doctype_rank.values())) + + # Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score) + rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)} + + # Assign scores to each rank + doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()} + + if "doctype" in out.keys(): + out["doctype_boost"] = doctype_scores.get(out["doctype"], None) + if config.get("ENABLE_HAS", False): # Read-in names of fields to check for solr "has:" field hasfields = sorted(config.get("HAS_FIELDS", [])) diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py index 415689d..28ee0bd 100644 --- a/adsmp/tests/test_solr_updater.py +++ b/adsmp/tests/test_solr_updater.py @@ -340,6 +340,7 @@ def test_solr_transformer(self): "volume", ], ) + self.assertEqual(round(x["doctype_boost"],3),0.857) self.app.update_storage( "bibcode", @@ -546,6 +547,8 @@ def test_solr_transformer(self): "volume", ], ) + self.assertEqual(round(x["doctype_boost"],3),0.857) + def test_links_data_merge(self): # links_data only from bib @@ -663,37 +666,37 @@ def test_extract_data_pipeline(self): d["ned_object_facet_hier"], ) - # Test simple gpn - nonbib = {"gpn": ["Moon/Crater/Langrenus/3273"]} + # Test simple planetary_feature + nonbib = {"planetary_feature": ["Moon/Crater/Langrenus/3273"]} d = solr_updater.extract_data_pipeline(nonbib, None) - self.assertEqual(["Moon/Crater/Langrenus"], d["gpn"]) - self.assertEqual(["3273"], d["gpn_id"]) + self.assertEqual(["Moon/Crater/Langrenus"], d["planetary_feature"]) + self.assertEqual(["3273"], d["planetary_feature_id"]) self.assertEqual( ["0/Moon", "1/Moon/Crater", "2/Moon/Crater/Langrenus"], - d["gpn_facet_hier_3level"], + d["planetary_feature_facet_hier_3level"], ) self.assertEqual( ["0/Moon", "1/Moon/Crater Langrenus"], - d["gpn_facet_hier_2level"], + d["planetary_feature_facet_hier_2level"], ) - # Test gpn with space in feature name - nonbib = {"gpn": ["Mars/Terra/Terra Cimmeria/5930"]} + # Test planetary_feature with space in feature name + nonbib = {"planetary_feature": ["Mars/Terra/Terra Cimmeria/5930"]} d = solr_updater.extract_data_pipeline(nonbib, None) - self.assertEqual(["Mars/Terra/Terra Cimmeria"], d["gpn"]) - self.assertEqual(["5930"], d["gpn_id"]) + self.assertEqual(["Mars/Terra/Terra Cimmeria"], d["planetary_feature"]) + self.assertEqual(["5930"], d["planetary_feature_id"]) self.assertEqual( ["0/Mars", "1/Mars/Terra", "2/Mars/Terra/Terra Cimmeria"], - d["gpn_facet_hier_3level"], + d["planetary_feature_facet_hier_3level"], ) self.assertEqual( ["0/Mars", "1/Mars/Terra Cimmeria"], - d["gpn_facet_hier_2level"], + d["planetary_feature_facet_hier_2level"], ) - # Test one bibcode with multiple gpns assigned + # Test one bibcode with multiple planetary_features assigned nonbib = { - "gpn": [ + "planetary_feature": [ "Moon/Mare/Mare Imbrium/3678", "Moon/Crater/Alder/171", "Moon/Crater/Finsen/1959", @@ -708,9 +711,9 @@ def test_extract_data_pipeline(self): "Moon/Crater/Finsen", "Moon/Crater/Leibnitz", ], - d["gpn"], + d["planetary_feature"], ) - self.assertEqual(["3678", "171", "1959", "3335"], d["gpn_id"]) + self.assertEqual(["3678", "171", "1959", "3335"], d["planetary_feature_id"]) self.assertEqual( [ "0/Moon", @@ -726,7 +729,7 @@ def test_extract_data_pipeline(self): "1/Moon/Crater", "2/Moon/Crater/Leibnitz", ], - d["gpn_facet_hier_3level"], + d["planetary_feature_facet_hier_3level"], ) self.assertEqual( [ @@ -739,7 +742,7 @@ def test_extract_data_pipeline(self): "0/Moon", "1/Moon/Crater Leibnitz", ], - d["gpn_facet_hier_2level"], + d["planetary_feature_facet_hier_2level"], ) # Test uat diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py index 6e74215..6d5ab6c 100644 --- a/adsmp/tests/test_tasks.py +++ b/adsmp/tests/test_tasks.py @@ -537,7 +537,7 @@ def test_avoid_duplicates(self): tasks.task_index_records(["foo"], force=True) self.assertEqual(update_solr.call_count, 1) - self._check_checksum("foo", solr="0x4db9a611") + self._check_checksum("foo", solr="0x8f51bd8d") # now change metrics (solr shouldn't be called) getter.return_value = { @@ -545,7 +545,7 @@ def test_avoid_duplicates(self): "metrics_updated": get_date("1972-04-02"), "bib_data_updated": get_date("1972-04-01"), "metrics": {}, - "solr_checksum": "0x4db9a611", + "solr_checksum": "0x8f51bd8d", } tasks.task_index_records(["foo"], force=True) self.assertEqual(update_solr.call_count, 1) @@ -563,7 +563,7 @@ def test_ignore_checksums_solr(self): "bibcode": "foo", "metrics_updated": get_date("1972-04-02"), "bib_data_updated": get_date("1972-04-01"), - "solr_checksum": "0x4db9a611", + "solr_checksum": "0x8f51bd8d", } # update with matching checksum and then update and ignore checksums @@ -574,7 +574,6 @@ def test_ignore_checksums_solr(self): update_links=False, ignore_checksums=False, ) - # pdb.set_trace() self.assertEqual(update_solr.call_count, 0) tasks.task_index_records( diff --git a/config.py b/config.py index 23d07b0..11ade91 100644 --- a/config.py +++ b/config.py @@ -66,3 +66,27 @@ "uat", "volume", ] + +DOCTYPE_RANKING = { + "article": 1, + "eprint": 1, + "inproceedings": 2, + "inbook": 1, + "abstract": 4, + "book": 1, + "bookreview": 4, + "catalog": 2, + "circular": 3, + "erratum": 6, + "mastersthesis": 3, + "newsletter": 5, + "obituary": 6, + "phdthesis": 3, + "pressrelease": 7, + "proceedings": 3, + "proposal": 4, + "software": 2, + "talk": 4, + "techreport": 3, + "misc": 8 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c8528cb..b7c337b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -adsputils==1.5.2 +adsputils==1.5.5 alembic==0.9.1 httplib2==0.18.1 portalocker==1.7.1