Skip to content

Commit

Permalink
Computing doctype boost on the fly (#185)
Browse files Browse the repository at this point in the history
* Computing doctype boost on the fly

* updated default value for doctype_boost field in solr object

* updated solr checksum for test case

* renamed gpn field to planetary feature

* updating planetary feature test cases

* Bump adsputils to v1.5.5

---------

Co-authored-by: Mugdha Polimera <[email protected]>
Co-authored-by: tjacovich <[email protected]>
  • Loading branch information
3 people authored Jan 13, 2025
1 parent 06f79b0 commit d8f5087
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 36 deletions.
43 changes: 30 additions & 13 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ def extract_data_pipeline(data, solrdoc):
grant.append(grant_no)
grant_facet_hier.extend(generate_hier_facet(agency, grant_no))

gpn = []
gpn_id = []
gpn_facet_hier_2level = []
gpn_facet_hier_3level = []
planetary_feature = []
planetary_feature_id = []
planetary_feature_facet_hier_2level = []
planetary_feature_facet_hier_3level = []

featurelist = [
"albedo feature",
Expand All @@ -50,14 +50,14 @@ def extract_data_pipeline(data, solrdoc):
"satellite feature",
]

for x in data.get("gpn", []):
for x in data.get("planetary_feature", []):
planet, feature, feature_name, id_no = x.split("/", 3)
gpn.append("/".join([planet, feature, feature_name]))
gpn_id.append(id_no)
gpn_facet_hier_3level.extend(generate_hier_facet(planet, feature, feature_name))
planetary_feature.append("/".join([planet, feature, feature_name]))
planetary_feature_id.append(id_no)
planetary_feature_facet_hier_3level.extend(generate_hier_facet(planet, feature, feature_name))
if feature.lower() in featurelist:
feature_name = " ".join([feature, feature_name])
gpn_facet_hier_2level.extend(generate_hier_facet(planet, feature_name))
planetary_feature_facet_hier_2level.extend(generate_hier_facet(planet, feature_name))

uat = []
uat_id = []
Expand Down Expand Up @@ -119,10 +119,10 @@ def extract_data_pipeline(data, solrdoc):
data_facet=[x.split(":")[0] for x in data.get("data", [])],
esources=data.get("esource", []),
property=data.get("property", []),
gpn=gpn,
gpn_id=gpn_id,
gpn_facet_hier_2level=gpn_facet_hier_2level,
gpn_facet_hier_3level=gpn_facet_hier_3level,
planetary_feature=planetary_feature,
planetary_feature_id=planetary_feature_id,
planetary_feature_facet_hier_2level=planetary_feature_facet_hier_2level,
planetary_feature_facet_hier_3level=planetary_feature_facet_hier_3level,
uat=uat,
uat_id=uat_id,
uat_facet_hier=uat_facet_hier,
Expand Down Expand Up @@ -466,6 +466,23 @@ def transform_json_record(db_record):
db_record["bibcode"], type(links_data), links_data
)
)

# Compute the doctype boost on the fly from the configured doctype ranking.
# The boost stays None when no ranking is configured, when the record has no
# doctype, or when the record's doctype is not listed in the ranking.
out["doctype_boost"] = None

doctype_rank = config.get("DOCTYPE_RANKING")
if doctype_rank and "doctype" in out:
    rank = doctype_rank.get(out["doctype"])
    if rank is not None:
        unique_ranks = sorted(set(doctype_rank.values()))
        if len(unique_ranks) > 1:
            # Map ranks to scores evenly spaced between 0 and 1
            # (inverted: the lowest rank gets the highest score).
            position = unique_ranks.index(rank)
            out["doctype_boost"] = 1 - (position / (len(unique_ranks) - 1))
        else:
            # Only one distinct rank is configured: the spacing formula
            # would divide by zero, so every ranked doctype gets the top
            # score instead.
            out["doctype_boost"] = 1.0

if config.get("ENABLE_HAS", False):
# Read-in names of fields to check for solr "has:" field
hasfields = sorted(config.get("HAS_FIELDS", []))
Expand Down
39 changes: 21 additions & 18 deletions adsmp/tests/test_solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ def test_solr_transformer(self):
"volume",
],
)
self.assertEqual(round(x["doctype_boost"],3),0.857)

self.app.update_storage(
"bibcode",
Expand Down Expand Up @@ -546,6 +547,8 @@ def test_solr_transformer(self):
"volume",
],
)
self.assertEqual(round(x["doctype_boost"],3),0.857)


def test_links_data_merge(self):
# links_data only from bib
Expand Down Expand Up @@ -663,37 +666,37 @@ def test_extract_data_pipeline(self):
d["ned_object_facet_hier"],
)

# Test simple gpn
nonbib = {"gpn": ["Moon/Crater/Langrenus/3273"]}
# Test simple planetary_feature
nonbib = {"planetary_feature": ["Moon/Crater/Langrenus/3273"]}
d = solr_updater.extract_data_pipeline(nonbib, None)
self.assertEqual(["Moon/Crater/Langrenus"], d["gpn"])
self.assertEqual(["3273"], d["gpn_id"])
self.assertEqual(["Moon/Crater/Langrenus"], d["planetary_feature"])
self.assertEqual(["3273"], d["planetary_feature_id"])
self.assertEqual(
["0/Moon", "1/Moon/Crater", "2/Moon/Crater/Langrenus"],
d["gpn_facet_hier_3level"],
d["planetary_feature_facet_hier_3level"],
)
self.assertEqual(
["0/Moon", "1/Moon/Crater Langrenus"],
d["gpn_facet_hier_2level"],
d["planetary_feature_facet_hier_2level"],
)

# Test gpn with space in feature name
nonbib = {"gpn": ["Mars/Terra/Terra Cimmeria/5930"]}
# Test planetary_feature with space in feature name
nonbib = {"planetary_feature": ["Mars/Terra/Terra Cimmeria/5930"]}
d = solr_updater.extract_data_pipeline(nonbib, None)
self.assertEqual(["Mars/Terra/Terra Cimmeria"], d["gpn"])
self.assertEqual(["5930"], d["gpn_id"])
self.assertEqual(["Mars/Terra/Terra Cimmeria"], d["planetary_feature"])
self.assertEqual(["5930"], d["planetary_feature_id"])
self.assertEqual(
["0/Mars", "1/Mars/Terra", "2/Mars/Terra/Terra Cimmeria"],
d["gpn_facet_hier_3level"],
d["planetary_feature_facet_hier_3level"],
)
self.assertEqual(
["0/Mars", "1/Mars/Terra Cimmeria"],
d["gpn_facet_hier_2level"],
d["planetary_feature_facet_hier_2level"],
)

# Test one bibcode with multiple gpns assigned
# Test one bibcode with multiple planetary_features assigned
nonbib = {
"gpn": [
"planetary_feature": [
"Moon/Mare/Mare Imbrium/3678",
"Moon/Crater/Alder/171",
"Moon/Crater/Finsen/1959",
Expand All @@ -708,9 +711,9 @@ def test_extract_data_pipeline(self):
"Moon/Crater/Finsen",
"Moon/Crater/Leibnitz",
],
d["gpn"],
d["planetary_feature"],
)
self.assertEqual(["3678", "171", "1959", "3335"], d["gpn_id"])
self.assertEqual(["3678", "171", "1959", "3335"], d["planetary_feature_id"])
self.assertEqual(
[
"0/Moon",
Expand All @@ -726,7 +729,7 @@ def test_extract_data_pipeline(self):
"1/Moon/Crater",
"2/Moon/Crater/Leibnitz",
],
d["gpn_facet_hier_3level"],
d["planetary_feature_facet_hier_3level"],
)
self.assertEqual(
[
Expand All @@ -739,7 +742,7 @@ def test_extract_data_pipeline(self):
"0/Moon",
"1/Moon/Crater Leibnitz",
],
d["gpn_facet_hier_2level"],
d["planetary_feature_facet_hier_2level"],
)

# Test uat
Expand Down
7 changes: 3 additions & 4 deletions adsmp/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,15 +537,15 @@ def test_avoid_duplicates(self):
tasks.task_index_records(["foo"], force=True)

self.assertEqual(update_solr.call_count, 1)
self._check_checksum("foo", solr="0x4db9a611")
self._check_checksum("foo", solr="0x8f51bd8d")

# now change metrics (solr shouldn't be called)
getter.return_value = {
"bibcode": "foo",
"metrics_updated": get_date("1972-04-02"),
"bib_data_updated": get_date("1972-04-01"),
"metrics": {},
"solr_checksum": "0x4db9a611",
"solr_checksum": "0x8f51bd8d",
}
tasks.task_index_records(["foo"], force=True)
self.assertEqual(update_solr.call_count, 1)
Expand All @@ -563,7 +563,7 @@ def test_ignore_checksums_solr(self):
"bibcode": "foo",
"metrics_updated": get_date("1972-04-02"),
"bib_data_updated": get_date("1972-04-01"),
"solr_checksum": "0x4db9a611",
"solr_checksum": "0x8f51bd8d",
}

# update with matching checksum and then update and ignore checksums
Expand All @@ -574,7 +574,6 @@ def test_ignore_checksums_solr(self):
update_links=False,
ignore_checksums=False,
)
# pdb.set_trace()

self.assertEqual(update_solr.call_count, 0)
tasks.task_index_records(
Expand Down
24 changes: 24 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,27 @@
"uat",
"volume",
]

# Rank assigned to each doctype; read by the solr updater to compute the
# "doctype_boost" field on the fly.
# - The distinct rank values are sorted ascending and mapped to scores evenly
#   spaced from 1 down to 0, so rank 1 yields the highest boost and the
#   largest rank yields 0.
# - Doctypes that share a rank receive the same boost.
# - A doctype that is not listed here gets a doctype_boost of None.
DOCTYPE_RANKING = {
"article": 1,
"eprint": 1,
"inproceedings": 2,
"inbook": 1,
"abstract": 4,
"book": 1,
"bookreview": 4,
"catalog": 2,
"circular": 3,
"erratum": 6,
"mastersthesis": 3,
"newsletter": 5,
"obituary": 6,
"phdthesis": 3,
"pressrelease": 7,
"proceedings": 3,
"proposal": 4,
"software": 2,
"talk": 4,
"techreport": 3,
"misc": 8
}
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
adsputils==1.5.2
adsputils==1.5.5
alembic==0.9.1
httplib2==0.18.1
portalocker==1.7.1
Expand Down

0 comments on commit d8f5087

Please sign in to comment.