Skip to content

Commit

Permalink
Merge branch 'spacemansteve-master'
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Jan 9, 2020
2 parents b24bfd6 + 3ed47e8 commit 50e39ed
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 12 deletions.
29 changes: 21 additions & 8 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,27 @@ def extract_data_pipeline(data, solrdoc):


def extract_augments_pipeline(db_augments, solrdoc):
    """Retrieve the expected augmented affiliation values.

    ``aff`` is a solr virtual field so it should never be set; it is
    deliberately absent from the returned dict even when ``db_augments``
    carries an ``aff`` entry.

    :param db_augments: dict of values stored for the augment pipeline;
        may be None or empty when no augment data is available
    :param solrdoc: the solr document assembled so far (not used here)
    :return: empty dict when there is no augment data, otherwise a dict
        holding every augment field, with None for fields that are
        missing from ``db_augments``
    """
    if not db_augments:
        return {}

    # every field copied from the augment record; 'aff' is intentionally
    # not part of this list
    augment_fields = (
        'aff_abbrev',
        'aff_canonical',
        'aff_facet',
        'aff_facet_hier',
        'aff_id',
        'aff_raw',
        'institution',
    )
    return {name: db_augments.get(name, None) for name in augment_fields}

def modify_affiliations(data, solrdoc):
    """Make sure that preference is given to affiliations extracted
    by augment pipeline.

    A non-empty ``aff_raw`` already present in ``solrdoc`` is kept as is.
    Otherwise a non-empty ``aff`` value is moved into ``aff_raw``. In every
    case the ``aff`` key is removed from ``solrdoc``, including when it
    holds an empty value.

    :param data: the database record; not used, accepted because entries
        whose key starts with '#' are called as ``target(db_record, out)``
    :param solrdoc: solr document, modified in place
    :return: None
    """
    # always drop 'aff' so that it can never leak into the output,
    # even when it is present but empty
    aff = solrdoc.pop('aff', None)
    if aff and not solrdoc.get('aff_raw', None):
        solrdoc['aff_raw'] = aff

def extract_fulltext(data, solrdoc):
out = {}
Expand Down Expand Up @@ -193,16 +203,18 @@ def get_timestamps(db_record, out):
out['update_timestamp'] = date2solrstamp(last_update)
return out

# (source column, destination) pairs used by transform_json_record.
# The destination is either a string or a callable; keys starting with '#'
# are not looked up as columns, their callable is invoked with
# (db_record, out) instead.
# NOTE(review): '#affiliations' is listed after 'augments', presumably so it
# sees the augment values - confirm against the ordering logic in
# transform_json_record.
DB_COLUMN_DESTINATIONS = [
    ('bib_data', ''),
    ('orcid_claims', get_orcid_claims),
    ('nonbib_data', extract_data_pipeline),
    ('metrics', extract_metrics_pipeline),
    ('id', 'id'),
    ('fulltext', extract_fulltext),
    ('#timestamps', get_timestamps),  # use 'id' to be always called
    ('augments', extract_augments_pipeline),  # over aff field, adds aff_*
    ('#affiliations', modify_affiliations),
]


def delete_by_bibcodes(bibcodes, urls):
Expand Down Expand Up @@ -263,7 +275,7 @@ def transform_json_record(db_record):

# order timestamps (if any)
timestamps = []
for k, v in DB_COLUMN_DESTINATIONS.items():
for k, v in DB_COLUMN_DESTINATIONS:
ts = db_record.get(k + '_updated', None)
if ts:
ts = time.mktime(ts.timetuple())
Expand All @@ -285,7 +297,9 @@ def transform_json_record(db_record):
else:
if target is None:
continue

out.update(db_record.get(field))

elif field.startswith('#'):
if callable(target):
x = target(db_record, out) # in the interest of speed, don't create copy of out
Expand Down Expand Up @@ -319,6 +333,5 @@ def transform_json_record(db_record):
except (KeyError, ValueError):
# here if record holds unexpected value
logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(db_record['bibcode'], type(links_data), links_data))
return out

return out
20 changes: 17 additions & 3 deletions adsmp/tests/test_solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import unittest
import json
import re
import os
import math
import mock
import adsputils
Expand Down Expand Up @@ -171,19 +170,27 @@ def test_solr_transformer(self):
u'grants': [u'2419335 g', u'3111723 g*'],
u'citation_count_norm': .2,
})
rec = self.app.get_record('bibcode')
x = solr_updater.transform_json_record(rec)
self.assertFalse('aff' in x, 'virtual field should not be in solr output')
self.assertTrue(x['aff_raw'] == rec['bib_data']['aff'],
'solr record should include aff from bib data when augment is not available')
self.assertFalse('aff_abbrev' in x,
'augment field should not be in solr record when augment is not available')

self.app.update_storage('bibcode', 'augment',
{u'aff': [u'augment aff', u'-', u'-', u'-'],
{u'aff': [u'augment pipeline aff', u'-', u'-', u'-'],
u'aff_abbrev': [u'-', u'-', u'-', u'-'],
u'aff_canonical': [u'-', u'-', u'-', u'-'],
u'aff_facet': [u'-', u'-', u'-', u'-'],
u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
u'aff_id': [u'-', u'-', u'-', u'-'],
u'aff_raw': [u'augment pipeline aff', u'-', u'-', u'-'],
u'institution': [u'-', u'-', u'-', u'-']})

rec = self.app.get_record('bibcode')
self.assertDictContainsSubset({u'abstract': u'abstract text',
u'ack': u'aaa',
u'aff': [u'augment aff', u'-', u'-', u'-'],
u'aff_abbrev': [u'-', u'-', u'-', u'-'],
u'aff_canonical': [u'-', u'-', u'-', u'-'],
u'aff_facet': [u'-', u'-', u'-', u'-'],
Expand Down Expand Up @@ -294,6 +301,13 @@ def test_solr_transformer(self):
else:
self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')

rec = self.app.get_record('bibcode')
x = solr_updater.transform_json_record(rec)
self.assertFalse('aff' in x) # virtual field should not be in solr output
self.assertEquals(x['aff_raw'], rec['augments']['aff']) # solr record should prioritize aff data from augment
self.assertEquals(x['aff_abbrev'], rec['augments']['aff_abbrev']) # solr record should include augment data


def test_links_data_merge(self):
# links_data only from bib
db_record = {'bibcode': 'foo',
Expand Down
1 change: 1 addition & 0 deletions adsmp/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def test_task_update_record_augments(self):
u"aff_facet": [],
u"aff_facet_hier": [],
u"aff_id": [],
u"aff_raw": [],
u"author": [
u"Mikhail, E. M.",
u"Kurtz, M. K.",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
git+https://github.com/adsabs/ADSPipelineUtils.git@v1.0.15
git+https://github.com/adsabs/ADSPipelineUtils.git@v1.1.0
alembic==0.9.1
DateTime==4.1.1
librabbitmq==1.6.1
Expand Down

0 comments on commit 50e39ed

Please sign in to comment.