Merge branch 'spacemansteve-master'

adsabs · Jan 9, 2020 · 50e39ed · 50e39ed
2 parents b24bfd6 + 3ed47e8
commit 50e39ed
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 12 deletions.
diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py
@@ -73,17 +73,27 @@ def extract_data_pipeline(data, solrdoc):
 
 
 def extract_augments_pipeline(db_augments, solrdoc):
-    """retrieve expected agumented affiliation values"""
+    """retrieve expected augmented affiliation values
+
+    aff is a solr virtual field so it should never be set"""
     if db_augments is None or len(db_augments) == 0:
         return {}
-    return {'aff': db_augments.get('aff', None),
-            'aff_abbrev': db_augments.get('aff_abbrev', None),
+    return {'aff_abbrev': db_augments.get('aff_abbrev', None),
             'aff_canonical': db_augments.get('aff_canonical', None),
             'aff_facet': db_augments.get('aff_facet', None),
             'aff_facet_hier': db_augments.get('aff_facet_hier', None),
             'aff_id': db_augments.get('aff_id', None),
+            'aff_raw': db_augments.get('aff_raw', None),
             'institution': db_augments.get('institution', None)}
 
+def modify_affiliations(data, solrdoc):
+    """Give preference to the aff_raw value set by the augment pipeline:
+    if aff_raw is set, drop aff; otherwise move a non-empty aff into aff_raw
+    """
+    if solrdoc.get('aff_raw', None):
+        solrdoc.pop('aff', None)
+    elif solrdoc.get('aff', None):
+        solrdoc['aff_raw'] = solrdoc.pop('aff', None)
 
 def extract_fulltext(data, solrdoc):
     out = {}
@@ -193,16 +203,18 @@ def get_timestamps(db_record, out):
         out['update_timestamp'] = date2solrstamp(last_update)
     return out
 
-DB_COLUMN_DESTINATIONS = OrderedDict([
+
+DB_COLUMN_DESTINATIONS = [
     ('bib_data', ''),
     ('orcid_claims', get_orcid_claims),
     ('nonbib_data', extract_data_pipeline),
     ('metrics', extract_metrics_pipeline),
     ('id', 'id'),
     ('fulltext', extract_fulltext),
     ('#timestamps', get_timestamps), # use 'id' to be always called
-    ('augments', extract_augments_pipeline)  # over aff field, adds aff_*
-    ])
+    ('augments', extract_augments_pipeline),  # adds aff_* and institution, never the virtual aff field
+    ('#affiliations', modify_affiliations)
+    ]
 
 
 def delete_by_bibcodes(bibcodes, urls):
@@ -263,7 +275,7 @@ def transform_json_record(db_record):
 
     # order timestamps (if any)
     timestamps = []
-    for k, v in DB_COLUMN_DESTINATIONS.items():
+    for k, v in DB_COLUMN_DESTINATIONS:
         ts = db_record.get(k + '_updated', None)
         if ts:
             ts = time.mktime(ts.timetuple())
@@ -285,7 +297,9 @@ def transform_json_record(db_record):
             else:
                 if target is None:
                     continue
+
                 out.update(db_record.get(field))
+
         elif field.startswith('#'):
             if callable(target):
                 x = target(db_record, out) # in the interest of speed, don't create copy of out
@@ -319,6 +333,5 @@ def transform_json_record(db_record):
             except (KeyError, ValueError):
                     # here if record holds unexpected value
                     logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(db_record['bibcode'], type(links_data), links_data))
-    return out
 
     return out
diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py
@@ -9,7 +9,6 @@
 import unittest
 import json
 import re
-import os
 import math
 import mock
 import adsputils
@@ -171,19 +170,27 @@ def test_solr_transformer(self):
              u'grants': [u'2419335 g', u'3111723 g*'],
              u'citation_count_norm': .2,
              })
+        rec = self.app.get_record('bibcode')
+        x = solr_updater.transform_json_record(rec)
+        self.assertFalse('aff' in x, 'virtual field should not be in solr output')
+        self.assertTrue(x['aff_raw'] == rec['bib_data']['aff'],
+                          'solr record should include aff from bib data when augment is not available')
+        self.assertFalse('aff_abbrev' in x,
+                         'augment field should not be in solr record when augment is not available')
+
         self.app.update_storage('bibcode', 'augment',
-                                {u'aff': [u'augment aff', u'-', u'-', u'-'],
+                                {u'aff': [u'augment pipeline aff', u'-', u'-', u'-'],
                                  u'aff_abbrev': [u'-', u'-', u'-', u'-'],
                                  u'aff_canonical': [u'-', u'-', u'-', u'-'],
                                  u'aff_facet': [u'-', u'-', u'-', u'-'],
                                  u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
                                  u'aff_id': [u'-', u'-', u'-', u'-'],
+                                 u'aff_raw': [u'augment pipeline aff', u'-', u'-', u'-'],
                                  u'institution': [u'-', u'-', u'-', u'-']})
 
         rec = self.app.get_record('bibcode')
         self.assertDictContainsSubset({u'abstract': u'abstract text',
              u'ack': u'aaa',
-             u'aff': [u'augment aff', u'-', u'-', u'-'],
              u'aff_abbrev': [u'-', u'-', u'-', u'-'],
              u'aff_canonical': [u'-', u'-', u'-', u'-'],
              u'aff_facet': [u'-', u'-', u'-', u'-'],
@@ -294,6 +301,13 @@ def test_solr_transformer(self):
             else:
                 self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')
 
+        rec = self.app.get_record('bibcode')
+        x = solr_updater.transform_json_record(rec)
+        self.assertFalse('aff' in x)  #  virtual field should not be in solr output
+        self.assertEquals(x['aff_raw'], rec['augments']['aff'])  # solr record should prioritize aff data from augment
+        self.assertEquals(x['aff_abbrev'], rec['augments']['aff_abbrev'])  # solr record should include augment data
+
+
     def test_links_data_merge(self):
         # links_data only from bib
         db_record = {'bibcode': 'foo',

diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py
@@ -129,6 +129,7 @@ def test_task_update_record_augments(self):
                 u"aff_facet": [],
                 u"aff_facet_hier": [],
                 u"aff_id": [],
+                u"aff_raw": [],
                 u"author": [
                     u"Mikhail, E. M.",
                     u"Kurtz, M. K.",

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-git+https://github.com/adsabs/ADSPipelineUtils.git@v1.0.15
+git+https://github.com/adsabs/ADSPipelineUtils.git@v1.1.0
 alembic==0.9.1
 DateTime==4.1.1
 librabbitmq==1.6.1