Skip to content

Commit

Permalink
Spacy batch keyword extraction (#1581)
Browse files Browse the repository at this point in the history
* Add SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder

* Updated logging

* Added support for SpaCy Batch Extract Keywords meta field

* Configured meta sample source values and updated exclude query

* Removed no longer used sourceValueFieldsToReturn

---------

Co-authored-by: Daniel Ecer <[email protected]>
  • Loading branch information
HazalCiplak and de-code authored Dec 30, 2024
1 parent 8b18e4d commit ff3c2d4
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 12 deletions.
5 changes: 5 additions & 0 deletions data_pipeline/generic_web_api/generic_web_api_data_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ def get_data_single_page_response(
dynamic_request_parameters=dynamic_request_parameters
)
LOGGER.info(
"Request URL: %s %s",
data_config.dynamic_request_builder.method,
url
)
LOGGER.debug(
"Request URL: %s %s (json: %r)",
data_config.dynamic_request_builder.method,
url,
Expand Down
29 changes: 29 additions & 0 deletions data_pipeline/generic_web_api/request_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing_extensions import NotRequired, TypedDict

from data_pipeline.utils.data_pipeline_timestamp import datetime_to_string
from data_pipeline.utils.json import remove_key_with_null_value
from data_pipeline.utils.pipeline_utils import replace_placeholders


Expand Down Expand Up @@ -325,9 +326,37 @@ def get_json(
]


class SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder(WebApiDynamicRequestBuilder):
    """Request builder for the SpaCy batch keyword extraction endpoint.

    Several source values are sent together in the JSON body of one request.
    """

    def __init__(self, **kwargs):
        """Initialise the builder.

        Any caller-supplied ``method`` or ``max_source_values_per_request``
        is overridden: the method is always ``'POST'`` and
        ``max_source_values_per_request`` is always ``10``.
        """
        init_kwargs = dict(kwargs)
        init_kwargs['method'] = 'POST'
        init_kwargs['max_source_values_per_request'] = 10
        super().__init__(**init_kwargs)

    def get_json(
        self,
        dynamic_request_parameters: WebApiDynamicRequestParameters
    ) -> dict:
        """Build the JSON request body from the source values.

        Each source value must provide a ``text`` entry and may provide a
        ``meta`` entry. The result has a single ``data`` key holding one
        ``extract-keyword-request`` item per source value.
        """
        source_values = dynamic_request_parameters.source_values
        assert source_values is not None
        request_items = []
        for source_value in source_values:
            request_items.append({
                "type": "extract-keyword-request",
                "attributes": {
                    "content": source_value['text']
                },
                # None when the source value carries no meta; presumably
                # dropped again by remove_key_with_null_value below
                "meta": source_value.get('meta')
            })
        return {"data": remove_key_with_null_value(request_items)}


WEB_API_REQUEST_BUILDER_CLASS_BY_NAME_MAP: Mapping[str, Type[WebApiDynamicRequestBuilder]] = {
'single_source_value': SingleSourceValueWebApiDynamicRequestBuilder,
'spacy_keyword_extraction': SpacyKeywordExtractionWebApiDynamicRequestBuilder,
'spacy_batch_keyword_extraction_api': SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder,
'civi': CiviWebApiDynamicRequestBuilder,
'biorxiv_medrxiv_api': BioRxivWebApiDynamicRequestBuilder,
's2_title_abstract_embeddings_api': S2TitleAbstractEmbeddingsWebApiDynamicRequestBuilder,
Expand Down
23 changes: 11 additions & 12 deletions sample_data_config/web-api/web-api-data-pipeline.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -311,17 +311,19 @@ webApi:
- best_oa_location
- related_works

- dataPipelineId: keywords_from_manuscript_abstract
- dataPipelineId: keywords_from_manuscript_abstract_batch
dataset: '{ENV}'
table: test_keywords_from_manuscript_abstract
table: test_keywords_from_manuscript_abstract_batch
source:
include:
bigQuery:
projectName: 'elife-data-pipeline'
sqlQuery: |-
SELECT
version_id,
TO_HEX(MD5(abstract)) AS abstract_md5,
STRUCT(
version_id,
TO_HEX(MD5(abstract)) AS abstract_md5
) AS meta,
CONCAT(version_id, '/', TO_HEX(MD5(abstract))) AS change_id,
abstract AS text
FROM `elife-data-pipeline.{ENV}.mv_manuscript_version`
Expand All @@ -333,17 +335,14 @@ webApi:
ignoreNotFound: true
projectName: 'elife-data-pipeline'
sqlQuery:
SELECT CONCAT(version_id, '/', abstract_md5) AS change_id
FROM `elife-data-pipeline.{ENV}.test_keywords_from_manuscript_abstract`
SELECT CONCAT(extract_keyword_result.meta.version_id, '/', extract_keyword_result.meta.abstract_md5) AS change_id
FROM `elife-data-pipeline.{ENV}.test_keywords_from_manuscript_abstract_batch` AS response
JOIN UNNEST(response.data) AS extract_keyword_result
keyFieldNameFromInclude: 'change_id'
requestBuilder:
name: 'spacy_keyword_extraction'
name: 'spacy_batch_keyword_extraction_api'
dataUrl:
# Note: we currently require there to be a `text` value that will be url encoded by the `spacy_keyword_extraction` request builder
urlExcludingConfigurableParameters: https://spacy-keyword-extraction-api--stg.elifesciences.org/v1/extract-keywords?text={text}
urlExcludingConfigurableParameters: https://spacy-keyword-extraction-api--stg.elifesciences.org/v1/batch-extract-keywords
response:
provenanceEnabled: True
sourceValueFieldsToReturn:
- version_id
- abstract_md5
batchSize: 19
67 changes: 67 additions & 0 deletions tests/unit_test/generic_web_api/request_builder_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
CrossrefMetadataWebApiDynamicRequestBuilder,
S2TitleAbstractEmbeddingsWebApiDynamicRequestBuilder,
SingleSourceValueWebApiDynamicRequestBuilder,
SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder,
WebApiDynamicRequestBuilder,
get_url_with_added_or_replaced_query_parameters,
get_web_api_request_builder_class,
Expand Down Expand Up @@ -203,6 +204,72 @@ def test_should_replace_placeholders(self):
assert url.rstrip('?') == TEST_API_URL_1 + '/buddy1'


class TestSpacyBatchKeywordExtractionWebApiDynamicRequestBuilder:
    """Unit tests for SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder."""

    @staticmethod
    def _create_request_builder():
        # Every test uses the same minimal builder configuration.
        return SpacyBatchKeywordExtractionWebApiDynamicRequestBuilder(
            url_excluding_configurable_parameters=TEST_API_URL_1,
            static_parameters={}
        )

    def test_should_set_method_to_post(self):
        request_builder = self._create_request_builder()
        assert request_builder.method == 'POST'

    def test_should_set_max_source_values_per_request_to_10(self):
        request_builder = self._create_request_builder()
        assert request_builder.max_source_values_per_request == 10

    def test_should_generate_json_data_for_source_values(self):
        request_builder = self._create_request_builder()
        # No meta on the source value, so none is expected in the payload.
        request_parameters = WebApiDynamicRequestParameters(
            source_values=iter([{
                'text': 'Text 1'
            }])
        )
        expected_json = {
            "data": [
                {
                    "type": "extract-keyword-request",
                    "attributes": {
                        "content": "Text 1"
                    }
                }
            ]
        }
        actual_json = request_builder.get_json(
            dynamic_request_parameters=request_parameters
        )
        assert actual_json == expected_json

    def test_should_generate_json_data_with_meta_for_source_values(self):
        request_builder = self._create_request_builder()
        # Meta on the source value is expected to be passed through as is.
        request_parameters = WebApiDynamicRequestParameters(
            source_values=iter([{
                'text': 'Text 1',
                'meta': {
                    'version_id': 'v1'
                }
            }])
        )
        expected_json = {
            "data": [
                {
                    "type": "extract-keyword-request",
                    "attributes": {
                        "content": "Text 1"
                    },
                    'meta': {
                        'version_id': 'v1'
                    }
                }
            ]
        }
        actual_json = request_builder.get_json(
            dynamic_request_parameters=request_parameters
        )
        assert actual_json == expected_json


class TestDynamicS2TitleAbstractEmbeddingsURLBuilder:
def test_should_set_method_to_post(self):
dynamic_request_builder = S2TitleAbstractEmbeddingsWebApiDynamicRequestBuilder(
Expand Down

0 comments on commit ff3c2d4

Please sign in to comment.