From 88756a3359bbdf493901f138ce3fe7dd3600ce81 Mon Sep 17 00:00:00 2001
From: brendagutman
Date: Fri, 10 Jan 2025 19:21:24 -0600
Subject: [PATCH 1/3] [FD-1745] Add UMLS as a new search API

---
 src/search_dragon/__init__.py               |  10 -
 src/search_dragon/_version.py               |  10 +-
 src/search_dragon/external_apis/__init__.py |  52 +++++
 src/search_dragon/external_apis/ols_api.py  |  43 +++-
 src/search_dragon/external_apis/umls_api.py | 221 ++++++++++++++++++++
 src/search_dragon/result_structure.py       |  40 ++--
 src/search_dragon/search.py                 |  35 ++--
 7 files changed, 354 insertions(+), 57 deletions(-)
 create mode 100644 src/search_dragon/external_apis/umls_api.py

diff --git a/src/search_dragon/__init__.py b/src/search_dragon/__init__.py
index 74178fc..8250264 100644
--- a/src/search_dragon/__init__.py
+++ b/src/search_dragon/__init__.py
@@ -1,5 +1,4 @@
 import logging
-import requests
 
 LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
 
@@ -16,12 +15,3 @@
 
 # Add handlers to the logger
 logger.addHandler(console_handler)
-
-def fetch_data(url):
-    """ """
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        print(f"Failed to fetch data: {response.status_code}")
-        return None
diff --git a/src/search_dragon/_version.py b/src/search_dragon/_version.py
index 5c4105c..94f3d7b 100644
--- a/src/search_dragon/_version.py
+++ b/src/search_dragon/_version.py
@@ -1 +1,9 @@
-__version__ = "1.0.1"
+__version__ = "1.0.2"
+
+'''
+Change log
+1.0.0 - Initial creation
+1.0.1 - Add OLS API
+1.0.2 - Add UMLS API
+
+'''
\ No newline at end of file
diff --git a/src/search_dragon/external_apis/__init__.py b/src/search_dragon/external_apis/__init__.py
index 734ab04..79d37a4 100644
--- a/src/search_dragon/external_apis/__init__.py
+++ b/src/search_dragon/external_apis/__init__.py
@@ -1,9 +1,61 @@
+import os
+import requests
+from search_dragon import logger
+
 class OntologyAPI:
     def __init__(self, base_url, api_id, api_name):
         self.base_url = base_url
         self.api_id = api_id
         self.api_name = api_name
 
+    def get_api_key(self, api_id):
+        if api_id == "umls":
+            API_KEY = os.getenv("UMLS_API_KEY")
+            if not API_KEY:
+                raise ValueError(
+                    f"API_KEY for {api_id} is not set in the environment variables."
+                )
+            else:
+                return API_KEY
+
+    def fetch_data(self, url):
+        """ """
+        response = requests.get(url)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            print(f"Failed to fetch data: {response.status_code}")
+            return None
+
+    def remove_duplicates(self, data):
+        """
+        Remove duplicate records where the 'uri' field is the same.
+
+        Args:
+            data (list): List of records to filter.
+
+        Returns:
+            list: Filtered data with duplicates removed.
+        """
+        seen_uris = set()
+        filtered_data = []
+        excluded_data = []
+
+        for item in data:
+            uri = item.get("code_iri")
+            if uri in seen_uris:
+                excluded_data.append(item)
+            else:
+                seen_uris.add(uri)
+                filtered_data.append(item)
+
+        # Log the excluded records count
+        message = (
+            f"Records({len(excluded_data)}) were excluded as duplicates based on 'uri': {excluded_data}"
+        )
+        logger.info(message)
+
+        return filtered_data
diff --git a/src/search_dragon/external_apis/ols_api.py b/src/search_dragon/external_apis/ols_api.py
index 7db7376..7f5bfc7 100644
--- a/src/search_dragon/external_apis/ols_api.py
+++ b/src/search_dragon/external_apis/ols_api.py
@@ -8,7 +8,7 @@
 """
 
 from search_dragon.external_apis import OntologyAPI
-from search_dragon import logger, fetch_data
+from search_dragon import logger
 
 OLS_API_BASE_URL = "https://www.ebi.ac.uk/ols4/api/"
 OLS_API = "ols"
@@ -16,7 +16,12 @@
 
 class OLSSearchAPI(OntologyAPI):
     def __init__(self):
-        super().__init__(base_url=OLS_API_BASE_URL, api_id=OLS_API, api_name=OLS_NAME)
+        super().__init__(
+            base_url=OLS_API_BASE_URL,
+            api_id=OLS_API,
+            api_name=OLS_NAME,
+        )
+        self.total_results_id = 'numFound'
 
     def collect_data(self, search_url, results_per_page, start_index):
         """
@@ -47,18 +52,16 @@ def collect_data(self, search_url, results_per_page, start_index):
             paginated_url = f"{search_url}&rows={results_per_page}&start={start_index}"
             logger.info(f"Fetching data from {paginated_url}")
 
-            data = fetch_data(paginated_url)
+            data = self.fetch_data(paginated_url)
 
-
-
             results = data.get("response", {}).get("docs", [])
             raw_data.extend(results)
 
-            total_results = data.get("response", {}).get("numFound", 0)
+            total_results = data.get("response", {}).get(self.total_results_id, 0)
             logger.info(f"Total results found: {total_results}")
             logger.info(f"Retrieved {len(results)} results (start_index: {start_index}).")
 
-            # Check if the start_index exceeds total results 
+            # Check if the start_index exceeds total results
             if start_index >= total_results:
                 message = f"start_index ({start_index}) exceeds total available results ({total_results})."
                 logger.error(message)
@@ -69,7 +72,7 @@ def collect_data(self, search_url, results_per_page, start_index):
             more_results_available = n_results_used < total_results
 
         except Exception as e:
-            logger.error(f"Error fetching data from {paginated_url}: {e}")
+            logger.error(f"Error fetching data from {search_url}: {e}")
             return [], more_results_available
 
         return raw_data, more_results_available
@@ -112,7 +115,23 @@ def build_url(self, keywords, ontology_list):
 
         return ontology_param
 
-    def build_url(self, keywords, ontology_list):
+    def format_results_per_page(self, results_per_page):
+        """
+        Formats the results_per_page into a format readable by the API.
+        """
+        page_size_param = f"rows={results_per_page}"
+
+        return page_size_param
+
+    def format_start_index(self, start_index):
+        """
+        Formats the start_index into a format readable by the API.
+        """
+        start_param = f"start={start_index}"
+
+        return start_param
+
+    def build_url(self, keywords, ontology_list, start_index, results_per_page):
         """
         Constructs the search URL by combining the base URL, formatted keyword, and ontology parameters.
@@ -128,9 +147,13 @@ def build_url(self, keywords, ontology_list):
 
         keyword_param = self.format_keyword(keywords)
         ontology_param = self.format_ontology(ontology_list)
+        start_param = self.format_start_index(start_index)
+        page_size_param = self.format_results_per_page(results_per_page)
 
         # Join the query params with & then join the params to the base url
-        url_blocks.append("&".join([keyword_param,ontology_param]))
+        url_blocks.append(
+            "&".join([keyword_param, ontology_param, start_param, page_size_param])
+        )
         complete_url = "".join(url_blocks)
 
         return complete_url
diff --git a/src/search_dragon/external_apis/umls_api.py b/src/search_dragon/external_apis/umls_api.py
new file mode 100644
index 0000000..b2076ab
--- /dev/null
+++ b/src/search_dragon/external_apis/umls_api.py
@@ -0,0 +1,221 @@
+"""
+UMLS (Unified Medical Language System) API Integration
+
+This script defines the `UMLSSearchAPI` class that interacts with the UMLS API to perform ontology searches. The class provides methods to:
+- Construct search URLs with query parameters.
+- Fetch paginated search results from the UMLS API.
+- Harmonize and structure raw results into a standardized format.
+
+"""
+
+from search_dragon.external_apis import OntologyAPI
+from search_dragon import logger
+
+UMLS_API_BASE_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
+UMLS_API = "umls"
+UMLS_NAME = "Unified Medical Language System"
+
+
+class UMLSSearchAPI(OntologyAPI):
+    def __init__(self):
+        super().__init__(
+            base_url=UMLS_API_BASE_URL,
+            api_id=UMLS_API,
+            api_name=UMLS_NAME,
+        )
+        self.total_results_id = 'recCount'
+
+    def collect_data(self, search_url, results_per_page, start_index):
+        """
+        Fetch a single page of data from the provided search endpoint.
+
+        Args:
+            search_url (str): The base URL for the search API.
+            results_per_page (int): Number of results to fetch in this request.
+            start_index (int): The starting page number for fetching data.
+
+        Returns:
+            Tuple:
+                - raw_data (list): Results from the requested page.
+                - more_results_available (bool): Whether more results are available.
+        """
+        raw_data = []
+        results_per_page = int(results_per_page)
+        start_index = int(start_index)
+        more_results_available = False
+
+        try:
+            # Construct the paginated URL
+            paginated_url = f"{search_url}"
+            logger.info(f"Fetching data from {paginated_url}")
+
+            # Fetch data
+            data = self.fetch_data(paginated_url)
+            logger.info(f"Returned data: {data}")
+
+            # Extract results
+            results = data.get("result", {}).get("results", [])
+            raw_data.extend(results)
+
+            total_results = data.get("result", {}).get(self.total_results_id, 0)
+            logger.info(f"Total results found: {total_results}")
+            logger.info(f"Retrieved {len(results)} results (start_index: {start_index}).")
+
+            # Check if the start_index exceeds total results
+            if start_index >= total_results:
+                message = f"start_index ({start_index}) exceeds total available results ({total_results})."
+                logger.error(message)
+                raise ValueError(message)
+
+            # Check if more results are available after this request.
+            n_results_used = start_index + results_per_page + 1
+            more_results_available = n_results_used < total_results
+
+        except Exception as e:
+            logger.error(f"Error fetching data from {search_url}: {e}")
+            return [], more_results_available
+
+        return raw_data, more_results_available
+
+    def format_keyword(self, keywords):
+        """
+        Formats the provided keywords for the search query.
+
+        Args:
+            keywords (str): The search terms
+
+        Returns:
+            The formatted query parameter to be inserted into the search URL.
+
+        Example return: "string=brain%20cancer"
+        """
+
+        keywords = keywords.replace(" ", "%20")
+
+        keyword_param = f"string={keywords}"
+
+        return keyword_param
+
+    def format_ontology(self, ontology_list):
+        """
+        Formats the included ontologies into a query parameter for the search URL.
+
+        Args:
+            ontology_list (list): The ontologies to include in the search.
+
+        Returns:
+            str: The formatted ontology query parameter
+
+        Example return: "sabs=uberon,ma"
+        """
+
+        formatted_ontologies = ",".join(ontology_list)
+
+        ontology_param = f"sabs={formatted_ontologies}"
+
+        return ontology_param
+
+    def format_key(self):
+        """
+        Formats the API key into a format readable by the API.
+        """
+        api_key = self.get_api_key(UMLS_API)
+        key_param = f"apiKey={api_key}"
+
+        return key_param
+
+    def format_results_per_page(self, results_per_page):
+        """
+        Formats the results_per_page into a format readable by the API.
+        """
+        page_size_param = f"pageSize={results_per_page}"
+
+        return page_size_param
+
+    def format_start_index(self, start_index):
+        """
+        Formats the start_index into a format readable by the API.
+        """
+        start_param = f"pageNumber={start_index}"
+
+        return start_param
+
+    def format_api_specific_params(self):
+        """
+        Formats the parameters that aren't required across all APIs
+        """
+        return_type_param = "returnIdType=code"
+
+        return return_type_param
+
+    def build_url(self, keywords, ontology_list, start_index, results_per_page):
+        """
+        Constructs the search URL by combining the base URL, formatted keyword, and ontology parameters.
+
+        Args:
+            keywords (str): The search keyword(s).
+            ontology_list (list): The ontologies to be included in the search.
+
+        Returns:
+            str: The complete search URL.
+        """
+        url_blocks = []
+        url_blocks.append(f"{self.base_url}?")
+
+        keyword_param = self.format_keyword(keywords)
+        ontology_param = self.format_ontology(ontology_list)
+        start_param = self.format_start_index(start_index)
+        page_size_param = self.format_results_per_page(results_per_page)
+        return_type_param = self.format_api_specific_params()
+
+        key_param = self.format_key()
+
+        # Join the query params with & then join the params to the base url
+        url_blocks.append(
+            "&".join(
+                [
+                    keyword_param,
+                    ontology_param,
+                    start_param,
+                    page_size_param,
+                    return_type_param,
+                    key_param,
+                ]
+            )
+        )
+        complete_url = "".join(url_blocks)
+
+        return complete_url
+
+    def harmonize_data(self, raw_results, ontology_data):
+        """
+        Harmonizes the raw API results into a standardized format for further processing.
+
+        Args:
+            raw_results (dict or list): Raw results returned from the UMLS API.
+            ontology_data (dict): The ontology data used to get ontology systems.
+
+        Returns:
+            dict: A dictionary containing the harmonized data.
+        """
+        if isinstance(raw_results, list):
+            return [self.harmonize_data(item, ontology_data) for item in raw_results]
+
+        # Get the ontology prefix from the raw result
+        ontology_prefix = raw_results.get("rootSource")
+
+        # Retrieve the corresponding value from ontology_data
+        system = ontology_data.get(ontology_prefix)
+
+        harmonized_data = {
+            "code": raw_results.get(
+                "ui", ""
+            ),  # The UMLS Concept Unique Identifier (CUI)
+            "system": system,  # This is the ontology system.
+            "code_iri": raw_results.get("uri"),
+            "display": raw_results.get("name"),
+            "description": raw_results.get("name", []),
+            "ontology_prefix": ontology_prefix,
+        }
+
+        return harmonized_data
diff --git a/src/search_dragon/result_structure.py b/src/search_dragon/result_structure.py
index b16a5d0..db11171 100644
--- a/src/search_dragon/result_structure.py
+++ b/src/search_dragon/result_structure.py
@@ -6,11 +6,12 @@
 
 
 def generate_response(
-    data, search_url, more_results_available
+    data, search_url, more_results_available, api_instances
 ):
     logger.info(f"Count fetched_data {len(data)}")
 
     ontology_counts, results_count = get_code_counts(data)
+
     cleaned_data = curate_data(data)
 
     structured_data = {
@@ -35,25 +36,33 @@ def get_code_counts(data):
 
     return ontology_counts, results_counts
 
-def remove_duplicates(data):
+
+def remove_duplicates(self, data):
     """
-    Some ontologies include codes from within other ontologies. Filter out those
-    api results where the ontology_prefix(code prefix ex: MONDO) does not match
-    the ontology code for the record.
+    Remove duplicate records where the 'uri' field is the same.
+
+    Args:
+        data (list): List of records to filter.
+
+    Returns:
+        list: Filtered data with duplicates removed.
     """
+    seen_uris = set()
     filtered_data = []
     excluded_data = []
 
-    for item in data:
-        ontology_prefix = item.get("ontology_prefix")
-        code = item.get("code")
-        # Check if code starts with the ontology prefix, if it does not, excude and log the record
-        if not code.lower().startswith(ontology_prefix.lower()):
+    for item in data:
+        uri = item.get("code_iri")
+        if uri in seen_uris:
             excluded_data.append(item)
         else:
+            seen_uris.add(uri)
             filtered_data.append(item)
 
-    message = f"Records({(len(excluded_data))}) are excluded because the code does not start with the ontology_prefix"
+    # Log the excluded records count
+    message = (
+        f"Records({len(excluded_data)}) were excluded as duplicates based on 'uri': {excluded_data}"
+    )
     logger.info(message)
 
     return filtered_data
@@ -94,14 +103,7 @@ def curate_data(data):
     """
     logger.info(f"data length {len(data)}")
 
-    # handle duplicates
-    dup_cleaned = remove_duplicates(data)
-
-    # sanity check
-
-    logger.info(f"data length after removing duplicates {len(dup_cleaned)}")
-    # handle nulls and data types
-    cleaned_data = validate_data(dup_cleaned)
+    cleaned_data = validate_data(data)
 
     return cleaned_data
diff --git a/src/search_dragon/search.py b/src/search_dragon/search.py
index 032d8ae..4c53432 100644
--- a/src/search_dragon/search.py
+++ b/src/search_dragon/search.py
@@ -5,10 +5,11 @@
 from search_dragon import logger
 from search_dragon.external_apis import OntologyAPI
 from search_dragon.external_apis.ols_api import OLSSearchAPI
+from search_dragon.external_apis.umls_api import UMLSSearchAPI
 from search_dragon.result_structure import generate_response
 import argparse
 
-SEARCH_APIS = [{"ols": OLSSearchAPI}]
+SEARCH_APIS = [{"ols": OLSSearchAPI}, {"umls": UMLSSearchAPI}]
 
 
 def get_api_instance(search_api_list):
@@ -22,21 +23,21 @@ def get_api_instance(search_api_list):
     '''
     api_instances = []
 
-    # Process only the APIs in the provided list
+    available_apis = {key: value for api_dict in SEARCH_APIS for key, value in api_dict.items()}
+
     for search_api in search_api_list:
-        for api_dict in SEARCH_APIS:
-            if search_api in api_dict:
-                api_instances.append(api_dict[search_api]())
-                break
-        else:
-            # Raise an error if the API is not found
-            message = f"Ontology API '{search_api}' is not recognized."
-            logger.info(message)
-            raise ValueError(message)
+        if search_api in available_apis:
+            api_instances.append(available_apis[search_api]())
+        else:
+            # Raise an error if the API is not found
+            message = f"Ontology API '{search_api}' is not recognized."
+            logger.error(message)
+            raise ValueError(message)
 
     return api_instances
 
+
 def run_search(ontology_data, keyword, ontology_list, search_api_list, results_per_page, start_index):
     """
     The master function to execute the search process. It queries the APIs, harmonizes the results, and generates a cleaned, structured response.
@@ -51,15 +52,13 @@ def run_search(ontology_data, keyword, ontology_list, search_api_list, results_p
         dict: The final structured response containing harmonized and curated search results.
     """
-    logger.info(f"ontology_list:{ontology_list}")
-
     api_instances = get_api_instance(search_api_list)
 
     combined_data = []
     for api_instance in api_instances:
         # Generate the search url
-        search_url = api_instance.build_url(keyword, ontology_list)
+        search_url = api_instance.build_url(keyword, ontology_list, start_index, results_per_page)
         logger.info(f"URL:{search_url}")
 
         # Fetch the data
@@ -70,15 +69,17 @@ def run_search(ontology_data, keyword, ontology_list, search_api_list, results_p
         harmonized_data = api_instance.harmonize_data(api_results, ontology_data)
         logger.info(f"Count harmonized_data: {len(harmonized_data)}")
 
+        # Apply specialized cleaning prior to combining data.
+        dups_removed = api_instance.remove_duplicates(harmonized_data)
+
         # Combine the ontology api data
-        combined_data.extend(harmonized_data)
+        combined_data.extend(dups_removed)
 
         logger.info(f"Count combined_data {len(combined_data)}")
 
     # Final cleaning and structuring of the combined data
-    response = generate_response(combined_data, search_url, more_results_available)
+    response = generate_response(combined_data, search_url, more_results_available, api_instances)
 
     logger.info(f"keyword: {keyword}")
-    # logger.info(response)
 
     return response

From 6526cf2c00952107695a02a9391102ad68d7db5f Mon Sep 17 00:00:00 2001
From: brendagutman
Date: Mon, 13 Jan 2025 17:26:13 -0600
Subject: [PATCH 2/3] [FD-1745] Suggested changes

---
 README.md                                   |  4 +++-
 src/search_dragon/external_apis/__init__.py | 14 ------------
 src/search_dragon/external_apis/umls_api.py | 25 ++++++++++++---------
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 6202a53..631b396 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,11 @@
 2. **Install the package**
    If working on a new feature it is possible to install a package version within the remote or local branch
+   **NOTE** If testing changes to search-dragon in `locutus`, don't forget to deploy a `locutus` branch with the correct `search-dragon` version in the requirements.txt file!
+   **NOTE** Any new env variables created, e.g., API keys, will need to be added to the `locutus` deployment files.
 ```
 # remote
-pip install git+https://github.com/NIH-NCPI/locutus_utilities.git@{branch_name}
+pip install git+https://github.com/NIH-NCPI/search-dragon.git@{branch_name}
 
 # local
 pip install -e .
 ```
diff --git a/src/search_dragon/external_apis/__init__.py b/src/search_dragon/external_apis/__init__.py
index 79d37a4..9881fee 100644
--- a/src/search_dragon/external_apis/__init__.py
+++ b/src/search_dragon/external_apis/__init__.py
@@ -8,16 +8,6 @@ def __init__(self, base_url, api_id, api_name):
         self.api_id = api_id
         self.api_name = api_name
 
-    def get_api_key(self, api_id):
-        if api_id == "umls":
-            API_KEY = os.getenv("UMLS_API_KEY")
-            if not API_KEY:
-                raise ValueError(
-                    f"API_KEY for {api_id} is not set in the environment variables."
-                )
-            else:
-                return API_KEY
-
     def fetch_data(self, url):
         """ """
         response = requests.get(url)
@@ -56,7 +46,3 @@ def remove_duplicates(self, data):
         logger.info(message)
 
         return filtered_data
-
-
-
-
diff --git a/src/search_dragon/external_apis/umls_api.py b/src/search_dragon/external_apis/umls_api.py
index b2076ab..e2d79b7 100644
--- a/src/search_dragon/external_apis/umls_api.py
+++ b/src/search_dragon/external_apis/umls_api.py
@@ -10,18 +10,14 @@
 
 from search_dragon.external_apis import OntologyAPI
 from search_dragon import logger
-
-UMLS_API_BASE_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
-UMLS_API = "umls"
-UMLS_NAME = "Unified Medical Language System"
-
+import os
 
 class UMLSSearchAPI(OntologyAPI):
     def __init__(self):
         super().__init__(
-            base_url=UMLS_API_BASE_URL,
-            api_id=UMLS_API,
-            api_name=UMLS_NAME,
+            base_url="https://uts-ws.nlm.nih.gov/rest/search/current",
+            api_id="umls",
+            api_name="Unified Medical Language System",
         )
         self.total_results_id = 'recCount'
 
@@ -114,12 +110,21 @@ def format_ontology(self, ontology_list):
         ontology_param = f"sabs={formatted_ontologies}"
 
         return ontology_param
-    
+
+    def get_api_key(self):
+        API_KEY = os.getenv("UMLS_API_KEY")
+        if not API_KEY:
+            raise ValueError(
+                "API_KEY for 'umls' is not set in the environment variables."
+            )
+        else:
+            return API_KEY
+
     def format_key(self):
         """
         Formats the API key into a format readable by the API.
""" - api_key = self.get_api_key(UMLS_API) + api_key = self.get_api_key() key_param = f"apiKey={api_key}" return key_param From 73cbe1a84ba3192875276a978b69833e51c34b3a Mon Sep 17 00:00:00 2001 From: brendagutman Date: Mon, 13 Jan 2025 17:28:17 -0600 Subject: [PATCH 3/3] [FD-1745] Suggested changes --- src/search_dragon/external_apis/ols_api.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/search_dragon/external_apis/ols_api.py b/src/search_dragon/external_apis/ols_api.py index 7f5bfc7..30876dd 100644 --- a/src/search_dragon/external_apis/ols_api.py +++ b/src/search_dragon/external_apis/ols_api.py @@ -10,16 +10,12 @@ from search_dragon.external_apis import OntologyAPI from search_dragon import logger -OLS_API_BASE_URL = "https://www.ebi.ac.uk/ols4/api/" -OLS_API = "ols" -OLS_NAME = "Ontology Lookup Service" - class OLSSearchAPI(OntologyAPI): def __init__(self): super().__init__( - base_url=OLS_API_BASE_URL, - api_id=OLS_API, - api_name=OLS_NAME, + base_url="https://www.ebi.ac.uk/ols4/api/", + api_id="ols", + api_name="Ontology Lookup Service", ) self.total_results_id = 'numFound' @@ -53,7 +49,7 @@ def collect_data(self, search_url, results_per_page, start_index): logger.info(f"Fetching data from {paginated_url}") data = self.fetch_data(paginated_url) - + results = data.get("response", {}).get("docs", []) raw_data.extend(results)