diff --git a/.github/workflows/mc-integration-test.yml b/.github/workflows/mc-integration-test.yml
new file mode 100644
index 0000000..ac8998b
--- /dev/null
+++ b/.github/workflows/mc-integration-test.yml
@@ -0,0 +1,71 @@
+name: Integration test against news-search-api:main
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+jobs:
+  fixture-integration-test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+
+    name: Integration test with dummy ES data
+    steps:
+
+      # setup ES index
+      - name: Configure sysctl limits
+        run: |
+          sudo swapoff -a
+          sudo sysctl -w vm.swappiness=1
+          sudo sysctl -w fs.file-max=262144
+          sudo sysctl -w vm.max_map_count=262144
+      - name: Run Elasticsearch
+        uses: elastic/elastic-github-actions/elasticsearch@master
+        with:
+          stack-version: 8.8.2
+          security-enabled: false
+      - name: Verify Elasticsearch is reachable
+        run: |
+          curl --verbose --show-error http://localhost:9200
+
+      # setup news-search-api server and dummy data
+      - name: Checkout news-search-api server
+        uses: actions/checkout@v4
+        with:
+          repository: mediacloud/news-search-api
+          path: news-search-api
+      - name: Install news-search-api server python dependencies
+        working-directory: news-search-api
+        run: |
+          pip install -r requirements.txt
+      - name: Install fixtures
+        working-directory: news-search-api
+        run: |
+          python -m test.create_fixtures
+      - name: Run news-search-api server
+        working-directory: news-search-api
+        run: |
+          python api.py &
+          sleep 5
+      - name: Verify news-search-api server is reachable
+        working-directory: news-search-api
+        run: |
+          curl --verbose --show-error http://localhost:8000
+
+      # set up api client code and run test
+      - name: Main checkout
+        uses: actions/checkout@v4
+        with:
+          path: main
+      - name: Install python dependencies
+        working-directory: main
+        run: |
+          pip install -e .[dev]
+      - name: Run integration test
+        working-directory: main
+        run: |
+          pytest mcnews/tests/test_fixtures.py
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
deleted file mode 100644
index d332c92..0000000
--- a/.github/workflows/pylint.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: do-linting
-
-on: [push]
-
-jobs:
-
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install Deps
-        run: |
-          pip install -e .[dev]
-          pip install pylint
-      - name: Run Pylint
-        run: |
-          pylint $(git ls-files '*.py')
diff --git a/.github/workflows/pytest.yml b/.github/workflows/wm-integration-test.yml
similarity index 72%
rename from .github/workflows/pytest.yml
rename to .github/workflows/wm-integration-test.yml
index 5bea3f9..2fc3a20 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/wm-integration-test.yml
@@ -1,33 +1,30 @@
 name: do-testing
 
-on: 
+on:
   push:
     branches: ["main"]
   pull_request:
     branches: ["main"]
 
-permissions:
-  contents: read
-
 jobs:
-  
+
   build:
     runs-on: ubuntu-latest
-    strategy: 
+    strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.10"]
 
     steps:
       - uses: actions/checkout@v3
-      
+
       - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
         with:
           python-version: ${{ matrix.python-version }}
-      
+
      - name: Install Deps
        run: |
-          pip install -e .[dev] 
+          pip install -e .[dev]
      - name: Run Pytest
        run: |
-          pytest
+          pytest mcnews/tests/test_waybacknews.py
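For local debugging of this CI pipeline, a sketch like the following reproduces the workflow's reachability checks in Python. The names `wait_for`, `ES_URL`, and `API_URL` are illustrative, not part of the repo; polling is a bit more robust than the workflow's fixed `sleep 5`.

```python
import time

import requests

ES_URL = "http://localhost:9200"   # Elasticsearch from the elastic-github-actions step
API_URL = "http://localhost:8000"  # news-search-api started with `python api.py &`


def wait_for(url: str, attempts: int = 10, delay: float = 2.0) -> None:
    """Poll a service until it responds, instead of sleeping a fixed interval."""
    for _ in range(attempts):
        try:
            requests.get(url, timeout=5).raise_for_status()
            return
        except requests.RequestException:
            time.sleep(delay)
    raise RuntimeError(f"{url} never became reachable")


if __name__ == "__main__":
    wait_for(ES_URL)
    wait_for(API_URL)
```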
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..d7a9b6b
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,12 @@
+[MASTER]
+disable=
+    C0114,  # missing-module-docstring
+    C0115,  # missing-class-docstring
+    C0116,  # missing-function-docstring
+    C0209,  # consider-using-f-string
+    R0913,  # too-many-arguments
+
+[FORMAT]
+# Maximum number of characters on a single line.
+max-line-length=120
+
diff --git a/README.md b/README.md
index 03a31c5..3e765ac 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ A simple client library to access the Wayback Machine news archive search.
 Installation
 ------------
 
+NB: TBD
 `pip install wayback-news-search`
 
 
@@ -18,7 +19,7 @@ Basic Usage
 Counting matching stories:
 
 ```python
-from waybacknews.searchapi import SearchApiClient
+from mcnews.searchapi import SearchApiClient
 import datetime as dt
 
 api = SearchApiClient("mediacloud")
@@ -28,7 +29,7 @@ api.count("coronavirus", dt.datetime(2022, 3, 1), dt.datetime(2022, 4, 1))
 Paging over all matching results:
 
 ```python
-from waybacknews.searchapi import SearchApiClient
+from mcnews.searchapi import SearchApiClient
 import datetime as dt
 
 api = SearchApiClient("mediacloud")
@@ -48,7 +49,7 @@ Distribution
 ------------
 
 1. Run `pytest` to make sure all the test pass
-2. Update the version number in `waybacknews/__init__.py`
+2. Update the version number in `mcnews/__init__.py`
 3. Make a brief note in the version history section below about the changes
 4. Commit the changes
 5. Tag the commit with a semantic version number - 'v*.*.*'
@@ -61,6 +62,9 @@ Distribution
 Version History
 ---------------
 
+* __v1.2.1__ - fix paging bug triggered by no results
+* __v1.2.0__ - add support for new `expanded` results, and more integration testing
+* __v1.1.0__ - add new `paged_articles` method to allow paging over all results
 * __v1.0.3__ - add 30 sec timeout, remove extra params mcproviders library might be adding
 * __v1.0.2__ - fix to article endpoint
 * __v1.0.1__ - automatically escape '/' in query strings, test case for `url` field search
diff --git a/mcnews/__init__.py b/mcnews/__init__.py
index 3f6fab6..3f262a6 100644
--- a/mcnews/__init__.py
+++ b/mcnews/__init__.py
@@ -1 +1 @@
-__version__ = '1.0.3'
+__version__ = '1.2.1'
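The v1.1.0 and v1.2.0 entries above introduce the `paged_articles` method and `expanded` results. A minimal usage sketch, assuming the client API as changed in `mcnews/searchapi.py` below:

```python
import datetime as dt

from mcnews.searchapi import SearchApiClient

api = SearchApiClient("mediacloud")
start, end = dt.datetime(2023, 11, 1), dt.datetime(2023, 12, 1)

# fetch one page at a time, resuming with the token the server returns
token = None
while True:
    page, token = api.paged_articles("climate", start, end, pagination_token=token)
    for article in page:
        print(article["url"])
    if not token:  # no resume token means this was the last page
        break
```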
diff --git a/mcnews/searchapi.py b/mcnews/searchapi.py
index e203fb9..387b837 100644
--- a/mcnews/searchapi.py
+++ b/mcnews/searchapi.py
@@ -1,9 +1,9 @@
 import datetime as dt
-from typing import List, Dict
-import requests
+from typing import List, Dict, Optional
 import logging
+import requests
 import ciso8601
-import waybacknews.util as util
+from mcnews import util
 
 VERSION = "v1"  # the API access URL is versioned for future compatability and maintenance
@@ -90,7 +90,7 @@ def _date_query_clause(start_date: dt.datetime, end_date: dt.datetime) -> str:
     def _overview_query(self, query: str, start_date: dt.datetime, end_date: dt.datetime, **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
-        results, response = self._query("{}/search/overview".format(self._collection), params, method='POST')
+        results, _ = self._query("{}/search/overview".format(self._collection), params, method='POST')
         return results
 
     def article(self, article_id: str) -> Dict:
@@ -101,36 +101,52 @@
     def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime, page_size: int = 1000, **kwargs):
         """
         @return: a generator that yeilds lists of articles, grouped by page.
-        @Question: Should it return articles one by one, not by page?
         """
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
         more_pages = True
+        next_page_token = None
         while more_pages:
-            page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
+            page, next_page_token = self.paged_articles(query, start_date, end_date, page_size, **kwargs,
+                                                        pagination_token=next_page_token)
             if self._is_no_results(page):
                 yield []
             else:
                 yield page
             # check if there is a link to the next page
             more_pages = False
-            next_link_token = response.headers.get('x-resume-token')
-            if next_link_token:
-                params['resume'] = next_link_token
+            if next_page_token:
                 more_pages = True
 
-    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
+    def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
+                       page_size: Optional[int] = 1000, expanded: bool = False,
+                       pagination_token: Optional[str] = None, **kwargs) -> tuple[List[Dict], Optional[str]]:
+        """
+        @return: one page of stories
+        """
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
+        if expanded:
+            params['expanded'] = 1
+        if pagination_token:
+            params['resume'] = pagination_token
         params.update(kwargs)
-        results, response = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params,
-                                        method='GET')
+        page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
+        if self._is_no_results(page):
+            return [], None
+        return page, response.headers.get('x-resume-token')
+
+    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str,
+              **kwargs) -> Dict:
+        params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
+        params.update(kwargs)
+        results, _ = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params, method='GET')
         return results
 
     def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
         """
         Centralize making the actual queries here for easy maintenance and testing of HTTP comms
         """
-        if 'domains' in params:  # remove domains param that might be dangling
+        if params and ('domains' in params):  # remove domains param that might be dangling
             del params['domains']
         if params and ('q' in params):
             params['q'] = util.sanitize_query(params['q'])
@@ -141,9 +157,9 @@
             r = self._session.post(endpoint_url, json=params, timeout=self.TIMEOUT_SECS)
         else:
             raise RuntimeError("Unsupported method of '{}'".format(method))
-        
+
         if r.status_code >= 500:
-            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {}, Params: {}".
-                               format(r.status_code, endpoint_url, params))
-        
+            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {},"
+                               " Params: {}".format(r.status_code, endpoint_url, params))
+
         return r.json(), r
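The refactor above makes `all_articles` a thin loop over `paged_articles`. For reference, the pagination contract the client relies on, as a standalone sketch: the server returns a page of results plus an `x-resume-token` response header, which is echoed back as the `resume` parameter to fetch the next page. The endpoint path, JSON-body POST, and header name come from the diff; the base URL below is the local fixture server, purely illustrative.

```python
import requests


def fetch_all_pages(base_url: str, collection: str, query: str):
    """Yield raw result pages until the server stops sending a resume token."""
    url = f"{base_url}/{collection}/search/result"
    params = {"q": query}
    while True:
        resp = requests.post(url, json=params, timeout=30)
        resp.raise_for_status()
        yield resp.json()
        token = resp.headers.get("x-resume-token")
        if not token:
            break
        params["resume"] = token  # resume where the last page left off


for page in fetch_all_pages("http://127.0.0.1:8000/v1", "mediacloud_test", "*"):
    print(len(page))
```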
diff --git a/mcnews/tests/test_fixtures.py b/mcnews/tests/test_fixtures.py
new file mode 100644
index 0000000..0f18d18
--- /dev/null
+++ b/mcnews/tests/test_fixtures.py
@@ -0,0 +1,59 @@
+from unittest import TestCase
+import datetime as dt
+
+import mcnews.searchapi as searchapi
+
+INTEGRATION_TEST_COLLECTION = "mediacloud_test"
+INTEGRATION_TEST_HOST = "http://127.0.0.1:8000"
+
+
+class TestMediaCloudCollection(TestCase):
+
+    def setUp(self) -> None:
+        self._api = searchapi.SearchApiClient(INTEGRATION_TEST_COLLECTION)
+        self._api.API_BASE_URL = f"{INTEGRATION_TEST_HOST}/{searchapi.VERSION}/"
+
+    def test_count(self):
+        results = self._api.count("*", dt.datetime(2023, 1, 1), dt.datetime(2024, 1, 1))
+        assert results > 0
+        assert results < 5000
+
+    def test_count_over_time(self):
+        results = self._api.count_over_time("*", dt.datetime(2020, 1, 1), dt.datetime(2025, 1, 1))
+        assert len(results) > 30
+        for day in results:
+            assert 'date' in day
+            assert 'count' in day
+            assert 'timestamp' in day
+
+    def test_count_no_results(self):
+        results = self._api.count("*", dt.datetime(2010, 1, 1), dt.datetime(2010, 1, 1))
+        assert results == 0
+
+    def test_count_date_filter(self):
+        total = self._api.count("*", dt.datetime(2023, 1, 1), dt.datetime(2024, 1, 1))
+        assert total > 0
+        w1 = self._api.count("*", dt.datetime(2023, 11, 1), dt.datetime(2024, 11, 8))
+        assert total > w1
+
+    def test_paged_articles(self):
+        query = "*"
+        start_date = dt.datetime(2023, 10, 1)
+        end_date = dt.datetime(2023, 12, 31)
+        story_count = self._api.count(query, start_date, end_date)
+        # make sure test case is a reasonable size (i.e. more than one page, but not too many pages)
+        assert story_count > 1000
+        assert story_count < 10000
+        # fetch first page
+        page1, next_token1 = self._api.paged_articles(query, start_date, end_date)
+        assert len(page1) > 0
+        assert next_token1 is not None
+        page1_url1 = page1[0]['url']
+        # grab token, fetch next page
+        page2, next_token2 = self._api.paged_articles(query, start_date, end_date, pagination_token=next_token1)
+        assert len(page2) > 0
+        assert next_token2 is not None
+        assert next_token1 != next_token2  # verify paging token changed
+        page2_urls = [s['url'] for s in page2]
+        assert page1_url1 not in page2_urls  # verify pages don't overlap
+
diff --git a/mcnews/tests/test_util.py b/mcnews/tests/test_util.py
index 1aebbc8..84ab8bd 100644
--- a/mcnews/tests/test_util.py
+++ b/mcnews/tests/test_util.py
@@ -1,6 +1,6 @@
 from unittest import TestCase
 
-import waybacknews.util as util
+import mcnews.util as util
 
 
 class TestUtil(TestCase):
@@ -11,7 +11,7 @@ def test_sanitize_query(self):
         assert sanitized == "url:*dailyvoice.com\/new-york\/mountpleasant*"
 
     def test_dict_to_list(self):
-        api_like_data = dict(key1='value1', key2='value2')
+        api_like_data = {'key1': 'value1', 'key2': 'value2'}
         list_version = util.dict_to_list(api_like_data)
         assert len(list_version) == 2
         assert list_version[0]['name'] == 'key1'
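The `test_dict_to_list` assertions pin down the shape `util.dict_to_list` returns: a two-element list whose first record has `name == 'key1'`. A sketch of an implementation consistent with those assertions (the real one lives in `mcnews/util.py` and may differ; the `value` key here is an assumption):

```python
from typing import Dict, List


def dict_to_list(data: Dict) -> List[Dict]:
    # {'key1': 'value1'} -> [{'name': 'key1', 'value': 'value1'}]
    return [{"name": k, "value": v} for k, v in data.items()]


assert dict_to_list({"key1": "value1", "key2": "value2"})[0]["name"] == "key1"
```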
"ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}" - story = self._api.article(STORY_ID) + story_id = "ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}" + story = self._api.article(story_id) assert len(story['title']) > 0 assert story['language'] == 'en' assert story['domain'] == 'dailyvoice.com' @@ -94,13 +94,47 @@ def test_all_articles(self): # make sure test case is reasonable size (ie. more than one page, but not too many pages assert story_count > 0 assert story_count < 5000 - # now text it + # now test it found_story_count = 0 for page in self._api.all_articles(query, start_date, end_date): assert len(page) > 0 found_story_count += len(page) assert found_story_count == story_count + def test_paged_articles(self): + query = "biden" + start_date = dt.datetime(2023, 11, 25) + end_date = dt.datetime(2023, 11, 26) + story_count = self._api.count(query, start_date, end_date) + # make sure test case is reasonable size (ie. more than one page, but not too many pages + assert story_count > 0 + assert story_count < 10000 + # fetch first page + page1, next_token1 = self._api.paged_articles(query, start_date, end_date) + assert len(page1) > 0 + assert next_token1 is not None + page1_url1 = page1[0]['url'] + # grab token, fetch next page + page2, next_token2 = self._api.paged_articles(query, start_date, end_date, pagination_token=next_token1) + assert len(page2) > 0 + assert next_token2 is not None + assert next_token1 != next_token2 # verify paging token changed + page2_urls = [s['url'] for s in page2] + assert page1_url1 not in page2_urls # verify pages don't overlap + + def test_paged_expanded_articles(self): + query = "biden" + start_date = dt.datetime(2023, 11, 25) + end_date = dt.datetime(2023, 11, 26) + page1, next_token1 = self._api.paged_articles(query, start_date, end_date) + for s in page1: + assert 'text_content' not in s + page2, next_token2 = self._api.paged_articles(query, start_date, end_date, + pagination_token=next_token1, expanded=True) + for s in page2: + assert 'text_content' in s + + def test_top_sources(self): results = self._api.top_sources("coronavirus", dt.datetime(2022, 3, 1), dt.datetime(2022, 4, 1)) assert len(results) > 0 @@ -131,7 +165,7 @@ def test_top_terms(self): field=SearchApiClient.TERM_FIELD_SNIPPET, aggregation=SearchApiClient.TERM_AGGREGATION_TOP) last_count = 99999999999 - for term, count in results.items(): + for _, count in results.items(): assert last_count >= count last_count = count @@ -142,7 +176,7 @@ def test_content_via_article_url(self): end_date = dt.datetime(2022, 3, 4) for page in self._api.all_articles(query, start_date, end_date): for article in page[:5]: - article_info = requests.get(article['article_url']).json() + article_info = requests.get(article['article_url'], timeout=30).json() assert 'snippet' in article_info assert len(article_info['snippet']) > 0 break diff --git a/setup.py b/setup.py index 1e2402d..7f18639 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! 
diff --git a/setup.py b/setup.py
index 1e2402d..7f18639 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
-from setuptools import setup
 import re
 import os
+from setuptools import setup
 
 REQUIRED_PACKAGES = [
     # utilities
@@ -9,30 +9,29 @@
     "ciso8601==2.2.*"  # super-fast date parsing
 ]
 
-with open('waybacknews/__init__.py', 'r') as fd:
+with open('mcnews/__init__.py', 'r', encoding='utf8') as fd:
     version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1)
 
 # add README.md to distribution
 this_directory = os.path.abspath(os.path.dirname(__file__))
-with open(os.path.join(this_directory, 'README.md')) as f:
+with open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf8') as f:
     long_description = f.read()
 
-setup(name='wayback-news-search',
+setup(name='mediacloud-news-search',
       maintainer='Rahul Bhargava',
       maintainer_email='r.bhargava@northeastern.edu',
       version=version,
-      description='Wayback Machine news archive search api client',
+      description='Mediacloud news archive search API client',
       long_description=long_description,
       long_description_content_type='text/markdown',
-      url='https://web.archive.org',
-      test_suite="waybacknews.test",
-      packages=['waybacknews'],
+      test_suite="mcnews.test",
+      packages=['mcnews'],
       package_data={'': ['LICENSE']},
      python_requires='>3.7',
      install_requires=REQUIRED_PACKAGES,
      extras_require={'dev': ['pytest', 'twine']},
      project_urls={
-          'Bug Reports': 'https://github.com/mediacloud/wayback-news-search/issues',
-          'Source': 'https://github.com/mediacloud/wayback-news-search',
+          'Bug Reports': 'https://github.com/mediacloud/mediacloud-news-search/issues',
+          'Source': 'https://github.com/mediacloud/mediacloud-news-search',
      },
      )
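Worth noting for downstream users of the rename above: the distribution name becomes `mediacloud-news-search`, but the importable package is `mcnews`, so existing `waybacknews` imports must change. A quick post-install sanity check (a sketch; assumes the version bump shown in this diff):

```python
import mcnews
from mcnews.searchapi import SearchApiClient

print(mcnews.__version__)  # "1.2.1" per the mcnews/__init__.py change above
client = SearchApiClient("mediacloud")
```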