This repository has been archived by the owner on Oct 10, 2024. It is now read-only.

Merge branch 'mediacloud-main'
merging @rahulbot's changes from wayback-news-client
Paige Gulley committed Dec 12, 2023
2 parents 961d2db + 4c72e04 commit 29f9540
Showing 11 changed files with 242 additions and 77 deletions.
71 changes: 71 additions & 0 deletions .github/workflows/mc-integration-test.yml
@@ -0,0 +1,71 @@
name: Integration test against news-search-api:main

on:
push:
branches: ["main"]
pull_request:
branches: ["main"]

jobs:
fixture-integration-test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]

name: Integration test with dummy ES data
steps:

# setup ES index
- name: Configure sysctl limits
run: |
sudo swapoff -a
sudo sysctl -w vm.swappiness=1
sudo sysctl -w fs.file-max=262144
sudo sysctl -w vm.max_map_count=262144
- name: Run Elasticsearch
uses: elastic/elastic-github-actions/elasticsearch@master
with:
stack-version: 8.8.2
security-enabled: false
- name: Verify Elasticsearch is reachable
run: |
curl --verbose --show-error http://localhost:9200
# setup news-search-api server and dummy data
- name: Checkout news-search-api server
uses: actions/checkout@v4
with:
repository: mediacloud/news-search-api
path: news-search-api
- name: Install news-search-api server python dependencies
working-directory: news-search-api
run: |
pip install -r requirements.txt
- name: Install fixtures
working-directory: news-search-api
run: |
python -m test.create_fixtures
- name: Run news-search-api server
working-directory: news-search-api
run: |
python api.py &
sleep 5
- name: Verify news-search-api server is reachable
working-directory: news-search-api
run: |
curl --verbose --show-error http://localhost:8000
# set up api client code and run test
- name: Main checkout
uses: actions/checkout@v4
with:
path: main
- name: Install python dependencies
working-directory: main
run: |
pip install -e .[dev]
- name: Run integration test
working-directory: main
run: |
pytest mcnews/tests/test_fixtures.py
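The workflow above starts `api.py` in the background and then relies on a fixed `sleep 5` before checking reachability. A fixed sleep can flake on slow runners; a small readiness poll is more robust. A minimal sketch of that idea (not part of this commit; `wait_for_server` is an illustrative helper, not a project API):

```python
import time
import urllib.error
import urllib.request


def wait_for_server(url: str, timeout_secs: float = 30.0, interval: float = 0.5) -> bool:
    """Poll `url` until it responds or the timeout elapses.

    Returns True as soon as the server answers with any 2xx response,
    False if the deadline passes without a successful connection.
    """
    deadline = time.monotonic() + timeout_secs
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2):
                return True
        except (urllib.error.URLError, ConnectionError):
            # server not up yet (connection refused) or still starting
            time.sleep(interval)
    return False
```

In a workflow step this would replace `sleep 5` with something like `python -c "...wait_for_server('http://localhost:8000')..."` so the job proceeds as soon as the API is actually up.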
27 changes: 0 additions & 27 deletions .github/workflows/pylint.yml

This file was deleted.

@@ -1,33 +1,30 @@
name: do-testing

on:
on:
push:
branches: ["main"]
pull_request:
branches: ["main"]

permissions:
contents: read

jobs:

build:
runs-on: ubuntu-latest
strategy:
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.10"]

steps:
- uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}

- name: Install Deps
run: |
pip install -e .[dev]
pip install -e .[dev]
- name: Run Pytest
run: |
pytest
pytest mcnews/tests/test_waybacknews.py
12 changes: 12 additions & 0 deletions .pylintrc
@@ -0,0 +1,12 @@
[MASTER]
disable=
C0114, # missing-module-docstring
C0115, # missing-class-docstring
C0116, # missing-function-docstring
C0209, # consider-using-f-string
R0913, # too-many-arguments

[FORMAT]
# Maximum number of characters on a single line.
max-line-length=120

10 changes: 7 additions & 3 deletions README.md
@@ -9,6 +9,7 @@ A simple client library to access the Wayback Machine news archive search.
Installation
------------

NB: TBD
`pip install wayback-news-search`


@@ -18,7 +19,7 @@ Basic Usage
Counting matching stories:

```python
from waybacknews.searchapi import SearchApiClient
from mcnews.searchapi import SearchApiClient
import datetime as dt

api = SearchApiClient("mediacloud")
@@ -28,7 +29,7 @@ api.count("coronavirus", dt.datetime(2022, 3, 1), dt.datetime(2022, 4, 1))
Paging over all matching results:

```python
from waybacknews.searchapi import SearchApiClient
from mcnews.searchapi import SearchApiClient
import datetime as dt

api = SearchApiClient("mediacloud")
@@ -48,7 +49,7 @@ Distribution
------------

1. Run `pytest` to make sure all the tests pass
2. Update the version number in `waybacknews/__init__.py`
2. Update the version number in `mcnews/__init__.py`
3. Make a brief note in the version history section below about the changes
4. Commit the changes
5. Tag the commit with a semantic version number - 'v*.*.*'
@@ -61,6 +62,9 @@ Distribution
Version History
---------------

* __v1.2.1__ - fix paging bug triggered by no results
* __v1.2.0__ - add support for new `expanded` results, and more integration testing
* __v1.1.0__ - add new `paged_articles` method to allow paging over all results
* __v1.0.3__ - add 30 sec timeout, remove extra params mcproviders library might be adding
* __v1.0.2__ - fix to article endpoint
* __v1.0.1__ - automatically escape '/' in query strings, test case for `url` field search
2 changes: 1 addition & 1 deletion mcnews/__init__.py
@@ -1 +1 @@
__version__ = '1.0.3'
__version__ = '1.2.1'
50 changes: 33 additions & 17 deletions mcnews/searchapi.py
@@ -1,9 +1,9 @@
import datetime as dt
from typing import List, Dict
import requests
from typing import List, Dict, Optional
import logging
import requests
import ciso8601
import waybacknews.util as util
from mcnews import util

VERSION = "v1"  # the API access URL is versioned for future compatibility and maintenance

@@ -90,7 +90,7 @@ def _date_query_clause(start_date: dt.datetime, end_date: dt.datetime) -> str:
def _overview_query(self, query: str, start_date: dt.datetime, end_date: dt.datetime, **kwargs) -> Dict:
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
params.update(kwargs)
results, response = self._query("{}/search/overview".format(self._collection), params, method='POST')
results, _ = self._query("{}/search/overview".format(self._collection), params, method='POST')
return results

def article(self, article_id: str) -> Dict:
@@ -101,36 +101,52 @@ def article(self, article_id: str) -> Dict:
def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime, page_size: int = 1000, **kwargs):
"""
@return: a generator that yields lists of articles, grouped by page.
@Question: Should it return articles one by one, not by page?
"""
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
params.update(kwargs)
more_pages = True
next_page_token = None
while more_pages:
page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
page, next_page_token = self.paged_articles(query, start_date, end_date, page_size, **kwargs,
pagination_token=next_page_token)
if self._is_no_results(page):
yield []
else:
yield page
# check if there is a link to the next page
more_pages = False
next_link_token = response.headers.get('x-resume-token')
if next_link_token:
params['resume'] = next_link_token
if next_page_token:
more_pages = True

def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
page_size: Optional[int] = 1000, expanded: bool = False,
pagination_token: Optional[str] = None, **kwargs) -> tuple[List[Dict], Optional[str]]:
"""
@return: one page of stories
"""
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
if expanded:
params['expanded'] = 1
if pagination_token:
params['resume'] = pagination_token
params.update(kwargs)
results, response = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params,
method='GET')
page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
if self._is_no_results(page):
return [], None
return page, response.headers.get('x-resume-token')

def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str,
**kwargs) -> Dict:
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
params.update(kwargs)
results, _ = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params, method='GET')
return results

def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
"""
Centralize making the actual queries here for easy maintenance and testing of HTTP comms
"""
if 'domains' in params: # remove domains param that might be dangling
if params and ('domains' in params): # remove domains param that might be dangling
del params['domains']
if params and ('q' in params):
params['q'] = util.sanitize_query(params['q'])
@@ -141,9 +157,9 @@ def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
r = self._session.post(endpoint_url, json=params, timeout=self.TIMEOUT_SECS)
else:
raise RuntimeError("Unsupported method of '{}'".format(method))

if r.status_code >= 500:
raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {}, Params: {}".
format(r.status_code, endpoint_url, params))
raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {},"
" Params: {}".format(r.status_code, endpoint_url, params))

return r.json(), r
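The refactor above centralizes resume-token paging: `paged_articles` returns one page of results plus the `x-resume-token` response header, and `all_articles` loops, feeding each token back as `pagination_token` until the server stops returning one. The shape of that loop can be sketched with a stubbed fetcher standing in for the HTTP call (all names below are illustrative, not part of the client API):

```python
from typing import Dict, List, Optional, Tuple

# Canned pages keyed by resume token; stands in for the news-search-api server.
_PAGES = {
    None: ([{"url": "https://example.com/a"}], "token-1"),
    "token-1": ([{"url": "https://example.com/b"}], None),  # final page: no token
}


def fetch_page(pagination_token: Optional[str] = None) -> Tuple[List[Dict], Optional[str]]:
    """Stub for paged_articles: return (page, next_resume_token)."""
    return _PAGES[pagination_token]


def iterate_all_pages():
    """Mirror of the all_articles loop: yield pages until no resume token comes back."""
    token = None
    while True:
        page, token = fetch_page(pagination_token=token)
        yield page
        if not token:
            break
```

The key invariant, which `test_paged_articles` below also checks against the fixture data, is that each successive token is different and consecutive pages do not overlap.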
59 changes: 59 additions & 0 deletions mcnews/tests/test_fixtures.py
@@ -0,0 +1,59 @@
from unittest import TestCase
import datetime as dt

import mcnews.searchapi as searchapi

INTEGRATION_TEST_COLLECTION = "mediacloud_test"
INTEGRATION_TEST_HOST = "http://127.0.0.1:8000"


class TestMediaCloudCollection(TestCase):

def setUp(self) -> None:
self._api = searchapi.SearchApiClient(INTEGRATION_TEST_COLLECTION)
self._api.API_BASE_URL = f"{INTEGRATION_TEST_HOST}/{searchapi.VERSION}/"

def test_count(self):
results = self._api.count("*", dt.datetime(2023, 1, 1), dt.datetime(2024, 1, 1))
assert results > 0
assert results < 5000

def test_count_over_time(self):
results = self._api.count_over_time("*", dt.datetime(2020, 1, 1), dt.datetime(2025, 1, 1))
assert len(results) > 30
for day in results:
assert 'date' in day
assert 'count' in day
assert 'timestamp' in day

def test_count_no_results(self):
results = self._api.count("*", dt.datetime(2010, 1, 1), dt.datetime(2010, 1, 1))
assert results == 0

def test_count_date_filter(self):
all = self._api.count("*", dt.datetime(2023, 1, 1), dt.datetime(2024, 1, 1))
assert all > 0
w1 = self._api.count("*", dt.datetime(2023, 11, 1), dt.datetime(2024, 11, 8))
assert all > w1

def test_paged_articles(self):
query = "*"
start_date = dt.datetime(2023, 10, 1)
end_date = dt.datetime(2023, 12, 31)
story_count = self._api.count(query, start_date, end_date)
# make sure the test case is a reasonable size (i.e. more than one page, but not too many pages)
assert story_count > 1000
assert story_count < 10000
# fetch first page
page1, next_token1 = self._api.paged_articles(query, start_date, end_date)
assert len(page1) > 0
assert next_token1 is not None
page1_url1 = page1[0]['url']
# grab token, fetch next page
page2, next_token2 = self._api.paged_articles(query, start_date, end_date, pagination_token=next_token1)
assert len(page2) > 0
assert next_token2 is not None
assert next_token1 != next_token2 # verify paging token changed
page2_urls = [s['url'] for s in page2]
assert page1_url1 not in page2_urls # verify pages don't overlap

4 changes: 2 additions & 2 deletions mcnews/tests/test_util.py
@@ -1,6 +1,6 @@
from unittest import TestCase

import waybacknews.util as util
import mcnews.util as util


class TestUtil(TestCase):
@@ -11,7 +11,7 @@ def test_sanitize_query(self):
assert sanitized == "url:*dailyvoice.com\/new-york\/mountpleasant*"

def test_dict_to_list(self):
api_like_data = dict(key1='value1', key2='value2')
api_like_data = { 'key1': 'value1', 'key2':'value2' }
list_version = util.dict_to_list(api_like_data)
assert len(list_version) == 2
assert list_version[0]['name'] == 'key1'