Merge pull request #37 from mediacloud/feature-unit-tests
Unit tests and CI
rahulbot authored Dec 7, 2023
2 parents a018774 + 609659b commit eae01d8
Showing 12 changed files with 206 additions and 60 deletions.
15 changes: 0 additions & 15 deletions .github/workflows/lint.yml

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/sync.yml

This file was deleted.

36 changes: 36 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,36 @@
name: Test Code

on:
  - push

jobs:
  test:
    if: github.event_name == 'push'
    runs-on: ubuntu-latest
    name: Unit tests in Docker
    steps:
      - uses: actions/checkout@v4
      - name: Configure sysctl limits
        run: |
          sudo swapoff -a
          sudo sysctl -w vm.swappiness=1
          sudo sysctl -w fs.file-max=262144
          sudo sysctl -w vm.max_map_count=262144
      - name: Run Elasticsearch
        uses: elastic/elastic-github-actions/elasticsearch@master
        with:
          stack-version: 8.8.2
          security-enabled: false
      - name: Elasticsearch is reachable
        run: |
          curl --verbose --show-error http://localhost:9200
      - name: Install python dependencies
        run: |
          pip install -r requirements.txt
      - name: Install fixtures
        run: |
          python -m test.create_fixtures
      - name: Run all tests
        run: |
          pytest
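The curl step assumes Elasticsearch is already accepting connections by the time the action returns. If that ever proves flaky, a polling probe could do the same check with retries; a minimal sketch using the `requests` package already pinned in requirements.txt (the `wait_for_elasticsearch` helper is ours, not part of this repo):

```python
# Minimal readiness probe, equivalent in spirit to the workflow's one-shot curl.
# wait_for_elasticsearch is a hypothetical helper, not something in this repo.
import time

import requests


def wait_for_elasticsearch(url: str = "http://localhost:9200", attempts: int = 30) -> None:
    """Poll the Elasticsearch root endpoint until it responds, or give up."""
    for _ in range(attempts):
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return
        except requests.exceptions.ConnectionError:
            pass  # server not up yet; retry after a short sleep
        time.sleep(1)
    raise RuntimeError(f"Elasticsearch not reachable at {url}")


if __name__ == "__main__":
    wait_for_elasticsearch()
```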
1 change: 1 addition & 0 deletions .gitignore
@@ -140,3 +140,4 @@ cython_debug/
 # App-specific ignores
 config.yml
 *.back
+.idea
4 changes: 0 additions & 4 deletions .gitlab-ci.yml

This file was deleted.

23 changes: 2 additions & 21 deletions Dockerfile
@@ -5,29 +5,10 @@ FROM python:3.10 AS base
 ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
 WORKDIR /app
 CMD ["./api.py"]
-RUN pip install --no-cache-dir \
-    altair \
-    "elasticsearch==8.8.0" \
-    fastapi \
-    matplotlib \
-    pandas \
-    pydantic \
-    requests \
-    streamlit \
-    "uvicorn[standard]" \
-    wordcloud \
-    pyyaml
-
-# Lint code
+# Install dependencies
 FROM base
-RUN pip install --no-cache-dir pylint
+RUN pip install --no-cache-dir -r requirements.txt
 COPY . ./
-RUN pylint *.py \
-    --max-line-length=120 \
-    --good-names="c,ct,e,ep,id,q,r" \
-    --disable="C0103,C0114,C0115,C0116" \
-    --extension-pkg-whitelist="pydantic"
-
 # Build image
 FROM base
 COPY . ./
2 changes: 1 addition & 1 deletion api.py
@@ -389,7 +389,7 @@ def _search_result(collection: Collection, q: str, req: Request, resp: Response,
     base = proxy_base_url(req)
     qurl = f"{base}/{collection.value}/search/result?q={quote_plus(q)}"
     if len(res["hits"]["hits"]) == config["maxpage"]:
-        resume_key = encode(res["hits"]["hits"][-1]["sort"][0])
+        resume_key = encode(str(res["hits"]["hits"][-1]["sort"][0]))
         resp.headers["x-resume-token"] = resume_key
         resp.headers["link"] = f'<{qurl}&resume={resume_key}>; rel="next"'
     return [format_match(h, base, collection.value) for h in res["hits"]["hits"]]
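This one-line change guards the resume token against non-string sort values: Elasticsearch returns numeric sort keys (epoch milliseconds when sorting on a date field), and a string-oriented encoder raises on an int. A sketch of the failure mode, assuming `encode()` base64-encodes text (the stand-in below is ours; the real helper lives elsewhere in api.py):

```python
# Sketch of the bug the str() cast fixes. This encode() is an assumed stand-in
# for api.py's helper, taken here to base64-encode a string token.
from base64 import b64encode


def encode(value: str) -> str:
    return b64encode(value.encode("utf-8")).decode("utf-8")


sort_value = 1701907200000      # ES returns numeric sort keys for date fields
print(encode(str(sort_value)))  # works: MTcwMTkwNzIwMDAwMA==
# encode(sort_value)            # AttributeError: 'int' object has no attribute 'encode'
```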
10 changes: 10 additions & 0 deletions docs/testing.md
@@ -0,0 +1,10 @@
Testing
=======

To run the unit tests locally (via `pytest`) you need an Elasticsearch server running.

You can start one via Docker:
* `docker pull elasticsearch:8.8.2`
* `docker run --rm --name es-news-search-api -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" elasticsearch:8.8.2`

Then hit http://127.0.0.1:9200; you should see a JSON response indicating the server is running.
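For a scripted version of that check, something like this sketch (using the `elasticsearch` client pinned in requirements.txt) should print the server version:

```python
# Quick local sanity check, mirroring the browser/curl test above (a sketch).
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://127.0.0.1:9200"])
print(es.info()["version"]["number"])  # expect "8.8.2" for the container above
```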
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
altair==5.2.*
elasticsearch==8.8.*
fastapi==0.104.*
matplotlib==3.8.*
pandas==2.1.*
pydantic==2.5.*
requests
streamlit==1.29.*
uvicorn[standard]
wordcloud==1.9.*
pyyaml==6.0.*
pytest==7.*
httpx==0.25.*
5 changes: 5 additions & 0 deletions test/__init__.py
@@ -0,0 +1,5 @@
import os

INDEX_NAME = "mediacloud_test"
ELASTICSEARCH_URL = "http://127.0.0.1:9200"
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
62 changes: 62 additions & 0 deletions test/api_test.py
@@ -0,0 +1,62 @@
import os
from unittest import TestCase
from fastapi.testclient import TestClient

from test import INDEX_NAME, ELASTICSEARCH_URL

# make sure to set this env var before importing the app
os.environ["eshosts"] = ELASTICSEARCH_URL
from api import app

TIMEOUT = 30


class ApiTest(TestCase):

    def setUp(self):
        self._client = TestClient(app)

    def test_overview_all(self):
        # make sure all stories come back and the domain is right
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "*"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 1000
        assert 'matches' in results
        for story in results['matches']:
            assert 'canonical_domain' in story
            assert story['canonical_domain'] == 'example.com'

    def test_overview_no_results(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "asdfdf"}, timeout=TIMEOUT)
        assert response.status_code == 404

    def test_overview_by_content(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "article"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 1000
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "1"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] < 1000

    def test_overview_by_pub_date(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview',
                                     json={"q": "* AND publication_date:[2023-11-01 TO 2023-12-10]"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 300
        assert results['total'] < 1200

    def test_paging(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/result',
                                     json={"q": "*"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert len(results) == 1000
        next_page_token = response.headers.get('x-resume-token')
        assert next_page_token is not None
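`test_paging` only asserts that a resume token is issued. For illustration, here is a hedged sketch of how a client might follow the `link` header to page through all results, assuming (as the header built in api.py suggests) that the resume token is honored as a query parameter; this loop is ours, not part of the suite:

```python
# Hypothetical pagination loop that follows the Link header api.py emits (sketch).
import os

from test import INDEX_NAME, ELASTICSEARCH_URL

os.environ["eshosts"] = ELASTICSEARCH_URL  # must be set before importing the app
from api import app
from fastapi.testclient import TestClient

client = TestClient(app)
stories = []
response = client.post(f"/v1/{INDEX_NAME}/search/result", json={"q": "*"})
while response.status_code == 200:
    stories.extend(response.json())
    link = response.headers.get("link")  # e.g. '<...?q=*&resume=...>; rel="next"'
    if not link:
        break  # no Link header means this was the final page
    next_url = link[link.find("<") + 1:link.find(">")]
    # assumption: POSTing to the next-page URL (with its resume query param) works;
    # the test suite itself only asserts that the token is present
    response = client.post(next_url, json={"q": "*"})
print(f"fetched {len(stories)} stories")
```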
76 changes: 76 additions & 0 deletions test/create_fixtures.py
@@ -0,0 +1,76 @@
import json
import logging
import os
import copy
import hashlib
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConflictError

from test import INDEX_NAME, ELASTICSEARCH_URL, FIXTURES_DIR

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

es_client = Elasticsearch(hosts=[ELASTICSEARCH_URL], basic_auth=("elastic", "changeme"), verify_certs=False)

# first create the index
es_mappings = {
    "properties": {
        "original_url": {"type": "keyword"},
        "url": {"type": "keyword"},
        "normalized_url": {"type": "keyword"},
        "canonical_domain": {"type": "keyword"},
        "publication_date": {"type": "date", "ignore_malformed": True},
        "language": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "full_language": {"type": "keyword"},
        "text_extraction": {"type": "keyword"},
        "article_title": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "normalized_article_title": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "text_content": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "indexed_date": {"type": "date"},
    }
}

es_client.indices.create(index=INDEX_NAME, mappings=es_mappings, ignore=400)  # ignore 400 to handle index already exists
logger.info(f"Index '{INDEX_NAME}' with field mappings created successfully (or already exists).")

# now import the fixtures
base_fixture = {
    "original_url": "http://example.com/article",
    "url": "http://example.com/article",
    "normalized_url": "http://example.com/article",
    "article_title": "Sample Article ",
    "normalized_article_title": "sample_article_",
    "text_content": "This is the content of the sample article ",
    "canonical_domain": "example.com",
    "publication_date": "2023-11-01",
    "indexed_date": "2023-12-01",
    "language": "en",
    "full_language": "en-us",
    "text_extraction": "trafilatura",
}

imported_count = 0
for idx in range(0, 2000):
    fixture = copy.copy(base_fixture)
    fixture['original_url'] += str(idx)
    fixture['url'] += str(idx)
    fixture['normalized_url'] += str(idx)
    fixture['article_title'] += str(idx)
    fixture['normalized_article_title'] += str(idx)
    fixture['text_content'] += str(idx)
    # spread dates across two months: idx 0-999 land in 2023-10 and idx 1000-1999
    # in 2023-11, with days cycling 01-29; the pub-date range test relies on this
    fixture['publication_date'] = "2023-" + str(10 + int(idx / 1000)) + "-" + str(1 + (idx % 29)).zfill(2)
    fixture['indexed_date'] = "2023-" + str(10 + int(idx / 1000)) + "-" + str(1 + (idx % 29)).zfill(2)
    url_hash = hashlib.sha256(fixture['url'].encode("utf-8")).hexdigest()  # stable doc id keyed on URL
    try:
        es_client.index(index=INDEX_NAME, id=url_hash, document=fixture)
        imported_count += 1
    except ConflictError:
        logger.warning(" duplicate fixture, ignoring")
logger.info(f" Imported {imported_count}")
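After a run, a quick count should show the 2,000 fixtures. A sketch (the refresh makes just-indexed documents visible to search):

```python
# Hedged sketch: confirm the fixtures landed in the test index.
from elasticsearch import Elasticsearch

from test import INDEX_NAME, ELASTICSEARCH_URL

es = Elasticsearch(hosts=[ELASTICSEARCH_URL])
es.indices.refresh(index=INDEX_NAME)        # make just-indexed docs searchable
print(es.count(index=INDEX_NAME)["count"])  # expect 2000 after a clean run
```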
