Merge pull request #37 from mediacloud/feature-unit-tests
Unit tests and CI
rahulbot authored Dec 7, 2023
2 parents a018774 + 609659b commit eae01d8
Showing 12 changed files with 206 additions and 60 deletions.
15 changes: 0 additions & 15 deletions .github/workflows/lint.yml

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/sync.yml

This file was deleted.

36 changes: 36 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,36 @@
name: Test Code

on:
  - push

jobs:
  test:
    if: github.event_name == 'push'
    runs-on: ubuntu-latest
    name: Unit tests in Docker
    steps:
      - uses: actions/checkout@v4
      - name: Configure sysctl limits
        run: |
          sudo swapoff -a
          sudo sysctl -w vm.swappiness=1
          sudo sysctl -w fs.file-max=262144
          sudo sysctl -w vm.max_map_count=262144
      - name: Run Elasticsearch
        uses: elastic/elastic-github-actions/elasticsearch@master
        with:
          stack-version: 8.8.2
          security-enabled: false
      - name: Elasticsearch is reachable
        run: |
          curl --verbose --show-error http://localhost:9200
      - name: Install python dependencies
        run: |
          pip install -r requirements.txt
      - name: Install fixtures
        run: |
          python -m test.create_fixtures
      - name: Run all tests
        run: |
          pytest
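The curl step assumes Elasticsearch is already accepting connections by the time the action returns. If that ever proves flaky, a polling probe could do the same check with retries; a minimal sketch using the `requests` package already pinned in requirements.txt (the `wait_for_elasticsearch` helper is ours, not part of this repo):

```python
# Minimal readiness probe, equivalent in spirit to the workflow's one-shot curl.
# wait_for_elasticsearch is a hypothetical helper, not something in this repo.
import time

import requests


def wait_for_elasticsearch(url: str = "http://localhost:9200", attempts: int = 30) -> None:
    """Poll the Elasticsearch root endpoint until it responds, or give up."""
    for _ in range(attempts):
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return
        except requests.exceptions.ConnectionError:
            pass  # server not up yet; retry after a short sleep
        time.sleep(1)
    raise RuntimeError(f"Elasticsearch not reachable at {url}")


if __name__ == "__main__":
    wait_for_elasticsearch()
```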
1 change: 1 addition & 0 deletions .gitignore
@@ -140,3 +140,4 @@ cython_debug/
 # App-specific ignores
 config.yml
 *.back
+.idea
4 changes: 0 additions & 4 deletions .gitlab-ci.yml

This file was deleted.

23 changes: 2 additions & 21 deletions Dockerfile
@@ -5,29 +5,10 @@ FROM python:3.10 AS base
 ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
 WORKDIR /app
 CMD ["./api.py"]
-RUN pip install --no-cache-dir \
-    altair \
-    "elasticsearch==8.8.0" \
-    fastapi \
-    matplotlib \
-    pandas \
-    pydantic \
-    requests \
-    streamlit \
-    "uvicorn[standard]" \
-    wordcloud \
-    pyyaml
-
-# Lint code
+# Install dependencies
 FROM base
-RUN pip install --no-cache-dir pylint
+RUN pip install --no-cache-dir -r requirements.txt
 COPY . ./
-RUN pylint *.py \
-    --max-line-length=120 \
-    --good-names="c,ct,e,ep,id,q,r" \
-    --disable="C0103,C0114,C0115,C0116" \
-    --extension-pkg-whitelist="pydantic"
-
 # Build image
 FROM base
 COPY . ./
2 changes: 1 addition & 1 deletion api.py
@@ -389,7 +389,7 @@ def _search_result(collection: Collection, q: str, req: Request, resp: Response,
     base = proxy_base_url(req)
     qurl = f"{base}/{collection.value}/search/result?q={quote_plus(q)}"
     if len(res["hits"]["hits"]) == config["maxpage"]:
-        resume_key = encode(res["hits"]["hits"][-1]["sort"][0])
+        resume_key = encode(str(res["hits"]["hits"][-1]["sort"][0]))
         resp.headers["x-resume-token"] = resume_key
         resp.headers["link"] = f'<{qurl}&resume={resume_key}>; rel="next"'
     return [format_match(h, base, collection.value) for h in res["hits"]["hits"]]
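This one-line change guards the resume token against non-string sort values: Elasticsearch returns numeric sort keys (epoch milliseconds when sorting on a date field), and a string-oriented encoder raises on an int. A sketch of the failure mode, assuming `encode()` base64-encodes text (the stand-in below is ours; the real helper lives elsewhere in api.py):

```python
# Sketch of the bug the str() cast fixes. This encode() is an assumed stand-in
# for api.py's helper, taken here to base64-encode a string token.
from base64 import b64encode


def encode(value: str) -> str:
    return b64encode(value.encode("utf-8")).decode("utf-8")


sort_value = 1701907200000      # ES returns numeric sort keys for date fields
print(encode(str(sort_value)))  # works: MTcwMTkwNzIwMDAwMA==
# encode(sort_value)            # AttributeError: 'int' object has no attribute 'encode'
```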
10 changes: 10 additions & 0 deletions docs/testing.md
@@ -0,0 +1,10 @@
Testing
=======

To run the unit tests locally (via `pytest`) you need an Elasticsearch server running.

You can start one via Docker:
* `docker pull elasticsearch:8.8.2`
* `docker run --rm --name es-news-search-api -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" elasticsearch:8.8.2`

Then hit http://127.0.0.1:9200; you should see a JSON response indicating the server is running.
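For a scripted version of that check, something like this sketch (using the `elasticsearch` client pinned in requirements.txt) should print the server version:

```python
# Quick local sanity check, mirroring the browser/curl test above (a sketch).
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://127.0.0.1:9200"])
print(es.info()["version"]["number"])  # expect "8.8.2" for the container above
```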
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
altair==5.2.*
elasticsearch==8.8.*
fastapi==0.104.*
matplotlib==3.8.*
pandas==2.1.*
pydantic==2.5.*
requests
streamlit==1.29.*
uvicorn[standard]
wordcloud==1.9.*
pyyaml==6.0.*
pytest==7.*
httpx==0.25.*
5 changes: 5 additions & 0 deletions test/__init__.py
@@ -0,0 +1,5 @@
import os

INDEX_NAME = "mediacloud_test"
ELASTICSEARCH_URL = "http://127.0.0.1:9200"
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
62 changes: 62 additions & 0 deletions test/api_test.py
@@ -0,0 +1,62 @@
import os
from unittest import TestCase
from fastapi.testclient import TestClient

from test import INDEX_NAME, ELASTICSEARCH_URL

# make sure to set this env var before importing the app
os.environ["eshosts"] = ELASTICSEARCH_URL
from api import app

TIMEOUT = 30


class ApiTest(TestCase):

    def setUp(self):
        self._client = TestClient(app)

    def test_overview_all(self):
        # make sure all stories come back and the domain is right
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "*"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 1000
        assert 'matches' in results
        for story in results['matches']:
            assert 'canonical_domain' in story
            assert story['canonical_domain'] == 'example.com'

    def test_overview_no_results(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "asdfdf"}, timeout=TIMEOUT)
        assert response.status_code == 404

    def test_overview_by_content(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "article"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 1000
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview', json={"q": "1"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] < 1000

    def test_overview_by_pub_date(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/overview',
                                     json={"q": "* AND publication_date:[2023-11-01 TO 2023-12-10]"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert 'total' in results
        assert results['total'] > 300
        assert results['total'] < 1200

    def test_paging(self):
        response = self._client.post(f'/v1/{INDEX_NAME}/search/result',
                                     json={"q": "*"}, timeout=TIMEOUT)
        assert response.status_code == 200
        results = response.json()
        assert len(results) == 1000
        next_page_token = response.headers.get('x-resume-token')
        assert next_page_token is not None
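`test_paging` only asserts that a resume token is issued. For illustration, here is a hedged sketch of how a client might follow the `link` header to page through all results, assuming (as the header built in api.py suggests) that the resume token is honored as a query parameter; this loop is ours, not part of the suite:

```python
# Hypothetical pagination loop that follows the Link header api.py emits (sketch).
import os

from test import INDEX_NAME, ELASTICSEARCH_URL

os.environ["eshosts"] = ELASTICSEARCH_URL  # must be set before importing the app
from api import app
from fastapi.testclient import TestClient

client = TestClient(app)
stories = []
response = client.post(f"/v1/{INDEX_NAME}/search/result", json={"q": "*"})
while response.status_code == 200:
    stories.extend(response.json())
    link = response.headers.get("link")  # e.g. '<...?q=*&resume=...>; rel="next"'
    if not link:
        break  # no Link header means this was the final page
    next_url = link[link.find("<") + 1:link.find(">")]
    # assumption: POSTing to the next-page URL (with its resume query param) works;
    # the test suite itself only asserts that the token is present
    response = client.post(next_url, json={"q": "*"})
print(f"fetched {len(stories)} stories")
```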
76 changes: 76 additions & 0 deletions test/create_fixtures.py
@@ -0,0 +1,76 @@
import json
import logging
import os
import copy
import hashlib
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConflictError

from test import INDEX_NAME, ELASTICSEARCH_URL, FIXTURES_DIR

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

es_client = Elasticsearch(hosts=[ELASTICSEARCH_URL], basic_auth=("elastic", "changeme"), verify_certs=False)

# first create the index
es_mappings = {
    "properties": {
        "original_url": {"type": "keyword"},
        "url": {"type": "keyword"},
        "normalized_url": {"type": "keyword"},
        "canonical_domain": {"type": "keyword"},
        "publication_date": {"type": "date", "ignore_malformed": True},
        "language": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "full_language": {"type": "keyword"},
        "text_extraction": {"type": "keyword"},
        "article_title": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "normalized_article_title": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "text_content": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "indexed_date": {"type": "date"},
    }
}

es_client.indices.create(index=INDEX_NAME, mappings=es_mappings, ignore=400)  # ignore 400 to handle index already exists
logger.info(f"Index '{INDEX_NAME}' with field mappings created successfully (or already exists).")

# now import the fixtures
base_fixture = {
    "original_url": "http://example.com/article",
    "url": "http://example.com/article",
    "normalized_url": "http://example.com/article",
    "article_title": "Sample Article ",
    "normalized_article_title": "sample_article_",
    "text_content": "This is the content of the sample article ",
    "canonical_domain": "example.com",
    "publication_date": "2023-11-01",
    "indexed_date": "2023-12-01",
    "language": "en",
    "full_language": "en-us",
    "text_extraction": "trafilatura",
}

imported_count = 0
for idx in range(0, 2000):
    fixture = copy.copy(base_fixture)
    fixture['original_url'] += str(idx)
    fixture['url'] += str(idx)
    fixture['normalized_url'] += str(idx)
    fixture['article_title'] += str(idx)
    fixture['normalized_article_title'] += str(idx)
    fixture['text_content'] += str(idx)
    # spread dates across two months: idx 0-999 land in 2023-10 and idx 1000-1999
    # in 2023-11, with days cycling 01-29; the pub-date range test relies on this
    fixture['publication_date'] = "2023-" + str(10 + int(idx / 1000)) + "-" + str(1 + (idx % 29)).zfill(2)
    fixture['indexed_date'] = "2023-" + str(10 + int(idx / 1000)) + "-" + str(1 + (idx % 29)).zfill(2)
    url_hash = hashlib.sha256(fixture['url'].encode("utf-8")).hexdigest()  # stable doc id keyed on URL
    try:
        es_client.index(index=INDEX_NAME, id=url_hash, document=fixture)
        imported_count += 1
    except ConflictError:
        logger.warning(" duplicate fixture, ignoring")
logger.info(f" Imported {imported_count}")
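After a run, a quick count should show the 2,000 fixtures. A sketch (the refresh makes just-indexed documents visible to search):

```python
# Hedged sketch: confirm the fixtures landed in the test index.
from elasticsearch import Elasticsearch

from test import INDEX_NAME, ELASTICSEARCH_URL

es = Elasticsearch(hosts=[ELASTICSEARCH_URL])
es.indices.refresh(index=INDEX_NAME)        # make just-indexed docs searchable
print(es.count(index=INDEX_NAME)["count"])  # expect 2000 after a clean run
```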
