Skip to content

Commit

Permalink
update mirrors from url
Browse files Browse the repository at this point in the history
  • Loading branch information
elfkuzco committed Jun 7, 2024
1 parent e7b68f2 commit 7b83097
Show file tree
Hide file tree
Showing 13 changed files with 368 additions and 8 deletions.
7 changes: 6 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dependencies = [
"pydantic == 2.7.2",
"SQLAlchemy == 2.0.30",
"psycopg[binary,pool] == 3.1.19",
"beautifulsoup4 == 4.12.3",
"requests == 2.32.3",
]
license = {text = "GPL-3.0-or-later"}
classifiers = [
Expand All @@ -30,6 +32,9 @@ dynamic = ["version"]
[project.urls]
Homepage = "https://github.com/kiwix/mirrors-qa"

[project.scripts]
mirrors-qa-backend = "mirrors_qa_backend.cli:main"

[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
Expand Down Expand Up @@ -92,7 +97,7 @@ fix-ruff = "inv fix-ruff --args '{args}'"
fixall = "inv fixall --args '{args}'"

[tool.hatch.envs.check]
features = ["scripts", "check"]
features = ["scripts", "test", "check"]

[tool.hatch.envs.check.scripts]
pyright = "inv check-pyright --args '{args}'"
Expand Down
5 changes: 3 additions & 2 deletions backend/src/mirrors_qa_backend/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
import os

from mirrors_qa_backend.settings import Settings

# Package-wide logger shared by the backend modules.
logger = logging.getLogger("backend")

# Configure the logger only when nothing has been attached yet, so importing
# the package does not stack a second handler on an already-configured logger
# (hasHandlers() also takes handlers of ancestor loggers into account).
if not logger.hasHandlers():
    # The verbosity is driven solely by Settings.debug; the previous
    # os.getenv("DEBUG") based call was immediately overridden and is dropped.
    logger.setLevel(logging.DEBUG if Settings.debug else logging.INFO)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("[%(asctime)s: %(levelname)s] %(message)s"))
    logger.addHandler(handler)
6 changes: 6 additions & 0 deletions backend/src/mirrors_qa_backend/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import sys

# Entry point for `python -m mirrors_qa_backend`.
if __name__ == "__main__":
    # Imported lazily so that merely importing this module stays side-effect free.
    from mirrors_qa_backend.cli import main

    exit_status = main()
    sys.exit(exit_status)
23 changes: 23 additions & 0 deletions backend/src/mirrors_qa_backend/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import argparse

from mirrors_qa_backend import Settings, db
from mirrors_qa_backend.db import mirrors


def main():
    """
    Command-line entry point of the backend.

    Parses the command-line arguments and, when `--update-mirrors` is given,
    synchronises the mirrors stored in the database with the list currently
    published on `Settings.mirrors_url`. Without the flag nothing is done.
    """
    cli = argparse.ArgumentParser(prog="mirrors-qa-backend")
    cli.add_argument(
        "--update-mirrors",
        action="store_true",
        help=f"Update the list of mirrors from {Settings.mirrors_url}",
    )
    options = cli.parse_args()

    if not options.update_mirrors:
        return

    # Session.begin() commits on a clean exit and rolls back on error.
    with db.Session.begin() as session:
        current_countries = mirrors.get_current_mirror_countries()
        mirrors.update_mirrors(session, current_countries)


if __name__ == "__main__":
    main()
13 changes: 10 additions & 3 deletions backend/src/mirrors_qa_backend/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sqlalchemy.orm import sessionmaker

from mirrors_qa_backend import logger
from mirrors_qa_backend.db.models import Mirror
from mirrors_qa_backend.db import mirrors, models
from mirrors_qa_backend.settings import Settings

Session = sessionmaker(
Expand Down Expand Up @@ -38,9 +38,16 @@ def count_from_stmt(session: OrmSession, stmt: SelectBase) -> int:

def initialize_mirrors() -> None:
    """
    Bring the mirrors table in line with the published list of mirrors.

    When the database holds no mirror yet, every mirror found on
    `Settings.mirrors_url` is created (nothing is written if the page lists
    none). Otherwise the existing mirrors are updated against the current
    list via `mirrors.update_mirrors`.
    """
    # Fetch the list before opening the transaction so that no database
    # transaction is held open while waiting on the HTTP request.
    countries = mirrors.get_current_mirror_countries()

    with Session.begin() as session:
        count = count_from_stmt(session, select(models.Mirror))
        if count == 0:
            logger.info("No mirrors exist in database.")
            if not countries:
                logger.info(f"No mirrors were found on {Settings.mirrors_url}")
                return
            mirrors.create_mirrors(session, countries)
        else:
            logger.info(f"Found {count} mirrors in database.")
            # Update the list of enabled mirrors
            mirrors.update_mirrors(session, countries)
140 changes: 140 additions & 0 deletions backend/src/mirrors_qa_backend/db/mirrors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from typing import Any
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup, NavigableString
from bs4.element import Tag
from sqlalchemy import select
from sqlalchemy.orm import Session as OrmSession
from sqlalchemy.orm import selectinload

from mirrors_qa_backend import logger, schemas
from mirrors_qa_backend.db import models
from mirrors_qa_backend.settings import Settings


def create_mirrors(session: OrmSession, countries: list[schemas.Country]) -> None:
    """
    Add every country in `countries`, together with its mirrors, to `session`.

    Nothing is committed here; that is left to the owner of the session.
    """
    for entry in countries:
        db_country = models.Country(code=entry.code, name=entry.name)
        db_country.mirrors = [
            models.Mirror(**mirror.model_dump()) for mirror in entry.mirrors
        ]
        session.add(db_country)


def update_mirrors(session: OrmSession, countries: list[schemas.Country]) -> None:
    """
    Updates the status of mirrors in the database. Any mirrors in the database
    that do not exist in the current mirrors obtained from `countries` are
    marked as disabled. New mirrors are saved accordingly.

    A mirror that already exists in the database and is still listed is left
    untouched (its `enabled` flag is not modified). Nothing is committed here;
    that is left to the owner of `session`.
    """
    # If there are no countries, disable all mirrors
    if not countries:
        for mirror in session.scalars(select(models.Mirror)).all():
            mirror.enabled = False
            session.add(mirror)
        return

    query = select(models.Country).options(selectinload(models.Country.mirrors))
    # Map the country codes to each country from the database. To be used
    # to compare against the list of current countries
    db_countries: dict[str, models.Country] = {
        country.code: country for country in session.scalars(query).all()
    }
    # Map the country codes to each country from the current list of countries.
    # To be used in determining if a country is to be newly registered
    current_countries: dict[str, schemas.Country] = {
        country.code: country for country in countries
    }

    for country_code, country in current_countries.items():
        if country_code not in db_countries:
            # Register all of the country's mirrors as the country is
            # a new country
            logger.debug(f"Registering new mirrors for {country_code!r}")
            c = models.Country(code=country.code, name=country.name)
            c.mirrors = [models.Mirror(**m.model_dump()) for m in country.mirrors]
            session.add(c)

    for code, db_country in db_countries.items():
        if code not in current_countries:
            # disable all of the country's mirrors as they have been removed
            for db_mirror in db_country.mirrors:
                logger.debug(f"Disabling mirror {db_mirror.id!r}")
                db_mirror.enabled = False
                session.add(db_mirror)
            continue

        # Even though the db_country is "current", ensure its mirrors
        # are in sync with the current mirrors
        current_mirrors: dict[str, schemas.Mirror] = {
            m.id: m for m in current_countries[code].mirrors
        }
        db_mirrors: dict[str, models.Mirror] = {m.id: m for m in db_country.mirrors}

        for db_mirror in db_mirrors.values():
            if db_mirror.id not in current_mirrors:
                logger.debug(f"Disabling mirror {db_mirror.id!r}")
                db_mirror.enabled = False
                session.add(db_mirror)

        for mirror_id, mirror in current_mirrors.items():
            if mirror_id not in db_mirrors:
                logger.debug(
                    f"Registering new mirror {mirror.id!r} for "
                    f"country: {db_country.name!r}"
                )
                db_country.mirrors.append(models.Mirror(**mirror.model_dump()))
                session.add(db_country)


def get_current_mirror_countries() -> list[schemas.Country]:
    """
    Fetch `Settings.mirrors_url` and return the mirrors it lists, grouped by
    country.

    Rows whose country name is in `Settings.mirrors_exclusion_list` are
    skipped. Every returned mirror is marked as enabled and is identified by
    the hostname of its HTTP link.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        ValueError: if the page does not contain a table body to parse.
    """

    def find_country_rows(tag: Tag) -> bool:
        """
        Filters out table rows that do not contain mirror
        data from the table body.
        """
        return tag.name == "tr" and tag.find("td", class_="newregion") is None

    r = requests.get(Settings.mirrors_url, timeout=Settings.requests_timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, features="html.parser")
    body = soup.find("tbody")

    if body is None or isinstance(body, NavigableString):
        raise ValueError(
            f"Unable to find the table of mirrors on {Settings.mirrors_url}"
        )

    # Given a country might have more than one mirror, set up a dictionary
    # of country_code to the country's data. If it is the first time we
    # are seeing the country, we save it along with its mirror, else,
    # we simply update its mirrors list.
    countries: dict[str, schemas.Country] = {}
    for row in body.find_all(find_country_rows):
        # The flag image carries the country code in its alt text and is
        # directly followed by the country name.
        flag = row.find("img")
        country_name = flag.next_sibling.text.strip()
        if country_name in Settings.mirrors_exclusion_list:
            continue
        country_code = flag["alt"]
        base_url = row.find("a", string="HTTP")["href"]
        hostname: Any = urlsplit(
            base_url
        ).netloc  # pyright: ignore [reportUnknownMemberType]

        mirror = schemas.Mirror(id=hostname, base_url=base_url, enabled=True)
        if country_code not in countries:
            countries[country_code] = schemas.Country(
                code=country_code,
                name=country_name,
                mirrors=[mirror],
            )
        else:
            countries[country_code].mirrors.append(mirror)
    return list(countries.values())
27 changes: 27 additions & 0 deletions backend/src/mirrors_qa_backend/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pydantic
from pydantic import ConfigDict


class BaseModel(pydantic.BaseModel):
    """Common base class for the schemas declared in this module."""

    # use_enum_values: store the value of an enum member on the model rather
    # than the enum member itself.
    model_config = ConfigDict(use_enum_values=True)


class Mirror(BaseModel):
    """Schema describing a single mirror."""

    id: str  # hostname of a mirror URL
    base_url: str
    enabled: bool
    # NOTE(review): the optional fields below all default to None and their
    # exact meaning is not shown in this module; presumably they mirror the
    # attributes of the upstream mirror list - confirm.
    region: str | None = None
    asn: str | None = None
    score: int | None = None
    latitude: float | None = None
    longitude: float | None = None
    country_only: bool | None = None
    region_only: bool | None = None
    as_only: bool | None = None
    other_countries: list[str] | None = None


class Country(BaseModel):
    """Schema describing a country together with the mirrors attached to it."""

    code: str  # two-letter country codes as defined in ISO 3166-1
    name: str  # full name of country (in English)
    mirrors: list[Mirror]
6 changes: 6 additions & 0 deletions backend/src/mirrors_qa_backend/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ class Settings:
"""Shared backend configuration"""

# database connection string, read from POSTGRES_URI (declared as mandatory)
database_url: str = getenv("POSTGRES_URI", mandatory=True)
# page from which the current list of mirrors is read
mirrors_url = "https://download.kiwix.org/mirrors.html"
# comma-separated list of mirror country names to exclude
# (the value is split on "," as-is; entries are not stripped of whitespace)
mirrors_exclusion_list = getenv("EXCLUDED_MIRRORS", default="Israel").split(",")
# NOTE(review): presumably getenv returns the raw environment string, in which
# case any non-empty DEBUG value (including "0" or "false") enables debug
# mode - confirm against the getenv helper.
debug = bool(getenv("DEBUG", default=False))
# number of seconds before requests time out
requests_timeout = int(getenv("REQUESTS_TIMEOUT", default=5))
19 changes: 19 additions & 0 deletions backend/tests/db/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from collections.abc import Generator

import pytest
from sqlalchemy.orm import Session as OrmSession

from mirrors_qa_backend.db import Session, models


@pytest.fixture
def dbsession() -> Generator[OrmSession, None, None]:
    """
    Yields a session bound to a database whose tables have just been
    dropped and re-created, then rolls the session back once the test is done.
    """
    with Session.begin() as session:
        bind = session.get_bind()
        # Start every test from freshly created, empty tables
        metadata = models.Base.metadata
        metadata.drop_all(bind=bind)
        metadata.create_all(bind=bind)
        yield session
        session.rollback()
Loading

0 comments on commit 7b83097

Please sign in to comment.