Skip to content

Commit

Permalink
use mirrors data to update db instead of countries
Browse files Browse the repository at this point in the history
  • Loading branch information
elfkuzco committed Jun 10, 2024
1 parent 9865a19 commit 7b0c1c4
Show file tree
Hide file tree
Showing 13 changed files with 307 additions and 267 deletions.
3 changes: 2 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"psycopg[binary,pool] == 3.1.19",
"beautifulsoup4 == 4.12.3",
"requests == 2.32.3",
"pycountry == 24.6.1",
]
license = {text = "GPL-3.0-or-later"}
classifiers = [
Expand All @@ -33,7 +34,7 @@ dynamic = ["version"]
Homepage = "https://github.com/kiwix/mirrors-qa"

[project.scripts]
mirrors-qa-backend = "mirrors_qa_backend.cli:main"
mirrors-qa-backend = "mirrors_qa_backend.entrypoint:main"

[project.optional-dependencies]
scripts = [
Expand Down
6 changes: 0 additions & 6 deletions backend/src/mirrors_qa_backend/__main__.py

This file was deleted.

23 changes: 0 additions & 23 deletions backend/src/mirrors_qa_backend/cli.py

This file was deleted.

27 changes: 17 additions & 10 deletions backend/src/mirrors_qa_backend/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from mirrors_qa_backend import logger
from mirrors_qa_backend.db import mirrors, models
from mirrors_qa_backend.extract import get_current_mirrors
from mirrors_qa_backend.settings import Settings

Session = sessionmaker(
Expand Down Expand Up @@ -38,16 +39,22 @@ def count_from_stmt(session: OrmSession, stmt: SelectBase) -> int:

def initialize_mirrors() -> None:
    """Synchronise the mirrors stored in the database with the current list.

    The current mirrors are obtained from `get_current_mirrors` and handed to
    `mirrors.update_mirrors`, which creates the missing mirrors and disables
    the ones that are no longer listed.

    When the database holds no mirror and the current list is empty as well,
    nothing is written and the function returns after logging.

    Raises:
        EmptyMirrorsError: propagated from `mirrors.update_mirrors` when the
            database already holds mirrors but the current list is empty.
    """
    # Fetch the list before opening the transaction so the transaction is not
    # kept open while waiting on the fetch.
    # NOTE(review): get_current_mirrors presumably performs an HTTP request to
    # Settings.mirrors_url -- confirm against mirrors_qa_backend.extract.
    current_mirrors = get_current_mirrors()

    with Session.begin() as session:
        nb_mirrors = count_from_stmt(session, select(models.Mirror))
        if nb_mirrors == 0:
            logger.info("No mirrors exist in database.")
            if not current_mirrors:
                logger.info(f"No mirrors were found on {Settings.mirrors_url!r}")
                return
            result = mirrors.update_mirrors(session, current_mirrors)
            logger.info(
                f"Registered {result.nb_mirrors_added} mirrors "
                f"from {Settings.mirrors_url!r}"
            )
        else:
            logger.info(f"Found {nb_mirrors} mirrors in database.")
            result = mirrors.update_mirrors(session, current_mirrors)
            logger.info(
                f"Added {result.nb_mirrors_added} mirrors. "
                f"Disabled {result.nb_mirrors_disabled} mirrors."
            )
214 changes: 92 additions & 122 deletions backend/src/mirrors_qa_backend/db/mirrors.py
Original file line number Diff line number Diff line change
@@ -1,140 +1,110 @@
from typing import Any
from urllib.parse import urlsplit
from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup, NavigableString
from bs4.element import Tag
from sqlalchemy import select
from sqlalchemy.orm import Session as OrmSession
from sqlalchemy.orm import selectinload

from mirrors_qa_backend import logger, schemas
from mirrors_qa_backend.db import models
from mirrors_qa_backend.settings import Settings
from mirrors_qa_backend.exceptions import EmptyMirrorsError


def create_mirrors(session: OrmSession, countries: list[schemas.Country]) -> None:
for country in countries:
c = models.Country(code=country.code, name=country.name)
c.mirrors = [models.Mirror(**m.model_dump()) for m in country.mirrors]
session.add(c)
@dataclass
class UpdateMirrorsResult:
    """Represents the results of an update to the list of mirrors in the database"""

    # Number of mirrors created in the database during the update.
    nb_mirrors_added: int = 0
    # Number of mirrors marked as disabled during the update.
    nb_mirrors_disabled: int = 0

def update_mirrors(session: OrmSession, countries: list[schemas.Country]) -> None:

def create_mirrors(session: OrmSession, mirrors: list[schemas.Mirror]) -> int:
    """Save every mirror from `mirrors` to the database.

    Each mirror is linked to the country row whose code matches the mirror's
    country; that country is created first when no such row exists yet.
    Assumes that none of the mirrors already exists in the database.

    Returns the total number of mirrors created.
    """
    total = 0
    for mirror in mirrors:
        db_mirror = models.Mirror(
            id=mirror.id,
            base_url=mirror.base_url,
            enabled=mirror.enabled,
            region=mirror.region,
            asn=mirror.asn,
            score=mirror.score,
            latitude=mirror.latitude,
            longitude=mirror.longitude,
            country_only=mirror.country_only,
            # Previously copied from `country_only` by mistake.
            region_only=mirror.region_only,
            as_only=mirror.as_only,
            other_countries=mirror.other_countries,
        )
        # Ensure the country exists for the mirror
        country = session.scalars(
            select(models.Country).where(models.Country.code == mirror.country.code)
        ).one_or_none()

        if country is None:
            country = models.Country(code=mirror.country.code, name=mirror.country.name)
            session.add(country)

        db_mirror.country = country
        session.add(db_mirror)
        logger.debug(
            f"Registered new mirror: {db_mirror.id!r} for country: {country.name!r}"
        )
        total += 1
    return total


def update_mirrors(
    session: OrmSession, mirrors: list[schemas.Mirror]
) -> UpdateMirrorsResult:
    """Bring the mirrors stored in the database in line with `mirrors`.

    Mirrors from `mirrors` that are not in the database are created through
    `create_mirrors`. Mirrors in the database that are not in `mirrors` are
    marked as disabled. Mirrors are matched on their id.

    Returns an UpdateMirrorsResult holding the number of mirrors added and the
    number of mirrors disabled by this call.

    Raises:
        EmptyMirrorsError: if `mirrors` is empty.
    """
    # An empty list is rejected instead of being treated as "disable every
    # mirror".
    if not mirrors:
        raise EmptyMirrorsError("mirrors list must not be empty")

    result = UpdateMirrorsResult()

    # Index the incoming mirrors by id so they can be compared against the
    # ids of the mirrors stored in the database.
    current_mirrors: dict[str, schemas.Mirror] = {
        mirror.id: mirror for mirror in mirrors
    }

    # Index the stored mirrors by id; the country is loaded eagerly because
    # its name is used in the log message below.
    query = select(models.Mirror).options(selectinload(models.Mirror.country))
    db_mirrors: dict[str, models.Mirror] = {
        mirror.id: mirror for mirror in session.scalars(query).all()
    }

    # Create any mirror that doesn't exist in the database
    new_mirrors = [
        mirror
        for mirror_id, mirror in current_mirrors.items()
        if mirror_id not in db_mirrors
    ]
    result.nb_mirrors_added += create_mirrors(session, new_mirrors)

    # Disable any mirror in the database that doesn't exist in the current
    # list of mirrors. Mirrors that are already disabled are skipped so that
    # nb_mirrors_disabled only counts the mirrors disabled by this call.
    # NOTE(review): a mirror that was disabled earlier and is listed again is
    # left disabled here -- confirm whether it should be re-enabled.
    for db_mirror_id, db_mirror in db_mirrors.items():
        if db_mirror_id in current_mirrors or not db_mirror.enabled:
            continue
        logger.debug(
            f"Disabling mirror: {db_mirror.id!r} for "
            f"country: {db_mirror.country.name!r}"
        )
        db_mirror.enabled = False
        session.add(db_mirror)
        result.nb_mirrors_disabled += 1
    return result
6 changes: 3 additions & 3 deletions backend/src/mirrors_qa_backend/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ class Worker(Base):
# RSA public key in PKCS8 format for generating access tokens required
# to make requests to the web server
pubkey_pkcs8: Mapped[str]
pubkey_fingerprint: Mapped[str | None] = mapped_column(default=None)
pubkey_fingerprint: Mapped[str]

last_seen_on: Mapped[datetime | None] = mapped_column(default=None)
last_seen_on: Mapped[datetime] = mapped_column(default_factory=datetime.now)
countries: Mapped[list[Country]] = relationship(back_populates="worker", init=False)


Expand All @@ -108,7 +108,7 @@ class Test(Base):
id: Mapped[UUID] = mapped_column(
init=False, primary_key=True, server_default=text("uuid_generate_v4()")
)
requested_on: Mapped[datetime]
requested_on: Mapped[datetime] = mapped_column(default_factory=datetime.now)
started_on: Mapped[datetime | None] = mapped_column(default=None)
status: Mapped[StatusEnum] = mapped_column(
Enum(
Expand Down
31 changes: 31 additions & 0 deletions backend/src/mirrors_qa_backend/entrypoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import argparse
import logging

from mirrors_qa_backend import Settings, db, logger
from mirrors_qa_backend.db import mirrors
from mirrors_qa_backend.extract import get_current_mirrors


def main() -> None:
    """Parse the command-line arguments and run the requested action.

    With --verbose the package logger is switched to DEBUG. With
    --update-mirrors the current mirrors are fetched and applied to the
    database inside a single transaction.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--update-mirrors",
        action="store_true",
        help=f"Update the list of mirrors from {Settings.mirrors_url}",
    )
    parser.add_argument(
        "--verbose", "-v", help="Show verbose output", action="store_true"
    )

    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    if args.update_mirrors:
        with db.Session.begin() as session:
            mirrors.update_mirrors(session, get_current_mirrors())


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions backend/src/mirrors_qa_backend/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class EmptyMirrorsError(Exception):
    """Raised when a mirrors update is attempted with an empty list of mirrors."""
Loading

0 comments on commit 7b0c1c4

Please sign in to comment.