Skip to content

Commit

Permalink
Use zimscraperlib session to fetch web content with automatic meaningful retries
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 28, 2024
1 parent cdb9876 commit 44483b1
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 14 deletions.
11 changes: 5 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
from pydantic import BaseModel
from requests import Response

from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand Down Expand Up @@ -121,7 +122,7 @@ def _get_text(self, url_subpath_and_query: str) -> str:
full_url = f"{self.library_url}{url_subpath_and_query}"
logger.debug(f"Fetching {full_url}")

resp = requests.get(
resp = web_session.get(

Check warning on line 125 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L125

Added line #L125 was not covered by tests
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
Expand All @@ -131,12 +132,10 @@ def _get_text(self, url_subpath_and_query: str) -> str:
cache_file.write_text(resp.text)
return resp.text

def _get_api_resp(
self, api_sub_path_and_query: str, timeout: float
) -> requests.Response:
def _get_api_resp(self, api_sub_path_and_query: str, timeout: float) -> Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = requests.get(
resp = web_session.get(

Check warning on line 138 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L138

Added line #L138 was not covered by tests
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
Expand Down
3 changes: 3 additions & 0 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import (
getLogger,
)
Expand All @@ -18,3 +19,5 @@
HTTP_TIMEOUT_LONG_SECONDS = 30

logger = getLogger(NAME, level=logging.DEBUG)

web_session = get_session()
24 changes: 19 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@
MindtouchClient,
MindtouchHome,
)
from mindtouch2zim.constants import LANGUAGE_ISO_639_3, NAME, VERSION, logger
from mindtouch2zim.constants import (
LANGUAGE_ISO_639_3,
NAME,
VERSION,
logger,
web_session,
)
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -345,7 +351,9 @@ def run(self) -> Path:
)

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
stream_file(

Check warning on line 354 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L354

Added line #L354 was not covered by tests
home.welcome_image_url, byte_stream=welcome_image, session=web_session
)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

Expand Down Expand Up @@ -437,7 +445,11 @@ def run(self) -> Path:
for asset_url in asset_urls:
try:
asset_content = BytesIO()
stream_file(asset_url.value, byte_stream=asset_content)
stream_file(

Check warning on line 448 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L448

Added line #L448 was not covered by tests
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
Expand Down Expand Up @@ -474,7 +486,7 @@ def _process_css(
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
css_buffer = BytesIO()
stream_file(css_location, byte_stream=css_buffer)
stream_file(css_location, byte_stream=css_buffer, session=web_session)

Check warning on line 489 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L489

Added line #L489 was not covered by tests
css_content = css_buffer.getvalue()
url_rewriter = CssUrlsRewriter(
article_url=HttpUrl(css_location),
Expand Down Expand Up @@ -548,7 +560,9 @@ def _fetch_zim_illustration(self, home: MindtouchHome) -> BytesIO:
try:
logger.debug(f"Downloading {icon_url} illustration")
illustration_content = BytesIO()
stream_file(icon_url, byte_stream=illustration_content)
stream_file(

Check warning on line 563 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L563

Added line #L563 was not covered by tests
icon_url, byte_stream=illustration_content, session=web_session
)
illustration_format = format_for(
illustration_content, from_suffix=False
)
Expand Down
5 changes: 2 additions & 3 deletions scraper/src/mindtouch2zim/vimeo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import requests

from mindtouch2zim.constants import (
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand All @@ -14,7 +13,7 @@ class VimeoThumbnailError(Exception):

def get_vimeo_thumbnail_url(video_url: str) -> str:
"""From a vimeo URL - player or normal - retrieve corresponding thumbnail URL"""
resp = requests.get(
resp = web_session.get(

Check warning on line 16 in scraper/src/mindtouch2zim/vimeo.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/vimeo.py#L16

Added line #L16 was not covered by tests
f"https://vimeo.com/api/oembed.json?url={video_url}",
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
Expand Down

0 comments on commit 44483b1

Please sign in to comment.