diff --git a/docs/usage/scrapy-poet.rst b/docs/usage/scrapy-poet.rst
index a043b7ea..e6eb58f3 100644
--- a/docs/usage/scrapy-poet.rst
+++ b/docs/usage/scrapy-poet.rst
@@ -65,3 +65,24 @@ always requested even when using a :class:`~scrapy_poet.DummyResponse`
 annotation, and in some dependency combinations two Zyte API requests will be
 made for the same page. We are planning to solve these problems in the future
 releases of :doc:`scrapy-poet ` and scrapy-zyte-api.
+
+
+Dependency annotations
+======================
+
+``ZyteApiProvider`` understands some dependency annotations. The only currently
+supported one is :class:`scrapy_zyte_api.ExtractFrom`:
+
+.. code-block:: python
+
+    from typing import Annotated
+
+    from scrapy_zyte_api import ExtractFrom
+
+    @attrs.define
+    class MyPageObject(BasePage):
+        product: Annotated[Product, ExtractFrom.httpResponseBody]
+
+The provider will set the extraction options based on the annotations, so for
+this code ``extractFrom`` will be set to ``httpResponseBody`` in
+``productOptions``.
diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py
index 894713e1..4fff205d 100644
--- a/scrapy_zyte_api/__init__.py
+++ b/scrapy_zyte_api/__init__.py
@@ -5,6 +5,7 @@
 
 install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
 
+from ._annotations import ExtractFrom
 from ._middlewares import (
     ScrapyZyteAPIDownloaderMiddleware,
     ScrapyZyteAPISpiderMiddleware,
diff --git a/scrapy_zyte_api/_annotations.py b/scrapy_zyte_api/_annotations.py
new file mode 100644
index 00000000..3caf3f0e
--- /dev/null
+++ b/scrapy_zyte_api/_annotations.py
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    browserHtml: str = "browserHtml"
diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index f1d12f17..4d4b5b1a 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -1,20 +1,23 @@
-from typing import Any, Callable, Dict, List, Sequence, Set, Type
+from typing import Any, Callable, Dict, List, Sequence, Set
 from weakref import WeakKeyDictionary
 
+from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.utils.defer import maybe_deferred_to_future
-from scrapy_poet import PageObjectInputProvider
+from scrapy_poet import AnnotatedResult, PageObjectInputProvider
 from web_poet import BrowserHtml, BrowserResponse
 from zyte_common_items import (
     Article,
     ArticleList,
     ArticleNavigation,
+    Item,
     Product,
     ProductList,
     ProductNavigation,
 )
 
+from scrapy_zyte_api._annotations import ExtractFrom
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 
 try:
@@ -42,12 +45,15 @@ def __init__(self, injector):
         super().__init__(injector)
         self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()
 
-    def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
+    def is_provided(self, type_: Callable) -> bool:
+        return super().is_provided(strip_annotated(type_))
+
+    def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
         if request not in self._cached_instances:
             self._cached_instances[request] = {}
         self._cached_instances[request].update(mapping)
 
-    async def __call__(
+    async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
         """Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
@@ -63,7 +69,7 @@ async def __call__(
             return results
 
         html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
-        item_keywords = {
+        item_keywords: Dict[type, str] = {
             Product: "product",
             ProductList: "productList",
             ProductNavigation: "productNavigation",
@@ -75,13 +81,38 @@ async def __call__(
         zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS")
         if html_requested:
             zyte_api_meta["browserHtml"] = True
+
+        to_provide_stripped: Set[type] = set()
+        extract_from_seen: Dict[str, str] = {}
+
+        for cls in to_provide:
+            cls_stripped = strip_annotated(cls)
+            assert isinstance(cls_stripped, type)
+            kw = item_keywords.get(cls_stripped)
+            if not kw:
+                continue
+            to_provide_stripped.add(cls_stripped)
+            zyte_api_meta[kw] = True
+            if not is_typing_annotated(cls):
+                continue
+            metadata = cls.__metadata__  # type: ignore[attr-defined]
+            for extract_from in ExtractFrom:
+                if extract_from in metadata:
+                    prev_extract_from = extract_from_seen.get(kw)
+                    if prev_extract_from and prev_extract_from != extract_from:
+                        raise ValueError(
+                            f"Multiple different extractFrom specified for {kw}"
+                        )
+                    extract_from_seen[kw] = extract_from
+                    options = zyte_api_meta.setdefault(f"{kw}Options", {})
+                    # TODO better logic for overwriting the value
+                    options["extractFrom"] = extract_from.value
+                    break
+
         for item_type, kw in item_keywords.items():
-            if item_type in to_provide:
-                zyte_api_meta[kw] = True
-            else:
-                options_name = f"{kw}Options"
-                if options_name in zyte_api_meta:
-                    del zyte_api_meta[options_name]
+            options_name = f"{kw}Options"
+            if item_type not in to_provide_stripped and options_name in zyte_api_meta:
+                del zyte_api_meta[options_name]
 
         api_request = Request(
             url=request.url,
@@ -111,9 +142,17 @@ async def __call__(
             )
             results.append(response)
             self.update_cache(request, {BrowserResponse: response})
-        for item_type, kw in item_keywords.items():
-            if item_type in to_provide:
-                item = item_type.from_dict(api_response.raw_api_response[kw])
-                results.append(item)
-                self.update_cache(request, {item_type: item})
+
+        for cls in to_provide:
+            cls_stripped = strip_annotated(cls)
+            assert isinstance(cls_stripped, type)
+            kw = item_keywords.get(cls_stripped)
+            if not kw:
+                continue
+            assert issubclass(cls_stripped, Item)
+            item = cls_stripped.from_dict(api_response.raw_api_response[kw])
+            if is_typing_annotated(cls):
+                item = AnnotatedResult(item, cls.__metadata__)  # type: ignore[attr-defined]
+            results.append(item)
+            self.update_cache(request, {cls: item})
         return results
diff --git a/setup.py b/setup.py
index 5b720739..6c315850 100644
--- a/setup.py
+++ b/setup.py
@@ -30,8 +30,9 @@ def get_version():
     extras_require={
         # Sync with [testenv:provider-pinned] @ tox.ini
         "provider": [
-            "scrapy-poet>=0.10.0",
-            "web-poet>=0.13.0",
+            "andi>=0.5.0",
+            "scrapy-poet>=0.18.0",
+            "web-poet>=0.15.1",
             "zyte-common-items>=0.7.0",
         ]
     },
diff --git a/tests/mockserver.py b/tests/mockserver.py
index 77a243e2..3348c756 100644
--- a/tests/mockserver.py
+++ b/tests/mockserver.py
@@ -149,6 +149,14 @@ def render_POST(self, request):
                 "price": "10",
                 "currency": "USD",
             }
+            extract_from = request_data.get("productOptions", {}).get("extractFrom")
+            if extract_from:
+                from scrapy_zyte_api.providers import ExtractFrom
+
+                if extract_from == ExtractFrom.httpResponseBody:
+                    assert isinstance(response_data["product"], dict)
+                    assert isinstance(response_data["product"]["name"], str)
+                    response_data["product"]["name"] += " (from httpResponseBody)"
 
         return json.dumps(response_data).encode()
 
diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py
index b8556c68..aef54a15 100644
--- a/tests/test_api_requests.py
+++ b/tests/test_api_requests.py
@@ -5,7 +5,7 @@
 from functools import partial
 from http.cookiejar import Cookie
 from inspect import isclass
-from typing import Any, Dict, cast
+from typing import Any, Dict, Type, cast
 from unittest import mock
 from unittest.mock import patch
 
@@ -198,7 +198,7 @@ async def test_coro_handling(zyte_api: bool, mockserver):
 async def test_exceptions(
     caplog: LogCaptureFixture,
     meta: Dict[str, Dict[str, Any]],
-    exception_type: Exception,
+    exception_type: Type[Exception],
     exception_text: str,
     mockserver,
 ):
diff --git a/tests/test_providers.py b/tests/test_providers.py
index be69b6c2..deff3f54 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 
 pytest.importorskip("scrapy_poet")
@@ -13,6 +15,7 @@
 from web_poet import BrowserHtml, BrowserResponse, ItemPage, field, handle_urls
 from zyte_common_items import BasePage, Product
 
+from scrapy_zyte_api._annotations import ExtractFrom
 from scrapy_zyte_api.providers import ZyteApiProvider
 
 from . import SETTINGS
@@ -199,3 +202,66 @@ async def test_provider_params_remove_unused_options(mockserver):
         crawler.stats.get_value("scrapy-zyte-api/request_args/productNavigationOptions")
         is None
     )
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9"
+)
+@ensureDeferred
+async def test_provider_extractfrom(mockserver):
+    from typing import Annotated
+
+    @attrs.define
+    class AnnotatedProductPage(BasePage):
+        product: Annotated[Product, ExtractFrom.httpResponseBody]
+        product2: Annotated[Product, ExtractFrom.httpResponseBody]
+
+    class AnnotatedZyteAPISpider(ZyteAPISpider):
+        def parse_(self, response: DummyResponse, page: AnnotatedProductPage):  # type: ignore[override]
+            yield {
+                "product": page.product,
+                "product2": page.product2,
+            }
+
+    settings = create_scrapy_settings()
+    settings["ZYTE_API_URL"] = mockserver.urljoin("/")
+    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
+
+    item, url, _ = await crawl_single_item(
+        AnnotatedZyteAPISpider, HtmlResource, settings
+    )
+    assert item["product"] == Product.from_dict(
+        dict(
+            url=url,
+            name="Product name (from httpResponseBody)",
+            price="10",
+            currency="USD",
+        )
+    )
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9"
+)
+@ensureDeferred
+async def test_provider_extractfrom_double(mockserver, caplog):
+    from typing import Annotated
+
+    @attrs.define
+    class AnnotatedProductPage(BasePage):
+        product: Annotated[Product, ExtractFrom.httpResponseBody]
+        product2: Annotated[Product, ExtractFrom.browserHtml]
+
+    class AnnotatedZyteAPISpider(ZyteAPISpider):
+        def parse_(self, response: DummyResponse, page: AnnotatedProductPage):  # type: ignore[override]
+            yield {
+                "product": page.product,
+            }
+
+    settings = create_scrapy_settings()
+    settings["ZYTE_API_URL"] = mockserver.urljoin("/")
+    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
+
+    item, _, _ = await crawl_single_item(AnnotatedZyteAPISpider, HtmlResource, settings)
+    assert item is None
+    assert "Multiple different extractFrom specified for product" in caplog.text
diff --git a/tox.ini b/tox.ini
index 1f6c28e7..0f0a9ef6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -86,10 +86,10 @@ extras = provider
 basepython=python3.8
 extras = provider
 deps =
-    # scrapy-poet 0.10.0 depends on scrapy>=2.6.0
+    # scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0
     {[testenv:pinned-scrapy-2x6]deps}
-    scrapy-poet==0.10.0
-    web-poet==0.13.0
+    scrapy-poet==0.18.0
+    web-poet==0.15.1
     zyte-common-items==0.7.0
 
 [testenv:pinned-extra]
@@ -107,6 +107,7 @@ deps =
     scrapy-zyte-smartproxy
 
 [testenv:mypy]
+extras = provider
 deps =
     mypy==1.4.1
     types-setuptools