Initial support for typing.Annotated, support for extractFrom #141

Merged (18 commits) on Dec 12, 2023
Changes from 1 commit
45 changes: 36 additions & 9 deletions scrapy_zyte_api/providers.py
@@ -1,6 +1,8 @@
+from enum import Enum
 from typing import Any, Callable, Dict, List, Sequence, Set, Type
 from weakref import WeakKeyDictionary

+from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.utils.defer import maybe_deferred_to_future
@@ -24,6 +26,11 @@
 NO_CALLBACK = None


+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    browserHtml: str = "browserHtml"
+
+
 class ZyteApiProvider(PageObjectInputProvider):
     name = "zyte_api"

@@ -63,7 +70,7 @@ async def __call__(
             return results

         html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
-        item_keywords = {
+        item_keywords: Dict[type, str] = {
             Product: "product",
             ProductList: "productList",
             ProductNavigation: "productNavigation",
@@ -75,9 +82,23 @@
         zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS")
         if html_requested:
             zyte_api_meta["browserHtml"] = True
-        for item_type, kw in item_keywords.items():
-            if item_type in to_provide:
-                zyte_api_meta[kw] = True
+
+        for cls in to_provide:
+            cls_stripped = strip_annotated(cls)
+            kw = item_keywords.get(cls_stripped)
+            if not kw:
+                continue
+            zyte_api_meta[kw] = True
+            if not is_typing_annotated(cls):
+                continue
+            metadata = cls.__metadata__
+            if cls_stripped == Product:
+                for option in ExtractFrom:
+                    if option in metadata:
+                        product_options = zyte_api_meta.setdefault("productOptions", {})
+                        product_options["extractFrom"] = option.value
+                        break
+
         api_request = Request(
             url=request.url,
             meta={
Expand Down Expand Up @@ -106,9 +127,15 @@ async def __call__(
)
results.append(response)
self.update_cache(request, {BrowserResponse: response})
for item_type, kw in item_keywords.items():
if item_type in to_provide:
item = item_type.from_dict(api_response.raw_api_response[kw])
results.append(item)
self.update_cache(request, {item_type: item})

for cls in to_provide:
cls_stripped = strip_annotated(cls)
kw = item_keywords.get(cls_stripped)
if not kw:
continue
item = cls_stripped.from_dict(api_response.raw_api_response[kw])
if is_typing_annotated(cls):
item.__metadata__ = cls.__metadata__
wRAR marked this conversation as resolved.
Show resolved Hide resolved
results.append(item)
self.update_cache(request, {cls_stripped: item})
wRAR marked this conversation as resolved.
Show resolved Hide resolved
return results
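
For context, a minimal standalone sketch (not part of this PR's diff) of what the new provider loop does with an annotated dependency: the Annotated wrapper is stripped to look up the item keyword, and any ExtractFrom member found in the annotation metadata is forwarded as productOptions["extractFrom"] in the Zyte API request parameters.

from typing import Annotated

from andi.typeutils import is_typing_annotated, strip_annotated
from zyte_common_items import Product

from scrapy_zyte_api.providers import ExtractFrom

# An annotated dependency, as a page object would declare it.
dep = Annotated[Product, ExtractFrom.browserHtml]

# The provider strips the annotation to find the matching Zyte API keyword...
assert strip_annotated(dep) == Product
assert is_typing_annotated(dep)

# ...and copies the ExtractFrom member from the annotation metadata into the
# extraction options of the request (the "product": True entry stands in for
# the item_keywords lookup in the diff above).
zyte_api_meta = {"product": True}
for option in ExtractFrom:
    if option in dep.__metadata__:
        zyte_api_meta.setdefault("productOptions", {})["extractFrom"] = option.value
        break

assert zyte_api_meta == {"product": True, "productOptions": {"extractFrom": "browserHtml"}}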
3 changes: 2 additions & 1 deletion setup.py
@@ -30,7 +30,8 @@ def get_version():
     extras_require={
         # Sync with [testenv:provider-pinned] @ tox.ini
         "provider": [
-            "scrapy-poet>=0.10.0",
+            "andi",  # fix min version
+            "scrapy-poet>=0.10.0",  # fix min version
             "web-poet>=0.13.0",
             "zyte-common-items>=0.7.0",
         ]
32 changes: 31 additions & 1 deletion tests/test_providers.py
@@ -16,7 +16,7 @@
 from web_poet import BrowserHtml, BrowserResponse, ItemPage, field, handle_urls
 from zyte_common_items import BasePage, Product

-from scrapy_zyte_api.providers import ZyteApiProvider
+from scrapy_zyte_api.providers import ExtractFrom, ZyteApiProvider

 from . import SETTINGS
 from .mockserver import get_ephemeral_port
@@ -179,3 +179,33 @@ async def test_provider_params(mockserver):
     _, _, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings)
     assert crawler.stats.get_value("scrapy-zyte-api/request_args/browserHtml") == 1
     assert crawler.stats.get_value("scrapy-zyte-api/request_args/geolocation") == 1
+
+
+@ensureDeferred
+async def test_provider_extractfrom(mockserver):
+    from typing import Annotated
+
+    @attrs.define
+    class AnnotatedProductPage(BasePage):
+        product: Annotated[Product, ExtractFrom.httpResponseBody]
+
+    class AnnotatedZyteAPISpider(ZyteAPISpider):
+        def parse_(self, response: DummyResponse, page: AnnotatedProductPage):
+            return super().parse_(response, page)
+
+    settings = create_scrapy_settings(None)
+    settings.update(SETTINGS)
+    settings["ZYTE_API_URL"] = mockserver.urljoin("/")
+    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
+
+    item, url, _ = await crawl_single_item(
+        AnnotatedZyteAPISpider, HtmlResource, settings
+    )
+    assert item["product"] == Product.from_dict(
+        dict(
+            url=url,
+            name="Product name",
+            price="10",
+            currency="USD",
+        )
+    )
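
As a usage sketch only (hypothetical class names, not part of this PR), the same annotation can request the other extraction source, ExtractFrom.browserHtml, from a regular spider callback, assuming scrapy-poet injection and this provider are configured:

from typing import Annotated

import attrs
import scrapy
from zyte_common_items import BasePage, Product

from scrapy_zyte_api.providers import ExtractFrom


@attrs.define
class BrowserProductPage(BasePage):
    # Hypothetical page object: the annotation asks the provider to send
    # productOptions={"extractFrom": "browserHtml"} instead of httpResponseBody.
    product: Annotated[Product, ExtractFrom.browserHtml]


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, assuming scrapy-poet injection is enabled in the settings.
    name = "example"
    start_urls = ["https://example.com/product"]

    def parse(self, response, page: BrowserProductPage):
        yield page.product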