Skip to content

Commit

Permalink
Switch auto field stats to an item pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Aug 26, 2024
1 parent 106b099 commit 71567e6
Show file tree
Hide file tree
Showing 13 changed files with 943 additions and 634 deletions.
41 changes: 33 additions & 8 deletions docs/reference/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,11 @@ ZYTE_API_AUTO_FIELD_STATS

Default: ``False``

Enables stats that indicate which requested fields :ref:`obtained through
scrapy-poet integration <scrapy-poet>` come directly from
:ref:`zapi-extract`.
Enables stats that indicate which fields from yielded items come from
:ref:`zapi-extract` when using :ref:`scrapy-poet integration <scrapy-poet>`.

If for any request no page object class is used to override
:ref:`zapi-extract` fields for a given item type, the following stat is
set:
If for any combination of item type and URL there is no registered page object
class, the following stat is set:

.. code-block:: python
Expand All @@ -28,8 +26,9 @@ set:
.. note:: A literal ``(all fields)`` string is used as value, not a list with
all fields.

If for any request a custom page object class is used to override some
:ref:`zapi-extract` fields, the following stat is set:
When a page object class is registered for a given combination of item type and
URL, and that page object class overrides some fields, the following stat is
set:

.. code-block:: python
Expand All @@ -40,6 +39,32 @@ If for any request a custom page object class is used to override some
.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
whether a field has been overridden or not.

Item URLs are read from the ``url`` field by default. Use
:setting:`ZYTE_API_AUTO_FIELD_URL_FIELDS` to configure a different field for
any given item type.

.. setting:: ZYTE_API_AUTO_FIELD_URL_FIELDS

ZYTE_API_AUTO_FIELD_URL_FIELDS
==============================

Default: ``{}``

Dictionary where keys are item types or their import paths, and values are
strings with the names of the fields in those item types that indicate the
source URL of the item.

For example:

.. code-block:: python
:caption: settings.py
ZYTE_API_AUTO_FIELD_URL_FIELDS = {
"my_project.items.CustomItem": "custom_url_field",
}
If a URL field is not specified for an item, ``url`` is used by default.

.. setting:: ZYTE_API_AUTOMAP_PARAMS

ZYTE_API_AUTOMAP_PARAMS
Expand Down
13 changes: 13 additions & 0 deletions docs/setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ spider to use Zyte API for all requests, set the following setting as well:
ZYTE_API_TRANSPARENT_MODE = True
.. _scrapy-poet-manual-setup:

For :ref:`scrapy-poet integration <scrapy-poet>`, add the following provider to
the ``SCRAPY_POET_PROVIDERS`` setting:

Expand Down Expand Up @@ -150,6 +152,17 @@ middleware to the :setting:`DOWNLOADER_MIDDLEWARES
"scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667,
}
For :setting:`ZYTE_API_AUTO_FIELD_STATS` support, first :ref:`enable
scrapy-poet integration <scrapy-poet-manual-setup>`, and then add the following
item pipeline to the :setting:`ITEM_PIPELINES <scrapy:ITEM_PIPELINES>` setting:

.. code-block:: python
:caption: settings.py
ITEM_PIPELINES = {
"scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0,
}
.. _reactor-change:

Expand Down
90 changes: 90 additions & 0 deletions scrapy_zyte_api/_poet_item_pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from logging import getLogger
from typing import Any

from itemadapter import ItemAdapter
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy_poet.downloadermiddlewares import InjectionMiddleware
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items.fields import is_auto_field

logger = getLogger(__name__)


class ScrapyZyteAPIPoetItemPipeline:
    """Item pipeline that sets per-item-type auto-field stats.

    For every item type yielded by the spider, sets a
    ``scrapy-zyte-api/auto_fields/<class FQN>`` stat whose value lists the
    fields that come straight from Zyte API automatic extraction, or the
    literal string ``(all fields)`` when no page object class overrides any
    field for that item type and URL.

    Enabled only when the ``ZYTE_API_AUTO_FIELD_STATS`` setting is ``True``;
    requires scrapy-poet's ``InjectionMiddleware`` to be installed so the
    page-object registry can be reached.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook; all configuration happens in __init__.
        return cls(crawler)

    def __init__(self, crawler: Crawler):
        # Disable the whole pipeline unless stats were explicitly requested.
        if not crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False):
            raise NotConfigured

        raw_url_fields = crawler.settings.getdict("ZYTE_API_AUTO_FIELD_URL_FIELDS", {})
        # Setting keys may be classes or their import paths; normalize to
        # classes so lookups by item.__class__ work either way.
        self._url_fields = {load_object(k): v for k, v in raw_url_fields.items()}
        # Classes (item or page object) whose stat has already been set.
        self._seen = set()
        self._crawler = crawler
        self._stats = crawler.stats
        # Item classes already warned about for missing/empty URL field, so
        # the warning is logged at most once per class.
        self._cls_without_url = set()

    def open_spider(self, spider):
        # The registry lives on scrapy-poet's injector; locate it through the
        # installed InjectionMiddleware. Done here (not __init__) because the
        # downloader middlewares are not built yet at pipeline-creation time.
        for component in self._crawler.engine.downloader.middleware.middlewares:
            if isinstance(component, InjectionMiddleware):
                self._registry = component.injector.registry
                return
        raise RuntimeError(
            "Could not find "
            "scrapy_poet.downloadermiddlewares.InjectionMiddleware among "
            "downloader middlewares. scrapy-poet may be misconfigured."
        )

    def process_item(self, item: Any, spider: Spider):
        cls = item.__class__

        url_field = self._url_fields.get(cls, "url")
        adapter = ItemAdapter(item)
        url = adapter.get(url_field, None)
        if not url:
            if cls not in self._cls_without_url:
                self._cls_without_url.add(cls)
                logger.warning(
                    f"An item of type {cls} was missing a non-empty URL in "
                    f"its {url_field!r} field. An item URL is necessary to "
                    f"determine the page object that was used to generate "
                    f"that item, and hence print the auto field stats that "
                    f"you requested by enabling the ZYTE_API_AUTO_FIELD_STATS "
                    f"setting. If {url_field!r} is the wrong URL field for "
                    f"that item type, use the ZYTE_API_AUTO_FIELD_URL_FIELDS "
                    f"setting to set a different field."
                )
            # Bug fix: Scrapy item pipelines must return the item (or raise
            # DropItem); returning None would break downstream pipelines and
            # feed exports.
            return item

        page_cls = self._registry.page_cls_for_item(url, cls)
        # Stats are keyed on the page object class when one is registered,
        # otherwise on the item class itself.
        cls = page_cls or cls

        # Set each class's stat only once.
        if cls in self._seen:
            return item
        self._seen.add(cls)

        if not page_cls:
            # No page object class registered: every field comes from the
            # Zyte API automatic extraction.
            field_list = "(all fields)"
        else:
            auto_fields = set()
            missing_fields = False
            for field_name in get_fields_dict(cls):
                if is_auto_field(cls, field_name):  # type: ignore[arg-type]
                    auto_fields.add(field_name)
                else:
                    missing_fields = True
            if missing_fields:
                field_list = " ".join(sorted(auto_fields))
            else:
                field_list = "(all fields)"

        cls_fqn = get_fq_class_name(cls)
        self._stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)
        return item
2 changes: 2 additions & 0 deletions scrapy_zyte_api/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,11 @@ def update_settings(self, settings: BaseSettings) -> None:
except ImportError:
pass
else:
from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline
from scrapy_zyte_api.providers import ZyteApiProvider

_setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543)
_setdefault(settings, "ITEM_PIPELINES", ScrapyZyteAPIPoetItemPipeline, 0)
_setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100)

if settings.getbool("ZYTE_API_SESSION_ENABLED", False):
Expand Down
1 change: 1 addition & 0 deletions scrapy_zyte_api/poet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._poet_item_pipelines import ScrapyZyteAPIPoetItemPipeline
35 changes: 1 addition & 34 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast
from typing import Any, Callable, Dict, List, Optional, Sequence, Set

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
Expand All @@ -13,8 +13,6 @@
HttpResponseHeaders,
)
from web_poet.annotated import AnnotatedInstance
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items import (
Article,
ArticleList,
Expand All @@ -32,7 +30,6 @@
ProductList,
ProductNavigation,
)
from zyte_common_items.fields import is_auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult
Expand Down Expand Up @@ -84,38 +81,9 @@ class ZyteApiProvider(PageObjectInputProvider):
Screenshot,
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._should_track_auto_fields = None
self._tracked_auto_fields = set()

def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_))

def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
if cls not in _ITEM_KEYWORDS:
return
if self._should_track_auto_fields is None:
self._should_track_auto_fields = crawler.settings.getbool(
"ZYTE_API_AUTO_FIELD_STATS", False
)
if self._should_track_auto_fields is False:
return
cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls
if cls in self._tracked_auto_fields:
return
self._tracked_auto_fields.add(cls)
if cls in _ITEM_KEYWORDS:
field_list = "(all fields)"
else:
auto_fields = set()
for field_name in get_fields_dict(cls):
if is_auto_field(cls, field_name): # type: ignore[arg-type]
auto_fields.add(field_name)
field_list = " ".join(sorted(auto_fields))
cls_fqn = get_fq_class_name(cls)
crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
Expand All @@ -125,7 +93,6 @@ async def __call__( # noqa: C901
http_response = None
screenshot_requested = Screenshot in to_provide
for cls in list(to_provide):
self._track_auto_fields(crawler, request, cast(type, cls))
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4
per-file-ignores =
tests/test_auto_field_stats.py: E402
tests/test_providers.py: E402
scrapy_zyte_api/__init__.py: F401
scrapy_zyte_api/poet.py: F401
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.20.0",
# "zyte-common-items>=0.20.0",
"zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields",
]
},
classifiers=[
Expand Down
4 changes: 4 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
SETTINGS["SCRAPY_POET_PROVIDERS"] = {
"scrapy_zyte_api.providers.ZyteApiProvider": 1100
}
SETTINGS["ITEM_PIPELINES"] = {
"scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0
}
SETTINGS_ADDON: SETTINGS_T = {
"ADDONS": {
Addon: 500,
Expand Down Expand Up @@ -108,6 +111,7 @@ def serialize_settings(settings):
del result[setting]
for setting in (
"DOWNLOADER_MIDDLEWARES",
"ITEM_PIPELINES",
"SCRAPY_POET_PROVIDERS",
"SPIDER_MIDDLEWARES",
):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
POET = False
InjectionMiddleware = None
ZyteApiProvider: Optional[Type] = None
ScrapyZyteAPIPoetItemPipeline: Optional[Type] = None
else:
POET = True
from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline
from scrapy_zyte_api.providers import ZyteApiProvider

_crawler = get_crawler()
Expand Down Expand Up @@ -120,6 +122,7 @@ def _test_setting_changes(initial_settings, expected_settings):
# Test separately settings that copy_to_dict messes up.
for setting in (
"DOWNLOADER_MIDDLEWARES",
"ITEM_PIPELINES",
"SCRAPY_POET_PROVIDERS",
"SPIDER_MIDDLEWARES",
):
Expand All @@ -145,6 +148,7 @@ def _test_setting_changes(initial_settings, expected_settings):
"http": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPDownloadHandler",
"https": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPSDownloadHandler",
},
"ITEM_PIPELINES": {},
"REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter",
"SPIDER_MIDDLEWARES": {
ScrapyZyteAPISpiderMiddleware: 100,
Expand Down Expand Up @@ -230,6 +234,9 @@ def test_no_poet_setting_changes(initial_settings, expected_settings):
ScrapyZyteAPISessionDownloaderMiddleware: 667,
InjectionMiddleware: 543,
},
"ITEM_PIPELINES": {
ScrapyZyteAPIPoetItemPipeline: 0,
},
"SCRAPY_POET_PROVIDERS": {
ZyteApiProvider: 1100,
},
Expand Down
3 changes: 2 additions & 1 deletion tests/test_api_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,7 @@ async def test_default_params_immutability(setting_key, meta_key, setting, meta)
async def _test_automap(
settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None
):
caplog.clear()
request = Request(url="https://example.com", **request_kwargs)
request.meta["zyte_api_automap"] = meta
settings = {**settings, "ZYTE_API_TRANSPARENT_MODE": True}
Expand Down Expand Up @@ -694,7 +695,7 @@ async def _test_automap(
for warning in warnings:
assert warning in caplog.text
else:
assert not caplog.records
assert not caplog.records, caplog.records[0].args


@pytest.mark.parametrize(
Expand Down
Loading

0 comments on commit 71567e6

Please sign in to comment.