diff --git a/docs/reference/components.rst b/docs/reference/components.rst index 5345a688..293288f1 100644 --- a/docs/reference/components.rst +++ b/docs/reference/components.rst @@ -76,6 +76,9 @@ tied to any specific item type. .. autoclass:: zyte_common_items.Request(**kwargs) :members: +.. autoclass:: zyte_common_items.SerpOrganicResult(**kwargs) + :members: + .. autoclass:: zyte_common_items.SocialMediaPostAuthor(**kwargs) :members: diff --git a/docs/reference/items.rst b/docs/reference/items.rst index 0c607a47..65e7d059 100644 --- a/docs/reference/items.rst +++ b/docs/reference/items.rst @@ -109,6 +109,17 @@ Job posting :members: dateDownloaded, probability, validationMessages +Search engine results +===================== + +.. autoclass:: zyte_common_items.Serp(**kwargs) + :members: + :inherited-members: + +.. autoclass:: zyte_common_items.SerpMetadata(**kwargs) + :members: dateDownloaded, displayedQuery, searchedQuery, totalOrganicResults, validationMessages + + Social media post ================= diff --git a/docs/reference/pages.rst b/docs/reference/pages.rst index 4553b0e1..30090f49 100644 --- a/docs/reference/pages.rst +++ b/docs/reference/pages.rst @@ -114,6 +114,18 @@ Job posting .. autoclass:: zyte_common_items.AutoJobPostingPage(**kwargs) :show-inheritance: +Search engine results +===================== + +.. autoclass:: zyte_common_items.BaseSerpPage(**kwargs) + :show-inheritance: + +.. autoclass:: zyte_common_items.SerpPage(**kwargs) + :show-inheritance: + +.. 
autoclass:: zyte_common_items.AutoSerpPage(**kwargs) + :show-inheritance: + Social media post ================= diff --git a/tests/test_components.py b/tests/test_components.py index c98588f1..fdf4516e 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -23,6 +23,7 @@ Reactions, RealEstateArea, Request, + SerpOrganicResult, SocialMediaPostAuthor, SocialMediaPostMetadata, StarRating, @@ -199,6 +200,15 @@ def test_reactions(): Reactions(reposts=1, likes=2, dislikes=3) +def test_serp_organic_result(): + SerpOrganicResult( + description="used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.", + name="Foobar", + url="https://en.wikipedia.org/wiki/Foobar", + rank=1, + ) + + def test_social_media_post_author(): SocialMediaPostAuthor( numberOfFollowers=5, diff --git a/tests/test_items.py b/tests/test_items.py index ee3298c3..9a3cadeb 100644 --- a/tests/test_items.py +++ b/tests/test_items.py @@ -49,6 +49,9 @@ RealEstateArea, RealEstateMetadata, Request, + Serp, + SerpMetadata, + SerpOrganicResult, SocialMediaPost, SocialMediaPostAuthor, SocialMediaPostMetadata, @@ -498,6 +501,30 @@ ), } +_SERP_MIN_KWARGS: dict = { + "url": "https://example.com/search?q=foo+bar", +} + +_SERP_ALL_KWARGS: dict = { + **_SERP_MIN_KWARGS, + "organicResults": [ + SerpOrganicResult( + description="used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.", + name="Foobar", + url="https://en.wikipedia.org/wiki/Foobar", + rank=1, + ), + ], + "url": "https://example.com/search?q=foo+bar", + "pageNumber": 1, + "metadata": SerpMetadata( + dateDownloaded="2022-12-31T13:01:54Z", + displayedQuery="foo bar", + searchedQuery="foo bar", + totalOrganicResults=999_999_999_999, + ), +} + _SOCIAL_MEDIA_POST_MIN_KWARGS: dict = { "url": "https://example.com/viewjob/12345", } @@ -837,6 +864,28 @@ def test_job_posting_missing_fields(): JobPosting(**incomplete_kwargs) +def 
test_serp_all_fields(): + serp = Serp(**_SERP_ALL_KWARGS) + for field in list(_SERP_ALL_KWARGS): + assert getattr(serp, field) == _SERP_ALL_KWARGS[field] + + +def test_serp_min_fields(): + serp = Serp(**_SERP_MIN_KWARGS) + for field in list(_SERP_ALL_KWARGS): + if field in _SERP_MIN_KWARGS: + continue + assert getattr(serp, field) is None + + +def test_serp_missing_fields(): + for required_field in list(_SERP_MIN_KWARGS): + incomplete_kwargs: dict = copy(_SERP_MIN_KWARGS) + del incomplete_kwargs[required_field] + with pytest.raises(TypeError): + Serp(**incomplete_kwargs) + + def test_social_media_post_all_fields(): social_media_post = SocialMediaPost(**_SOCIAL_MEDIA_POST_ALL_KWARGS) for field in list(_SOCIAL_MEDIA_POST_ALL_KWARGS): @@ -874,6 +923,8 @@ def test_social_media_post_missing_fields(): (ProductNavigation, False), (ProductVariant, False), (RealEstate, True), + (Serp, False), + (SocialMediaPost, True), ), ) def test_get_probability_request(cls, has_proba): diff --git a/tests/test_pages.py b/tests/test_pages.py index 574e73a0..d8416bbb 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -220,6 +220,13 @@ def test_matching_items(): "ProductNavigation": {"dateDownloaded", "validationMessages"}, "RealEstate": {"dateDownloaded", "probability", "validationMessages"}, "JobPosting": {"dateDownloaded", "probability", "searchText", "validationMessages"}, + "Serp": { + "dateDownloaded", + "displayedQuery", + "searchedQuery", + "totalOrganicResults", + "validationMessages", + }, "SocialMediaPost": { "dateDownloaded", "probability", diff --git a/tox.ini b/tox.ini index 50f412cc..d228c3ea 100644 --- a/tox.ini +++ b/tox.ini @@ -74,7 +74,7 @@ commands = mypy zyte_common_items tests basepython = python3 deps = twine==5.1.1 - build==0.10.0 + build==1.2.1 commands = python -m build --sdist twine check dist/* diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index 44e055f4..47f88861 100644 --- a/zyte_common_items/__init__.py +++ 
b/zyte_common_items/__init__.py @@ -61,6 +61,9 @@ RealEstateMetadata, SearchRequestTemplate, SearchRequestTemplateMetadata, + Serp, + SerpMetadata, + SerpOrganicResult, SocialMediaPost, SocialMediaPostMetadata, ) @@ -77,6 +80,7 @@ AutoProductNavigationPage, AutoProductPage, AutoRealEstatePage, + AutoSerpPage, AutoSocialMediaPostPage, BaseArticleListPage, BaseArticleNavigationPage, @@ -88,6 +92,7 @@ BaseProductNavigationPage, BaseProductPage, BaseRealEstatePage, + BaseSerpPage, BaseSocialMediaPostPage, BusinessPlacePage, HasMetadata, @@ -99,5 +104,6 @@ ProductPage, RealEstatePage, SearchRequestTemplatePage, + SerpPage, SocialMediaPostPage, ) diff --git a/zyte_common_items/items/__init__.py b/zyte_common_items/items/__init__.py index 0fe9e452..3abfb550 100644 --- a/zyte_common_items/items/__init__.py +++ b/zyte_common_items/items/__init__.py @@ -13,4 +13,5 @@ SearchRequestTemplate, SearchRequestTemplateMetadata, ) +from .serp import Serp, SerpMetadata, SerpOrganicResult from .social_media_post import SocialMediaPost, SocialMediaPostMetadata diff --git a/zyte_common_items/items/serp.py b/zyte_common_items/items/serp.py new file mode 100644 index 00000000..03361752 --- /dev/null +++ b/zyte_common_items/items/serp.py @@ -0,0 +1,69 @@ +from typing import List, Optional + +import attrs + +from zyte_common_items.base import Item +from zyte_common_items.components import ListMetadata +from zyte_common_items.converters import ( + to_metadata_optional, + url_to_str, + url_to_str_optional, +) + + +@attrs.define(kw_only=True) +class SerpOrganicResult: + """Data from a non-paid result of a search engine results page.""" + + #: Result excerpt. + description: Optional[str] = None + + #: Result title. + name: Optional[str] = None + + #: Result URL. + url: Optional[str] = attrs.field( + default=None, converter=url_to_str_optional, kw_only=True + ) + + #: Result position among other organic results from the same search engine + #: results page. 
+ #: + #: This is the rank within a specific page, not within an entire search. + #: That is, the first result of any page, even if it is not the first page of + #: a search, must be 1. + rank: Optional[int] = None + + +@attrs.define(kw_only=True) +class SerpMetadata(ListMetadata): + """Metadata class for :data:`zyte_common_items.Serp.metadata`.""" + + #: Search query as seen in the webpage. + displayedQuery: Optional[str] = None + + #: Search query as specified in the input URL. + searchedQuery: Optional[str] = None + + #: Total number of organic results reported by the search engine. + totalOrganicResults: Optional[int] = None + + +@attrs.define(kw_only=True) +class Serp(Item): + """Data from a `search engine results page + <https://en.wikipedia.org/wiki/Search_engine_results_page>`_.""" + + #: List of search results excluding paid results. + organicResults: Optional[List[SerpOrganicResult]] = None + + #: Search URL. + url: str = attrs.field(converter=url_to_str) + + #: Page number. + pageNumber: Optional[int] = None + + #: Contains metadata about the data extraction process. 
+ metadata: Optional[SerpMetadata] = attrs.field( + default=None, converter=to_metadata_optional(SerpMetadata), kw_only=True # type: ignore[misc] + ) diff --git a/zyte_common_items/pages/__init__.py b/zyte_common_items/pages/__init__.py index 269f7024..6eaeb63f 100644 --- a/zyte_common_items/pages/__init__.py +++ b/zyte_common_items/pages/__init__.py @@ -23,6 +23,7 @@ ) from .real_estate import AutoRealEstatePage, BaseRealEstatePage, RealEstatePage from .search_request_template import SearchRequestTemplatePage +from .serp import AutoSerpPage, BaseSerpPage, SerpPage from .social_media_post import ( AutoSocialMediaPostPage, BaseSocialMediaPostPage, diff --git a/zyte_common_items/pages/serp.py b/zyte_common_items/pages/serp.py new file mode 100644 index 00000000..de79dcc5 --- /dev/null +++ b/zyte_common_items/pages/serp.py @@ -0,0 +1,39 @@ +from typing import List, Optional + +import attrs +from web_poet import Returns + +from zyte_common_items.fields import auto_field +from zyte_common_items.items import Serp, SerpMetadata, SerpOrganicResult + +from .base import BasePage, Page +from .mixins import HasMetadata + + +class BaseSerpPage(BasePage, Returns[Serp], HasMetadata[SerpMetadata]): + pass + + +class SerpPage(Page, Returns[Serp], HasMetadata[SerpMetadata]): + pass + + +@attrs.define +class AutoSerpPage(BaseSerpPage): + serp: Serp + + @auto_field + def organicResults(self) -> Optional[List[SerpOrganicResult]]: + return self.serp.organicResults + + @auto_field + def url(self) -> str: + return self.serp.url + + @auto_field + def pageNumber(self) -> Optional[int]: + return self.serp.pageNumber + + @auto_field + def metadata(self) -> Optional[SerpMetadata]: + return self.serp.metadata