Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Serp #98

Merged
merged 4 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/reference/components.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ tied to any specific item type.
.. autoclass:: zyte_common_items.Request(**kwargs)
:members:

.. autoclass:: zyte_common_items.SerpOrganicResult(**kwargs)
:members:

.. autoclass:: zyte_common_items.SocialMediaPostAuthor(**kwargs)
:members:

Expand Down
11 changes: 11 additions & 0 deletions docs/reference/items.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ Job posting
:members: dateDownloaded, probability, validationMessages


Search engine results
=====================

.. autoclass:: zyte_common_items.Serp(**kwargs)
:members:
:inherited-members:

.. autoclass:: zyte_common_items.SerpMetadata(**kwargs)
:members: dateDownloaded, displayedQuery, searchedQuery, totalOrganicResults, validationMessages


Social media post
=================

Expand Down
12 changes: 12 additions & 0 deletions docs/reference/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ Job posting
.. autoclass:: zyte_common_items.AutoJobPostingPage(**kwargs)
:show-inheritance:

Search engine results
=====================

.. autoclass:: zyte_common_items.BaseSerpPage(**kwargs)
:show-inheritance:

.. autoclass:: zyte_common_items.SerpPage(**kwargs)
:show-inheritance:

.. autoclass:: zyte_common_items.AutoSerpPage(**kwargs)
:show-inheritance:

Social media post
=================

Expand Down
10 changes: 10 additions & 0 deletions tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Reactions,
RealEstateArea,
Request,
SerpOrganicResult,
SocialMediaPostAuthor,
SocialMediaPostMetadata,
StarRating,
Expand Down Expand Up @@ -199,6 +200,15 @@ def test_reactions():
Reactions(reposts=1, likes=2, dislikes=3)


def test_serp_organic_result():
    # Smoke test: building the component with these keyword arguments must
    # not raise.
    kwargs = {
        "description": "used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.",
        "name": "Foobar",
        "url": "https://en.wikipedia.org/wiki/Foobar",
        "rank": 1,
    }
    SerpOrganicResult(**kwargs)


def test_social_media_post_author():
SocialMediaPostAuthor(
numberOfFollowers=5,
Expand Down
51 changes: 51 additions & 0 deletions tests/test_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
RealEstateArea,
RealEstateMetadata,
Request,
Serp,
SerpMetadata,
SerpOrganicResult,
SocialMediaPost,
SocialMediaPostAuthor,
SocialMediaPostMetadata,
Expand Down Expand Up @@ -498,6 +501,30 @@
),
}

_SERP_MIN_KWARGS: dict = {
"url": "https://example.com/search?q=foo+bar",
}

# Keyword arguments that extend the minimal set with the remaining Serp
# fields exercised by the tests below. Starting from _SERP_MIN_KWARGS keeps
# "url" as the first key, exactly as a ``**`` spread would.
_SERP_ALL_KWARGS: dict = dict(
    _SERP_MIN_KWARGS,
    organicResults=[
        SerpOrganicResult(
            description="used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.",
            name="Foobar",
            url="https://en.wikipedia.org/wiki/Foobar",
            rank=1,
        ),
    ],
    url="https://example.com/search?q=foo+bar",
    pageNumber=1,
    metadata=SerpMetadata(
        dateDownloaded="2022-12-31T13:01:54Z",
        displayedQuery="foo bar",
        searchedQuery="foo bar",
        totalOrganicResults=999_999_999_999,
    ),
)

_SOCIAL_MEDIA_POST_MIN_KWARGS: dict = {
"url": "https://example.com/viewjob/12345",
}
Expand Down Expand Up @@ -837,6 +864,28 @@ def test_job_posting_missing_fields():
JobPosting(**incomplete_kwargs)


def test_serp_all_fields():
    # Every value passed to the constructor must be readable back unchanged
    # from the attribute of the same name.
    item = Serp(**_SERP_ALL_KWARGS)
    for name, expected in _SERP_ALL_KWARGS.items():
        assert getattr(item, name) == expected


def test_serp_min_fields():
    # Fields left out of the minimal kwargs must come back as None.
    item = Serp(**_SERP_MIN_KWARGS)
    omitted = [name for name in _SERP_ALL_KWARGS if name not in _SERP_MIN_KWARGS]
    for name in omitted:
        assert getattr(item, name) is None


def test_serp_missing_fields():
    # Dropping any one of the minimal kwargs must make construction fail
    # with TypeError.
    for omitted in tuple(_SERP_MIN_KWARGS):
        remaining: dict = {
            key: value for key, value in _SERP_MIN_KWARGS.items() if key != omitted
        }
        with pytest.raises(TypeError):
            Serp(**remaining)


def test_social_media_post_all_fields():
social_media_post = SocialMediaPost(**_SOCIAL_MEDIA_POST_ALL_KWARGS)
for field in list(_SOCIAL_MEDIA_POST_ALL_KWARGS):
Expand Down Expand Up @@ -874,6 +923,8 @@ def test_social_media_post_missing_fields():
(ProductNavigation, False),
(ProductVariant, False),
(RealEstate, True),
(Serp, False),
(SocialMediaPost, True),
),
)
def test_get_probability_request(cls, has_proba):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ def test_matching_items():
"ProductNavigation": {"dateDownloaded", "validationMessages"},
"RealEstate": {"dateDownloaded", "probability", "validationMessages"},
"JobPosting": {"dateDownloaded", "probability", "searchText", "validationMessages"},
"Serp": {
"dateDownloaded",
"displayedQuery",
"searchedQuery",
"totalOrganicResults",
"validationMessages",
},
"SocialMediaPost": {
"dateDownloaded",
"probability",
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ commands = mypy zyte_common_items tests
basepython = python3
deps =
twine==5.1.1
build==0.10.0
build==1.2.1
commands =
python -m build --sdist
twine check dist/*
Expand Down
6 changes: 6 additions & 0 deletions zyte_common_items/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
RealEstateMetadata,
SearchRequestTemplate,
SearchRequestTemplateMetadata,
Serp,
SerpMetadata,
SerpOrganicResult,
SocialMediaPost,
SocialMediaPostMetadata,
)
Expand All @@ -77,6 +80,7 @@
AutoProductNavigationPage,
AutoProductPage,
AutoRealEstatePage,
AutoSerpPage,
AutoSocialMediaPostPage,
BaseArticleListPage,
BaseArticleNavigationPage,
Expand All @@ -88,6 +92,7 @@
BaseProductNavigationPage,
BaseProductPage,
BaseRealEstatePage,
BaseSerpPage,
BaseSocialMediaPostPage,
BusinessPlacePage,
HasMetadata,
Expand All @@ -99,5 +104,6 @@
ProductPage,
RealEstatePage,
SearchRequestTemplatePage,
SerpPage,
SocialMediaPostPage,
)
1 change: 1 addition & 0 deletions zyte_common_items/items/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
SearchRequestTemplate,
SearchRequestTemplateMetadata,
)
from .serp import Serp, SerpMetadata, SerpOrganicResult
from .social_media_post import SocialMediaPost, SocialMediaPostMetadata
69 changes: 69 additions & 0 deletions zyte_common_items/items/serp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import List, Optional

import attrs

from zyte_common_items.base import Item
from zyte_common_items.components import ListMetadata
from zyte_common_items.converters import (
to_metadata_optional,
url_to_str,
url_to_str_optional,
)


@attrs.define(kw_only=True)
class SerpOrganicResult:
    """Data from a non-paid result of a search engine results page.

    Every field is optional and defaults to ``None``.
    """

    #: Result excerpt.
    description: Optional[str] = None

    #: Result title.
    name: Optional[str] = None

    # The value is passed through the ``url_to_str_optional`` converter, which
    # is imported from ``zyte_common_items.converters`` (presumably it turns
    # URL-like objects into ``str`` and lets ``None`` through; confirm there).
    #: Result URL.
    url: Optional[str] = attrs.field(
        default=None, converter=url_to_str_optional, kw_only=True
    )

    #: Result position among other organic results from the same search engine
    #: results page.
    #:
    #: This is the rank within a specific page, not within an entire search.
    #: That is, the first result of any page, even if it is not the first page
    #: of a search, must be 1.
    rank: Optional[int] = None


# Extends ListMetadata (imported from ``zyte_common_items.components``, not
# shown here) with three optional, search-specific fields that default to
# ``None``.
@attrs.define(kw_only=True)
class SerpMetadata(ListMetadata):
    """Metadata class for :data:`zyte_common_items.Serp.metadata`."""

    #: Search query as seen in the webpage.
    displayedQuery: Optional[str] = None

    #: Search query as specified in the input URL.
    searchedQuery: Optional[str] = None

    #: Total number of organic results reported by the search engine.
    totalOrganicResults: Optional[int] = None


@attrs.define(kw_only=True)
class Serp(Item):
    """Data from a `search engine results page
    <https://en.wikipedia.org/wiki/Search_engine_results_page>`_."""

    #: List of search results excluding paid results.
    organicResults: Optional[List[SerpOrganicResult]] = None

    # ``url`` is the only field declared without a default, so it must always
    # be supplied. The value is passed through the ``url_to_str`` converter
    # from ``zyte_common_items.converters``.
    #: Search URL.
    url: str = attrs.field(converter=url_to_str)

    #: Page number.
    pageNumber: Optional[int] = None

    # The converter is built by ``to_metadata_optional(SerpMetadata)``
    # (presumably it coerces other metadata objects into ``SerpMetadata`` and
    # lets ``None`` through; confirm in ``zyte_common_items.converters``).
    #: Contains metadata about the data extraction process.
    metadata: Optional[SerpMetadata] = attrs.field(
        default=None, converter=to_metadata_optional(SerpMetadata), kw_only=True  # type: ignore[misc]
    )
1 change: 1 addition & 0 deletions zyte_common_items/pages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from .real_estate import AutoRealEstatePage, BaseRealEstatePage, RealEstatePage
from .search_request_template import SearchRequestTemplatePage
from .serp import AutoSerpPage, BaseSerpPage, SerpPage
from .social_media_post import (
AutoSocialMediaPostPage,
BaseSocialMediaPostPage,
Expand Down
39 changes: 39 additions & 0 deletions zyte_common_items/pages/serp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Optional

import attrs
from web_poet import Returns

from zyte_common_items.fields import auto_field
from zyte_common_items.items import Serp, SerpMetadata, SerpOrganicResult

from .base import BasePage, Page
from .mixins import HasMetadata


class BaseSerpPage(BasePage, Returns[Serp], HasMetadata[SerpMetadata]):
    """:class:`BasePage` subclass that returns :class:`Serp` items and uses
    :class:`SerpMetadata` as its metadata class."""


class SerpPage(Page, Returns[Serp], HasMetadata[SerpMetadata]):
    """:class:`Page` subclass that returns :class:`Serp` items and uses
    :class:`SerpMetadata` as its metadata class."""

    # The class adds no members of its own; all behaviour comes from its
    # bases. Stray code-review UI text that had been pasted into the class
    # body was removed so that the definition parses.
    pass


@attrs.define
class AutoSerpPage(BaseSerpPage):
    """:class:`BaseSerpPage` subclass whose fields are read from the
    :class:`Serp` item held in its ``serp`` attribute.

    Each ``auto_field`` below returns the attribute of the same name from
    ``self.serp``.
    """

    # Source item that every field below reads from.
    serp: Serp

    @auto_field
    def organicResults(self) -> Optional[List[SerpOrganicResult]]:
        # Delegates to ``self.serp.organicResults``.
        return self.serp.organicResults

    @auto_field
    def url(self) -> str:
        # Delegates to ``self.serp.url``.
        return self.serp.url

    @auto_field
    def pageNumber(self) -> Optional[int]:
        # Delegates to ``self.serp.pageNumber``.
        return self.serp.pageNumber

    @auto_field
    def metadata(self) -> Optional[SerpMetadata]:
        # Delegates to ``self.serp.metadata``.
        return self.serp.metadata