Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

w3lib-based url classes #46

Draft
wants to merge 1 commit into
base: url-page-inputs
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 0 additions & 27 deletions tests/test_page_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,30 +446,3 @@ def test_browser_html():
assert html.css("p::text").getall() == ["Hello, ", "world!"]
assert isinstance(html.selector, parsel.Selector)


def test_url_base_class():
    """Check str(), repr() and constructor type validation of ``_Url``."""
    source = "http://example.com"
    instance = _Url(source)

    # The string form is the input, and the repr is prefixed by the class name.
    assert str(instance) == source
    assert repr(instance) == "_Url('http://example.com')"

    # An int is expected to be rejected with TypeError.
    bad_value = 123
    with pytest.raises(TypeError):
        _Url(bad_value)


def test_url_subclass():
    """Check that ``_Url`` subclasses behave like the base class.

    Covers str(), the ``_url`` attribute, a repr that uses the subclass
    name, and building one subclass instance from an instance of another.
    """

    class MyUrl(_Url):
        pass

    class MyUrl2(_Url):
        pass

    source = "http://example.com"

    first = MyUrl(source)
    assert str(first) == source
    assert first._url == source
    assert repr(first) == "MyUrl('http://example.com')"

    # A sibling subclass accepts an existing URL object as its input.
    second = MyUrl2(first)
    assert str(second) == str(first)
45 changes: 45 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pytest

from web_poet._base import _Url


def test_url_base_class():
    """Check the str() and repr() output of a plain ``_Url``."""
    source = "http://example.com"
    instance = _Url(source)

    assert str(instance) == source
    assert repr(instance) == "_Url('http://example.com')"


def test_url_init_validation():
    """Constructing ``_Url`` from an int is expected to raise TypeError."""
    bad_value = 123
    with pytest.raises(TypeError):
        _Url(bad_value)


def test_url_subclasses():
    """Check that ``_Url`` subclasses behave like the base class.

    Covers str(), the ``_url`` attribute, a repr that uses the subclass
    name, and building one subclass instance from an instance of another.
    """

    class MyUrl(_Url):
        pass

    class MyUrl2(_Url):
        pass

    source = "http://example.com"

    first = MyUrl(source)
    assert str(first) == source
    assert first._url == source
    assert repr(first) == "MyUrl('http://example.com')"

    # A sibling subclass accepts an existing URL object as its input.
    second = MyUrl2(first)
    assert str(second) == str(first)


def test_urljoin():
    """Check that ``join()`` and the ``/`` operator give the same result.

    The base path does not end with "/", so its last segment ("bar") is
    replaced by "baz", and the query string and fragment are dropped.
    """
    base = _Url("http://example.com/foo/bar?x=y#fragment")
    expected = "http://example.com/foo/baz"

    via_method = base.join("baz")
    assert str(via_method) == expected

    via_operator = base / "baz"
    assert str(via_operator) == expected
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This differs from yarl when the path doesn't end with "/": in yarl you need to call url.parent / "baz" to get urljoin behavior. I'm not sure which is better.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to the proposed behavior: it is clean and fits web scraping scenarios.



def test_update_query():
    """Check that the ``%`` operator adds new query parameters and replaces existing ones."""
    base = _Url("http://example.com/foo/bar?x=y#fragment")

    # A key not yet in the query is appended after the existing one.
    appended = base % {"foo": "bar"}
    assert str(appended) == "http://example.com/foo/bar?x=y&foo=bar#fragment"

    # A key already in the query has its value replaced.
    replaced = base % {"x": "z"}
    assert str(replaced) == "http://example.com/foo/bar?x=z#fragment"
18 changes: 17 additions & 1 deletion web_poet/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
In general, users shouldn't import and use the contents of this module.
"""


from urllib.parse import urljoin
from typing import Type, TypeVar, List, Dict, Union

from multidict import CIMultiDict
from w3lib.url import add_or_replace_parameters

T_headers = TypeVar("T_headers", bound="_HttpHeaders")
T_url = TypeVar("T_url", bound="_Url")


class _HttpHeaders(CIMultiDict):
Expand Down Expand Up @@ -43,8 +45,22 @@ def __init__(self, url: Union[str, '_Url']):
f"got {url.__class__} instance instead")
self._url = str(url)

def join(self: T_url, other: Union[str, '_Url']) -> T_url:
    """Resolve ``other`` against this URL and return a new URL object.

    ``other`` is converted with ``str()`` and combined with this URL using
    ``urllib.parse.urljoin``. The result has the same class as ``self``.
    """
    reference = str(other)
    resolved = urljoin(self._url, reference)
    return self.__class__(resolved)

def update_query(self: T_url,
new_parameters: Dict[str, str]) -> T_url:
new_url = add_or_replace_parameters(self._url,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yarl's update_query looks way more powerful than w3lib's add_or_replace_parameters, see https://yarl.readthedocs.io/en/latest/api.html#yarl.URL.update_query for the supported arguments. I like that multiple values are handled in yarl, and duplicate query parameters in general.

new_parameters=new_parameters)
return self.__class__(new_url)

def __str__(self) -> str:
    """Return the URL as a plain string."""
    url_text = self._url
    return url_text

def __repr__(self) -> str:
    """Return ``ClassName('url')``, using the name of the runtime class."""
    class_name = self.__class__.__name__
    return f"{class_name}({self._url!r})"

def __mod__(self: T_url, other: Dict[str, str]) -> T_url:
    """Implement ``url % params`` by delegating to ``update_query()``."""
    updated = self.update_query(other)
    return updated

# Operator alias: ``url / other`` behaves exactly like ``url.join(other)``.
__truediv__ = join