diff --git a/poetry.lock b/poetry.lock
index eacb424..d68535f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1241,6 +1241,31 @@ files = [
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
+[[package]]
+name = "types-requests"
+version = "2.32.0.20240712"
+description = "Typing stubs for requests"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "types-requests-2.32.0.20240712.tar.gz", hash = "sha256:90c079ff05e549f6bf50e02e910210b98b8ff1ebdd18e19c873cd237737c1358"},
+ {file = "types_requests-2.32.0.20240712-py3-none-any.whl", hash = "sha256:f754283e152c752e46e70942fa2a146b5bc70393522257bb85bd1ef7e019dcc3"},
+]
+
+[package.dependencies]
+urllib3 = ">=2"
+
+[[package]]
+name = "types-setuptools"
+version = "71.1.0.20240726"
+description = "Typing stubs for setuptools"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "types-setuptools-71.1.0.20240726.tar.gz", hash = "sha256:85ba28e9461bb1be86ebba4db0f1c2408f2b11115b1966334ea9dc464e29303e"},
+ {file = "types_setuptools-71.1.0.20240726-py3-none-any.whl", hash = "sha256:a7775376f36e0ff09bcad236bf265777590a66b11623e48c20bfc30f1444ea36"},
+]
+
[[package]]
name = "typing-extensions"
version = "4.12.2"
@@ -1406,4 +1431,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.8"
-content-hash = "4bb71f99de019c462a80715c25fe5009d6a79d8375c56893cc931e46dd199f9a"
+content-hash = "adbac27b89b1b9e13fb8dd281bfe69d0da0abd0c19fad7ad9c7738fe783fe5e1"
diff --git a/pyproject.toml b/pyproject.toml
index 2dbcdde..df4a7f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,8 @@ lxml = "^5.2.2"
colorama = "^0.4.6"
requests = "^2.32.3"
aiohttp = "^3.10.0"
+types-requests = "^2.32.0.20240712"
+types-setuptools = "^71.1.0.20240726"
[tool.poetry.group.dev.dependencies]
responses = "^0.13.4"
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..85e2741
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,34 @@
+# conftest.py
+from typing import Callable, Generator
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from tests.utils import setup_mock_response
+
+root_url = "http://example.com"
+
+
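+# Patch urllib.request.urlopen for the duration of a test so no real network calls are made.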
+@pytest.fixture
+def mock_urlopen() -> Generator[MagicMock, None, None]:
+ with patch("urllib.request.urlopen") as mock:
+ yield mock
+
+
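+# Fake HTTP response object: empty body and status 200 by default; tests override the status as needed.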
+@pytest.fixture
+def mock_response() -> MagicMock:
+ response = MagicMock()
+ response.read.return_value = b""
+ response.status = 200
+ return response
+
+
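+# Register a mocked HTTP response (url, body, status) via tests.utils.setup_mock_response.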
+@pytest.fixture
+def setup_responses() -> Callable[[str, str, int], None]:
+ def _setup_responses(url: str, body: str, status: int = 200) -> None:
+ setup_mock_response(url=url, body=body, status=status)
+
+ return _setup_responses
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 6f12e48..8fb7741 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -1,50 +1,46 @@
+import urllib
import urllib.error
from io import BytesIO
from logging import DEBUG, ERROR, WARNING
+from typing import Callable
from unittest.mock import MagicMock, mock_open, patch
import pytest
import responses
from datacrawl import Spider, SpiderSettings
+from pytest import LogCaptureFixture
-from tests.utils import setup_mock_response
+from tests.conftest import root_url
@responses.activate
-def test_crawl() -> None:
- setup_mock_response(
- url="http://example.com",
- body="
link",
- status=200,
- )
- setup_mock_response(
- url="http://example.com/test",
- body="link",
- status=200,
+def test_crawl(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(root_url, f"link", 200)
+ setup_responses(
+ f"{root_url}/test", f"link", 200
)
- # Mock urllib.request.urlopen to avoid real network calls
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
+ mock_urlopen.side_effect = lambda url: mock_response
- spider = Spider(SpiderSettings(root_url="http://example.com", max_links=10))
- spider.crawl("http://example.com")
+ spider = Spider(SpiderSettings(root_url=root_url, max_links=10))
+ spider.crawl(root_url)
- assert "http://example.com" in spider.crawl_result
- assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"]
+ assert root_url in spider.crawl_result
+ assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"]
- spider.crawl("http://example.com/test")
+ spider.crawl(f"{root_url}/test")
- assert "http://example.com/test" in spider.crawl_result
- assert spider.crawl_result["http://example.com/test"]["urls"] == ["http://example.com"]
+ assert f"{root_url}/test" in spider.crawl_result
+ assert spider.crawl_result[f"{root_url}/test"]["urls"] == [root_url]
@responses.activate
-def test_crawl_invalid_url(caplog) -> None: # type: ignore
- spider = Spider(SpiderSettings(root_url="http://example.com"))
+def test_crawl_invalid_url(mock_urlopen: MagicMock, caplog: LogCaptureFixture) -> None:
+ spider = Spider(SpiderSettings(root_url=root_url))
with caplog.at_level(DEBUG):
spider.crawl("invalid_url")
@@ -54,118 +50,99 @@ def test_crawl_invalid_url(caplog) -> None: # type: ignore
@responses.activate
-def test_crawl_already_crawled_url(caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://example.com",
- body="link",
- status=200,
- )
+def test_crawl_already_crawled_url(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(root_url, f"link", 200)
- spider = Spider(SpiderSettings(root_url="http://example.com"))
+ spider = Spider(SpiderSettings(root_url=root_url))
+ mock_urlopen.side_effect = lambda url: mock_response
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
-
- with caplog.at_level(DEBUG):
- spider.crawl("http://example.com")
- spider.crawl("http://example.com")
+ with caplog.at_level(DEBUG):
+ spider.crawl(root_url)
+ spider.crawl(root_url)
assert "URL already crawled:" in caplog.text
- assert spider.crawl_result == {"http://example.com": {"urls": ["http://example.com"]}}
+ assert spider.crawl_result == {root_url: {"urls": [root_url]}}
@responses.activate
-def test_crawl_unfetchable_url() -> None:
- setup_mock_response(
- url="http://example.com",
- body="link",
- status=404,
- )
-
- spider = Spider(SpiderSettings(root_url="http://example.com"))
-
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 404
- mock_urlopen.return_value = mock_response
-
- spider.crawl("http://example.com")
-
+def test_crawl_unfetchable_url(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(root_url, f"link", 404)
+
+ spider = Spider(SpiderSettings(root_url=root_url))
+ mock_response.status = 404
+ mock_urlopen.side_effect = lambda url: mock_response
+
+ spider.crawl(root_url)
assert spider.crawl_result == {}
@responses.activate
-def test_crawl_found_invalid_url(caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://example.com",
- body="link",
- status=200,
- )
+def test_crawl_found_invalid_url(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(root_url, "link", 200)
- spider = Spider(SpiderSettings(root_url="http://example.com"))
+ spider = Spider(SpiderSettings(root_url=root_url))
+ mock_urlopen.side_effect = lambda url: mock_response
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
-
- with caplog.at_level(DEBUG):
- spider.crawl("http://example.com")
+ with caplog.at_level(DEBUG):
+ spider.crawl(root_url)
assert "Invalid url:" in caplog.text
- assert spider.crawl_result == {"http://example.com": {"urls": []}}
+ assert spider.crawl_result == {root_url: {"urls": []}}
@responses.activate
-def test_crawl_found_duplicate_url() -> None:
- setup_mock_response(
- url="http://example.com",
- body="link1"
- + "link2",
- status=200,
+def test_crawl_found_duplicate_url(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(
+ root_url,
+ f"link1"
+ f"link2",
+ 200,
)
- spider = Spider(SpiderSettings(root_url="http://example.com"))
-
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
+ spider = Spider(SpiderSettings(root_url=root_url))
+ mock_urlopen.side_effect = lambda url: mock_response
- spider.crawl("http://example.com")
-
- assert spider.crawl_result == {"http://example.com": {"urls": ["http://duplicate.com"]}}
+ spider.crawl(root_url)
+ assert spider.crawl_result == {root_url: {"urls": [root_url]}}
@responses.activate
-def test_crawl_no_urls_in_page() -> None:
- setup_mock_response(url="http://example.com", body="", status=200)
-
- spider = Spider(SpiderSettings(root_url="http://example.com"))
+def test_crawl_no_urls_in_page(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(root_url, "", 200)
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
+ spider = Spider(SpiderSettings(root_url=root_url))
+ mock_urlopen.side_effect = lambda url: mock_response
- spider.crawl("http://example.com")
-
- assert spider.crawl_result == {"http://example.com": {"urls": []}}
+ spider.crawl(root_url)
+ assert spider.crawl_result == {root_url: {"urls": []}}
@responses.activate
def test_save_results() -> None:
- spider = Spider(
- SpiderSettings(root_url="http://example.com", max_links=10, save_to_file="out.json")
- )
- spider.crawl_result = {"http://example.com": {"urls": ["http://example.com/test"]}}
+ spider = Spider(SpiderSettings(root_url=root_url, max_links=10, save_to_file="out.json"))
+ spider.crawl_result = {root_url: {"urls": [f"{root_url}/test"]}}
with patch("builtins.open", mock_open()) as mocked_file:
spider.save_results()
@@ -173,177 +150,160 @@ def test_save_results() -> None:
@responses.activate
-def test_url_regex() -> None:
- setup_mock_response(
- url="http://example.com",
- body="link"
- + "link",
- status=200,
+def test_url_regex(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(
+ root_url,
+ f"link"
+ f"link",
+ 200,
)
- # This regex matches strings starting with "http://example.com/"
- # And only have numeric characters after it
- regex = r"http://example\.com/[0-9]+"
-
- spider = Spider(SpiderSettings(root_url="http://example.com", url_regex=regex))
+ regex = rf"{root_url}/[0-9]+"
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
+ spider = Spider(SpiderSettings(root_url=root_url, url_regex=regex))
+ mock_urlopen.side_effect = lambda url: mock_response
- spider.start()
-
- assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/123"]
- assert "http://example.com/test" not in spider.crawl_result["http://example.com"]["urls"]
+ spider.start()
+ assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/123"]
+ assert f"{root_url}/test" not in spider.crawl_result[root_url]["urls"]
@responses.activate
-def test_include_body() -> None:
- setup_mock_response(
- url="http://example.com",
- body="link",
- status=200,
- )
- setup_mock_response(
- url="http://example.com/test",
- body="This is a header",
- status=200,
- )
-
- spider = Spider(SpiderSettings(root_url="http://example.com", include_body=True))
+def test_include_body(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+) -> None:
+ setup_responses(root_url, f"link", 200)
+ setup_responses(f"{root_url}/test", "This is a header", 200)
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
-
- spider.start()
+ spider = Spider(SpiderSettings(root_url=root_url, include_body=True))
+ mock_urlopen.side_effect = lambda url: mock_response
+ spider.start()
assert (
- spider.crawl_result["http://example.com"]["body"]
-        == '<a href="http://example.com/test">link</a>'
+ spider.crawl_result[root_url]["body"]
+        == f'<a href="{root_url}/test">link</a>'
)
assert (
- spider.crawl_result["http://example.com/test"]["body"]
+ spider.crawl_result[f"{root_url}/test"]["body"]
== "This is a header"
)
@responses.activate
-def test_internal_links_only(caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://internal.com",
- body="link"
- + "link",
- status=200,
- )
-
- spider = Spider(SpiderSettings(root_url="http://internal.com", internal_links_only=True))
+def test_internal_links_only(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ f"link"
+ "link",
+ 200,
+ )
+
+ spider = Spider(SpiderSettings(root_url=root_url, internal_links_only=True))
+ mock_urlopen.side_effect = lambda url: mock_response
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
-
- with caplog.at_level(DEBUG):
- spider.crawl("http://internal.com")
+ with caplog.at_level(DEBUG):
+ spider.crawl(root_url)
assert "Skipping: External link:" in caplog.text
- assert spider.crawl_result == {"http://internal.com": {"urls": ["http://internal.com/test"]}}
+ assert spider.crawl_result == {root_url: {"urls": [f"{root_url}/test"]}}
@responses.activate
-def test_external_links_only(caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://internal.com",
- body="link"
- + "link",
- status=200,
- )
+def test_external_links_only(
+ mock_urlopen: MagicMock,
+ mock_response: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ f"link"
+ "link",
+ 200,
+ )
+
+ spider = Spider(SpiderSettings(root_url=root_url, external_links_only=True))
+ mock_urlopen.side_effect = lambda url: mock_response
- spider = Spider(SpiderSettings(root_url="http://internal.com", external_links_only=True))
-
- with patch("urllib.request.urlopen") as mock_urlopen:
- mock_response = MagicMock()
- mock_response.read.return_value = b""
- mock_response.status = 200
- mock_urlopen.return_value = mock_response
-
- with caplog.at_level(DEBUG):
- spider.crawl("http://internal.com")
+ with caplog.at_level(DEBUG):
+ spider.crawl(root_url)
assert "Skipping: Internal link:" in caplog.text
- assert spider.crawl_result == {"http://internal.com": {"urls": ["http://external.com/test"]}}
+ assert spider.crawl_result == {root_url: {"urls": ["http://external.com/test"]}}
@responses.activate
def test_external_and_internal_links_only() -> None:
with pytest.raises(ValueError):
Spider(
- SpiderSettings(
- root_url="http://example.com",
- internal_links_only=True,
- external_links_only=True,
- )
+ SpiderSettings(root_url=root_url, internal_links_only=True, external_links_only=True)
)
@patch.object(Spider, "crawl")
@patch.object(Spider, "save_results")
def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None:
- spider = Spider(SpiderSettings(root_url="http://example.com", max_links=10))
+ spider = Spider(SpiderSettings(root_url=root_url, max_links=10))
mock_crawl.side_effect = lambda url: spider.crawl_result.update(
- {url: {"urls": ["http://example.com/test"]}}
+ {url: {"urls": [f"{root_url}/test"]}}
)
- print(mock_save_results)
spider.start()
assert mock_crawl.call_count == 1
- assert "http://example.com" in spider.crawl_result
- assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"]
+ assert root_url in spider.crawl_result
+ assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"]
@patch.object(Spider, "crawl")
@patch.object(Spider, "save_results")
def test_start_with_save_to_file(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None:
- spider = Spider(
- SpiderSettings(root_url="http://example.com", max_links=10, save_to_file="file.txt")
- )
+ spider = Spider(SpiderSettings(root_url=root_url, max_links=10, save_to_file="file.txt"))
mock_crawl.side_effect = lambda url: spider.crawl_result.update(
- {url: {"urls": ["http://example.com/test"]}}
+ {url: {"urls": [f"{root_url}/test"]}}
)
spider.start()
assert mock_crawl.call_count == 1
- assert "http://example.com" in spider.crawl_result
- assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"]
+ assert root_url in spider.crawl_result
+ assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"]
mock_save_results.assert_called_once()
@responses.activate
@patch("urllib.request.urlopen")
-def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://crawlable.com",
- body="link",
- status=200,
- )
- setup_mock_response(
- url="http://notcrawlable.com",
- body="link",
- status=200,
+def test_respect_robots_txt(
+ mock_urlopen: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ "link",
+ 200,
+ )
+ setup_responses(
+ "http://notcrawlable.com",
+ f"link",
+ 200,
)
mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /")
- if url == "http://crawlable.com/robots.txt"
+ if url == f"{root_url}/robots.txt"
else (
BytesIO(b"User-agent: *\nDisallow: /")
if url == "http://notcrawlable.com/robots.txt"
@@ -351,85 +311,89 @@ def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
)
)
- spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True))
with caplog.at_level(DEBUG):
spider.start()
- assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://notcrawlable.com"]}}
-
+ assert spider.crawl_result == {root_url: {"urls": ["http://notcrawlable.com"]}}
assert "Skipped: Url doesn't allow crawling:" in caplog.text
-
assert "http://notcrawlable.com/robots.txt" in spider.robots
@responses.activate
@patch("urllib.request.urlopen")
-def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://crawlable.com",
- body="link",
- status=200,
- )
+def test_respect_robots_txt_allowed(
+ mock_urlopen: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(root_url, f"link", 200)
mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /")
- if url == "http://crawlable.com/robots.txt"
+ if url == f"{root_url}/robots.txt"
else urllib.error.URLError(f"No mock for {url}")
)
- spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True))
with caplog.at_level(DEBUG):
- spider.crawl("http://crawlable.com")
+ spider.crawl(root_url)
- assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://crawlable.com"]}}
+ assert spider.crawl_result == {root_url: {"urls": [root_url]}}
@responses.activate
@patch("urllib.request.urlopen")
-def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://notcrawlable.com",
- body="link",
- status=200,
+def test_respect_robots_txt_not_allowed(
+ mock_urlopen: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ "link",
+ 200,
)
mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nDisallow: /")
- if url == "http://notcrawlable.com/robots.txt"
+ if url == f"{root_url}/robots.txt"
else urllib.error.URLError(f"No mock for {url}")
)
- spider = Spider(SpiderSettings(root_url="http://notcrawlable.com", respect_robots_txt=True))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True))
with caplog.at_level(DEBUG):
- spider.crawl("http://notcrawlable.com")
+ spider.crawl(root_url)
assert spider.crawl_result == {}
-
assert "Skipped: Url doesn't allow crawling:" in caplog.text
-
- assert "http://notcrawlable.com/robots.txt" in spider.robots
+ assert f"{root_url}/robots.txt" in spider.robots
@responses.activate
@patch("urllib.request.urlopen")
-def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://crawlable.com",
- body="link",
- status=200,
- )
- setup_mock_response(
- url="http://notcrawlable.com",
- body="link",
- status=200,
+def test_respect_robots_txt_disabled(
+ mock_urlopen: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ "link",
+ 200,
+ )
+ setup_responses(
+ "http://notcrawlable.com",
+ f"link",
+ 200,
)
mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /")
- if url == "http://crawlable.com/robots.txt"
+ if url == f"{root_url}/robots.txt"
else (
BytesIO(b"User-agent: *\nDisallow: /")
if url == "http://notcrawlable.com/robots.txt"
@@ -438,7 +402,7 @@ def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ign
)
with caplog.at_level(WARNING):
- spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=False))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=False))
assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text
@@ -446,40 +410,43 @@ def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ign
spider.start()
assert spider.crawl_result == {
- "http://crawlable.com": {"urls": ["http://notcrawlable.com"]},
- "http://notcrawlable.com": {"urls": ["http://crawlable.com"]},
+ root_url: {"urls": ["http://notcrawlable.com"]},
+ "http://notcrawlable.com": {"urls": [root_url]},
}
assert "Skipped: Url doesn't allow crawling:" not in caplog.text
-
assert "http://notcrawlable.com/robots.txt" not in spider.robots
@responses.activate
@patch("urllib.request.urlopen")
@patch("time.sleep", return_value=None)
-def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://crawlable.com",
- body="link",
- status=200,
+def test_respect_robots_txt_crawl_delay(
+ mock_sleep: MagicMock,
+ mock_urlopen: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ "link",
+ 200,
)
mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1")
- if url == "http://crawlable.com/robots.txt"
+ if url == f"{root_url}/robots.txt"
else urllib.error.URLError(f"No mock for {url}")
)
- spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True))
with caplog.at_level(DEBUG):
- spider.crawl("http://crawlable.com")
+ spider.crawl(root_url)
assert mock_sleep.call_count == 1
mock_sleep.assert_called_with(1.0)
-
- assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://notcrawlable.com"]}}
+ assert spider.crawl_result == {root_url: {"urls": ["http://notcrawlable.com"]}}
def test_crawl_no_root_url() -> None:
@@ -489,20 +456,23 @@ def test_crawl_no_root_url() -> None:
@patch("time.sleep")
@responses.activate
-def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://transient.error",
- body="link",
- status=503,
+def test_crawl_url_transient_retry(
+ mock_sleep: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ f"link",
+ 503,
)
- spider = Spider(SpiderSettings(root_url="http://transient.error", respect_robots_txt=False))
+ spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=False))
with caplog.at_level(ERROR):
- spider.crawl("http://transient.error")
+ spider.crawl(root_url)
assert spider.crawl_result == {}
-
assert len(responses.calls) == 6
expected_delays = [1, 2, 3, 4, 5]
@@ -514,26 +484,25 @@ def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore
@patch("time.sleep")
@responses.activate
-def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None: # type: ignore
- setup_mock_response(
- url="http://transient.error",
- body="link",
- status=503,
+def test_crawl_url_transient_retry_custom_retry_amount(
+ mock_sleep: MagicMock,
+ setup_responses: Callable[[str, str, int], None],
+ caplog: LogCaptureFixture,
+) -> None:
+ setup_responses(
+ root_url,
+ f"link",
+ 503,
)
spider = Spider(
- SpiderSettings(
- root_url="http://transient.error",
- max_retry_attempts=10,
- respect_robots_txt=False,
- )
+ SpiderSettings(root_url=root_url, max_retry_attempts=10, respect_robots_txt=False)
)
with caplog.at_level(ERROR):
- spider.crawl("http://transient.error")
+ spider.crawl(root_url)
assert spider.crawl_result == {}
-
assert len(responses.calls) == 11
expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]