From d3d6e88b7cabec94175533bc4501757141068da7 Mon Sep 17 00:00:00 2001 From: Indrajith Indraprastham Date: Mon, 5 Aug 2024 20:04:01 +0530 Subject: [PATCH] feat: refactored test_crawler with fixtures --- poetry.lock | 27 ++- pyproject.toml | 2 + tests/conftest.py | 31 +++ tests/test_crawler.py | 537 ++++++++++++++++++++---------------------- 4 files changed, 312 insertions(+), 285 deletions(-) create mode 100644 tests/conftest.py diff --git a/poetry.lock b/poetry.lock index eacb424..d68535f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1241,6 +1241,31 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "types-requests" +version = "2.32.0.20240712" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-requests-2.32.0.20240712.tar.gz", hash = "sha256:90c079ff05e549f6bf50e02e910210b98b8ff1ebdd18e19c873cd237737c1358"}, + {file = "types_requests-2.32.0.20240712-py3-none-any.whl", hash = "sha256:f754283e152c752e46e70942fa2a146b5bc70393522257bb85bd1ef7e019dcc3"}, +] + +[package.dependencies] +urllib3 = ">=2" + +[[package]] +name = "types-setuptools" +version = "71.1.0.20240726" +description = "Typing stubs for setuptools" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-setuptools-71.1.0.20240726.tar.gz", hash = "sha256:85ba28e9461bb1be86ebba4db0f1c2408f2b11115b1966334ea9dc464e29303e"}, + {file = "types_setuptools-71.1.0.20240726-py3-none-any.whl", hash = "sha256:a7775376f36e0ff09bcad236bf265777590a66b11623e48c20bfc30f1444ea36"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1406,4 +1431,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "4bb71f99de019c462a80715c25fe5009d6a79d8375c56893cc931e46dd199f9a" +content-hash = "adbac27b89b1b9e13fb8dd281bfe69d0da0abd0c19fad7ad9c7738fe783fe5e1" diff --git a/pyproject.toml b/pyproject.toml index 2dbcdde..df4a7f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,8 @@ lxml = "^5.2.2" colorama = "^0.4.6" requests = "^2.32.3" aiohttp = "^3.10.0" +types-requests = "^2.32.0.20240712" +types-setuptools = "^71.1.0.20240726" [tool.poetry.group.dev.dependencies] responses = "^0.13.4" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..85e2741 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,31 @@ +# conftest.py +from typing import Callable, Generator +from unittest.mock import MagicMock, patch + +import pytest + +from tests.utils import setup_mock_response + +root_url = "http://example.com" + + +@pytest.fixture +def mock_urlopen() -> Generator[MagicMock, None, None]: + with patch("urllib.request.urlopen") as mock: + yield mock + + +@pytest.fixture +def mock_response() -> MagicMock: + response = MagicMock() + response.read.return_value = b"" + response.status = 200 + return response + + +@pytest.fixture +def setup_responses() -> Callable[[str, str, int], None]: + def _setup_responses(url: str, body: str, status: int = 200) -> None: + setup_mock_response(url=url, body=body, status=status) + + return _setup_responses diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 6f12e48..8fb7741 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,50 +1,46 @@ +import urllib import urllib.error from io import BytesIO from logging import DEBUG, ERROR, WARNING +from typing import Callable from unittest.mock import MagicMock, mock_open, patch import pytest import responses from datacrawl import Spider, SpiderSettings +from pytest import LogCaptureFixture -from tests.utils import setup_mock_response +from tests.conftest import root_url @responses.activate -def test_crawl() -> None: - setup_mock_response( - url="http://example.com", - body="link", - status=200, - ) - setup_mock_response( - url="http://example.com/test", - body="link", - status=200, +def test_crawl( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses(root_url, f"link", 200) + setup_responses( + f"{root_url}/test", f"link", 200 ) - # Mock urllib.request.urlopen to avoid real network calls - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response + mock_urlopen.side_effect = lambda url: mock_response - spider = Spider(SpiderSettings(root_url="http://example.com", max_links=10)) - spider.crawl("http://example.com") + spider = Spider(SpiderSettings(root_url=root_url, max_links=10)) + spider.crawl(root_url) - assert "http://example.com" in spider.crawl_result - assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"] + assert root_url in spider.crawl_result + assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"] - spider.crawl("http://example.com/test") + spider.crawl(f"{root_url}/test") - assert "http://example.com/test" in spider.crawl_result - assert spider.crawl_result["http://example.com/test"]["urls"] == ["http://example.com"] + assert f"{root_url}/test" in spider.crawl_result + assert spider.crawl_result[f"{root_url}/test"]["urls"] == [root_url] @responses.activate -def test_crawl_invalid_url(caplog) -> None: # type: ignore - spider = Spider(SpiderSettings(root_url="http://example.com")) +def test_crawl_invalid_url(mock_urlopen: MagicMock, caplog: LogCaptureFixture) -> None: + spider = Spider(SpiderSettings(root_url=root_url)) with caplog.at_level(DEBUG): spider.crawl("invalid_url") @@ -54,118 +50,99 @@ def test_crawl_invalid_url(caplog) -> None: # type: ignore @responses.activate -def test_crawl_already_crawled_url(caplog) -> None: # type: ignore - setup_mock_response( - url="http://example.com", - body="link", - status=200, - ) +def test_crawl_already_crawled_url( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses(root_url, f"link", 200) - spider = Spider(SpiderSettings(root_url="http://example.com")) + spider = Spider(SpiderSettings(root_url=root_url)) + mock_urlopen.side_effect = lambda url: mock_response - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response - - with caplog.at_level(DEBUG): - spider.crawl("http://example.com") - spider.crawl("http://example.com") + with caplog.at_level(DEBUG): + spider.crawl(root_url) + spider.crawl(root_url) assert "URL already crawled:" in caplog.text - assert spider.crawl_result == {"http://example.com": {"urls": ["http://example.com"]}} + assert spider.crawl_result == {root_url: {"urls": [root_url]}} @responses.activate -def test_crawl_unfetchable_url() -> None: - setup_mock_response( - url="http://example.com", - body="link", - status=404, - ) - - spider = Spider(SpiderSettings(root_url="http://example.com")) - - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 404 - mock_urlopen.return_value = mock_response - - spider.crawl("http://example.com") - +def test_crawl_unfetchable_url( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses(root_url, f"link", 404) + + spider = Spider(SpiderSettings(root_url=root_url)) + mock_response.status = 404 + mock_urlopen.side_effect = lambda url: mock_response + + spider.crawl(root_url) assert spider.crawl_result == {} @responses.activate -def test_crawl_found_invalid_url(caplog) -> None: # type: ignore - setup_mock_response( - url="http://example.com", - body="link", - status=200, - ) +def test_crawl_found_invalid_url( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses(root_url, "link", 200) - spider = Spider(SpiderSettings(root_url="http://example.com")) + spider = Spider(SpiderSettings(root_url=root_url)) + mock_urlopen.side_effect = lambda url: mock_response - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response - - with caplog.at_level(DEBUG): - spider.crawl("http://example.com") + with caplog.at_level(DEBUG): + spider.crawl(root_url) assert "Invalid url:" in caplog.text - assert spider.crawl_result == {"http://example.com": {"urls": []}} + assert spider.crawl_result == {root_url: {"urls": []}} @responses.activate -def test_crawl_found_duplicate_url() -> None: - setup_mock_response( - url="http://example.com", - body="link1" - + "link2", - status=200, +def test_crawl_found_duplicate_url( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses( + root_url, + f"link1" + f"link2", + 200, ) - spider = Spider(SpiderSettings(root_url="http://example.com")) - - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response + spider = Spider(SpiderSettings(root_url=root_url)) + mock_urlopen.side_effect = lambda url: mock_response - spider.crawl("http://example.com") - - assert spider.crawl_result == {"http://example.com": {"urls": ["http://duplicate.com"]}} + spider.crawl(root_url) + assert spider.crawl_result == {root_url: {"urls": [root_url]}} @responses.activate -def test_crawl_no_urls_in_page() -> None: - setup_mock_response(url="http://example.com", body="", status=200) - - spider = Spider(SpiderSettings(root_url="http://example.com")) +def test_crawl_no_urls_in_page( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses(root_url, "", 200) - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response + spider = Spider(SpiderSettings(root_url=root_url)) + mock_urlopen.side_effect = lambda url: mock_response - spider.crawl("http://example.com") - - assert spider.crawl_result == {"http://example.com": {"urls": []}} + spider.crawl(root_url) + assert spider.crawl_result == {root_url: {"urls": []}} @responses.activate def test_save_results() -> None: - spider = Spider( - SpiderSettings(root_url="http://example.com", max_links=10, save_to_file="out.json") - ) - spider.crawl_result = {"http://example.com": {"urls": ["http://example.com/test"]}} + spider = Spider(SpiderSettings(root_url=root_url, max_links=10, save_to_file="out.json")) + spider.crawl_result = {root_url: {"urls": [f"{root_url}/test"]}} with patch("builtins.open", mock_open()) as mocked_file: spider.save_results() @@ -173,177 +150,160 @@ def test_save_results() -> None: @responses.activate -def test_url_regex() -> None: - setup_mock_response( - url="http://example.com", - body="link" - + "link", - status=200, +def test_url_regex( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses( + root_url, + f"link" + f"link", + 200, ) - # This regex matches strings starting with "http://example.com/" - # And only have numeric characters after it - regex = r"http://example\.com/[0-9]+" - - spider = Spider(SpiderSettings(root_url="http://example.com", url_regex=regex)) + regex = rf"{root_url}/[0-9]+" - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response + spider = Spider(SpiderSettings(root_url=root_url, url_regex=regex)) + mock_urlopen.side_effect = lambda url: mock_response - spider.start() - - assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/123"] - assert "http://example.com/test" not in spider.crawl_result["http://example.com"]["urls"] + spider.start() + assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/123"] + assert f"{root_url}/test" not in spider.crawl_result[root_url]["urls"] @responses.activate -def test_include_body() -> None: - setup_mock_response( - url="http://example.com", - body="link", - status=200, - ) - setup_mock_response( - url="http://example.com/test", - body="This is a header", - status=200, - ) - - spider = Spider(SpiderSettings(root_url="http://example.com", include_body=True)) +def test_include_body( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], +) -> None: + setup_responses(root_url, f"link", 200) + setup_responses(f"{root_url}/test", "This is a header", 200) - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response - - spider.start() + spider = Spider(SpiderSettings(root_url=root_url, include_body=True)) + mock_urlopen.side_effect = lambda url: mock_response + spider.start() assert ( - spider.crawl_result["http://example.com"]["body"] - == 'link' + spider.crawl_result[root_url]["body"] + == f'link' ) assert ( - spider.crawl_result["http://example.com/test"]["body"] + spider.crawl_result[f"{root_url}/test"]["body"] == "This is a header" ) @responses.activate -def test_internal_links_only(caplog) -> None: # type: ignore - setup_mock_response( - url="http://internal.com", - body="link" - + "link", - status=200, - ) - - spider = Spider(SpiderSettings(root_url="http://internal.com", internal_links_only=True)) +def test_internal_links_only( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + f"link" + "link", + 200, + ) + + spider = Spider(SpiderSettings(root_url=root_url, internal_links_only=True)) + mock_urlopen.side_effect = lambda url: mock_response - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response - - with caplog.at_level(DEBUG): - spider.crawl("http://internal.com") + with caplog.at_level(DEBUG): + spider.crawl(root_url) assert "Skipping: External link:" in caplog.text - assert spider.crawl_result == {"http://internal.com": {"urls": ["http://internal.com/test"]}} + assert spider.crawl_result == {root_url: {"urls": [f"{root_url}/test"]}} @responses.activate -def test_external_links_only(caplog) -> None: # type: ignore - setup_mock_response( - url="http://internal.com", - body="link" - + "link", - status=200, - ) +def test_external_links_only( + mock_urlopen: MagicMock, + mock_response: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + f"link" + "link", + 200, + ) + + spider = Spider(SpiderSettings(root_url=root_url, external_links_only=True)) + mock_urlopen.side_effect = lambda url: mock_response - spider = Spider(SpiderSettings(root_url="http://internal.com", external_links_only=True)) - - with patch("urllib.request.urlopen") as mock_urlopen: - mock_response = MagicMock() - mock_response.read.return_value = b"" - mock_response.status = 200 - mock_urlopen.return_value = mock_response - - with caplog.at_level(DEBUG): - spider.crawl("http://internal.com") + with caplog.at_level(DEBUG): + spider.crawl(root_url) assert "Skipping: Internal link:" in caplog.text - assert spider.crawl_result == {"http://internal.com": {"urls": ["http://external.com/test"]}} + assert spider.crawl_result == {root_url: {"urls": ["http://external.com/test"]}} @responses.activate def test_external_and_internal_links_only() -> None: with pytest.raises(ValueError): Spider( - SpiderSettings( - root_url="http://example.com", - internal_links_only=True, - external_links_only=True, - ) + SpiderSettings(root_url=root_url, internal_links_only=True, external_links_only=True) ) @patch.object(Spider, "crawl") @patch.object(Spider, "save_results") def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: - spider = Spider(SpiderSettings(root_url="http://example.com", max_links=10)) + spider = Spider(SpiderSettings(root_url=root_url, max_links=10)) mock_crawl.side_effect = lambda url: spider.crawl_result.update( - {url: {"urls": ["http://example.com/test"]}} + {url: {"urls": [f"{root_url}/test"]}} ) - print(mock_save_results) spider.start() assert mock_crawl.call_count == 1 - assert "http://example.com" in spider.crawl_result - assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"] + assert root_url in spider.crawl_result + assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"] @patch.object(Spider, "crawl") @patch.object(Spider, "save_results") def test_start_with_save_to_file(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: - spider = Spider( - SpiderSettings(root_url="http://example.com", max_links=10, save_to_file="file.txt") - ) + spider = Spider(SpiderSettings(root_url=root_url, max_links=10, save_to_file="file.txt")) mock_crawl.side_effect = lambda url: spider.crawl_result.update( - {url: {"urls": ["http://example.com/test"]}} + {url: {"urls": [f"{root_url}/test"]}} ) spider.start() assert mock_crawl.call_count == 1 - assert "http://example.com" in spider.crawl_result - assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"] + assert root_url in spider.crawl_result + assert spider.crawl_result[root_url]["urls"] == [f"{root_url}/test"] mock_save_results.assert_called_once() @responses.activate @patch("urllib.request.urlopen") -def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore - setup_mock_response( - url="http://crawlable.com", - body="link", - status=200, - ) - setup_mock_response( - url="http://notcrawlable.com", - body="link", - status=200, +def test_respect_robots_txt( + mock_urlopen: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + "link", + 200, + ) + setup_responses( + "http://notcrawlable.com", + f"link", + 200, ) mock_urlopen.side_effect = lambda url: ( BytesIO(b"User-agent: *\nAllow: /") - if url == "http://crawlable.com/robots.txt" + if url == f"{root_url}/robots.txt" else ( BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" @@ -351,85 +311,89 @@ def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore ) ) - spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True)) with caplog.at_level(DEBUG): spider.start() - assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://notcrawlable.com"]}} - + assert spider.crawl_result == {root_url: {"urls": ["http://notcrawlable.com"]}} assert "Skipped: Url doesn't allow crawling:" in caplog.text - assert "http://notcrawlable.com/robots.txt" in spider.robots @responses.activate @patch("urllib.request.urlopen") -def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore - setup_mock_response( - url="http://crawlable.com", - body="link", - status=200, - ) +def test_respect_robots_txt_allowed( + mock_urlopen: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses(root_url, f"link", 200) mock_urlopen.side_effect = lambda url: ( BytesIO(b"User-agent: *\nAllow: /") - if url == "http://crawlable.com/robots.txt" + if url == f"{root_url}/robots.txt" else urllib.error.URLError(f"No mock for {url}") ) - spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True)) with caplog.at_level(DEBUG): - spider.crawl("http://crawlable.com") + spider.crawl(root_url) - assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://crawlable.com"]}} + assert spider.crawl_result == {root_url: {"urls": [root_url]}} @responses.activate @patch("urllib.request.urlopen") -def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore - setup_mock_response( - url="http://notcrawlable.com", - body="link", - status=200, +def test_respect_robots_txt_not_allowed( + mock_urlopen: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + "link", + 200, ) mock_urlopen.side_effect = lambda url: ( BytesIO(b"User-agent: *\nDisallow: /") - if url == "http://notcrawlable.com/robots.txt" + if url == f"{root_url}/robots.txt" else urllib.error.URLError(f"No mock for {url}") ) - spider = Spider(SpiderSettings(root_url="http://notcrawlable.com", respect_robots_txt=True)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True)) with caplog.at_level(DEBUG): - spider.crawl("http://notcrawlable.com") + spider.crawl(root_url) assert spider.crawl_result == {} - assert "Skipped: Url doesn't allow crawling:" in caplog.text - - assert "http://notcrawlable.com/robots.txt" in spider.robots + assert f"{root_url}/robots.txt" in spider.robots @responses.activate @patch("urllib.request.urlopen") -def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore - setup_mock_response( - url="http://crawlable.com", - body="link", - status=200, - ) - setup_mock_response( - url="http://notcrawlable.com", - body="link", - status=200, +def test_respect_robots_txt_disabled( + mock_urlopen: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + "link", + 200, + ) + setup_responses( + "http://notcrawlable.com", + f"link", + 200, ) mock_urlopen.side_effect = lambda url: ( BytesIO(b"User-agent: *\nAllow: /") - if url == "http://crawlable.com/robots.txt" + if url == f"{root_url}/robots.txt" else ( BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" @@ -438,7 +402,7 @@ def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ign ) with caplog.at_level(WARNING): - spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=False)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=False)) assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text @@ -446,40 +410,43 @@ def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ign spider.start() assert spider.crawl_result == { - "http://crawlable.com": {"urls": ["http://notcrawlable.com"]}, - "http://notcrawlable.com": {"urls": ["http://crawlable.com"]}, + root_url: {"urls": ["http://notcrawlable.com"]}, + "http://notcrawlable.com": {"urls": [root_url]}, } assert "Skipped: Url doesn't allow crawling:" not in caplog.text - assert "http://notcrawlable.com/robots.txt" not in spider.robots @responses.activate @patch("urllib.request.urlopen") @patch("time.sleep", return_value=None) -def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore - setup_mock_response( - url="http://crawlable.com", - body="link", - status=200, +def test_respect_robots_txt_crawl_delay( + mock_sleep: MagicMock, + mock_urlopen: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + "link", + 200, ) mock_urlopen.side_effect = lambda url: ( BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1") - if url == "http://crawlable.com/robots.txt" + if url == f"{root_url}/robots.txt" else urllib.error.URLError(f"No mock for {url}") ) - spider = Spider(SpiderSettings(root_url="http://crawlable.com", respect_robots_txt=True)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=True)) with caplog.at_level(DEBUG): - spider.crawl("http://crawlable.com") + spider.crawl(root_url) assert mock_sleep.call_count == 1 mock_sleep.assert_called_with(1.0) - - assert spider.crawl_result == {"http://crawlable.com": {"urls": ["http://notcrawlable.com"]}} + assert spider.crawl_result == {root_url: {"urls": ["http://notcrawlable.com"]}} def test_crawl_no_root_url() -> None: @@ -489,20 +456,23 @@ def test_crawl_no_root_url() -> None: @patch("time.sleep") @responses.activate -def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore - setup_mock_response( - url="http://transient.error", - body="link", - status=503, +def test_crawl_url_transient_retry( + mock_sleep: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + f"link", + 503, ) - spider = Spider(SpiderSettings(root_url="http://transient.error", respect_robots_txt=False)) + spider = Spider(SpiderSettings(root_url=root_url, respect_robots_txt=False)) with caplog.at_level(ERROR): - spider.crawl("http://transient.error") + spider.crawl(root_url) assert spider.crawl_result == {} - assert len(responses.calls) == 6 expected_delays = [1, 2, 3, 4, 5] @@ -514,26 +484,25 @@ def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore @patch("time.sleep") @responses.activate -def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None: # type: ignore - setup_mock_response( - url="http://transient.error", - body="link", - status=503, +def test_crawl_url_transient_retry_custom_retry_amount( + mock_sleep: MagicMock, + setup_responses: Callable[[str, str, int], None], + caplog: LogCaptureFixture, +) -> None: + setup_responses( + root_url, + f"link", + 503, ) spider = Spider( - SpiderSettings( - root_url="http://transient.error", - max_retry_attempts=10, - respect_robots_txt=False, - ) + SpiderSettings(root_url=root_url, max_retry_attempts=10, respect_robots_txt=False) ) with caplog.at_level(ERROR): - spider.crawl("http://transient.error") + spider.crawl(root_url) assert spider.crawl_result == {} - assert len(responses.calls) == 11 expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]