diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb72d11..e029c35 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,12 +18,12 @@ jobs: - name: Install dependencies run: | poetry install --with dev - - name: Run linter :pylint + - name: Run black run: | - poetry run pylint src - - name: Run mypy :type_checking + poetry run black --check . + - name: Run ruff run: | - poetry run mypy --install-types --non-interactive src + poetry run ruff check . test: needs: lint diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e6a10db..1530c4b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,25 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.3.0 + rev: v4.6.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace + + - repo: https://github.com/psf/black + rev: 24.8.0 # Use the latest version + hooks: + - id: black + args: ["--line-length=100"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.5.6" # Use the latest version + hooks: + - id: ruff + args: ["--line-length=100"] + - repo: local hooks: - - id: pylint - name: Run pylint - entry: poetry run pylint - language: system - types: [python] - args: ["src"] - stages: [commit] - id: pytest name: Run pytest entry: poetry run pytest @@ -21,8 +27,9 @@ repos: pass_filenames: false always_run: true stages: [push] + - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.10.0" + rev: "v1.11.1" hooks: - id: mypy args: [--install-types, --non-interactive] diff --git a/poetry.lock b/poetry.lock index 75c882b..eacb424 100644 --- a/poetry.lock +++ b/poetry.lock @@ -136,20 +136,6 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" -[[package]] -name = "astroid" -version = "3.0.3" -description = "An abstract syntax tree for Python with inference support." -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "astroid-3.0.3-py3-none-any.whl", hash = "sha256:92fcf218b89f449cdf9f7b39a269f8d5d617b27be68434912e11e79203963a17"}, - {file = "astroid-3.0.3.tar.gz", hash = "sha256:4148645659b08b70d72460ed1921158027a9e53ae8b7234149b1400eddacbb93"}, -] - -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} - [[package]] name = "async-timeout" version = "4.0.3" @@ -201,6 +187,52 @@ charset-normalizer = ["charset-normalizer"] html5lib = ["html5lib"] lxml = ["lxml"] +[[package]] +name = "black" +version = "24.8.0" +description = "The uncompromising code formatter." +optional = false +python-versions = ">=3.8" +files = [ + {file = "black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6"}, + {file = "black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb"}, + {file = "black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42"}, + {file = "black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a"}, + {file = "black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1"}, + {file = "black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af"}, + {file = "black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4"}, + {file = "black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af"}, + {file = "black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368"}, + {file = "black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed"}, + {file = "black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018"}, + {file = "black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2"}, + {file = "black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd"}, + {file = "black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2"}, + {file = "black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e"}, + {file = "black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920"}, + {file = "black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c"}, + {file = "black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e"}, + {file = "black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47"}, + {file = "black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb"}, + {file = "black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed"}, + {file = "black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "certifi" version = "2024.6.2" @@ -322,6 +354,20 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -400,21 +446,6 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] -[[package]] -name = "dill" -version = "0.3.8" -description = "serialize all of Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, - {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, -] - -[package.extras] -graph = ["objgraph (>=1.7.2)"] -profile = ["gprof2dot (>=2022.7.29)"] - [[package]] name = "distlib" version = "0.3.8" @@ -578,20 +609,6 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] -[[package]] -name = "isort" -version = "5.13.2" -description = "A Python utility / library to sort Python imports." -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, - {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, -] - -[package.extras] -colors = ["colorama (>=0.4.6)"] - [[package]] name = "lxml" version = "5.2.2" @@ -750,17 +767,6 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.10)"] -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -optional = false -python-versions = ">=3.6" -files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, -] - [[package]] name = "multidict" version = "6.0.5" @@ -940,6 +946,17 @@ files = [ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + [[package]] name = "platformdirs" version = "4.2.2" @@ -989,36 +1006,6 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" -[[package]] -name = "pylint" -version = "3.0.2" -description = "python code static checker" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "pylint-3.0.2-py3-none-any.whl", hash = "sha256:60ed5f3a9ff8b61839ff0348b3624ceeb9e6c2a92c514d81c9cc273da3b6bcda"}, - {file = "pylint-3.0.2.tar.gz", hash = "sha256:0d4c286ef6d2f66c8bfb527a7f8a629009e42c99707dec821a03e1b51a4c1496"}, -] - -[package.dependencies] -astroid = ">=3.0.1,<=3.1.0-dev0" -colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} -dill = [ - {version = ">=0.2", markers = "python_version < \"3.11\""}, - {version = ">=0.3.7", markers = "python_version >= \"3.12\""}, - {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, -] -isort = ">=4.2.5,<6" -mccabe = ">=0.6,<0.8" -platformdirs = ">=2.2.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -tomlkit = ">=0.10.1" -typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} - -[package.extras] -spelling = ["pyenchant (>=3.2,<4.0)"] -testutils = ["gitpython (>3)"] - [[package]] name = "pytest" version = "8.3.2" @@ -1194,6 +1181,33 @@ urllib3 = ">=1.25.10" [package.extras] tests = ["coverage (>=3.7.1,<6.0.0)", "flake8", "mypy", "pytest (>=4.6)", "pytest (>=4.6,<5.0)", "pytest-cov", "pytest-localserver", "types-mock", "types-requests", "types-six"] +[[package]] +name = "ruff" +version = "0.5.6" +description = "An extremely fast Python linter and code formatter, written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.5.6-py3-none-linux_armv6l.whl", hash = "sha256:a0ef5930799a05522985b9cec8290b185952f3fcd86c1772c3bdbd732667fdcd"}, + {file = "ruff-0.5.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b652dc14f6ef5d1552821e006f747802cc32d98d5509349e168f6bf0ee9f8f42"}, + {file = "ruff-0.5.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80521b88d26a45e871f31e4b88938fd87db7011bb961d8afd2664982dfc3641a"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9bc8f328a9f1309ae80e4d392836e7dbc77303b38ed4a7112699e63d3b066ab"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d394940f61f7720ad371ddedf14722ee1d6250fd8d020f5ea5a86e7be217daf"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111a99cdb02f69ddb2571e2756e017a1496c2c3a2aeefe7b988ddab38b416d36"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e395daba77a79f6dc0d07311f94cc0560375ca20c06f354c7c99af3bf4560c5d"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c476acb43c3c51e3c614a2e878ee1589655fa02dab19fe2db0423a06d6a5b1b6"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2ff8003f5252fd68425fd53d27c1f08b201d7ed714bb31a55c9ac1d4c13e2eb"}, + {file = "ruff-0.5.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c94e084ba3eaa80c2172918c2ca2eb2230c3f15925f4ed8b6297260c6ef179ad"}, + {file = "ruff-0.5.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1f77c1c3aa0669fb230b06fb24ffa3e879391a3ba3f15e3d633a752da5a3e670"}, + {file = "ruff-0.5.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f908148c93c02873210a52cad75a6eda856b2cbb72250370ce3afef6fb99b1ed"}, + {file = "ruff-0.5.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:563a7ae61ad284187d3071d9041c08019975693ff655438d8d4be26e492760bd"}, + {file = "ruff-0.5.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:94fe60869bfbf0521e04fd62b74cbca21cbc5beb67cbb75ab33fe8c174f54414"}, + {file = "ruff-0.5.6-py3-none-win32.whl", hash = "sha256:e6a584c1de6f8591c2570e171cc7ce482bb983d49c70ddf014393cd39e9dfaed"}, + {file = "ruff-0.5.6-py3-none-win_amd64.whl", hash = "sha256:d7fe7dccb1a89dc66785d7aa0ac283b2269712d8ed19c63af908fdccca5ccc1a"}, + {file = "ruff-0.5.6-py3-none-win_arm64.whl", hash = "sha256:57c6c0dd997b31b536bff49b9eee5ed3194d60605a4427f735eeb1f9c1b8d264"}, + {file = "ruff-0.5.6.tar.gz", hash = "sha256:07c9e3c2a8e1fe377dd460371c3462671a728c981c3205a5217291422209f642"}, +] + [[package]] name = "six" version = "1.16.0" @@ -1227,17 +1241,6 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -[[package]] -name = "tomlkit" -version = "0.12.5" -description = "Style preserving TOML library" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomlkit-0.12.5-py3-none-any.whl", hash = "sha256:af914f5a9c59ed9d0762c7b64d3b5d5df007448eb9cd2edc8a46b1eafead172f"}, - {file = "tomlkit-0.12.5.tar.gz", hash = "sha256:eef34fba39834d4d6b73c9ba7f3e4d1c417a4e56f89a7e96e090dd0d24b8fb3c"}, -] - [[package]] name = "typing-extensions" version = "4.12.2" @@ -1403,4 +1406,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "e87226c1134ec29c44295cd0f137528f09266c4c0113a084c074c38bd89a008d" +content-hash = "4bb71f99de019c462a80715c25fe5009d6a79d8375c56893cc931e46dd199f9a" diff --git a/pyproject.toml b/pyproject.toml index f943be9..94f7951 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,12 +22,10 @@ beautifulsoup4 = "^4.12.3" lxml = "^5.2.2" colorama = "^0.4.6" requests = "^2.32.3" -pylint = "3.0.2" aiohttp = "^3.10.0" [tool.poetry.group.dev.dependencies] responses = "^0.13.4" -pylint = "^3.0.2" mypy = "^1.10.0" pytest-cov = "^5.0.0" requests-mock = "^1.12.1" @@ -35,7 +33,15 @@ pre-commit = ">=2.15,<3.0" pytest-asyncio = "^0.23.8" pytest = "^8.3.2" aioresponses = "^0.7.6" +black = "^24.8.0" +ruff = "^0.5.6" +[tool.ruff] +line-length = 100 +select = ["E", "F", "I", "N"] + +[tool.black] +line-length = 100 [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/script.py b/script.py index df7a5d7..470c605 100755 --- a/script.py +++ b/script.py @@ -1,4 +1,5 @@ import subprocess + def post_install() -> None: subprocess.run(["poetry", "run", "pre-commit", "install"], check=True) diff --git a/src/tiny_web_crawler/__init__.py b/src/tiny_web_crawler/__init__.py index cef0704..3879d04 100644 --- a/src/tiny_web_crawler/__init__.py +++ b/src/tiny_web_crawler/__init__.py @@ -1,2 +1,4 @@ from tiny_web_crawler.core.spider import Spider from tiny_web_crawler.core.spider_settings import SpiderSettings + +__all__ = ["Spider", "SpiderSettings"] diff --git a/src/tiny_web_crawler/core/spider.py b/src/tiny_web_crawler/core/spider.py index 8d8b040..9d4a75a 100644 --- a/src/tiny_web_crawler/core/spider.py +++ b/src/tiny_web_crawler/core/spider.py @@ -1,33 +1,40 @@ from __future__ import annotations -from dataclasses import dataclass, field import json -import time import re - -from typing import Dict, List, Set, Any -from concurrent.futures import ThreadPoolExecutor, as_completed +import time import urllib.parse import urllib.robotparser +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from logging import DEBUG, INFO +from typing import Any, Dict, List, Set + import requests from tiny_web_crawler.core.spider_settings import SpiderSettings +from tiny_web_crawler.logger import get_logger, set_logging_level from tiny_web_crawler.networking.fetcher import fetch_url -from tiny_web_crawler.networking.validator import is_valid_url from tiny_web_crawler.networking.formatter import format_url -from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url -from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG +from tiny_web_crawler.networking.robots_txt import ( + get_robots_txt_url, + is_robots_txt_allowed, + setup_robots_txt_parser, +) +from tiny_web_crawler.networking.validator import is_valid_url -DEFAULT_SCHEME: str = 'http://' +DEFAULT_SCHEME: str = "http://" logger = get_logger() + @dataclass class Spider: """ A simple web crawler class. Attributes: - settings (SpiderSettings): The SpiderSettings object with the settings for the Spider object + settings (SpiderSettings): + The SpiderSettings object with the settings for the Spider object """ settings: SpiderSettings @@ -50,11 +57,11 @@ def __post_init__(self) -> None: if not self.settings.respect_robots_txt: logger.warning( - "Ignoring robots.txt files! You might be at risk of:\n"+ - "Agent/IP bans;\n"+ - "Disrupted operation;\n"+ - "Increased suspicion from anti-bot services;\n"+ - "Potential legal action;" + "Ignoring robots.txt files! You might be at risk of:\n" + + "Agent/IP bans;\n" + + "Disrupted operation;\n" + + "Increased suspicion from anti-bot services;\n" + + "Potential legal action;" ) def save_results(self) -> None: @@ -62,7 +69,7 @@ def save_results(self) -> None: Saves the crawl results into a JSON file. """ if self.settings.save_to_file: - with open(self.settings.save_to_file, 'w', encoding='utf-8') as file: + with open(self.settings.save_to_file, "w", encoding="utf-8") as file: json.dump(self.crawl_result, file, indent=4) def crawl(self, url: str) -> None: @@ -89,19 +96,19 @@ def crawl(self, url: str) -> None: if not soup: return - links = soup.body.find_all('a', href=True) if soup.body else [] - self.crawl_result[url] = {'urls': []} + links = soup.body.find_all("a", href=True) if soup.body else [] + self.crawl_result[url] = {"urls": []} if self.settings.include_body: - self.crawl_result[url]['body'] = str(soup) + self.crawl_result[url]["body"] = str(soup) for link in links: - pretty_url = format_url(link['href'].lstrip(), url, self.scheme) + pretty_url = format_url(link["href"].lstrip(), url, self.scheme) if self._should_skip_link(pretty_url, url): continue - self.crawl_result[url]['urls'].append(pretty_url) + self.crawl_result[url]["urls"].append(pretty_url) self.crawl_set.add(pretty_url) logger.debug("Link found: %s", pretty_url) @@ -114,18 +121,24 @@ def _should_skip_link(self, pretty_url: str, url: str) -> bool: logger.debug("Invalid url: %s", pretty_url) return True - if pretty_url in self.crawl_result[url]['urls']: + if pretty_url in self.crawl_result[url]["urls"]: return True if self.settings.url_regex and not re.compile(self.settings.url_regex).match(pretty_url): logger.debug("Skipping: URL didn't match regex: %s", pretty_url) return True - if self.settings.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc: + if ( + self.settings.internal_links_only + and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc + ): logger.debug("Skipping: External link: %s", pretty_url) return True - if self.settings.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc: + if ( + self.settings.external_links_only + and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc + ): logger.debug("Skipping: Internal link: %s", pretty_url) return True diff --git a/src/tiny_web_crawler/core/spider_settings.py b/src/tiny_web_crawler/core/spider_settings.py index 5bd2f21..044d9c3 100644 --- a/src/tiny_web_crawler/core/spider_settings.py +++ b/src/tiny_web_crawler/core/spider_settings.py @@ -1,6 +1,6 @@ +from dataclasses import dataclass from typing import Optional -from dataclasses import dataclass @dataclass class GeneralSettings: @@ -23,18 +23,30 @@ class GeneralSettings: delay: float = 0.5 verbose: bool = True + @dataclass class CrawlSettings: """ A simple dataclass to store crawl settings for the Spider class Attributes: - url_regex (Optional[str]): A regular expression against which urls will be matched before crawling - include_body (bool): Whether or not to include the crawled page's body in crawl_result (Default: False) - internal_links_only (bool): Whether or not to crawl only internal links (Default: False) - external_links_only (bool): Whether or not to crawl only external links (Default: False) - respect_robots_txt (bool): Whether or not to respect website's robots.txt files (defualt: True) + url_regex (Optional[str]): + A regular expression against which urls will be matched before crawling + + include_body (bool): + Whether or not to include the crawled page's body in crawl_result + (Default: False) + + internal_links_only (bool): + Whether or not to crawl only internal links (Default: False) + + external_links_only (bool): + Whether or not to crawl only external links (Default: False) + + respect_robots_txt (bool): + Whether or not to respect website's robots.txt files (default: True) """ + url_regex: Optional[str] = None include_body: bool = False internal_links_only: bool = False @@ -42,6 +54,7 @@ class CrawlSettings: respect_robots_txt: bool = True max_retry_attempts: int = 5 + @dataclass class SpiderSettings(GeneralSettings, CrawlSettings): """ @@ -50,7 +63,10 @@ class SpiderSettings(GeneralSettings, CrawlSettings): def __post_init__(self) -> None: if self.root_url == "": - raise ValueError("\"root_url\" argument is required") + raise ValueError('"root_url" argument is required') if self.internal_links_only and self.external_links_only: - raise ValueError("Only one of internal_links_only and external_links_only can be set to True") + + raise ValueError( + "Only one of internal_links_only and external_links_only can be set to True" + ) diff --git a/src/tiny_web_crawler/logging.py b/src/tiny_web_crawler/logger.py similarity index 85% rename from src/tiny_web_crawler/logging.py rename to src/tiny_web_crawler/logger.py index 7148770..915a998 100644 --- a/src/tiny_web_crawler/logging.py +++ b/src/tiny_web_crawler/logger.py @@ -1,13 +1,12 @@ import logging -from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL, NOTSET # pylint: disable=unused-import from colorama import Fore LOGGER_NAME: str = "tiny-web-crawler" -DEFAULT_LOG_LEVEL: int = INFO +DEFAULT_LOG_LEVEL: int = logging.INFO -class ColorFormatter(logging.Formatter): +class ColorFormatter(logging.Formatter): message_format: str = "%(levelname)s %(message)s" FORMATS = { @@ -15,7 +14,7 @@ class ColorFormatter(logging.Formatter): logging.INFO: Fore.BLUE + message_format + Fore.RESET, logging.WARNING: Fore.YELLOW + message_format + Fore.RESET, logging.ERROR: Fore.RED + message_format + Fore.RESET, - logging.CRITICAL: Fore.RED + message_format + Fore.RESET + logging.CRITICAL: Fore.RED + message_format + Fore.RESET, } def format(self, record: logging.LogRecord) -> str: @@ -28,7 +27,7 @@ def get_logger() -> logging.Logger: return logging.getLogger(LOGGER_NAME) -def set_logging_level(level:int) -> None: +def set_logging_level(level: int) -> None: get_logger().setLevel(level=level) diff --git a/src/tiny_web_crawler/networking/fetcher.py b/src/tiny_web_crawler/networking/fetcher.py index a3eb6dc..f53672a 100644 --- a/src/tiny_web_crawler/networking/fetcher.py +++ b/src/tiny_web_crawler/networking/fetcher.py @@ -6,26 +6,28 @@ import requests from bs4 import BeautifulSoup -from tiny_web_crawler.logging import get_logger +from tiny_web_crawler.logger import get_logger TRANSIENT_ERRORS = [408, 502, 503, 504] logger = get_logger() + def is_transient_error(status_code: int) -> bool: return status_code in TRANSIENT_ERRORS + def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSoup]: try: response = requests.get(url, timeout=10) response.raise_for_status() data = response.text - return BeautifulSoup(data, 'lxml') + return BeautifulSoup(data, "lxml") except requests.exceptions.HTTPError as http_err: if response.status_code and is_transient_error(response.status_code) and retries > 0: logger.error("Transient HTTP error occurred: %s. Retrying...", http_err) - time.sleep( attempts+1 ) - return fetch_url( url, retries-1 , attempts+1) + time.sleep(attempts + 1) + return fetch_url(url, retries - 1, attempts + 1) logger.error("HTTP error occurred: %s", http_err) return None @@ -43,13 +45,16 @@ async def fetch_url_async(url: str, retries: int, attempts: int = 0) -> Optional try: async with session.get(url, timeout=10) as response: if response.status in TRANSIENT_ERRORS and retries > 0: - logger.error("Transient HTTP error occurred: %s. Retrying...", response.status) + logger.error( + "Transient HTTP error occurred: %s. Retrying...", + response.status, + ) await asyncio.sleep(attempts + 1) return await fetch_url_async(url, retries - 1, attempts + 1) response.raise_for_status() data = await response.text() - return BeautifulSoup(data, 'lxml') + return BeautifulSoup(data, "lxml") except aiohttp.ClientResponseError as http_err: if response.status in TRANSIENT_ERRORS and retries > 0: logger.error("Transient HTTP error occurred: %s. Retrying...", http_err) diff --git a/src/tiny_web_crawler/networking/formatter.py b/src/tiny_web_crawler/networking/formatter.py index c57afe7..74f4adc 100644 --- a/src/tiny_web_crawler/networking/formatter.py +++ b/src/tiny_web_crawler/networking/formatter.py @@ -1,7 +1,9 @@ import urllib.parse + import validators -DEFAULT_SCHEME: str = 'http://' +DEFAULT_SCHEME: str = "http://" + def format_url(url: str, base_url: str, scheme: str = DEFAULT_SCHEME) -> str: """ @@ -16,7 +18,7 @@ def format_url(url: str, base_url: str, scheme: str = DEFAULT_SCHEME) -> str: str: The formatted URL. """ parsed_url = urllib.parse.urlparse(url) - base_url = base_url.rstrip('/') + base_url = base_url.rstrip("/") if parsed_url.scheme: scheme = parsed_url.scheme @@ -25,7 +27,7 @@ def format_url(url: str, base_url: str, scheme: str = DEFAULT_SCHEME) -> str: if validators.url(DEFAULT_SCHEME + parsed_url.path): return DEFAULT_SCHEME + parsed_url.path - if parsed_url.path.startswith('/'): + if parsed_url.path.startswith("/"): return base_url + parsed_url.path return f"{base_url}/{parsed_url.path}" diff --git a/src/tiny_web_crawler/networking/robots_txt.py b/src/tiny_web_crawler/networking/robots_txt.py index fd24d88..d25568a 100644 --- a/src/tiny_web_crawler/networking/robots_txt.py +++ b/src/tiny_web_crawler/networking/robots_txt.py @@ -1,8 +1,10 @@ import urllib.parse import urllib.robotparser from typing import Optional + import requests + def get_robots_txt_url(url: str) -> str: """ Returns a url to a robots.txt file from the provided url. @@ -16,10 +18,12 @@ def get_robots_txt_url(url: str) -> str: parsed_url = urllib.parse.urlparse(url) - return parsed_url.scheme + "://"+ parsed_url.netloc + "/robots.txt" + return parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt" -def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool: +def is_robots_txt_allowed( + url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None +) -> bool: """ Checks if the provided URL can be crawled, according to its corresponding robots.txt file @@ -46,7 +50,8 @@ def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFile robot_txt_url (str): The URL to the robots.txt file. Returns: - urllib.robotparser.RobotFileParser: The RobotFileParser object with the url already read. + urllib.robotparser.RobotFileParser: + The RobotFileParser object with the url already read. """ robot_parser = urllib.robotparser.RobotFileParser() diff --git a/src/tiny_web_crawler/networking/validator.py b/src/tiny_web_crawler/networking/validator.py index 3be7cbe..5f6cd3a 100644 --- a/src/tiny_web_crawler/networking/validator.py +++ b/src/tiny_web_crawler/networking/validator.py @@ -1,5 +1,6 @@ import validators + def is_valid_url(url: str) -> bool: """ Checks if the provided URL is valid. diff --git a/tests/logging/test_logging.py b/tests/logging/test_logging.py index 5637878..d02b6bd 100644 --- a/tests/logging/test_logging.py +++ b/tests/logging/test_logging.py @@ -1,9 +1,15 @@ import logging +from logging import DEBUG, ERROR, INFO + +import pytest import responses +from tiny_web_crawler import Spider, SpiderSettings +from tiny_web_crawler.logger import ( + LOGGER_NAME, + get_logger, + set_logging_level, +) -from tiny_web_crawler import Spider -from tiny_web_crawler import SpiderSettings -from tiny_web_crawler.logging import get_logger, set_logging_level, DEBUG, INFO, ERROR, LOGGER_NAME from tests.utils import setup_mock_response @@ -14,7 +20,7 @@ def test_get_logger() -> None: assert logger.name == LOGGER_NAME -def test_set_logging_level(caplog) -> None: # type: ignore +def test_set_logging_level(caplog: pytest.LogCaptureFixture) -> None: logger = get_logger() set_logging_level(ERROR) @@ -30,33 +36,20 @@ def test_set_logging_level(caplog) -> None: # type: ignore def test_verbose_logging_level() -> None: logger = get_logger() - Spider( - SpiderSettings(root_url="http://example.com", - verbose=True) - ) + Spider(SpiderSettings(root_url="http://example.com", verbose=True)) assert logger.getEffectiveLevel() == DEBUG - Spider( - SpiderSettings(root_url="http://example.com", - verbose=False) - ) + Spider(SpiderSettings(root_url="http://example.com", verbose=False)) assert logger.getEffectiveLevel() == INFO @responses.activate -def test_verbose_true(caplog) -> None: # type: ignore - setup_mock_response( - url="http://example.com", - body="
", - status=200 - ) +def test_verbose_true(caplog) -> None: # type: ignore + setup_mock_response(url="http://example.com", body="", status=200) - spider = Spider( - SpiderSettings(root_url="http://example.com", - verbose=True) - ) + spider = Spider(SpiderSettings(root_url="http://example.com", verbose=True)) spider.start() assert len(caplog.text) > 0 @@ -64,12 +57,8 @@ def test_verbose_true(caplog) -> None: # type: ignore @responses.activate -def test_verbose_false_no_errors(caplog) -> None: # type: ignore - setup_mock_response( - url="http://example.com", - body="", - status=200 - ) +def test_verbose_false_no_errors(caplog) -> None: # type: ignore + setup_mock_response(url="http://example.com", body="", status=200) spider = Spider(SpiderSettings(root_url="http://example.com", verbose=False)) spider.start() @@ -78,15 +67,14 @@ def test_verbose_false_no_errors(caplog) -> None: # type: ignore @responses.activate -def test_verbose_false_errors(caplog) -> None: # type: ignore +def test_verbose_false_errors(caplog) -> None: # type: ignore setup_mock_response( url="http://example.com", body="link", - status=200 + status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com", verbose=False)) + spider = Spider(SpiderSettings(root_url="http://example.com", verbose=False)) spider.start() assert "DEBUG" not in caplog.text diff --git a/tests/networking/test_fetcher.py b/tests/networking/test_fetcher.py index 2b89f17..9138ac4 100644 --- a/tests/networking/test_fetcher.py +++ b/tests/networking/test_fetcher.py @@ -1,4 +1,5 @@ import asyncio +from logging import ERROR from unittest.mock import patch import pytest @@ -7,7 +8,6 @@ from aiohttp import ClientConnectionError, ClientError from aioresponses import aioresponses from bs4 import BeautifulSoup -from tiny_web_crawler.logging import ERROR from tiny_web_crawler.networking.fetcher import fetch_url, fetch_url_async from tests.utils import setup_mock_response @@ -25,7 +25,7 @@ async def test_fetch_url_async_success() -> None: assert result is not None assert isinstance(result, BeautifulSoup) - assert result.find('a').text == "link" + assert result.find("a").text == "link" @pytest.mark.asyncio @@ -39,6 +39,7 @@ async def test_fetch_url_async_http_error() -> None: assert result is None + @pytest.mark.asyncio async def test_fetch_url_async_transient_error_retry() -> None: url = "http://example.com" @@ -52,7 +53,8 @@ async def test_fetch_url_async_transient_error_retry() -> None: assert result is not None assert isinstance(result, BeautifulSoup) - assert result.find('a').text == "link" + assert result.find("a").text == "link" + @pytest.mark.asyncio async def test_fetch_url_async_connection_error() -> None: @@ -65,6 +67,7 @@ async def test_fetch_url_async_connection_error() -> None: assert result is None + @pytest.mark.asyncio async def test_fetch_url_async_timeout_error() -> None: url = "http://example.com" @@ -76,6 +79,7 @@ async def test_fetch_url_async_timeout_error() -> None: assert result is None + @pytest.mark.asyncio async def test_fetch_url_async_request_exception() -> None: url = "http://example.com" @@ -87,12 +91,13 @@ async def test_fetch_url_async_request_exception() -> None: assert result is None + @responses.activate def test_fetch_url() -> None: setup_mock_response( url="http://example.com", body="link", - status=200 + status=200, ) resp = fetch_url("http://example.com", 1) @@ -102,8 +107,7 @@ def test_fetch_url() -> None: @responses.activate -def test_fetch_url_connection_error(caplog) -> None: # type: ignore - +def test_fetch_url_connection_error(caplog) -> None: # type: ignore with caplog.at_level(ERROR): # Fetch url whose response isn't mocked to raise ConnectionError resp = fetch_url("http://connection.error", 1) @@ -113,15 +117,15 @@ def test_fetch_url_connection_error(caplog) -> None: # type: ignore @responses.activate -def test_fetch_url_http_error(caplog) -> None: # type: ignore +def test_fetch_url_http_error(caplog) -> None: # type: ignore error_codes = [403, 404, 412] for error_code in error_codes: setup_mock_response( url=f"http://http.error/{error_code}", body="link", - status=error_code - ) + status=error_code, + ) with caplog.at_level(ERROR): resp = fetch_url(f"http://http.error/{error_code}", 1) @@ -131,12 +135,8 @@ def test_fetch_url_http_error(caplog) -> None: # type: ignore @responses.activate -def test_fetch_url_timeout_error(caplog) -> None: # type: ignore - setup_mock_response( - url="http://timeout.error", - body=requests.exceptions.Timeout(), - status=408 - ) +def test_fetch_url_timeout_error(caplog) -> None: # type: ignore + setup_mock_response(url="http://timeout.error", body=requests.exceptions.Timeout(), status=408) with caplog.at_level(ERROR): # Fetch url whose response isn't mocked to raise ConnectionError @@ -147,11 +147,11 @@ def test_fetch_url_timeout_error(caplog) -> None: # type: ignore @responses.activate -def test_fetch_url_requests_exception(caplog) -> None: # type: ignore +def test_fetch_url_requests_exception(caplog) -> None: # type: ignore setup_mock_response( url="http://requests.exception", body=requests.exceptions.RequestException(), - status=404 + status=404, ) with caplog.at_level(ERROR): @@ -164,11 +164,11 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore @patch("time.sleep") @responses.activate -def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: ignore +def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: ignore setup_mock_response( url="http://transient.error", body="link", - status=503 + status=503, ) max_retry_attempts = 5 @@ -191,11 +191,11 @@ def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: @patch("time.sleep") @responses.activate -def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: ignore +def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: ignore setup_mock_response( url="http://transient.error", body="link", - status=503 + status=503, ) max_retry_attempts = 10 @@ -218,16 +218,16 @@ def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: @patch("time.sleep") @responses.activate -def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore +def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore setup_mock_response( url="http://transient.error", body="link", - status=503 + status=503, ) setup_mock_response( url="http://transient.error", body="link", - status=200 + status=200, ) max_retry_attempts = 1 diff --git a/tests/networking/test_formatter.py b/tests/networking/test_formatter.py index 9d17fef..aba8233 100644 --- a/tests/networking/test_formatter.py +++ b/tests/networking/test_formatter.py @@ -1,6 +1,6 @@ import pytest +from tiny_web_crawler.networking.formatter import DEFAULT_SCHEME, format_url -from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME @pytest.mark.parametrize( "url, base_url, expected", @@ -10,7 +10,7 @@ ("path1/path2", "http://example.com", "http://example.com/path1/path2"), ("/path1/path2", "http://example.com", "http://example.com/path1/path2"), ("path.com", "http://example.com", f"{DEFAULT_SCHEME}path.com"), - ] + ], ) def test_format_url(url: str, base_url: str, expected: str) -> None: assert format_url(url, base_url) == expected diff --git a/tests/networking/test_robots_txt.py b/tests/networking/test_robots_txt.py index 0feb9ad..a7593ea 100644 --- a/tests/networking/test_robots_txt.py +++ b/tests/networking/test_robots_txt.py @@ -1,10 +1,14 @@ -from unittest.mock import patch, MagicMock -from io import BytesIO import urllib.robotparser +from io import BytesIO +from unittest.mock import MagicMock, patch import pytest +from tiny_web_crawler.networking.robots_txt import ( + get_robots_txt_url, + is_robots_txt_allowed, + setup_robots_txt_parser, +) -from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser @pytest.mark.parametrize( "url, expected", @@ -15,13 +19,13 @@ ("http://example/path1/path2/path3/path4", "http://example/robots.txt"), ("http://example/path#fragment", "http://example/robots.txt"), ("http://example/path?query=test", "http://example/robots.txt"), - ] + ], ) def test_get_robots_txt_url(url: str, expected: str) -> None: assert get_robots_txt_url(url) == expected -@patch('urllib.request.urlopen') +@patch("urllib.request.urlopen") def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None: # Mock the response content of robots.txt mock_response = b"User-agent: *\nAllow: /" @@ -30,7 +34,7 @@ def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None: assert is_robots_txt_allowed("http://example.com") -@patch('urllib.request.urlopen') +@patch("urllib.request.urlopen") def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None: # Mock the response content of robots.txt mock_response = b"User-agent: *\nDisallow: /" @@ -39,7 +43,7 @@ def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None: assert not is_robots_txt_allowed("http://example.com") -@patch('urllib.request.urlopen') +@patch("urllib.request.urlopen") def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None: # Mock the response content of robots.txt mock_response = b"User-agent: *\nDisallow: /private" diff --git a/tests/networking/test_validator.py b/tests/networking/test_validator.py index caf417c..686cbc7 100644 --- a/tests/networking/test_validator.py +++ b/tests/networking/test_validator.py @@ -1,5 +1,6 @@ from tiny_web_crawler.networking.validator import is_valid_url + def test_is_valid_url() -> None: assert is_valid_url("http://example.com") is True - assert is_valid_url('invalid') is False + assert is_valid_url("invalid") is False diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 7acf0d2..4357b52 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,16 +1,15 @@ +import urllib.error from io import BytesIO +from logging import DEBUG, ERROR, WARNING from unittest.mock import MagicMock, mock_open, patch -import urllib.error - -import responses import pytest +import responses +from tiny_web_crawler import Spider, SpiderSettings -from tiny_web_crawler import Spider -from tiny_web_crawler import SpiderSettings -from tiny_web_crawler.logging import DEBUG, WARNING, ERROR from tests.utils import setup_mock_response + @responses.activate def test_crawl() -> None: setup_mock_response( @@ -24,30 +23,21 @@ def test_crawl() -> None: status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com", - max_links=10) - ) + spider = Spider(SpiderSettings(root_url="http://example.com", max_links=10)) spider.crawl("http://example.com") assert "http://example.com" in spider.crawl_result - assert spider.crawl_result["http://example.com"]["urls"] == [ - "http://example.com/test" - ] + assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/test"] spider.crawl("http://example.com/test") assert "http://example.com/test" in spider.crawl_result - assert spider.crawl_result["http://example.com/test"]["urls"] == [ - "http://example.com" - ] + assert spider.crawl_result["http://example.com/test"]["urls"] == ["http://example.com"] @responses.activate def test_crawl_invalid_url(caplog) -> None: # type: ignore - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) with caplog.at_level(DEBUG): spider.crawl("invalid_url") @@ -64,18 +54,14 @@ def test_crawl_already_crawled_url(caplog) -> None: # type: ignore status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) with caplog.at_level(DEBUG): spider.crawl("http://example.com") spider.crawl("http://example.com") assert "URL already crawled:" in caplog.text - assert spider.crawl_result == { - "http://example.com": {"urls": ["http://example.com"]} - } + assert spider.crawl_result == {"http://example.com": {"urls": ["http://example.com"]}} @responses.activate @@ -86,9 +72,7 @@ def test_crawl_unfetchable_url() -> None: status=404, ) - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) spider.crawl("http://example.com") assert spider.crawl_result == {} @@ -102,9 +86,7 @@ def test_crawl_found_invalid_url(caplog) -> None: # type: ignore status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) with caplog.at_level(DEBUG): spider.crawl("http://example.com") @@ -122,25 +104,17 @@ def test_crawl_found_duplicate_url() -> None: status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) spider.crawl("http://example.com") - assert spider.crawl_result == { - "http://example.com": {"urls": ["http://duplicate.com"]} - } + assert spider.crawl_result == {"http://example.com": {"urls": ["http://duplicate.com"]}} @responses.activate def test_crawl_no_urls_in_page() -> None: - setup_mock_response( - url="http://example.com", body="", status=200 - ) + setup_mock_response(url="http://example.com", body="", status=200) - spider = Spider( - SpiderSettings(root_url="http://example.com") - ) + spider = Spider(SpiderSettings(root_url="http://example.com")) spider.crawl("http://example.com") assert spider.crawl_result == {"http://example.com": {"urls": []}} @@ -149,10 +123,8 @@ def test_crawl_no_urls_in_page() -> None: @responses.activate def test_save_results() -> None: spider = Spider( - SpiderSettings(root_url="http://example.com", - max_links=10, - save_to_file="out.json") - ) + SpiderSettings(root_url="http://example.com", max_links=10, save_to_file="out.json") + ) spider.crawl_result = {"http://example.com": {"urls": ["http://example.com/test"]}} with patch("builtins.open", mock_open()) as mocked_file: @@ -173,20 +145,12 @@ def test_url_regex() -> None: # And only have numeric characters after it regex = r"http://example\.com/[0-9]+" - spider = Spider( - SpiderSettings(root_url="http://example.com", - url_regex=regex) - ) + spider = Spider(SpiderSettings(root_url="http://example.com", url_regex=regex)) spider.start() - assert spider.crawl_result["http://example.com"]["urls"] == [ - "http://example.com/123" - ] + assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/123"] - assert ( - "http://example.com/test" - not in spider.crawl_result["http://example.com"]["urls"] - ) + assert "http://example.com/test" not in spider.crawl_result["http://example.com"]["urls"] @responses.activate @@ -202,10 +166,7 @@ def test_include_body() -> None: status=200, ) - spider = Spider( - SpiderSettings(root_url="http://example.com", - include_body=True) - ) + spider = Spider(SpiderSettings(root_url="http://example.com", include_body=True)) spider.start() assert ( @@ -219,18 +180,15 @@ def test_include_body() -> None: @responses.activate -def test_internal_links_only(caplog) -> None: # type: ignore +def test_internal_links_only(caplog) -> None: # type: ignore setup_mock_response( url="http://internal.com", body="link" - +"link", + + "link", status=200, ) - spider = Spider( - SpiderSettings(root_url="http://internal.com", - internal_links_only=True) - ) + spider = Spider(SpiderSettings(root_url="http://internal.com", internal_links_only=True)) with caplog.at_level(DEBUG): spider.crawl("http://internal.com") @@ -240,18 +198,15 @@ def test_internal_links_only(caplog) -> None: # type: ignore @responses.activate -def test_external_links_only(caplog) -> None: # type: ignore +def test_external_links_only(caplog) -> None: # type: ignore setup_mock_response( url="http://internal.com", body="link" - +"link", + + "link