Merge pull request #48 from Mews/use-settings-classes
indrajithi authored Jun 21, 2024
2 parents bdd8c4b + 8ee3b37 commit 391e460
Showing 7 changed files with 197 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -288,7 +288,7 @@ ignored-parents=
max-args=10

# Maximum number of attributes for a class (see R0902).
max-attributes=17
max-attributes=15

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
25 changes: 18 additions & 7 deletions README.md
@@ -29,20 +29,31 @@ pip install tiny-web-crawler
## Usage

```python
from tiny_web_crawler.crawler import Spider
from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings

root_url = 'http://github.com'
max_links = 2
settings = SpiderSettings(
root_url = 'http://github.com',
max_links = 2
)

crawl = Spider(root_url, max_links)
crawl.start()
spider = Spider(settings)
spider.start()


# Set workers and delay (default: delay is 0.5 sec and verbose is True)
# If you do not want delay, set delay=0

crawl = Spider(root_url='https://github.com', max_links=5, max_workers=5, delay=1, verbose=False)
crawl.start()
settings = SpiderSettings(
root_url = 'https://github.com',
max_links = 5,
max_workers = 5,
delay = 1,
verbose = False
)

spider = Spider(settings)
spider.start()

```

1 change: 1 addition & 0 deletions src/tiny_web_crawler/__init__.py
@@ -1 +1,2 @@
from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler.core.spider_settings import SpiderSettings
68 changes: 22 additions & 46 deletions src/tiny_web_crawler/core/spider.py
@@ -5,12 +5,13 @@
import time
import re

from typing import Dict, List, Optional, Set, Any
from typing import Dict, List, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.core.spider_settings import SpiderSettings
from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
@@ -26,53 +27,28 @@ class Spider:
A simple web crawler class.
Attributes:
root_url (str): The root URL to start crawling from.
max_links (int): The maximum number of links to crawl.
crawl_result (Dict[str, Dict[str, Any]): The dictionary storing the crawl results.
crawl_set (Set[str]): A set of URLs to be crawled.
link_count (int): The current count of crawled links.
save_to_file (Optional[str]): The file path to save the crawl results.
max_workers (int): Max count of concurrent workers
delay (float): request delay
url_regex (Optional[str]): A regular expression against which urls will be matched before crawling
include_body (bool): Whether or not to include the crawled page's body in crawl_result (default: False)
internal_links_only (bool): Whether or not to crawl only internal links
external_links_only (bool): Whether or not to crawl only external links
respect_robots_txt (bool): Whether or not to respect website's robots.txt files (defualt: True)
settings (SpiderSettings): The SpiderSettings object with the settings for the Spider object
"""

root_url: str
root_netloc: str = field(init=False)
max_links: int = 5
save_to_file: Optional[str] = None
max_workers: int = 1
delay: float = 0.5
verbose: bool = True
settings: SpiderSettings

crawl_result: Dict[str, Dict[str, Any]] = field(default_factory=dict)
crawl_set: Set[str] = field(default_factory=set)
link_count: int = 0
url_regex: Optional[str] = None
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme: str = DEFAULT_SCHEME

self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
self.root_netloc: str = urllib.parse.urlparse(self.settings.root_url).netloc

if self.verbose:
if self.settings.verbose:
set_logging_level(DEBUG)
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
if not self.settings.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
@@ -85,8 +61,8 @@ def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
"""
if self.save_to_file:
with open(self.save_to_file, 'w', encoding='utf-8') as file:
if self.settings.save_to_file:
with open(self.settings.save_to_file, 'w', encoding='utf-8') as file:
json.dump(self.crawl_result, file, indent=4)

def crawl(self, url: str) -> None:
@@ -104,7 +80,7 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
if self.settings.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

@@ -116,7 +92,7 @@ def crawl(self, url: str) -> None:
links = soup.body.find_all('a', href=True) if soup.body else []
self.crawl_result[url] = {'urls': []}

if self.include_body:
if self.settings.include_body:
self.crawl_result[url]['body'] = str(soup)

for link in links:
@@ -129,7 +105,7 @@ def crawl(self, url: str) -> None:
self.crawl_set.add(pretty_url)
logger.debug("Link found: %s", pretty_url)

if self.link_count < self.max_links:
if self.link_count < self.settings.max_links:
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

@@ -141,15 +117,15 @@ def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
if self.settings.url_regex and not re.compile(self.settings.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
if self.settings.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
if self.settings.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

@@ -182,21 +158,21 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
Returns:
Dict[str, Dict[str, List[str]]]: The crawl results.
"""
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.crawl, self.root_url)}
with ThreadPoolExecutor(max_workers=self.settings.max_workers) as executor:
futures = {executor.submit(self.crawl, self.settings.root_url)}

while self.link_count < self.max_links and futures:
while self.link_count < self.settings.max_links and futures:
for future in as_completed(futures):
futures.remove(future)
if future.exception() is None:
while self.link_count < self.max_links and self.crawl_set:
while self.link_count < self.settings.max_links and self.crawl_set:
url = self.crawl_set.pop()
if url not in self.crawl_result:
futures.add(executor.submit(self.crawl, url))
time.sleep(self.delay)
time.sleep(self.settings.delay)
break # Break to check the next future

if self.save_to_file:
if self.settings.save_to_file:
self.save_results()
logger.debug("Exiting....")
return self.crawl_result
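With this change, `Spider` no longer accepts individual keyword arguments such as `root_url` or `max_links`; every option is read from a single `SpiderSettings` object via `self.settings.*`. A minimal sketch of a migrated call site, consistent with the diff above (the output file name is illustrative):

```python
from tiny_web_crawler import Spider, SpiderSettings

# Before: Spider(root_url="https://github.com", max_links=5, delay=1)
# After: all configuration is grouped on a SpiderSettings instance.
settings = SpiderSettings(
    root_url="https://github.com",
    max_links=5,
    max_workers=5,
    delay=1,
    save_to_file="crawl_results.json",  # optional; save_results() dumps crawl_result as JSON
)

spider = Spider(settings)
results = spider.start()  # Dict[str, Dict[str, List[str]]], also written to crawl_results.json
```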
55 changes: 55 additions & 0 deletions src/tiny_web_crawler/core/spider_settings.py
@@ -0,0 +1,55 @@
from typing import Optional

from dataclasses import dataclass

@dataclass
class GeneralSettings:
"""
A simple dataclass to store general settings for the Spider class
Attributes:
root_url (str): The root URL to start crawling from.
max_links (int): The maximum number of links to crawl. (Default: 5)
save_to_file (Optional[str]): The file path to save the crawl results.
max_workers (int): Max count of concurrent workers. (Default: 1)
delay (float): Delay between requests. (Default: 0.5)
verbose (bool): Whether or not to print debug messages (Default: True)
"""

root_url: str = ""
max_links: int = 5
save_to_file: Optional[str] = None
max_workers: int = 1
delay: float = 0.5
verbose: bool = True

@dataclass
class CrawlSettings:
"""
A simple dataclass to store crawl settings for the Spider class
Attributes:
url_regex (Optional[str]): A regular expression against which urls will be matched before crawling
include_body (bool): Whether or not to include the crawled page's body in crawl_result (Default: False)
internal_links_only (bool): Whether or not to crawl only internal links (Default: False)
external_links_only (bool): Whether or not to crawl only external links (Default: False)
respect_robots_txt (bool): Whether or not to respect website's robots.txt files (Default: True)
"""
url_regex: Optional[str] = None
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

@dataclass
class SpiderSettings(GeneralSettings, CrawlSettings):
"""
A simple dataclass that stores all the settings for the Spider class
"""

def __post_init__(self) -> None:
if self.root_url == "":
raise ValueError("\"root_url\" argument is required")

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
23 changes: 17 additions & 6 deletions tests/logging/test_logging.py
@@ -1,7 +1,8 @@
import logging
import responses

from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings
from tiny_web_crawler.logging import get_logger, set_logging_level, DEBUG, INFO, ERROR, LOGGER_NAME
from tests.utils import setup_mock_response

@@ -29,11 +30,17 @@ def test_set_logging_level(caplog) -> None: # type: ignore
def test_verbose_logging_level() -> None:
logger = get_logger()

spider = Spider("http://example.com", verbose=True) # pylint: disable=unused-variable
spider = Spider( # pylint: disable=unused-variable
SpiderSettings(root_url="http://example.com",
verbose=True)
)

assert logger.getEffectiveLevel() == DEBUG

spider = Spider("http://example.com", verbose=False) # pylint: disable=unused-variable
spider = Spider( # pylint: disable=unused-variable
SpiderSettings(root_url="http://example.com",
verbose=False)
)

assert logger.getEffectiveLevel() == INFO

@@ -46,7 +53,10 @@ def test_verbose_true(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=True)
spider = Spider(
SpiderSettings(root_url="http://example.com",
verbose=True)
)
spider.start()

assert len(caplog.text) > 0
@@ -61,7 +71,7 @@ def test_verbose_false_no_errors(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=False)
spider = Spider(SpiderSettings(root_url="http://example.com", verbose=False))
spider.start()

assert len(caplog.text) == 0
@@ -75,7 +85,8 @@ def test_verbose_false_errors(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=False)
spider = Spider(
SpiderSettings(root_url="http://example.com", verbose=False))
spider.start()

assert "DEBUG" not in caplog.text
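The diff above only migrates the existing logging tests to the new constructor. The validation added in `SpiderSettings.__post_init__` could be exercised with tests along these lines (a hypothetical sketch, not part of this commit):

```python
import pytest

from tiny_web_crawler import SpiderSettings


def test_root_url_is_required() -> None:
    with pytest.raises(ValueError):
        SpiderSettings()


def test_internal_and_external_links_only_are_mutually_exclusive() -> None:
    with pytest.raises(ValueError):
        SpiderSettings(
            root_url="http://example.com",
            internal_links_only=True,
            external_links_only=True,
        )
```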