Merge pull request #48 from Mews/use-settings-classes
indrajithi authored Jun 21, 2024
2 parents bdd8c4b + 8ee3b37 commit 391e460
Showing 7 changed files with 197 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -288,7 +288,7 @@ ignored-parents=
max-args=10

# Maximum number of attributes for a class (see R0902).
max-attributes=17
max-attributes=15

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
25 changes: 18 additions & 7 deletions README.md
@@ -29,20 +29,31 @@ pip install tiny-web-crawler
## Usage

```python
from tiny_web_crawler.crawler import Spider
from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings

root_url = 'http://github.com'
max_links = 2
settings = SpiderSettings(
root_url = 'http://github.com',
max_links = 2
)

crawl = Spider(root_url, max_links)
crawl.start()
spider = Spider(settings)
spider.start()


# Set workers and delay (default: delay is 0.5 sec and verbose is True)
# If you do not want delay, set delay=0

crawl = Spider(root_url='https://github.com', max_links=5, max_workers=5, delay=1, verbose=False)
crawl.start()
settings = SpiderSettings(
root_url = 'https://github.com',
max_links = 5,
max_workers = 5,
delay = 1,
verbose = False
)

spider = Spider(settings)
spider.start()

```

1 change: 1 addition & 0 deletions src/tiny_web_crawler/__init__.py
@@ -1 +1,2 @@
from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler.core.spider_settings import SpiderSettings
68 changes: 22 additions & 46 deletions src/tiny_web_crawler/core/spider.py
@@ -5,12 +5,13 @@
import time
import re

from typing import Dict, List, Optional, Set, Any
from typing import Dict, List, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.core.spider_settings import SpiderSettings
from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
@@ -26,53 +27,28 @@ class Spider:
A simple web crawler class.
Attributes:
root_url (str): The root URL to start crawling from.
max_links (int): The maximum number of links to crawl.
crawl_result (Dict[str, Dict[str, Any]): The dictionary storing the crawl results.
crawl_set (Set[str]): A set of URLs to be crawled.
link_count (int): The current count of crawled links.
save_to_file (Optional[str]): The file path to save the crawl results.
max_workers (int): Max count of concurrent workers
delay (float): request delay
url_regex (Optional[str]): A regular expression against which urls will be matched before crawling
include_body (bool): Whether or not to include the crawled page's body in crawl_result (default: False)
internal_links_only (bool): Whether or not to crawl only internal links
external_links_only (bool): Whether or not to crawl only external links
respect_robots_txt (bool): Whether or not to respect website's robots.txt files (defualt: True)
settings (SpiderSettings): The SpiderSettings object with the settings for the Spider object
"""

root_url: str
root_netloc: str = field(init=False)
max_links: int = 5
save_to_file: Optional[str] = None
max_workers: int = 1
delay: float = 0.5
verbose: bool = True
settings: SpiderSettings

crawl_result: Dict[str, Dict[str, Any]] = field(default_factory=dict)
crawl_set: Set[str] = field(default_factory=set)
link_count: int = 0
url_regex: Optional[str] = None
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme: str = DEFAULT_SCHEME

self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
self.root_netloc: str = urllib.parse.urlparse(self.settings.root_url).netloc

if self.verbose:
if self.settings.verbose:
set_logging_level(DEBUG)
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
if not self.settings.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
@@ -85,8 +61,8 @@ def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
"""
if self.save_to_file:
with open(self.save_to_file, 'w', encoding='utf-8') as file:
if self.settings.save_to_file:
with open(self.settings.save_to_file, 'w', encoding='utf-8') as file:
json.dump(self.crawl_result, file, indent=4)

def crawl(self, url: str) -> None:
@@ -104,7 +80,7 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
if self.settings.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

@@ -116,7 +92,7 @@ def crawl(self, url: str) -> None:
links = soup.body.find_all('a', href=True) if soup.body else []
self.crawl_result[url] = {'urls': []}

if self.include_body:
if self.settings.include_body:
self.crawl_result[url]['body'] = str(soup)

for link in links:
@@ -129,7 +105,7 @@ def crawl(self, url: str) -> None:
self.crawl_set.add(pretty_url)
logger.debug("Link found: %s", pretty_url)

if self.link_count < self.max_links:
if self.link_count < self.settings.max_links:
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

@@ -141,15 +117,15 @@ def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
if self.settings.url_regex and not re.compile(self.settings.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
if self.settings.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
if self.settings.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

@@ -182,21 +158,21 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
Returns:
Dict[str, Dict[str, List[str]]]: The crawl results.
"""
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.crawl, self.root_url)}
with ThreadPoolExecutor(max_workers=self.settings.max_workers) as executor:
futures = {executor.submit(self.crawl, self.settings.root_url)}

while self.link_count < self.max_links and futures:
while self.link_count < self.settings.max_links and futures:
for future in as_completed(futures):
futures.remove(future)
if future.exception() is None:
while self.link_count < self.max_links and self.crawl_set:
while self.link_count < self.settings.max_links and self.crawl_set:
url = self.crawl_set.pop()
if url not in self.crawl_result:
futures.add(executor.submit(self.crawl, url))
time.sleep(self.delay)
time.sleep(self.settings.delay)
break # Break to check the next future

if self.save_to_file:
if self.settings.save_to_file:
self.save_results()
logger.debug("Exiting....")
return self.crawl_result
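With this change, `Spider` no longer accepts individual keyword arguments such as `root_url` or `max_links`; every option is read from a single `SpiderSettings` object via `self.settings.*`. A minimal sketch of a migrated call site, consistent with the diff above (the output file name is illustrative):

```python
from tiny_web_crawler import Spider, SpiderSettings

# Before: Spider(root_url="https://github.com", max_links=5, delay=1)
# After: all configuration is grouped on a SpiderSettings instance.
settings = SpiderSettings(
    root_url="https://github.com",
    max_links=5,
    max_workers=5,
    delay=1,
    save_to_file="crawl_results.json",  # optional; save_results() dumps crawl_result as JSON
)

spider = Spider(settings)
results = spider.start()  # Dict[str, Dict[str, List[str]]], also written to crawl_results.json
```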
55 changes: 55 additions & 0 deletions src/tiny_web_crawler/core/spider_settings.py
@@ -0,0 +1,55 @@
from typing import Optional

from dataclasses import dataclass

@dataclass
class GeneralSettings:
"""
A simple dataclass to store general settings for the Spider class
Attributes:
root_url (str): The root URL to start crawling from.
max_links (int): The maximum number of links to crawl. (Default: 5)
save_to_file (Optional[str]): The file path to save the crawl results.
max_workers (int): Max count of concurrent workers. (Default: 1)
delay (float): Delay between requests. (Default: 0.5)
verbose (bool): Whether or not to print debug messages (Default: True)
"""

root_url: str = ""
max_links: int = 5
save_to_file: Optional[str] = None
max_workers: int = 1
delay: float = 0.5
verbose: bool = True

@dataclass
class CrawlSettings:
"""
A simple dataclass to store crawl settings for the Spider class
Attributes:
url_regex (Optional[str]): A regular expression against which urls will be matched before crawling
include_body (bool): Whether or not to include the crawled page's body in crawl_result (Default: False)
internal_links_only (bool): Whether or not to crawl only internal links (Default: False)
external_links_only (bool): Whether or not to crawl only external links (Default: False)
respect_robots_txt (bool): Whether or not to respect website's robots.txt files (Default: True)
"""
url_regex: Optional[str] = None
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

@dataclass
class SpiderSettings(GeneralSettings, CrawlSettings):
"""
A simple dataclass that stores all the settings for the Spider class
"""

def __post_init__(self) -> None:
if self.root_url == "":
raise ValueError("\"root_url\" argument is required")

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
23 changes: 17 additions & 6 deletions tests/logging/test_logging.py
@@ -1,7 +1,8 @@
import logging
import responses

from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings
from tiny_web_crawler.logging import get_logger, set_logging_level, DEBUG, INFO, ERROR, LOGGER_NAME
from tests.utils import setup_mock_response

@@ -29,11 +30,17 @@ def test_set_logging_level(caplog) -> None: # type: ignore
def test_verbose_logging_level() -> None:
logger = get_logger()

spider = Spider("http://example.com", verbose=True) # pylint: disable=unused-variable
spider = Spider( # pylint: disable=unused-variable
SpiderSettings(root_url="http://example.com",
verbose=True)
)

assert logger.getEffectiveLevel() == DEBUG

spider = Spider("http://example.com", verbose=False) # pylint: disable=unused-variable
spider = Spider( # pylint: disable=unused-variable
SpiderSettings(root_url="http://example.com",
verbose=False)
)

assert logger.getEffectiveLevel() == INFO

@@ -46,7 +53,10 @@ def test_verbose_true(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=True)
spider = Spider(
SpiderSettings(root_url="http://example.com",
verbose=True)
)
spider.start()

assert len(caplog.text) > 0
@@ -61,7 +71,7 @@ def test_verbose_false_no_errors(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=False)
spider = Spider(SpiderSettings(root_url="http://example.com", verbose=False))
spider.start()

assert len(caplog.text) == 0
@@ -75,7 +85,8 @@ def test_verbose_false_errors(caplog) -> None: # type: ignore
status=200
)

spider = Spider("http://example.com", verbose=False)
spider = Spider(
SpiderSettings(root_url="http://example.com", verbose=False))
spider.start()

assert "DEBUG" not in caplog.text
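The diff above only migrates the existing logging tests to the new constructor. The validation added in `SpiderSettings.__post_init__` could be exercised with tests along these lines (a hypothetical sketch, not part of this commit):

```python
import pytest

from tiny_web_crawler import SpiderSettings


def test_root_url_is_required() -> None:
    with pytest.raises(ValueError):
        SpiderSettings()


def test_internal_and_external_links_only_are_mutually_exclusive() -> None:
    with pytest.raises(ValueError):
        SpiderSettings(
            root_url="http://example.com",
            internal_links_only=True,
            external_links_only=True,
        )
```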