Url regex matching #16

Merged · 8 commits · Jun 16, 2024
23 changes: 23 additions & 0 deletions tests/test_crawler.py
@@ -66,6 +66,29 @@ def test_save_results() -> None:
    mocked_file.assert_called_once_with("out.json", "w", encoding='utf-8')


@responses.activate
def test_url_regex() -> None:
    # Mock HTTP response
    responses.add(
        responses.GET,
        "http://example.com",
        body="<html><body><a href='http://example.com/123'>link</a><a href='http://example.com/test'>link</a></body></html>",
        status=200,
        content_type="text/html",
    )

    # This regex matches URLs that start with "http://example.com/"
    # followed only by numeric characters
    regex = r"http://example\.com/[0-9]+"

    spider = Spider("http://example.com", 0, url_regex=regex)
    spider.start()

    assert spider.crawl_result["http://example.com"]["urls"] == ["http://example.com/123"]

    assert "http://example.com/test" not in spider.crawl_result["http://example.com"]["urls"]


@patch.object(Spider, "crawl")
@patch.object(Spider, "save_results")
def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None:
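One subtlety in the pattern used by this test: the crawler filters with Python's re.match, which anchors only at the start of the string, so any URL whose prefix matches the pattern is accepted. A minimal sketch of the difference, using only standard-library behavior (the URLs here are illustrative):

import re

regex = r"http://example\.com/[0-9]+"

re.match(regex, "http://example.com/123")         # match: digits follow the prefix
re.match(regex, "http://example.com/123abc")      # also a match: only the prefix is checked
re.fullmatch(regex, "http://example.com/123abc")  # None: the whole string must match

Callers who want an exact match can anchor the pattern themselves, e.g. r"http://example\.com/[0-9]+$".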
8 changes: 8 additions & 0 deletions tiny_web_crawler/crawler.py
@@ -6,6 +6,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

import time
import re
import requests
import validators
from bs4 import BeautifulSoup
@@ -29,6 +30,7 @@ class Spider():
        save_to_file (Optional[str]): The file path to save the crawl results.
        max_workers (int): Maximum number of concurrent workers.
        delay (float): Delay between requests.
        url_regex (Optional[str]): A regular expression that URLs must match before they are crawled.
    """

    root_url: str
@@ -41,6 +43,7 @@ class Spider():
    crawl_set: Set[str] = field(default_factory=set)
    link_count: int = 0
    scheme: str = field(default=DEFAULT_SCHEME, init=False)
    url_regex: Optional[str] = None

    def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
        """
@@ -152,6 +155,11 @@ def crawl(self, url: str) -> None:
            if pretty_url in self.crawl_result[url]['urls']:
                continue

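            # Skip any discovered URL that does not match the optional url_regex filter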
            if self.url_regex:
                if not re.match(self.url_regex, pretty_url):
                    self.verbose_print(Fore.YELLOW + f"Skipping: URL didn't match regex: {pretty_url}")
                    continue

            self.crawl_result[url]['urls'].append(pretty_url)
            self.crawl_set.add(pretty_url)
            self.verbose_print(Fore.BLUE + f"Link found: {pretty_url}")
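Taken together, the change lets callers restrict the crawl to URLs matching a pattern. A minimal usage sketch, assuming Spider is importable from tiny_web_crawler.crawler and that the second positional argument is the crawl depth, as in the test above (the pattern here is illustrative):

from tiny_web_crawler.crawler import Spider

# Only follow links whose URL is http://example.com/ followed by digits;
# everything else is skipped.
spider = Spider("http://example.com", 0, url_regex=r"http://example\.com/[0-9]+")
spider.start()

print(spider.crawl_result)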