Commit
Handle negative statuses in web scrape tool
Shulyaka committed Dec 19, 2024
1 parent c28c5ae commit bda1562
Showing 2 changed files with 36 additions and 20 deletions.
47 changes: 30 additions & 17 deletions custom_components/powerllm/tools/web_scrape.py
@@ -31,20 +31,33 @@ def setup(hass: HomeAssistant):
 
     @llm_tool(hass)
     def web_scrape(url: str):
-        """Get latest content of a web page."""
-        downloaded = trafilatura.fetch_url(url=url)
-
-        parsed = trafilatura.extract(
-            downloaded,
-            url=url,
-            output_format="json",
-            include_links=True,
-            deduplicate=True,
-            favor_precision=False,
-            favor_recall=True,
-            with_metadata=True,
-        )
-
-        result = json.loads(parsed)
-
-        return {k: v for k, v in result.items() if v and k not in REMOVE_KEYS}
+        """Get text from a web page.
+
+        Use it to get up-to-date information from the internet.
+        """
+        downloaded = trafilatura.fetch_response(url=url)
+
+        if downloaded.data:
+            parsed = trafilatura.extract(
+                downloaded.data,
+                url=downloaded.url,
+                output_format="json",
+                include_links=True,
+                deduplicate=True,
+                favor_precision=False,
+                favor_recall=True,
+                with_metadata=True,
+            )
+            result = json.loads(parsed)
+        else:
+            result = {"error": "No data downloaded."}
+
+        result = {k: v for k, v in result.items() if v and k not in REMOVE_KEYS}
+
+        if downloaded.url != url:
+            result["url"] = downloaded.url
+
+        if downloaded.status != 200:
+            result["status"] = downloaded.status
+
+        return result
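
As a quick illustration of what the new control flow produces (a sketch based only on the branches above, not part of the commit): the concrete titles, URLs, and the -1 status are made-up placeholders, and the idea that the fetcher reports transport failures as negative codes is taken from the commit title rather than verified against trafilatura here.

# Possible result shapes of the rewritten web_scrape tool.

# 1. Successful fetch, no redirect, HTTP 200: only non-empty extracted fields
#    outside REMOVE_KEYS are returned; "url" and "status" are omitted.
ok = {"title": "Hello World", "text": "Hello, World!"}

# 2. Fetch that followed a redirect: the final URL is added so the caller
#    knows where the content actually came from.
redirected = {"title": "Hello World", "text": "Hello, World!",
              "url": "https://example.com/moved"}

# 3. Failed download (no body): the error text and the raw status are
#    returned instead of extracted content.
failed = {"error": "No data downloaded.", "status": -1}

for case in (ok, redirected, failed):
    print(case)
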
9 changes: 6 additions & 3 deletions tests/tools/test_web_scrape.py
@@ -1,6 +1,6 @@
 """Test web scrape tool."""
 
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 
 def test_test(hass):
@@ -10,7 +10,10 @@ def test_test(hass):
 async def test_web_scrape_tool(async_call_tool) -> None:
     """Test web scrape tool."""
 
-    helloworld = """<!DOCTYPE html>
+    helloworld = MagicMock()
+    helloworld.status = 200
+    helloworld.url = "example.com"
+    helloworld.data = """<!DOCTYPE html>
 <html lang="en">
 <head>
     <meta charset="UTF-8">
@@ -24,7 +27,7 @@ async def test_web_scrape_tool(async_call_tool) -> None:
 </html>
 """
 
-    with patch("trafilatura.fetch_url", return_value=helloworld) as mock_fetch:
+    with patch("trafilatura.fetch_response", return_value=helloworld) as mock_fetch:
         response = await async_call_tool("web_scrape", url="example.com")
 
     mock_fetch.assert_called_once_with(url="example.com")
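
The updated test still only exercises a successful 200 fetch. A companion test for the failure branch might look like the sketch below; it is hypothetical and assumes the same async_call_tool fixture used above, that "error" is not in REMOVE_KEYS, and that the fixture returns the tool's dict unchanged. The -1 status is a placeholder for whatever negative value the fetcher reports.

from unittest.mock import MagicMock, patch


async def test_web_scrape_tool_failed_download(async_call_tool) -> None:
    """Sketch: a download with no data should surface the error and the status."""
    failed = MagicMock()
    failed.status = -1          # placeholder negative status
    failed.url = "example.com"  # unchanged URL, so no "url" key is added
    failed.data = None          # nothing downloaded, takes the error branch

    with patch("trafilatura.fetch_response", return_value=failed) as mock_fetch:
        response = await async_call_tool("web_scrape", url="example.com")

    mock_fetch.assert_called_once_with(url="example.com")
    assert response == {"error": "No data downloaded.", "status": -1}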
