Skip to content

Commit

Permalink
chore: use non-chrome browser for web content
Browse files Browse the repository at this point in the history
  • Loading branch information
zhudotexe committed Dec 4, 2024
1 parent 89475bf commit 3e38db2
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 73 deletions.
74 changes: 5 additions & 69 deletions redel/tools/browsing/impl.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import asyncio
import contextlib
import logging
import tempfile
from typing import Optional, TYPE_CHECKING
Expand All @@ -12,12 +11,6 @@
import httpx
import pymupdf
import pymupdf4llm
from playwright.async_api import (
BrowserContext,
TimeoutError as PlaywrightTimeoutError,
async_playwright,
Error as PlaywrightError,
)
except ImportError:
raise ImportError(
"You are missing required dependencies to use the bundled tools. Please install ReDel using `pip install"
Expand All @@ -40,11 +33,6 @@ class Browsing(ToolBase):
Renders webpages in Markdown and has basic support for reading PDFs.
"""

# app-global browser instance
playwright = None
browser = None
browser_context = None

def __init__(
self,
*args,
Expand Down Expand Up @@ -78,51 +66,6 @@ def __init__(
"text/": self.html_content,
}

# === resources + app lifecycle ===
# noinspection PyMethodMayBeStatic
async def get_browser(self, **kwargs) -> BrowserContext:
    """Return the shared browser context, launching it lazily on first use.

    The Playwright driver, browser, and context are stored as class attributes
    so every Browsing instance in the app shares one browser. ``kwargs`` are
    forwarded to the browser launch call (only used on the first call).
    """
    cls = Browsing
    if cls.playwright is None:
        cls.playwright = await async_playwright().start()
    if cls.browser is None:
        cls.browser = await cls.playwright.chromium.launch(**kwargs)
    if cls.browser_context is None:
        cls.browser_context = await cls.browser.new_context()
    return cls.browser_context

async def get_page(self, create=True) -> Optional["Page"]:
    """Return the page this instance is currently on.

    When no page is open: returns None if ``create`` is falsy; otherwise opens
    a new page in the shared browser context (acquiring the optional
    page-concurrency semaphore first) and returns it.
    """
    if self.page is not None or not create:
        return self.page
    context = await self.get_browser()
    if self.page_concurrency_sem:
        # bound the number of simultaneously open pages app-wide
        await self.page_concurrency_sem.acquire()
    self.page = await context.new_page()
    return self.page

async def cleanup(self):
    """Close this instance's page (if any) and release its concurrency slot."""
    await super().cleanup()
    if self.page is None:
        return
    await self.page.close()
    if self.page_concurrency_sem:
        # free the slot taken in get_page so another instance may open a page
        self.page_concurrency_sem.release()
    self.page = None

async def close(self):
    """Tear down the app-global browser and Playwright driver (best-effort)."""
    await super().close()
    try:
        browser = Browsing.browser
        if browser is not None:
            # detach the class attribute before awaiting so a concurrent
            # closer sees the browser as already gone
            Browsing.browser = None
            await browser.close()
        pw = Browsing.playwright
        if pw is not None:
            Browsing.playwright = None
            await pw.stop()
    except PlaywrightError:
        # Playwright sometimes errors when shut down from parallel tasks; ignore.
        pass

# ==== functions ====
@ai_function()
async def search(self, query: str):
Expand Down Expand Up @@ -190,21 +133,14 @@ async def json_content(self, href: str) -> str:

async def html_content(self, href: str) -> str:
    """Default handler for all other content types.

    Fetches *href* over plain HTTP, converts the HTML body to Markdown, and
    summarizes it if it is too long for the context window.

    Fix: this block was a merge artifact containing both the old
    Playwright-rendered path and the new httpx path — the Playwright lines
    (page/goto/title/header and the first ``content`` assignment) were dead
    code immediately overwritten, and a second ``return`` was unreachable
    after ``return result``. Only the live httpx path is kept.
    """
    resp = await self.http.get(href)
    resp.raise_for_status()
    # ensure the response body is fully read before accessing .text
    await resp.aread()
    content = web_markdownify(resp.text)
    # summarization (no-op if the content fits in context)
    content = await self.maybe_summarize(content)
    return content

# ==== helpers ====
async def maybe_summarize(self, content, max_len=None):
Expand Down
5 changes: 1 addition & 4 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
- long engine: claude-3-opus (for summarizing long webpages, if ANTHROPIC_API_KEY is set)
"""

import asyncio
import logging
import os
from pathlib import Path
Expand All @@ -38,8 +37,6 @@
else:
long_engine = None

# only allow 3 chrome tabs at once to save my server
web_concurrency_sem = asyncio.Semaphore(3)

# Define the configuration for each interactive session
ai = ReDel(
Expand All @@ -49,7 +46,7 @@
tool_configs={
Browsing: {
"always_include": True,
"kwargs": {"long_engine": long_engine, "page_concurrency_sem": web_concurrency_sem},
"kwargs": {"long_engine": long_engine},
},
},
max_delegation_depth=4,
Expand Down

0 comments on commit 3e38db2

Please sign in to comment.