diff --git a/llama_hub/library.json b/llama_hub/library.json
index 62b36e4de7..5b3da6d92f 100644
--- a/llama_hub/library.json
+++ b/llama_hub/library.json
@@ -1074,5 +1074,14 @@
       "microsoft 365",
       "microsoft365"
     ]
+  },
+  "TrafilaturaWebReader": {
+    "id": "web/trafilatura_web",
+    "author": "NA",
+    "keywords":[
+      "trafilatura",
+      "web",
+      "web reader"
+    ]
   }
 }
\ No newline at end of file
diff --git a/llama_hub/web/trafilatura_web/base.py b/llama_hub/web/trafilatura_web/base.py
index 6b9a482850..a17e34e95e 100644
--- a/llama_hub/web/trafilatura_web/base.py
+++ b/llama_hub/web/trafilatura_web/base.py
@@ -1,4 +1,5 @@
 from typing import List
+from importlib.util import find_spec
 
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
@@ -12,6 +13,14 @@ class TrafilaturaWebReader(BaseReader):
 
     """
 
+    def __init__(self) -> None:
+
+        if find_spec("trafilatura") is None:
+            raise ImportError(
+                "Missing package: trafilatura.\n"
+                "Please `pip install trafilatura` to use this Reader"
+            )
+
     def load_data(self, urls: List[str]) -> List[Document]:
         """Load data from the urls.
 