Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Update TrafilaturaWebReader in library.json (#602)
Browse files Browse the repository at this point in the history
* Update TrafilaturaWebReader in loader class and installation check

* Resolve tests
  • Loading branch information
ravi03071991 authored Oct 25, 2023
1 parent 01979d5 commit a5d2643
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
9 changes: 9 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1074,5 +1074,14 @@
"microsoft 365",
"microsoft365"
]
},
"TrafilaturaWebReader": {
"id": "web/trafilatura_web",
"author": "NA",
"keywords":[
"trafilatura",
"web",
"web reader"
]
}
}
9 changes: 9 additions & 0 deletions llama_hub/web/trafilatura_web/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from importlib.util import find_spec

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
Expand All @@ -12,6 +13,14 @@ class TrafilaturaWebReader(BaseReader):
"""

def __init__(self) -> None:
    """Check that the optional ``trafilatura`` package is available.

    Raises:
        ImportError: If ``find_spec`` cannot locate ``trafilatura``.
    """
    # Guard clause: there is nothing else to set up once the package is found.
    trafilatura_spec = find_spec("trafilatura")
    if trafilatura_spec is not None:
        return

    install_hint = (
        "Missing package: trafilatura.\n"
        "Please `pip install trafilatura` to use this Reader"
    )
    raise ImportError(install_hint)

def load_data(self, urls: List[str]) -> List[Document]:
"""Load data from the urls.
Expand Down

0 comments on commit a5d2643

Please sign in to comment.