Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Update TrafilaturaWebReader in library.json #602

Merged
merged 2 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1074,5 +1074,14 @@
"microsoft 365",
"microsoft365"
]
},
"TrafilaturaWebReader": {
"id": "trafilatura_web_reader",
ravi03071991 marked this conversation as resolved.
Show resolved Hide resolved
"author": "NA",
"keywords":[
"trafilatura",
"web",
"web reader"
]
}
}
9 changes: 9 additions & 0 deletions llama_hub/web/trafilatura_web/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from importlib.util import find_spec

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
Expand All @@ -12,6 +13,14 @@ class TrafilaturaWebReader(BaseReader):

"""

def __init__(self) -> None:
    """Verify that the optional ``trafilatura`` dependency is available.

    The package is only located with ``find_spec``; it is not imported here.

    Raises:
        ModuleNotFoundError: If ``trafilatura`` cannot be found. This is a
            subclass of ``ImportError``, so existing ``except ImportError``
            handlers keep working. The exception's ``name`` attribute is set
            to ``"trafilatura"`` so callers can tell which package is missing.
    """
    # Guard clause: nothing else to set up when the dependency is present.
    if find_spec("trafilatura") is not None:
        return

    raise ModuleNotFoundError(
        "Missing package: trafilatura.\n"
        "Please `pip install trafilatura` to use this Reader",
        name="trafilatura",
    )

def load_data(self, urls: List[str]) -> List[Document]:
"""Load data from the urls.

Expand Down
Loading