- updating documentation to match method defaults (#591)

- exposing NewsArticleReader through __init__.py
- allowing generators to be passed to load_data
- adding requirements.txt to match other /web/ packages
run-llama · Oct 20, 2023 · f03d940 · f03d940
1 parent d1d0e83
commit f03d940
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 5 deletions.
diff --git a/llama_hub/web/news/README.md b/llama_hub/web/news/README.md
@@ -11,7 +11,7 @@ Pass in an array of individual page URLs:
 ```python
 from llama_hub.web.news import NewsArticleReader
 
-reader = NewsArticleReader()
+reader = NewsArticleReader(use_nlp=False)
 documents = reader.load_data([
     'https://www.cnbc.com/2023/08/03/amazon-amzn-q2-earnings-report-2023.html',
     'https://www.theverge.com/2023/8/3/23818388/brave-search-image-video-results-privacy-index'

diff --git a/llama_hub/web/news/__init__.py b/llama_hub/web/news/__init__.py
@@ -0,0 +1,3 @@
+from .base import NewsArticleReader
+
+__all__ = ["NewsArticleReader"]
diff --git a/llama_hub/web/news/base.py b/llama_hub/web/news/base.py
@@ -1,6 +1,6 @@
 """News article reader using Newspaper."""
 import logging
-from typing import Any, List
+from typing import Any, List, Generator
 
 from llama_index.readers.base import BaseReader
 from llama_index.schema import Document
@@ -16,7 +16,7 @@ class NewsArticleReader(BaseReader):
 
     Args:
         text_mode (bool): Whether to load a text version or HTML version of the content (default=True).
-        use_nlp (bool): Whether to use NLP to extract additional summary and keywords (default=False).
+        use_nlp (bool): Whether to use NLP to extract additional summary and keywords (default=True).
         newspaper_kwargs: Additional keyword arguments to pass to newspaper.Article. See
             https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html#article
     """
@@ -43,8 +43,8 @@ def load_data(self, urls: List[str]) -> List[Document]:
             List[Document]: List of documents.
 
         """
-        if not isinstance(urls, list):
-            raise ValueError("urls must be a list of strings.")
+        if not isinstance(urls, list) and not isinstance(urls, Generator):
+            raise ValueError("urls must be a list or generator.")
         documents = []
         for url in urls:
             from newspaper import Article

diff --git a/llama_hub/web/news/requirements.txt b/llama_hub/web/news/requirements.txt
@@ -0,0 +1 @@
+newspaper3k
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .base import NewsArticleReader

		__all__ = ["NewsArticleReader"]