This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

SEC Filings loader bug fixes #909

Merged: 15 commits, Feb 13, 2024
12 changes: 11 additions & 1 deletion llama_hub/library.json
@@ -674,7 +674,17 @@
"SECFilingsLoader": {
"id": "sec_filings",
"author": "Athe-kunal",
"keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
"extra_files":[
"secData.py",
"sec_filings_extractor.py",
"section_names.py"
],
"keywords": [
"finance",
"SEC Filings",
"10-K",
"10-Q"
]
},
"GuruReader": {
"id": "guru",
94 changes: 24 additions & 70 deletions llama_hub/sec_filings/README.md
@@ -10,81 +10,38 @@ Install the required dependencies
pip install -r requirements.txt
```

-The SEC Downloader expects 5 attributes
+The SEC Downloader expects 4 attributes

-* tickers: It is a list of valid tickers
-* amount: Number of documents that you want to download
+* ticker: A valid stock ticker symbol
Collaborator: Can we keep the deleted attributes as deprecated, for backwards compat, and just not show them here?

Contributor Author: As mentioned above, the previous implementation was breaking, and the `amount` parameter is a bit ambiguous. In my conversations, users want to pull the documents for a given year or a list of years, not a number of filings. Hence, the `year` parameter serves better.

Collaborator: Ok, sounds good. In general we are trying to minimize the number of breaking changes; it's not good to switch user-facing params around, because that breaks existing implementations. If the previous implementation doesn't work at all, then sure, we can remove it (and log a warning to the user that it no longer works). If it still works, I vote we leave the parameter in for backwards compat.

Contributor Author: Yes, understood. In the latest commit I have added the `amount` deprecation warning. Please do suggest if I need to make other changes.

-* filing_type: 10-K or 10-Q filing type
-* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
+* filing_types (List): 10-K, 10-Q, or S-1 filing types
+* include_amends: Whether to include amended filings
+* year: The year for which you need the data

## Usage
```python
from llama_index import download_loader

SECFilingsLoader = download_loader('SECFilingsLoader')

-loader = SECFilingsLoader(tickers=['TSLA'], amount=3, filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+docs = loader.load_data()
```
-It will download the data in the following directories and sub-directories
-
-```
-- AAPL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_12.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_12.json
-  - 2023
-    - 10-Q_04.json
-- GOOGL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-- TSLA
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-KA.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-```
-
-Here for each ticker we have separate folders, with 10-K data stored under the respective year and 10-Q data stored under the respective year along with the month: `10-Q_03.json` means the March 10-Q document. Amended documents are also stored in their respective year.
+It also returns the following metadata
+
+* Filing date of the filing
+* Reporting date of the filing
+* Accession number of the filing (a unique identifier of the filing)
+* Form type: "10-K", or "10-Q1", "10-Q2", "10-Q3"; for amended documents it will end with /A
+* Section name of the text
+
+There are also section names in the different document types. You can check them by running
+
+```python
+from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTION_10Q
+
+print(SECTIONS_10K)
+```
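Since each returned document carries a section name in its metadata, the results can be regrouped by section downstream. A hypothetical sketch, using a minimal stand-in for the `Document` class rather than code from the loader:

```python
from collections import defaultdict
from dataclasses import dataclass, field


@dataclass
class Doc:
    # Minimal stand-in for llama_index's Document; only the fields used here.
    text: str
    extra_info: dict = field(default_factory=dict)


def group_by_section(docs):
    """Group document texts under their 'sectionName' metadata key."""
    grouped = defaultdict(list)
    for d in docs:
        grouped[d.extra_info.get("sectionName", "UNKNOWN")].append(d.text)
    return dict(grouped)
```

This makes it easy to, say, index only the risk-factor sections of each filing.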

## EXAMPLES

@@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader

SECFilingsLoader = download_loader('SECFilingsLoader')

-loader = SECFilingsLoader(tickers=['TSLA'], amount=3, filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()

-documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
index = VectorStoreIndex.from_documents(documents)
index.query('What are the risk factors of Tesla for the year 2022?')

@@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator

SECFilingsLoader = download_loader('SECFilingsLoader')

-loader = SECFilingsLoader(tickers=['TSLA'], amount=3, filing_type="10-K")
-loader.load_data()
-
-dir_loader = DirectoryLoader("data\TSLA\2022")
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()
-index = VectorstoreIndexCreator().from_loaders([dir_loader])
+index = VectorstoreIndexCreator().from_documents(documents)
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

@@ -131,5 +85,5 @@ qa.run(query)
```
## REFERENCES
1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)


146 changes: 55 additions & 91 deletions llama_hub/sec_filings/base.py
@@ -1,107 +1,71 @@
try:
from llama_hub.sec_filings.sec_filings import SECExtractor
except ImportError:
# relative import from file
from sec_filings import SECExtractor

import concurrent.futures
import json
import os
import time
from collections import defaultdict
from typing import List

from llama_index.schema import Document
from llama_index.readers.base import BaseReader
from llama_hub.sec_filings.secData import sec_main
from datetime import datetime
Collaborator: Make sure to add this file to `extra_files` in library.json (see the GitHub repo loader).

Contributor Author: The SEC filings entry already exists in library.json; I added it when I first committed the loader. Do I need to modify it again?

Collaborator: Yeah, see some other files that have the `extra_files` parameter.

Contributor Author: Yes, I have added this.
from typing import List, Optional
import warnings
import sys


class SECFilingsLoader(BaseReader):
"""
SEC Filings loader
Get the SEC filings of multiple tickers
"""

def __init__(
self,
tickers: List[str],
amount: int,
filing_type: str = "10-K",
num_workers: int = 2,
include_amends: bool = False,
ticker: str,
year: int,
filing_types: List[str],
include_amends: bool = True,
amount: Optional[int] = None,
):
assert filing_type in [
"10-K",
"10-Q",
], "The supported document types are 10-K and 10-Q"
"""SEC Filings loader for 10-K, 10-Q and S-1 filings

Args:
ticker (str): Symbol of the company
year (str): Year of the data required
"""
curr_year = datetime.now().year
assert year <= curr_year, "The year should not be greater than the current year"

self.tickers = tickers
self.amount = amount
self.filing_type = filing_type
self.num_workers = num_workers
self.ticker = ticker
self.year = str(year)
self.filing_types = filing_types
self.include_amends = include_amends
if amount is not None:
warnings.warn(
"The 'amount' parameter is deprecated and has been removed in the current implementation. Please provide a specific 'year' instead.",
DeprecationWarning,
stacklevel=2,
)
sys.exit(1)

self.se = SECExtractor(
tickers, amount, filing_type, include_amends=include_amends
def load_data(self) -> List[Document]:
section_texts = sec_main(
self.ticker, self.year, self.filing_types, self.include_amends
)
docs = []
for filings in section_texts:
texts_dict = filings[-1]

os.makedirs("data", exist_ok=True)

def multiprocess_run(self, tic):
# print(f"Started for {tic}")
tic_dict = self.se.get_accession_numbers(tic)
text_dict = defaultdict(list)
for tic, fields in tic_dict.items():
os.makedirs(f"data/{tic}", exist_ok=True)
print(f"Started for {tic}")

field_urls = [field["url"] for field in fields]
years = [field["year"] for field in fields]
with concurrent.futures.ProcessPoolExecutor(
max_workers=self.num_workers
) as executor:
results = executor.map(self.se.get_text_from_url, field_urls)
for idx, res in enumerate(results):
all_text, filing_type = res
text_dict[tic].append(
{
"year": years[idx],
"ticker": tic,
"all_texts": all_text,
"filing_type": filing_type,
}
for section_name, text in texts_dict.items():
docs.append(
Document(
text=text,
extra_info={
"accessionNumber": filings[0],
"filing_type": filings[1],
"filingDate": filings[2],
"reportDate": filings[3],
"sectionName": section_name,
},
)
)
return text_dict
return docs

def load_data(self):
start = time.time()
thread_workers = min(len(self.tickers), self.num_workers)
with concurrent.futures.ThreadPoolExecutor(
max_workers=thread_workers
) as executor:
results = executor.map(self.multiprocess_run, self.tickers)

for res in results:
curr_tic = list(res.keys())[0]
for data in res[curr_tic]:
curr_year = data["year"]
curr_filing_type = data["filing_type"]
if curr_filing_type in ["10-K/A", "10-Q/A"]:
curr_filing_type = curr_filing_type.replace("/", "")
if curr_filing_type in ["10-K", "10-KA"]:
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
) as f:
json.dump(data, f, indent=4)
elif curr_filing_type in ["10-Q", "10-QA"]:
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
"w",
) as f:
json.dump(data, f, indent=4)
print(
f"Done for {curr_tic} for document {curr_filing_type} and year"
f" {curr_year}"
)
-    print(f"It took {round(time.time()-start,2)} seconds")
+# Test case file test.py
+
+# from base import SECFilingsLoader
+
+# if __name__ == '__main__':
+#     docs = SECFilingsLoader(ticker="AAPL", year=2023, filing_types=["10-K"])
+#     d = docs.load_data()
+#     print(d)
28 changes: 15 additions & 13 deletions llama_hub/sec_filings/prepline_sec_filings/fetch.py
@@ -2,10 +2,9 @@
import json
import os
import re
import sys
from typing import List, Optional, Tuple, Union

import requests
from typing import List, Optional, Tuple, Union
import sys

if sys.version_info < (3, 8):
from typing_extensions import Final
@@ -26,20 +25,16 @@ def inner(func):

limits = fake_decorator
sleep_and_retry = fake_decorator
try:
from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
VALID_FILING_TYPES,
)
except ImportError:
from prepline_sec_filings.sec_document import VALID_FILING_TYPES

from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES

SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions"


def get_filing(
-    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
+    accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
Collaborator: Why did you switch the arg positions?

Contributor Author: Text extraction from SEC documents is a demanding process, so I implemented multiprocessing to make it faster. In the secData.py file, I implemented parallel processing using a partial function:

```python
get_filing_partial = partial(
    get_filing,
    cik=rgld_cik,
    company="Unstructured Technologies",
    email="[email protected]",
)

sec_extractor = SECExtractor(ticker=ticker)
```

For the partial function to work, the first argument needs to be the accession number (a unique identifier for each filing); hence I switched the arguments. Is there a better way to do it?

Collaborator: I see. I'm mostly trying to minimize the number of breaking changes, and it seems like there's no way to prevent this one.

Contributor Author: Yes, understood. It is not a user-facing function, so hopefully it will not break previous implementations.
) -> str:
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
limits specified on the SEC website.
@@ -55,18 +50,25 @@ def _get_filing(
) -> str:
"""Wrapped so filings can be retrieved with an existing session."""
url = archive_url(cik, accession_number)
-    response = session.get(url)
+    headers = {"User-Agent": "Mozilla/5.0"}
+    response = session.get(url, headers=headers)
response.raise_for_status()
return response.text
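The positional-argument constraint behind the arg swap discussed above can be illustrated with a stand-alone sketch; the fetcher here is a stand-in that returns a label instead of filing text, and all names are fabricated for illustration:

```python
from functools import partial


def get_filing(accession_number, cik, company, email):
    # Stand-in for the real fetcher: returns a label instead of filing text.
    return f"{cik}/{accession_number} for {company} <{email}>"


# Bind everything except the accession number, as the loader's secData.py does.
get_filing_partial = partial(
    get_filing,
    cik="0000320193",
    company="Example Co",
    email="admin@example.com",
)

# map() supplies each accession number as the one remaining positional
# argument, which is why it must come first in the signature. In the loader
# this map is driven by a process pool for parallelism.
accession_numbers = ["0001-23-000001", "0001-23-000002"]
results = list(map(get_filing_partial, accession_numbers))
```

Because `cik`, `company`, and `email` are bound by keyword, the single value `map()` passes fills the first free positional slot, `accession_number`.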


@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
def get_cik_by_ticker(ticker: str) -> str:
"""Gets a CIK number from a stock ticker by running a search on the SEC website."""
cik_re = re.compile(r".*CIK=(\d{10}).*")
url = _search_url(ticker)
-    response = session.get(url, stream=True)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()
results = cik_re.findall(response.text)
return str(results[0])
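The CIK extraction above hinges on a regex over EDGAR's search-page HTML. A self-contained sketch of just that step, with the same pattern the module uses; the sample HTML snippet is fabricated for illustration:

```python
import re

# A zero-padded, 10-digit CIK embedded in a CIK= query parameter.
cik_re = re.compile(r".*CIK=(\d{10}).*")


def extract_cik(html: str) -> str:
    # findall returns the captured group for each matching line.
    return cik_re.findall(html)[0]


sample = '<a href="/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=10-K">AAPL</a>'
```

Note the pattern requires exactly ten digits, so it matches the canonical zero-padded CIK and not shorter numeric fragments elsewhere in the page.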
7 changes: 1 addition & 6 deletions llama_hub/sec_filings/requirements.txt
@@ -1,8 +1,3 @@
-aiohttp==3.8.4
-Faker==19.1.0
-PyYAML==6.0.1
 ratelimit==2.2.1
-starlette==0.30.0
-unstructured==0.8.1
-urllib3==2.0.4
+scikit-learn