-
Notifications
You must be signed in to change notification settings - Fork 736
SEC Filings loader bug fixes #909
Changes from all commits
e7f7f60
89c4b7f
8716970
a52e8a2
81423d3
78734da
10cfec6
7314157
441414b
d0a0e70
d3d8fcc
9046eb0
814b43c
7c56287
25e3df3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,107 +1,71 @@ | ||
try: | ||
from llama_hub.sec_filings.sec_filings import SECExtractor | ||
except ImportError: | ||
# relative import from file | ||
from sec_filings import SECExtractor | ||
|
||
import concurrent.futures | ||
import json | ||
import os | ||
import time | ||
from collections import defaultdict | ||
from typing import List | ||
|
||
from llama_index.schema import Document | ||
from llama_index.readers.base import BaseReader | ||
from llama_hub.sec_filings.secData import sec_main | ||
from datetime import datetime | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make sure to add this file to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The SEC filings already exists in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah see some other files that have the extra_files parameter There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I have added this |
||
from typing import List, Optional | ||
import warnings | ||
import sys | ||
|
||
|
||
class SECFilingsLoader(BaseReader): | ||
""" | ||
SEC Filings loader | ||
Get the SEC filings of multiple tickers | ||
""" | ||
|
||
def __init__( | ||
self, | ||
tickers: List[str], | ||
amount: int, | ||
filing_type: str = "10-K", | ||
num_workers: int = 2, | ||
include_amends: bool = False, | ||
ticker: str, | ||
year: int, | ||
filing_types: List[str], | ||
include_amends: bool = True, | ||
amount: Optional[int] = None, | ||
): | ||
assert filing_type in [ | ||
"10-K", | ||
"10-Q", | ||
], "The supported document types are 10-K and 10-Q" | ||
"""SEC Filings loader for 10-K, 10-Q and S-1 filings | ||
|
||
Args: | ||
ticker (str): Symbol of the company | ||
year (str): Year of the data required | ||
""" | ||
curr_year = datetime.now().year | ||
assert year <= curr_year, "The year should be less than current year" | ||
|
||
self.tickers = tickers | ||
self.amount = amount | ||
self.filing_type = filing_type | ||
self.num_workers = num_workers | ||
self.ticker = ticker | ||
self.year = str(year) | ||
self.filing_types = filing_types | ||
self.include_amends = include_amends | ||
if amount is not None: | ||
warnings.warn( | ||
"The 'amount' attribute is deprecated and is removed in the current implementation. Please avoid using it, rather provide the specific year.", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
sys.exit(1) | ||
|
||
self.se = SECExtractor( | ||
tickers, amount, filing_type, include_amends=include_amends | ||
def load_data(self) -> List[Document]: | ||
section_texts = sec_main( | ||
self.ticker, self.year, self.filing_types, self.include_amends | ||
) | ||
docs = [] | ||
for filings in section_texts: | ||
texts_dict = filings[-1] | ||
|
||
os.makedirs("data", exist_ok=True) | ||
|
||
def multiprocess_run(self, tic): | ||
# print(f"Started for {tic}") | ||
tic_dict = self.se.get_accession_numbers(tic) | ||
text_dict = defaultdict(list) | ||
for tic, fields in tic_dict.items(): | ||
os.makedirs(f"data/{tic}", exist_ok=True) | ||
print(f"Started for {tic}") | ||
|
||
field_urls = [field["url"] for field in fields] | ||
years = [field["year"] for field in fields] | ||
with concurrent.futures.ProcessPoolExecutor( | ||
max_workers=self.num_workers | ||
) as executor: | ||
results = executor.map(self.se.get_text_from_url, field_urls) | ||
for idx, res in enumerate(results): | ||
all_text, filing_type = res | ||
text_dict[tic].append( | ||
{ | ||
"year": years[idx], | ||
"ticker": tic, | ||
"all_texts": all_text, | ||
"filing_type": filing_type, | ||
} | ||
for section_name, text in texts_dict.items(): | ||
docs.append( | ||
Document( | ||
text=text, | ||
extra_info={ | ||
"accessionNumber": filings[0], | ||
"filing_type": filings[1], | ||
"filingDate": filings[2], | ||
"reportDate": filings[3], | ||
"sectionName": section_name, | ||
}, | ||
) | ||
) | ||
return text_dict | ||
return docs | ||
|
||
def load_data(self): | ||
start = time.time() | ||
thread_workers = min(len(self.tickers), self.num_workers) | ||
with concurrent.futures.ThreadPoolExecutor( | ||
max_workers=thread_workers | ||
) as executor: | ||
results = executor.map(self.multiprocess_run, self.tickers) | ||
|
||
for res in results: | ||
curr_tic = list(res.keys())[0] | ||
for data in res[curr_tic]: | ||
curr_year = data["year"] | ||
curr_filing_type = data["filing_type"] | ||
if curr_filing_type in ["10-K/A", "10-Q/A"]: | ||
curr_filing_type = curr_filing_type.replace("/", "") | ||
if curr_filing_type in ["10-K", "10-KA"]: | ||
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True) | ||
with open( | ||
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w" | ||
) as f: | ||
json.dump(data, f, indent=4) | ||
elif curr_filing_type in ["10-Q", "10-QA"]: | ||
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True) | ||
with open( | ||
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json", | ||
"w", | ||
) as f: | ||
json.dump(data, f, indent=4) | ||
print( | ||
f"Done for {curr_tic} for document {curr_filing_type} and year" | ||
f" {curr_year}" | ||
) | ||
# Test case file test.py | ||
|
||
# from base import SECFilingsLoader | ||
|
||
print(f"It took {round(time.time()-start,2)} seconds") | ||
# if __name__ == '__main__': | ||
# docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_type=["10-K"]) | ||
# d = docs.load_data() | ||
# print(d) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,10 +2,9 @@ | |
import json | ||
import os | ||
import re | ||
import sys | ||
from typing import List, Optional, Tuple, Union | ||
|
||
import requests | ||
from typing import List, Optional, Tuple, Union | ||
import sys | ||
|
||
if sys.version_info < (3, 8): | ||
from typing_extensions import Final | ||
|
@@ -26,20 +25,16 @@ def inner(func): | |
|
||
limits = fake_decorator | ||
sleep_and_retry = fake_decorator | ||
try: | ||
from llama_hub.sec_filings.prepline_sec_filings.sec_document import ( | ||
VALID_FILING_TYPES, | ||
) | ||
except ImportError: | ||
from prepline_sec_filings.sec_document import VALID_FILING_TYPES | ||
|
||
from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES | ||
|
||
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data" | ||
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar" | ||
SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions" | ||
|
||
|
||
def get_filing( | ||
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str | ||
accession_number: Union[str, int], cik: Union[str, int], company: str, email: str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did you switch the arg positions? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The text extraction from SEC documents is a demanding process, hence I implemented a multiprocessing method so that it can be faster. In the get_filing_partial = partial(
get_filing,
cik=rgld_cik,
company="Unstructured Technologies",
email="[email protected]",
)
sec_extractor = SECExtractor(ticker=ticker) For the partial function to work, the first argument needs to be the accession number (a unique identifier for each file). Hence, I switched the arguments. Is there a better way to do it? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. I'm mostly trying to minimize the number of breaking changes, and it seems like there's not a way to prevent this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, understood. It is not a user-facing function, so hopefully it will not break previous implementations. |
||
) -> str: | ||
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate | ||
limits specified on the SEC website. | ||
|
@@ -55,18 +50,25 @@ def _get_filing( | |
) -> str: | ||
"""Wrapped so filings can be retrieved with an existing session.""" | ||
url = archive_url(cik, accession_number) | ||
response = session.get(url) | ||
# headers = { | ||
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | ||
# } | ||
headers = {"User-Agent": "Mozilla/5.0"} | ||
response = session.get(url, headers=headers) | ||
response.raise_for_status() | ||
return response.text | ||
|
||
|
||
@sleep_and_retry | ||
@limits(calls=10, period=1) | ||
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str: | ||
def get_cik_by_ticker(ticker: str) -> str: | ||
"""Gets a CIK number from a stock ticker by running a search on the SEC website.""" | ||
cik_re = re.compile(r".*CIK=(\d{10}).*") | ||
url = _search_url(ticker) | ||
response = session.get(url, stream=True) | ||
headers = { | ||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" | ||
} | ||
response = requests.get(url, stream=True, headers=headers) | ||
response.raise_for_status() | ||
results = cik_re.findall(response.text) | ||
return str(results[0]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,3 @@ | ||
aiohttp==3.8.4 | ||
Faker==19.1.0 | ||
PyYAML==6.0.1 | ||
ratelimit==2.2.1 | ||
starlette==0.30.0 | ||
unstructured==0.8.1 | ||
urllib3==2.0.4 | ||
scikit-learn | ||
ratelimit==2.2.1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we keep the deleted attributes as deprecated, for backwards compat? and just not show it here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As mentioned above, the previous implementation was broken, and the amount parameter is a bit ambiguous. In my conversations, users said they would like to pull the documents for a given year or a list of years, not a number of filings. Hence, the year parameter serves better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok sounds good. In general we are trying to minimize the number of breaking changes, it's not good to switch user-facing params around because that breaks existing implementations.
If the previous implementation doesn't work at all then sure we can remove (and log a warning to the user that it no longer works). If it still does then I vote we leave in the parameter for backwards compat
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, understood.
In the latest commit, I have added the amount deprecation warning. Please do suggest if I need to make any other changes.