extract_tables.py
import os
import warnings
import time
import pandas as pd
import re
import platform
import datetime
import glob
# import chromedriver_binary
from collections import Counter
from typing import Optional
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from utils import arguements, init_logger, ROOT_PATH
"""
document.querySelectorAll("body > document:nth-child(1) > type:nth-child(1) > sequence:nth-child(1) > filename:nth-child(1) > description:nth-child(1) > text:nth-child(1) > div:nth-child(418) ~ div"
"""
def test_xpath_elements(
    url: str,
    xpath: str
) -> list:
    """Fetch one page and dump every table matched by `xpath` to test CSVs."""
    args = arguements()
    options = Options()
    # options.binary_location = args.chrome_path
    options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})
    options.add_argument("--verbose")
    # On Linux, point Selenium at the chromedriver binary explicitly.
    driver = webdriver.Chrome(service=Service(args.chrome_driver_path), options=options) if platform.system(
    ) == "Linux" else webdriver.Chrome(options=options)
    driver.get(url)
    tables = driver.find_elements(By.XPATH, value=xpath)
    # Sort top-to-bottom so the CSV indices follow document order.
    tables = sorted(tables, key=lambda table: table.location['y'])
    if not os.path.exists(os.path.join(ROOT_PATH, args.cik, 'test_path')):
        os.mkdir(os.path.join(ROOT_PATH, args.cik, 'test_path'))
    for i, table in enumerate(tables):
        soup = malformed_table(table.get_attribute("outerHTML"))
        dfs = pd.read_html(soup.prettify(), displayed_only=False)
        # Index by element and frame so later frames don't overwrite earlier ones.
        for j, df in enumerate(dfs):
            df.to_csv(os.path.join(ROOT_PATH, args.cik,
                                   'test_path', f"test_{i}_{j}.csv"))
    driver.quit()
    return tables
def get_xpath_elements(
    driver: webdriver.Remote,
    xpaths: list,
) -> list:
    """Collect the elements matching each XPath, pausing briefly between queries."""
    tables = []
    for path in xpaths:
        time.sleep(1)
        tables.extend(driver.find_elements(By.XPATH, value=path))
    logger.debug(f"GOT ELEMENTS - {tables}")
    return tables
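# The XPaths come from xpaths/<cik>.txt; an illustrative (hypothetical) entry:
# //div[contains(., "Schedule of Investments")]/following-sibling::table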
def get_table_date(
    page_content: str
) -> str:
    """Return the most frequent 'Month D, YYYY' date on the page as YYYYMMDD."""
    datetime_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b'
    datetimes = re.findall(datetime_pattern, page_content)
    count = Counter(datetimes)
    table_date = count.most_common(1)
    if not table_date:
        return 'no_date_found'
    return datetime.datetime.strptime(table_date[0][0], '%B %d, %Y').strftime('%Y%m%d')
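# A quick doctest-style check (hypothetical input, not from a real filing):
# >>> get_table_date("As of March 31, 2021 ... March 31, 2021 ... June 3, 2021")
# '20210331'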
def parse_link_element(
    driver: webdriver.Remote,
) -> Optional[str]:
    """If EDGAR served the inline-XBRL viewer, click through its menu to the
    plain HTML document and return that URL; return None otherwise."""
    iframe = driver.find_elements(By.CSS_SELECTOR, value='#ixvFrame')
    if iframe:
        time.sleep(1)
        logger.debug(f"IFRAME - {iframe[0]}")
        driver.switch_to.frame(iframe[0])
        time.sleep(1)
    link_element = driver.find_elements(By.ID, value="menu-dropdown-link")
    logger.debug(f"LINK ELEMENT - {link_element}")
    if not link_element:
        return None
    driver.execute_script("arguments[0].click();", link_element[0])
    form_element = driver.find_elements(By.ID, value='form-information-html')
    logger.debug(f"FORM ELEMENT - {form_element}")
    if not form_element:
        return None
    driver.execute_script("arguments[0].click();", form_element[0])
    logger.debug(f"SWITCHING HANDLES - {driver.window_handles[-1]}")
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[-1])
    return driver.current_url
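# Sketch of the intended flow (URL elided/illustrative):
# >>> driver.get("https://www.sec.gov/ix?doc=/Archives/edgar/data/.../filing.htm")
# >>> parse_link_element(driver)  # Menu -> "Open as HTML" in the iXBRL viewer
# 'https://www.sec.gov/Archives/edgar/data/.../filing.htm'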
def malformed_table(
    table: str
) -> BeautifulSoup:
    """Parse an HTML fragment; if it lacks a <table> tag, wrap the fragment's
    top-level contents in one so pandas.read_html can find it."""
    soup = BeautifulSoup(table, 'lxml')
    if not soup.table:
        new_table = soup.new_tag("table")
        # Move only the top-level nodes, preserving their nesting; extracting
        # every descendant tag would flatten the fragment.
        root = soup.body if soup.body else soup
        for tag in list(root.children):
            new_table.append(tag.extract())
        root.append(new_table)
    return soup
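# Hypothetical example: bare rows gain a wrapping <table> for pandas.read_html.
# >>> malformed_table('<tr><td>Cash</td><td>100</td></tr>').table is not None
# True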
def remove_duplicate_element(
    elements: list
) -> list:
    """Drop duplicate WebElements, keyed on Selenium's internal element id."""
    ids = set()
    unique_elements = []
    for e in elements:
        if e.id not in ids:
            ids.add(e.id)
            unique_elements.append(e)
    return unique_elements
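# Overlapping XPaths can match the same DOM node more than once; after
# deduplication each node appears exactly once:
# >>> len(remove_duplicate_element(tables)) <= len(tables)
# True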
def main() -> None:
    assert os.path.exists(f"urls/{args.cik}.csv"), "SCRAPE LINKS FIRST"
    warnings.simplefilter(action='ignore', category=FutureWarning)
    # desired_dpi = 2.0
    # options = Options()
    # options.add_argument("--headless")
    # options.add_argument("--disable-gpu")
    # options.add_argument("--no-sandbox")  # Bypass OS security model, required in some environments
    # options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    # options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # options.add_argument(f"--force-device-scale-factor={desired_dpi}")
    # options.add_experimental_option("mobileEmulation", {"deviceMetrics": {"width": 1920, "height": 1080, "pixelRatio": 3.0}})
    # driver = webdriver.Chrome(service=Service(args.chrome_driver_path), options=options) if platform.system(
    # ) == "Linux" else webdriver.Chrome(options=options)
    # driver.set_window_size(1920, 1080)
    driver = webdriver.Firefox()
    table_title = "Schedule of Investments"
    urls = pd.read_csv(os.path.join(ROOT_PATH, args.url_csv), index_col=False)
    with open(args.x_path) as file:
        gen_paths = [line.rstrip() for line in file.readlines()]
    for i in range(urls.shape[0]):
        _, table_date, url = urls.iloc[i]
        # table_date, url = '2018-12-31', 'https://www.sec.gov/Archives/edgar/data/0001512931/000114420419012276/tv514438_10k.htm'
        logger.info(f"DATETIMES - {table_date}")
        if len(glob.glob(f"{args.cik}/{table_date}/*.csv")) > 0:
            continue
        logger.info(f"ACCESSING - {url}")
        driver.get(url)
        inline_url = parse_link_element(driver)
        if inline_url is not None:
            logger.info(f'FINAL URL - {inline_url}')
            time.sleep(2)
            driver.get(inline_url)
        html_content = driver.page_source
        out_path, spec_path = os.path.join(ROOT_PATH, args.cik, table_date), os.path.join(
            ROOT_PATH, args.cik, table_date, 'spec_paths.txt')
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        logger.info(
            f'SAVE FILE - {url.split("/")[-1].replace(".htm","")+".html"}')
        # out_path is already rooted at ROOT_PATH; don't join it again.
        html_to_file = os.path.join(out_path, url.split(
            '/')[-1].replace(".htm", "") + ".html")
        with open(html_to_file, "w", encoding='utf-8') as file:
            file.write(BeautifulSoup(html_content, 'html.parser').prettify())
        # Start from the generic XPaths; a filing-specific spec_paths.txt,
        # when present, overrides them (this also avoids querying twice).
        xpaths = gen_paths
        if os.path.exists(spec_path):
            with open(spec_path) as file:
                xpaths = [line.rstrip() for line in file.readlines()]
        logger.debug(f"USING XPATHS - {xpaths}")
        tables = get_xpath_elements(driver, xpaths)
        # Sort top-to-bottom and drop nodes matched by more than one XPath.
        tables = sorted(tables, key=lambda table: table.location['y'])
        tables = remove_duplicate_element(tables)
        if not os.path.exists(args.save_image_path):
            os.mkdir(args.save_image_path)
        for j, element in enumerate(tables):
            if os.path.exists(os.path.join(ROOT_PATH, args.cik, table_date, f"{table_title.replace(' ','_')}_{j}.csv")):
                continue
            soup = malformed_table(element.get_attribute("outerHTML"))
            try:
                dfs = pd.read_html(soup.prettify(), displayed_only=False)
            except ValueError:
                # pandas raises ValueError when the fragment holds no tables.
                logger.debug("NO TABLES - skipping element")
                continue
            dfs[0].to_csv(os.path.join(ROOT_PATH, args.cik, table_date,
                                       f"{table_title.replace(' ','_')}_{j}.csv"), encoding='utf-8')
            ss_path = os.path.join(args.save_image_path, table_date, f'soi_table_{j}.png')
            if os.path.exists(ss_path) or not args.take_sc:
                continue
            if not os.path.exists(os.path.join(args.save_image_path, table_date)):
                os.mkdir(os.path.join(args.save_image_path, table_date))
            try:
                logger.info(f"Taking screenshot {ss_path}")
                # Screenshot the live WebElement, not the parsed soup.
                element.screenshot(ss_path)
            except Exception as e:
                logger.info(e)
    driver.quit()
    return
if __name__ == "__main__":
"""
python .\extract_tables.py --cik 1501729 --url-csv urls/1501729.csv --x-path xpaths/1501729.txt
python .\extract_tables.py --cik 1396440 --url-csv urls/1396440.csv --x-path xpaths/1396440.txt
python .\extract_tables.py --cik 1422183 --url-csv urls/1422183.csv --x-path xpaths/1422183.txt
python .\extract_tables.py --cik 1490349 --url-csv urls/1490349.csv --x-path xpaths/1490349.txt
python .\extract_tables.py --cik 1379785 --url-csv urls/1379785.csv --x-path xpaths/1379785.txt
python .\extract_tables.py --cik 1490927 --url-csv urls/1490927.csv --x-path xpaths/1490927.txt
python .\extract_tables.py --cik 1418076 --url-csv urls/1418076.csv --x-path xpaths/1418076.txt
python .\extract_tables.py --cik 1544206 --url-csv urls/1544206.csv --x-path xpaths/1544206.txt
python .\extract_tables.py --cik 1370755 --url-csv urls/1370755.csv --x-path xpaths/1370755.txt
python .\extract_tables.py --cik 1326003 --url-csv urls/1326003.csv --x-path xpaths/1326003.txt
python3 extract_tables.py --cik 1580345 --url-csv urls/1580345.csv --x-path xpaths/1580345.txt
python3 extract_tables.py --cik 1535778 --url-csv urls/1535778.csv --x-path xpaths/1535778.txt
python .\extract_tables.py --cik 1487918 --url-csv urls/1487918.csv --x-path xpaths/1487918.txt
python3 extract_tables.py --cik 1512931 --url-csv urls/1512931.csv --x-path xpaths/1512931.txt
python3 extract_tables.py --cik 1372807 --url-csv urls/1372807.csv --x-path xpaths/1372807.txt
python3 extract_tables.py --cik 1675033 --url-csv urls/1675033.csv --x-path xpaths/1675033.txt
python3 extract_tables.py --cik 3906 --url-csv urls/3906.csv --x-path xpaths/3906.txt
sudo apt install chromium-chromedrive
sudo apt-get install chromium-driver
"""
    args = arguements()
    logger = init_logger(args.cik)
    main()
    # test_xpath_elements(
    #     url='https://www.sec.gov/Archives/edgar/data/0001501729/000110465921102950/tm2124358-1_10q.htm',
    #     xpath='//div[font[contains(text(), "Schedule of Investments")]]/parent::div/parent::font/following-sibling::table'
    # )