This repository has been archived by the owner on Feb 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathgpt-extract.py
executable file
·274 lines (223 loc) · 7.9 KB
/
gpt-extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/env python
"""
Generic ChatGPT extraction script. Converts any input data to
any output JSON, as specified by a given JSON schema document.
This is dependent on the ChatGPT wrapper library:
https://github.com/mmabrouk/chatgpt-wrapper
Make sure to also run playwright install before running
this extractor script!
"""
import argparse
from datetime import datetime
import json
import os
import re
import sys
import time
from chatgpt_wrapper import ChatGPT
# max chars to use in prompt
DOC_MAX_LENGTH=3000
# Command-line interface.
# Positional: input file, JSON-schema file, output results file.
# Fixes vs. original: --continue-at now parses as int (it is compared
# against integer document ids with `<` later, which raised TypeError
# when left as a string); help text referenced a nonexistent --dockey
# flag (the real flag is --keydoc); "extration" typos corrected.
parser = argparse.ArgumentParser(description='Extract structured data from text using ChatGPT.')
parser.add_argument(
    '--input-type',
    choices=['txt', 'json'],
    help='Input file type: txt (one doc per line) or json (list of objects, add document key path using --keydoc)'
)
parser.add_argument(
    '--keydoc',
    help='If using JSON input type, this is the key of the document'
)
parser.add_argument(
    '--keyid',
    help='If using JSON input type, this is the key of the id/page no'
)
parser.add_argument(
    '--headless',
    action='store_true',
    help='Hide the browser'
)
parser.add_argument(
    '--continue-at',
    type=int,
    help='Continue extraction at this document index'
)
parser.add_argument(
    '--continue-last',
    action='store_true',
    help='Continue extraction at the last document extracted'
)
parser.add_argument(
    '--browser',
    default="firefox",
    help='Choose a browser to use. Needs to already be installed with `playwright install`. Defaults to firefox.'
)
parser.add_argument(
    'infile',
    help='Input file'
)
parser.add_argument(
    'schema_file',
    help='Path to JSON Schema file'
)
parser.add_argument(
    'outfile',
    help='Path to output results JSON file'
)
def clean_document(page_text):
    """Normalize whitespace in *page_text* and cap it near DOC_MAX_LENGTH.

    Runs of newlines collapse to one newline, then runs of tabs/spaces
    collapse to one space. If the result is still too long, keep the
    first DOC_MAX_LENGTH - 500 characters plus the final 500, joined by
    a single space, so both the head and tail of the document survive.
    """
    collapsed = re.sub(r"[\n]+", "\n", page_text)
    squeezed = re.sub(r"[\t ]+", " ", collapsed).strip()
    if len(squeezed) < DOC_MAX_LENGTH:
        return squeezed
    head = squeezed[:DOC_MAX_LENGTH - 500]
    tail = squeezed[-500:]
    return f"{head} {tail}"
def scrape_via_prompt(chat, page_text, schema):
    """Ask ChatGPT to render *page_text* as JSON conforming to *schema*.

    Returns ``(prompt, response)``. ``response`` is ``None`` when the
    model declares the text unrepresentable, or the last raw reply
    otherwise. Transient failures (unusable response, HTTP 429,
    truncated JSON) are retried with sleeps; after 5 attempts the loop
    gives up and returns whatever reply it last got.

    Fix vs. original: the "bad input" branch used ``continue``, which
    immediately re-asked the same prompt and overwrote the ``None``
    response — the skip never happened. It now ``break``s.
    """
    prompt = f"```{clean_document(page_text)}```\n\nFor the given text, can you provide a JSON representation that strictly follows this schema:\n\n```{schema}```"
    print("Entering prompt", len(prompt), "bytes")
    response = None
    # attempt counter; also scales the backoff sleep below
    waited = 0
    # use this prompt so we can change it ("can you continue the
    # previous..") but keep track of the original prompt
    current_prompt = prompt
    while True:
        response = chat.ask(current_prompt)
        if waited == 0:
            # only echo the full prompt on the first attempt
            print(f"{'='*70}\nPrompt\n{'-'*70}\n{current_prompt}")
        print(f"{'='*70}\nResponse\n{'-'*70}\n{response}")
        waited += 1
        if waited > 5:
            print("Timed out on this prompt")
            break
        if "unusable response produced by chatgpt" in response.lower():
            # escalating backoff: 120s, 240s, ...
            wait_seconds = 120 * waited
            print("Bad response! Waiting longer for", wait_seconds, "seconds")
            time.sleep(wait_seconds)
            continue
        bad_input = (
            "it is not possible to generate a json representation "
            "of the provided text"
        )
        if bad_input in response.lower():
            # the model says this document can't be represented;
            # retrying the identical prompt won't change that
            response = None
            print("Bad input! Skipping this text")
            break
        if response.strip() == "HTTP Error 429: Too many requests":
            # sleep for one hour
            print("Sleeping for one hour due to rate limiting...")
            time.sleep(60 * 60)
            continue
        if "}" not in response:
            # retry the session if it's not completing the JSON
            print("Broken JSON response, sleeping then retrying")
            time.sleep(20)
            continue
        # we have a good response here
        break
    return prompt, response
def upsert_result(results, result):
    """Insert *result* into *results*, replacing any entry with the same id.

    Mutates *results* in place; returns None.
    """
    target_id = result["id"]
    for position, existing in enumerate(results):
        if existing["id"] == target_id:
            # same primary key: overwrite the stale entry in place
            results[position] = result
            return
    # no entry matched, so this is a brand-new result
    results.append(result)
def run(documents, schema, outfile, headless=False,
        continue_at=None, continue_last=False, browser=None):
    """Drive the extraction loop over *documents* via a ChatGPT session.

    documents: list of {"id": ..., "text": ...} dicts.
    schema: parsed JSON schema, embedded verbatim into each prompt.
    outfile: results JSON path; reloaded first so reruns resume/upsert.
    headless: hide the automated browser window.
    continue_at: skip documents whose id compares below this value.
    continue_last: resume just past the highest already-scraped id.
    browser: playwright browser name passed to the ChatGPT wrapper.
    """
    print("Starting ChatGPT interface...")
    chat = ChatGPT(headless=headless, browser=browser)
    time.sleep(5)  # give the browser session a moment to come up
    # TODO: Check for login prompt
    # TODO: Optionally clear all prev sessions
    results = []
    if os.path.exists(outfile):
        # resume: prior results are loaded so upsert_result can update them
        with open(outfile, "r") as f:
            results = json.load(f)
    already_scraped = set([
        r.get("id") for r in results
    ])
    if already_scraped:
        print("Already scraped", already_scraped)
    if continue_last:
        # NOTE(review): assumes ids are mutually comparable and support
        # +1 (i.e. numeric) — verify when --keyid supplies string ids
        continue_at = max(list(already_scraped)) + 1
        print("Continuing at", continue_at)
    print(len(documents), "documents to scrape")
    # flag so that we only sleep after the first try
    first_scrape = True
    for p_ix, page_data in enumerate(documents):
        pk = page_data["id"]
        page_text = page_data["text"]
        if not page_text:
            print("Blank text for ID:", pk, "Skipping...")
            continue
        print("Doc ID:", pk, "Text length:", len(page_text))
        if continue_at is not None and pk < continue_at:
            # already handled on a previous run (or before --continue-at)
            continue
        if not first_scrape:
            # rate-limit: pause a minute between prompts
            print("Sleeping for rate limiting")
            time.sleep(60)
        first_scrape = False
        prompt, response = scrape_via_prompt(chat, page_text, schema)
        first_scrape = False  # NOTE(review): redundant — already set above
        if response is None:
            print("Skipping page due to blank response")
            continue
        data = None
        try:
            # the model fences its JSON in ``` markers; parse the fenced part
            data = json.loads(response.split("```")[1])
        except Exception as e:
            print("Bad result on ID", pk)
            print("Parse error:", e)
            continue
        result = {
            "id": pk,
            "text": page_text,
            "prompt": prompt,
            "response": response,
            "data": data,
        }
        upsert_result(results, result)
        # persist after every document so progress survives a crash
        print("Saving results to", outfile)
        with open(outfile, "w") as f:
            f.write(json.dumps(results, indent=2))
        print("ID", pk, "complete")
def parse_input_documents(args):
    """Load ``args.infile`` into a list of ``{"id", "text"}`` dicts.

    txt input: one document per line; id is the 0-based line index and
    the trailing newline is preserved in the text (cleaned later).
    json input: a list of objects; ``--keydoc`` names the text field and
    ``--keyid`` (optional) names the id field, else the index is used.

    Raises AssertionError when the JSON input is malformed, empty, or
    ``--keydoc`` is missing/absent from the first object.

    Fixes vs. original: the JSON branch re-opened ``args.infile`` inside
    the already-open handle (it now reads from the open file), and an
    empty JSON array previously raised a bare IndexError instead of the
    intended type-error message.
    """
    documents = []
    with open(args.infile, "r") as f:
        if args.input_type == "txt":
            for i, doc in enumerate(f.readlines()):
                documents.append({
                    "id": i,
                    "text": doc
                })
        elif args.input_type == "json":
            input_json = json.load(f)
            type_err_msg = "Input JSON must be an array of objects"
            assert args.keydoc, "--keydoc required with JSON input type"
            assert isinstance(input_json, list), type_err_msg
            # non-empty check guards the input_json[0] probes below
            assert input_json and isinstance(input_json[0], dict), type_err_msg
            assert args.keydoc in input_json[0], f"'{args.keydoc}' not in JSON"
            for ix, doc_data in enumerate(input_json):
                documents.append({
                    "id": doc_data[args.keyid] if args.keyid else ix,
                    "text": doc_data[args.keydoc]
                })
    return documents
if __name__ == "__main__":
    args = parser.parse_args()
    documents = parse_input_documents(args)
    # the parsed schema is embedded verbatim into every prompt
    with open(args.schema_file, "r") as f:
        schema = json.load(f)
    # the two resume modes are mutually exclusive
    assert not (args.continue_last and args.continue_at), \
        "--continue-at and --continue-last can't be used together"
    run(documents, schema, args.outfile,
        headless=args.headless,
        continue_at=args.continue_at,
        continue_last=args.continue_last,
        browser=args.browser,
    )