-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathpreprocess_swde.py
118 lines (103 loc) · 4.67 KB
/
preprocess_swde.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import argparse
import sys
from pathlib import Path
import tqdm
import pickle
# Absolute, symlink-resolved location of this script.
FILE = Path(__file__).resolve()
# Parent of the directory that holds this script; presumably the repository
# root that contains the `src` package imported right after this block.
ROOT = FILE.parent.parent
# Extend the import search path once so the `src.*` imports resolve when the
# script is launched directly rather than as part of an installed package.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
from src.preprocess import extract_features, extract_features_ae_task
from src.domlm import DOMLMConfig
def extract_labels(label_files):
    """Collect per-page labels from tab-separated label files.

    The label name of a file is the part of its file name after the last
    ``-``, with ``.txt`` removed. The first two lines of every file are
    skipped. Each remaining line is split on tabs: field 0 is the page id,
    field 1 is kept verbatim as ``nums`` and field 2 (whitespace-stripped) is
    kept as ``value``. Any further fields on the line are ignored.

    Lines with fewer than three tab-separated fields (for example a trailing
    blank line) are skipped instead of raising ``IndexError``.

    Args:
        label_files: Iterable of ``pathlib.Path`` objects to read.

    Returns:
        dict: ``{page_id: {label: {'nums': str, 'value': str}}}``. When the
        same page id and label occur more than once, the last occurrence wins.
    """
    label_info = {}
    for file in label_files:
        label = file.name.split('-')[-1].replace('.txt', '')
        with open(file, 'r') as f:
            content = f.readlines()
        for line in content[2:]:
            # Split once and validate, rather than indexing three separate splits.
            fields = line.split('\t')
            if len(fields) < 3:
                continue
            page_id = fields[0]
            label_info.setdefault(page_id, {})[label] = {
                'nums': fields[1],
                'value': fields[2].strip(),
            }
    return label_info
def preprocess_swde(input_dir, config, output_dir, domains, start_from=0):
    """Run ``extract_features`` on every ``.htm`` page of the given domains.

    For each domain, all ``.htm`` files below ``input_dir/<domain>`` are
    collected recursively and processed in sorted order. The result for each
    page is pickled to
    ``output_dir/<domain>/<name of the page's parent directory>/<page stem>.pkl``.

    Args:
        input_dir: Directory that contains one sub-directory per domain.
        config: Path of the JSON file passed to ``DOMLMConfig.from_json_file``.
        output_dir: Directory under which the pickle files are written.
        domains: Iterable of domain (sub-directory) names to process.
        start_from: Number of leading files (in sorted order) to skip in each
            domain, e.g. to resume an interrupted run. Defaults to 0, which
            processes every file.

    Returns:
        None. Progress is shown with ``tqdm``, and the number of pages that
        failed is printed once per domain.
    """
    swde_path = Path(input_dir)
    proc_path = Path(output_dir)
    config = DOMLMConfig.from_json_file(config)
    for domain in domains:
        domain_dir = swde_path / domain
        files = sorted(domain_dir.glob("**/*.htm"))[start_from:]
        pbar = tqdm.tqdm(files, total=len(files))
        errors = []
        for path in pbar:
            pbar.set_description(f"Processing {path.relative_to(domain_dir)}")
            with open(path, 'r') as f:
                html = f.read()
            try:
                features = extract_features(html, config)
                dir_name = proc_path / domain / path.parent.name
                dir_name.mkdir(parents=True, exist_ok=True)
                with open(dir_name / path.with_suffix(".pkl").name, 'wb') as f:
                    pickle.dump(features, f)
            except Exception as e:
                # Best-effort batch job: report the failure, remember the page
                # and carry on with the remaining files.
                print(e)
                errors.append(path)
        # Printed per domain, next to where `errors` is reset, so the count
        # always refers to the domain just processed.
        print(f"Total errors: {len(errors)}")
def preprocess_swde_attr_extract(input_dir, config_file, output_dir, domains):
    """Build attribute-extraction features for every page of the given domains.

    Each sub-directory of ``input_dir/<domain>`` is treated as one website.
    The website name is taken from the directory name: the text between the
    first ``-`` and the first ``(`` (or up to the next ``-`` / end of name
    when there is no ``(``). Label files are read from
    ``input_dir/groundtruth/<domain>/`` and must be named
    ``<domain>-<website>-<anything>``.

    For every ``.htm`` file directly inside the website directory, the labels
    recorded for the page id (file name up to the first ``.``) are inverted
    into a ``value -> {'label', 'nums'}`` mapping and passed, together with
    the HTML, to ``extract_features_ae_task``. The result is pickled to
    ``output_dir/<domain>/<website directory name>/<page stem>.pkl``.

    Args:
        input_dir: Directory holding the domain directories and ``groundtruth``.
        config_file: Path of the JSON file passed to
            ``DOMLMConfig.from_json_file``.
        output_dir: Directory under which the pickle files are written.
        domains: Iterable of domain (sub-directory) names to process.

    Returns:
        None. Progress is shown with ``tqdm``, and the number of pages that
        failed (including pages without labels) is printed once per website.
    """
    swde_path = Path(input_dir)
    label_path = swde_path / 'groundtruth'
    proc_path = Path(output_dir)
    config = DOMLMConfig.from_json_file(config_file)
    for domain in domains:
        domain_dir = swde_path / domain
        for website_dir in domain_dir.iterdir():
            if not website_dir.is_dir():
                continue
            files = sorted(website_dir.glob("*.htm"))
            # Parse the site name once; a directory name without "(" is used
            # as-is instead of raising ValueError.
            website_name = website_dir.name.split('-')[1].partition('(')[0]
            # The trailing "-" anchors the match to exactly this website.
            # A bare prefix match would also pick up the label files of any
            # other site whose name merely starts with this one (e.g. "job"
            # vs. "jobcircle"), and their page ids would overwrite ours.
            label_files = sorted((label_path / domain).glob(f'{domain}-{website_name}-*'))
            label_infos = extract_labels(label_files)
            pbar = tqdm.tqdm(files, total=len(files))
            errors = []
            for path in pbar:
                pbar.set_description(f"Processing {path.relative_to(domain_dir)}")
                with open(path, 'r') as f:
                    html = f.read()
                try:
                    # A page without label entries raises KeyError here and is
                    # counted as an error below.
                    label2text = label_infos[path.name.split('.')[0]]
                    # Invert label -> value into value -> label; if two labels
                    # share a value, the one iterated last wins.
                    text2label = {
                        v['value']: {'label': k, 'nums': v['nums']}
                        for k, v in label2text.items()
                    }
                    features = extract_features_ae_task(html, text2label, config)
                    dir_name = proc_path / domain / path.parent.name
                    dir_name.mkdir(parents=True, exist_ok=True)
                    with open(dir_name / path.with_suffix(".pkl").name, 'wb') as f:
                        pickle.dump(features, f)
                except Exception as e:
                    # Best-effort batch job: report the failure, remember the
                    # page and carry on with the remaining files.
                    print(e)
                    errors.append(path)
            # Printed per website, next to where `errors` is reset.
            print(f"Total errors: {len(errors)}")
if __name__ == '__main__':
    # Command-line entry point: parse the options, then hand them to the
    # preprocessing routine selected by --task.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--task',
        type=str,
        default='attr_extract',
        help='preprocess data for tasks',
        choices=['domlm', 'attr_extract'],
    )
    parser.add_argument(
        '--input_dir',
        type=str,
        default='data/swde_html/sourceCode/sourceCode',
        help='data directory',
    )
    parser.add_argument(
        '--config',
        type=str,
        default='domlm-config/config.json',
        help='config file',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='data/swde_ae_preprocessed',
        help='output directory',
    )
    parser.add_argument('--domains', type=str, default='university', help='domains')
    args = parser.parse_args()

    task = args.task
    input_dir = args.input_dir
    config = args.config
    output_dir = args.output_dir
    # --domains is a comma-separated list of domain names.
    domains = args.domains.split(',')

    # Task name -> preprocessing routine; an unknown task does nothing.
    runners = {
        'domlm': preprocess_swde,
        'attr_extract': preprocess_swde_attr_extract,
    }
    runner = runners.get(task)
    if runner is not None:
        runner(input_dir, config, output_dir, domains)