# standard library
from functools import partial

# third party
import torch
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

# first party
from utils import prepare_train_features, prepare_validation_features


def load_squad_data(tokenizer):  # noqa
    """
    Load the SQuAD v2 dataset from Hugging Face 🤗 and tokenize it into
    training and validation features.
    """
    dataset = load_dataset("squad_v2")

    # Only the first 100 validation examples are used
    raw_validation_data = dataset["validation"].select(range(100))

    # Tokenization settings
    max_length = 384
    doc_stride = 128

    # Set up partial function - training features
    training_features = partial(
        prepare_train_features,
        tokenizer=tokenizer,
        max_length=max_length,
        doc_stride=doc_stride,
    )

    # Set up partial function - validation features
    validation_features = partial(
        prepare_validation_features,
        tokenizer=tokenizer,
        max_length=max_length,
        doc_stride=doc_stride,
    )

    # Create the training dataset (only the first 100 examples)
    training_data = (
        dataset["train"]
        .select(range(100))
        .map(
            training_features,
            batched=True,
            remove_columns=dataset["train"].column_names,
        )
    )

    # Create the validation dataset
    validation_data = raw_validation_data.map(
        validation_features,
        batched=True,
        remove_columns=dataset["validation"].column_names,
    )

    # Bundle the raw and tokenized validation data so both are
    # available for evaluation
    validation_datasets = {
        "raw_validation_data": raw_validation_data,
        "validation_data": validation_data,
    }

    return training_data, validation_datasets
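
# Illustrative usage of load_squad_data (a sketch, not part of this module;
# the model name is a placeholder):
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#     training_data, validation_datasets = load_squad_data(tokenizer)
#     print(training_data.column_names)

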
def load_examples(
    data_dir, data_file, tokenizer, evaluate=False, output_examples=False
):
    """
    Build a PyTorch TensorDataset of SQuAD v2 features from a local JSON
    file using the transformers SquadV2Processor.
    """
    processor = SquadV2Processor()

    if evaluate:
        examples = processor.get_dev_examples(data_dir, filename=data_file)
    else:
        examples = processor.get_train_examples(data_dir, filename=data_file)

    # Convert the raw examples into padded, tokenized features and a
    # PyTorch TensorDataset
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=not evaluate,
        return_dataset="pt",
        threads=1,
    )

    if output_examples:
        return dataset, examples, features
    return dataset
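
# Illustrative usage of load_examples (a sketch; the directory and file
# names below are placeholders, and the SQuAD v2 JSON files must exist
# locally for the calls to succeed):
#
#     train_dataset = load_examples("data", "train-v2.0.json", tokenizer)
#     eval_dataset, examples, features = load_examples(
#         "data", "dev-v2.0.json", tokenizer, evaluate=True, output_examples=True
#     )

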
class SquadDataset(Dataset):
    """
    Dataset that wraps the token encodings built from the SQuAD v2 data.
    """

    def __init__(self, data, mode="training"):  # noqa
        self.data = data
        self.input_ids = data[0].tolist()
        self.attention_mask = data[1].tolist()
        self.mode = mode

        # In training mode we also need the answer span, so extract
        # the start and end positions
        if self.mode == "training":
            self.start_positions = data[3].tolist()
            self.end_positions = data[4].tolist()

    def __len__(self):  # noqa
        return len(self.input_ids)

    def __getitem__(self, idx):  # noqa
        # Convert the stored lists back into torch tensors
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)

        # Gather outputs
        outputs = {"input_ids": input_ids, "attention_mask": attention_mask}

        if self.mode == "training":
            start_positions = torch.tensor(self.start_positions[idx], dtype=torch.long)
            end_positions = torch.tensor(self.end_positions[idx], dtype=torch.long)
            outputs["start_positions"] = start_positions
            outputs["end_positions"] = end_positions

        return outputs
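

if __name__ == "__main__":
    # Minimal end-to-end sketch (an assumption about how these pieces fit
    # together, not code taken from this repo): tokenize a local SQuAD v2
    # file with load_examples, wrap the resulting tensors in SquadDataset,
    # and batch them with a DataLoader. The model name and file paths are
    # placeholders, and the JSON file must exist locally.
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tensor_dataset = load_examples("data", "train-v2.0.json", tokenizer)

    # SquadDataset indexes positionally into a tuple of tensors, so pass the
    # TensorDataset's underlying tensors rather than the dataset itself
    # (assumed usage; indices 3 and 4 hold the start/end positions when
    # the features were built with is_training=True).
    train_dataset = SquadDataset(tensor_dataset.tensors, mode="training")
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    batch = next(iter(train_loader))
    print({name: tensor.shape for name, tensor in batch.items()})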