sanguo.py
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2TokenizerFast,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
# directory where the trained tokenizer and model are saved
SAVE_PATH = "./sanguo"
# build a byte-level BPE tokenizer from scratch
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
trainer = BpeTrainer(vocab_size=50000, show_progress=True,
                     initial_alphabet=ByteLevel.alphabet(),
                     special_tokens=special_tokens)
files = ["text/sanguoyanyi.txt"]
tokenizer.train(files, trainer)
newtokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)
newtokenizer.save_pretrained(SAVE_PATH)
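# Optional sanity check (a quick sketch, not required for training): round-trip
# a short phrase to confirm the byte-level BPE tokenizer encodes and decodes
# Chinese text as expected.
sample_ids = newtokenizer.encode("三国演义")
print(sample_ids, newtokenizer.decode(sample_ids))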
# reload the trained tokenizer through the transformers API and register the special tokens
tokenizer = GPT2Tokenizer.from_pretrained(SAVE_PATH)
tokenizer.add_special_tokens({"eos_token": "</s>", "bos_token": "<s>",
                              "unk_token": "<unk>", "pad_token": "<pad>",
                              "mask_token": "<mask>"})
# create the model configuration from the trained tokenizer
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = GPT2LMHeadModel(config)
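# Optional size check: with a ~50k vocabulary and the default GPT-2 dimensions
# this model has on the order of 100M trainable parameters.
print(f"model parameters: {model.num_parameters():,}")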
# build the training dataset: one example per line of the novel
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./text/sanguoyanyi.txt",
    block_size=32,
)
# causal language modeling objective, so no token masking is applied
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)
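# Optional sketch: collate two examples to confirm that, with mlm=False, the
# collator pads the batch and builds labels from input_ids for causal LM
# training (padding positions are set to -100).
sample_batch = data_collator([dataset[i] for i in range(2)])
print(sample_batch["input_ids"].shape, sample_batch["labels"].shape)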
# training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    save_steps=2000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
model.save_pretrained(SAVE_PATH)
# test the model: generate a short continuation of the prompt "吕布" (Lü Bu)
generator = pipeline('text-generation', model=SAVE_PATH)
set_seed(13)
txt = generator("吕布", max_length=10)
print(txt)
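# Alternative inference path (a sketch under the same save layout): reload the
# saved weights and tokenizer explicitly and call generate() directly.
model = GPT2LMHeadModel.from_pretrained(SAVE_PATH)
tokenizer = GPT2Tokenizer.from_pretrained(SAVE_PATH)
inputs = tokenizer("吕布", return_tensors="pt")
outputs = model.generate(**inputs, max_length=10, do_sample=True)
print(tokenizer.decode(outputs[0]))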