try.py
# Fine-tune GPT-2 on a plain-text corpus with the Hugging Face Trainer API.
from transformers import AutoModelForCausalLM, AutoTokenizer

# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the current
# auto class for GPT-2-style causal language models.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Train and evaluate on the same file here; point these at separate files in practice.
train_path = './test.txt'
test_path = './test.txt'
# train_path = '/home/workspace/gpt2-fine-tuning/test.txt'
# test_path = '/home/workspace/gpt2-fine-tuning/test.txt'
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
    # TextDataset concatenates the file and splits it into 128-token blocks.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128)
    # mlm=False selects causal language modeling (GPT-2 is not a masked LM).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)
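
# TextDataset is deprecated in recent transformers releases in favor of the
# `datasets` library. A rough equivalent (a sketch under assumptions: per-line
# tokenization rather than TextDataset's 128-token block packing) would be:
#
#   from datasets import load_dataset as hf_load_dataset
#   raw = hf_load_dataset("text", data_files={"train": train_path, "test": test_path})
#   def tokenize(batch):
#       return tokenizer(batch["text"], truncation=True, max_length=128)
#   tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])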
from transformers import Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./result",          # the output directory
    overwrite_output_dir=True,      # overwrite the content of the output directory
    num_train_epochs=3,             # number of training epochs
    per_device_train_batch_size=1,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    eval_steps=400,                 # update steps between two evaluations
    save_steps=800,                 # save a checkpoint every 800 update steps
    warmup_steps=500,               # warmup steps for the learning rate scheduler
)
# Note: in recent transformers releases, eval_steps only takes effect when
# evaluation_strategy="steps" is also set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # prediction_loss_only=True,
)

# Uncomment to train and persist the fine-tuned model and tokenizer:
# trainer.train()  # pass resume_from_checkpoint="result" to resume from a saved checkpoint
# trainer.save_model("result")
# tokenizer.save_pretrained("result")
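
# A minimal next-token sampling sketch, following the transformers documentation
# example for top_k_top_p_filtering (exported by older transformers releases;
# newer ones use logits processors instead). The prompt text is arbitrary.
import torch
from torch.nn import functional as F
from transformers import top_k_top_p_filtering

prompt_ids = tokenizer.encode("Hello, my dog is", return_tensors="pt")
with torch.no_grad():
    next_token_logits = model(prompt_ids).logits[:, -1, :]  # logits at the last position
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=0.95)
probs = F.softmax(filtered_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # sample one token id
print(tokenizer.decode(torch.cat([prompt_ids, next_token], dim=-1)[0]))

# A minimal Flask serving sketch; the /generate route and JSON payload shape
# are assumptions for illustration, not an API defined by this script.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/generate", methods=["POST"])
def generate():
    prompt = request.json.get("text", "")
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(
        input_ids, max_length=50, do_sample=True, top_k=50, top_p=0.95)
    return jsonify({"generated": tokenizer.decode(output_ids[0], skip_special_tokens=True)})

# app.run(host="0.0.0.0", port=5000)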