# sample.py (forked from Sundrops/video-caption.pytorch)
import json
import os
import argparse
import skvideo.io
import torch
import misc.utils as utils
import PIL
from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel
from dataloader import VideoDataset
from pretrainedmodels import utils as ptm_utils
from process_features import process_batches, create_batches
from models.ConvS2VT import ConvS2VT


def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    # Beam search decoding is only supported with a batch size of 1.
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # Wrap the caption model with a pretrained CNN so raw frames can be fed
    # end-to-end: frames -> convnet features -> S2VT decoder -> word ids.
    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            # bp ---
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

        for sent in sents:
            print(sent)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('videos', nargs='+',
                        help='space delimited list of video paths')
    parser.add_argument('--recover_opt', type=str, required=True,
                        help='recover train opts from saved opt_json')
    parser.add_argument('--saved_model', type=str, default='',
                        help='path to saved model to evaluate')
    # parser.add_argument('--rnn_type', type=str, default='gru', help='lstm or gru')
    parser.add_argument('--dump_json', type=int, default=1,
                        help='Dump json with predictions into vis folder? (1=yes, 0=no)')
    parser.add_argument('--results_path', type=str, default='results/')
    parser.add_argument('--dump_path', type=int, default=0,
                        help='Write image paths along with predictions into vis json? (1=yes, 0=no)')
    parser.add_argument('--gpu', type=str, default='0',
                        help='gpu device number')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='minibatch size')
    parser.add_argument('--sample_max', type=int, default=1,
                        help='0/1. whether to pick the max-probability word at each step during inference')
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--beam_size', type=int, default=1,
                        help='used when sample_max = 1. Usually 2 or 3 works well.')

    args = parser.parse_args()
    args = vars(args)

    # Recover the saved training options and overlay the command-line arguments on top.
    with open(args["recover_opt"]) as f:
        opt = json.load(f)
    for k, v in args.items():
        opt[k] = v

    os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"]
    main(opt)
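
# Example invocation (a minimal sketch; the video path, options json and
# checkpoint below are placeholders, not files shipped with this repo):
#
#   python sample.py /path/to/video.mp4 \
#       --recover_opt /path/to/opt_info.json \
#       --saved_model /path/to/model.pth \
#       --batch_size 1 --beam_size 1
#
# --recover_opt should point at the options json saved during training, since
# main() rebuilds the encoder/decoder from those hyperparameters before
# captioning each listed video.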