q_attention_module.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@time:2019/4/13 下午11:02
@author:bigmelon
"""
import tensorflow as tf
slim = tf.contrib.slim
def derive_sese_output(t, c):
    """
    Softmax-entropy squeeze: applies a softmax over all positions of each channel
    (F_l[i][c][j] in the paper) and returns one entropy value per channel.
    Shape comments below use an example input of (b, h, w, c) = (2, 1, 3, 4).
    """
    # t: (batch_size, height, width, channel) = (0, 1, 2, 3)
    t = tf.transpose(t, perm=[3, 0, 1, 2])  # (c, b, h, w) = (4, 2, 1, 3)
    t = tf.reshape(t, shape=(c, -1))  # (c, b*h*w) = (4, 6)
    # log_softmax(x) = x - log(reduce_sum(exp(x), axis))
    log_p = tf.nn.log_softmax(t, axis=1)  # (c, b*h*w) = (4, 6)
    p = tf.nn.softmax(t, axis=1)  # (c, b*h*w) = (4, 6)
    # per-channel entropy: -sum(p * log(p)) over all batch/spatial positions
    entropy = -tf.reduce_sum(p * log_p, axis=1)  # (c,) = (4,)
    entropy = tf.reshape(entropy, shape=(c, 1, 1, 1))  # (c, 1, 1, 1) = (4, 1, 1, 1)
    entropy = tf.transpose(entropy, perm=[1, 2, 3, 0])  # (1, 1, 1, c) = (1, 1, 1, 4)
    return entropy
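# A minimal shape sketch (illustrative example, not from the original repo): with a
# (2, 1, 3, 4) input, derive_sese_output returns a (1, 1, 1, 4) tensor, i.e. one
# entropy value per channel, broadcastable over the batch and spatial dimensions:
#   x = tf.random_normal([2, 1, 3, 4])
#   e = derive_sese_output(x, 4)  # shape (1, 1, 1, 4)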
def derive_sese_output_v1(t):
    """
    v1: compute the softmax and the entropy for each channel of each sample.
    For a 1-d CNN feature map of shape (b, h, w, c) the softmax only needs to run
    along axis=2 (the width axis).
    todo for a 2-d feature map within one batch, first use << reshape + softmax + reshape >>, e.g.:
        tmp = tf.reshape(arr1, shape=(-1, 4, 2))
        tmp = tf.nn.softmax(tmp, axis=1)  # softmax over the flattened spatial positions
        rs = tf.reshape(tmp, shape=(-1, 2, 2, 2))
    """
    p = tf.nn.softmax(t, axis=2)  # (b, h, w, c), softmax along the width axis only
    log_p = tf.nn.log_softmax(t, axis=2)
    # per-channel entropy: -sum(p * log(p)) over the width axis
    t = -tf.reduce_sum(tf.multiply(p, log_p), axis=2, keepdims=True)  # (b, h, 1, c)
    return t
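# A minimal shape sketch (illustrative example, not from the original repo): for a
# 1-d CNN feature map of shape (b, 1, w, c), derive_sese_output_v1 returns a
# (b, 1, 1, c) tensor, i.e. one entropy value per channel and per sample rather
# than one per channel over the whole batch:
#   x = tf.random_normal([2, 1, 3, 4])
#   e = derive_sese_output_v1(x)  # shape (2, 1, 1, 4)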
def sese_block(input_feature, name, v, ratio=8):
    """
    Implementation of the Softmax-Entropy-Squeeze-and-Excitation (SESE) block => standard sese-block.
    todo the choice of ratio seems important for this algorithm!!!!
         ratio should not be larger than the number of input channels????
    """
    kernel_initializer1 = tf.contrib.layers.variance_scaling_initializer()
    kernel_initializer2 = tf.contrib.layers.xavier_initializer()
    bias_initializer = tf.constant_initializer(value=0.0)
    with tf.variable_scope(name):
        # height = input_feature.get_shape()[1]
        # width = input_feature.get_shape()[2]
        channel = input_feature.get_shape()[-1]
        # squeeze: per-channel softmax entropy (v == 1 uses the batch-wide variant)
        entropy_fmap = derive_sese_output(input_feature, channel) if v == 1 else derive_sese_output_v1(input_feature)
        # excitation: two stacked fc layers
        # outputs = activation(inputs . kernel + bias)
        # units = channel // ratio
        # todo the shape must be fixed with tf.reshape first, otherwise tf.layers.dense raises "shape undefined"
        excitation = tf.layers.dense(inputs=tf.reshape(entropy_fmap, [-1, 1, 1, channel]),
                                     units=channel // ratio,
                                     activation=tf.nn.relu,
                                     kernel_initializer=kernel_initializer1,
                                     bias_initializer=bias_initializer,
                                     name='bottleneck_fc')
        # units = channel
        excitation = tf.layers.dense(inputs=excitation,  # (2, 1, 1, 4)
                                     units=channel,
                                     activation=tf.nn.sigmoid,
                                     kernel_initializer=kernel_initializer2,
                                     bias_initializer=bias_initializer,
                                     name='recover_fc')
        # todo step-4 F_scale op -> relies on broadcasting
        scale = input_feature * excitation  # (2, 2, 3, 4)
        # convert to float32
        try:
            scale = tf.to_float(x=scale, name='ToFloat')
        except TypeError as t:
            exit(f'[!] {str(t)}')
        return scale
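# Hedged usage note (illustrative, not from the original repo): sese_block replaces the
# global-average-pooling squeeze used by se_block below with the per-channel softmax
# entropy computed above, e.g.:
#   feat = tf.random_normal([2, 1, 64, 16])
#   out = sese_block(feat, name='sese1', v=2, ratio=8)  # out has the same shape as feat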
def se_block(input_feature, name, ratio=8):
    """
    Implementation of the Squeeze-and-Excitation block,
    as described in https://arxiv.org/abs/1709.01507.
    """
    # todo find out why kernel & bias use different initializers?
    kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
    bias_initializer = tf.constant_initializer(value=0.0)
    with tf.variable_scope(name):
        channel = input_feature.get_shape()[-1]  # size of the last dimension = channel
        # todo step-1 global average pooling: reduce_mean is more efficient!
        squeeze = tf.reduce_mean(input_feature, axis=[1, 2], keepdims=True)  # (?, h, w, c) -> (?, 1, 1, c)
        # todo step-2 two stacked fc layers
        # outputs = activation(inputs . kernel + bias)
        # units = channel // ratio => excitation = 1*1*(channel // ratio)
        excitation = tf.layers.dense(inputs=squeeze,
                                     units=channel // ratio,
                                     activation=tf.nn.relu,
                                     kernel_initializer=kernel_initializer,
                                     bias_initializer=bias_initializer,
                                     name='bottleneck_fc')
        # units = channel => excitation = 1*1*channel
        excitation = tf.layers.dense(inputs=excitation,
                                     units=channel,
                                     activation=tf.nn.sigmoid,
                                     kernel_initializer=kernel_initializer,
                                     bias_initializer=bias_initializer,
                                     name='recover_fc')
        # todo after tf.nn.sigmoid the values in excitation lie in the range 0-1
        # step-3 scale: (b,1,w,c) * (b,1,1,c) broadcasts automatically, which is equivalent to
        # (b,1,w,c) * (b,1,w,c), i.e. every feature map is re-weighted by its channel weight
        scale = input_feature * excitation
        # convert to float32
        try:
            scale = tf.to_float(x=scale, name='ToFloat')
        except TypeError as t:
            exit(f'[!] {str(t)}')
        return scale
# For reference: this appears to be the signature of tf.contrib.layers.fully_connected (slim):
# def fully_connected(inputs,
# num_outputs,
# activation_fn=nn.relu,
# normalizer_fn=None,
# normalizer_params=None,
# weights_initializer=initializers.xavier_initializer(),
# weights_regularizer=None,
# biases_initializer=init_ops.zeros_initializer(),
# biases_regularizer=None,
# reuse=None,
# variables_collections=None,
# outputs_collections=None,
# trainable=True,
# scope=None):
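# A minimal usage sketch, not part of the original module: the shapes, scope names and the
# dummy placeholder below are illustrative assumptions for a 1-d CNN feature map.
if __name__ == '__main__':
    dummy = tf.placeholder(tf.float32, shape=[None, 1, 64, 16], name='dummy_input')
    se_out = se_block(dummy, name='se_demo', ratio=8)  # -> (?, 1, 64, 16)
    sese_out = sese_block(dummy, name='sese_demo', v=2, ratio=8)  # -> (?, 1, 64, 16)
    print(se_out.get_shape(), sese_out.get_shape())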