-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathGetFeature31day.py
324 lines (280 loc) · 13.6 KB
/
GetFeature31day.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/python env
#-*- coding: utf-8 -*-
# 由TrainModel更改过来,目的是提取31days的特征。
import MySQLdb
import MySQLdb.cursors
import numpy as np
import string
from ObtainPredict import PredictEmption
__author__ = 'Huang yi'
transform_dataformat_sql = '''INSERT INTO 31day_train_user
(user_id,item_id,behavior_type,user_geohash,item_catagory, daydiff)
SELECT user_id,item_id,behavior_type,user_geohash,item_catagory,DATEDIFF(time, '2014-11-18') AS daydiff
FROM train_user'''
merge_look_sql = '''INSERT INTO 31day_user_look
(user_id, item_id, look)
SELECT user_id, item_id, group_concat(daydiff) as look
FROM 31day_train_user WHERE behavior_type=1 GROUP BY user_id, item_id '''
merge_store_sql = '''INSERT INTO 31day_user_store
(user_id, item_id, store)
SELECT user_id, item_id, group_concat(daydiff) as store
FROM 31day_train_user WHERE behavior_type=2 GROUP BY user_id, item_id '''
merge_cart_sql = '''INSERT INTO 31day_user_cart
(user_id, item_id, cart)
SELECT user_id, item_id, group_concat(daydiff) as cart
FROM 31day_train_user WHERE behavior_type=3 GROUP BY user_id, item_id '''
merge_buy_sql = '''INSERT INTO 31day_user_buy
(user_id, item_id, buy)
SELECT user_id, item_id, group_concat(daydiff) as buy
FROM 31day_train_user WHERE behavior_type=4 GROUP BY user_id, item_id '''
#四个表合并太慢,先建立索引。这里应该用outer join,但mysql没有。后面再补充。
merge_all_behavoir = '''INSERT INTO 31day_user_features
(user_id, item_id, look, store, cart, buy)
SELECT 31day_user_look.user_id, 31day_user_look.item_id,
31day_user_look.look, 31day_user_store.store, 31day_user_cart.cart, 31day_user_buy.buy
FROM 31day_user_look
LEFT join 31day_user_store ON 31day_user_look.user_id = 31day_user_store.user_id AND
31day_user_look.item_id = 31day_user_store.item_id
LEFT join 31day_user_cart ON 31day_user_look.user_id = 31day_user_cart.user_id AND
31day_user_look.item_id = 31day_user_cart.item_id
LEFT join 31day_user_buy ON 31day_user_look.user_id = 31day_user_buy.user_id AND
31day_user_look.item_id = 31day_user_buy.item_id'''
class TrainModel(object):
def __init__(self):
self.db = MySQLdb.connect("localhost", "root", "199194", "tianchi",
cursorclass = MySQLdb.cursors.DictCursor)
self.cursor = self.db.cursor()
def MergeData(self):
try:
# self.cursor.execute(transform_dataformat_sql)
# self.db.commit()
# self.cursor.execute(merge_look_sql)
# self.db.commit()
# self.cursor.execute(merge_store_sql)
# self.db.commit()
# self.cursor.execute(merge_cart_sql)
# self.db.commit()
# self.cursor.execute(merge_buy_sql)
# self.db.commit()
# 执行前先建立索引
self.cursor.execute(merge_all_behavoir)
self.db.commit()
except MySQLdb.Error,e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])
self.db.rollback()
self.db.close
#特性:一个内四种行为次数 + 总和
def ExtractMonthlyBehavior(self, user_id, item_id):
self.cursor.execute('SELECT look, store, cart, buy FROM 31day_user_features WHERE user_id=%s AND item_id=%s', (user_id, item_id))
str = self.cursor.fetchall()
if not str:
raise TypeError('Database NULL!')
look_times = len(str[0]['look'].split(',')) if str[0]['look'] else 0
store_times = len(str[0]['store'].split(',')) if str[0]['store'] else 0
cart_times = len(str[0]['cart'].split(',')) if str[0]['cart'] else 0
buy_times = len(str[0]['buy'].split(',')) if str[0]['buy'] else 0
all_times = look_times + store_times + cart_times + buy_times
feature = (look_times, store_times, cart_times, buy_times, all_times )
return feature
#特性:最后四天四种行为次数 + 总和
def ExtractLastdaysBehavior(self, user_id, item_id):
self.cursor.execute('SELECT look, store, cart, buy FROM 31day_user_features WHERE user_id=%s AND item_id=%s', (user_id, item_id))
str = self.cursor.fetchall()
if not str:
raise TypeError('Database NULL!')
look_times = store_times = cart_times = buy_times = 0
look_date = str[0]['look'].split(',') if str[0]['look'] else 0
store_date = str[0]['store'].split(',') if str[0]['store'] else 0
cart_date = str[0]['cart'].split(',') if str[0]['cart'] else 0
buy_date = str[0]['buy'].split(',') if str[0]['buy'] else 0
if look_date:
for l in look_date:
if l and int(l) >= 26:
look_times += 1
if store_date:
for l in store_date:
if l and int(l) >= 26:
store_times += 1
if cart_date:
for l in cart_date:
if l and int(l) >= 26:
cart_times += 1
if buy_date:
for l in buy_date:
if l and int(l) >= 26:
buy_times += 1
all_times = look_times + store_times + cart_times + buy_times
feature = (look_times, store_times, cart_times, buy_times, all_times)
return feature
#特征:最后一次行为离分割点的间隔
def ExtractLRUsed(self, user_id, item_id):
self.cursor.execute(' SELECT max(daydiff) FROM 31day_train_user WHERE user_id=%s AND item_id=%s', (user_id, item_id))
LRU = self.cursor.fetchall()
LRUnumber = LRU[0]['max(daydiff)'] if LRU else 0
feature = int(LRUnumber)
return feature
# 特征: 最近一天收藏,加购的次数(基于有购物倾向的策略)
def ExtractLast1Trend(self, user_id, item_id):
self.cursor.execute('SELECT store, cart FROM 31day_user_features WHERE user_id=%s AND item_id=%s', (user_id, item_id))
str_all = self.cursor.fetchall()
if not str_all:
raise TypeError('Database NULL!')
store_times = cart_times = 0
store_date = str_all[0]['store'].split(',') if str_all[0]['store'] else 0
cart_date = str_all[0]['cart'].split(',') if str_all[0]['cart'] else 0
if store_date:
for l in store_date:
if l and int(l) == 29:
store_times += 1
if cart_date:
for l in cart_date:
if l and int(l) == 29:
cart_times += 1
feature = (store_times, cart_times)
return feature
# 特征: 最近三天查看,收藏,加购的次数(基于有购物倾向的策略)
def ExtractLast3Trend(self, user_id, item_id):
self.cursor.execute('SELECT look, store, cart FROM 31day_user_features WHERE user_id=%s AND item_id=%s', (user_id, item_id))
str_all = self.cursor.fetchall()
if not str_all:
raise TypeError('Database NULL!')
look_times=store_times = cart_times = 0
look_date = str_all[0]['look'].split(',') if str_all[0]['look'] else 0
store_date = str_all[0]['store'].split(',') if str_all[0]['store'] else 0
cart_date = str_all[0]['cart'].split(',') if str_all[0]['cart'] else 0
if look_date:
for l in look_date:
if l and int(l) >= 27:
look_times += 1
if store_date:
for l in store_date:
if l and int(l) == 27:
store_times += 1
if cart_date:
for l in cart_date:
if l and int(l) == 27:
cart_times += 1
feature = (look_times, store_times, cart_times)
return feature
# 特征: 最近一天,三天,七天购买的次数(基于买过不会再买的策略)
def ExtractLast7Buy(self, user_id, item_id):
self.cursor.execute('SELECT buy FROM 31day_user_features WHERE user_id=%s AND item_id=%s', (user_id, item_id))
str_all = self.cursor.fetchall()
if not str_all:
raise TypeError('Database NULL!')
buy_times_1day = buy_times_3day = buy_times_7day = 0
buy_date = str_all[0]['buy'].split(',') if str_all[0]['buy'] else 0
if buy_date:
for l in buy_date:
if l and int(l) == 29:
buy_times_1day += 1
if l and int(l) >= 27:
buy_times_3day += 1
if l and int(l) >= 23:
buy_times_7day += 1
feature = (buy_times_1day, buy_times_3day, buy_times_7day)
return feature
def MergeFeatures(self):
fop = open('31day_data_features.txt', 'a')
self.cursor.execute('SELECT user_id, item_id FROM 31day_user_features')
record = self.cursor.fetchall()
for rec in record:
userid = rec['user_id']
itemid = rec['item_id']
# 融合所有特征 19D
feature0 = self.ExtractMonthlyBehavior(userid, itemid) # 5D
feature1 = self.ExtractLastdaysBehavior(userid, itemid) # 5D
feature2 = self.ExtractLRUsed(userid, itemid) # 1D
feature3 = self.ExtractLast1Trend(userid, itemid) # 2D
feature4 = self.ExtractLast3Trend(userid, itemid) # 3D
feature5 = self.ExtractLast7Buy(userid, itemid) # 3D
features = feature0, feature1, feature2, feature3, feature4, feature5
line = features[0] + features[1] + (features[2],) + features[3] + feature4 + feature5
fop.write('%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d' % (line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],line[8],line[9],
line[10],line[11],line[12],line[13],line[14],line[15],line[16],line[17],line[18]))
fop.write('\n')
fop.close()
def Transform2Matrix(self):
# 第一次加载重写txt文件到numpy的格式npy中
predict_data = np.loadtxt('31day_data_features.txt')
np.save('31day_data_features.npy',predict_data)
# 用31day的31daySet(即所有457万数据的11维特征)得到predict购买的list
def PerformPredict(self):
fop = open('predict_all_pairs.txt', 'w+')
train_set = np.load('TrainSet.npy') # 得模型
predict_data = np.load('31day_data_features.npy') # 31天的特征
train_data = train_set[:,0:-1] # 去除label后的特征数据
train_data_label = train_set[:,-1] # label
best_th = 0.024 # 前面计算出来的
PE = PredictEmption()
predict_labels, predict_proba = PE.TrainAndPredict(train_data, train_data_label, predict_data)
index = np.array(range(0,len(predict_proba)))
index_predict = index[predict_proba > best_th] # 大于阈值对应的索引
index_predict = list(index_predict) # 得到的对应31daySet序号的购买index
# 用一个txt文件lines装带有user_id和item_id。和31daySet顺序对应。
self.cursor.execute('SELECT user_id, item_id FROM 31day_user_features')
lines = self.cursor.fetchall()
for i in index_predict:
fop.write('%d,%d' % (lines[i]['user_id'],lines[i]['item_id'])) # 将预测购买的user_id和item_id写入文件
fop.write('\n')
fop.close()
# 用给的商品子集来过滤
def FilterByItems(self):
fop = open('filter_pairs.txt', 'w+')
all_pairs = open('predict_all_pairs.txt', 'r')
lines = []
items = [] # 保存train_id中出现在 预测中的item_id
# 子集train_item过滤
while True:
myString = all_pairs.readline()
if myString:
# 把predict_all_pairs.txt变为方便解析的list
myList = map(string.strip, myString.split(',')) # 去除回车
lines.append(myList) # list格式的pairs
# 读取文件完毕后开始处理
else:
all = np.array(lines) # 矩阵格式的pairs
self.cursor.execute('SELECT item_id FROM train_item')
record = self.cursor.fetchall()
for rec in record:
itemid = rec['item_id']
# 注意两者的类型,all是字符串类型,itemid是long类型
if str(itemid) in all[:,1]:
items.append(itemid)
for it in items:
for pair in lines:
if str(pair[1]) == str(it):
fop.write('%d,%d' % (int(pair[0]),int(pair[1])) )
fop.write('\n')
fop.close()
return 0
# 结果去重复
def RemoveDuplicate(self):
fop1 = open('filter_pairs.txt', 'r')
fop2 = open('remove_pairs.txt', 'w+')
lines = []
# filter_pairs去重
while True:
myString = fop1.readline()
if myString:
myList = map(string.strip, myString.split(',')) # 去除回车
lines.append(myList)
else:
lines_remove = []
for it in lines:
if it not in lines_remove:
lines_remove.append(it)
for it in lines_remove:
fop2.write('%d,%d' % (int(it[0]),int(it[1])))
fop2.write('\n')
fop1.close()
fop2.close()
return 0
if __name__ == '__main__':
model = TrainModel()
# model.MergeData()
# model.MergeFeatures() # 这个很耗时间
# model.Transform2Matrix()
model.PerformPredict()
model.FilterByItems()
model.RemoveDuplicate()