-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcontentTensor.py
73 lines (56 loc) · 2.34 KB
/
contentTensor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
dataPath = os.path.join(os.getcwd(),'datasets','CollabData1')
productsDataFile = os.path.join(dataPath,'amazonSample1.csv')
df = pd.read_csv(productsDataFile, low_memory=False, encoding='ISO-8859-1')
df = df[['unique','product','brand','category','description']]
df = df.fillna('')
def recommend(item):
print("For product", item , ":")
item_df = df[df['product'].str.contains(item)].drop_duplicates()
cate = item_df['category'].iloc[0]
# print(type(cate))
new_brand = item_df['brand'].iloc[0]
cate_array = str(cate).split('>')
if len(cate_array) > 3:
sub_cate = cate_array[0:-2]
new_cate = '>'.join(sub_cate)
elif len(cate_array) >1:
new_cate = cate_array[0] + ">" + cate_array[1]
else:
new_cate = cate_array[0]
new_df = df[df['category'].str.contains(new_cate)].drop_duplicates()
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(new_df['description'])
matrix = matrix.toarray()
#TensorFlow
a = tf.placeholder(tf.float32, shape=matrix.shape, name = 'a')
b = tf.placeholder(tf.float32, shape=matrix.T.shape, name = 'b')
normal_a = tf.nn.l2_normalize(a,0)
normal_b = tf.nn.l2_normalize(b,0)
cos_sim = (tf.matmul(normal_a, normal_b))
sess = tf.Session()
cos_score = sess.run(cos_sim, feed_dict={a:matrix, b:matrix.T})
sess.close()
indices = pd.Series(new_df.index, index=new_df['product']).drop_duplicates()
idx = indices[item]
sim_scores = list(enumerate(cos_score[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
item_indices = [i[0] for i in sim_scores]
new_df = new_df.iloc[item_indices]
df_brand = new_df.loc[new_df['brand'] == new_brand]
df_non_brand = new_df.loc[new_df['brand'] != new_brand]
return df_brand[:5], df_non_brand[:5]
u3_brand, u3_non_brand = recommend('Hornby 2014 Catalogue')
print("\n The recommended products are:\n\n")
print("Similar product by the same brand\n")
for i in range(u3_brand.shape[0]):
print("Product: ",u3_brand['product'].iloc[i])
print("\nSimilar Products\n")
for i in range(u3_non_brand.shape[0]):
print('Product: ',u3_non_brand['product'].iloc[i])
#os.remove('output.csv')
#u3.to_csv('output.csv')