-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoy_precomputation.py
103 lines (86 loc) · 3.08 KB
/
toy_precomputation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import math
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import json
# Load the pre-scraped artist table. The CSV has no header row, so every
# column is addressed by position throughout this script.
with open('prescraped/artist_result.csv') as csv_file:
    table = pd.read_csv(csv_file, header=None)

# Split rows on column 4 at a threshold of 65 (presumably a popularity
# score -- confirm against the scraper). Rows whose column 4 is missing
# satisfy neither comparison and end up in neither frame.
popular = table[table.iloc[:, 4] >= 65]
candidates = table[table.iloc[:, 4] < 65]

# Column 0 holds the artist id; keep both id collections as sets for
# constant-time membership tests later on.
popular_ids = set(popular.iloc[:, 0])
candidates_ids = set(candidates.iloc[:, 0])
# Positions of the feature columns used as each artist's vector: every
# second column starting at 5. (Named "means" by the original author;
# presumably the interleaved columns hold a second statistic -- confirm
# against the scraper output.)
means_cols = list(range(5, table.shape[1], 2))

# Characters removed from the stringified genre list before splitting.
_GENRE_LIST_PUNCTUATION = str.maketrans('', '', "[]'")

# artist_info maps artist id (column 0) to a JSON-serialisable record.
# genres collects every distinct genre string seen in the table.
artist_info = {}
genres = set()
for _, row in table.iterrows():
    means = [row.iloc[col] for col in means_cols]

    # Column 2 is a list rendered as text, e.g. "['pop', 'rock']".
    # Strip the brackets and single quotes, split on commas, and drop
    # empty fragments so that an empty list "[]" produces no genres
    # rather than a single '' entry.
    # NOTE(review): double quotes are not stripped, so a genre that was
    # written with double quotes would keep them -- confirm the CSV format.
    genre_text = row.iloc[2].translate(_GENRE_LIST_PUNCTUATION)
    artist_genres = [
        piece.strip() for piece in genre_text.split(',') if piece.strip()
    ]
    genres.update(artist_genres)

    artist_info[row.iloc[0]] = {
        'name': row.iloc[1],
        'followers': int(row.iloc[3]),
        'means': means,
        'genres': artist_genres,
    }
# Feature matrix: one row per artist, restricted to the selected columns.
data_means = table.iloc[:, means_cols]

# Use half as many clusters as there are popular artists (rounded down).
num_clust = popular.shape[0] // 2

# No random_state is passed, so cluster assignments may differ from one
# run to the next.
means_clusters = KMeans(n_clusters=num_clust, init='k-means++').fit(data_means)

# labels_ is aligned positionally with the rows of data_means, so walk
# the id column and the labels in lockstep. .item() converts the NumPy
# integer to a plain int so the record stays JSON-serialisable.
for artist_id, label in zip(table.iloc[:, 0], means_clusters.labels_):
    artist_info[artist_id]['cluster'] = label.item()
# Two-column frame (artist id, cluster label) built in a single call.
# The previous row-by-row DataFrame.append was quadratic and no longer
# exists in pandas 2.0+.
df_artists_clusters = pd.DataFrame(
    [(artist, info['cluster']) for artist, info in artist_info.items()],
    columns=['id', 'cluster'],
)

# Group by a scalar key (not a one-element list) so that group keys are
# plain cluster labels and get_group(<label>) is valid on every pandas
# version.
clusters_groups = df_artists_clusters.groupby('cluster')
# popular_candidates maps each popular artist id to the ids of the
# non-popular artists that landed in the same cluster.
popular_candidates = {pid: [] for pid in popular.iloc[:, 0]}

# Iterate over the groups that actually exist instead of looking up every
# label in range(num_clust): a label with no members would make
# get_group() raise KeyError. The group key itself is not needed.
for _, members in clusters_groups:
    cluster_popular = []
    cluster_candidates = []
    for artist_id in members['id']:
        if artist_id in popular_ids:
            cluster_popular.append(artist_id)
        elif artist_id in candidates_ids:
            cluster_candidates.append(artist_id)
        else:
            # The id is in neither set (e.g. its column-4 value was
            # missing when the table was split).
            print('neither')

    # Every popular artist in the cluster gets every candidate in it.
    for pid in cluster_popular:
        popular_candidates[pid].extend(cluster_candidates)
# candidates_scores maps each popular artist id to a list of
# (candidate id, score) pairs, where
#     score = distance * (1 - log(candidate followers) / log(popular followers))
# and distance is the Euclidean norm of the difference between the two
# artists' feature vectors.
candidates_scores = {}
for pid, cluster_candidate_ids in popular_candidates.items():
    # Values that depend only on the popular artist are computed once.
    popular_means = np.array(artist_info[pid]['means'])
    pf = artist_info[pid]['followers']

    scored = []
    for cid in cluster_candidate_ids:
        distance = np.linalg.norm(
            popular_means - np.array(artist_info[cid]['means'])
        ).item()
        cf = artist_info[cid]['followers']

        # log(0) is undefined and log(1) == 0, so the ratio is only
        # computed when both follower counts exceed 1. For the candidate
        # this matches the existing results (cf == 0 was already mapped
        # to 0, and cf == 1 gives log(1) == 0). For the popular artist it
        # replaces a ZeroDivisionError / math domain error with the same
        # "popularity 0" fallback.
        if cf <= 1 or pf <= 1:
            popularity = 0
        else:
            popularity = math.log(cf) / math.log(pf)

        novelty = 1 - popularity
        scored.append((cid, distance * novelty))

    candidates_scores[pid] = scored
# Reverse index from artist name to every artist id carrying that name
# (several artists can share one name).
name_find = {}
for artist_id, info in artist_info.items():
    name_find.setdefault(info['name'], []).append(artist_id)

# Write the three precomputed structures as JSON, in this order.
for out_path, payload in (
    ('precomputed/name_find.json', name_find),
    ('precomputed/artist_info.json', artist_info),
    ('precomputed/candidates_scores.json', candidates_scores),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)