-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathSimHash.js
121 lines (103 loc) · 3.08 KB
/
SimHash.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/**
* Simhash class. Creates a 32-bit simhash.
*
* // Usage:
* var hash = Simhash.of("This is a test");
*
* // Override default values
* var simhash = new Simhash();
* var hash = simhash.of("This is a test", {
* kshingles: 2,
* maxFeatures: 32
* });
*/
var Jenkins = require('./Jenkins.js').Jenkins;
class SimHash {
constructor(options) {
/**
* By default, we tokenize input into chunks of this size.
*/
this.kshingles = typeof(options) != 'undefined' && typeof(options['kshingles']) != 'undefined' ? options['kshingles'] : 4;
/**
* By default, this many number of minimum shingles will
* be combined to create the final hash.
*/
this.maxFeatures = typeof(options) != 'undefined' && typeof(options['maxFeatures']) != 'undefined' ? options['maxFeatures'] : 128;
}
// --------------------------------------------------
// Public access
// --------------------------------------------------
/**
* Driver function.
*/
hash(input) {
var tokens = tokenize.call(this,input);
var shingles = [];
var jenkins = new Jenkins();
for (var i in tokens) {
shingles.push(jenkins.hash32(tokens[i]));
}
var simhash = combineShingles.call(this,shingles);
simhash >>>= 0;
return simhash;
};
};
// --------------------------------------------------
// Private methods
// --------------------------------------------------
/**
* TODO: Make this private or take closure that implements
* logic to combine shingles.
*/
function combineShingles(shingles) {
if (shingles.length == 0) return;
if (shingles.length == 1) return shingles[0];
shingles.sort(hashComparator);
if (shingles.length > this.maxFeatures) shingles = shingles.splice(this.maxFeatures);
var simhash = 0x0;
var mask = 0x1;
for (var pos = 0; pos < 32; pos++) {
var weight = 0;
for (var i in shingles) {
shingle = parseInt(shingles[i], 16);
weight += !(~shingle & mask) == 1 ? 1 : -1;
}
if (weight > 0) simhash |= mask;
mask <<= 1;
}
return simhash;
};
/**
* Tokenizes input into 'kshingles' number of tokens.
*/
function tokenize(original) {
var size = original.length;
if (size <= this.kshingles) {
return [original.substr(0)];
}
var shingles = [];
for (var i = 0; i < size; i = i + this.kshingles) {
shingles.push(i + this.kshingles < size ? original.slice(i, i + this.kshingles) : original.slice(i));
}
return shingles;
};
/**
* Calculates binary hamming distance of two base 16 integers.
*/
function hammingDistanceSlow(x, y) {
var distance = 0;
var val = parseInt(x, 16) ^ parseInt(y, 16);
while (val) {
++distance;
val &= val - 1;
}
return distance;
};
/**
* TODO: Use a priority queue. Till then this comparator is
* used to find the least 'maxFeatures' shingles.
*/
function hashComparator(a, b) {
return a < b ? -1 : (a > b ? 1 : 0);
};
module.exports.SimHash = SimHash;