From 4d26cbc7de7944fdb8bd65b6891fcd4c438bc847 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 4 Feb 2024 13:51:31 -0500 Subject: [PATCH] format --- benchmarks/speedtesting.cpp | 170 +++++------ examples/example.cpp | 77 +++-- examples/example2.cpp | 50 ++-- examples/example3.cpp | 53 ++-- examples/example4.cpp | 56 ++-- examples/example5.cpp | 96 +++---- examples/example6.cpp | 122 ++++---- examples/example64bits.cpp | 50 ++-- include/adler32.h | 73 +++-- include/characterhash.h | 107 ++++--- include/cyclichash.h | 264 +++++++++-------- include/generalhash.h | 213 +++++++------- include/mersennetwister.h | 549 ++++++++++++++++-------------------- include/rabinkarphash.h | 308 ++++++++++---------- include/threewisehash.h | 143 +++++----- include/ztimer.h | 79 +++--- tests/unit.cpp | 354 ++++++++++++----------- 17 files changed, 1336 insertions(+), 1428 deletions(-) diff --git a/benchmarks/speedtesting.cpp b/benchmarks/speedtesting.cpp index a8ca039..ffaffc8 100644 --- a/benchmarks/speedtesting.cpp +++ b/benchmarks/speedtesting.cpp @@ -1,105 +1,111 @@ -#include -#include #include "cyclichash.h" -#include "rabinkarphash.h" #include "generalhash.h" +#include "rabinkarphash.h" #include "threewisehash.h" #include "ztimer.h" +#include +#include using namespace std; - -template -double hashALot( int n, int L, uint ttimes,uint sizeoftest , vector & recorder) { - ZTimer t; - for(uint times = 0; times(n); ++k) { - hf.eat(static_cast(k)); - } - for(uint k = n; k(k-n),static_cast(k)); - } - /* The goal of the recorder is to prevent - the compiler from deciding that this whole computation - is not required! - */ - recorder.push_back(hf.hashvalue); +template +double hashALot(int n, int L, uint ttimes, uint sizeoftest, + vector &recorder) { + ZTimer t; + for (uint times = 0; times < ttimes; ++times) { + hashfunction hf(n, L); + for (uint k = 0; k < static_cast(n); ++k) { + hf.eat(static_cast(k)); + } + for (uint k = n; k < sizeoftest; ++k) { + hf.update(static_cast(k - n), + static_cast(k)); } - return t.split()/(1000.0*ttimes); + /* The goal of the recorder is to prevent + the compiler from deciding that this whole computation + is not required! + */ + recorder.push_back(hf.hashvalue); + } + return t.split() / (1000.0 * ttimes); } - -template -double hashALot( int n, int L, uint ttimes , vector & recorder, vector & data) { - ZTimer t; - for(uint times = 0; times(n); ++k) { - hf.eat(data[k]); - } - for(uint k = n; k +double hashALot(int n, int L, uint ttimes, vector &recorder, + vector &data) { + ZTimer t; + for (uint times = 0; times < ttimes; ++times) { + hashfunction hf(n, L); + for (uint k = 0; k < static_cast(n); ++k) { + hf.eat(data[k]); + } + for (uint k = n; k < data.size(); ++k) { + hf.update(data[k - n], data[k]); } - return t.split()/1000.0; + /* The goal of the recorder is to prevent + the compiler from deciding that this whole computation + is not required! + */ + recorder.push_back(hf.hashvalue); + } + return t.split() / 1000.0; } void synthetic() { - int L = 19; - vector recorder; - uint sizeoftest = 100000000; - cout<<"#n three-wise General BufferedGeneral Cyclic Karp-Rabin "< >(n,L,1,sizeoftest,recorder)<<" "; - cout< >(n,L,1,sizeoftest,recorder)<<" "; - cout< >(n,L,1,sizeoftest,recorder)<<" "; - cout< >(n,L+n,1,sizeoftest,recorder)<< " "; - cout< >(n,L,1,sizeoftest,recorder)<>(n, L, 1, sizeoftest, recorder) + << " "; + cout << hashALot>(n, L, 1, sizeoftest, recorder) + << " "; + cout << hashALot>(n, L, 1, sizeoftest, recorder) + << " "; + cout << hashALot>(n, L + n, 1, sizeoftest, recorder) << " "; + cout << hashALot>(n, L, 1, sizeoftest, recorder) << endl; + } + cout << "# L= " << L << " char-length= " << sizeoftest << endl; } -void grabFileContent(vector & data, string filename) { - string line; - ifstream file(filename.c_str()); +void grabFileContent(vector &data, string filename) { + string line; + ifstream file(filename.c_str()); + std::getline(file, line); + while (file.good()) { std::getline(file, line); - while ( file.good() ) { - std::getline(file, line); - for(uint k = 0; k recorder; - uint repeats=1; - vector data; - grabFileContent(data, filename); - cout<<"#n three-wise General BufferedGeneral Cyclic Karp-Rabin "< >(n,L,repeats,recorder,data)<<" "; - cout< >(n,L,repeats,recorder,data)<<" "; - cout< >(n,L,repeats,recorder,data)<<" "; - cout< >(n,L+n,repeats,recorder,data)<< " "; - cout< >(n,L,repeats,recorder,data)<>(n, L, repeats, recorder, data) + << " "; + cout << hashALot>(n, L, repeats, recorder, data) + << " "; + cout << hashALot>(n, L, repeats, recorder, data) + << " "; + cout << hashALot>(n, L + n, repeats, recorder, data) << " "; + cout << hashALot>(n, L, repeats, recorder, data) << endl; + } + cout << "# L= " << L << " char-length= " << data.size() + << " repeats=" << repeats << endl; } -int main(int params, char ** args) { - if (params == 1) - synthetic(); - else - realdata(args[1]); +int main(int params, char **args) { + if (params == 1) + synthetic(); + else + realdata(args[1]); - return 0; + return 0; } - diff --git a/examples/example.cpp b/examples/example.cpp index bdec1e6..27e66b4 100644 --- a/examples/example.cpp +++ b/examples/example.cpp @@ -1,56 +1,45 @@ +#include +#include #include #include -#include -#include #include "rabinkarphash.h" -int main(int argc, char * argv[]) -{ - size_t q = 3; - size_t k = 4; - typedef KarpRabinHash<> HashFunction; - std::vector > hashPtr(q); - for(size_t z = 0; z < hashPtr.size(); ++z) - { - std::unique_ptr & ptr = hashPtr[z]; - ptr.reset(new HashFunction(k, 12)); - } - - std::string str = "ACGTAACGT"; - for (size_t j = 0; j < k; j++) - { - for(size_t z = 0; z < hashPtr.size(); ++z) - { - std::unique_ptr & ptr = hashPtr[z]; - ptr->eat(str[j]); - } +int main() { + size_t q = 3; + size_t k = 4; + typedef KarpRabinHash<> HashFunction; + std::vector> hashPtr(q); + for (size_t z = 0; z < hashPtr.size(); ++z) { + std::unique_ptr &ptr = hashPtr[z]; + ptr.reset(new HashFunction(k, 12)); + } + std::string str = "ACGTAACGT"; + for (size_t j = 0; j < k; j++) { + for (size_t z = 0; z < hashPtr.size(); ++z) { + std::unique_ptr &ptr = hashPtr[z]; + ptr->eat(str[j]); } + } - for (size_t i = 0;; i++) - { - std::cout << std::string(str.begin() + i, str.begin() + i + k); - for(size_t z = 0; z < hashPtr.size(); ++z) - { - std::unique_ptr & ptr = hashPtr[z]; - std::cout << ' ' << ptr->hashvalue; - } + for (size_t i = 0;; i++) { + std::cout << std::string(str.begin() + i, str.begin() + i + k); + for (size_t z = 0; z < hashPtr.size(); ++z) { + std::unique_ptr &ptr = hashPtr[z]; + std::cout << ' ' << ptr->hashvalue; + } - std::cout << std::endl; - if (i + k < str.size()) - { - for(size_t z = 0; z < hashPtr.size(); ++z) - { - std::unique_ptr & ptr = hashPtr[z]; - ptr->update(str[i], str[i + k]); - } - } - else - { - break; - } + std::cout << std::endl; + if (i + k < str.size()) { + for (size_t z = 0; z < hashPtr.size(); ++z) { + std::unique_ptr &ptr = hashPtr[z]; + ptr->update(str[i], str[i + k]); + } + } else { + break; } + } - return 0; + return 0; } diff --git a/examples/example2.cpp b/examples/example2.cpp index 659eeb2..8e4495f 100644 --- a/examples/example2.cpp +++ b/examples/example2.cpp @@ -1,34 +1,34 @@ +#include +#include #include #include -#include -#include // given hash value of "ABCD", can I have value of // "ABCDE", without computing the whole hash value? #include "cyclichash.h" - -int main(int argc, char * argv[]) -{ - CyclicHash<> hf(5,19); - string input = "ABCDE"; - hf.eat(input[0]);//A - hf.eat(input[1]);//B - hf.eat(input[2]);//C - hf.eat(input[3]);//D - cout<<"Hash value of ABCD is " << hf.hashvalue << endl; - // we check the answer going the long way... - const std::vector charvectslice(input.begin(), input.begin()+4); - uint32_t trueanswerslice = hf.hash(charvectslice); - if(trueanswerslice != hf.hashvalue ) throw runtime_error("bug"); - // we continue - hf.eat(input[4]);//E - cout<<"Hash value of ABCDE is " << hf.hashvalue << endl; - // we check the answer going the long way - const std::vector charvect(input.begin(), input.end()); - uint32_t trueanswer = hf.hash(charvect); - if(trueanswer != hf.hashvalue ) throw runtime_error("bug"); - return 0; - +int main() { + CyclicHash<> hf(5, 19); + string input = "ABCDE"; + hf.eat(input[0]); // A + hf.eat(input[1]); // B + hf.eat(input[2]); // C + hf.eat(input[3]); // D + cout << "Hash value of ABCD is " << hf.hashvalue << endl; + // we check the answer going the long way... + const std::vector charvectslice(input.begin(), + input.begin() + 4); + uint32_t trueanswerslice = hf.hash(charvectslice); + if (trueanswerslice != hf.hashvalue) + throw runtime_error("bug"); + // we continue + hf.eat(input[4]); // E + cout << "Hash value of ABCDE is " << hf.hashvalue << endl; + // we check the answer going the long way + const std::vector charvect(input.begin(), input.end()); + uint32_t trueanswer = hf.hash(charvect); + if (trueanswer != hf.hashvalue) + throw runtime_error("bug"); + return 0; } \ No newline at end of file diff --git a/examples/example3.cpp b/examples/example3.cpp index 902dcfb..ee53a94 100644 --- a/examples/example3.cpp +++ b/examples/example3.cpp @@ -1,8 +1,7 @@ +#include +#include #include #include -#include -#include - #include "cyclichash.h" @@ -10,31 +9,27 @@ // "ABC"quicky? int demo1() { - CyclicHash<> hf(3, 32); - string input = "ABCD"; - hf.eat(input[1]); //B - hf.eat(input[2]); //C - hf.eat(input[3]); //D - cout << "Hash value of BCD is " << hf.hashvalue << endl; - // we check the answer going the long way... - const std::vector charvectslice(input.begin() + 1, - input.begin() + 4); - uint32_t trueanswerslice = hf.hash(charvectslice); - if (trueanswerslice != hf.hashvalue) - throw runtime_error("bug"); - // we continue - hf.reverse_update(input[0], input[3]); //remove D, prepend A - cout << "Hash value of ABC is " << hf.hashvalue << endl; - // we check the answer going the long way - const std::vector charvect(input.begin(), input.begin() + 3); - uint32_t trueanswer = hf.hash(charvect); - if (trueanswer != hf.hashvalue) - throw runtime_error("bug"); - return 0; + CyclicHash<> hf(3, 32); + string input = "ABCD"; + hf.eat(input[1]); // B + hf.eat(input[2]); // C + hf.eat(input[3]); // D + cout << "Hash value of BCD is " << hf.hashvalue << endl; + // we check the answer going the long way... + const std::vector charvectslice(input.begin() + 1, + input.begin() + 4); + uint32_t trueanswerslice = hf.hash(charvectslice); + if (trueanswerslice != hf.hashvalue) + throw runtime_error("bug"); + // we continue + hf.reverse_update(input[0], input[3]); // remove D, prepend A + cout << "Hash value of ABC is " << hf.hashvalue << endl; + // we check the answer going the long way + const std::vector charvect(input.begin(), input.begin() + 3); + uint32_t trueanswer = hf.hash(charvect); + if (trueanswer != hf.hashvalue) + throw runtime_error("bug"); + return 0; } - -int main(int argc, char * argv[]) -{ - demo1(); -} +int main() { demo1(); } diff --git a/examples/example4.cpp b/examples/example4.cpp index fba33bd..b93bcd4 100644 --- a/examples/example4.cpp +++ b/examples/example4.cpp @@ -1,35 +1,35 @@ -#include -#include #include #include +#include +#include #include "cyclichash.h" /** -* Test of the prepend and append functions to test slightly longer and slightly shorter n-grams. -*/ - -int main(int argc, char * argv[]) -{ - CyclicHash hf(4, 64); - string input = "XABCDY"; - string base(input.begin() + 1, input.end() - 1); - string extend(input.begin() + 1, input.end()); - string prepend(input.begin(), input.end() - 1); - - for (string::const_iterator j = base.begin(); j != base.end(); ++j) - { - hf.eat(*j); - } - - std::cout << base << " " << hf.hash(base) << std::endl; - std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " << hf.hash(prepend) << std::endl; - std::cout << extend << " " << hf.hash_extend(input.back()) << " " << hf.hash(extend) << std::endl; - - assert(hf.hashvalue == hf.hash(base)); - assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); - assert(hf.hash_extend(input.back()) == hf.hash(extend)); - - return 0; - + * Test of the prepend and append functions to test slightly longer and slightly + * shorter n-grams. + */ + +int main(int argc, char *argv[]) { + CyclicHash hf(4, 64); + string input = "XABCDY"; + string base(input.begin() + 1, input.end() - 1); + string extend(input.begin() + 1, input.end()); + string prepend(input.begin(), input.end() - 1); + + for (string::const_iterator j = base.begin(); j != base.end(); ++j) { + hf.eat(*j); + } + + std::cout << base << " " << hf.hash(base) << std::endl; + std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " + << hf.hash(prepend) << std::endl; + std::cout << extend << " " << hf.hash_extend(input.back()) << " " + << hf.hash(extend) << std::endl; + + assert(hf.hashvalue == hf.hash(base)); + assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); + assert(hf.hash_extend(input.back()) == hf.hash(extend)); + + return 0; } diff --git a/examples/example5.cpp b/examples/example5.cpp index 1b3b8ff..ab30ab8 100644 --- a/examples/example5.cpp +++ b/examples/example5.cpp @@ -1,8 +1,8 @@ -#include -#include +#include "cyclichash.h" #include #include -#include "cyclichash.h" +#include +#include /* An issue is application-specific and has to do with the nature of DNA. Even @@ -29,77 +29,73 @@ the hash for a particular k-mer (n-gram) in the DNA, I just XOR the current forward and reverse hashes. */ - // Define DNA's complementary nucleotides // // Daniel: This is probably inefficient. Needlessly so. -// if efficiency matters, you want to define the character hash so that it takes the -// key 'A' to the hash value of 'T' and so forth. +// if efficiency matters, you want to define the character hash so that it takes +// the key 'A' to the hash value of 'T' and so forth. // -# define nucleotide_complement(ch) ( \ - (toupper(ch)) == 'A' ? 'T' : \ - (toupper(ch)) == 'T' ? 'A' : \ - (toupper(ch)) == 'C' ? 'G' : 'C' \ -) +#define nucleotide_complement(ch) \ + ((toupper(ch)) == 'A' ? 'T' \ + : (toupper(ch)) == 'T' ? 'A' \ + : (toupper(ch)) == 'C' ? 'G' \ + : 'C') // A sequence and its reverse complement (such as "GATTACA" and "TGTAATC") are // biologically identical and should hash to the same value. A sequence that is // equal to its reverse complement is a special case and should be handled // accordingly. // -#define canonical_hash(fwd, rev) ( \ - fwd == rev ? rev : fwd ^ rev \ -) +#define canonical_hash(fwd, rev) (fwd == rev ? rev : fwd ^ rev) #define WORDSIZE 5 #define SEED1 42 #define SEED2 1985 #define HASHBITS 64 - // full string hash from scratch (for comparison) -uint64_t fullhash(const string & input) { +uint64_t fullhash(const string &input) { assert(input.size() == WORDSIZE); - CyclicHash forward(input.size(), SEED1, SEED2, HASHBITS); - CyclicHash reverse(input.size(), SEED1, SEED2, HASHBITS); - for (int j = 0; j < input.size(); j++) { - forward.eat(input[j]); - reverse.eat(nucleotide_complement(input[input.size() - 1 - j])); - } - return canonical_hash(forward.hashvalue, reverse.hashvalue); + CyclicHash forward(input.size(), SEED1, SEED2, HASHBITS); + CyclicHash reverse(input.size(), SEED1, SEED2, HASHBITS); + for (int j = 0; j < input.size(); j++) { + forward.eat(input[j]); + reverse.eat(nucleotide_complement(input[input.size() - 1 - j])); + } + return canonical_hash(forward.hashvalue, reverse.hashvalue); } // check the rolling hash // k is the k-gram size, input is any string void demo(int k, string input) { - // Initialize the hash function to compute the hash of the first k-mer. - CyclicHash forward(k, SEED1, SEED2, HASHBITS); - CyclicHash reverse(k, SEED1, SEED2, HASHBITS); - for (int j = 0; j < k; j++) { - forward.eat(input[j]); - // going backward - reverse.eat(nucleotide_complement(input[k - 1 - j])); - } - // rolling has - uint64_t hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); - assert(fullhash(input.substr(0,k)) == hashval); - std::cout << input.substr(0,k) << " " << hashval << std::endl; + // Initialize the hash function to compute the hash of the first k-mer. + CyclicHash forward(k, SEED1, SEED2, HASHBITS); + CyclicHash reverse(k, SEED1, SEED2, HASHBITS); + for (int j = 0; j < k; j++) { + forward.eat(input[j]); + // going backward + reverse.eat(nucleotide_complement(input[k - 1 - j])); + } + // rolling has + uint64_t hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); + assert(fullhash(input.substr(0, k)) == hashval); + std::cout << input.substr(0, k) << " " << hashval << std::endl; - for(int j = k ; j < input.size(); j++) { - forward.update(input[j-k], input[j]); - // note: you to flip the parameters of reverse_update - reverse.reverse_update(nucleotide_complement(input[j]), nucleotide_complement(input[j-k])); - // compute the rolling has - hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); - // compare with full string hash - assert(fullhash(input.substr(j-k+1,k)) == hashval); - std::cout << input.substr(j-k+1,k) << " " << hashval << std::endl; - } + for (int j = k; j < input.size(); j++) { + forward.update(input[j - k], input[j]); + // note: you to flip the parameters of reverse_update + reverse.reverse_update(nucleotide_complement(input[j]), + nucleotide_complement(input[j - k])); + // compute the rolling has + hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); + // compare with full string hash + assert(fullhash(input.substr(j - k + 1, k)) == hashval); + std::cout << input.substr(j - k + 1, k) << " " << hashval << std::endl; + } } -int main(int argc, char * argv[]) -{ - demo(5,"GATTACACAATAGCAAATT"); - std::cout << " code looks good " << std::endl; - return 0; +int main(int argc, char *argv[]) { + demo(5, "GATTACACAATAGCAAATT"); + std::cout << " code looks good " << std::endl; + return 0; } diff --git a/examples/example6.cpp b/examples/example6.cpp index e87bc12..ceb92d6 100644 --- a/examples/example6.cpp +++ b/examples/example6.cpp @@ -1,81 +1,81 @@ /** -* This example is from Dmitry Artamonov, it shows that to get the same -* hash values for the same substrings, you need to use the same hasher object -* (since they are randomized). -*/ + * This example is from Dmitry Artamonov, it shows that to get the same + * hash values for the same substrings, you need to use the same hasher object + * (since they are randomized). + */ -#include -#include -#include -#include +#include "adler32.h" #include "cyclichash.h" #include "generalhash.h" -#include "threewisehash.h" #include "rabinkarphash.h" -#include "adler32.h" - -void CalcHashes( const std::string& Inp, const int WindowSize, KarpRabinHash<> & h1, ThreeWiseHash<> & h2, GeneralHash<> & h3, CyclicHash<> & h4, Adler32& h5 ) { - - - int WindowPos = 0; - h1.reset(); - h2.reset(); - h3.reset(); - h4.reset(); - h5.reset(); +#include "threewisehash.h" +#include +#include +#include +#include - for (int i = 0; i < Inp.length(); i++) { - unsigned char InChar = Inp[i]; +void CalcHashes(const std::string &Inp, const int WindowSize, + KarpRabinHash<> &h1, ThreeWiseHash<> &h2, GeneralHash<> &h3, + CyclicHash<> &h4, Adler32 &h5) { - bool Eat = (i < WindowSize); - unsigned char OutChar = ' '; - if (Eat) { - h1.eat(InChar); - h2.eat(InChar); - h3.eat(InChar); - h4.eat(InChar); - h5.eat(InChar); - } else { - OutChar = Inp[i - WindowSize]; - h1.update(OutChar, InChar); - h2.update(OutChar, InChar); - h3.update(OutChar, InChar); - h4.update(OutChar, InChar); - h5.update(OutChar, InChar); - } - if(i + 1 >= WindowSize) { - auto current = Inp.substr(i + 1 - WindowSize, WindowSize); - printf("%04d %02d %c %c %06x %06x %06x %06x %06x %c %s \n", i, WindowPos, InChar, OutChar, h1.hashvalue, h2.hashvalue, h3.hashvalue, h4.hashvalue, h5.hashvalue, (Eat) ? '*' : ' ', current.c_str()); - assert(h1.hash(current) == h1.hashvalue); - assert(h2.hash(current) == h2.hashvalue); - assert(h3.hash(current) == h3.hashvalue); - assert(h4.hash(current) == h4.hashvalue); + int WindowPos = 0; + h1.reset(); + h2.reset(); + h3.reset(); + h4.reset(); + h5.reset(); - } + for (int i = 0; i < Inp.length(); i++) { + unsigned char InChar = Inp[i]; - WindowPos = (WindowPos + 1) % WindowSize; + bool Eat = (i < WindowSize); + unsigned char OutChar = ' '; + if (Eat) { + h1.eat(InChar); + h2.eat(InChar); + h3.eat(InChar); + h4.eat(InChar); + h5.eat(InChar); + } else { + OutChar = Inp[i - WindowSize]; + h1.update(OutChar, InChar); + h2.update(OutChar, InChar); + h3.update(OutChar, InChar); + h4.update(OutChar, InChar); + h5.update(OutChar, InChar); + } + if (i + 1 >= WindowSize) { + auto current = Inp.substr(i + 1 - WindowSize, WindowSize); + printf("%04d %02d %c %c %06x %06x %06x %06x %06x %c %s \n", i, WindowPos, + InChar, OutChar, h1.hashvalue, h2.hashvalue, h3.hashvalue, + h4.hashvalue, h5.hashvalue, (Eat) ? '*' : ' ', current.c_str()); + assert(h1.hash(current) == h1.hashvalue); + assert(h2.hash(current) == h2.hashvalue); + assert(h3.hash(current) == h3.hashvalue); + assert(h4.hash(current) == h4.hashvalue); } + + WindowPos = (WindowPos + 1) % WindowSize; + } } // ---------------------------------------------------------------------------- void Compare() { - const int WindowSize = 16; - KarpRabinHash<> h1(WindowSize); - ThreeWiseHash<> h2(WindowSize); - GeneralHash<> h3(WindowSize); - CyclicHash<> h4(WindowSize); - Adler32 h5(WindowSize); + const int WindowSize = 16; + KarpRabinHash<> h1(WindowSize); + ThreeWiseHash<> h2(WindowSize); + GeneralHash<> h3(WindowSize); + CyclicHash<> h4(WindowSize); + Adler32 h5(WindowSize); - std::string s1 = "Test string for rolling hashes."; // 32 chars - CalcHashes(s1, WindowSize, h1, h2, h3, h4, h5); + std::string s1 = "Test string for rolling hashes."; // 32 chars + CalcHashes(s1, WindowSize, h1, h2, h3, h4, h5); - printf("---------------------------------------\n"); + printf("---------------------------------------\n"); - std::string s2 = "This is some preamble."; - CalcHashes(s2 + s1, WindowSize, h1, h2, h3, h4, h5); + std::string s2 = "This is some preamble."; + CalcHashes(s2 + s1, WindowSize, h1, h2, h3, h4, h5); } -int main() { - Compare(); -} +int main() { Compare(); } diff --git a/examples/example64bits.cpp b/examples/example64bits.cpp index 2fae910..4858265 100644 --- a/examples/example64bits.cpp +++ b/examples/example64bits.cpp @@ -1,33 +1,33 @@ +#include +#include #include #include -#include -#include // Example of 64-bit hashing #include "cyclichash.h" - -int main(int argc, char * argv[]) -{ - CyclicHash hf(5,64); - string input = "ABCDE"; - hf.eat(input[0]);//A - hf.eat(input[1]);//B - hf.eat(input[2]);//C - hf.eat(input[3]);//D - cout<<"Hash value of ABCD is " << hf.hashvalue << endl; - // we check the answer going the long way... - const std::vector charvectslice(input.begin(), input.begin()+4); - uint64_t trueanswerslice = hf.hash(charvectslice); - if(trueanswerslice != hf.hashvalue ) throw runtime_error("bug"); - // we continue - hf.eat(input[4]);//E - cout<<"Hash value of ABCDE is " << hf.hashvalue << endl; - // we check the answer going the long way - const std::vector charvect(input.begin(), input.end()); - uint64_t trueanswer = hf.hash(charvect); - if(trueanswer != hf.hashvalue ) throw runtime_error("bug"); - return 0; - +int main() { + CyclicHash hf(5, 64); + string input = "ABCDE"; + hf.eat(input[0]); // A + hf.eat(input[1]); // B + hf.eat(input[2]); // C + hf.eat(input[3]); // D + cout << "Hash value of ABCD is " << hf.hashvalue << endl; + // we check the answer going the long way... + const std::vector charvectslice(input.begin(), + input.begin() + 4); + uint64_t trueanswerslice = hf.hash(charvectslice); + if (trueanswerslice != hf.hashvalue) + throw runtime_error("bug"); + // we continue + hf.eat(input[4]); // E + cout << "Hash value of ABCDE is " << hf.hashvalue << endl; + // we check the answer going the long way + const std::vector charvect(input.begin(), input.end()); + uint64_t trueanswer = hf.hash(charvect); + if (trueanswer != hf.hashvalue) + throw runtime_error("bug"); + return 0; } diff --git a/include/adler32.h b/include/adler32.h index c4972bd..4f81994 100644 --- a/include/adler32.h +++ b/include/adler32.h @@ -3,46 +3,43 @@ // contributed by Dmitry Artamonov // this is *deterministic* class Adler32 { - uint32 sum1, sum2; -public: - static const uint32_t Base = 65521; - uint32_t hashvalue; - int len; - - Adler32(int window) : sum1(1), sum2(0), hashvalue(0), len (window) {} - - void eat(uint8_t inchar) { - sum1 = (sum1 + inchar) % Base; - sum2 = (sum2 + sum1) % Base; + uint32 sum1, sum2; - hashvalue = (sum2 << 16) | sum1; - } - - void reset() { - sum1 = 1; - sum2 = 0; - hashvalue = 0; +public: + static const uint32_t Base = 65521; + uint32_t hashvalue; + int len; + + Adler32(int window) : sum1(1), sum2(0), hashvalue(0), len(window) {} + + void eat(uint8_t inchar) { + sum1 = (sum1 + inchar) % Base; + sum2 = (sum2 + sum1) % Base; + + hashvalue = (sum2 << 16) | sum1; + } + + void reset() { + sum1 = 1; + sum2 = 0; + hashvalue = 0; + } + + void update(uint8_t outchar, uint8_t inchar) { + int sum2 = (hashvalue >> 16) & 0xffff; + int sum1 = hashvalue & 0xffff; + + sum1 += inchar - outchar; + if (sum1 >= Base) { + sum1 -= Base; + } else if (sum1 < 0) { + sum1 += Base; } - void update(uint8_t outchar, uint8_t inchar) { - int sum2 = (hashvalue >> 16) & 0xffff; - int sum1 = hashvalue & 0xffff; - - sum1 += inchar - outchar; - if (sum1 >= Base) - { - sum1 -= Base; - } - else if (sum1 < 0) - { - sum1 += Base; - } - - sum2 = ((int)(sum2 - len * outchar + sum1 - 1) % (int)Base); - if (sum2 < 0) - { - sum2 += Base; - } - hashvalue = (sum2 << 16) | sum1; + sum2 = ((int)(sum2 - len * outchar + sum1 - 1) % (int)Base); + if (sum2 < 0) { + sum2 += Base; } + hashvalue = (sum2 << 16) | sum1; + } }; diff --git a/include/characterhash.h b/include/characterhash.h index 2c87d4e..da226d4 100644 --- a/include/characterhash.h +++ b/include/characterhash.h @@ -5,84 +5,79 @@ typedef unsigned long long uint64; typedef unsigned int uint32; typedef unsigned int uint; +#include "mersennetwister.h" #include #include #include -#include "mersennetwister.h" using namespace std; - - - class mersenneRNG { public: - mersenneRNG(uint32 maxval) : mtr(),n(maxval) {}; - uint32 operator()() { - return mtr.randInt(n); - } - void seed(uint32 seedval) { - mtr.seed(seedval); - } - void seed() { - mtr.seed(); - } - uint32 rand_max() { - return n; - } + mersenneRNG(uint32 maxval) : mtr(), n(maxval){}; + uint32 operator()() { return mtr.randInt(n); } + void seed(uint32 seedval) { mtr.seed(seedval); } + void seed() { mtr.seed(); } + uint32 rand_max() { return n; } + private: - MTRand mtr; - int n; + MTRand mtr; + int n; }; template #if __cplusplus >= 201402L constexpr #endif -hashvaluetype maskfnc(int bits) { - assert(bits>0); - assert(bits<=sizeof(hashvaluetype)*8); - hashvaluetype x = static_cast(1) << (bits - 1); - return x ^ (x - 1); + hashvaluetype + maskfnc(int bits) { + assert(bits > 0); + assert(size_t(bits) <= sizeof(hashvaluetype) * 8); + hashvaluetype x = static_cast(1) << (bits - 1); + return x ^ (x - 1); } -template +template class CharacterHash { public: - CharacterHash(hashvaluetype maxval) { - if(sizeof(hashvaluetype) <=4) { - mersenneRNG randomgenerator(maxval); - for(size_t k =0; k(randomgenerator()); - } else if (sizeof(hashvaluetype) == 8) { - mersenneRNG randomgenerator(maxval>>32); - mersenneRNG randomgeneratorbase((maxval>>32) ==0 ? maxval : 0xFFFFFFFFU); - for(size_t k =0; k(randomgeneratorbase()) - | (static_cast(randomgenerator()) << 32); - } else throw runtime_error("unsupported hash value type"); - } + CharacterHash(hashvaluetype maxval) { + if (sizeof(hashvaluetype) <= 4) { + mersenneRNG randomgenerator(maxval); + for (size_t k = 0; k < nbrofchars; ++k) + hashvalues[k] = static_cast(randomgenerator()); + } else if (sizeof(hashvaluetype) == 8) { + mersenneRNG randomgenerator(maxval >> 32); + mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? maxval + : 0xFFFFFFFFU); + for (size_t k = 0; k < nbrofchars; ++k) + hashvalues[k] = static_cast(randomgeneratorbase()) | + (static_cast(randomgenerator()) << 32); + } else + throw runtime_error("unsupported hash value type"); + } - CharacterHash(hashvaluetype maxval, uint32 seed1, uint32 seed2) { - if(sizeof(hashvaluetype) <=4) { - mersenneRNG randomgenerator(maxval); - randomgenerator.seed(seed1); - for(size_t k =0; k(randomgenerator()); - } else if (sizeof(hashvaluetype) == 8) { - mersenneRNG randomgenerator(maxval>>32); - mersenneRNG randomgeneratorbase((maxval>>32) ==0 ? maxval : 0xFFFFFFFFU); - randomgenerator.seed(seed1); - randomgeneratorbase.seed(seed2); - for(size_t k =0; k(randomgeneratorbase()) - | (static_cast(randomgenerator()) << 32); - } else throw runtime_error("unsupported hash value type"); - } + CharacterHash(hashvaluetype maxval, uint32 seed1, uint32 seed2) { + if (sizeof(hashvaluetype) <= 4) { + mersenneRNG randomgenerator(maxval); + randomgenerator.seed(seed1); + for (size_t k = 0; k < nbrofchars; ++k) + hashvalues[k] = static_cast(randomgenerator()); + } else if (sizeof(hashvaluetype) == 8) { + mersenneRNG randomgenerator(maxval >> 32); + mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? maxval + : 0xFFFFFFFFU); + randomgenerator.seed(seed1); + randomgeneratorbase.seed(seed2); + for (size_t k = 0; k < nbrofchars; ++k) + hashvalues[k] = static_cast(randomgeneratorbase()) | + (static_cast(randomgenerator()) << 32); + } else + throw runtime_error("unsupported hash value type"); + } - enum {nbrofchars = 1 << ( sizeof(chartype)*8 )}; + enum { nbrofchars = 1 << (sizeof(chartype) * 8) }; - hashvaluetype hashvalues[1 << ( sizeof(chartype)*8 )]; + hashvaluetype hashvalues[1 << (sizeof(chartype) * 8)]; }; #endif diff --git a/include/cyclichash.h b/include/cyclichash.h index ab1b4b7..68516ca 100644 --- a/include/cyclichash.h +++ b/include/cyclichash.h @@ -4,151 +4,145 @@ #include "characterhash.h" /** -* Each instance is a rolling hash function meant to hash streams of characters. -* Each new instance of this class comes with new random keys. -* -* Recommended usage to get L-bit hash values over n-grams: -* CyclicHash<> hf(n,L ); -* for(uint32 k = 0; k + * Each instance is a rolling hash function meant to hash streams of characters. + * Each new instance of this class comes with new random keys. + * + * Recommended usage to get L-bit hash values over n-grams: + * CyclicHash<> hf(n,L ); + * for(uint32 k = 0; k class CyclicHash { public: - // myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters - // mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers - CyclicHash(int myn, int mywordsize=19) : hashvalue(0), - n(myn), wordsize(mywordsize), + // myn is the length of the sequences, e.g., 3 means that you want to hash + // sequences of 3 characters mywordsize is the number of bits you which to + // receive as hash values, e.g., 19 means that the hash values are 19-bit + // integers + CyclicHash(int myn, int mywordsize = 19) + : hashvalue(0), n(myn), wordsize(mywordsize), hasher(maskfnc(wordsize)), - mask1(maskfnc(wordsize-1)), - myr(n%wordsize), - maskn(maskfnc(wordsize-myr)) - { - if(static_cast(wordsize) > 8*sizeof(hashvaluetype)) { - cerr<<"Can't create "<(wordsize - 1)), myr(n % wordsize), + maskn(maskfnc(wordsize - myr)) { + if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { + cerr << "Can't create " << wordsize << "-bit hash values" << endl; + throw "abord"; } + } - CyclicHash(int myn, uint32 seed1, uint32 seed2, int mywordsize=19) : - hashvalue(0), - n(myn), wordsize(mywordsize), + CyclicHash(int myn, uint32 seed1, uint32 seed2, int mywordsize = 19) + : hashvalue(0), n(myn), wordsize(mywordsize), hasher(maskfnc(wordsize), seed1, seed2), - mask1(maskfnc(wordsize-1)), - myr(n%wordsize), - maskn(maskfnc(wordsize-myr)) - { - if(static_cast(wordsize) > 8*sizeof(hashvaluetype)) { - cerr<<"Can't create "<(wordsize - 1)), myr(n % wordsize), + maskn(maskfnc(wordsize - myr)) { + if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { + cerr << "Can't create " << wordsize << "-bit hash values" << endl; + throw "abord"; } - - void fastleftshiftn(hashvaluetype & x) const { - x = ((x & maskn) << myr ) | (x >> (wordsize-myr)) ; - } - - void fastleftshift1(hashvaluetype & x) const { - x = ((x & mask1) << 1 ) | (x >> (wordsize-1)) ; - } - - void fastrightshift1(hashvaluetype & x) const { - x = (x >> 1 ) | ((x & 1)<< (wordsize-1)) ; - } - - - hashvaluetype getfastleftshift1(hashvaluetype x) const { - return ((x & mask1) << 1 ) | (x >> (wordsize-1)) ; - } - - - hashvaluetype getfastrightshift1(hashvaluetype x) const { - return (x >> 1 ) | ((x & 1)<< (wordsize-1)) ; - } - - // this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function - template - hashvaluetype hash(container & c) { - hashvaluetype answer(0); - for(uint k = 0; k(c[k])]; - } - return answer; - } - - hashvaluetype hashz(chartype outchar,uint n) { - hashvaluetype answer = hasher.hashvalues[static_cast(outchar)]; - for(uint k = 0; k> (wordsize - myr)); + } + + void fastleftshift1(hashvaluetype &x) const { + x = ((x & mask1) << 1) | (x >> (wordsize - 1)); + } + + void fastrightshift1(hashvaluetype &x) const { + x = (x >> 1) | ((x & 1) << (wordsize - 1)); + } + + hashvaluetype getfastleftshift1(hashvaluetype x) const { + return ((x & mask1) << 1) | (x >> (wordsize - 1)); + } + + hashvaluetype getfastrightshift1(hashvaluetype x) const { + return (x >> 1) | ((x & 1) << (wordsize - 1)); + } + + // this is a convenience function, use eat,update and .hashvalue to use as a + // rolling hash function + template hashvaluetype hash(container &c) { + hashvaluetype answer(0); + for (uint k = 0; k < c.size(); ++k) { + fastleftshift1(answer); + answer ^= hasher.hashvalues[static_cast(c[k])]; } - - // prepare to process a new string, you will need to call "eat" again - void reset() { - hashvalue = 0; + return answer; + } + + hashvaluetype hashz(chartype outchar, uint n) { + hashvaluetype answer = + hasher.hashvalues[static_cast(outchar)]; + for (uint k = 0; k < n; ++k) { + fastleftshift1(answer); } - - hashvaluetype hashvalue; - int n; - const int wordsize; - CharacterHash hasher; - const hashvaluetype mask1; - const int myr; - const hashvaluetype maskn; - + return answer; + } + + // add inchar as an input and remove outchar, the hashvalue is updated + // this function can be used to update the hash value from the hash value of + // [outchar]ABC to the hash value of ABC[inchar] + void update(chartype outchar, chartype inchar) { + hashvaluetype z(hasher.hashvalues[outchar]); + fastleftshiftn(z); + hashvalue = getfastleftshift1(hashvalue) ^ z ^ hasher.hashvalues[inchar]; + } + + // this is the reverse of the update function. + // this function can be used to update the hash value from the hash value of + // ABC[inchar] to the hash value of [outchar]ABC + void reverse_update(chartype outchar, chartype inchar) { + hashvaluetype z(hasher.hashvalues[outchar]); + fastleftshiftn(z); + hashvalue ^= z ^ hasher.hashvalues[inchar]; + hashvalue = getfastrightshift1(hashvalue); + } + + // add inchar as an input, this is used typically only at the start + // the hash value is updated to that of a longer string (one where inchar was + // appended) + void eat(chartype inchar) { + fastleftshift1(hashvalue); + hashvalue ^= hasher.hashvalues[inchar]; + } + + // for an n-gram X it returns hash value of (n + 1)-gram XY without changing + // the object X. For example, if X = "ABC", then X.hash_extend("D") returns + // value of "ABCD" without changing the state of X + hashvaluetype hash_extend(chartype Y) { + return getfastleftshift1(hashvalue) ^ hasher.hashvalues[Y]; + } + + // same as hash_extend, but with prepending the n-gram with character Y. If X + // = "ABC", then X.hash_prepend("D") returns value of "DABC" without changing + // the state of X + hashvaluetype hash_prepend(chartype Y) { + hashvaluetype z(hasher.hashvalues[Y]); + fastleftshiftn(z); + return z ^ hashvalue; + } + + // prepare to process a new string, you will need to call "eat" again + void reset() { hashvalue = 0; } + + hashvaluetype hashvalue; + int n; + const int wordsize; + CharacterHash hasher; + const hashvaluetype mask1; + const int myr; + const hashvaluetype maskn; }; - - #endif diff --git a/include/generalhash.h b/include/generalhash.h index 2dc4c90..4fb62f3 100644 --- a/include/generalhash.h +++ b/include/generalhash.h @@ -8,126 +8,119 @@ using namespace std; -enum {NOPRECOMP,FULLPRECOMP}; +enum { NOPRECOMP, FULLPRECOMP }; /** -* Each instance is a rolling hash function meant to hash streams of characters. -* Each new instance of this class comes with new random keys. -* -* Recommended usage to get L-bit hash values over n-grams: -* GeneralHash<> hf(n,L ); -* for(uint32 k = 0; k + * Each instance is a rolling hash function meant to hash streams of characters. + * Each new instance of this class comes with new random keys. + * + * Recommended usage to get L-bit hash values over n-grams: + * GeneralHash<> hf(n,L ); + * for(uint32 k = 0; k class GeneralHash { public: - - // myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters - // mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers - GeneralHash(int myn, int mywordsize = 19): - hashvalue(0), - wordsize(mywordsize), - n(myn), - irreduciblepoly(0), + // myn is the length of the sequences, e.g., 3 means that you want to hash + // sequences of 3 characters mywordsize is the number of bits you which to + // receive as hash values, e.g., 19 means that the hash values are 19-bit + // integers + GeneralHash(int myn, int mywordsize = 19) + : hashvalue(0), wordsize(mywordsize), n(myn), irreduciblepoly(0), hasher(maskfnc(wordsize)), - lastbit(static_cast(1)<(1) << wordsize), + precomputedshift(precomputationtype == FULLPRECOMP ? (1 << n) : 0) { + if (wordsize == 19) { + irreduciblepoly = 1 + (1 << 1) + (1 << 2) + (1 << 5) + (1 << 19); + } else if (wordsize == 9) { + irreduciblepoly = 1 + (1 << 2) + (1 << 3) + (1 << 5) + (1 << 9); + } else { + cerr << "unsupported wordsize " << wordsize << " bits, try 19 or 9" + << endl; } - - void fastleftshift(hashvaluetype & x, int r) const { - for (int i = 0; i < r; ++i) { - x <<= 1; - if(( x & lastbit) == lastbit) - x ^= irreduciblepoly; - } + // in case the precomp is activated at the template level + if (precomputationtype == FULLPRECOMP) { + for (hashvaluetype x = 0; x < precomputedshift.size(); ++x) { + hashvaluetype leftover = x << (wordsize - n); + fastleftshift(leftover, n); + precomputedshift[x] = leftover; + } } - - void fastleftshiftn(hashvaluetype & x) const { - x= - // take the last n bits and look-up the result - precomputedshift[(x >> (wordsize-n))] - ^ - // then just shift the first L-n bits - ((x << n) & (lastbit -1 )); + } + // prepare to process a new string, you will need to call "eat" again + void reset() { hashvalue = 0; } + + void fastleftshift(hashvaluetype &x, int r) const { + for (int i = 0; i < r; ++i) { + x <<= 1; + if ((x & lastbit) == lastbit) + x ^= irreduciblepoly; } - - // add inchar as an input and remove outchar, the hashvalue is updated - // this function can be used to update the hash value from the hash value of [outchar]ABC to the hash value of ABC[inchar] - void update(chartype outchar, chartype inchar) { - hashvalue <<= 1; - if(( hashvalue & lastbit) == lastbit) - hashvalue ^= irreduciblepoly; - // - hashvaluetype z (hasher.hashvalues[outchar]); - // the compiler should optimize away the next if/else - if(precomputationtype==FULLPRECOMP) { - fastleftshiftn(z); - hashvalue ^= z ^ hasher.hashvalues[inchar]; - } else { - fastleftshift(z,n); - hashvalue ^= z ^ hasher.hashvalues[inchar]; - } - } - - - - // add inchar as an input, this is used typically only at the start - // the hash value is updated to that of a longer string (one where inchar was appended) - void eat(chartype inchar) { - fastleftshift(hashvalue,1); - hashvalue ^= hasher.hashvalues[inchar]; + } + + void fastleftshiftn(hashvaluetype &x) const { + x = + // take the last n bits and look-up the result + precomputedshift[(x >> (wordsize - n))] ^ + // then just shift the first L-n bits + ((x << n) & (lastbit - 1)); + } + + // add inchar as an input and remove outchar, the hashvalue is updated + // this function can be used to update the hash value from the hash value of + // [outchar]ABC to the hash value of ABC[inchar] + void update(chartype outchar, chartype inchar) { + hashvalue <<= 1; + if ((hashvalue & lastbit) == lastbit) + hashvalue ^= irreduciblepoly; + // + hashvaluetype z(hasher.hashvalues[outchar]); + // the compiler should optimize away the next if/else + if (precomputationtype == FULLPRECOMP) { + fastleftshiftn(z); + hashvalue ^= z ^ hasher.hashvalues[inchar]; + } else { + fastleftshift(z, n); + hashvalue ^= z ^ hasher.hashvalues[inchar]; } - - // this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function - template - hashvaluetype hash(container & c) const { - hashvaluetype answer(0); - for(uint k = 0; k hashvaluetype hash(container &c) const { + hashvaluetype answer(0); + for (uint k = 0; k < c.size(); ++k) { + fastleftshift(answer, 1); + answer ^= hasher.hashvalues[c[k]]; } - - hashvaluetype hashvalue; - const int wordsize; - int n; - hashvaluetype irreduciblepoly; - CharacterHash hasher; - const hashvaluetype lastbit; - vector precomputedshift; - + return answer; + } + + hashvaluetype hashvalue; + const int wordsize; + int n; + hashvaluetype irreduciblepoly; + CharacterHash hasher; + const hashvaluetype lastbit; + vector precomputedshift; }; - - #endif diff --git a/include/mersennetwister.h b/include/mersennetwister.h index 1b8be2e..5f9092a 100644 --- a/include/mersennetwister.h +++ b/include/mersennetwister.h @@ -4,8 +4,8 @@ * Mersenne Twister @article{matsumoto1998mtd, - title={{Mersenne Twister: A 623-Dimensionally Equidistributed Uniform Pseudo-Random Number Generator}}, - author={MATSUMOTO, M. and NISHIMURA, T.}, + title={{Mersenne Twister: A 623-Dimensionally Equidistributed Uniform +Pseudo-Random Number Generator}}, author={MATSUMOTO, M. and NISHIMURA, T.}, journal={ACM Transactions on Modeling and Computer Simulation}, volume={8}, number={1}, @@ -52,8 +52,8 @@ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF @@ -77,365 +77,308 @@ #include #include +#include #include #include -#include class MTRand { -// Data + // Data public: - typedef unsigned long uint32; // unsigned integer type, at least 32 bits + typedef unsigned long uint32; // unsigned integer type, at least 32 bits - enum { N = 624 }; // length of state vector - enum { SAVE = N + 1 }; // length of array for save() + enum { N = 624 }; // length of state vector + enum { SAVE = N + 1 }; // length of array for save() protected: - enum { M = 397 }; // period parameter - - uint32 state[N]; // internal state - uint32 *pNext; // next value to get from state - int left; // number of values left before reload needed + enum { M = 397 }; // period parameter + uint32 state[N]; // internal state + uint32 *pNext; // next value to get from state + int left; // number of values left before reload needed -//Methods + // Methods public: - MTRand( const uint32& oneSeed ); // initialize with a simple uint32 - MTRand( uint32 *const bigSeed, uint32 const seedLength = N ); // or an array - MTRand(); // auto-initialize with /dev/urandom or time() and clock() - - // Do NOT use for CRYPTOGRAPHY without securely hashing several returned - // values together, otherwise the generator state can be learned after - // reading 624 consecutive values. - - // Access to 32-bit random numbers - double rand(); // real number in [0,1] - double rand( const double& n ); // real number in [0,n] - double randExc(); // real number in [0,1) - double randExc( const double& n ); // real number in [0,n) - double randDblExc(); // real number in (0,1) - double randDblExc( const double& n ); // real number in (0,n) - uint32 randInt(); // integer in [0,2^32-1] - uint32 randInt( const uint32& n ); // integer in [0,n] for n < 2^32 - double operator()() { - return rand(); // same as rand() - } - - // Access to 53-bit random numbers (capacity of IEEE double precision) - double rand53(); // real number in [0,1) - - // Access to nonuniform random number distributions - double randNorm( const double& mean = 0.0, const double& variance = 0.0 ); - - // Re-seeding functions with same behavior as initializers - void seed( const uint32 oneSeed ); - void seed( uint32 *const bigSeed, const uint32 seedLength = N ); - void seed(); - - // Saving and loading generator state - void save( uint32* saveArray ) const; // to array of size SAVE - void load( uint32 *const loadArray ); // from such array - friend std::ostream& operator<<( std::ostream& os, const MTRand& mtrand ); - friend std::istream& operator>>( std::istream& is, MTRand& mtrand ); + MTRand(const uint32 &oneSeed); // initialize with a simple uint32 + MTRand(uint32 *const bigSeed, uint32 const seedLength = N); // or an array + MTRand(); // auto-initialize with /dev/urandom or time() and clock() + + // Do NOT use for CRYPTOGRAPHY without securely hashing several returned + // values together, otherwise the generator state can be learned after + // reading 624 consecutive values. + + // Access to 32-bit random numbers + double rand(); // real number in [0,1] + double rand(const double &n); // real number in [0,n] + double randExc(); // real number in [0,1) + double randExc(const double &n); // real number in [0,n) + double randDblExc(); // real number in (0,1) + double randDblExc(const double &n); // real number in (0,n) + uint32 randInt(); // integer in [0,2^32-1] + uint32 randInt(const uint32 &n); // integer in [0,n] for n < 2^32 + double operator()() { + return rand(); // same as rand() + } + + // Access to 53-bit random numbers (capacity of IEEE double precision) + double rand53(); // real number in [0,1) + + // Access to nonuniform random number distributions + double randNorm(const double &mean = 0.0, const double &variance = 0.0); + + // Re-seeding functions with same behavior as initializers + void seed(const uint32 oneSeed); + void seed(uint32 *const bigSeed, const uint32 seedLength = N); + void seed(); + + // Saving and loading generator state + void save(uint32 *saveArray) const; // to array of size SAVE + void load(uint32 *const loadArray); // from such array + friend std::ostream &operator<<(std::ostream &os, const MTRand &mtrand); + friend std::istream &operator>>(std::istream &is, MTRand &mtrand); protected: - void initialize( const uint32 oneSeed ); - void reload(); - uint32 hiBit( const uint32& u ) const { - return u & 0x80000000UL; - } - uint32 loBit( const uint32& u ) const { - return u & 0x00000001UL; - } - uint32 loBits( const uint32& u ) const { - return u & 0x7fffffffUL; - } - uint32 mixBits( const uint32& u, const uint32& v ) const - { - return hiBit(u) | loBits(v); - } - uint32 twist( const uint32& m, const uint32& s0, const uint32& s1 ) const - { - return m ^ (mixBits(s0,s1)>>1) ^ (-static_cast(loBit(s1)) & 0x9908b0dfUL); - } - static uint32 hash( time_t t, clock_t c ); + void initialize(const uint32 oneSeed); + void reload(); + uint32 hiBit(const uint32 &u) const { return u & 0x80000000UL; } + uint32 loBit(const uint32 &u) const { return u & 0x00000001UL; } + uint32 loBits(const uint32 &u) const { return u & 0x7fffffffUL; } + uint32 mixBits(const uint32 &u, const uint32 &v) const { + return hiBit(u) | loBits(v); + } + uint32 twist(const uint32 &m, const uint32 &s0, const uint32 &s1) const { + return m ^ (mixBits(s0, s1) >> 1) ^ + (-static_cast(loBit(s1)) & 0x9908b0dfUL); + } + static uint32 hash(time_t t, clock_t c); }; +MTRand::MTRand(const uint32 &oneSeed) { seed(oneSeed); } -MTRand::MTRand( const uint32& oneSeed ) -{ - seed(oneSeed); +MTRand::MTRand(uint32 *const bigSeed, const uint32 seedLength) { + seed(bigSeed, seedLength); } -MTRand::MTRand( uint32 *const bigSeed, const uint32 seedLength ) -{ - seed(bigSeed,seedLength); -} +MTRand::MTRand() { seed(); } -MTRand::MTRand() -{ - seed(); -} +double MTRand::rand() { return double(randInt()) * (1.0 / 4294967295.0); } -double MTRand::rand() -{ - return double(randInt()) * (1.0/4294967295.0); -} +double MTRand::rand(const double &n) { return rand() * n; } -double MTRand::rand( const double& n ) -{ - return rand() * n; -} +double MTRand::randExc() { return double(randInt()) * (1.0 / 4294967296.0); } -double MTRand::randExc() -{ - return double(randInt()) * (1.0/4294967296.0); -} +double MTRand::randExc(const double &n) { return randExc() * n; } -double MTRand::randExc( const double& n ) -{ - return randExc() * n; +double MTRand::randDblExc() { + return (double(randInt()) + 0.5) * (1.0 / 4294967296.0); } -double MTRand::randDblExc() -{ - return ( double(randInt()) + 0.5 ) * (1.0/4294967296.0); -} +double MTRand::randDblExc(const double &n) { return randDblExc() * n; } -double MTRand::randDblExc( const double& n ) -{ - return randDblExc() * n; +double MTRand::rand53() { + uint32 a = randInt() >> 5, b = randInt() >> 6; + return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); // by Isaku Wada } -double MTRand::rand53() -{ - uint32 a = randInt() >> 5, b = randInt() >> 6; - return ( a * 67108864.0 + b ) * (1.0/9007199254740992.0); // by Isaku Wada +double MTRand::randNorm(const double &mean, const double &variance) { + // Return a real number from a normal (Gaussian) distribution with given + // mean and variance by Box-Muller method + double r = sqrt(-2.0 * log(1.0 - randDblExc())) * variance; + double phi = 2.0 * 3.14159265358979323846264338328 * randExc(); + return mean + r * cos(phi); } -double MTRand::randNorm( const double& mean, const double& variance ) -{ - // Return a real number from a normal (Gaussian) distribution with given - // mean and variance by Box-Muller method - double r = sqrt( -2.0 * log( 1.0-randDblExc()) ) * variance; - double phi = 2.0 * 3.14159265358979323846264338328 * randExc(); - return mean + r * cos(phi); -} - -MTRand::uint32 MTRand::randInt() -{ - // Pull a 32-bit integer from the generator state - // Every other access function simply transforms the numbers extracted here - - if( left == 0 ) reload(); - --left; +MTRand::uint32 MTRand::randInt() { + // Pull a 32-bit integer from the generator state + // Every other access function simply transforms the numbers extracted here - uint32 s1; - s1 = *pNext++; - s1 ^= (s1 >> 11); - s1 ^= (s1 << 7) & 0x9d2c5680UL; - s1 ^= (s1 << 15) & 0xefc60000UL; - return ( s1 ^ (s1 >> 18) ); + if (left == 0) + reload(); + --left; + + uint32 s1; + s1 = *pNext++; + s1 ^= (s1 >> 11); + s1 ^= (s1 << 7) & 0x9d2c5680UL; + s1 ^= (s1 << 15) & 0xefc60000UL; + return (s1 ^ (s1 >> 18)); } -MTRand::uint32 MTRand::randInt( const uint32& n ) -{ - // Find which bits are used in n - // Optimized by Magnus Jonsson (magnus@smartelectronix.com) - uint32 used = n; - used |= used >> 1; - used |= used >> 2; - used |= used >> 4; - used |= used >> 8; - used |= used >> 16; - - // Draw numbers until one is found in [0,n] - uint32 i; - do - i = randInt() & used; // toss unused bits to shorten search - while( i > n ); - return i; +MTRand::uint32 MTRand::randInt(const uint32 &n) { + // Find which bits are used in n + // Optimized by Magnus Jonsson (magnus@smartelectronix.com) + uint32 used = n; + used |= used >> 1; + used |= used >> 2; + used |= used >> 4; + used |= used >> 8; + used |= used >> 16; + + // Draw numbers until one is found in [0,n] + uint32 i; + do + i = randInt() & used; // toss unused bits to shorten search + while (i > n); + return i; } - -void MTRand::seed( const uint32 oneSeed ) -{ - // Seed the generator with a simple uint32 - initialize(oneSeed); - reload(); +void MTRand::seed(const uint32 oneSeed) { + // Seed the generator with a simple uint32 + initialize(oneSeed); + reload(); } - -void MTRand::seed( uint32 *const bigSeed, const uint32 seedLength ) -{ - // Seed the generator with an array of uint32's - // There are 2^19937-1 possible initial states. This function allows - // all of those to be accessed by providing at least 19937 bits (with a - // default seed length of N = 624 uint32's). Any bits above the lower 32 - // in each element are discarded. - // Just call seed() if you want to get array from /dev/urandom - initialize(19650218UL); - int i = 1; - uint32 j = 0; - int k = ( N > seedLength ? N : seedLength ); - for( ; k; --k ) - { - state[i] = - state[i] ^ ( (state[i-1] ^ (state[i-1] >> 30)) * 1664525UL ); - state[i] += ( bigSeed[j] & 0xffffffffUL ) + j; - state[i] &= 0xffffffffUL; - ++i; - ++j; - if( i >= N ) { - state[0] = state[N-1]; - i = 1; - } - if( j >= seedLength ) j = 0; +void MTRand::seed(uint32 *const bigSeed, const uint32 seedLength) { + // Seed the generator with an array of uint32's + // There are 2^19937-1 possible initial states. This function allows + // all of those to be accessed by providing at least 19937 bits (with a + // default seed length of N = 624 uint32's). Any bits above the lower 32 + // in each element are discarded. + // Just call seed() if you want to get array from /dev/urandom + initialize(19650218UL); + int i = 1; + uint32 j = 0; + int k = (uint32(N) > seedLength ? int(N) : int(seedLength)); + for (; k; --k) { + state[i] = state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1664525UL); + state[i] += (bigSeed[j] & 0xffffffffUL) + j; + state[i] &= 0xffffffffUL; + ++i; + ++j; + if (i >= N) { + state[0] = state[N - 1]; + i = 1; } - for( k = N - 1; k; --k ) - { - state[i] = - state[i] ^ ( (state[i-1] ^ (state[i-1] >> 30)) * 1566083941UL ); - state[i] -= i; - state[i] &= 0xffffffffUL; - ++i; - if( i >= N ) { - state[0] = state[N-1]; - i = 1; - } + if (j >= seedLength) + j = 0; + } + for (k = N - 1; k; --k) { + state[i] = + state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1566083941UL); + state[i] -= i; + state[i] &= 0xffffffffUL; + ++i; + if (i >= N) { + state[0] = state[N - 1]; + i = 1; } - state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array - reload(); + } + state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array + reload(); } +void MTRand::seed() { + // Seed the generator with an array from /dev/urandom if available + // Otherwise use a hash of time() and clock() values -void MTRand::seed() -{ - // Seed the generator with an array from /dev/urandom if available - // Otherwise use a hash of time() and clock() values - - // First try getting an array from /dev/urandom - FILE* urandom = fopen( "/dev/urandom", "rb" ); - if( urandom ) - { - uint32 bigSeed[N]; - uint32 *s = bigSeed; - int i = N; - bool success = true; - while( success && i-- ) - success = fread( s++, sizeof(uint32), 1, urandom ); - fclose(urandom); - if( success ) { - seed( bigSeed, N ); - return; - } + // First try getting an array from /dev/urandom + FILE *urandom = fopen("/dev/urandom", "rb"); + if (urandom) { + uint32 bigSeed[N]; + uint32 *s = bigSeed; + int i = N; + bool success = true; + while (success && i--) + success = fread(s++, sizeof(uint32), 1, urandom); + fclose(urandom); + if (success) { + seed(bigSeed, N); + return; } + } - // Was not successful, so use time() and clock() instead - seed( hash( time(NULL), clock() ) ); + // Was not successful, so use time() and clock() instead + seed(hash(time(NULL), clock())); } - -void MTRand::initialize( const uint32 seed ) -{ - // Initialize generator state with seed - // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier. - // In previous versions, most significant bits (MSBs) of the seed affect - // only MSBs of the state array. Modified 9 Jan 2002 by Makoto Matsumoto. - uint32 *s = state; - uint32 *r = state; - int i = 1; - *s++ = seed & 0xffffffffUL; - for( ; i < N; ++i ) - { - *s++ = ( 1812433253UL * ( *r ^ (*r >> 30) ) + i ) & 0xffffffffUL; - r++; - } +void MTRand::initialize(const uint32 seed) { + // Initialize generator state with seed + // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier. + // In previous versions, most significant bits (MSBs) of the seed affect + // only MSBs of the state array. Modified 9 Jan 2002 by Makoto Matsumoto. + uint32 *s = state; + uint32 *r = state; + int i = 1; + *s++ = seed & 0xffffffffUL; + for (; i < N; ++i) { + *s++ = (1812433253UL * (*r ^ (*r >> 30)) + i) & 0xffffffffUL; + r++; + } } - -void MTRand::reload() -{ - // Generate N new values in state - // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com) - uint32 *p = state; - int i; - for( i = N - M; i--; ++p ) - *p = twist( p[M], p[0], p[1] ); - for( i = M; --i; ++p ) - *p = twist( p[M-N], p[0], p[1] ); - *p = twist( p[M-N], p[0], state[0] ); - - left = N, pNext = state; +void MTRand::reload() { + // Generate N new values in state + // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com) + uint32 *p = state; + int i; + for (i = N - M; i--; ++p) + *p = twist(p[M], p[0], p[1]); + for (i = M; --i; ++p) + *p = twist(p[M - N], p[0], p[1]); + *p = twist(p[M - N], p[0], state[0]); + + left = N, pNext = state; } - -MTRand::uint32 MTRand::hash( time_t t, clock_t c ) -{ - // Get a uint32 from t and c - // Better than uint32(x) in case x is floating point in [0,1] - // Based on code by Lawrence Kirby (fred@genesis.demon.co.uk) - - static uint32 differ = 0; // guarantee time-based seeds will change - - uint32 h1 = 0; - unsigned char *p = reinterpret_cast( &t ); - for( size_t i = 0; i < sizeof(t); ++i ) - { - h1 *= UCHAR_MAX + 2U; - h1 += p[i]; - } - uint32 h2 = 0; - p = reinterpret_cast( &c ); - for( size_t j = 0; j < sizeof(c); ++j ) - { - h2 *= UCHAR_MAX + 2U; - h2 += p[j]; - } - return ( h1 + differ++ ) ^ h2; +MTRand::uint32 MTRand::hash(time_t t, clock_t c) { + // Get a uint32 from t and c + // Better than uint32(x) in case x is floating point in [0,1] + // Based on code by Lawrence Kirby (fred@genesis.demon.co.uk) + + static uint32 differ = 0; // guarantee time-based seeds will change + + uint32 h1 = 0; + unsigned char *p = reinterpret_cast(&t); + for (size_t i = 0; i < sizeof(t); ++i) { + h1 *= UCHAR_MAX + 2U; + h1 += p[i]; + } + uint32 h2 = 0; + p = reinterpret_cast(&c); + for (size_t j = 0; j < sizeof(c); ++j) { + h2 *= UCHAR_MAX + 2U; + h2 += p[j]; + } + return (h1 + differ++) ^ h2; } - -void MTRand::save( uint32* saveArray ) const -{ - uint32 *sa = saveArray; - const uint32 *s = state; - int i = N; - for( ; i--; *sa++ = *s++ ) {} - *sa = left; +void MTRand::save(uint32 *saveArray) const { + uint32 *sa = saveArray; + const uint32 *s = state; + int i = N; + for (; i--; *sa++ = *s++) { + } + *sa = left; } - -void MTRand::load( uint32 *const loadArray ) -{ - uint32 *s = state; - uint32 *la = loadArray; - int i = N; - for( ; i--; *s++ = *la++ ) {} - left = *la; - pNext = &state[N-left]; +void MTRand::load(uint32 *const loadArray) { + uint32 *s = state; + uint32 *la = loadArray; + int i = N; + for (; i--; *s++ = *la++) { + } + left = *la; + pNext = &state[N - left]; } - -std::ostream& operator<<( std::ostream& os, const MTRand& mtrand ) -{ - const MTRand::uint32 *s = mtrand.state; - int i = mtrand.N; - for( ; i--; os << *s++ << "\t" ) {} - return os << mtrand.left; +std::ostream &operator<<(std::ostream &os, const MTRand &mtrand) { + const MTRand::uint32 *s = mtrand.state; + int i = mtrand.N; + for (; i--; os << *s++ << "\t") { + } + return os << mtrand.left; } - -std::istream& operator>>( std::istream& is, MTRand& mtrand ) -{ - MTRand::uint32 *s = mtrand.state; - int i = mtrand.N; - for( ; i--; is >> *s++ ) {} - is >> mtrand.left; - mtrand.pNext = &mtrand.state[mtrand.N-mtrand.left]; - return is; +std::istream &operator>>(std::istream &is, MTRand &mtrand) { + MTRand::uint32 *s = mtrand.state; + int i = mtrand.N; + for (; i--; is >> *s++) { + } + is >> mtrand.left; + mtrand.pNext = &mtrand.state[mtrand.N - mtrand.left]; + return is; } -#endif // MERSENNETWISTER_H +#endif // MERSENNETWISTER_H // Change log: // diff --git a/include/rabinkarphash.h b/include/rabinkarphash.h index 73d8d3d..a885cf0 100644 --- a/include/rabinkarphash.h +++ b/include/rabinkarphash.h @@ -1,180 +1,184 @@ #ifndef KARPRABINHASH #define KARPRABINHASH - #include "characterhash.h" #include - - /** -* This is a randomized version of the Karp-Rabin hash function. -* Each instance is a rolling hash function meant to hash streams of characters. -* Each new instance of this class comes with new random keys. -* -* Recommended usage to get L-bit hash values over n-grams: -* KarpRabinHash<> hf(n,L ); -* for(uint32 k = 0; k + * This is a randomized version of the Karp-Rabin hash function. + * Each instance is a rolling hash function meant to hash streams of characters. + * Each new instance of this class comes with new random keys. + * + * Recommended usage to get L-bit hash values over n-grams: + * KarpRabinHash<> hf(n,L ); + * for(uint32 k = 0; k class KarpRabinHash { public: - // myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters - // mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers - KarpRabinHash(int myn, int mywordsize=19) : hashvalue(0),n(myn), - wordsize(mywordsize), - hasher( maskfnc(wordsize)), - HASHMASK(maskfnc(wordsize)),BtoN(1) { - for (int i=0; i < n ; ++i) { - BtoN *= B; - BtoN &= HASHMASK; - } + // myn is the length of the sequences, e.g., 3 means that you want to hash + // sequences of 3 characters mywordsize is the number of bits you which to + // receive as hash values, e.g., 19 means that the hash values are 19-bit + // integers + KarpRabinHash(int myn, int mywordsize = 19) + : hashvalue(0), n(myn), wordsize(mywordsize), + hasher(maskfnc(wordsize)), + HASHMASK(maskfnc(wordsize)), BtoN(1) { + for (int i = 0; i < n; ++i) { + BtoN *= B; + BtoN &= HASHMASK; } - - // prepare to process a new string, you will need to call "eat" again - void reset() { - hashvalue = 0; + } + + // prepare to process a new string, you will need to call "eat" again + void reset() { hashvalue = 0; } + + // this is a convenience function, use eat,update and .hashvalue to use as a + // rolling hash function + template hashvaluetype hash(container &c) { + hashvaluetype answer(0); + for (uint k = 0; k < c.size(); ++k) { + hashvaluetype x(1); + for (uint j = 0; j < c.size() - 1 - k; ++j) { + x = (x * B) & HASHMASK; + } + x = (x * hasher.hashvalues[c[k]]) & HASHMASK; + answer = (answer + x) & HASHMASK; } - - // this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function - template - hashvaluetype hash(container & c) { - hashvaluetype answer(0); - for(uint k = 0; k hasher; - const hashvaluetype HASHMASK; - hashvaluetype BtoN; - static const hashvaluetype B=37; - + return answer; + } + + // add inchar as an input, this is used typically only at the start + // the hash value is updated to that of a longer string (one where inchar was + // appended) + void eat(chartype inchar) { + hashvalue = (B * hashvalue + hasher.hashvalues[inchar]) & HASHMASK; + } + + // add inchar as an input and remove outchar, the hashvalue is updated + // this function can be used to update the hash value from the hash value of + // [outchar]ABC to the hash value of ABC[inchar] + void update(chartype outchar, chartype inchar) { + hashvalue = (B * hashvalue + hasher.hashvalues[inchar] - + BtoN * hasher.hashvalues[outchar]) & + HASHMASK; + } + + hashvaluetype hashvalue; + int n; + const int wordsize; + CharacterHash hasher; + const hashvaluetype HASHMASK; + hashvaluetype BtoN; + static const hashvaluetype B = 37; }; -template +template class KarpRabinHashBits { - // The key difference between KarpRabinHashBits and KarpRabinHash is that wordsize is now templated - // And the masking is only performed if nbits != the number of bits in the type + // The key difference between KarpRabinHashBits and KarpRabinHash is that + // wordsize is now templated And the masking is only performed if nbits != the + // number of bits in the type public: - // myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters - // mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers - KarpRabinHashBits(int myn): hashvalue(0), n(myn), - hasher( maskfnc(wordsize)), - HASHMASK(maskfnc(wordsize)),BtoN(1) { - for (int i=0; i < n ; ++i) { - BtoN *= B; - if(!is_full_word()) BtoN &= HASHMASK; - } + // myn is the length of the sequences, e.g., 3 means that you want to hash + // sequences of 3 characters mywordsize is the number of bits you which to + // receive as hash values, e.g., 19 means that the hash values are 19-bit + // integers + KarpRabinHashBits(int myn) + : hashvalue(0), n(myn), hasher(maskfnc(wordsize)), + HASHMASK(maskfnc(wordsize)), BtoN(1) { + for (int i = 0; i < n; ++i) { + BtoN *= B; + if (!is_full_word()) + BtoN &= HASHMASK; } - - // prepare to process a new string, you will need to call "eat" again - void reset() { - hashvalue = 0; - } - static constexpr bool is_full_word() { - return wordsize == (CHAR_BIT * sizeof(hashvaluetype)); - } - template - void mask_value(T &val) const { + } + + // prepare to process a new string, you will need to call "eat" again + void reset() { hashvalue = 0; } + static constexpr bool is_full_word() { + return wordsize == (CHAR_BIT * sizeof(hashvaluetype)); + } + template void mask_value(T &val) const { #if __cplusplus >= 201703L #define CONSTIF if constexpr #else #define CONSTIF if #endif - CONSTIF(!is_full_word()) val &= HASHMASK; + CONSTIF(!is_full_word()) val &= HASHMASK; #undef CONSTIF + } + + // this is a convenience function, use eat,update and .hashvalue to use as a + // rolling hash function + template hashvaluetype hash(container &c) const { + hashvaluetype answer(0); + for (uint k = 0; k < c.size(); ++k) { + hashvaluetype x(1); + for (uint j = 0; j < c.size() - 1 - k; ++j) { + x = (x * B); + mask_value(x); + } + x = (x * hasher.hashvalues[c[k]]); + mask_value(x); + answer = (answer + x); + mask_value(answer); } - - // this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function - template - hashvaluetype hash(container & c) const { - hashvaluetype answer(0); - for(uint k = 0; k(s));} - hashvaluetype hash(const char *s) const { - hashvaluetype answer(0); - uint csz = std::strlen(s); - for(uint k = 0; k(s)); + } + hashvaluetype hash(const char *s) const { + hashvaluetype answer(0); + uint csz = std::strlen(s); + for (uint k = 0; k < csz; ++k) { + hashvaluetype x(1); + for (uint j = 0; j < csz - 1 - k; ++j) { + x = (x * B); + mask_value(x); + } + x = (x * hasher.hashvalues[s[k]]); + mask_value(x); + answer = (answer + x); + mask_value(answer); } - - - hashvaluetype hashvalue; - int n; - CharacterHash hasher; - const hashvaluetype HASHMASK; - hashvaluetype BtoN; - static constexpr hashvaluetype B=37; + return answer; + } + + // add inchar as an input, this is used typically only at the start + // the hash value is updated to that of a longer string (one where inchar was + // appended) + void eat(chartype inchar) { + hashvalue = (B * hashvalue + hasher.hashvalues[inchar]); + mask_value(hashvalue); + } + + // add inchar as an input and remove outchar, the hashvalue is updated + // this function can be used to update the hash value from the hash value of + // [outchar]ABC to the hash value of ABC[inchar] + void update(chartype outchar, chartype inchar) { + hashvalue = (B * hashvalue + hasher.hashvalues[inchar] - + BtoN * hasher.hashvalues[outchar]); + mask_value(hashvalue); + } + + hashvaluetype hashvalue; + int n; + CharacterHash hasher; + const hashvaluetype HASHMASK; + hashvaluetype BtoN; + static constexpr hashvaluetype B = 37; }; - #endif diff --git a/include/threewisehash.h b/include/threewisehash.h index 88ecaea..3de25ee 100644 --- a/include/threewisehash.h +++ b/include/threewisehash.h @@ -1,95 +1,96 @@ #ifndef THREEWISEHASH #define THREEWISEHASH +#include "characterhash.h" #include #include -#include "characterhash.h" using namespace std; - /** -* Each instance is a rolling hash function meant to hash streams of characters. -* Each new instance of this class comes with new random keys. -* -* Recommended usage to get L-bit hash values over n-grams: -* ThreeWiseHash<> hf(n,L ); -* for(uint32 k = 0; k + * Each instance is a rolling hash function meant to hash streams of characters. + * Each new instance of this class comes with new random keys. + * + * Recommended usage to get L-bit hash values over n-grams: + * ThreeWiseHash<> hf(n,L ); + * for(uint32 k = 0; k class ThreeWiseHash { public: - - // myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters - // mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers - ThreeWiseHash(int myn, int mywordsize=19) : n(myn), wordsize(mywordsize), - hashers(),hasher(0) { - if(static_cast(wordsize) > 8*sizeof(hashvaluetype)) { - cerr<<"Can't create "< ch(maskfnc(wordsize)); - hashers.push_back(ch); - } + // myn is the length of the sequences, e.g., 3 means that you want to hash + // sequences of 3 characters mywordsize is the number of bits you which to + // receive as hash values, e.g., 19 means that the hash values are 19-bit + // integers + ThreeWiseHash(int myn, int mywordsize = 19) + : n(myn), wordsize(mywordsize), hashers(), hasher(0) { + if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { + cerr << "Can't create " << wordsize << "-bit hash values" << endl; + throw "abord"; } - - // add inchar as an input, this is used typically only at the start - // the hash value is updated to that of a longer string (one where inchar was appended) - void eat(chartype inchar) { - ngram.push_back(inchar); - __updateHashValue(); + for (int i = 0; i < n; ++i) { + CharacterHash ch( + maskfnc(wordsize)); + hashers.push_back(ch); } + } - // add inchar as an input and remove outchar, the hashvalue is updated - // this function can be used to update the hash value from the hash value of [outchar]ABC to the hash value of ABC[inchar] - void update(chartype outchar, chartype inchar) { - ngram.push_back(inchar); - ngram.pop_front(); - __updateHashValue(); - } + // add inchar as an input, this is used typically only at the start + // the hash value is updated to that of a longer string (one where inchar was + // appended) + void eat(chartype inchar) { + ngram.push_back(inchar); + __updateHashValue(); + } - // prepare to process a new string, you will need to call "eat" again - void reset() { - hashvalue = 0; - ngram.clear(); - } + // add inchar as an input and remove outchar, the hashvalue is updated + // this function can be used to update the hash value from the hash value of + // [outchar]ABC to the hash value of ABC[inchar] + void update(chartype, chartype inchar) { + ngram.push_back(inchar); + ngram.pop_front(); + __updateHashValue(); + } - void __updateHashValue() { - hashvalue = 0; - for(size_t k = 0; k - hashvaluetype hash(container & c) { - hashvaluetype answer(0); - for(size_t k = 0; k ngram; - vector > hashers; - CharacterHash hasher;//placeholder + // this is a convenience function, use eat,update and .hashvalue to use as a + // rolling hash function + template hashvaluetype hash(container &c) { + hashvaluetype answer(0); + for (size_t k = 0; k < c.size(); ++k) { + answer ^= hashers[k].hashvalues[c[k]]; + } + return answer; + } + hashvaluetype hashvalue; + int n; + const int wordsize; + deque ngram; + vector> hashers; + CharacterHash hasher; // placeholder }; - #endif diff --git a/include/ztimer.h b/include/ztimer.h index 12242fe..4614e91 100644 --- a/include/ztimer.h +++ b/include/ztimer.h @@ -4,73 +4,70 @@ #include #include #ifdef _WIN32 -/*Porting gettimeofday to Windows, +/*Porting gettimeofday to Windows, source: https://www.codefull.net/2015/12/systime-h-replacement-for-windows/ TODO: Consider simply using std::chrono for timing operations */ #include #define __need_clock_t -#include #include +#include typedef long long suseconds_t; /* Structure describing CPU time used by a process and its children. */ -struct tms -{ - clock_t tms_utime; /* User CPU time. */ - clock_t tms_stime; /* System CPU time. */ +struct tms { + clock_t tms_utime; /* User CPU time. */ + clock_t tms_stime; /* System CPU time. */ - clock_t tms_cutime; /* User CPU time of dead children. */ - clock_t tms_cstime; /* System CPU time of dead children. */ + clock_t tms_cutime; /* User CPU time of dead children. */ + clock_t tms_cstime; /* System CPU time of dead children. */ }; /* Store the CPU time used by this process and all its dead children (and their dead children) in BUFFER. Return the elapsed real time, or (clock_t) -1 for errors. All times are in CLK_TCKths of a second. */ -clock_t times(struct tms* __buffer) { +clock_t times(struct tms *__buffer) { - __buffer->tms_utime = clock(); - __buffer->tms_stime = 0; - __buffer->tms_cstime = 0; - __buffer->tms_cutime = 0; - return __buffer->tms_utime; + __buffer->tms_utime = clock(); + __buffer->tms_stime = 0; + __buffer->tms_cstime = 0; + __buffer->tms_cutime = 0; + return __buffer->tms_utime; } -int gettimeofday(struct timeval* t, void* timezone) -{ - struct _timeb timebuffer; - _ftime(&timebuffer); - t->tv_sec = timebuffer.time; - t->tv_usec = 1000 * timebuffer.millitm; - return 0; +int gettimeofday(struct timeval *t, void *timezone) { + struct _timeb timebuffer; + _ftime(&timebuffer); + t->tv_sec = timebuffer.time; + t->tv_usec = 1000 * timebuffer.millitm; + return 0; } #else #include #endif -class ZTimer -{ +class ZTimer { public: - struct timeval t1, t2; + struct timeval t1, t2; + public: - ZTimer() { - gettimeofday(&t1,0); - t2 = t1; - } - void reset() { - gettimeofday(&t1,0); - t2 = t1; - } - int elapsed() { - return ((t2.tv_sec - t1.tv_sec) * 1000) + ((t2.tv_usec - t1. - tv_usec) / 1000); - } - int split() { - gettimeofday(&t2,0); - return elapsed(); - } + ZTimer() { + gettimeofday(&t1, 0); + t2 = t1; + } + void reset() { + gettimeofday(&t1, 0); + t2 = t1; + } + int elapsed() { + return ((t2.tv_sec - t1.tv_sec) * 1000) + + ((t2.tv_usec - t1.tv_usec) / 1000); + } + int split() { + gettimeofday(&t2, 0); + return elapsed(); + } }; #endif - diff --git a/tests/unit.cpp b/tests/unit.cpp index 67835fe..7b3a6fe 100644 --- a/tests/unit.cpp +++ b/tests/unit.cpp @@ -1,200 +1,198 @@ -#include #include +#include #include "cyclichash.h" -#include "rabinkarphash.h" #include "generalhash.h" - +#include "rabinkarphash.h" #include "threewisehash.h" using namespace std; -template -bool testExtendAndPrepend(uint L = 19) { - const uint n(4);//n-grams - hashfunction hf(n, L); - string input = "XABCDY"; - string base(input.begin() + 1, input.end() - 1); - assert(base.size() == n); - string extend(input.begin() + 1, input.end()); - string prepend(input.begin(), input.end() - 1); - - for (string::const_iterator j = base.begin(); j != base.end(); ++j) - { - hf.eat(*j); - } - if(hf.hashvalue != hf.hash(base)) { - std::cout <<"bug!"<< std::endl; - std::cout << base << " " << hf.hash(base) << std::endl; - return false; - } - if(hf.hash_prepend(input[0]) != hf.hash(prepend)) { - std::cout <<"bug!"<< std::endl; - std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " << hf.hash(prepend) << std::endl; - return false; - } - if(hf.hash_extend(input.back()) != hf.hash(extend)) { - std::cout <<"bug!"<< std::endl; - std::cout << extend << " " << hf.hash_extend(input.back()) << " " << hf.hash(extend) << std::endl; - return false; - } - - assert(hf.hashvalue == hf.hash(base)); - assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); - assert(hf.hash_extend(input.back()) == hf.hash(extend)); - - return true; - +template bool testExtendAndPrepend(uint L = 19) { + const uint n(4); // n-grams + hashfunction hf(n, L); + string input = "XABCDY"; + string base(input.begin() + 1, input.end() - 1); + assert(base.size() == n); + string extend(input.begin() + 1, input.end()); + string prepend(input.begin(), input.end() - 1); + + for (string::const_iterator j = base.begin(); j != base.end(); ++j) { + hf.eat(*j); + } + if (hf.hashvalue != hf.hash(base)) { + std::cout << "bug!" << std::endl; + std::cout << base << " " << hf.hash(base) << std::endl; + return false; + } + if (hf.hash_prepend(input[0]) != hf.hash(prepend)) { + std::cout << "bug!" << std::endl; + std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " + << hf.hash(prepend) << std::endl; + return false; + } + if (hf.hash_extend(input.back()) != hf.hash(extend)) { + std::cout << "bug!" << std::endl; + std::cout << extend << " " << hf.hash_extend(input.back()) << " " + << hf.hash(extend) << std::endl; + return false; + } + + assert(hf.hashvalue == hf.hash(base)); + assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); + assert(hf.hash_extend(input.back()) == hf.hash(extend)); + + return true; } -template -bool isItAFunction(uint L = 7) { - mersenneRNG generator(5); - const uint n(3);//n-grams - hashfunction hf(n,L ); - deque s; - for(uint32 k = 0; k(generator()+65); - s.push_back(c); - hf.eat(c); - } - for(uint32 k = 0; k<100000; ++k) { - unsigned char out = s.front(); - s.pop_front(); - char c (generator()+65); - - s.push_back(c); - hf.update(out,c); - if(hf.hash(s) != hf.hashvalue) { - for(deque::iterator ii=s.begin(); ii!=s.end(); ++ii) - cout<<*ii<<" "<(*ii)<"< bool isItAFunction(uint L = 7) { + mersenneRNG generator(5); + const uint n(3); // n-grams + hashfunction hf(n, L); + deque s; + for (uint32 k = 0; k < n; ++k) { + unsigned char c = static_cast(generator() + 65); + s.push_back(c); + hf.eat(c); + } + for (uint32 k = 0; k < 100000; ++k) { + unsigned char out = s.front(); + s.pop_front(); + char c(generator() + 65); + + s.push_back(c); + hf.update(out, c); + if (hf.hash(s) != hf.hashvalue) { + for (deque::iterator ii = s.begin(); ii != s.end(); ++ii) + cout << *ii << " " << static_cast(*ii) << endl; + cerr << "bug" << endl; + cerr << s[0] << s[1] << s[2] << " was hashed to " << hf.hashvalue + << " when true hash value is " << hf.hash(s) << endl; + for (uint j = 0; j < n; ++j) + cerr << s[j] << "->" << hf.hasher.hashvalues[s[j]] << endl; + return false; + } + } + return true; } - -template -bool doesReverseUpdateWorks(uint L = 7) { - mersenneRNG generator(5); - const uint n(3);//n-grams - hashfunction hf(n,L ); - deque s; - for(uint32 k = 0; k(generator()+65); - s.push_back(c); - hf.eat(c); - } - for(uint32 k = 0; k<100000; ++k) { - unsigned char out = s.front(); - s.pop_front(); - char c (generator()+65); - s.push_back(c); - hf.update(out,c); - hf.reverse_update(out,c); - hf.update(out,c); - if(hf.hash(s) != hf.hashvalue) { - return false; - } - } - return true; +template bool doesReverseUpdateWorks(uint L = 7) { + mersenneRNG generator(5); + const uint n(3); // n-grams + hashfunction hf(n, L); + deque s; + for (uint32 k = 0; k < n; ++k) { + unsigned char c = static_cast(generator() + 65); + s.push_back(c); + hf.eat(c); + } + for (uint32 k = 0; k < 100000; ++k) { + unsigned char out = s.front(); + s.pop_front(); + char c(generator() + 65); + s.push_back(c); + hf.update(out, c); + hf.reverse_update(out, c); + hf.update(out, c); + if (hf.hash(s) != hf.hashvalue) { + return false; + } + } + return true; } - - -template -bool isItRandom(uint L = 19) { - cout<<"checking that it is randomized "< data(n); - for(int k = 0; k < n; ++k ) { - data[k] = static_cast(k); - } - hashfunction base(n,L ); - uint64 x = base.hash(data); - for(int k = 0; k < 100; ++k ) { - hashfunction hf(n,L); - uint64 y = hf.hash(data); - if(y != x) { - cout<<"It is randomized! "< bool isItRandom(uint L = 19) { + cout << "checking that it is randomized " << endl; + int n = 5; + vector data(n); + for (int k = 0; k < n; ++k) { + data[k] = static_cast(k); + } + hashfunction base(n, L); + uint64 x = base.hash(data); + for (int k = 0; k < 100; ++k) { + hashfunction hf(n, L); + uint64 y = hf.hash(data); + if (y != x) { + cout << "It is randomized! " << endl; + return true; + } + cout << "collision " << y << endl; + } + cout << "Not randomized! " << endl; + return false; // we conclude that it always hashes to the same value (this is + // bad) } - bool test() { - bool ok(true); - cout<<"Karp-Rabin"< >(); - } - ok&=isItRandom >(); - for(uint L = 1; L<=64; ++L) { - if(!ok) return false; - ok&=isItAFunction >(); - } - ok&=isItRandom >(); - if(!ok) return false; - cout<<"cyclic"< >(L); - ok&=isItAFunction >(L); - ok&=doesReverseUpdateWorks >(L); - } - for(uint L = 2; L<=64; ++L) { - if(!ok) return false; - ok&=testExtendAndPrepend >(L); - ok&=isItAFunction >(L); - } - ok&=isItRandom >(); - ok&=isItRandom >(); - - cout<<"three-wise"< >(L); - } - ok&=isItRandom >(); - for(uint L = 1; L<=64; ++L) { - ok&=isItAFunction >(L); - } - ok&=isItRandom >(); - - cout<<"general"< >(9); - if(!ok) return false; - ok&=isItRandom >(); - if(!ok) return false; - ok&=isItAFunction >(19); - cout<<"general"< >(9); - if(!ok) return false; - ok&=isItRandom >(); - if(!ok) return false; - ok&=isItAFunction >(19); - return ok; + bool ok(true); + cout << "Karp-Rabin" << endl; + for (uint L = 1; L <= 32; ++L) { + if (!ok) + return false; + ok &= isItAFunction>(); + } + ok &= isItRandom>(); + for (uint L = 1; L <= 64; ++L) { + if (!ok) + return false; + ok &= isItAFunction>(); + } + ok &= isItRandom>(); + if (!ok) + return false; + cout << "cyclic" << endl; + for (uint L = 2; L <= 32; ++L) { + if (!ok) + return false; + ok &= testExtendAndPrepend>(L); + ok &= isItAFunction>(L); + ok &= doesReverseUpdateWorks>(L); + } + for (uint L = 2; L <= 64; ++L) { + if (!ok) + return false; + ok &= testExtendAndPrepend>(L); + ok &= isItAFunction>(L); + } + ok &= isItRandom>(); + ok &= isItRandom>(); + + cout << "three-wise" << endl; + for (uint L = 1; L <= 32; ++L) { + ok &= isItAFunction>(L); + } + ok &= isItRandom>(); + for (uint L = 1; L <= 64; ++L) { + ok &= isItAFunction>(L); + } + ok &= isItRandom>(); + + cout << "general" << endl; + ok &= isItAFunction>(9); + if (!ok) + return false; + ok &= isItRandom>(); + if (!ok) + return false; + ok &= isItAFunction>(19); + cout << "general" << endl; + ok &= isItAFunction>(9); + if (!ok) + return false; + ok &= isItRandom>(); + if (!ok) + return false; + ok &= isItAFunction>(19); + return ok; } - int main() { - bool ok(test()); - if(ok) - cout<<"your code is ok!"<