I need a hash algorithm that produces a 64-bit hash code (long) with fewer collisions than String.GetHashCode() and that is fast (no expensive calls to cryptographic functions). Here's an implementation of FNV which still shows 3% of collisions after testing 2 million random strings. I need this number to be way lower.
void Main()
{
const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#@$%^&*()_+}{\":?><,./;'[]0123456789\\";
const int n = 2000000;
var random = new Random();
var hashes = new HashSet<long>();
int collisions = 0;
for(int i = 0; i < n; i++)
{
var len = random.Next(chars.Length);
var str = new char[len];
for (int j = 0; j < len; j++)
{
str[j] = chars[random.Next(chars.Length)];
}
var s = new String(str);
if(!hashes.Add(Get64BitHash( s ))) collisions++;
}
Console.WriteLine("Collision Percentage after " + n + " random strings: " + ((double)collisions * 100 / n));
}
public long Get64BitHash(string str)
{
unchecked
{
byte[] data = new byte[str.Length * sizeof(char)];
System.Buffer.BlockCopy(str.ToCharArray(), 0, data, 0, data.Length);
const ulong p = 1099511628211UL;
var hash = 14695981039346656037UL;
foreach(var d in data)
{
hash ^= d;
hash *= p;
}
return (long) hash;
}
}
SAMPLE OUTPUT OF ABOVE CODE:
Collision Percentage after 2000000 random strings: 3.01485 %
3% is the same collision percentage as just calling String.GetHashCode(). I need something way better.
PS: There's a chance I am doing something terribly long.
EDIT:
Solved. Get64BitHash method above is correct. The problem was that my strings weren't random. After making sure strings are unique (see revised code below), I get zero collisions on almost 50 million unique strings, versus ~1% collisions using String.GetHashCode().
void Main()
{
const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#@$%^&*()_+}{\":?><,./;'[]0123456789\\";
const int n = 200000000;
var random = new Random();
var hashes = new HashSet<long>();
var strings = new HashSet<string>();
int collisions = 0;
while(strings.Count < n)
{
var len = random.Next(chars.Length);
var str = new char[len];
for (int j = 0; j < len; j++)
{
str[j] = chars[random.Next(chars.Length)];
}
var s = new String(str);
if(!strings.Add(s)) continue;
if(!hashes.Add(s.GetHashCode())) collisions++;
}
Console.WriteLine("Collision Percentage after " + n + " random strings: " + ((double)collisions * 100 / strings.Count));
}