在为 Dictionary 寻找快速复合键的过程中,我遇到了一个反常现象,既无法理解也无法解释。
在有限的测试中
Dictionary<KeyValuePair<UInt32, UInt32>, string>
比
Dictionary<KeyValuePair<UInt16, UInt16>, string> 慢得多(200:1)
测试使用两层从 0 到 1000 的嵌套循环:先填充(Populate),再调用 ContainsKey。耗时(毫秒)如下:
Populate ContainsKey
UInt32 92085 86578
UInt16 2201 431
问题在于
new KeyValuePair<UInt32, UInt32>(i, j).GetHashCode();
产生许多重复。
在循环i和j 1024中,仅创建1024个唯一散列值。
基于来自CasperOne的雪崩评论尝试了i * 31和j * 97(两个素数),这导致105280在1024X1024上独一无二。仍然有很多重复。 CasperOne我知道这与随机不一样。但随机输入并不是我的工作。 GetHashCode()应该随机化输出。
为什么重复次数很多?
上的相同循环
new KeyValuePair<UInt16, UInt16>(i, j).GetHashCode();
产生1024 X 1024个唯一哈希码(完美)。
Int32有同样的问题。
这些重复的哈希值严重拖垮了
Dictionary<KeyValuePair<UInt32, UInt32>, string>
与Int16相比,元组也会产生很多重复,它在Int32中不会降级。
生成原始KVP和原始KPV.GetHashCode的时间类似。
与HashSet相同的异常。
// Benchmark comparing KeyValuePair<UInt32, UInt32> with KeyValuePair<UInt16, UInt16> as
// dictionary keys. It times, in this order: raw struct construction, GetHashCode (while
// counting distinct hash codes), Dictionary.Add and Dictionary.ContainsKey.
// Every "recorded:" comment is the figure noted next to that statement in the original listing.
Dictionary<KeyValuePair<UInt32, UInt32>, string> dKVPu32 = new Dictionary<KeyValuePair<UInt32, UInt32>, string>();
Dictionary<KeyValuePair<UInt16, UInt16>, string> dKVPu16 = new Dictionary<KeyValuePair<UInt16, UInt16>, string>();
KeyValuePair<UInt32, UInt32> kvpUint32;
KeyValuePair<UInt16, UInt16> kvpUint16;
int range = 1000;
Int32 hashCode;
HashSet<Int32> kvpUint32Hash = new HashSet<Int32>();
HashSet<Int32> kvpUint16Hash = new HashSet<Int32>();
Stopwatch sw = new Stopwatch();

// 1) Raw construction cost of the UInt32 pair.
sw.Start();
for (UInt32 i = 0; i < range; i++)
{
    for (UInt32 j = 0; j < range; j++)
    {
        kvpUint32 = new KeyValuePair<UInt32, UInt32>(i, j);
    }
}
Console.WriteLine("UInt32 raw " + sw.ElapsedMilliseconds.ToString());
// recorded: 7 ms

// 2) Raw construction cost of the UInt16 pair.
sw.Restart();
for (UInt16 i = 0; i < range; i++)
{
    for (UInt16 j = 0; j < range; j++)
    {
        kvpUint16 = new KeyValuePair<UInt16, UInt16>(i, j);
    }
}
Console.WriteLine("UInt16 raw " + sw.ElapsedMilliseconds.ToString());
// recorded: 6 ms

// 3) GetHashCode on the UInt32 pair; the HashSet collects the distinct hash codes produced.
sw.Restart();
for (UInt32 i = 0; i < range; i++)
{
    for (UInt32 j = 0; j < range; j++)
    {
        hashCode = new KeyValuePair<UInt32, UInt32>(i, j).GetHashCode();
        kvpUint32Hash.Add(hashCode);
    }
}
Console.WriteLine("UInt32 GetHashCode " + sw.ElapsedMilliseconds.ToString() + " unique count " + kvpUint32Hash.Count.ToString());
// recorded: 285 ms, 1024 unique hash codes

// 4) GetHashCode on the UInt16 pair.
sw.Restart();
for (UInt16 i = 0; i < range; i++)
{
    for (UInt16 j = 0; j < range; j++)
    {
        hashCode = new KeyValuePair<UInt16, UInt16>(i, j).GetHashCode();
        kvpUint16Hash.Add(hashCode);
    }
}
Console.WriteLine("UInt16 GetHashCode " + sw.ElapsedMilliseconds.ToString() + " unique count " + kvpUint16Hash.Count.ToString());
// recorded: 398 ms, 1000000 unique hash codes

// Pause for the user BEFORE restarting the stopwatch; otherwise the time spent waiting
// for Enter would be counted as part of the dictionary population below.
Console.ReadLine();

// 5) Populate the dictionary keyed by the UInt32 pair.
sw.Restart();
for (UInt32 i = 0; i < range; i++)
{
    for (UInt32 j = 0; j < range; j++)
    {
        dKVPu32.Add(new KeyValuePair<UInt32, UInt32>(i, j), String.Format("{0} {1}", i.ToString(), j.ToString()));
    }
}
Console.WriteLine("hsKVPu32 pop " + sw.ElapsedMilliseconds.ToString());
// recorded: 92085 ms (figure predates moving the ReadLine pause out of the timed region)

// 6) Look up every key that was just added.
sw.Restart();
for (UInt32 i = 0; i < range; i++)
{
    for (UInt32 j = 0; j < range; j++)
    {
        if (!dKVPu32.ContainsKey(new KeyValuePair<UInt32, UInt32>(i, j))) Debug.WriteLine("Opps");
    }
}
Console.WriteLine("hsKVPu32 find " + sw.ElapsedMilliseconds.ToString());
// recorded: 86578 ms

// Release the UInt32 dictionary and force a collection before the UInt16 run is timed.
dKVPu32.Clear();
dKVPu32 = null;
GC.Collect();

// 7) Populate the dictionary keyed by the UInt16 pair.
sw.Restart();
for (UInt16 i = 0; i < range; i++)
{
    for (UInt16 j = 0; j < range; j++)
    {
        dKVPu16.Add(new KeyValuePair<UInt16, UInt16>(i, j), String.Format("{0} {1}", i.ToString(), j.ToString()));
    }
}
Console.WriteLine("hsKVPu16 pop " + sw.ElapsedMilliseconds.ToString());
// recorded: 2201 ms

// 8) Look up every UInt16 key.
sw.Restart();
for (UInt16 i = 0; i < range; i++)
{
    for (UInt16 j = 0; j < range; j++)
    {
        if (!dKVPu16.ContainsKey(new KeyValuePair<UInt16, UInt16>(i, j))) Debug.WriteLine("Opps");
    }
}
sw.Stop();
Console.WriteLine("hsKVPu16 find " + sw.ElapsedMilliseconds.ToString());
// recorded: 431 ms
P.S. 最快的办法是把两个值打包成一个,例如 ((UInt32)int1 << 16) | int2;
下表中,第一列是 UInt32 形式的哈希值,它等于后两列的值所组成的 KVP 的哈希值。
2281371105 8 992
2281371104 8 993
2281371107 8 994
2281371145 0 0
2281371147 0 2
2281371149 0 4
2281371151 0 6
2281371137 0 8
2281371144 0 1
2281371146 0 3
2281371148 0 5
2281371150 0 7
2281371136 0 9
2281371144 1 0
2281371145 1 1
2281371146 1 2
2281371147 1 3
2281371148 1 4
2281371149 1 5
2281371150 1 6
2281371151 1 7
2281371136 1 8
2281371137 1 9
2281371147 2 0
2281371146 2 1
2281371144 2 3
2281371151 2 4
2281371150 2 5
2281371149 2 6
2281371148 2 7
2281371139 2 8
我发现的唯一模式是总和或差异或KVP匹配 但找不到何时总和以及何时减去的模式 这是一个糟糕的哈希,所以知道它是什么没什么价值。
答案 0 :(得分:8)
由于GetHashCode
返回Int32
,因此每对Int16
s(或UInt16
s)都可以轻松返回唯一值。使用一对Int32
s,您需要以某种方式组合这些值以与您的设计兼容。
KeyValuePair
不会覆盖GetHashCode()
,因此您只是使用ValueType.GetHashCode()
的默认实现,其文档说明如下:
(来自:http://msdn.microsoft.com/en-us/library/system.valuetype.gethashcode.aspx)
如果调用派生类型的GetHashCode方法,则返回值不太可能 适合用作哈希表中的键。另外,如果值为一个或多个 这些字段的更改,返回值可能不适合用作a中的键 哈希表。在任何一种情况下,请考虑编写自己的GetHashCode实现 更接近地表示类型的哈希码概念的方法。
由于KeyValuePair
未覆盖GetHashCode()
,我认为它不打算用作Dictionary
密钥。
此外,根据this question和this C# code,ValueType.GetHashCode()
的默认实现只是选择第一个非静态字段,并返回其GetHashCode()
方法的结果。这解释了KeyValuePair<UInt32, UInt32>
的大量重复项,但它没有解释KeyValuePair<UInt16, UInt16>
缺少重复项。
我的猜测是,对于KeyValuePair<UInt32, UInt32>
,GetHashCode()
只返回第一个值的GetHashCode()
,而KeyValuePair<UInt16, UInt16>
,GetHashCode()
正在组合值导致每对值都有唯一的哈希值,因为这样做是可能的,也是直接的。
答案 1 :(得分:7)
首先,我们可以先不考虑耗时问题——在我看来,这实际上只是哈希冲突的问题,因为冲突显然会严重拖累性能。
所以,问题是为什么KeyValuePair<uint, uint>
的哈希冲突多于KeyValuePair<ushort, ushort>
。为了帮助我们了解更多信息,我写了以下简短程序:
using System;
using System.Collections.Generic;
/// <summary>
/// Prints the hash codes of KeyValuePair instances built from two sample values,
/// for several combinations of key and value integer types.
/// </summary>
class Program
{
    const int Sample1 = 100;
    const int Sample2 = 213;

    /// <summary>Runs the demonstration for each key/value type combination.</summary>
    public static void Main()
    {
        Display<uint, ushort>();
        Display<ushort, ushort>();
        Display<uint, uint>();
        Display<ushort, uint>();
    }

    /// <summary>
    /// Writes a header naming the two types, then one hash code per line for the pairs
    /// (Sample1, Sample1), (Sample1, Sample2), (Sample2, Sample1), (Sample2, Sample2),
    /// followed by a blank line.
    /// </summary>
    static void Display<TKey, TValue>()
    {
        int[] samples = { Sample1, Sample2 };

        // Convert both samples to the requested key and value types up front.
        TKey[] keys = Array.ConvertAll(samples, s => (TKey) Convert.ChangeType(s, typeof(TKey)));
        TValue[] values = Array.ConvertAll(samples, s => (TValue) Convert.ChangeType(s, typeof(TValue)));

        Console.WriteLine("Testing {0}, {1}", typeof(TKey).Name, typeof(TValue).Name);

        // Keys vary in the outer loop, values in the inner loop.
        foreach (TKey key in keys)
        {
            foreach (TValue value in values)
            {
                var pair = new KeyValuePair<TKey, TValue>(key, value);
                Console.WriteLine(pair.GetHashCode());
            }
        }

        Console.WriteLine();
    }
}
我机器上的输出是:
Testing UInt32, UInt16
-1888265981
-1888265981
-1888265806
-1888265806
Testing UInt16, UInt16
-466800447
-459525951
-466800528
-459526032
Testing UInt32, UInt32
958334947
958334802
958334802
958334947
Testing UInt16, UInt32
-1913331935
-1913331935
-1913331935
-1913331935
显然,您可以尝试更改样本值以查看碰撞的位置。
KeyValuePair<ushort, uint>
的结果特别令人担忧,KeyValuePair<ushort, ushort>
的结果令人惊讶地好。
事实上,KeyValuePair<ushort, uint>
不只是糟糕——据我所见,它糟糕得离谱:在 64 位 CLR 上运行时,我还没有找到任何一组值的哈希码不是 -1913331935。在 32 位 CLR 上运行时我得到的是另一个哈希码,但所有值的哈希码仍然彼此相同。
似乎在.NET 4.5(我正在运行的)中,GetHashCode
的默认实现不只是采用结构的第一个实例字段,如前所述。我怀疑至少对于某些类型,它只使用盒装值中的标题之外的前4个字节的内存(并且这里将有每次调用的装箱),并且最终有时只是第一个字段(如果该字段是uint
),有时是多个字段(例如ushort, ushort
,其中两个字段都适合“内部”4个字节)和有时根本不是字段(ushort, uint
)。
(实际上,这并不能解释为什么你在uint, uint
案例中得到1024个不同的哈希码而不是1000个。我仍然不确定。)
最终,使用不会覆盖GetHashCode
的值类型作为字典键似乎就是个坏主意,除非您已经测试过并确认它满足您的具体需求。依我看(IMO),这里面的黑魔法太多,让人无法放心。
答案 2 :(得分:1)
正如其他回答者提到的,KeyValuePair
不会覆盖GetHashCode
,而GetHashCode
对结构体的默认实现并不理想(isn't the best)。您可以改用二元组(Tuple)作为键,例如
// Dictionary keyed by a two-element Tuple; Tuples override GetHashCode.
var dict = new Dictionary<Tuple<uint, uint>, string>
{
    { Tuple.Create(1u, 2u), "xxx" }
};
但是请注意,这会因为 Tuple 的堆分配而带来额外开销。(不过这一开销被部分抵消了,因为在一个没有重写 GetHashCode
的结构体上调用该方法时,本来就会隐式地对它进行装箱)
答案 3 :(得分:0)
基本原则是:如果要把大量自定义类型放进基于哈希的结构(如字典),就始终重写 GetHashCode。您可以用下面这个扩展来查看字典的填充情况,它会报告空槽、重复键等信息。我正打算把它放到 SourceForge 上,不过先贴在这里:
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
// This unit is Freeware. It was developed by Jerremy Koot & Ivo Tops. July 2011
//
// Version By Changes
// ======= ===== ==============================================================
// v1.02 Ivo Removed not-working Hashtable support and simplified code
// v1.01 Ivo Lowered memory usage
// v1.00 I&J First Version
namespace FastLibrary
{
    /// <summary>
    /// Static extension methods that report hashing statistics for
    /// Dictionary, ConcurrentDictionary and HashSet.
    /// </summary>
    /// <remarks>
    /// The bucket count is read through reflection from private fields of the container, so the
    /// report depends on runtime implementation details. Bucket occupancy is modelled as
    /// <c>(hash &amp; 0x7FFFFFFF) % bucketCount</c>; NOTE(review): a given runtime may select buckets
    /// differently, in which case the per-bucket figures are an approximation.
    /// </remarks>
    public static class ExtHashContainers
    {
        private const BindingFlags PrivateInstance = BindingFlags.NonPublic | BindingFlags.Instance;

        // Candidate names of the private bucket-array field. "buckets" and "m_buckets" are the names
        // this code has always probed; "_buckets" is the name believed to be used by current .NET
        // runtimes (verify against the targeted runtime).
        private static readonly string[] BucketFieldNames = { "buckets", "_buckets", "m_buckets" };

        // Candidate names of an intermediate "tables" object that holds the bucket array
        // (believed to be the ConcurrentDictionary layout on newer runtimes; verify if it matters).
        private static readonly string[] TablesFieldNames = { "m_tables", "_tables" };

        /// <summary>
        /// Checks a dictionary for performance statistics.
        /// </summary>
        /// <param name="source">The dictionary to examine.</param>
        /// <returns>A human-readable report, or a short message when no statistics are available.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
        public static string Statistics<TKey, TValue>(this Dictionary<TKey, TValue> source)
        {
            if (source == null) throw new ArgumentNullException("source");
            // Hash with the dictionary's own comparer so the report reflects the hashes it really uses.
            return ExamineData(source.Keys, source, source.Comparer);
        }

        /// <summary>
        /// Checks a concurrent dictionary for performance statistics.
        /// </summary>
        /// <param name="source">The concurrent dictionary to examine.</param>
        /// <returns>A human-readable report, or a short message when no statistics are available.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
        public static string Statistics<TKey, TValue>(this ConcurrentDictionary<TKey, TValue> source)
        {
            if (source == null) throw new ArgumentNullException("source");
            // No comparer is passed here, so keys are hashed with the default equality comparer.
            return ExamineData(source.Keys, source);
        }

        /// <summary>
        /// Checks a HashSet for performance statistics.
        /// </summary>
        /// <param name="source">The set to examine.</param>
        /// <returns>A human-readable report, or a short message when no statistics are available.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
        public static string Statistics<TKey>(this HashSet<TKey> source)
        {
            if (source == null) throw new ArgumentNullException("source");
            return ExamineData(source, source, source.Comparer);
        }

        /// <summary>
        /// Builds the statistics report for the keys of a hash container.
        /// </summary>
        /// <param name="source">The keys stored in the container.</param>
        /// <param name="hashContainer">The container itself; used for its type name and bucket count.</param>
        /// <param name="comparer">Comparer used to hash the keys; the default comparer when null.</param>
        /// <returns>
        /// "No Data found." for an empty container, an "Unable to get Buckets" message when the bucket
        /// array cannot be located, otherwise the multi-line report.
        /// </returns>
        private static string ExamineData<TKey>(ICollection<TKey> source, Object hashContainer,
                                                IEqualityComparer<TKey> comparer = null)
        {
            if (source.Count == 0) return "No Data found.";

            // Find Buckets. Zero is rejected as well because it would make the modulo below throw.
            var b = GetBuckets(hashContainer);
            if (b <= 0) return ("Unable to get Buckets Field for HashContainer");

            var keyComparer = comparer ?? EqualityComparer<TKey>.Default;

            // Create our counting temp structures: items per bucket, and items per distinct hash.
            var d = new int[b];
            var h = new Dictionary<int, int>(source.Count);

            // Find Hash Collisions and Bucket Stats
            foreach (var k in source)
            {
                // A null key (possible in a HashSet) is counted with hash 0 instead of throwing.
                var rawHash = k == null ? 0 : keyComparer.GetHashCode(k);
                var hash = rawHash & 0x7FFFFFFF; // Hashes are stripped of sign bit in HashContainers

                // Bucket Stats: non-negative hash, % for bucket selection
                d[hash%b]++;

                // Hashing Stats: c stays 0 when the hash has not been seen before.
                int c;
                h.TryGetValue(hash, out c);
                h[hash] = c + 1;
            }

            // Do some math
            var maxInBucket = d.Max();
            var maxSameHash = h.Values.Max();
            var emptyBuckets = d.Count(q => q == 0);
            var emptyStr = ((float) (emptyBuckets)/b*100).ToString("0.0");
            var worstHash = h.First(i => i.Value == maxSameHash).Key;

            // Report our findings
            var r = Environment.NewLine + hashContainer.GetType().Name + " has " + b + " buckets with " + source.Count +
                    " items. " +
                    Environment.NewLine + "The Largest bucket contains " + maxInBucket + " items. " +
                    Environment.NewLine + "It has " + (emptyBuckets) +
                    " empty buckets (" + emptyStr + "%)" + Environment.NewLine + "Each non-empty bucket has on average " +
                    ((source.Count/(float) (b - emptyBuckets))).ToString("0.0") + " items." + "The " + source.Count +
                    " items share " + h.Count +
                    " unique hashes. ";
            if (maxSameHash > 1)
                r += Environment.NewLine + "The largest collision has " + maxSameHash +
                     " items sharing the same hash, which == " + worstHash;
            return r;
        }

        /// <summary>
        /// Reads the length of the container's private bucket array through reflection.
        /// </summary>
        /// <param name="dictionary">A Dictionary, ConcurrentDictionary or HashSet (or a subclass of one).</param>
        /// <returns>The number of buckets, or -1 when it cannot be determined.</returns>
        private static Int32 GetBuckets(object dictionary)
        {
            // Walk up the inheritance chain until one of the supported generic containers is found,
            // so that subclasses (including generic ones) are handled too.
            for (var type = dictionary.GetType(); type != null; type = type.BaseType)
            {
                if (!type.IsGenericType) continue;

                var definition = type.GetGenericTypeDefinition();
                var supported = definition == typeof (Dictionary<,>) ||
                                definition == typeof (ConcurrentDictionary<,>) ||
                                definition == typeof (HashSet<>);
                if (!supported) continue;

                var buckets = FindBucketArray(dictionary, type);
                return buckets == null ? -1 : buckets.Length;
            }
            return -1;
        }

        /// <summary>
        /// Locates the bucket array, either directly on the container or inside its "tables" object.
        /// </summary>
        /// <returns>The bucket array, or null when no candidate field yields an array.</returns>
        private static Array FindBucketArray(object container, Type containerType)
        {
            var direct = ReadFirstField(container, containerType, BucketFieldNames) as Array;
            if (direct != null) return direct;

            var tables = ReadFirstField(container, containerType, TablesFieldNames);
            if (tables == null) return null;
            return ReadFirstField(tables, tables.GetType(), BucketFieldNames) as Array;
        }

        /// <summary>
        /// Returns the value of the first non-public instance field, among the candidate names,
        /// that is declared on the given type; null when none exists.
        /// </summary>
        private static object ReadFirstField(object instance, Type declaringType, string[] names)
        {
            foreach (var name in names)
            {
                var field = declaringType.GetField(name, PrivateInstance);
                if (field != null) return field.GetValue(instance);
            }
            return null;
        }
    }
}