On string interning and alternatives
When in doubt, cheat! :-)
public class CachingEqualityComparer<T> : IEqualityComparer<T> where T : class
{
public T X { get; private set; }
public T Y { get; private set; }
public IEqualityComparer<T> DefaultComparer = EqualityComparer<T>.Default;
public bool Equals(T x, T y)
{
bool result = DefaultComparer.Equals(x, y);
if (result)
{
X = x;
Y = y;
}
return result;
}
public int GetHashCode(T obj)
{
return DefaultComparer.GetHashCode(obj);
}
public T Other(T one)
{
if (object.ReferenceEquals(one, X))
{
return Y;
}
if (object.ReferenceEquals(one, Y))
{
return X;
}
throw new ArgumentException("one");
}
public void Reset()
{
X = default(T);
Y = default(T);
}
}
Example of use:
var comparer = new CachingEqualityComparer<string>();
var hs = new HashSet<string>(comparer);
string str = "Hello";
string st1 = str.Substring(2);
hs.Add(st1);
string st2 = str.Substring(2);
// st1 and st2 are distinct strings!
if (object.ReferenceEquals(st1, st2))
{
throw new Exception();
}
comparer.Reset();
if (hs.Contains(st2))
{
string cached = comparer.Other(st2);
Console.WriteLine("Found!");
// cached is st1
if (!object.ReferenceEquals(cached, st1))
{
throw new Exception();
}
}
I've created an equality comparer that "caches" the last Equal
terms it analyzed :-)
Everything could then be encapsulated in a subclass of HashSet<T>
/// <summary>
/// An HashSet<T;gt; that, thorough a clever use of an internal
/// comparer, can have a AddOrGet and a TryGet
/// </summary>
/// <typeparam name="T"></typeparam>
public class HashSetEx<T> : HashSet<T> where T : class
{
public HashSetEx()
: base(new CachingEqualityComparer<T>())
{
}
public HashSetEx(IEqualityComparer<T> comparer)
: base(new CachingEqualityComparer<T>(comparer))
{
}
public T AddOrGet(T item)
{
if (!Add(item))
{
var comparer = (CachingEqualityComparer<T>)Comparer;
item = comparer.Other(item);
}
return item;
}
public bool TryGet(T item, out T item2)
{
if (Contains(item))
{
var comparer = (CachingEqualityComparer<T>)Comparer;
item2 = comparer.Other(item);
return true;
}
item2 = default(T);
return false;
}
private class CachingEqualityComparer<T> : IEqualityComparer<T> where T : class
{
public WeakReference X { get; private set; }
public WeakReference Y { get; private set; }
private readonly IEqualityComparer<T> Comparer;
public CachingEqualityComparer()
{
Comparer = EqualityComparer<T>.Default;
}
public CachingEqualityComparer(IEqualityComparer<T> comparer)
{
Comparer = comparer;
}
public bool Equals(T x, T y)
{
bool result = Comparer.Equals(x, y);
if (result)
{
X = new WeakReference(x);
Y = new WeakReference(y);
}
return result;
}
public int GetHashCode(T obj)
{
return Comparer.GetHashCode(obj);
}
public T Other(T one)
{
if (object.ReferenceEquals(one, null))
{
return null;
}
object x = X.Target;
object y = Y.Target;
if (x != null && y != null)
{
if (object.ReferenceEquals(one, x))
{
return (T)y;
}
else if (object.ReferenceEquals(one, y))
{
return (T)x;
}
}
return one;
}
}
}
Note the use of WeakReference
so that there aren't useless references to objects that could prevent garbage collection.
Example of use:
var hs = new HashSetEx<string>();
string str = "Hello";
string st1 = str.Substring(2);
hs.Add(st1);
string st2 = str.Substring(2);
// st1 and st2 are distinct strings!
if (object.ReferenceEquals(st1, st2))
{
throw new Exception();
}
string stFinal = hs.AddOrGet(st2);
if (!object.ReferenceEquals(stFinal, st1))
{
throw new Exception();
}
string stFinal2;
bool result = hs.TryGet(st1, out stFinal2);
if (!object.ReferenceEquals(stFinal2, st1))
{
throw new Exception();
}
if (!result)
{
throw new Exception();
}
I've had exactly this requirement and indeed asked on SO, but with nothing like the detail of your question, no useful responses. One option that is built in is a (System.Xml).NameTable, which is basically a string atomization object, which is what you are looking for, we had (we've actually move to Intern because we do keep these strings for App-life).
if (name == null) return null;
if (name == "") return string.Empty;
lock (m_nameTable)
{
return m_nameTable.Add(name);
}
on a private NameTable
http://referencesource.microsoft.com/#System.Xml/System/Xml/NameTable.cs,c71b9d3a7bc2d2af shows its implemented as a Simple hashtable, ie only storing one reference per string.
Downside? is its completely string specific. If you do cross-test for memory / speed I'd be interested to see the results. We were already using System.Xml heavily, might of course not seem so natural if you where not.