How to GroupBy objects by numeric values with tolerance factor?
It seemed to me that if you have a large data set you'll want to avoid the straightforward solution of sorting the values and then collecting them as you iterate through the sorted list, since sorting a large collection can be expensive. The most efficient solution I could think of which doesn't do any explicit sorting was to build a tree where each node contains the items where the key falls within a "contiguous" range (where all the keys are within tolerance
of each other) - the range for each node expands every time an item is added which falls outside the range by less than tolerance
. I implemented a solution - which turned out to be more complicated and interesting than I expected - and based on my rough benchmarking it looks like doing it this way takes about half as much time as the straightforward solution.
Here's my implementation as an extension method (so you can chain it, although like the normal Group
method it'll iterate the source
completely as soon as the result IEnumerable
is iterated).
public static IEnumerable<IGrouping<double, TValue>> GroupWithTolerance<TValue>(
this IEnumerable<TValue> source,
double tolerance,
Func<TValue, double> keySelector)
{
if(source == null)
throw new ArgumentNullException("source");
return GroupWithToleranceHelper<TValue>.Group(source, tolerance, keySelector);
}
private static class GroupWithToleranceHelper<TValue>
{
public static IEnumerable<IGrouping<double, TValue>> Group(
IEnumerable<TValue> source,
double tolerance,
Func<TValue, double> keySelector)
{
Node root = null, current = null;
foreach (var item in source)
{
var key = keySelector(item);
if(root == null) root = new Node(key);
current = root;
while(true){
if(key < current.Min - tolerance) { current = (current.Left ?? (current.Left = new Node(key))); }
else if(key > current.Max + tolerance) {current = (current.Right ?? (current.Right = new Node(key)));}
else
{
current.Values.Add(item);
if(current.Max < key){
current.Max = key;
current.Redistribute(tolerance);
}
if(current.Min > key) {
current.Min = key;
current.Redistribute(tolerance);
}
break;
}
}
}
foreach (var entry in InOrder(root))
{
yield return entry;
}
}
private static IEnumerable<IGrouping<double, TValue>> InOrder(Node node)
{
if(node.Left != null)
foreach (var element in InOrder(node.Left))
yield return element;
yield return node;
if(node.Right != null)
foreach (var element in InOrder(node.Right))
yield return element;
}
private class Node : IGrouping<double, TValue>
{
public double Min;
public double Max;
public readonly List<TValue> Values = new List<TValue>();
public Node Left;
public Node Right;
public Node(double key) {
Min = key;
Max = key;
}
public double Key { get { return Min; } }
IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }
public IEnumerator<TValue> GetEnumerator() { return Values.GetEnumerator(); }
public IEnumerable<TValue> GetLeftValues(){
return Left == null ? Values : Values.Concat(Left.GetLeftValues());
}
public IEnumerable<TValue> GetRightValues(){
return Right == null ? Values : Values.Concat(Right.GetRightValues());
}
public void Redistribute(double tolerance)
{
if(this.Left != null) {
this.Left.Redistribute(tolerance);
if(this.Left.Max + tolerance > this.Min){
this.Values.AddRange(this.Left.GetRightValues());
this.Min = this.Left.Min;
this.Left = this.Left.Left;
}
}
if(this.Right != null) {
this.Right.Redistribute(tolerance);
if(this.Right.Min - tolerance < this.Max){
this.Values.AddRange(this.Right.GetLeftValues());
this.Max = this.Right.Max;
this.Right = this.Right.Right;
}
}
}
}
}
You can switch double
to another type if you need to (I so wish C# had a numeric
generic constraint).
The most straight-forward approach is to design your own IEqualityComparer<double>
.
public class ToleranceEqualityComparer : IEqualityComparer<double>
{
public double Tolerance { get; set; } = 0.02;
public bool Equals(double x, double y)
{
return x - Tolerance <= y && x + Tolerance > y;
}
//This is to force the use of Equals methods.
public int GetHashCode(double obj) => 1;
}
Which you should use like so
var dataByPrice = data.GroupBy(d => d.Price, new ToleranceEqualityComparer());