Calculating average and percentiles from a histogram map?
Given a histogram (frequency list) like the following:
Value | Frequency
------+----------
1 | 5
2 | 3
3 | 1
4 | 7
5 | 2
..
Where each Value
has occurred Frequency
times in your data-set.
/**
 * Computes the arithmetic mean of the data set described by a histogram.
 *
 * <p>Each key of the map is an observed value and the mapped number is how
 * many times that value occurred, so the mean is
 * {@code SUM(value x frequency) / SUM(frequency)}.
 *
 * @param histogram map from observed value to its frequency; must not be null
 * @return the weighted mean, or {@code NaN} when the histogram holds no
 *         observations (the division is 0.0 / 0.0)
 */
public static double getMean (ConcurrentHashMap<Long,Long> histogram)
{
    double weightedSum = 0;
    double observations = 0;
    // keySet() returns a plain Set view, not a TreeSet, and ordering is
    // irrelevant for a sum, so iterate over the entries directly.
    for (Map.Entry<Long, Long> bin : histogram.entrySet())
    {
        final double frequency = bin.getValue().doubleValue();
        // Multiply as doubles: value x frequency can overflow a long.
        weightedSum += bin.getKey().doubleValue() * frequency;
        observations += frequency;
    }
    return weightedSum / observations;
}
The mean is straightforward to implement. The median is the 50th percentile, so you only need a single working percentile method plus a utility method that calls it for the median. There are several variations of the percentile calculation; this one is intended to generate the same results as the Microsoft Excel PERCENTILE.INC function.
import java.util.Map;
import java.util.SortedSet;
import java.util.concurrent.ConcurrentSkipListSet;
/**
 * Summary statistics (mean, median, percentiles, observation count) for a
 * histogram stored as a map from observed value to the number of times that
 * value occurred.
 *
 * <p>Frequencies are expected to be non-negative. All methods are static and
 * keep no state.
 */
public class HistogramStatistics
{
    /**
     * Alias for {@link #mean(Map)}.
     *
     * @param histogram map from observed value to its frequency; must not be null
     * @return the mean of the described data set
     */
    public static Double average(final Map<Long, Long> histogram)
    {
        return HistogramStatistics.mean(histogram);
    }

    /**
     * Computes the arithmetic mean, {@code SUM(value x frequency) / SUM(frequency)}.
     *
     * @param histogram map from observed value to its frequency; must not be null
     * @return the mean, or {@code NaN} when the histogram holds no observations
     *         (the division is 0.0 / 0.0)
     */
    public static Double mean(final Map<Long, Long> histogram)
    {
        double weightedSum = 0d;
        long observations = 0L;
        for (Map.Entry<Long, Long> bin : histogram.entrySet())
        {
            final long frequency = bin.getValue();
            // Multiply as doubles: value x frequency can overflow a long.
            weightedSum += bin.getKey().doubleValue() * (double) frequency;
            observations += frequency;
        }
        return weightedSum / (double) observations;
    }

    /**
     * Computes the median, i.e. the 50th percentile.
     *
     * @param histogram map from observed value to its frequency
     * @return the median, or {@code null} when the histogram is null, empty
     *         or holds no observations
     */
    public static Double median(final Map<Long, Long> histogram)
    {
        return HistogramStatistics.percentile(histogram, 0.50d);
    }

    /**
     * Computes a percentile using linear interpolation between the two
     * closest ranks (rank = percent x (N - 1) + 1, 1-based), the scheme used
     * by the inclusive percentile definition.
     *
     * @param histogram map from observed value to its frequency
     * @param percent   the requested percentile as a fraction in [0, 1]
     * @return the interpolated percentile value, or {@code null} when the
     *         histogram is null, empty or holds no observations
     * @throws IllegalArgumentException if {@code percent} is NaN or outside [0, 1]
     */
    public static Double percentile(final Map<Long, Long> histogram, final double percent)
    {
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!((percent >= 0d) && (percent <= 1d)))
        {
            throw new IllegalArgumentException("Percentile must be between 0.00 and 1.00.");
        }
        if ((histogram == null) || histogram.isEmpty())
        {
            return null;
        }
        final long observations = HistogramStatistics.count(histogram);
        if (observations <= 0L)
        {
            // Only empty bins: there is no data to take a percentile of.
            return null;
        }

        // Fractional 1-based rank of the requested percentile in the sorted data.
        final double rank = (percent * ((double) observations - 1d)) + 1d;
        final double lowerRank = Math.floor(rank);
        final double fraction = rank - lowerRank;

        final SortedSet<Long> bins = new ConcurrentSkipListSet<Long>(histogram.keySet());
        long observationsBelowBinInclusive = 0L;
        Long lowBin = null; // closest preceding bin that actually holds observations
        for (Long highBin : bins)
        {
            final Long frequency = histogram.get(highBin);
            if ((frequency == null) || (frequency <= 0L))
            {
                // An empty bin contributes no observations and must not be
                // used as an interpolation endpoint.
                continue;
            }
            final long observationsBelowBin = observationsBelowBinInclusive;
            observationsBelowBinInclusive += frequency;
            if (rank <= observationsBelowBinInclusive)
            {
                // The observation at ceil(rank) lies in highBin. The one at
                // floor(rank) lies in the preceding bin only when floor(rank)
                // equals the number of observations before highBin; otherwise
                // both neighbours are in highBin and no interpolation is needed.
                if ((fraction == 0d) || (lowerRank > observationsBelowBin) || (lowBin == null))
                {
                    return highBin.doubleValue();
                }
                final double low = lowBin.doubleValue();
                return low + ((highBin.doubleValue() - low) * fraction);
            }
            lowBin = highBin;
        }
        return null;
    }

    /**
     * Counts the observations in the histogram by summing all frequencies.
     *
     * @param histogram map from observed value to its frequency; must not be null
     * @return the total number of observations
     */
    public static Long count(final Map<Long, Long> histogram)
    {
        long observations = 0L;
        for (Long frequency : histogram.values())
        {
            observations += frequency;
        }
        return observations;
    }
}