Convert List of Numbers to String Ranges
One approach could be "eating" piece by piece the input sequence and store the partial range results untill you've got them all:
def formatter(start, end, step):
return '{}-{}:{}'.format(start, end, step)
# return '{}-{}:{}'.format(start, end + step, step)
def helper(lst):
if len(lst) == 1:
return str(lst[0]), []
if len(lst) == 2:
return ','.join(map(str,lst)), []
step = lst[1] - lst[0]
for i,x,y in zip(itertools.count(1), lst[1:], lst[2:]):
if y-x != step:
if i > 1:
return formatter(lst[0], lst[i], step), lst[i+1:]
else:
return str(lst[0]), lst[1:]
return formatter(lst[0], lst[-1], step), []
def re_range(lst):
result = []
while lst:
partial,lst = helper(lst)
result.append(partial)
return ','.join(result)
I test it with a bunch of unit tests and it passed them all, it can handle negative numbers too, but they'll look kind of ugly (it's really anybody's fault).
Example:
>>> re_range([1, 4,5,6, 10, 15,16,17,18, 22, 25,26,27,28])
'1,4-6:1,10,15-18:1,22,25-28:1'
>>> re_range([1, 3, 5, 7, 8, 9, 10, 11, 13, 15, 17])
'1-7:2,8-11:1,13-17:2'
Note: I wrote the code for Python 3.
Performance
I didn't put any performance effort in the solution above. In particular, every time a list get re-builded with slicing, it might take some time if the input list has a particular shape. So, the first simple improvement would be using itertools.islice()
where possible.
Anyway here's another implementation of the same algorithm, that scan through the input list with a scan
index instead of slicing:
def re_range(lst):
n = len(lst)
result = []
scan = 0
while n - scan > 2:
step = lst[scan + 1] - lst[scan]
if lst[scan + 2] - lst[scan + 1] != step:
result.append(str(lst[scan]))
scan += 1
continue
for j in range(scan+2, n-1):
if lst[j+1] - lst[j] != step:
result.append(formatter(lst[scan], lst[j], step))
scan = j+1
break
else:
result.append(formatter(lst[scan], lst[-1], step))
return ','.join(result)
if n - scan == 1:
result.append(str(lst[scan]))
elif n - scan == 2:
result.append(','.join(map(str, lst[scan:])))
return ','.join(result)
I stopped working on it once it got ~65% faster than the previous top solution, it seemed enough :)
Anyway I'd say that there might still be room for improvement (expecially in the middle for-loop).
This is most likely what you are looking for.
Edit: I see you already found the post. My apologies.
To help with the second part, I've tinkered a bit myself. This is what I came up with:
from numpy import diff
data = [ 1, 3, 5, 7, 8, 9, 10, 11, 13, 15, 17 ]
onediff, twodiff = diff(data), diff(diff(data))
increments, breakingindices = [], []
for i in range(len(twodiff)):
if twodiff[i] != 0:
breakingindices.append(i+2) # Correct index because of the two diffs
increments.append(onediff[i]) # Record the increment for this section
# Increments and breakingindices should be the same size
str_list = []
start = data[0]
for i in range(len(breakingindices)):
str_list.append("%d-%d:%d" % (start, data[breakingindices[i]-1], increments[i]))
start = data[breakingindices[i]]
str_list.append("%d-%d:%d" % (start, data[len(data)-1], onediff[len(onediff)-1]))
print str_list
For the given input list, this gives: ['1-7:2', '8-11:1', '13-17:2']
. The code could do with a bit of cleanup, but this sorts with your problem assuming the grouping can be done sequentially.
{caution: for [1,2,3,5,6,7] this gives ['1-3:1', '5-5:2', '6-7:1'] instead of ['1-3:1', '5-7:1']}
This is a comparison of the 3 methods. Change the amount of data and the density via the values below...no matter what values I use, the first solution seems to be the quickest for me. For very large sets of data, the third solution becomes very slow.
EDITED
Edited to include comments below and add in a new solution. The last solution seems to be the quickest now.
import numpy as np
import itertools
import random
import timeit
# --- My Solution --------------------------------------------------------------
def list_to_ranges1(data):
data = sorted(data)
diff_data = np.diff(data)
ranges = []
i = 0
skip_next = False
for k, iterable in itertools.groupby(diff_data, None):
rng = list(iterable)
step = rng[0]
if skip_next:
skip_next = False
rng.pop()
if len(rng) == 0:
continue
elif len(rng) == 1:
ranges.append('%d' % data[i])
elif step == 1:
ranges.append('%d-%d' % (data[i], data[i+len(rng)]+step))
i += 1
skip_next = True
else:
ranges.append('%d-%d:%d' % (data[i], data[i+len(rng)]+step, step))
i += 1
skip_next = True
i += len(rng)
if len(rng) == 0 or len(rng) == 1:
ranges.append('%d' % data[i])
return ','.join(ranges)
# --- Kaidence Solution --------------------------------------------------------
# With a minor edit for use in range function
def list_to_ranges2(data):
onediff = np.diff(data)
twodiff = np.diff(onediff)
increments, breakingindices = [], []
for i in range(len(twodiff)):
if twodiff[i] != 0:
breakingindices.append(i+2) # Correct index because of the two diffs
increments.append(onediff[i]) # Record the increment for this section
# Increments and breakingindices should be the same size
str_list = []
start = data[0]
for i in range(len(breakingindices)):
str_list.append("%d-%d:%d" % (start,
data[breakingindices[i]-1] + increments[i],
increments[i]))
start = data[breakingindices[i]]
str_list.append("%d-%d:%d" % (start,
data[len(data)-1] + onediff[len(onediff)-1],
onediff[len(onediff)-1]))
return ','.join(str_list)
# --- Rik Poggi Solution -------------------------------------------------------
# With a minor edit for use in range function
def helper(lst):
if len(lst) == 1:
return str(lst[0]), []
if len(lst) == 2:
return ','.join(map(str,lst)), []
step = lst[1] - lst[0]
#for i,x,y in itertools.izip(itertools.count(1), lst[1:], lst[2:]):
for i,x,y in itertools.izip(itertools.count(1),
itertools.islice(lst, 1, None, 1),
itertools.islice(lst, 2, None, 1)):
if y-x != step:
if i > 1:
return '{}-{}:{}'.format(lst[0], lst[i]+step, step), lst[i+1:]
else:
return str(lst[0]), lst[1:]
return '{}-{}:{}'.format(lst[0], lst[-1]+step, step), []
def list_to_ranges3(lst):
result = []
while lst:
partial,lst = helper(lst)
result.append(partial)
return ','.join(result)
# --- Rik Poggi Solution 2 -----------------------------------------------------
def formatter(start, end, step):
#return '{}-{}:{}'.format(start, end, step)
return '{}-{}:{}'.format(start, end + step, step)
def list_to_ranges4(lst):
n = len(lst)
result = []
scan = 0
while n - scan > 2:
step = lst[scan + 1] - lst[scan]
if lst[scan + 2] - lst[scan + 1] != step:
result.append(str(lst[scan]))
scan += 1
continue
for j in xrange(scan+2, n-1):
if lst[j+1] - lst[j] != step:
result.append(formatter(lst[scan], lst[j], step))
scan = j+1
break
else:
result.append(formatter(lst[scan], lst[-1], step))
return ','.join(result)
if n - scan == 1:
result.append(str(lst[scan]))
elif n - scan == 2:
result.append(','.join(itertools.imap(str, lst[scan:])))
return ','.join(result)
# --- Test Function ------------------------------------------------------------
def test_data(data, f_to_test):
data_str = f_to_test(data)
_list = []
for r in data_str.replace('-',':').split(','):
r = [int(a) for a in r.split(':')]
if len(r) == 1:
_list.extend(r)
elif len(r) == 2:
_list.extend(range(r[0], r[1]))
else:
_list.extend(range(r[0], r[1], r[2]))
return _list
# --- Timing Tests -------------------------------------------------------------
# Generate some sample data...
data_list = []
for i in range(5):
# Note: using the "4000" and "5000" values below, the relative density of
# the data can be changed. This has a huge effect on the results
# (particularly on the results for list_to_ranges3 which uses recursion).
data_list.append(sorted(list(set([random.randint(1,4000) for a in \
range(random.randint(5,5000))]))))
testfuncs = list_to_ranges1, list_to_ranges2, list_to_ranges3, list_to_ranges4
for f in testfuncs:
print '\n', f.__name__
for i, data in enumerate(data_list):
t = timeit.Timer('f(data)', 'from __main__ import data, f')
#print f(data)
print i, data==test_data(data, f), round(t.timeit(200), 3)