Combine consecutive date ranges
An alternative solution that uses window functions rather than recursive CTEs
-- Collapse consecutive (gap-free) date ranges of each employment into one range.
-- grp = startdate minus the days covered by all earlier ranges of the same
-- employment; ranges that follow each other without a gap share one grp value.
SELECT
    employmentid,
    MIN(startdate) AS startdate,
    -- A NULL (open-ended) enddate is swapped for a far-future sentinel so MAX
    -- picks it up; NULLIF then turns the sentinel back into NULL.
    NULLIF(MAX(COALESCE(enddate, '9999-01-01')), '9999-01-01') AS enddate
FROM (
    SELECT
        employmentid,
        startdate,
        enddate,
        DATEADD(
            DAY,
            -COALESCE(
                -- +1 because both startdate and enddate are inclusive.
                -- The frame ends at 1 PRECEDING, so the current row is excluded.
                SUM(DATEDIFF(DAY, startdate, enddate) + 1) OVER (
                    PARTITION BY employmentid
                    ORDER BY startdate
                    ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
                ),
                0
            ),
            startdate
        ) AS grp
    FROM @t
) AS spans_with_grp
GROUP BY
    employmentid,
    grp
ORDER BY
    employmentid,
    startdate
This works by calculating a grp
value that will be the same for all consecutive rows. This is achieved by:
- Determine the total number of days each span occupies (+1 because the dates are inclusive)
-- Inclusive day count covered by each range (NULL when enddate is NULL).
SELECT
    *,
    DATEDIFF(DAY, startdate, enddate) + 1 AS daysSpanned
FROM @t
- Cumulative sum the days spanned for each employment, ordered by startdate. This gives us the total days spanned by all the previous employment spans
- We coalesce with 0 to ensure we don't have NULLs in our cumulative sum of days spanned
- We do not include the current row in our cumulative sum, because we will apply the value to `startdate` rather than `enddate` (we can't apply it to `enddate` because of the NULLs)
-- Running total of the days covered by all EARLIER ranges of the same
-- employment; the frame stops at 1 PRECEDING so the current row is excluded.
SELECT
    *,
    COALESCE(
        SUM(daysSpanned) OVER (
            PARTITION BY employmentid
            ORDER BY startdate
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ),
        0  -- an empty frame (e.g. the first range of an employment) sums to NULL
    ) AS cumulativeDaysSpanned
FROM (
    SELECT
        *,
        DATEDIFF(DAY, startdate, enddate) + 1 AS daysSpanned
    FROM @t
) AS spans
- Subtract the cumulative days from the `startdate` to get our `grp`. This is the crux of the solution.
  - If the start date increases at the same rate as the days spanned then the days are consecutive, and subtracting the two will give us the same value.
  - If the startdate increases faster than the days spanned then there is a gap and we will get a new `grp` value greater than the previous one.
  - Although `grp` is a date, the date itself is meaningless; we are using it just as a grouping value
-- Shift each startdate back by the days already covered by earlier ranges.
-- Consecutive ranges land on the same grp date; a gap produces a later one.
SELECT
    *,
    DATEADD(DAY, -cumulativeDaysSpanned, startdate) AS grp
FROM (
    -- Days covered by all earlier ranges of the same employment (current row excluded).
    SELECT
        *,
        COALESCE(
            SUM(daysSpanned) OVER (
                PARTITION BY employmentid
                ORDER BY startdate
                ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
            ),
            0
        ) AS cumulativeDaysSpanned
    FROM (
        -- Inclusive day count of each range.
        SELECT
            *,
            DATEDIFF(DAY, startdate, enddate) + 1 AS daysSpanned
        FROM @t
    ) AS spans
) AS running_totals
With the results
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| employmentid | startdate | enddate | daysSpanned | cumulativeDaysSpanned | grp |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 5 | 2007-12-03 00:00:00.000 | 2011-08-26 00:00:00.000 | 1363 | 0 | 2007-12-03 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 5 | 2013-05-02 00:00:00.000 | NULL | NULL | 1363 | 2009-08-08 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 30 | 2006-10-02 00:00:00.000 | 2011-01-16 00:00:00.000 | 1568 | 0 | 2006-10-02 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 30 | 2011-01-17 00:00:00.000 | 2012-08-12 00:00:00.000 | 574 | 1568 | 2006-10-02 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 30 | 2012-08-13 00:00:00.000 | NULL | NULL | 2142 | 2006-10-02 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
| 66 | 2007-09-24 00:00:00.000 | NULL | NULL | 0 | 2007-09-24 00:00:00.000 |
+--------------+-------------------------+-------------------------+-------------+-----------------------+-------------------------+
- Finally we can `GROUP BY grp` to collapse each run of consecutive rows into a single row.
  - Use `MIN` and `MAX` to get the new `startdate` and `enddate`
  - To handle the NULL `enddate` values we give them a large value so they get picked up by `MAX`, then convert them back to `NULL` again
-- Step-by-step form: collapse consecutive date ranges per employment.
SELECT
    employmentid,
    MIN(startdate) AS startdate,
    -- A NULL (open-ended) enddate is swapped for a far-future sentinel so MAX
    -- picks it up; NULLIF then turns the sentinel back into NULL.
    NULLIF(MAX(COALESCE(enddate, '9999-01-01')), '9999-01-01') AS enddate
FROM (
    -- grp: startdate shifted back by the days covered by earlier ranges;
    -- identical for every range in one consecutive run.
    SELECT
        *,
        DATEADD(DAY, -cumulativeDaysSpanned, startdate) AS grp
    FROM (
        -- Days covered by all earlier ranges of the same employment
        -- (frame ends at 1 PRECEDING, so the current row is excluded).
        SELECT
            *,
            COALESCE(
                SUM(daysSpanned) OVER (
                    PARTITION BY employmentid
                    ORDER BY startdate
                    ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
                ),
                0
            ) AS cumulativeDaysSpanned
        FROM (
            -- Inclusive day count of each range.
            SELECT
                *,
                DATEDIFF(DAY, startdate, enddate) + 1 AS daysSpanned
            FROM @t
        ) AS spans
    ) AS running_totals
) AS grouped_spans
GROUP BY
    employmentid,
    grp
ORDER BY
    employmentid,
    startdate
To get the desired result
+--------------+-------------------------+-------------------------+
| employmentid | startdate | enddate |
+--------------+-------------------------+-------------------------+
| 5 | 2007-12-03 00:00:00.000 | 2011-08-26 00:00:00.000 |
+--------------+-------------------------+-------------------------+
| 5 | 2013-05-02 00:00:00.000 | NULL |
+--------------+-------------------------+-------------------------+
| 30 | 2006-10-02 00:00:00.000 | NULL |
+--------------+-------------------------+-------------------------+
| 66 | 2007-09-24 00:00:00.000 | NULL |
+--------------+-------------------------+-------------------------+
- We can combine the inner queries to get the query at the start of this answer. Which is shorter, but less explainable
Limitations: all of this requires that
- there are no overlaps of startdate and enddate for an employment. Overlaps could produce collisions in our `grp`.
- startdate is not NULL. However, this could be overcome by replacing NULL start dates with small date values
- future developers can decipher the window black magic you performed
The strange bit you see with my use of the date '32121231' is just a very large date to handle your "no-end-date" scenario. I have assumed you won't really have many date ranges per employee, so I've used a simple recursive Common Table Expression to combine the ranges.
To make it run faster, the starting anchor query keeps only those dates that will not link up to a prior range (per employee). The rest is just tree-walking the date ranges and growing the range. The final GROUP BY keeps only the largest date range built up per starting ANCHOR (employmentid, startdate) combination.
SQL Fiddle
MS SQL Server 2008 Schema Setup:
-- Sample schema: one row per employment date range.
-- A NULL enddate represents a range with no end date.
create table Tbl (
    employmentid int,
    startdate datetime,
    enddate datetime
);

-- Explicit column list keeps the fixture valid if columns are added or reordered.
-- Dates use the unseparated 'YYYYMMDD' form, which SQL Server parses the same way
-- for datetime regardless of the session's language / DATEFORMAT setting.
insert into Tbl (employmentid, startdate, enddate)
values
    (5,  '20071203', '20110826'),
    (5,  '20130502', null),
    (30, '20061002', '20110116'),
    (30, '20110117', '20120812'),
    (30, '20120813', null),
    (66, '20070924', null);
/*
-- expected outcome
EmploymentId StartDate EndDate
5 2007-12-03 2011-08-26
5 2013-05-02 NULL
30 2006-10-02 NULL
66 2007-09-24 NULL
*/
Query 1:
-- Merge date ranges that touch end-to-start (next startdate = previous enddate + 1 day)
-- into a single range per employee, using a recursive CTE.
-- NOTE: SQL Server stops a recursive CTE after 100 levels by default; a run of
-- more consecutive ranges than that needs an OPTION (MAXRECURSION n) hint.
;with cte as (
    -- Anchor: ranges with no predecessor ending the day before they start,
    -- i.e. the first range of every consecutive run.
    select a.employmentid, a.startdate, a.enddate
    from Tbl a
    left join Tbl b
        on a.employmentid = b.employmentid
       and dateadd(day, -1, a.startdate) = b.enddate
    where b.employmentid is null
    union all
    -- Recursive step: extend the run with the range that starts the day after
    -- the current enddate. A NULL enddate never matches, which ends the walk.
    select a.employmentid, a.startdate, b.enddate
    from cte a
    join Tbl b
        on a.employmentid = b.employmentid
       and dateadd(day, -1, b.startdate) = a.enddate
)
select employmentid,
       startdate,
       -- '32121231' stands in for a NULL (no end date) enddate so max() keeps it;
       -- nullif() converts it back to NULL afterwards.
       nullif(max(coalesce(enddate, '32121231')), '32121231') as enddate
from cte
group by employmentid, startdate
-- startdate is included so the row order within an employee is deterministic.
order by employmentid, startdate
Results:
| EMPLOYMENTID | STARTDATE | ENDDATE |
-----------------------------------------------------------------------------------
| 5 | December, 03 2007 00:00:00+0000 | August, 26 2011 00:00:00+0000 |
| 5 | May, 02 2013 00:00:00+0000 | (null) |
| 30 | October, 02 2006 00:00:00+0000 | (null) |
| 66 | September, 24 2007 00:00:00+0000 | (null) |
-- Demo: merge overlapping date ranges held in a table variable.
SET NOCOUNT ON;

DECLARE @T TABLE (
    ID INT,
    FromDate DATETIME,
    ToDate DATETIME
);

INSERT INTO @T (ID, FromDate, ToDate)
          SELECT 1, '20090801', '20090803'
UNION ALL SELECT 2, '20090802', '20090809'
UNION ALL SELECT 3, '20090805', '20090806'
UNION ALL SELECT 4, '20090812', '20090813'
UNION ALL SELECT 5, '20090811', '20090812'
UNION ALL SELECT 6, '20090802', '20090802';

-- For every FromDate that opens a merged range, pick the earliest ToDate that
-- closes one at or after it.
SELECT
    ROW_NUMBER() OVER (ORDER BY range_start.FromDate) AS ID,
    range_start.FromDate,
    MIN(range_end.ToDate) AS ToDate
FROM @T AS range_start
INNER JOIN @T AS range_end
    ON range_end.ToDate >= range_start.FromDate
    -- range_end closes a merged range: no row starts on or before its ToDate
    -- and ends strictly after it.
    AND NOT EXISTS (
        SELECT *
        FROM @T AS extender
        WHERE extender.FromDate <= range_end.ToDate
          AND extender.ToDate > range_end.ToDate
    )
-- range_start opens a merged range: no row starts strictly earlier and still
-- covers its FromDate.
WHERE NOT EXISTS (
    SELECT *
    FROM @T AS coverer
    WHERE coverer.FromDate < range_start.FromDate
      AND coverer.ToDate >= range_start.FromDate
)
GROUP BY range_start.FromDate
ORDER BY range_start.FromDate;