Populating missing data based on previous month-end values
One approach to this problem is to do the following:
- Emulate
LEAD
on SQL Server 2008. You can useAPPLY
or a suquery for this. - For rows without a next row, add one month to their account date.
- Join to a dimension table that contains month end dates. This eliminates all rows that don't span at least a month and adds rows to fill in the gaps as necessary.
I modified your test data a little bit to make the results deterministic. Also added an index:
create table #histories
(
username varchar(10),
account varchar(10),
assigned date
);
insert into #histories
values
('PHIL','ACCOUNT1','2017-01-04'),
('PETER','ACCOUNT1','2017-01-15'),
('DAVE','ACCOUNT1','2017-03-04'),
('ANDY','ACCOUNT1','2017-05-06'),
('DAVE','ACCOUNT1','2017-05-07'),
('FRED','ACCOUNT1','2017-05-08'),
('JAMES','ACCOUNT1','2017-08-05'),
('DAVE','ACCOUNT2','2017-01-02'),
('PHIL','ACCOUNT2','2017-01-18'),
('JOSH','ACCOUNT2','2017-04-08'), -- changed this date to have deterministic results
('JAMES','ACCOUNT2','2017-04-09'),
('DAVE','ACCOUNT2','2017-05-06'),
('PHIL','ACCOUNT2','2017-05-07') ;
-- make life easy
create index gotta_go_fast ON #histories (account, assigned);
Here's the laziest date dimension table of all time:
create table #date_dim_months_only (
month_date date,
primary key (month_date)
);
-- put 2500 month ends into table
INSERT INTO #date_dim_months_only WITH (TABLOCK)
SELECT DATEADD(DAY, -1, DATEADD(MONTH, ROW_NUMBER() OVER (ORDER BY (SELECT NULL)), '20000101'))
FROM master..spt_values;
For step 1, there are plenty of ways to emulate LEAD
. Here's one method:
SELECT
h1.username
, h1.account
, h1.assigned
, next_date.assigned
FROM #histories h1
OUTER APPLY (
SELECT TOP 1 h2.assigned
FROM #histories h2
WHERE h1.account = h2.account
AND h1.assigned < h2.assigned
ORDER BY h2.assigned ASC
) next_date;
For step 2, we need to change the NULL values to something else. You want to include the final month for each account, so adding one month to the starting date suffices:
ISNULL(next_date.assigned, DATEADD(MONTH, 1, h1.assigned))
For step 3, we can join to the date dimension table. The column from the dimension table is exactly the column you need for the result set:
INNER JOIN #date_dim_months_only dd ON
dd.month_date >= h1.assigned AND
dd.month_date < ISNULL(next_date.assigned, DATEADD(MONTH, 1, h1.assigned))
I didn't like the query that I got when I put it all together. There can be issues with join order when combining OUTER APPLY
and INNER JOIN
. To get the join order I wanted I rewrote it with a subquery:
SELECT
hist.username
, hist.account
, dd.month_date
FROM
(
SELECT
h1.username
, h1.account
, h1.assigned
, ISNULL(
(SELECT TOP 1 h2.assigned
FROM #histories h2
WHERE h1.account = h2.account
AND h1.assigned < h2.assigned
ORDER BY h2.assigned ASC
)
, DATEADD(MONTH, 1, h1.assigned)
) next_assigned
FROM #histories h1
) hist
INNER JOIN #date_dim_months_only dd ON
dd.month_date >= hist.assigned AND
dd.month_date < hist.next_assigned;
I don't know how much data you have so it might not matter for you. But the plan looks how I want it to:
The results match yours:
╔══════════╦══════════╦════════════╗
║ username ║ account ║ month_date ║
╠══════════╬══════════╬════════════╣
║ PETER ║ ACCOUNT1 ║ 2017-01-31 ║
║ PETER ║ ACCOUNT1 ║ 2017-02-28 ║
║ DAVE ║ ACCOUNT1 ║ 2017-03-31 ║
║ DAVE ║ ACCOUNT1 ║ 2017-04-30 ║
║ FRED ║ ACCOUNT1 ║ 2017-05-31 ║
║ FRED ║ ACCOUNT1 ║ 2017-06-30 ║
║ FRED ║ ACCOUNT1 ║ 2017-07-31 ║
║ JAMES ║ ACCOUNT1 ║ 2017-08-31 ║
║ PHIL ║ ACCOUNT2 ║ 2017-01-31 ║
║ PHIL ║ ACCOUNT2 ║ 2017-02-28 ║
║ PHIL ║ ACCOUNT2 ║ 2017-03-31 ║
║ JAMES ║ ACCOUNT2 ║ 2017-04-30 ║
║ PHIL ║ ACCOUNT2 ║ 2017-05-31 ║
╚══════════╩══════════╩════════════╝
Here I don't use calendar table but a natural numbers table nums.dbo.nums (I hope you've got it too, if not, it can be easily generated)
I have the answer slightly different from yours ('JOSH' <-> 'JAMES') because your data contains these 2 rows:
('JOSH','ACCOUNT2','2017-04-09'),
('JAMES','ACCOUNT2','2017-04-09'),
with the same account and assigned date and you did not precise which one should be taken is this situation.
declare @eom table(account varchar(10), dt date);
with acc_mm AS
(
select account, min(assigned) as min_dt, max(assigned) as max_dt
from #histories
group by account
),
acc_mm1 AS
(
select account,
dateadd(month, datediff(month, '19991231', min_dt), '19991231') as start_dt,
dateadd(month, datediff(month, '19991231', max_dt), '19991231') as end_dt
from acc_mm
)
insert into @eom (account, dt)
select account, dateadd(month, n - 1, start_dt)
from acc_mm1
join nums.dbo.nums
on n - 1 <= datediff(month, start_dt, end_dt);
select eom.dt, eom.account, a.username
from @eom eom
cross apply(select top 1 *
from #histories h
where eom.account = h.account
and h.assigned <= eom.dt
order by h.assigned desc) a
order by eom.account, eom.dt;
Triangle JOIN for the win!
SELECT account,EndOfMonth,username
FROM (
SELECT Ends.*, h.*
,ROW_NUMBER() OVER (PARTITION BY h.account,Ends.EndOfMonth ORDER BY h.assigned DESC) AS RowNumber
FROM (
SELECT [Year],[Month],MAX(DATE) AS EndOfMonth
FROM #dim
GROUP BY [Year],[Month]
) Ends
CROSS JOIN (
SELECT account, MAX(assigned) AS MaxAssigned
FROM #histories
GROUP BY account
) ac
JOIN #histories h ON h.account = ac.account
AND Year(h.assigned) = ends.[Year]
AND Month(h.assigned) <= ends.[Month] --triangle join for the win!
AND EndOfMonth < DATEADD(month, 1, Maxassigned)
) Results
WHERE RowNumber = 1
ORDER BY account,EndOfMonth;
Results are:
account EndOfMonth username
ACCOUNT1 2017-01-31 PETER
ACCOUNT1 2017-02-28 PETER
ACCOUNT1 2017-03-31 DAVE
ACCOUNT1 2017-04-30 DAVE
ACCOUNT1 2017-05-31 FRED
ACCOUNT1 2017-06-30 FRED
ACCOUNT1 2017-07-31 FRED
ACCOUNT1 2017-08-31 JAMES
ACCOUNT2 2017-01-31 PHIL
ACCOUNT2 2017-02-28 PHIL
ACCOUNT2 2017-03-31 PHIL
ACCOUNT2 2017-04-30 JAMES
ACCOUNT2 2017-05-31 PHIL
Interactive Execution Plan here.
I/O and TIME stats (truncated all the zero-values after logical reads):
(13 row(s) affected)
Table 'Worktable'. Scan count 3, logical reads 35.
Table 'Workfile'. Scan count 0, logical reads 0.
Table '#dim'. Scan count 1, logical reads 4.
Table '#histories'. Scan count 1, logical reads 1.
SQL Server Execution Times:
CPU time = 0 ms, elapsed time = 3 ms.
Query to create required 'temp tables and test the T-SQL statement I'm suggesting:
IF OBJECT_ID('tempdb..#histories') IS NOT NULL
DROP TABLE #histories
CREATE TABLE #histories (
username VARCHAR(10)
,account VARCHAR(10)
,assigned DATE
);
INSERT INTO #histories
VALUES
('PHIL','ACCOUNT1','2017-01-04'),
('PETER','ACCOUNT1','2017-01-15'),
('DAVE','ACCOUNT1','2017-03-04'),
('ANDY','ACCOUNT1','2017-05-06'),
('DAVE','ACCOUNT1','2017-05-07'),
('FRED','ACCOUNT1','2017-05-08'),
('JAMES','ACCOUNT1','2017-08-05'),
('DAVE','ACCOUNT2','2017-01-02'),
('PHIL','ACCOUNT2','2017-01-18'),
('JOSH','ACCOUNT2','2017-04-08'),
('JAMES','ACCOUNT2','2017-04-09'),
('DAVE','ACCOUNT2','2017-05-06'),
('PHIL','ACCOUNT2','2017-05-07');
DECLARE @StartDate DATE = '20170101'
,@NumberOfYears INT = 2;
-- prevent set or regional settings from interfering with
-- interpretation of dates / literals
SET DATEFIRST 7;
SET DATEFORMAT mdy;
SET LANGUAGE US_ENGLISH;
DECLARE @CutoffDate DATE = DATEADD(YEAR, @NumberOfYears, @StartDate);
-- this is just a holding table for intermediate calculations:
IF OBJECT_ID('tempdb..#dim') IS NOT NULL
DROP TABLE #dim
CREATE TABLE #dim (
[date] DATE PRIMARY KEY
,[day] AS DATEPART(DAY, [date])
,[month] AS DATEPART(MONTH, [date])
,FirstOfMonth AS CONVERT(DATE, DATEADD(MONTH, DATEDIFF(MONTH, 0, [date]), 0))
,[MonthName] AS DATENAME(MONTH, [date])
,[week] AS DATEPART(WEEK, [date])
,[ISOweek] AS DATEPART(ISO_WEEK, [date])
,[DayOfWeek] AS DATEPART(WEEKDAY, [date])
,[quarter] AS DATEPART(QUARTER, [date])
,[year] AS DATEPART(YEAR, [date])
,FirstOfYear AS CONVERT(DATE, DATEADD(YEAR, DATEDIFF(YEAR, 0, [date]), 0))
,Style112 AS CONVERT(CHAR(8), [date], 112)
,Style101 AS CONVERT(CHAR(10), [date], 101)
);
-- use the catalog views to generate as many rows as we need
INSERT #dim ([date])
SELECT d
FROM (
SELECT d = DATEADD(DAY, rn - 1, @StartDate)
FROM (
SELECT TOP (DATEDIFF(DAY, @StartDate, @CutoffDate)) rn = ROW_NUMBER() OVER (
ORDER BY s1.[object_id]
)
FROM sys.all_objects AS s1
CROSS JOIN sys.all_objects AS s2
-- on my system this would support > 5 million days
ORDER BY s1.[object_id]
) AS x
) AS y;
/* The actual SELECT statement to get the results we want! */
SET STATISTICS IO, TIME ON;
SELECT account,EndOfMonth,username
FROM (
SELECT Ends.*, h.*
,ROW_NUMBER() OVER (PARTITION BY h.account,Ends.EndOfMonth ORDER BY h.assigned DESC) AS RowNumber
FROM (
SELECT [Year],[Month],MAX(DATE) AS EndOfMonth
FROM #dim
GROUP BY [Year],[Month]
) Ends
CROSS JOIN (
SELECT account, MAX(assigned) AS MaxAssigned
FROM #histories
GROUP BY account
) ac
JOIN #histories h ON h.account = ac.account
AND Year(h.assigned) = ends.[Year]
AND Month(h.assigned) <= ends.[Month] --triangle join for the win!
AND EndOfMonth < DATEADD(month, 1, Maxassigned)
) Results
WHERE RowNumber = 1
ORDER BY account,EndOfMonth;
SET STATISTICS IO, TIME OFF;
--IF OBJECT_ID('tempdb..#histories') IS NOT NULL DROP TABLE #histories
--IF OBJECT_ID('tempdb..#dim') IS NOT NULL DROP TABLE #dim