Find parent rows that have identical sets of child rows
This is a generalization of the relational division problem. No idea how efficient this will be:
; WITH cte AS
( SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID <> r2.RecipeID
WHERE NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
AND NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
AND ri1.IngredientID = ri2.IngredientID
AND ri1.Quantity = ri2.Quantity
AND ri1.UOM = ri2.UOM
)
)
)
SELECT c1.*
FROM cte AS c1
JOIN cte AS c2
ON c1.RecipeID_1 = c2.RecipeID_2
AND c1.RecipeID_2 = c2.RecipeID_1
AND c1.RecipeID_1 < c1.RecipeID_2;
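For a quick sanity check (my addition, not part of the original answer), here is a minimal assumed test schema matching the column names used in these queries, with two recipes that share an identical ingredient set and a third that differs only in one Quantity:

CREATE TABLE Recipes
( RecipeID INT PRIMARY KEY, Name VARCHAR(20) ) ;
CREATE TABLE RecipeIngredients
( RecipeID INT, IngredientID INT, Quantity INT, UOM INT,
  PRIMARY KEY ( RecipeID, IngredientID ) ) ;

INSERT INTO Recipes VALUES
  (1, 'Pancakes A'), (2, 'Pancakes B'), (3, 'Crepes') ;
INSERT INTO RecipeIngredients VALUES
  (1, 10, 2, 1), (1, 20, 1, 2),   -- recipe 1
  (2, 10, 2, 1), (2, 20, 1, 2),   -- recipe 2: same ingredient set as recipe 1
  (3, 10, 2, 1), (3, 20, 3, 2) ;  -- recipe 3: ingredient 20 has a different Quantity

Against this data the query above should return exactly one pair: recipes 1 and 2 ('Pancakes A' / 'Pancakes B'). Drop these two tables again before running the larger test scripts further down.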
Another (similar) approach:
SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
)
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
) ;
And another, different one:
; WITH cte AS
( SELECT RecipeID_1 = r.RecipeID, RecipeID_2 = ri.RecipeID,
ri.IngredientID, ri.Quantity, ri.UOM
FROM Recipes AS r
CROSS JOIN RecipeIngredients AS ri
)
, cte2 AS
( SELECT RecipeID_1, RecipeID_2,
IngredientID, Quantity, UOM
FROM cte
EXCEPT
SELECT RecipeID_2, RecipeID_1,
IngredientID, Quantity, UOM
FROM cte
)
SELECT RecipeID_1 = r1.RecipeID, RecipeID_2 = r2.RecipeID
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
EXCEPT
SELECT RecipeID_1, RecipeID_2
FROM cte2
EXCEPT
SELECT RecipeID_2, RecipeID_1
FROM cte2 ;
Tested at SQL-Fiddle.

Using the CHECKSUM() and CHECKSUM_AGG() functions, tested at SQL-Fiddle-2 (take this one with caution: the checksums can collide, so it may report false positives):
ALTER TABLE RecipeIngredients
ADD ck AS CHECKSUM( IngredientID, Quantity, UOM )
PERSISTED ;
CREATE INDEX checksum_IX
ON RecipeIngredients
( RecipeID, ck ) ;
; WITH cte AS
( SELECT RecipeID,
cka = CHECKSUM_AGG(ck)
FROM RecipeIngredients AS ri
GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
JOIN cte AS c2
ON c1.cka = c2.cka
AND c1.RecipeID < c2.RecipeID ;
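To rule out those false positives, one option (my addition, not in the original answer) is to treat the checksum match purely as a pre-filter and re-verify every candidate pair with the EXCEPT test from the second approach above:

; WITH cte AS
( SELECT RecipeID,
         cka = CHECKSUM_AGG(ck)
  FROM RecipeIngredients
  GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
JOIN cte AS c2
  ON c1.cka = c2.cka              -- cheap pre-filter on the aggregated checksum
 AND c1.RecipeID < c2.RecipeID
WHERE NOT EXISTS                  -- exact verification, in both directions
( SELECT IngredientID, Quantity, UOM
  FROM RecipeIngredients AS ri1
  WHERE ri1.RecipeID = c1.RecipeID
  EXCEPT
  SELECT IngredientID, Quantity, UOM
  FROM RecipeIngredients AS ri2
  WHERE ri2.RecipeID = c2.RecipeID
)
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
  FROM RecipeIngredients AS ri2
  WHERE ri2.RecipeID = c2.RecipeID
  EXCEPT
  SELECT IngredientID, Quantity, UOM
  FROM RecipeIngredients AS ri1
  WHERE ri1.RecipeID = c1.RecipeID
) ;

Only the pairs whose checksums happen to match pay the price of the full comparison.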
For the following assumed schema and example data:
CREATE TABLE dbo.RecipeIngredients
(
RecipeId INT NOT NULL ,
IngredientID INT NOT NULL ,
Quantity INT NOT NULL ,
UOM INT NOT NULL ,
CONSTRAINT RecipeIngredients_PK
PRIMARY KEY ( RecipeId, IngredientID ) WITH (IGNORE_DUP_KEY = ON)
) ;
INSERT INTO dbo.RecipeIngredients
SELECT TOP (210000) ABS(CRYPT_GEN_RANDOM(8)/50000),
ABS(CRYPT_GEN_RANDOM(8) % 100),
ABS(CRYPT_GEN_RANDOM(8) % 10),
ABS(CRYPT_GEN_RANDOM(8) % 5)
FROM master..spt_values v1,
master..spt_values v2
SELECT DISTINCT RecipeId, 'X' AS Name
INTO Recipes
FROM dbo.RecipeIngredients
This populated 205,009 ingredient rows and 42,613 recipes; the exact counts will differ slightly from run to run because of the random element. The test data assumes relatively few duplicates (an example run produced 217 duplicate recipe groups with two or three recipes per group). The most pathological case, based on the figures in the OP, would be 48,000 exact duplicates. A script to set that up is:
DROP TABLE dbo.RecipeIngredients, Recipes
GO
CREATE TABLE Recipes(
RecipeId INT IDENTITY,
Name VARCHAR(1))
INSERT INTO Recipes
SELECT TOP 48000 'X'
FROM master..spt_values v1,
master..spt_values v2
CREATE TABLE dbo.RecipeIngredients
(
RecipeId INT NOT NULL ,
IngredientID INT NOT NULL ,
Quantity INT NOT NULL ,
UOM INT NOT NULL ,
CONSTRAINT RecipeIngredients_PK
PRIMARY KEY ( RecipeId, IngredientID )) ;
INSERT INTO dbo.RecipeIngredients
SELECT RecipeId,IngredientID,Quantity,UOM
FROM Recipes
CROSS JOIN (SELECT 1,1,1 UNION ALL
            SELECT 2,2,2 UNION ALL
            SELECT 3,3,3 UNION ALL
            SELECT 4,4,4) I(IngredientID,Quantity,UOM)
The following completed in less than a second on my machine for both cases.
CREATE TABLE #Concat
(
RecipeId INT,
concatenated VARCHAR(8000),
PRIMARY KEY (concatenated, RecipeId)
)
INSERT INTO #Concat
SELECT R.RecipeId,
ISNULL(concatenated, '')
FROM Recipes R
CROSS APPLY (SELECT CAST(IngredientID AS VARCHAR(10)) + ',' +
                    CAST(Quantity AS VARCHAR(10)) + ',' +
                    CAST(UOM AS VARCHAR(10)) + ','
FROM dbo.RecipeIngredients RI
WHERE R.RecipeId = RecipeId
ORDER BY IngredientID
FOR XML PATH('')) X (concatenated);
WITH C1
AS (SELECT DISTINCT concatenated
FROM #Concat)
SELECT STUFF(Recipes, 1, 1, '')
FROM C1
CROSS APPLY (SELECT ',' + CAST(RecipeId AS VARCHAR(10))
FROM #Concat C2
WHERE C1.concatenated = C2.concatenated
ORDER BY RecipeId
FOR XML PATH('')) R(Recipes)
WHERE Recipes LIKE '%,%,%'
DROP TABLE #Concat
One caveat

I assumed that the length of the concatenated string will not exceed 896 bytes. If it does, this will raise an error at run time rather than failing silently, and you will need to remove the primary key (and the implicitly created index) from the #Concat temp table. For reference, the maximum length of the concatenated string in my test setup was 125 characters.
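Before giving up on indexing altogether, one alternative worth considering (my addition, not part of the original answer) is to index a fixed-width hash of the concatenated string rather than the string itself, keeping the full string only in case you want to double-check for hash collisions. A sketch, reusing the same FOR XML PATH concatenation (note that HASHBYTES is limited to 8,000-byte inputs before SQL Server 2016):

CREATE TABLE #ConcatHash
(
  RecipeId     INT,
  concatenated VARCHAR(MAX),
  hsh          BINARY(32),        -- SHA2_256 of the concatenated string
  PRIMARY KEY (hsh, RecipeId)     -- fixed 36-byte key, whatever the string length
)

INSERT INTO #ConcatHash
SELECT R.RecipeId,
       ISNULL(concatenated, ''),
       HASHBYTES('SHA2_256', ISNULL(concatenated, ''))
FROM Recipes R
CROSS APPLY (SELECT CAST(IngredientID AS VARCHAR(10)) + ',' +
                    CAST(Quantity AS VARCHAR(10)) + ',' +
                    CAST(UOM AS VARCHAR(10)) + ','
             FROM dbo.RecipeIngredients RI
             WHERE R.RecipeId = RecipeId
             ORDER BY IngredientID
             FOR XML PATH('')) X (concatenated);

The final consolidation query then groups on hsh instead of concatenated (and could compare concatenated as well if you are worried about SHA-256 collisions).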
If the concatenated string is too long to index, the performance of the final XML PATH query that consolidates the identical recipes could well be poor. Installing and using a custom CLR string aggregate would be one solution, as it could perform the concatenation in a single pass over the data rather than via a non-indexed self join.
SELECT YourClrAggregate(RecipeId)
FROM #Concat
GROUP BY concatenated
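If deploying a CLR assembly is not an option, the built-in STRING_AGG aggregate can fill the same role on SQL Server 2017 and later (an assumption about the version available; it is not used anywhere above):

SELECT STRING_AGG(CAST(RecipeId AS VARCHAR(10)), ',')
         WITHIN GROUP (ORDER BY RecipeId) AS Recipes
FROM #Concat
GROUP BY concatenated
HAVING COUNT(*) > 1 ;   -- keep only groups that actually contain duplicates

Like the CLR aggregate, this builds each group's list in a single aggregation pass, although grouping on the unindexed concatenated column still means sorting or hashing the long strings.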
I also tried
WITH Agg
AS (SELECT RecipeId,
MAX(IngredientID) AS MaxIngredientID,
MIN(IngredientID) AS MinIngredientID,
SUM(IngredientID) AS SumIngredientID,
COUNT(IngredientID) AS CountIngredientID,
CHECKSUM_AGG(IngredientID) AS ChkIngredientID,
MAX(Quantity) AS MaxQuantity,
MIN(Quantity) AS MinQuantity,
SUM(Quantity) AS SumQuantity,
COUNT(Quantity) AS CountQuantity,
CHECKSUM_AGG(Quantity) AS ChkQuantity,
MAX(UOM) AS MaxUOM,
MIN(UOM) AS MinUOM,
SUM(UOM) AS SumUOM,
COUNT(UOM) AS CountUOM,
CHECKSUM_AGG(UOM) AS ChkUOM
FROM dbo.RecipeIngredients
GROUP BY RecipeId)
SELECT A1.RecipeId AS RecipeId1,
A2.RecipeId AS RecipeId2
FROM Agg A1
JOIN Agg A2
ON A1.MaxIngredientID = A2.MaxIngredientID
AND A1.MinIngredientID = A2.MinIngredientID
AND A1.SumIngredientID = A2.SumIngredientID
AND A1.CountIngredientID = A2.CountIngredientID
AND A1.ChkIngredientID = A2.ChkIngredientID
AND A1.MaxQuantity = A2.MaxQuantity
AND A1.MinQuantity = A2.MinQuantity
AND A1.SumQuantity = A2.SumQuantity
AND A1.CountQuantity = A2.CountQuantity
AND A1.ChkQuantity = A2.ChkQuantity
AND A1.MaxUOM = A2.MaxUOM
AND A1.MinUOM = A2.MinUOM
AND A1.SumUOM = A2.SumUOM
AND A1.CountUOM = A2.CountUOM
AND A1.ChkUOM = A2.ChkUOM
AND A1.RecipeId <> A2.RecipeId
WHERE NOT EXISTS (SELECT *
FROM (SELECT *
FROM RecipeIngredients
WHERE RecipeId = A1.RecipeId) R1
FULL OUTER JOIN (SELECT *
FROM RecipeIngredients
WHERE RecipeId = A2.RecipeId) R2
ON R1.IngredientID = R2.IngredientID
AND R1.Quantity = R2.Quantity
AND R1.UOM = R2.UOM
WHERE R1.RecipeId IS NULL
OR R2.RecipeId IS NULL)
This works acceptably when there are relatively few duplicates (less than a second for the first example data set), but it performs badly in the pathological case, as the initial aggregation returns exactly the same results for every RecipeID and so does nothing to cut down the number of comparisons.