How to find all connected subgraphs of an undirected graph
Here is a variant that doesn't use cursor, but uses a single recursive query.
Essentially, it treats the data as edges in a graph and traverses recursively all edges of the graph, stopping when the loop is detected. Then it puts all found loops in groups and gives each group a number.
See the detailed explanations of how it works below. I recommend you to run the query CTE-by-CTE and examine each intermediate result to understand what it does.
Sample 1
DECLARE @T TABLE (ID int, Ident1 char(1), Ident2 char(1));
INSERT INTO @T (ID, Ident1, Ident2) VALUES
(1, 'a', 'a'),
(2, 'b', 'b'),
(3, 'c', 'a'),
(4, 'c', 'b'),
(5, 'c', 'c');
Sample 2
I added one more row with z
value to have multiple rows with unpaired values.
DECLARE @T TABLE (ID int, Ident1 char(1), Ident2 char(1));
INSERT INTO @T (ID, Ident1, Ident2) VALUES
(1, 'a', 'a'),
(1, 'a', 'c'),
(2, 'b', 'f'),
(3, 'a', 'g'),
(4, 'c', 'h'),
(5, 'b', 'j'),
(6, 'd', 'f'),
(7, 'e', 'k'),
(8, 'i', NULL),
(88, 'z', 'z'),
(9, 'l', 'h');
Sample 3
DECLARE @T TABLE (ID int, Ident1 char(1), Ident2 char(1));
INSERT INTO @T (ID, Ident1, Ident2) VALUES
(1, 'a', 'f'),
(2, 'a', 'g'),
(3, 'a', NULL),
(4, 'b', 'c'),
(5, 'b', 'a'),
(6, 'b', 'h'),
(7, 'b', 'j'),
(8, 'b', NULL),
(9, 'b', NULL),
(10, 'b', 'g'),
(11, 'c', 'k'),
(12, 'c', 'b'),
(13, 'd', 'l'),
(14, 'd', 'f'),
(15, 'd', 'g'),
(16, 'd', 'm'),
(17, 'd', 'a'),
(18, 'd', NULL),
(19, 'd', 'a'),
(20, 'e', 'c'),
(21, 'e', 'b'),
(22, 'e', NULL);
Query
WITH
CTE_Idents
AS
(
SELECT Ident1 AS Ident
FROM @T
UNION
SELECT Ident2 AS Ident
FROM @T
)
,CTE_Pairs
AS
(
SELECT Ident1, Ident2
FROM @T
WHERE Ident1 <> Ident2
UNION
SELECT Ident2 AS Ident1, Ident1 AS Ident2
FROM @T
WHERE Ident1 <> Ident2
)
,CTE_Recursive
AS
(
SELECT
CAST(CTE_Idents.Ident AS varchar(8000)) AS AnchorIdent
, Ident1
, Ident2
, CAST(',' + Ident1 + ',' + Ident2 + ',' AS varchar(8000)) AS IdentPath
, 1 AS Lvl
FROM
CTE_Pairs
INNER JOIN CTE_Idents ON CTE_Idents.Ident = CTE_Pairs.Ident1
UNION ALL
SELECT
CTE_Recursive.AnchorIdent
, CTE_Pairs.Ident1
, CTE_Pairs.Ident2
, CAST(CTE_Recursive.IdentPath + CTE_Pairs.Ident2 + ',' AS varchar(8000)) AS IdentPath
, CTE_Recursive.Lvl + 1 AS Lvl
FROM
CTE_Pairs
INNER JOIN CTE_Recursive ON CTE_Recursive.Ident2 = CTE_Pairs.Ident1
WHERE
CTE_Recursive.IdentPath NOT LIKE CAST('%,' + CTE_Pairs.Ident2 + ',%' AS varchar(8000))
)
,CTE_RecursionResult
AS
(
SELECT AnchorIdent, Ident1, Ident2
FROM CTE_Recursive
)
,CTE_CleanResult
AS
(
SELECT AnchorIdent, Ident1 AS Ident
FROM CTE_RecursionResult
UNION
SELECT AnchorIdent, Ident2 AS Ident
FROM CTE_RecursionResult
)
SELECT
CTE_Idents.Ident
,CASE WHEN CA_Data.XML_Value IS NULL
THEN CTE_Idents.Ident ELSE CA_Data.XML_Value END AS GroupMembers
,DENSE_RANK() OVER(ORDER BY
CASE WHEN CA_Data.XML_Value IS NULL
THEN CTE_Idents.Ident ELSE CA_Data.XML_Value END
) AS GroupID
FROM
CTE_Idents
CROSS APPLY
(
SELECT CTE_CleanResult.Ident+','
FROM CTE_CleanResult
WHERE CTE_CleanResult.AnchorIdent = CTE_Idents.Ident
ORDER BY CTE_CleanResult.Ident FOR XML PATH(''), TYPE
) AS CA_XML(XML_Value)
CROSS APPLY
(
SELECT CA_XML.XML_Value.value('.', 'NVARCHAR(MAX)')
) AS CA_Data(XML_Value)
WHERE
CTE_Idents.Ident IS NOT NULL
ORDER BY Ident;
Result 1
+-------+--------------+---------+
| Ident | GroupMembers | GroupID |
+-------+--------------+---------+
| a | a,b,c, | 1 |
| b | a,b,c, | 1 |
| c | a,b,c, | 1 |
+-------+--------------+---------+
Result 2
+-------+--------------+---------+
| Ident | GroupMembers | GroupID |
+-------+--------------+---------+
| a | a,c,g,h,l, | 1 |
| b | b,d,f,j, | 2 |
| c | a,c,g,h,l, | 1 |
| d | b,d,f,j, | 2 |
| e | e,k, | 3 |
| f | b,d,f,j, | 2 |
| g | a,c,g,h,l, | 1 |
| h | a,c,g,h,l, | 1 |
| i | i | 4 |
| j | b,d,f,j, | 2 |
| k | e,k, | 3 |
| l | a,c,g,h,l, | 1 |
| z | z | 5 |
+-------+--------------+---------+
Result 3
+-------+--------------------------+---------+
| Ident | GroupMembers | GroupID |
+-------+--------------------------+---------+
| a | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| b | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| c | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| d | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| e | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| f | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| g | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| h | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| j | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| k | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| l | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
| m | a,b,c,d,e,f,g,h,j,k,l,m, | 1 |
+-------+--------------------------+---------+
How it works
I'll use the second set of sample data for this explanation.
CTE_Idents
CTE_Idents
gives the list of all Identifiers that appear in both Ident1
and Ident2
columns.
Since they can appear in any order we UNION
both columns together. UNION
also removes any duplicates.
+-------+
| Ident |
+-------+
| NULL |
| a |
| b |
| c |
| d |
| e |
| f |
| g |
| h |
| i |
| j |
| k |
| l |
| z |
+-------+
CTE_Pairs
CTE_Pairs
gives the list of all edges of the graph in both directions. Again, UNION
is used to remove any duplicates.
+--------+--------+
| Ident1 | Ident2 |
+--------+--------+
| a | c |
| a | g |
| b | f |
| b | j |
| c | a |
| c | h |
| d | f |
| e | k |
| f | b |
| f | d |
| g | a |
| h | c |
| h | l |
| j | b |
| k | e |
| l | h |
+--------+--------+
CTE_Recursive
CTE_Recursive
is the main part of the query that recursively traverses the graph starting from each unique Identifier.
These starting rows are produced by the first part of UNION ALL
.
The second part of UNION ALL
recursively joins to itself linking Ident2
to Ident1
.
Since we pre-made CTE_Pairs
with all edges written in both directions, we can always link only Ident2
to Ident1
and we'll get all paths in the graph.
At the same time the query builds IdentPath
- a string of comma-delimited Identifiers that have been traversed so far.
It is used in the WHERE
filter:
CTE_Recursive.IdentPath NOT LIKE CAST('%,' + CTE_Pairs.Ident2 + ',%' AS varchar(8000))
As soon as we come across the Identifier that had been included in the Path before, the recursion stops as the list of connected nodes is exhausted.
AnchorIdent
is the starting Identifier for the recursion, it will be used later to group results.
Lvl
is not really used, I included it for better understanding of what is going on.
+-------------+--------+--------+-------------+-----+
| AnchorIdent | Ident1 | Ident2 | IdentPath | Lvl |
+-------------+--------+--------+-------------+-----+
| a | a | c | ,a,c, | 1 |
| a | a | g | ,a,g, | 1 |
| b | b | f | ,b,f, | 1 |
| b | b | j | ,b,j, | 1 |
| c | c | a | ,c,a, | 1 |
| c | c | h | ,c,h, | 1 |
| d | d | f | ,d,f, | 1 |
| e | e | k | ,e,k, | 1 |
| f | f | b | ,f,b, | 1 |
| f | f | d | ,f,d, | 1 |
| g | g | a | ,g,a, | 1 |
| h | h | c | ,h,c, | 1 |
| h | h | l | ,h,l, | 1 |
| j | j | b | ,j,b, | 1 |
| k | k | e | ,k,e, | 1 |
| l | l | h | ,l,h, | 1 |
| l | h | c | ,l,h,c, | 2 |
| l | c | a | ,l,h,c,a, | 3 |
| l | a | g | ,l,h,c,a,g, | 4 |
| j | b | f | ,j,b,f, | 2 |
| j | f | d | ,j,b,f,d, | 3 |
| h | c | a | ,h,c,a, | 2 |
| h | a | g | ,h,c,a,g, | 3 |
| g | a | c | ,g,a,c, | 2 |
| g | c | h | ,g,a,c,h, | 3 |
| g | h | l | ,g,a,c,h,l, | 4 |
| f | b | j | ,f,b,j, | 2 |
| d | f | b | ,d,f,b, | 2 |
| d | b | j | ,d,f,b,j, | 3 |
| c | h | l | ,c,h,l, | 2 |
| c | a | g | ,c,a,g, | 2 |
| b | f | d | ,b,f,d, | 2 |
| a | c | h | ,a,c,h, | 2 |
| a | h | l | ,a,c,h,l, | 3 |
+-------------+--------+--------+-------------+-----+
CTE_CleanResult
CTE_CleanResult
leaves only relevant parts from CTE_Recursive
and again merges both Ident1
and Ident2
using UNION
.
+-------------+-------+
| AnchorIdent | Ident |
+-------------+-------+
| a | a |
| a | c |
| a | g |
| a | h |
| a | l |
| b | b |
| b | d |
| b | f |
| b | j |
| c | a |
| c | c |
| c | g |
| c | h |
| c | l |
| d | b |
| d | d |
| d | f |
| d | j |
| e | e |
| e | k |
| f | b |
| f | d |
| f | f |
| f | j |
| g | a |
| g | c |
| g | g |
| g | h |
| g | l |
| h | a |
| h | c |
| h | g |
| h | h |
| h | l |
| j | b |
| j | d |
| j | f |
| j | j |
| k | e |
| k | k |
| l | a |
| l | c |
| l | g |
| l | h |
| l | l |
+-------------+-------+
Final SELECT
Now we need to build a string of comma-separated Ident
values for each AnchorIdent
.
CROSS APPLY
with FOR XML
does it.
DENSE_RANK()
calculates the GroupID
numbers for each AnchorIdent
.
My suggestion is to use stored procedure with cursor. It is easy to implement and relatively fast. Only two steps:
- First, create sp_GetIdentByGroup
- Second, INSERT related identifiers in a temporary table #PairIds and call sp [dbo].[sp_GetIdentByGroup] and you will get Identifiers with the same GroupID as related items.
Query:
CREATE TABLE #PairIds
(
Ident1 VARCHAR(10),
Ident2 VARCHAR(10)
)
INSERT INTO #PairIds
VALUES ('a', 'c'),
('b', 'f'),
('a', 'g'),
('c', 'h'),
('b', 'j'),
('d', 'f'),
('e', 'k'),
('l', 'h')
exec [dbo].[sp_GetIdentByGroup]
Result:
Ident | GroupID
---------------------------------------------------
a | 1 |
b | 2 |
c | 1 |
d | 2 |
e | 3 |
f | 2 |
g | 1 |
h | 1 |
j | 2 |
k | 3 |
l | 1 |
Code for creating stored procedure:
CREATE PROCEDURE [dbo].[sp_GetIdentByGroup]
AS
BEGIN
DECLARE @message VARCHAR(70);
DECLARE @IdentInput1 varchar(20)
DECLARE @IdentInput2 varchar(20)
DECLARE @Counter INT
DECLARE @Group1 INT
DECLARE @Group2 INT
DECLARE @Ident varchar(20)
DECLARE @IdentCheck1 varchar(20)
DECLARE @IdentCheck2 varchar(20)
SET @Counter = 1
DECLARE @IdentByGroupCursor TABLE (
Ident varchar(20) UNIQUE CLUSTERED,
GroupID INT
);
-- Use a cursor to select your data, which enables SQL Server to extract
-- the data from your local table to the variables.
declare ins_cursor cursor for
select Ident1, Ident2 from #PairIds
open ins_cursor
fetch next from ins_cursor into @IdentInput1, @IdentInput2 -- At this point, the data from the first row
-- is in your local variables.
-- Move through the table with the @@FETCH_STATUS=0
WHILE @@FETCH_STATUS=0
BEGIN
SET @Group1 = null
SET @Group2 = null
SELECT TOP 1 @Group1 = GroupID, @IdentCheck1 = Ident
FROM @IdentByGroupCursor
WHERE Ident in (@IdentInput1)
SELECT TOP 1 @Group2 = GroupID, @IdentCheck2 = Ident
FROM @IdentByGroupCursor
WHERE Ident in (@IdentInput2)
IF (@Group1 IS NOT NULL AND @Group2 IS NOT NULL)
BEGIN
IF @Group1 > @Group2
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group2
WHERE
GroupID = @Group1
END
IF @Group2 > @Group1
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group1
WHERE
GroupID = @Group2
END
END
ELSE IF @Group1 IS NOT NULL
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group1
WHERE
Ident IN (@IdentInput1)
END
ELSE IF @Group2 IS NOT NULL
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group2
WHERE
Ident IN (@IdentInput2)
END
IF (@Group1 IS NOT NULL AND @Group2 IS NOT NULL)
BEGIN
IF @Group1 > @Group2
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group2
WHERE
GroupID = @Group1
END
IF @Group2 > @Group1
BEGIN
UPDATE @IdentByGroupCursor
SET GroupID = @Group1
WHERE
GroupID = @Group2
END
END
IF @Group1 IS NULL
BEGIN
INSERT INTO @IdentByGroupCursor (Ident, GroupID)
VALUES (@IdentInput1, ISNULL(@Group2, @Counter))
END
IF @Group2 IS NULL
BEGIN
INSERT INTO @IdentByGroupCursor (Ident, GroupID)
VALUES (@IdentInput2, ISNULL(@Group1, @COunter))
END
IF (@Group1 IS NULL OR @Group2 IS NULL)
BEGIN
SET @COunter = @COunter +1
END
-- Once the execution has taken place, you fetch the next row of data from your local table.
fetch next from ins_cursor into @IdentInput1, @IdentInput2
End
-- When all the rows have inserted you must close and deallocate the cursor.
-- Failure to do this will not let you re-use the cursor.
close ins_cursor
deallocate ins_cursor
SELECT Ident ,DENSE_RANK() OVER( ORDER BY GroupID ASC) AS GroupID
FROM @IdentByGroupCursor
ORDER BY Ident
END
GO
Sp_GetIdentByGroup has an index for speed and with the use of a cursor, it prepares desired result set. The stored procedure expects #PairIds table to exist.
More information on SQL How to group identifiers that are related with each other in specific groups.
This script produces the outputs for test sets 1, 2 and 3 as required. Notes on the algorithm as comments in the script.
Be aware:
- This algorithm destroys the input set. In the script the input set is
#tree
. So using this script requires inserting the source data into#tree
- This algorithm does not work for
NULL
values for nodes. ReplaceNULL
values withCHAR(0)
when inserting into#tree
usingISNULL(source_col,CHAR(0))
to circumvent this shortcoming. When selecting from the final result, replaceCHAR(0)
withNULL
usingNULLIF(node,CHAR(0))
.
Note that the answer using recursive CTEs is more elegant in that it is a single SQL statement, but for large input sets using recursive CTEs may give abysmal execution time (see this comment on that answer). The solution as described below, while more convoluted, should run much faster for large input sets.
SET NOCOUNT ON;
CREATE TABLE #tree(node_l CHAR(1),node_r CHAR(1));
CREATE NONCLUSTERED INDEX NIX_tree_node_l ON #tree(node_l)INCLUDE(node_r); -- covering indices to speed up lookup
CREATE NONCLUSTERED INDEX NIX_tree_node_r ON #tree(node_r)INCLUDE(node_l);
INSERT INTO #tree(node_l,node_r) VALUES
('a','c'),('b','f'),('a','g'),('c','h'),('b','j'),('d','f'),('e','k'),('i','i'),('l','h'); -- test set 1
--('a','f'),('a','g'),(CHAR(0),'a'),('b','c'),('b','a'),('b','h'),('b','j'),('b',CHAR(0)),('b',CHAR(0)),('b','g'),('c','k'),('c','b'),('d','l'),('d','f'),('d','g'),('d','m'),('d','a'),('d',CHAR(0)),('d','a'),('e','c'),('e','b'),('e',CHAR(0)); -- test set 2
--('a','a'),('b','b'),('c','a'),('c','b'),('c','c'); -- test set 3
CREATE TABLE #sets(node CHAR(1) PRIMARY KEY,group_id INT); -- nodes with group id assigned
CREATE TABLE #visitor_queue(node CHAR(1)); -- contains nodes to visit
CREATE TABLE #visited_nodes(node CHAR(1) PRIMARY KEY CLUSTERED WITH(IGNORE_DUP_KEY=ON)); -- nodes visited for nodes on the queue; ignore duplicate nodes when inserted
CREATE TABLE #visitor_ctx(node_l CHAR(1),node_r CHAR(1)); -- context table, contains deleted nodes as they are visited from #tree
DECLARE @last_created_group_id INT=0;
-- Notes:
-- 1. This algorithm is destructive in its input set, ie #tree will be empty at the end of this procedure
-- 2. This algorithm does not accept NULL values. Populate #tree with CHAR(0) for NULL values (using ISNULL(source_col,CHAR(0)), or COALESCE(source_col,CHAR(0)))
-- 3. When selecting from #sets, to regain the original NULL values use NULLIF(node,CHAR(0))
WHILE EXISTS(SELECT*FROM #tree)
BEGIN
TRUNCATE TABLE #visited_nodes;
TRUNCATE TABLE #visitor_ctx;
-- push first nodes onto the queue (via #visitor_ctx -> #visitor_queue)
DELETE TOP (1) t
OUTPUT deleted.node_l,deleted.node_r INTO #visitor_ctx(node_l,node_r)
FROM #tree AS t;
INSERT INTO #visitor_queue(node) SELECT node_l FROM #visitor_ctx UNION SELECT node_r FROM #visitor_ctx; -- UNION to filter when node_l equals node_r
INSERT INTO #visited_nodes(node) SELECT node FROM #visitor_queue; -- keep track of nodes visited
-- work down the queue by visiting linked nodes in #tree; nodes are deleted as they are visited
WHILE EXISTS(SELECT*FROM #visitor_queue)
BEGIN
TRUNCATE TABLE #visitor_ctx;
-- pop_front for node on the stack (via #visitor_ctx -> @node)
DELETE TOP (1) s
OUTPUT deleted.node INTO #visitor_ctx(node_l)
FROM #visitor_queue AS s;
DECLARE @node CHAR(1)=(SELECT node_l FROM #visitor_ctx);
TRUNCATE TABLE #visitor_ctx;
-- visit nodes in #tree where node_l or node_r equal target @node;
-- delete visited nodes from #tree, output to #visitor_ctx
DELETE t
OUTPUT deleted.node_l,deleted.node_r INTO #visitor_ctx(node_l,node_r)
FROM #tree AS t
WHERE t.node_l=@node OR t.node_r=@node;
-- insert visited nodes in the queue that haven't been visited before
INSERT INTO #visitor_queue(node)
(SELECT node_l FROM #visitor_ctx UNION SELECT node_r FROM #visitor_ctx) EXCEPT (SELECT node FROM #visited_nodes);
-- keep track of visited nodes (duplicates are ignored by the IGNORE_DUP_KEY option for the PK)
INSERT INTO #visited_nodes(node)
SELECT node_l FROM #visitor_ctx UNION SELECT node_r FROM #visitor_ctx;
END
SET @last_created_group_id+=1; -- create new group id
-- insert group into #sets
INSERT INTO #sets(group_id,node)
SELECT group_id=@last_created_group_id,node
FROM #visited_nodes;
END
SELECT node=NULLIF(node,CHAR(0)),group_id FROM #sets ORDER BY node; -- nodes with their assigned group id
SELECT g.group_id,m.members -- groups with their members
FROM
(SELECT DISTINCT group_id FROM #sets) AS g
CROSS APPLY (
SELECT members=STUFF((
SELECT ','+ISNULL(CAST(NULLIF(si.node,CHAR(0)) AS VARCHAR(4)),'NULL')
FROM #sets AS si
WHERE si.group_id=g.group_id
FOR XML PATH('')
),1,1,'')
) AS m
ORDER BY g.group_id;
DROP TABLE #visitor_queue;
DROP TABLE #visited_nodes;
DROP TABLE #visitor_ctx;
DROP TABLE #sets;
DROP TABLE #tree;
Output for set 1:
+------+----------+
| node | group_id |
+------+----------+
| a | 1 |
| b | 2 |
| c | 1 |
| d | 2 |
| e | 4 |
| f | 2 |
| g | 1 |
| h | 1 |
| i | 3 |
| j | 2 |
| k | 4 |
| l | 1 |
+------+----------+
Output for set 2:
+------+----------+
| node | group_id |
+------+----------+
| NULL | 1 |
| a | 1 |
| b | 1 |
| c | 1 |
| d | 1 |
| e | 1 |
| f | 1 |
| g | 1 |
| h | 1 |
| j | 1 |
| k | 1 |
| l | 1 |
| m | 1 |
+------+----------+
Output for set 3:
+------+----------+
| node | group_id |
+------+----------+
| a | 1 |
| b | 1 |
| c | 1 |
+------+----------+