Grouping connected linestrings in PostGIS?
So, by example. Here's a simple table with two connected groups of edges:
drop table lines;
create table lines ( id integer primary key, geom geometry(linestring) );
insert into lines (id, geom) values ( 1, 'LINESTRING(0 0, 0 1)');
insert into lines (id, geom) values ( 2, 'LINESTRING(0 1, 1 1)');
insert into lines (id, geom) values ( 3, 'LINESTRING(1 1, 1 2)');
insert into lines (id, geom) values ( 4, 'LINESTRING(1 2, 2 2)');
insert into lines (id, geom) values ( 11, 'LINESTRING(10 10, 10 11)');
insert into lines (id, geom) values ( 12, 'LINESTRING(10 11, 11 11)');
insert into lines (id, geom) values ( 13, 'LINESTRING(11 11, 11 12)');
insert into lines (id, geom) values ( 14, 'LINESTRING(11 12, 12 12)');
create index lines_gix on lines using gist(geom);
Now, here's a recursive function that, given the id of an edge, accumulates all the edges that touch:
CREATE OR REPLACE FUNCTION find_connected(integer) returns integer[] AS
$$
WITH RECURSIVE lines_r AS (
SELECT ARRAY[id] AS idlist, geom, id
FROM lines
WHERE id = $1
UNION ALL
SELECT array_append(lines_r.idlist, lines.id) AS idlist,
lines.geom AS geom,
lines.id AS id
FROM lines, lines_r
WHERE ST_Touches(lines.geom, lines_r.geom)
AND NOT lines_r.idlist @> ARRAY[lines.id]
)
SELECT
array_agg(id) AS idlist
FROM lines_r
$$
LANGUAGE 'sql';
That just leaves us needing to find, after each group is accumulated, the id of an edge that is not already part of a group. Which, tragically, requires a second recursive query.
WITH RECURSIVE groups_r AS (
(SELECT find_connected(id) AS idlist,
find_connected(id) AS grouplist,
id FROM lines WHERE id = 1)
UNION ALL
(SELECT array_cat(groups_r.idlist,find_connected(lines.id)) AS idlist,
find_connected(lines.id) AS grouplist,
lines.id
FROM lines, groups_r
WHERE NOT idlist @> ARRAY[lines.id]
LIMIT 1)
)
SELECT id, grouplist
FROM groups_r;
Which taken together return a nice set with the seed id and each group it accumulated. I leave it as an exercise to the reader to turn the arrays of id back into a query to create geometry for mapping.
id | grouplist
----+---------------
1 | {1,2,3,4}
11 | {11,12,13,14}
(2 rows)
Here's an approach that uses a temporary table to incrementally aggregate clusters together. I don't really care for the temporary table approach, but this seems to perform quite well as the number of lines increases (I have 1.2 M lines in my input).
DO
$$
DECLARE
this_id bigint;
this_geom geometry;
cluster_id_match integer;
id_a bigint;
id_b bigint;
BEGIN
DROP TABLE IF EXISTS clusters;
CREATE TABLE clusters (cluster_id serial, ids bigint[], geom geometry);
CREATE INDEX ON clusters USING GIST(geom);
-- Iterate through linestrings, assigning each to a cluster (if there is an intersection)
-- or creating a new cluster (if there is not)
FOR this_id, this_geom IN SELECT id, geom FROM lines LOOP
-- Look for an intersecting cluster. (There may be more than one.)
SELECT cluster_id FROM clusters WHERE ST_Intersects(this_geom, clusters.geom)
LIMIT 1 INTO cluster_id_match;
IF cluster_id_match IS NULL THEN
-- Create a new cluster
INSERT INTO clusters (ids, geom) VALUES (ARRAY[this_id], this_geom);
ELSE
-- Append line to existing cluster
UPDATE clusters SET geom = ST_Union(this_geom, geom),
ids = array_prepend(this_id, ids)
WHERE clusters.cluster_id = cluster_id_match;
END IF;
END LOOP;
-- Iterate through the clusters, combining clusters that intersect each other
LOOP
SELECT a.cluster_id, b.cluster_id FROM clusters a, clusters b
WHERE ST_Intersects(a.geom, b.geom)
AND a.cluster_id < b.cluster_id
INTO id_a, id_b;
EXIT WHEN id_a IS NULL;
-- Merge cluster A into cluster B
UPDATE clusters a SET geom = ST_Union(a.geom, b.geom), ids = array_cat(a.ids, b.ids)
FROM clusters b
WHERE a.cluster_id = id_a AND b.cluster_id = id_b;
-- Remove cluster B
DELETE FROM clusters WHERE cluster_id = id_b;
END LOOP;
END;
$$ language plpgsql;