GeoPandas: Find nearest point in other dataframe
If you have large dataframes, I've found that scipy
's cKDTree spatial index .query
method returns very fast results for nearest neighbor searches. As it uses a spatial index it's orders of magnitude faster than looping though the dataframe and then finding the minimum of all distances. It is also faster than using shapely's nearest_points
with RTree (the spatial index method available via geopandas) because cKDTree allows you to vectorize your search whereas the other method does not.
Here is a helper function that will return the distance and 'Name' of the nearest neighbor in gpd2
from each point in gpd1
. It assumes both gdfs have a geometry
column (of points).
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)], ['Smith', 1, Point(2, 2)],
['Soap', 1, Point(0, 2)]],
columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)], ['Shops', Point(2.5, 2)],
['Home', Point(1, 1.1)]],
columns=['Place', 'geometry'])
def ckdnearest(gdA, gdB):
nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
btree = cKDTree(nB)
dist, idx = btree.query(nA, k=1)
gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
gdf = pd.concat(
[
gdA.reset_index(drop=True),
gdB_nearest,
pd.Series(dist, name='dist')
],
axis=1)
return gdf
ckdnearest(gpd1, gpd2)
And if you want to find the closest point to a LineString, here is a full working example:
import itertools
from operator import itemgetter
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
['Smith', 1, Point(2, 2)],
['Soap', 1, Point(0, 2)]],
columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
['Home', LineString([Point(101, 0), Point(102, 1)])]],
columns=['Place', 'geometry'])
def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
A = np.concatenate(
[np.array(geom.coords) for geom in gdfA.geometry.to_list()])
B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
B_ix = tuple(itertools.chain.from_iterable(
[itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
B = np.concatenate(B)
ckd_tree = cKDTree(B)
dist, idx = ckd_tree.query(A, k=1)
idx = itemgetter(*idx)(B_ix)
gdf = pd.concat(
[gdfA, gdfB.loc[idx, gdfB_cols].reset_index(drop=True),
pd.Series(dist, name='dist')], axis=1)
return gdf
c = ckdnearest(gpd1, gpd2)
You can directly use the Shapely function Nearest points (the geometries of the GeoSeries are Shapely geometries):
from shapely.ops import nearest_points
# unary union of the gpd2 geomtries
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
# find the nearest point and return the corresponding Place value
nearest = gpd2.geometry == nearest_points(point, pts)[1]
return gpd2[nearest].Place.get_values()[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
Name ID geometry Nearest
0 John 1 POINT (1 1) Home
1 Smith 1 POINT (2 2) Shops
2 Soap 1 POINT (0 2) Work
Explication
for i, row in gpd1.iterrows():
print nearest_points(row.geometry, pts3)[0], nearest_points(row.geometry, pts3)[1]
POINT (1 1) POINT (1 1.1)
POINT (2 2) POINT (2.5 2)
POINT (0 2) POINT (0 1.1)
Figured it out:
def min_dist(point, gpd2):
gpd2['Dist'] = gpd2.apply(lambda row: point.distance(row.geometry),axis=1)
geoseries = gpd2.iloc[gpd2['Dist'].argmin()]
return geoseries
Of course some criticism is welcome. I'm not a fan of recalculating gpd2['Dist'] for every row of gpd1...