R - Finding closest neighboring point and number of neighbors within a given radius, coordinates lat-long

Best option is to use libraries sp and rgeos, which enable you to construct spatial classes and perform geoprocessing.

library(sp)
library(rgeos)

Read the data and transform them to spatial objects:

mydata <- read.delim('d:/temp/testfile.txt', header=T)

sp.mydata <- mydata
coordinates(sp.mydata) <- ~long+lat

class(sp.mydata)
[1] "SpatialPointsDataFrame"
attr(,"package")
[1] "sp"

Now calculate pairwise distances between points

d <- gDistance(sp.mydata, byid=T)

Find second shortest distance (closest distance is of point to itself, therefore use second shortest)

min.d <- apply(d, 1, function(x) order(x, decreasing=F)[2])

Construct new data frame with desired variables

newdata <- cbind(mydata, mydata[min.d,], apply(d, 1, function(x) sort(x, decreasing=F)[2]))

colnames(newdata) <- c(colnames(mydata), 'neighbor', 'n.lat', 'n.long', 'n.area', 'n.canopy', 'n.avg.depth', 'distance')

newdata
            pond      lat      long area canopy avg.depth     neighbor    n.lat    n.long n.area n.canopy n.avg.depth
6            A10 41.95928 -72.14605 1500     66  60.61538 Borrow.Pit.3 41.95546 -72.15375      0        0    29.22222
3          AA006 41.96431 -72.12100  250      0  57.77778   Blacksmith 41.95508 -72.12380    361       77    71.31250
2     Blacksmith 41.95508 -72.12380  361     77  71.31250        AA006 41.96431 -72.12100    250        0    57.77778
5   Borrow.Pit.1 41.95601 -72.15419    0      0  41.44444 Borrow.Pit.2 41.95571 -72.15413      0        0    37.70000
4   Borrow.Pit.2 41.95571 -72.15413    0      0  37.70000 Borrow.Pit.1 41.95601 -72.15419      0        0    41.44444
5.1 Borrow.Pit.3 41.95546 -72.15375    0      0  29.22222 Borrow.Pit.2 41.95571 -72.15413      0        0    37.70000
6.1      Boulder 41.91822 -72.14978 1392     98  43.53333 Borrow.Pit.3 41.95546 -72.15375      0        0    29.22222
        distance
6   0.0085954872
3   0.0096462277
2   0.0096462277
5   0.0003059412
4   0.0003059412
5.1 0.0004548626
6.1 0.0374480316

Edit: if coordinates are in degrees and you would like to calculate distance in kilometers, use package geosphere

library(geosphere)

d <- distm(sp.mydata)

# rest is the same

This should provide better results, if the points are scattered across the globe and coordinates are in degrees


I add below an alternative solution using the newer sf package, for those interested and coming to this page now (as I did).

First, load the data and create the sf object.

# Using sf
mydata <- structure(
  list(pond = c("A10", "AA006", "Blacksmith", "Borrow.Pit.1", 
                "Borrow.Pit.2", "Borrow.Pit.3", "Boulder"), 
       lat = c(41.95928, 41.96431, 41.95508, 41.95601, 41.95571, 41.95546, 
               41.918223), 
       long = c(-72.14605, -72.121, -72.123803, -72.15419, -72.15413, 
                -72.15375, -72.14978), 
       area = c(1500L, 250L, 361L, 0L, 0L, 0L, 1392L), 
       canopy = c(66L, 0L, 77L, 0L, 0L, 0L, 98L), 
       avg.depth = c(60.61538462, 57.77777778, 71.3125, 41.44444444, 
                     37.7, 29.22222222, 43.53333333)), 
  class = "data.frame", row.names = c(NA, -7L))


library(sf)
data_sf <- st_as_sf(mydata, coords = c("long", "lat"),
                    # Change to your CRS
                    crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
st_is_longlat(data_sf)

sf::st_distance calculates the distance matrix in meters using Great Circle distance when using lat/lon data.

dist.mat <- st_distance(data_sf) # Great Circle distance since in lat/lon
# Number within 1.5km: Subtract 1 to exclude the point itself
num.1500 <- apply(dist.mat, 1, function(x) {
  sum(x < 1500) - 1
})

# Calculate nearest distance
nn.dist <- apply(dist.mat, 1, function(x) {
  return(sort(x, partial = 2)[2])
})
# Get index for nearest distance
nn.index <- apply(dist.mat, 1, function(x) { order(x, decreasing=F)[2] })

n.data <- mydata
colnames(n.data)[1] <- "neighbor"
colnames(n.data)[2:ncol(n.data)] <- 
  paste0("n.", colnames(n.data)[2:ncol(n.data)])
mydata2 <- data.frame(mydata,
                      n.data[nn.index, ],
                      n.distance = nn.dist,
                      radius1500 = num.1500)
rownames(mydata2) <- seq(nrow(mydata2))
mydata2
          pond      lat      long area canopy avg.depth     neighbor    n.lat    n.long n.area n.canopy
1          A10 41.95928 -72.14605 1500     66  60.61538 Borrow.Pit.1 41.95601 -72.15419      0        0
2        AA006 41.96431 -72.12100  250      0  57.77778   Blacksmith 41.95508 -72.12380    361       77
3   Blacksmith 41.95508 -72.12380  361     77  71.31250        AA006 41.96431 -72.12100    250        0
4 Borrow.Pit.1 41.95601 -72.15419    0      0  41.44444 Borrow.Pit.2 41.95571 -72.15413      0        0
5 Borrow.Pit.2 41.95571 -72.15413    0      0  37.70000 Borrow.Pit.1 41.95601 -72.15419      0        0
6 Borrow.Pit.3 41.95546 -72.15375    0      0  29.22222 Borrow.Pit.2 41.95571 -72.15413      0        0
7      Boulder 41.91822 -72.14978 1392     98  43.53333 Borrow.Pit.3 41.95546 -72.15375      0        0
  n.avg.depth n.distance radius1500
1    41.44444  766.38426          3
2    71.31250 1051.20527          1
3    57.77778 1051.20527          1
4    37.70000   33.69099          3
5    41.44444   33.69099          3
6    37.70000   41.99576          3
7    29.22222 4149.07406          0

For obtaining the closest neighbor after calculating distance, you can use sort() with the partial = 2 argument. Depending on the amount of data, this could be much faster than using order as in the previous solution. The package Rfast is likely even faster but I avoid including additional packages here. See this related post for a discussion and benchmarking of various solutions: https://stackoverflow.com/a/53144760/12265198


In Rfast, there is a function called "dista" and computes the Euclidean or the Manhattan distances only (at the moment). It gives the option to compute the k-smallest distances. Alternatively it can return the indexes of the observations with the smallest distances. The cosinus distance is basically almost the same as the Eucledean distance (exluding a constant, 2 I think).