How to join a table to a shapefile with non-matching IDs and names (similar strings)?
I would go for stringdist
package which has implemented many algorithms to calculate the partial similarity (distance) of strings including Jaro-winkler
.
Here is a fast solution for you:
#df to be joined
id <- c(100:111)
name <- c("Aragatsotn", "Ararat", "Armavir", "Gaghark'unik'", "Kotayk", "Lorri",
"Shirak", "Syunik'", "Tavush", "Vayots' Dzor", "Yerevan City","Aragatsotn")
value <- runif(12, 0.0, 1.0)
df <- data.frame(id, name, value)
#create shape data df
shpNames <- c("Aragatsotn",
"Ararat",
"Armavir",
"Erevan",
"Gegharkunik",
"Kotayk",
"Lori",
"Shirak",
"Syunik",
"Tavush",
"VayotsDzor")
arm.data <- data.frame(ID_1=1:11,NAME_1=shpNames)
#simple match (only testing)
match(df$name,arm.data$NAME_1)
#simple merge (testing)
merge(arm.data,df,by.x="NAME_1",by.y="name",all.x=TRUE)
#partial match using stringdist package
library("stringdist")
am<-amatch(arm.data$NAME_1,df$name,maxDist = 3)
b<-data.frame()
for (i in 1:dim(arm.data)[1]) {
b<-rbind(b,data.frame(arm.data[i,],df[am[i],]))
}
b
it outputs:
ID_1 NAME_1 id name value
1 1 Aragatsotn 100 Aragatsotn 0.8510984
2 2 Ararat 101 Ararat 0.3004329
3 3 Armavir 102 Armavir 0.9258740
4 4 Erevan NA <NA> NA
5 5 Gegharkunik 103 Gaghark'unik' 0.9935353
6 6 Kotayk 104 Kotayk 0.6025050
7 7 Lori 105 Lorri 0.9577662
8 8 Shirak 106 Shirak 0.6346550
9 9 Syunik 107 Syunik' 0.6531175
10 10 Tavush 108 Tavush 0.9726032
11 11 VayotsDzor 109 Vayots' Dzor 0.3457315
You can play with maxDist parameter of amatch method. Although 3 works best with your sample data!
I want to add some details to Farid Cher's answer as this is a very common problem. Using amatch
can do wonders, but with these Spatial
objects you should not use base::merge
and not access the @data
slot. That would inevitably leads to a terrible mess (base::merge
changes the order of records, and they would no longer match geometries).
Instead, use the sp::merge
method, by using the SpatialPolygonsDataFrame
as first argument in merge
. Also note the potential problem of having duplicated records. And I added data so that the example is self-contained and reproducible.
library(raster)
#example data.frame
name <- c("Aragatsotn", "Ararat", "Armavir", "Gaghark'unik'", "Kotayk", "Lorri", "Shirak", "Syunik'", "Tavush", "Vayots' Dzor", "Yerevan City","Aragatsotn")
value <- runif(12, 0.0, 1.0)
df <- data.frame(name, value)
# example SpatialPolygonsDataFrame
arm <- getData('GADM', country='ARM', level=1)[, c('NAME_1')]
This
merge(arm, df, by.x='NAME_1', by.y='name')
fails with message
#Error in .local(x, y, ...) : non-unique matches detected
Because there are two records for "Aragatsotn" in df
. You could do
merge(arm, df, by.x='NAME_1', by.y='name', duplicateGeoms=TRUE)
But normally the sane approach is to use someting like
df <- aggregate(df[, 'value', drop=FALSE], df[, 'name', drop=FALSE], mean)
m <- merge(arm, df, by.x='NAME_1', by.y='name')
data.frame(m)
data.frame(m)
# NAME_1 value
#1 Aragatsotn 0.421576186
#2 Ararat 0.003138734
#3 Armavir 0.703402672
#4 Erevan NA
#5 Gegharkunik NA
#6 Kotayk 0.926883799
#7 Lori NA
#8 Shirak 0.430585540
#9 Syunik NA
#10 Tavush 0.121784395
#11 Vayots Dzor NA
Now, merge does not work well in this case because the names do not match. So you can use
i <- amatch(df$name, arm$NAME_1, maxDist = 3)
df$match[!is.na(i)] <- arm$NAME_1[i[!is.na(i)]]
df
# name value match
#1 Aragatsotn 0.421576186 Aragatsotn
#2 Ararat 0.003138734 Ararat
#3 Armavir 0.703402672 Armavir
#4 Gaghark'unik' 0.682169824 Gegharkunik
#5 Kotayk 0.926883799 Kotayk
#6 Lorri 0.128894086 Lori
#7 Shirak 0.430585540 Shirak
#8 Syunik' 0.163562936 Syunik
#9 Tavush 0.121784395 Tavush
#10 Vayots' Dzor 0.383439033 Vayots Dzor
#11 Yerevan City 0.168033419 <NA>
Almost there, but "Yerevan City" did not match with "Erevan". In this case you can increase maxDist
i <- amatch(df$name, arm$NAME_1, maxDist = 10)
df$match[!is.na(i)] <- arm$NAME_1[i[!is.na(i)]]
But increasing maxDist
will not always work or give the wrong matches becuase variant names can be very distinct. So in many cases you will end up doing some manual replacements like:
df[df$name=="Yerevan City", 'match'] <- "Erevan"
In both cases followed by
m <- merge(arm, df, by.x='NAME_1', by.y='match')
In any case you will want to check if sum(table(i) > 1) == 0
; although merge
should fail anyway if there are duplicate matches.