general lag in time series panel data
I stumbled over a similar problem and wrote a function.
# The data must be a balanced panel, sorted by id and then by date.
# Note: the function drops the rows whose lagged value would be NA
# rather than keeping them with an NA entry.
df <- data.frame(
  id = rep(c(1, 2), each = 5),
  date = c(1992, 1993, 1991, 1990, 1994, 1992, 1991, 1994, 1990, 1993),
  value = c(4.1, 4.5, 3.3, 5.3, 3.0, 3.2, 5.2, 5.3, 3.4, 5.6)
)
# Sort the panel by id, then by date within each id.
library(dplyr)
df <- df %>% arrange(id, date)
#' Lag one or more columns of a balanced panel by `q` periods
#'
#' Keeps the rows whose date is at least `min(date) + q` and overwrites each
#' column named in `b` with the values taken, in row order, from the rows
#' whose date is at most `max(date) - q`. The rows that would have received
#' an `NA` lag (the first `q` periods) are therefore dropped, not kept.
#'
#' The pairing is purely positional, so the result is only a correct lag when
#' `a` is a balanced panel sorted by id and then by date. An unbalanced panel
#' can only be detected when the two row sets differ in size.
#'
#' @param a Data frame holding the panel (balanced, sorted by id and date).
#' @param b Column name(s) of the variable(s) to lag. May be empty, in which
#'   case only the row filtering is applied.
#' @param q Number of periods (e.g. years) to lag by.
#' @param t Column name of the date/time column.
#' @return `a` restricted to dates `>= min(date) + q`, with the columns in
#'   `b` replaced by their values from `q` periods earlier.
retraso <- function(a, b, q, t) {
  dates <- a[[t]]
  sto <- max(as.numeric(unique(dates)))
  sta <- min(as.numeric(unique(dates)))

  # which() drops NA comparisons, so rows with a missing date are excluded.
  receivers <- which(dates >= (sta + q))
  donors <- which(dates <= (sto - q))

  if (length(receivers) != length(donors)) {
    stop(
      "cannot lag by ", q, ": ", length(receivers), " rows would receive a ",
      "lagged value but ", length(donors), " rows supply one; ",
      "the panel must be balanced and sorted by id and date",
      call. = FALSE
    )
  }

  yo <- a[receivers, , drop = FALSE]
  # Iterating over b directly is a no-op for empty b (1:length(b) is not)
  # and indexing the source column directly preserves its type.
  for (col in b) {
    yo[[col]] <- a[[col]][donors]
  }
  yo
}
# Lag "value" by one year, using "date" as the time column.
df <- retraso(a = df, b = "value", q = 1, t = "date")
I think the easiest way, especially if you plan to do further analysis, is to convert your data frame to the pdata.frame class from the plm package.
After the conversion, the diff() and lag() operators can be used to create panel differences and lags.
# pdata.frame() is provided by the plm package, so attach it first.
library(plm)
# Declare the panel structure: "id" and "date" are the index columns.
df <- pdata.frame(df, index = c("id", "date"))
# Add l_value as the one-period lag of value.
# NOTE(review): lag() is expected to resolve to the panel-aware method here;
# verify this when other packages that define lag() (e.g. dplyr) are attached.
df <- transform(df, l_value = lag(value, 1))
For a panel without missing obs this is an intuitive solution:
df <- data.frame(
  id = rep(c(1, 2), times = c(5, 2)),
  date = c(1992, 1993, 1991, 1990, 1994, 1992, 1991),
  value = c(4.1, 4.5, 3.3, 5.3, 3.0, 3.2, 5.2)
)
# Order the rows by id, then by date within each id.
df <- df[order(df$id, df$date), ]
# Shift value down by one row so each row holds the previous row's value.
df$l_value <- c(NA, head(df$value, -1))
# Blank the lag wherever the previous row belongs to a different id.
df$l_value[which(df$id != c(NA, head(df$id, -1)))] <- NA
df
id date value l_value
4 1 1990 5.3 NA
3 1 1991 3.3 5.3
1 1 1992 4.1 3.3
2 1 1993 4.5 4.1
5 1 1994 3.0 4.5
7 2 1991 5.2 NA
6 2 1992 3.2 5.2
You can use ddply from the plyr package: it cuts a data.frame into pieces and transforms each piece.
# Example data: three users with ten consecutive daily observations each,
# the dates running on from today across all thirty rows.
d <- data.frame(
  User = rep(c("A", "B", "C"), each = 10),
  Date = seq(Sys.Date(), by = "day", length.out = 30),
  Value = rep(1:10, times = 3)
)
library(plyr)
# Within each User, replace Value by the previous row's Value
# (NA for the first row of each User).
# This assumes that the data is sorted.
d <- ddply(
  .data = d,
  .variables = .(User),
  .fun = transform,
  Value = c(NA, head(Value, -1))
)