Function that will extract hour values from one table and populate "buckets" of one hour increments in another table

This involves bit of fiddling with the dates and time but it seems to work using dcast.

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in1 <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out1 <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Fiddling with dates and time
# This will be used to duplicate data frame x times for dcast below
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out1, dat$time_in1, units = "hours")))

dat$time_in_min <- format(dat$time_in1, format = "%M")
dat$time_out_min <- format(dat$time_out1, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$time_in2 <- strptime(dat$time_in1 + hours(1), format = "%Y-%m-%d %H")
dat$time_out2 <- strptime(dat$time_out1, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$time_in2, dat$time_in1, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out1, dat$time_out2, units = "hours"))

# For the 24 hour bucket for each person
dat$start_hr <- format(dat$time_in1, format = "%H")

# Append Data multiple times based on number of hours in between Out and In
dt <- dat[rep(seq_len(nrow(dat)), dat$diff_time), c("Date", "Person", "Time_In", "Time_Out", "start_hr", "diff_in", "diff_out", "diff_time")]
dt <- data.table(dt)

# For the 24 hour bucket for each person
dt[, rank := 1:.N, by = c("Person", "Date", "Time_In", "Time_Out")]
dt[, start_hr2 := as.numeric(start_hr) + rank]

# Combine with Time in and Out to allow fraction of hour start and end
dt[, rank2 := 1]
dt[rank == 1, rank2:= diff_in]
dt[diff_time == rank & diff_out > 0, rank2 := diff_out]

# 24 hours in a day
dt[start_hr2 > 24, start_hr2 := start_hr2 - 24]

# For the data provided it works without this line because Alexander worked 24 hours
# Need this line to include all 24 hour bucket
dt$start_hr2 <- factor(dt$start_hr2, levels = 1:24)

dt_dcast <- dcast(dt, Person + Date + Time_In + Time_Out ~ start_hr2, value.var = "rank2", fill = 0, drop = c(TRUE, FALSE))
setnames(dt_dcast, names(dt_dcast), c("Person", "Date", "Time In", "Time Out", paste0(1:24 - 1, "-", 1:24)))
dt_dcast
            Person       Date Time In Time Out 0-1  1-2 2-3 3-4 4-5 5-6 6-7 7-8 8-9 9-10 10-11 11-12 12-13 13-14 14-15 15-16 16-17 17-18 18-19 19-20 20-21 21-22 22-23 23-24
1: Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1
2:     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3:        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4:        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0

Using for loop:

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Create 'hour' gap
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out, dat$time_in, units = "hours")))
dat$time_in_min <- format(dat$time_in, format = "%M")
dat$time_out_min <- format(dat$time_out, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For the 24 hour bucket for each person
dat$start_hr <- as.numeric(format(dat$time_in + hours(1), format = "%H"))
dat$start_hr <- ifelse(dat$start_hr == 0, 24, dat$start_hr)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$hour_in <- strptime(dat$time_in + hours(1), format = "%Y-%m-%d %H")
dat$hour_out <- strptime(dat$time_out, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$hour_in, dat$time_in, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out, dat$hour_out, units = "hours"))

hr_bucket <- data.frame(matrix(0, ncol = 24, nrow = nrow(dat)))
names(hr_bucket) <- paste0(1:24-1, "_", 1:24)

stg_data <- dat[, c("start_hr", "diff_time", "diff_in", "diff_out")]
stg_calc <- cbind(stg_data, hr_bucket)

col_index <- ncol(stg_data)
for (i in 1:nrow(stg_calc)) {

  ref_start_hr <- stg_calc[i ,]$start_hr 
  ref_diff_time <- stg_calc[i ,]$diff_time

  ref_diff_in <- stg_calc[i ,]$diff_in
  ref_diff_out <- stg_calc[i ,]$diff_out

  # if a person works till the next morning
  if ((ref_start_hr + ref_diff_time) > 24) {

    offset_col_used <- 24 - ref_start_hr + 1
    offset_col_rem <- ref_diff_time - offset_col_used

    stg_calc[i, (col_index + ref_start_hr):(col_index + 24)] <- 1
    stg_calc[i, (col_index + 1):(col_index + offset_col_rem)] <- 1

  } else {

    stg_calc[i, (col_index + ref_start_hr):(col_index + ref_start_hr + ref_diff_time - 1)] <- 1

  } 

  # To adjust for fraction of hour worked at start and end
  if (stg_calc[i, ]$diff_in %% 1 > 0) stg_calc[i, col_index + ref_start_hr] <- ref_diff_in
  if (stg_calc[i, ]$diff_out %% 1 > 0) stg_calc[i, col_index + ref_start_hr + ref_diff_time - 1] <- ref_diff_out

}

dat2 <- cbind(dat[, c("Person", "Date", "Time_In", "Time_Out")], stg_calc[, names(hr_bucket)])
dat2
           Person       Date Time_In Time_Out 0_1  1_2 2_3 3_4 4_5 5_6 6_7 7_8 8_9 9_10 10_11 11_12 12_13 13_14 14_15 15_16 16_17 17_18 18_19 19_20 20_21 21_22 22_23 23_24
1        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0
2     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4 Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1

This seems to work, though it has a few kludgy steps.

library(tidyverse)
library(lubridate)

(dat<-tibble(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
             Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
             Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
             Time_Out = c("2:30pm","3:33pm","3:00am","1:00am")))

dat2 <- dat %>%
  mutate(Time_In2 = mdy_hm(paste(Date, Time_In)),
         Time_Out2 = mdy_hm(paste(Date, Time_Out)),
         Time_Out2 = Time_Out2 + if_else(Time_Out2 <= Time_In2, ddays(1), 0)) %>%
  select(Person, Time_In2, Time_Out2) %>%
  gather(type, time, -Person) %>%

  # Kludge #1: gather seems to have converted POSIXct into numeric, switch back
  mutate(time = as.POSIXct(time, origin="1970-01-01", tz = "UTC")) %>%

  # Kludge #2: add rows for all minutes of day for each person.
  #   Clearly not most efficient method! This might be slowish if you have
  #   many thousands of Person values.
  group_by(Person) %>%
  padr::pad(interval = "min") %>%
  mutate(hour = hour(time)) %>%
  # Exclude ending minute to avoid double-counting
  filter(type != "Time_Out2" | is.na(type)) %>%
  ungroup() %>%

  count(Person, hour) %>%
  mutate(n = n/60) %>%
  spread(hour, n, fill = 0)

> dat2
# A tibble: 4 x 25
  Person            `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`  `12`  `13`  `14`  `15`  `16`  `17`  `18`  `19`  `20`  `21`  `22`  `23`
  <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alexandra Wakes     1  1        1     1     1     1     1     1     1     1     1     1     1  1      1    1        1     1     1     1     1     1     1     1
2 Brian Smith         0  0.25     1     1     1     1     1     1     1     1     1     1     1  1      1    0.55     0     0     0     0     0     0     0     0
3 Jane Doe            1  1        1     0     0     0     0     0     0     0     0     0     0  0      0    0        0     0     0     0     0     0     0     1
4 John Doe            0  0        0     0     0     0     0     0     0     0     0     0     0  0.75   0.5  0        0     0     0     0     0     0     0     0

I start with @Jon Spring's data frame where the strings are transformed into datetimes. I complete the data frame with all combinations of hours, but you likely could skip that with enough people.

library(tidyverse)
library(lubridate)

dat<-tibble(ID = 1:4,
            Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
             Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
             Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
             Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Stolen from Jon Spring
# https://stackoverflow.com/a/55698472/11355066
dat2<- dat%>%
  mutate(Shift_Start = mdy_hm(paste(Date, Time_In)),
         Shift_End = mdy_hm(paste(Date, Time_Out)),
         Shift_End = Shift_End + if_else(Shift_End <= Shift_Start, ddays(1), 0)) 

# Different solution
dat2%>%
  group_by(ID, Person, Shift_Start, Shift_End)%>%
  do(
    tibble(hours_worked = seq.POSIXt(from = floor_date(.$Shift_Start, 'hour'), to = ceiling_date(.$Shift_End - dhours(), 'hour'), by = 'hour')
           ,hours_values = na.omit(
             c(if_else(minute(.$Shift_Start) == 0, NA_integer_, as.integer(60 - minute(.$Shift_Start)))
               ,rep(60L, length(seq.POSIXt(from = ceiling_date(.$Shift_Start, 'hour'), to = floor_date(.$Shift_End, 'hour'), by = 'hour'))-1)
               ,if_else(minute(.$Shift_End) == 0, NA_integer_, as.integer(minute(.$Shift_End))))
             )/60
           )
    )%>%
  complete(hours_worked = seq.POSIXt(from = min(floor_date(Shift_Start, 'day')), to = max(ceiling_date(Shift_End, 'day')), by = 'hour'))%>%
  mutate(hour_minutes = format(hours_worked, "%H:%M"))%>%
  select(-hours_worked)%>%
  na.omit()%>%
  # ID only grouped to match order of the poster
  # group_by(ID, Person, hour_minutes)%>%summarize(hours_values = sum(hours_values))%>%
  spread(hour_minutes, hours_values, fill = 0)

Function that will extract hour values from one table and populate "buckets" of one hour increments in another table

Tags:

Python

Pandas

R

Dataframe

Related

Recent Posts