Question

我有一个包含两个变量的数据集：日期和某个人的服务年限（仅用于制作一个小的可复现示例）。我需要得到此人开始工作的月份（本例中为 1989-06）；如果该解决方案要适用于多个人，还需考虑到开始工作的月份可能因人而异。期望的结果如下：

library(data.table)
# Reproducible example from the question: 36 monthly observations
# ("2009-01" through "2011-12") for a single person. Years_service steps from
# 19 to 20 at "2009-06"; INITIAL_MONTH carries the expected answer ("1989-06")
# on every row.
dt <- structure(list(DATE = c("2009-01", "2009-02", "2009-03", "2009-04", 
                          "2009-05", "2009-06", "2009-07", "2009-08", "2009-09", "2009-10", 
                          "2009-11", "2009-12", "2010-01", "2010-02", "2010-03", "2010-04", 
                          "2010-05", "2010-06", "2010-07", "2010-08", "2010-09", "2010-10", 
                          "2010-11", "2010-12", "2011-01", "2011-02", "2011-03", "2011-04", 
                          "2011-05", "2011-06", "2011-07", "2011-08", "2011-09", "2011-10", 
                          "2011-11", "2011-12"), Years_service = c(19, 19, 19, 19, 19, 
                                                                   20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 
                                                                   21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22), 
                 INITIAL_MONTH = c("1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06")), .Names = c("DATE", "Years_service", 
                                                                      "INITIAL_MONTH"), class = c("data.table", "data.frame"), row.names = c(NA,-36L))

head(dt)
      DATE Years_service INITIAL_MONTH
1: 2009-01            19       1989-06
2: 2009-02            19       1989-06
3: 2009-03            19       1989-06
4: 2009-04            19       1989-06
5: 2009-05            19       1989-06
6: 2009-06            20       1989-06

如何在R中获得它？

Answer 1

我们可以在Years_service列中找到第一次发生变化的位置，然后用该索引处对应的DATE值减去该处的服务年限（Years_service）。

library(dplyr)
library(lubridate)

# Derive the starting month for a single person:
#   inds        - position of the first row whose Years_service differs from
#                 the previous row (i.e. the first service anniversary seen)
#   anniversary - that row's DATE parsed as the first day of the month
#   init_month  - anniversary minus the years of service reached on that row,
#                 formatted back to "YYYY-MM"
# The two helper columns are dropped before the result is returned.
dt %>%
  mutate(
    inds = 1 + which.max(diff(Years_service) != 0),
    anniversary = as.Date(paste0(DATE[inds], "-01")),
    init_month = format(anniversary - years(Years_service[inds]), "%Y-%m")
  ) %>%
  select(-inds, -anniversary)

#      DATE Years_service INITIAL_MONTH init_month
#1  2009-01            19       1989-06    1989-06
#2  2009-02            19       1989-06    1989-06
#3  2009-03            19       1989-06    1989-06
#4  2009-04            19       1989-06    1989-06
#....

您可能想对多个人执行此操作，可以在其中添加group_by子句

# Same computation as the single-person version, evaluated independently for
# each person.
# NOTE(review): assumes dt has an identifier column named `person` — adjust the
# name to match the actual data.
#   inds       - within each group, position of the first row whose
#                Years_service differs from the previous row
#   init_month - DATE on that row (as first of month) minus the years of
#                service reached there, formatted as "YYYY-MM"
# ungroup() is applied at the end so the returned table does not stay grouped
# and silently change the behaviour of later verbs.
dt %>%
  group_by(person) %>%
  mutate(
    inds = which.max(diff(Years_service) != 0) + 1,
    init_month = format(
      as.Date(paste0(DATE[inds], "-01")) - years(Years_service[inds]),
      "%Y-%m"
    )
  ) %>%
  ungroup() %>%
  select(-inds)

编辑

针对更新后的情况（数据可能没有按日期排序），我们需要先按日期排序（arrange）

# Build a reverse-chronological copy of dt to stand in for unsorted input.
dt1 <- dt[order(-DATE)]

# Parse DATE into a real Date column, restore chronological order, then apply
# the first-change logic on the sorted rows. Only the helper index `inds` is
# dropped; the parsed `dates` column stays in the result.
dt1 %>%
  mutate(dates = as.Date(sprintf("%s-01", DATE))) %>%
  arrange(dates) %>%
  mutate(
    inds = 1 + which.max(diff(Years_service) != 0),
    init_month = format(dates[inds] - years(Years_service[inds]), "%Y-%m")
  ) %>%
  select(-inds)

Answer 2

Base R解决方案

使用seq来反算月数

使用sprintf函数给DATE补上“日”（%d）部分，创建一个可以被as.Date解析的新字符串向量

# Append a day component so each "YYYY-MM" string becomes a parseable
# "YYYY-MM-01" date string.
dt$Date <- paste0(dt$DATE, "-01")

创建-X months格式的字符串向量，以在seq中向后计数

# Build one step string per row for later use as a `by` step, e.g.
# "-19 months" when Years_service is 19.
# NOTE(review): the unit written here is months although the source column is
# named Years_service — confirm this is the intended step size.
dt$Back_step <- sprintf("-%s months",dt$Years_service)

使用for循环逐行遍历，计算每一行的日期向前回退X个月后得到的日期

# For every row, step back from the row's date by the row's Back_step and
# store the month that is reached.
# seq(from, length.out = 2, by = step) yields c(from, from + step); element [2]
# is therefore the stepped-back date.
# - seq_len() keeps the loop a no-op on an empty table (1:0 would iterate over
#   c(1, 0)).
# - length.out is spelled out instead of relying on partial matching of
#   `length`.
# - format(..., "%Y-%m") keeps INITIAL_MONTH in the same year-month form as the
#   values already stored in that column.
for (i in seq_len(nrow(dt))) {
  stepped <- seq(as.Date(dt$Date[i], format = "%Y-%m-%d"),
                 length.out = 2, by = dt$Back_step[i])
  dt$INITIAL_MONTH[i] <- format(stepped[2], "%Y-%m")
}

[2]表明我们正在获取序列中的第二个值

Answer 3

还添加一个data.table解决方案。

# Find the initial month
# Sort chronologically and record the change in Years_service relative to the
# previous row (NA on the first row).
dt1 <- dt[order(DATE)]
dt1[, diff := Years_service - shift(Years_service, n = 1L, type = "lag")]
# Keep only the first row on which the years of service went up by one.
dt2 <- head(dt1[diff == 1], 1L)
# Starting year = calendar year of that row minus the years served;
# the month part of DATE is carried over unchanged.
dt2[, init_month := paste0(
  as.numeric(substr(DATE, 1, 4)) - Years_service, "-", substr(DATE, 6, 7)
)]
# Broadcast the single result to every row of the original table.
init_mon <- dt2[["init_month"]][1]
dt <- dt[, init_month := init_mon]

如果数据中有多个人：

library(data.table)
# Two-person example: the 36 monthly observations ("2009-01" .. "2011-12") are
# repeated for PERSON 1 and PERSON 2, giving 72 rows in total. The compact
# row.names attribute must therefore be c(NA, -72L); the previous value of
# -36L described only half of the rows and left the object inconsistent for
# any code that reads the row count from row.names.
dt <- structure(list(PERSON = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
                                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
                     DATE = c("2009-01", "2009-02", "2009-03", "2009-04", 
                          "2009-05", "2009-06", "2009-07", "2009-08", "2009-09", "2009-10", 
                          "2009-11", "2009-12", "2010-01", "2010-02", "2010-03", "2010-04", 
                          "2010-05", "2010-06", "2010-07", "2010-08", "2010-09", "2010-10", 
                          "2010-11", "2010-12", "2011-01", "2011-02", "2011-03", "2011-04", 
                          "2011-05", "2011-06", "2011-07", "2011-08", "2011-09", "2011-10", 
                          "2011-11", "2011-12", "2009-01", "2009-02", "2009-03", "2009-04", 
                          "2009-05", "2009-06", "2009-07", "2009-08", "2009-09", "2009-10", 
                          "2009-11", "2009-12", "2010-01", "2010-02", "2010-03", "2010-04", 
                          "2010-05", "2010-06", "2010-07", "2010-08", "2010-09", "2010-10", 
                          "2010-11", "2010-12", "2011-01", "2011-02", "2011-03", "2011-04", 
                          "2011-05", "2011-06", "2011-07", "2011-08", "2011-09", "2011-10", 
                          "2011-11", "2011-12"), Years_service = c(19, 19, 19, 19, 19, 
                                                                   20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 
                                                                   21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 
                                                                   20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 
                                                                   21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22), 
                 INITIAL_MONTH = c("1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", "1989-06", 
                                   "1989-06", "1989-06")), .Names = c("PERSON", "DATE", "Years_service", 
                                                                      "INITIAL_MONTH"), class = c("data.table", "data.frame"), row.names = c(NA,-72L))


head(dt)

# PERSON    DATE    Years_service   INITIAL_MONTH
# 1         2009-01 19              1989-06
# 1         2009-02 19              1989-06
# 1         2009-03 19              1989-06
# 1         2009-04 19              1989-06
# 1         2009-05 19              1989-06
# 1         2009-06 20              1989-06

在计算中添加分组依据

# Sort by person and date, then compute the change in Years_service relative
# to the previous row within each person (NA on each person's first row).
dt1 <- dt[order(PERSON, DATE)]
dt1[, diff := Years_service - shift(Years_service), by = "PERSON"]
# For each person keep the first row on which the years of service went up by
# one.
dt2 <- dt1[diff == 1, head(.SD, 1), by = "PERSON"]
# Starting year = calendar year of that row minus the years served;
# the month part of DATE is carried over unchanged.
dt2[, init_month := paste0(
  as.numeric(substr(DATE, 1, 4)) - Years_service, "-", substr(DATE, 6, 7)
)]
# Attach each person's result to all of their rows. merge() takes its join
# columns through `by`; `on` is not one of its arguments, so the key is named
# explicitly here instead of relying on the default of all shared column
# names.
dt <- merge(dt, dt2[, list(PERSON, init_month)], by = "PERSON", all.x = TRUE)

从日期列表中获取初始月份

3 个答案: