Adding a vector as new columns in existing data frame

时间:2018-03-25 19:04:44

标签: r dataframe dplyr data.table tidyverse

Sample data

df <- data.frame(location = rep(1:1000, each = 36), 
                 year = rep(1980:2015,times = 1000),
                 mu = runif(min =  36.5, max = 43.2, 1000*36),
                 lambda = runif(min =  4.5, max = 4.8, 1000*36))

This data consits of 1000 locations and 36 years with two variables mu and lambda

For each location X year combination, I have a function which takes a value of lambda and mu and generates a vector of size 12. An example:

library(grofit)    
dat <- df[df$location == 1 & df$year == 1980,]  
y <- round(gompertz(1:12,100,dat$mu,dat$lambda), digits = 2)
y

  [1]  0.00  0.00  0.00  0.72 18.60 56.37 82.26 93.56 97.76 99.23
  [11] 99.74 99.91

If I want to add y as columns to dat

  new.col <- 5:16 
  dat[new.col] <- y
  dat

  location year       mu   lambda V5 V6 V7   V8   V9   V10   V11
     1     1980 39.60263 4.554095  0  0  0 0.72 18.6 56.37 82.26
  V12   V13   V14   V15   V16
  1 93.56 97.76 99.23 99.74 99.91

As you see, I have attached y as columns V5 till V16 in the dat. I want to repeat this for all location and year combination in df. I hope this is clear.

df %>% group_by(location year) %>% mutate(?? how to I add new columns for y??)

3 个答案:

答案 0 :(得分:4)

You could use lapply():

library(grofit)    
df2 <- do.call(rbind, lapply(1:nrow(df), 
                             function(x) round(gompertz(1:12, 100, df[x, 3], df[x, 4]), 
                                               digits = 2)))
df3 <- cbind(df, df2)

result:

> head(df3)
  location year       mu   lambda 1 2 3    4     5     6     7     8     9    10    11    12
1        1 1980 43.04565 4.536717 0 0 0 0.61 20.58 61.23 85.88 95.39 98.54 99.55 99.86 99.96
2        1 1981 39.00524 4.505235 0 0 0 0.96 20.02 57.28 82.45 93.53 97.71 99.20 99.72 99.90
3        1 1982 41.60206 4.619627 0 0 0 0.42 17.07 56.52 83.18 94.23 98.10 99.38 99.80 99.94
4        1 1983 42.01069 4.689058 0 0 0 0.26 14.87 54.43 82.35 93.99 98.04 99.37 99.80 99.94
5        1 1984 40.34275 4.692595 0 0 0 0.30 14.36 52.30 80.54 93.03 97.61 99.20 99.73 99.91
6        1 1985 41.13246 4.641404 0 0 0 0.38 16.20 55.15 82.32 93.84 97.94 99.32 99.78 99.93

data:

set.seed(47)  # for sake of reproducibility
df <- data.frame(location = rep(1:1000, each = 36), 
                 year = rep(1980:2015, times = 1000),
                 mu = runif(min =  36.5, max = 43.2, 1000 * 36),
                 lambda = runif(min =  4.5, max = 4.8, 1000 * 36))

答案 1 :(得分:3)

Here is a tidyverse solution :

df <- head(df) # we'll work on a sample

library(tidyverse)
df %>%
  mutate(y = map2(mu,lambda,gompertz,time= 1:12,A = 100),
         y = map(y,. %>% round(2) %>% t %>% as_tibble)) %>% # we reformat the vectors as one line tibbles for smooth unnesting
  unnest %>%
  rename_at(5:16,~paste0("y",1:12))
#   location year       mu   lambda y1 y2 y3   y4    y5    y6    y7    y8    y9   y10   y11   y12
# 1        1 1980 38.52133 4.793232  0  0  0 0.20 11.20 46.38 76.37 90.97 96.73 98.84 99.59 99.86
# 2        1 1981 41.05032 4.668713  0  0  0 0.32 15.29 54.04 81.74 93.61 97.86 99.29 99.77 99.92
# 3        1 1982 36.76366 4.687794  0  0  0 0.45 13.67 48.07 76.37 90.55 96.41 98.66 99.51 99.82
# 4        1 1983 42.47994 4.766380  0  0  0 0.14 12.55 51.99 81.37 93.71 97.97 99.36 99.80 99.94
# 5        1 1984 36.58161 4.510503  0  0  0 1.09 18.81 53.90 79.56 91.89 96.92 98.85 99.57 99.84
# 6        1 1985 41.77695 4.705588  0  0  0 0.23 14.29 53.52 81.81 93.75 97.95 99.34 99.79 99.93

and a base version that should run faster:

new_df <- cbind(df,round(t(mapply(gompertz,  df$mu, df$lambda,MoreArgs = list(time= 1:12, A = 100))),2))
names(new_df)[5:16] <- paste0("y",1:12)
#   location year       mu   lambda y1 y2 y3   y4    y5    y6    y7    y8    y9   y10   y11   y12
# 1        1 1980 38.52133 4.793232  0  0  0 0.20 11.20 46.38 76.37 90.97 96.73 98.84 99.59 99.86
# 2        1 1981 41.05032 4.668713  0  0  0 0.32 15.29 54.04 81.74 93.61 97.86 99.29 99.77 99.92
# 3        1 1982 36.76366 4.687794  0  0  0 0.45 13.67 48.07 76.37 90.55 96.41 98.66 99.51 99.82
# 4        1 1983 42.47994 4.766380  0  0  0 0.14 12.55 51.99 81.37 93.71 97.97 99.36 99.80 99.94
# 5        1 1984 36.58161 4.510503  0  0  0 1.09 18.81 53.90 79.56 91.89 96.92 98.85 99.57 99.84
# 6        1 1985 41.77695 4.705588  0  0  0 0.23 14.29 53.52 81.81 93.75 97.95 99.34 99.79 99.93

An alternative to mapply that is not used so often is Vectorize, in this case I think its use is justified as this function seems like it really should be vectorized in the first place.

gompertz2 <- Vectorize(gompertz,c("mu","lambda"))
new_df <- cbind(df,round(t(gompertz2(1:12, 100, df$mu,df$lambda)),2))
names(new_df)[5:16] <- paste0("y",1:12)

# same output

答案 2 :(得分:1)

With the data that you generated, there is no need to summarise() with dplyr. Each record is unique. So this seems more like a place to use apply().

There are ways to loop through this; I just created twelve statements. We are passing the mu,lamda columns of df to the apply function and then using your function across each 36000 rows to grab the 12 pieces of that vector into 12 new variables y1:y12.

df$y1 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[1])
df$y2 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[2])
df$y3 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[3])
df$y4 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[4])
df$y5 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[5])
df$y6 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[6])
df$y7 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[7])
df$y8 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[8])
df$y9 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[9])
df$y10 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[10])
df$y11 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[11])
df$y12 <- apply(df[,3:4], 1, function(x) round(gompertz(1:12,100,x[1],x[2]), digits = 2)[12])
head(df)

  location year       mu   lambda y1 y2 y3   y4    y5    y6    y7    y8    y9   y10   y11   y12
1        1 1980 38.70790 4.531560  0  0  0 0.86 19.00 56.00 81.67 93.18 97.56 99.14 99.70 99.89
2        1 1981 42.64717 4.765444  0  0  0 0.14 12.60 52.22 81.56 93.81 98.01 99.37 99.80 99.94
3        1 1982 39.19041 4.527792  0  0  0 0.85 19.33 56.75 82.27 93.49 97.71 99.20 99.73 99.91
4        1 1983 37.50859 4.565435  0  0  0 0.79 17.46 53.28 79.68 92.13 97.09 98.94 99.62 99.86
5        1 1984 36.71666 4.779357  0  0  0 0.27 11.29 44.76 74.36 89.65 96.05 98.53 99.45 99.80
6        1 1985 42.11325 4.783322  0  0  0 0.13 11.99 50.91 80.66 93.39 97.85 99.31 99.78 99.93

NOTE: within dplyr you could also do something like:

df <- df %>% rowwise() %>% mutate(y1 = round(gompertz(1:12,100,mu,lambda), digits = 2)[1],
                                  y2 = round(gompertz(1:12,100,mu,lambda), digits = 2)[2],
                                  y3 = round(gompertz(1:12,100,mu,lambda), digits = 2)[3],
                                  y4 = round(gompertz(1:12,100,mu,lambda), digits = 2)[4],
                                  y5 = round(gompertz(1:12,100,mu,lambda), digits = 2)[5])

and repeat with 6-12, and achieve same result.