将嵌套的参差不齐的列分成固定长度的块

时间:2018-06-30 06:29:38

标签: r dplyr data-manipulation tidyr purrr

我有一个数据集,其中一列是字符串:第1位数字表示星期几,后面跟着任意数量的10位数字块:

# A tibble: 7 x 3
  respid            record_type record_data                                                  
  <chr>             <chr>       <chr>                                                        
1 20163911123050111 6           1000456561200035759120000989800                              
2 20163911123050111 6           2000405161200031719120000999900                              
3 20163911123050111 6           30004071212000320212200032832220003545620                    
4 20163911123050111 6           40004051612000326272200033032220003545620                    
5 20163911123050111 6           5036803031200040404120004051812000434361200045556120003575910
6 20163911123050111 6           6000411161200031720120003283121000344462100035759120004707410
7 20163911123050111 6           70004111312000314261200043334120004535610 

我想通过以下方式将其转换为长格式: 1.将第3列拆分为一个长度为1的固定块,以及其后一系列长度为10的块
2.从宽到长

例如,上面的第一行将变为3行:

  respid            record_type dayofweek  chunk                                               
  <chr>             <chr>       <chr>       <chr>                                                  
1 20163911123050111 6           1         0004565612  
2 20163911123050111 6           1         0003575912  
3 20163911123050111 6           1         0000989800  

到目前为止,我用下面的代码完成了第一部分,但它用的是循环……:

# Build one list entry per row of `mydf`, holding the row's respid and
# record_type, the leading day-of-week digit of column 3, and the rest of
# column 3 cut into consecutive 10-character chunks.
my_list <- vector("list", nrow(mydf))  # preallocate instead of growing
for (i in seq_len(nrow(mydf))) {       # seq_len() is safe when there are 0 rows
  # Pull the record string out as a plain scalar so the string functions get a
  # character value whether `mydf` is a data.frame or a tibble.
  record <- mydf[[i, 3]]

  temp_list <- list()
  temp_list$respid <- mydf[i, 1]
  temp_list$record_type <- mydf[i, 2]
  # Day of week is the first character (read from `mydf`, like everything else).
  temp_list$dayofweek <- stringi::stri_sub(record, 1, 1)
  # Chunks start at position 2 and repeat every 10 characters.
  temp_list$chunk <- stringi::stri_sub(
    record,
    seq(2, stringi::stri_length(record), by = 10),
    length = 10
  )

  my_list[[i]] <- temp_list
}

有没有办法用 purrr::map 和 tidyr::unnest 之类的函数来实现?

2 个答案:

答案 0 :(得分:2)

一种方法是先把record_data的第1个字符提取为dayofweek。然后,在其余部分每10个字符之后追加一个分隔符(例如逗号 `,`),这样record_data就可以直接用tidyr::separate_rows展开成多行

library(tidyverse)

df %>%
  mutate(
    # The leading character is the day of the week.
    dayofweek = substr(record_data, 1, 1),
    # Drop that character, then put a comma after every complete run of
    # 10 digits in what remains.
    record_data = gsub("(\\d{10})", "\\1,", substring(record_data, 2)),
    # A string that ends on a complete chunk now has a dangling comma; strip it.
    record_data = sub(",$", "", record_data)
  ) %>%
  # One output row per comma-separated chunk.
  separate_rows(record_data)

#               respid record_type dayofweek record_data
# 1  20163911123050112           6         1  0004565612
# 2  20163911123050112           6         1  0003575912
# 3  20163911123050112           6         1  0000989800
# 4  20163911123050112           6         2  0004051612
# 5  20163911123050112           6         2  0003171912
# 6  20163911123050112           6         2  0000999900
# 7  20163911123050112           6         3  0004071212
# 8  20163911123050112           6         3  0003202122
# 9  20163911123050112           6         3  0003283222
# 10 20163911123050112           6         3  0003545620
# 11 20163911123050112           6         4  0004051612
# 12 20163911123050112           6         4  0003262722
# 13 20163911123050112           6         4  0003303222
# 14 20163911123050112           6         4  0003545620
# 15 20163911123050112           6         5  0368030312
# 16 20163911123050112           6         5  0004040412
# 17 20163911123050112           6         5  0004051812
# 18 20163911123050112           6         5  0004343612
# 19 20163911123050112           6         5  0004555612
# 20 20163911123050112           6         5  0003575910
# 21 20163911123050112           6         6  0004111612
# 22 20163911123050112           6         6  0003172012
# 23 20163911123050112           6         6  0003283121
# 24 20163911123050112           6         6  0003444621
# 25 20163911123050112           6         6  0003575912
# 26 20163911123050112           6         6  0004707410
# 27 20163911123050112           6         7  0004111312
# 28 20163911123050112           6         7  0003142612
# 29 20163911123050112           6         7  0004333412
# 30 20163911123050112           6         7  0004535610

数据:

# Example data. respid is read as character: the ids have 17 digits, which is
# more than a double can hold exactly (integers are exact only up to 2^53), so
# reading the column as "numeric" silently changes the last digit
# (...111 turns into ...112).
df <- read.table(text ="
respid            record_type record_data
20163911123050111 6           1000456561200035759120000989800
20163911123050111 6           2000405161200031719120000999900                              
20163911123050111 6           30004071212000320212200032832220003545620
20163911123050111 6           40004051612000326272200033032220003545620                    
20163911123050111 6           5036803031200040404120004051812000434361200045556120003575910
20163911123050111 6           6000411161200031720120003283121000344462100035759120004707410
20163911123050111 6           70004111312000314261200043334120004535610",
header = TRUE, colClasses = c("character", "integer", "character"))

答案 1 :(得分:2)

我们可以定义一个函数,把字符串每10位拆分一次并返回一个列表。然后,用separate函数把星期几和其余部分分开。最后,应用前面定义的函数,并对数据框执行unnest。

# Split a single string into consecutive pieces of `width` characters.
#
# Arguments:
#   string  A length-1 value; it is coerced to character before splitting.
#   width   Number of characters per piece (default 10). Must be a single
#           number >= 1; fractional values are truncated to an integer.
#
# Returns a list with one length-1 character element per piece, in order.
# The last piece is shorter when nchar(string) is not a multiple of `width`.
# An empty string gives an empty list; a missing string gives
# list(NA_character_) so the value survives a later unnest.
string_split <- function(string, width = 10){
  # A width below 1 can never consume the string, so reject it up front.
  stopifnot(
    length(string) == 1L,
    is.numeric(width), length(width) == 1L, !is.na(width), width >= 1
  )
  width <- as.integer(width)
  string <- as.character(string)

  if (is.na(string)) {
    return(list(NA_character_))
  }

  n_chars <- nchar(string)
  if (n_chars == 0L) {
    return(list())
  }

  # Compute every start position at once and let substring() do the cutting.
  starts <- seq.int(1L, n_chars, by = width)
  lst <- as.list(substring(string, starts, starts + width - 1L))
  return(lst)
}


library(tidyverse)

dat2 <- dat %>%
  # Split record_data after its first character: the day-of-week digit goes to
  # `dayofweek`, everything after it to `chunk`.
  separate(record_data, into = c("dayofweek", "chunk"), sep = 1) %>%
  # Cut each remainder into 10-character pieces. string_split() returns a list,
  # so flatten it to a character vector; otherwise newer tidyr versions keep
  # `chunk` as a list-column after unnesting.
  mutate(chunk = map(chunk, function(x) unlist(string_split(x)))) %>%
  # Name the column to expand; calling unnest() with no column is deprecated.
  unnest(chunk)

head(dat2)
#              respid record_type dayofweek      chunk
# 1 20163911123050111           6         1 0004565612
# 2 20163911123050111           6         1 0003575912
# 3 20163911123050111           6         1 0000989800
# 4 20163911123050111           6         2 0004051612
# 5 20163911123050111           6         2 0003171912
# 6 20163911123050111           6         2 0000999900

数据

# Example data. The header names three columns while each data line has four
# fields, so read.table() uses the first field as row names. Every column is
# kept as character.
dat <- utils::read.table(
  header = TRUE,
  colClasses = "character",
  text = "respid            record_type record_data                                                  
1 20163911123050111 6           1000456561200035759120000989800                              
2 20163911123050111 6           2000405161200031719120000999900                              
3 20163911123050111 6           30004071212000320212200032832220003545620                    
4 20163911123050111 6           40004051612000326272200033032220003545620                    
5 20163911123050111 6           5036803031200040404120004051812000434361200045556120003575910
6 20163911123050111 6           6000411161200031720120003283121000344462100035759120004707410
7 20163911123050111 6           70004111312000314261200043334120004535610"
)