解析每一行中的字符串,并将结果扩展为整洁的数据框

时间:2018-08-22 21:50:34

标签: r regex dplyr tidyverse tidyr

我有一个数据框,如:

# A tibble: 2 x 3
     id text_1                                              text_2                                                          
  <int> <chr>                                               <chr>                                                           
1     1 "{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=… "{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ddd\", :…
2     2 "{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=… "{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ccc\", :…

可重现:

# Example data: a 2-row tibble with columns id, text_1 and text_2
# (looks like dput() output).
# NOTE(review): the expression is not assigned to a name here; the answers
# below refer to this data as `df` / `dat`, so assign it before running them.
structure(list(id = 1:2, text_1 = c("{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=>{:name=>\"bbb\", :priority=>0, :count=>2}}", 
"{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=>{:name=>\"ccc\", :priority=>0, :count=>3}}"
), text_2 = c("{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ddd\", :priority=>0, :count=>2}", 
"{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ccc\", :priority=>0, :count=>2}, {:name=>\"ddd\", :priority=>0, :count=>9}"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
), spec = structure(list(cols = list(id = structure(list(), class = c("collector_integer", 
"collector")), text_1 = structure(list(), class = c("collector_character", 
"collector")), text_2 = structure(list(), class = c("collector_character", 
"collector"))), default = structure(list(), class = c("collector_guess", 
"collector"))), class = "col_spec"))

其中每行包含要分析的字符串。

第一个文本列(text_1)中的每条记录前面都带有一个标识符(例如 1=>{...}),而第二个文本列(text_2)中的每条记录只是用一对花括号括起来。

我希望得到下面这样的结果:对于在某个 id 和 level 下没有出现的产品,将其计数设为零。

产品数量有限,因此需要使用所有组合进行填充。

# A tibble: 14 x 5
      id product priority count level 
   <int> <chr>      <int> <int> <chr> 
 1     1 aaa            0     4 text_1
 2     1 bbb            0     4 text_1
 3     1 ccc            0     0 text_1
 4     2 aaa            0     5 text_1
 5     2 bbb            0     0 text_1
 6     2 ccc            0     3 text_1
 7     1 aaa            0     0 text_2
 8     1 bbb            0     4 text_2
 9     1 ccc            0     0 text_2
10     1 ddd            0     2 text_2
11     2 aaa            0     0 text_2
12     2 bbb            0     4 text_2
13     2 ccc            0     2 text_2
14     2 ddd            0     9 text_2

我认为必须使用 extract 的某种组合,但我毫无头绪。

2 个答案:

答案 0 :(得分:3)

如果它最初是JSON(如@neilfws建议),则使用jsonlite包之类的内容进行解析会更容易。如果没有,请尝试以下操作:

library(tidyr)
library(dplyr)
library(stringr)

# Parse the hash-like strings in text_1/text_2 into one row per
# (id, level, product), then add every missing id/product/level combination
# with priority and count set to 0.
df %>%
  # Long form: one row per (id, level) with the raw string in `val`.
  gather(level, 'val', text_1, text_2) %>%
  # Split each string at "}," into one column per product entry. The number of
  # columns is sized from the string with the most separators; shorter strings
  # are padded with NA on the right (fill = "right" avoids the warning).
  separate(
    val,
    into = paste0('val', seq_len(max(str_count(.$val, "\\},")) + 1)),
    sep = "\\},",
    fill = "right"
  ) %>%
  # Back to long form: one row per product entry; drop the NA padding.
  gather(val, ugly_text, starts_with('val')) %>%
  select(-val) %>%
  filter(!is.na(ugly_text)) %>%
  # Pull the fields out of each entry. `[0-9]+` captures the whole number;
  # a single `[0-9]` would truncate multi-digit values to their first digit.
  mutate(product = str_match(ugly_text, ':name=>\\"(.*?)\\"')[, 2],
         priority = str_match(ugly_text, ':priority=>([0-9]+)')[, 2],
         count = str_match(ugly_text, ':count=>([0-9]+)')[, 2]) %>%
  select(id, product, priority, count, level) %>%
  # Add all id x product x level combinations. Building the grid from unique
  # values keeps it small, and stringsAsFactors = FALSE keeps the join keys
  # character so they match the parsed columns.
  full_join(
    expand.grid(
      id = unique(.$id),
      product = unique(.$product),
      level = unique(.$level),
      stringsAsFactors = FALSE
    ),
    by = c('id', 'product', 'level')
  ) %>%
  # Combinations that were not present in the data get 0; parsed strings
  # become numeric.
  mutate_at(vars(priority, count), ~if_else(is.na(.x), 0, as.numeric(.x))) %>%
  arrange(level, id, product)

# A tibble: 16 x 5
      id product priority count level 
   <int> <chr>      <dbl> <dbl> <chr> 
 1     1 aaa            0     4 text_1
 2     1 bbb            0     2 text_1
 3     1 ccc            0     0 text_1
 4     1 ddd            0     0 text_1
 5     2 aaa            0     5 text_1
 6     2 bbb            0     0 text_1
 7     2 ccc            0     3 text_1
 8     2 ddd            0     0 text_1
 9     1 aaa            0     0 text_2
10     1 bbb            0     4 text_2
11     1 ccc            0     0 text_2
12     1 ddd            0     2 text_2
13     2 aaa            0     0 text_2
14     2 bbb            0     4 text_2
15     2 ccc            0     2 text_2
16     2 ddd            0     9 text_2

我不确定您写“产品数量有限,因此需要使用所有组合进行填充”时的意思,但这至少应该是一个开始。

答案 1 :(得分:1)

这是我使用 tidyverse 包清理该数据集的尝试。结果与所需的输出不同,因为您的输出在某些组中包含了诸如 ccc 或 ddd 之类的组合。但是,我无法弄清楚其中的规则,例如为什么 text_2 有 aaa,而 text_1 却没有 ddd。因此,除非有新信息,否则我决定保持原样。

library(tidyverse)

# Parse every "key=>value" fragment into tidy columns: one row per
# (id, level, product) with its priority and count (all kept as character).
dat2 <- dat %>%
    # Long form: one row per (id, level) with the raw string in `text`.
    gather(level, text, starts_with("text_")) %>%
    # Split on ", " so each row holds a single "key=>value" fragment.
    separate_rows(text, sep = ", ") %>%
    # Split on "=>" and keep only the last two pieces (key, value); this drops
    # the leading identifier that text_1 entries carry ("1=>{:name=>...").
    mutate(text = map(str_split(text, pattern = "=>"), function(parts) {
        len <- length(parts)
        parts[c(len - 1, len)]
    })) %>%
    # Strip braces, colons and quotes from both pieces.
    mutate(text = map(text, ~str_replace_all(.x, "[:punct:]", ""))) %>%
    mutate(Column = map_chr(text, 1), Value = map_chr(text, 2)) %>%
    # Pass the function directly; funs() is deprecated since dplyr 0.8.0.
    mutate_at(vars(Column, Value), str_trim) %>%
    select(-text) %>%
    # Every "name" fragment starts a new product record, so the running count
    # of "name" rows identifies which record a fragment belongs to.
    mutate(Group = cumsum(Column %in% "name")) %>%
    # Wide form: one row per record with name / priority / count columns.
    spread(Column, Value) %>%
    select(id, product = name, priority, count, level) %>%
    arrange(level, id, product)
dat2
# # A tibble: 9 x 5
#      id product priority count level 
#   <int> <chr>   <chr>    <chr> <chr> 
# 1     1 aaa     0        4     text_1
# 2     1 bbb     0        2     text_1
# 3     2 aaa     0        5     text_1
# 4     2 ccc     0        3     text_1
# 5     1 bbb     0        4     text_2
# 6     1 ddd     0        2     text_2
# 7     2 bbb     0        4     text_2
# 8     2 ccc     0        2     text_2
# 9     2 ddd     0        9     text_2

我尝试添加一个 complete 调用,将数据框扩展为所有组合。但是,现在输出的行数比所需的输出多,因为 ddd 现在也出现在 text_1 中。同样,不清楚创建所需输出的规则是什么。

library(tidyverse)

# Same parsing as before, then expand to every name x id x level combination,
# filling the combinations that were not in the data with "0".
dat2 <- dat %>%
    # Long form: one row per (id, level) with the raw string in `text`.
    gather(level, text, starts_with("text_")) %>%
    # Split on ", " so each row holds a single "key=>value" fragment.
    separate_rows(text, sep = ", ") %>%
    # Split on "=>" and keep only the last two pieces (key, value); this drops
    # the leading identifier that text_1 entries carry ("1=>{:name=>...").
    mutate(text = map(str_split(text, pattern = "=>"), function(parts) {
        len <- length(parts)
        parts[c(len - 1, len)]
    })) %>%
    # Strip braces, colons and quotes from both pieces.
    mutate(text = map(text, ~str_replace_all(.x, "[:punct:]", ""))) %>%
    mutate(Column = map_chr(text, 1), Value = map_chr(text, 2)) %>%
    # Pass the function directly; funs() is deprecated since dplyr 0.8.0.
    mutate_at(vars(Column, Value), str_trim) %>%
    select(-text) %>%
    # Every "name" fragment starts a new product record.
    mutate(Group = cumsum(Column %in% "name")) %>%
    # Wide form: one row per record with name / priority / count columns.
    spread(Column, Value) %>%
    # priority and count are character after spread(), so the fill values must
    # be character too; a numeric 0 cannot be cast to character by current
    # tidyr and makes complete() fail.
    complete(name, id, level, fill = list(priority = "0", count = "0")) %>%
    select(id, product = name, priority, count, level) %>%
    arrange(level, id, product)
dat2
# # A tibble: 16 x 5
#      id product priority count level 
#   <int> <chr>   <chr>    <chr> <chr> 
# 1     1 aaa     0        4     text_1
# 2     1 bbb     0        2     text_1
# 3     1 ccc     0        0     text_1
# 4     1 ddd     0        0     text_1
# 5     2 aaa     0        5     text_1
# 6     2 bbb     0        0     text_1
# 7     2 ccc     0        3     text_1
# 8     2 ddd     0        0     text_1
# 9     1 aaa     0        0     text_2
#10     1 bbb     0        4     text_2
#11     1 ccc     0        0     text_2
#12     1 ddd     0        2     text_2
#13     2 aaa     0        0     text_2
#14     2 bbb     0        4     text_2
#15     2 ccc     0        2     text_2
#16     2 ddd     0        9     text_2