Question

我有几个字符串，下面是一些例子。

rfoutputtablep7q10000t20000c100
rfoutputtablep7q1000t20000c100
svmLinear2outputtablep7q20000t20000c100
svmLinear2outputtablep7q5000t20000c100

我想创建一个包含 algorithm、p、q、t 和 c 这几列的数据框，并从这些字符串中提取相应的值："outputtable"之前的内容是 algorithm，"p"之后的数字是 p 的值，"q"之后的数字是 q 的值，依此类推。

如何创建此数据框？

Answer 1

仅使用基础 R（base R）。

# Capture the algorithm name and the four numeric fields with one anchored
# pattern. Splitting on the bare letters p/q/t/c would also split inside an
# algorithm name that contains them (e.g. "rpart") and leaves an empty field
# between "outputtable" and "p" that has to be dropped by position.
pattern <- "^(.*)outputtablep(\\d+)q(\\d+)t(\\d+)c(\\d+)$"
parts <- regmatches(y, regexec(pattern, y))
# Each element is c(full match, algorithm, p, q, t, c); a string that does not
# fit the pattern yields character(0), so fail loudly instead of dropping it.
stopifnot(lengths(parts) == 6L)
res <- as.data.frame(do.call(rbind, parts)[, -1, drop = FALSE],
                     stringsAsFactors = FALSE)
names(res) <- c("algorithm", "p", "q", "t", "c")
# Everything except the algorithm column is a number.
res[-1] <- lapply(res[-1], as.numeric)
res
#   algorithm p     q     t   c
#1         rf 7 10000 20000 100
#2         rf 7  1000 20000 100
#3 svmLinear2 7 20000 20000 100
#4 svmLinear2 7  5000 20000 100

数据。

# Sample input: four quoted strings, one per line, parsed by scan() into a
# character vector.
y <- scan(
  what = character(),
  text = paste(
    '"rfoutputtablep7q10000t20000c100"',
    '"rfoutputtablep7q1000t20000c100"',
    '"svmLinear2outputtablep7q20000t20000c100"',
    '"svmLinear2outputtablep7q5000t20000c100"',
    sep = "\n"
  )
)

Answer 2

library(stringr)
# Field letters whose trailing digits should be extracted.
myd <- c("p", "q", "t", "c")
# lapply() over a named vector yields one column per letter for any length of
# x. sapply() would collapse to a plain named vector when x has a single
# element, which data.frame() turns into one 4-row column.
data.frame(
  lapply(
    setNames(myd, myd),
    # Inner call grabs e.g. "q10000"; outer call keeps only the digits.
    function(a) str_extract(str_extract(x, paste0(a, "\\d+")), "\\d+")
  )
)
#  p     q     t   c
#1 7 10000 20000 100
#2 7  1000 20000 100
#3 7 20000 20000 100
#4 7  5000 20000 100

#For first column
# regexpr() returns exactly one position (the first match) per element of x,
# so the stop index stays aligned with x even if "outputtable" occurs more
# than once; fixed = TRUE matches the marker literally instead of as a regex.
substr(x, 1, regexpr("outputtable", x, fixed = TRUE) - 1)
#[1] "rf"         "rf"         "svmLinear2" "svmLinear2"

数据

# Sample input strings used by the snippets in this answer.
x <- c(
  "rfoutputtablep7q10000t20000c100",
  "rfoutputtablep7q1000t20000c100",
  "svmLinear2outputtablep7q20000t20000c100",
  "svmLinear2outputtablep7q5000t20000c100"
)

Answer 3

使用正向前瞻来获得算法：

# Keep only the word characters that precede "outputtable": the lookahead
# asserts the marker without capturing it and the trailing .* consumes the
# rest. The pattern is anchored at the start, so a single substitution per
# string is all that can happen.
sub("^(\\w+)(?=outputtable).*", "\\1", string, perl = TRUE)

实例：https://regex101.com/r/7vDK1x/2

对 p、q、t 和 c 使用正向后顾（lookbehind）：将下面 (?<=q) 中的 q 换成相应的字母，即可提取对应的值。

# Extract the digits that directly follow "q": the lazy .*? stops at the first
# position preceded by "q" where digits start, and the trailing .* drops the
# rest. The input is named `string`, as in the algorithm snippet of this
# answer. Strings without a match are returned unchanged.
sub(".*?(?<=q)(\\d+).*", "\\1", string, perl = TRUE)

Answer 4

这是使用 stringi 包的另一种解决方案。请查看下面的基准测试，它比较了目前为止提出的所有解决方案。stringi 比基础 R 略快，但如果你想要一个简单的解决方案，它当然会稍微复杂一些。因此，根据你对速度或简洁性的偏好，两种方案都可以。不过，对于更复杂的情形，stringi 提供了更大的灵活性。（注意：这些基准测试并不完全可比，因为各个方案构建 data.frame 和转换类型的方式各不相同。）

更新：针对 Rui Barradas 的评论，我更新了答案中的代码。（i）我给出了一个基于 stringi 方法的函数，其中包括将列转换为数值；要完成这个任务，我会采用这种做法。（ii）此外，我扩充了基准测试，使其包含迄今为止提出的所有方法（包括评论中的方法）。为了使比较尽量公平，我修改了各个方法，使它们的输出相同。在比较中，我跳过了将列转换为数值的步骤，并通过避免临时赋值等方式使各条命令同样简洁。

似乎stringi仍然是最快的。

如果我在公平比较方面忽略了什么，请纠正我（特别是 stringr 的解决方案在代码上可能还有改进空间，但我对这个包不太熟悉，因此保留了原来提出的写法）。

library(stringi)
library(stringr)
library(microbenchmark)

# Benchmark input shared by all approaches below.
strings <- c(
  "rfoutputtablep7q10000t20000c100",
  "rfoutputtablep7q1000t20000c100",
  "svmLinear2outputtablep7q20000t20000c100",
  "svmLinear2outputtablep7q5000t20000c100"
)


#' Split encoded strings into a data frame with typed columns
#'
#' Splits every element of `string` at the delimiters in `splititems`, binds
#' the pieces row-wise and converts all columns except `colidschar` to numeric.
#'
#' @param string Character vector of strings to split.
#' @param splititems Character vector of regular expressions; they are joined
#'   with "|" and used as the split delimiters. Column names 2..n are derived
#'   from them (see `replsplit_tonames`).
#' @param colidschar Integer indices of columns that stay character.
#' @param firstcolname Name given to the first column.
#' @param replsplit_tonames Regular expression; every match is removed from
#'   each element of `splititems` to obtain the remaining column names.
#' @return A data frame with one row per element of `string`.
split_to_df <- function(string, splititems, colidschar, firstcolname, replsplit_tonames) {

   split_pattern <- paste(splititems, collapse = "|")
   # Split the `string` argument itself, not a global variable, so the
   # function works on whatever input the caller passes.
   pieces <- stri_split_regex(string, split_pattern)
   data <- as.data.frame(do.call(rbind, pieces), stringsAsFactors = FALSE)
   names(data) <- c(firstcolname, stri_replace_all_regex(splititems, replsplit_tonames, ""))
   # seq_len() is safe for zero columns; list-style indexing keeps a data
   # frame even when only a single column is numeric.
   numericcols <- setdiff(seq_len(ncol(data)), colidschar)
   data[numericcols] <- lapply(data[numericcols], as.numeric)
   data

}

# Benchmark candidate: full stringi solution, including numeric conversion.
stringi_approach_complete <- function() {

  # Each delimiter carries a digit lookahead, so a letter only acts as a
  # separator when a number follows it.
  delimiters <- c("outputtablep(?=\\d)", "q(?=\\d)", "t(?=\\d)", "c(?=\\d)")

  df <- split_to_df(
    string = strings,
    splititems = delimiters,
    colidschar = 1,
    firstcolname = "A",
    replsplit_tonames = "\\(.*\\)|outputtable"
  )
  # class(df$p)
  # [1] "numeric"
  # A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}


# Benchmark candidate: stringi split without numeric conversion, kept as short
# as the other candidates for a like-for-like timing.
stringi_approach_compare <- function() {

  # "outputtablep" is consumed as ONE delimiter. Splitting on "outputtable"
  # and "p(?=\\d)" separately leaves an empty field between the two adjacent
  # matches, which gives six columns for the five names assigned below and
  # shifts every value one column to the right.
  pattern <- "outputtablep(?=\\d)|q(?=\\d)|t(?=\\d)|c(?=\\d)"
  data <- as.data.frame(do.call(rbind, stri_split_regex(strings, pattern)))
  names(data) <- c("A", "p", "q", "t", "c")
  #class(data$p)
  #[1] "factor"   (when stringsAsFactors defaults to TRUE, i.e. R < 4.0)
  #data
  # A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}


# Benchmark candidate: two-step stringr extraction, one column per letter.
stringr_approach <- function() {

  # Step 1: grab the letter together with its digits, e.g. "q10000".
  tagged_p <- str_extract(strings, "p\\d+")
  tagged_q <- str_extract(strings, "q\\d+")
  tagged_t <- str_extract(strings, "t\\d+")
  tagged_c <- str_extract(strings, "c\\d+")

  # Step 2: keep only the digits of each tagged piece.
  res <- data.frame(
    p = str_extract(tagged_p, "\\d+"),
    q = str_extract(tagged_q, "\\d+"),
    t = str_extract(tagged_t, "\\d+"),
    c = str_extract(tagged_c, "\\d+")
  )
  #class(res$p)
  #[1] "factor"
  #res
  # p     q     t   c
  # 1 7 10000 20000 100
  # 2 7  1000 20000 100
  # 3 7 20000 20000 100
  # 4 7  5000 20000 100

}

# Benchmark candidate: base R split on the marker and the bare field letters.
base_approach1 <- function() {

  fields <- strsplit(strings, 'outputtable|p|q|t|c')
  # Column 2 is the empty piece between "outputtable" and "p"; drop it.
  res <- as.data.frame(do.call(rbind, fields)[, -2])
  names(res) <- c("A", "p", "q", "t", "c")
  #class(res$p)
  #[1] "factor"
  #res[-1] <- lapply(res[-1], function(x) as.numeric(as.character(x)))
  #res
  #           A p     q     t   c
  #1         rf 7 10000 20000 100
  #2         rf 7  1000 20000 100
  #3 svmLinear2 7 20000 20000 100
  #4 svmLinear2 7  5000 20000 100


}

# Benchmark candidate: base R split where "outputtable" plus the following
# non-digit character form a single delimiter, so no empty field is produced.
base_approach2 <- function() {

  pieces <- strsplit(strings, 'outputtable\\D|p|q|t|c')
  df <- data.frame(do.call(rbind, pieces))
  df <- setNames(df, c("A", "p", "q", "t", "c"))
  #class(df$p)
  #[1] "factor"
  #df
  # A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}



# Time every candidate; the result table labels each row with the call text.
microbenchmark(
  base_approach1(),
  base_approach2(),
  stringi_approach_compare(),
  stringr_approach(),
  stringi_approach_complete()
)

# Unit: microseconds
#         expr                 min       lq     mean   median       uq       max neval
# base_approach1()            260.139 273.3635 337.1985 285.6005 298.2330  5280.152   100
# base_approach2()            352.906 362.1820 461.8205 374.8140 391.9850  4645.791   100
# stringi_approach_compare()  280.667 297.8380 312.8426 307.3125 319.1545   654.098   100
# stringr_approach()          849.499 867.6570 956.7596 886.2100 923.7115  5651.609   100
# stringi_approach_complete() 319.747 333.9580 461.5521 346.7870 369.0900 10985.052   100

从R中的字符串中提取子字符串和数字

4 个答案: