基于R中的交替字符拆分字符串

时间:2015-04-01 05:14:13

标签: r

我正试图找出一种有效的方法来分割像

这样的字符串
"111110000011110000111000"

进入矢量

[1] "11111" "00000" "1111" "0000" "111" "000"

其中“0”和“1”可以是任何交替的字符。

9 个答案:

答案 0 :(得分:88)

尝试

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  

更新

使用stri_extract_all_regex

修改@ rawr的解决方案
library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  


stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"  
#[10] "000"  

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"  
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
#[15] "D"       "aa"      "BB"     

基准

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, 
            perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3), 
                rle(strsplit(x3, "")[[1]])$lengths, 
                colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]), 
   mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
   res_vector=rep(NA_character_,nchar(x3))
  res_vector[1]=substr(x3,1,1)
  counter=1
  old_tmp=''

   for (i in 2:nchar(x3)) {
    tmp=substr(x3,i,i)
    if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
    } else {
    res_vector[counter+1]=tmp
    counter=counter+1
    }
  old_tmp=tmp
   }

 res_vector[!is.na(res_vector)]
  }


 richard <- function(){
     cs <- cumsum(
     rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
   )
   stri_sub(x3, c(1, head(cs + 1, -1)), cs)
  }

 nicola<-function(x) {
   indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
   substring(x,indices[-length(indices)]+1,indices[-1])
 }

 richard2 <- function() {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
 }

system.time(akrun())
# user  system elapsed 
# 0.003   0.000   0.003 

system.time(thelate())
#   user  system elapsed 
#  0.272   0.001   0.274 

system.time(rawr())
# user  system elapsed 
#  0.397   0.001   0.398 

system.time(ananda())
#  user  system elapsed 
# 3.744   0.204   3.949 

system.time(Colonel())
#   user  system elapsed 
#  0.154   0.001   0.154 

system.time(Cryo())
#  user  system elapsed 
# 0.220   0.005   0.226 

system.time(richard())
#  user  system elapsed 
# 0.007   0.000   0.006 

system.time(nicola(x3))
# user  system elapsed 
# 0.190   0.001   0.191 

稍微大一点的字符串,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed 
#0.166   0.000   0.155 
system.time(richard())
#  user  system elapsed 
# 0.606   0.000   0.569 
system.time(richard2())
#  user  system elapsed 
# 0.518   0.000   0.487 

system.time(Colonel())
#  user  system elapsed 
# 9.631   0.000   9.358 


library(microbenchmark)
 microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
 #Unit: relative
 #     expr      min       lq     mean   median       uq      max neval cld
 # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
 #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
 # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 

注意:尝试运行其他方法,但需要很长时间

数据

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

答案 1 :(得分:25)

主题的变化:

x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

答案 2 :(得分:21)

您可以使用substrread.fwf以及rle(尽管它不太可能像任何基于正则表达式的解决方案一样高效):

x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#      V1      V2      V3      V4      V5      V6 
# "11111" "00000"  "1111"  "0000"   "111"   "000"

这种方法的一个优点是,它可以工作,比如说:

x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
             rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"

unlist(read.fwf(textConnection(x), 
                rle(strsplit(x, "")[[1]])$lengths, 
                colClasses = "character"))
#        V1        V2        V3        V4        V5        V6 
#   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d" 

答案 3 :(得分:20)

另一种方法是在交替数字之间添加空格。这适用于任何两个,而不仅仅是1和0。然后在空白处使用strsplit

x <- "111110000011110000111000"

(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "


strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

或者更为简洁,正如@akrun所指出的那样:

strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

同时将\\d更改为\\w也可以

x  <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"      

x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000" 

您还可以使用\K(而不是明确使用捕获组,\\1\\2),我不会看到它们使用过多,也不知道如何使用解释一下:}

AFAIK \\K重置报告的匹配的起点,并且不再包含任何以前消耗的字符,基本上丢弃了与该点匹配的所有内容。

x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "

答案 4 :(得分:14)

原创方法:这是一种包含rle() stringi 方法。

x <- "111110000011110000111000"
library(stringi)

cs <- cumsum(
    rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

或者,您可以使用length

中的stri_sub()参数
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
    stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"  

针对效率进行了更新:在意识到base::strsplit()stringi::stri_split_boundaries()更快之后,这是我之前使用基本功能的答案的更高效版本。

set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)

system.time({
    cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
    substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
#   user  system elapsed 
#  0.686   0.012   0.697 

答案 5 :(得分:11)

另一种方法,使用mapply

x="111110000011110000111000"

with(rle(strsplit(x,'')[[1]]), 
     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  

答案 6 :(得分:8)

它并不是OP所寻求的(简洁的R代码),但我想在Rcpp中尝试一下,结果相对简单,大约比其快5倍最快的基于R的答案。

library(Rcpp)

cppFunction(
  'std::vector<std::string> split_str_cpp(std::string x) {

  std::vector<std::string> parts;

  int start = 0;

  for(int i = 1; i <= x.length(); i++) {
      if(x[i] != x[i-1]) {
        parts.push_back(x.substr(start, i-start));
        start = i;
      } 
  }

  return parts;

  }')

并测试这些

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

提供以下输出

> split_str_cpp(str1)
[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
> split_str_cpp(x1)
 [1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"  
> split_str_cpp(x2)
 [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
[15] "D"       "aa"      "BB"   

基准测试显示它比R解决方案快5-10倍。

akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]

richard1 <- function(x3){
  cs <- cumsum(
    rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
  )
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

richard2 <- function(x3) {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

library(microbenchmark)
library(stringi)

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)

比较:

Unit: relative
              expr      min       lq     mean   median       uq      max neval
 split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
         akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
      richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
      richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20

答案 7 :(得分:2)

简单for循环解决方案

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)

for (i in 2:nchar(x)) {
  tmp=substr(x,i,i)
  if (tmp==substr(x,i-1,i-1)) {
    res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
  } else {
    res_vector[length(res_vector)+1]=tmp
  }
}

res_vector

#[1] "aaaaa"  "bb"  "ccccccc"  "bbb"  "a"  "d"  "11111"  "00000"  "222"  "aaa"  "bb"  "cc"  "d"  "11"  "D"  "aa"  "BB"

或者使用预先分配的结果向量可能稍快一点

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''

for (i in 2:nchar(x)) {
  tmp=substr(x,i,i)
  if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
  } else {
    res_vector[counter+1]=tmp
    counter=counter+1
  }
  old_tmp=tmp
}

res_vector[!is.na(res_vector)]

答案 8 :(得分:1)

这个怎么样:

s <- "111110000011110000111000"

spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))

# [1] "11111" "00000" "1111"  "0000"  "111"   "000"