我有一个数据框:
# Example data: one sentence per row, kept as character (not factor).
df <- data.frame(
  sentences = c(
    "An apple hangs on an apple tree",
    "Bananas are yellow and tasty",
    " Bananas and apples",
    "The apple is tasty",
    "Apples are healthy. Apples are juicy.",
    "Bananarama are an English female pop music vocal duo"
  ),
  stringsAsFactors = FALSE
)

# Lower-cased patterns to look for.
# IMPORTANT: with real data this list will contain thousands of patterns.
list_of_patterns <- tolower(c("Apple", "Banana"))
我的结果:
sentences
An apple hangs on an apple tree
Bananas are yellow and tasty
Bananas and apples
The apple is tasty
Apples are healthy. Apples are juicy.
Bananarama are an English female pop music vocal duo
我的期望:对模式列表中的每个模式,分别统计包含该模式的行数,而不是把所有模式合在一起统计。(输出为数据框)。
Apple : 4 # number of rows containing this pattern
Banana : 3 # number of rows containing this pattern
答案 0 :(得分:3)
这是一个 base R 的解决方案。
# For every pattern, count the rows of df$sentences that match it, ignoring case.
sapply(
  list_of_patterns,
  function(pattern) {
    matching_rows <- grep(pattern, df$sentences, ignore.case = TRUE)
    length(matching_rows)
  }
)
# apple banana
# 4 3
对于大型数据集,我的解决方案似乎是迄今为止已发布的解决方案(1、2)中最快的。用户 Daniel O 在评论(comment)中对 sapply/length 的写法做了进一步改进:
直接对列表各成员调用 lengths
会更快,在小型数据集上尤其明显。
library(microbenchmark)
# Count, for each entry of the global `list_of_patterns`, how many rows of
# df$sentences match it (case-insensitive). Returns one count per pattern,
# named after the patterns.
Rui <- function(df){
  count_matching_rows <- function(pattern) {
    row_idx <- grep(pattern, df$sentences, ignore.case = TRUE)
    length(row_idx)
  }
  sapply(list_of_patterns, count_matching_rows)
}
# Count, for each entry of the global `list_of_patterns`, how many rows of
# df$sentences match it (case-insensitive), by taking the lengths of the
# per-pattern grep() index vectors. Returns a named integer vector.
Rui2 <- function(df){
  # simplify = FALSE keeps the result a list. With the default simplification,
  # patterns that all match the same number of rows are collapsed into a
  # matrix/vector, and lengths() would then report 1 for every cell instead of
  # the number of matching rows.
  matches <- sapply(list_of_patterns, grep, df$sentences,
                    ignore.case = TRUE, simplify = FALSE)
  lengths(matches)
}
# Count rows matching each entry of the global `list_of_patterns` using
# stringr::str_detect() on the lower-cased sentences, then summing the
# per-pattern columns of the resulting detection matrix.
Ronak <- function(df){
  lowered <- tolower(df$sentences)
  detected <- sapply(list_of_patterns, stringr::str_detect, string = lowered)
  colSums(detected > 0)
}
# Count rows matching each entry of the global `list_of_patterns` by summing
# the logical vector from grepl() on the lower-cased sentences.
GKi <- function(df){
  sapply(list_of_patterns, function(pattern) {
    # tolower() is evaluated again for every pattern here; GKi2 is the
    # variant that computes it once up front.
    found <- grepl(pattern, tolower(df$sentences))
    sum(found)
  })
}
# Same counting as GKi, but the sentences are lower-cased once and reused for
# every entry of the global `list_of_patterns`.
GKi2 <- function(df){
  lowered <- tolower(df$sentences)
  count_hits <- function(pattern) {
    sum(grepl(pattern, lowered))
  }
  sapply(list_of_patterns, count_hits)
}
# Benchmark the candidate solutions (Rui, Rui2, Ronak, GKi, GKi2) on a copy of
# `df` that has been doubled `n` times, and print the timings relative to the
# fastest expression, ordered by median.
#
# df: data frame handed to every candidate function.
# n:  number of times `df` is doubled with rbind() before timing (default 1).
test <- function(df, n = 1){
  # seq_len() gives an empty loop for n = 0; seq.int(0) is c(1, 0) and would
  # have doubled the data twice.
  for(i in seq_len(n)) df <- rbind(df, df)
  mb <- microbenchmark(
    Rui = Rui(df),
    Rui2 = Rui2(df),
    Ronak = Ronak(df),
    GKi = GKi(df),
    GKi2 = GKi2(df)
  )
  print(mb, unit = 'relative', order = 'median')
}
# Run the benchmark with the data doubled once (n = 1); the commented table
# below is the recorded output.
test(df, 1)
#Unit: relative
# expr min lq mean median uq max neval cld
# Rui2 1.000000 1.000000 1.0000000 1.000000 1.000000 1.00000000 100 a
# Rui 1.104683 1.095170 0.7443870 1.078442 1.065026 0.03807400 100 a
# GKi2 1.073707 1.109255 0.7726559 1.134426 1.097193 0.05178062 100 a
# GKi 1.396863 1.324172 0.9401923 1.334219 1.326366 0.05203571 100 a
# Ronak 2.182422 2.160915 1.7950065 2.175393 2.962058 0.14150318 100 b
# Run the benchmark with the data doubled ten times (n = 10, i.e. 2^10 times
# as many rows); the commented table below is the recorded output.
test(df, 10)
#Unit: relative
# expr min lq mean median uq max neval cld
# Rui2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
# Rui 1.067239 1.012265 1.017496 1.002532 1.001989 1.367107 100 a
# GKi2 1.622306 1.616520 1.673410 1.625944 1.627558 1.942846 100 b
# Ronak 1.852324 1.756859 1.768850 1.718348 1.756507 1.637960 100 c
# GKi 2.519431 2.315098 2.325065 2.303203 2.347620 1.991377 100 d
答案 1 :(得分:2)
我们可以使用 stringr
中的 str_detect
:
# Detect every pattern in the lower-cased sentences (one column per pattern),
# then sum each column to get the number of matching rows per pattern.
colSums(
  sapply(
    X = list_of_patterns,
    FUN = stringr::str_detect,
    string = tolower(df$sentences)
  )
)
# apple banana
# 4 3
答案 2 :(得分:1)
您可以对 grepl
的结果使用 sum
,并在 sapply
中对每个模式执行。
# Count matching rows per pattern by summing the logical vector from grepl()
# applied to the lower-cased sentences.
sapply(
  list_of_patterns,
  function(pattern) {
    hits <- grepl(pattern, tolower(df$sentences))
    sum(hits)
  }
)
# apple banana
# 4 3
或者先把 tolower 的结果存储起来:
# Lower-case the sentences once and reuse the result for every pattern.
y <- tolower(df$sentences)
sapply(
  list_of_patterns,
  function(pattern) {
    sum(grepl(pattern, y))
  }
)
或者像 @Ronak-Shah 已经给出的那样使用 colSums
,但只使用 base R:
# Base R only: grepl() yields one logical column per pattern over the
# lower-cased sentences; colSums() turns each column into a row count.
colSums(
  sapply(
    X = list_of_patterns,
    FUN = grepl,
    tolower(df$sentences)
  )
)
答案 3 :(得分:0)
其他选项
# Tidyverse variant. Every call is namespace-qualified so the snippet runs
# without relying on library(dplyr) / library(purrr) / library(stringr) having
# been attached beforehand.

# Lower-case the sentences in place so matching is case-insensitive.
df <- dplyr::mutate(df, sentences = tolower(sentences))

# Name each pattern after itself so the result is labelled by pattern.
list_of_patterns <- purrr::set_names(tolower(c("Apple", "Banana")))

# For each pattern, count the rows whose sentence contains it.
purrr::map_dbl(
  list_of_patterns,
  ~ sum(stringr::str_detect(df$sentences, .x))
)