使用由另一个向量定义的自定义区间对向量中的值进行分箱

时间:2019-06-13 17:46:24

标签: r

我将一些值分为以下几个间隔:

> set.seed(22)
> a <- rnorm(50)
> b <- as.data.frame(table(Hmisc::cut2(a, m = 10)))
> b
             Var1 Freq
1 [-1.616,-0.793)   10
2 [-0.793,-0.200)   10
3 [-0.200, 0.301)   10
4 [ 0.301, 0.937)   10
5 [ 0.937, 3.253]   10

我想取另一个向量c中的值,并将它们分配到由b$Var1中的区间所定义的bin中。

> c <- runif(50, -1, 3)
> c
 [1] -0.36167553 -0.42019310  1.80365545  1.45542530 -0.72798537  0.32368285  1.68209984 -0.07971160  2.69304696 -0.84131974  0.89430681
[12] -0.38260232  2.78302235  2.91256761 -0.20692439  2.21367929  2.40534034  0.26349751  0.51897997  0.10485985 -0.14338538  1.65355414
[23]  2.68974930 -0.38767144  0.75481723  2.98473148  0.79046750  2.26079307 -0.24748383 -0.18502040  2.82674089  2.97552886  1.25323374
[34]  2.11271998  2.92941982 -0.62746180  0.53751411  1.34383497  0.02002254  2.04000343  0.23576506  1.67230419  0.68045395 -0.32637800
[45]  0.33067028 -0.58080654  0.38844488 -0.34026266  1.54217623  2.51062797

当我使用findInterval尝试此操作时,c中的值将分配给以下bin:

> interval_c <- findInterval(c, b$Var1)
> interval_c
 [1] 0 0 1 1 0 0 1 0 2 0 0 0 2 2 0 2 2 0 0 0 0 1 2 0 0 2 0 2 0 0 2 2 1 2 2 0 0 1 0 2 0 1 0 0 0 0 0 0 1 2

我曾期望看到以下内容:

> interval_c
 [1] 2 2 5 5 2 4 5 3 5 1 4

...等

有没有一种方法可以将c中的值分配到b中正确的bin中?谢谢!

2 个答案:

答案 0 :(得分:2)

将参数onlycuts = TRUE添加到Hmisc::cut2以仅返回切点(请参见?cut2),然后对得到的切点向量应用findInterval:

# Reproduce the question's data: fix the RNG seed so a and c match the
# values shown in the question.
set.seed(22)
# Wrapping an assignment in parentheses also prints the assigned value.
(a <- rnorm(50))
#>  [1] -0.512139088  2.485183678  1.007826150  0.292814572 -0.208959361
#>  [6]  1.858092390 -0.066026405 -0.162764952 -0.199860680  0.300561734
#> [11] -0.763907283  0.081961904  0.743028275 -0.084022194 -0.792894517
#> [16] -0.922153631  0.861562379  2.002942188  0.936551013 -1.615734872
#> [21] -0.575056589 -0.003973089 -0.676112603 -1.049628275 -0.543280568
#> [26]  0.556144530  0.252837717 -0.901814675  0.824391356 -1.560279752
#> [31]  0.537994003 -1.268353887  0.640519828 -0.535761818 -1.019642817
#> [36] -0.807881506  0.056825225  0.950211404 -1.126763499 -0.201168295
#> [41] -0.228495853  0.558716260  0.748745433  1.918204369  1.007207812
#> [46]  3.253349400 -0.161748014  0.333755546 -1.178672976  1.077604331
# The values to be binned, drawn uniformly from [-1, 3].
(c <- runif(50, -1, 3))
#>  [1] -0.36167553 -0.42019310  1.80365545  1.45542530 -0.72798537
#>  [6]  0.32368285  1.68209984 -0.07971160  2.69304696 -0.84131974
#> [11]  0.89430681 -0.38260232  2.78302235  2.91256761 -0.20692439
#> [16]  2.21367929  2.40534034  0.26349751  0.51897997  0.10485985
#> [21] -0.14338538  1.65355414  2.68974930 -0.38767144  0.75481723
#> [26]  2.98473148  0.79046750  2.26079307 -0.24748383 -0.18502040
#> [31]  2.82674089  2.97552886  1.25323374  2.11271998  2.92941982
#> [36] -0.62746180  0.53751411  1.34383497  0.02002254  2.04000343
#> [41]  0.23576506  1.67230419  0.68045395 -0.32637800  0.33067028
#> [46] -0.58080654  0.38844488 -0.34026266  1.54217623  2.51062797

# onlycuts = TRUE makes cut2 return the numeric cut points themselves
# rather than a factor of interval labels.
(cuts <- Hmisc::cut2(a, m = 10, onlycuts = TRUE))
#> [1] -1.6157349 -0.7928945 -0.1998607  0.3005617  0.9365510  3.2533494
# With numeric break points, findInterval() gives each element of c the
# index of the interval of cuts it falls into.
findInterval(c, cuts)
#>  [1] 2 2 5 5 2 4 5 3 5 1 4 2 5 5 2 5 5 3 4 3 3 5 5 2 4 5 4 5 2 3 5 5 5 5 5
#> [36] 2 4 5 3 5 3 5 4 2 4 2 4 2 5 5

由reprex package(v0.3.0)创建于2019-06-13

答案 1 :(得分:0)

findInterval需要一个向量来对值进行分箱。这是将b$Var1转换为向量的一种方法-

数据-

library(tidyverse)

# Recover the numeric break points from the interval labels in b$Var1
# (e.g. "[-1.616,-0.793)"): findInterval() needs a sorted numeric vector of
# break points, not a factor of labels.
vec <- b %>%
  # Keep everything between the opening and closing bracket characters.
  mutate(b_tmp = str_sub(Var1, 2, -2)) %>%
  # Split "lower,upper" into one column per bound.
  separate(b_tmp, c("minI", "maxI"), sep = ",") %>%
  # across() replaces the superseded mutate_at(); as.numeric() ignores the
  # padding spaces found in labels such as "[ 0.301, 0.937)".
  mutate(across(all_of(c("minI", "maxI")), as.numeric)) %>%
  # Pool both bounds into one sorted vector.
  {sort(c(pull(., minI), pull(., maxI)))} %>%
  # Adjacent intervals share a bound, so drop the repeated values.
  unique()

vec
[1] -1.616 -0.793 -0.200  0.301  0.937  3.253

# c() is a base R function, so avoid reusing the name for an object;
# the values to bin are stored in vec_c instead.
vec_c <- c(-0.36167553, -0.42019310, 1.80365545, 1.45542530, -0.72798537, 0.32368285)

# For each value in vec_c, find the index of the interval of vec it falls into.
interval_c <- findInterval(vec_c, vec) 
[1] 2 2 5 5 2 4