Question

我有一个数字向量，我想从中选择相距2个单位的对。因此，如果我将矢量p定义如下：

p<-c(2,3,5,7,11,13,17,19,23,29,31,37,41,43,47)

我想选择以下对：

3,5; 5,7; 11,13; 17,19; 29,31; 41,43

我尝试至少把向量中的这些数字选出来，但没有成功：

# Attempt from the question: walk over p and record neighbouring elements that
# differ by 2.
# NOTE: for this p, seq(p) runs from 1 to length(p), so on the last pass p[i+1]
# is NA, the `if` condition becomes NA and the loop stops with an error.
# j also collects the positions i and i+1 rather than the values p[i], p[i+1].
j<-NULL
for(i in seq(p)) if (p[i+1]-p[i]==2) j<-c(j,i,i+1)

但它没有提供所需的输出。谢谢你的帮助。

Answer 1

以下是使用基本R函数的解决方案：

# Positions i where the neighbouring elements p[i] and p[i + 1] differ by
# exactly 2 (abs() also catches a step of -2, i.e. a descending neighbour pair).
dif <- which(abs(diff(p)) == 2)
# One column per pair: row 1 holds p[i], row 2 holds p[i + 1]. Indexing with the
# whole index vector replaces the per-element sapply() call and still gives a
# 2 x 0 matrix (instead of an empty list) when no pair is found.
rbind(p[dif], p[dif + 1])

   # [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    3    5   11   17   29   41
# [2,]    5    7   13   19   31   43

将2改为任何其他值，即可得到向量中相差任意所需单位的元素对。

使用abs是为了处理向量元素未排序的情况。

基准（小规模）

p<-c(2,3,5,7,11,13,17,19,23,29,31,37,41,43,47) # length(p)=15
library(dplyr)
library(data.table) 
library(microbenchmark)
# Candidate implementations, one per answer. They do not all return the same
# thing: some compare only neighbouring elements (diff/lag/shift), others
# compare every pair of elements, and the result types differ (data.frame,
# data.table, matrix, character vector).

# All ordered pairs via expand.grid(); keeps rows where the first value is 2
# larger than the second.
func_Sotos <- function(p){df <- expand.grid(p, p);df[df[,1]-df[,2] == 2,];}
# Neighbouring elements whose absolute difference is 2; sapply() returns one
# pair per column.
func_m0h3n <- function(p){dif=which(abs(diff(p))==2);sapply(dif, function(x) c(p[x],p[x+1]));}
# dplyr pipeline: adds the previous element as lagp, keeps rows with p - lagp == 2.
func_David_B <- function(p){data.frame(p) %>% mutate(lagp = lag(p))  %>% filter(p - lagp == 2)}
# data.table: adds the previous element as p1 with shift(), keeps rows with p - p1 == 2.
func_akrun1 <- function(p){setDT(list(p=p))[, p1 := shift(p)][p-p1 ==2];}
# data.table cross join CJ() of p with itself, filtered to |p - p1| == 2 and
# de-duplicated with unique().
func_akrun2 <- function(p){unique(CJ(p=p, p1=p)[abs(p-p1)==2][.(p=pmin(p,p1), p1=pmax(p, p1))]);}
# Distance matrix from dist(); positions equal to 2 are mapped back to values,
# each pair is sorted and duplicates are removed.
func_RHertel1 <- function(p){d2_mat <- which(as.matrix(dist(p))==2, arr.ind=TRUE);unique(t(apply(cbind(p[d2_mat[,1]],p[d2_mat[,2]]),1,sort)));}
# All 2-element combinations of the sorted values via combn(); keeps those that
# are 2 apart.
func_RHertel2 <- function(p){m2 <- t(combn(sort(p),2));m2[abs(m2[,1] - m2[,2]) == 2,];}
# Distance matrix with the lower triangle set to 0 so each pair is found once.
func_RHertel3 <- function(p){d2 <- as.matrix(dist(p));d2[lower.tri(d2)] <- 0;idx <- which(d2 == 2, arr.ind=TRUE);cbind(p[idx[,1]], p[idx[,2]]);}
# Neighbouring elements via p - lag(p).
# NOTE(review): dplyr is attached earlier in this script, so lag() presumably
# resolves to dplyr's lag() here; stats::lag() does not shift the values of a
# plain vector — confirm before running this without dplyr.
func_Tomas.H <- function(p) {a<-which(p-lag(p)==2);b<-a-1;df<-data.frame(pair1=p[b],pair2=p[a]);df;}
# Loop version: stores "a,b" in slot i when p[i+1] - p[i] == 2, then drops the
# slots that stayed NA. The last statement is an assignment, so the result is
# returned invisibly.
func_Arun.kumar.mahesh <- function(p) {
j<-c()
    for(i in 1:length(p)){
      # sum(..., na.rm=T) turns the NA from p[i+1] on the last pass into 0.
      if(sum(p[i]-p[i+1],na.rm=T)==-2){
        j[i] <- paste(p[i],p[i+1],sep=",")
      }
    }
    j <- j[!is.na(j)]
}
# Time every candidate on the same input p (the neval column of the printed
# result shows 100 runs per expression).
microbenchmark(func_Sotos(p), func_m0h3n(p), func_David_B(p), func_akrun1(p), func_akrun2(p), func_RHertel1(p), func_RHertel2(p), func_RHertel3(p), func_Tomas.H(p), func_Arun.kumar.mahesh(p))

Unit: microseconds
                      expr      min        lq      mean    median        uq      max neval
             func_Sotos(p)  403.770  455.9520  470.6952  469.6390  485.4640  594.961   100
             func_m0h3n(p)   72.713   92.8155  125.7504   98.8040  104.7920 2622.790   100
           func_David_B(p) 1986.340 2148.2335 2260.4203 2207.0450 2292.1615 5547.553   100
            func_akrun1(p) 1321.233 1404.2110 1472.6807 1464.3060 1504.7255 1872.566   100
            func_akrun2(p) 2524.414 2623.2185 2777.9167 2700.2080 2816.5485 5595.885   100
          func_RHertel1(p) 1160.838 1230.5560 1349.9502 1267.7680 1328.7185 4133.504   100
          func_RHertel2(p)  249.362  281.2270  298.3233  296.1975  308.3880  562.027   100
          func_RHertel3(p)  159.968  182.8515  204.4812  203.1675  223.6985  304.538   100
           func_Tomas.H(p)  275.453  316.0865  337.7593  334.6925  350.7320  646.716   100
 func_Arun.kumar.mahesh(p)  123.184  142.2175  174.5620  148.4200  158.0435 2579.163   100

基准（中等规模）

# Fixed seed so that the random input is reproducible.
set.seed(100)
# sample(1000) is a random permutation of 1:1000, so the input is not sorted.
# Candidates that only compare neighbouring elements therefore find different
# pairs than candidates that compare all elements; the timings below are not
# for identical results.
p=sample(1000) # length(p)=1000

microbenchmark(func_Sotos(p), func_m0h3n(p), func_David_B(p), func_akrun1(p), func_akrun2(p), func_RHertel1(p), func_RHertel2(p), func_RHertel3(p), func_Tomas.H(p), func_Arun.kumar.mahesh(p))
Unit: microseconds
                      expr        min          lq         mean      median          uq        max neval
             func_Sotos(p)  30711.250  35060.8410  53640.60456  64290.0265  69224.6310  98474.248   100
             func_m0h3n(p)     41.465     68.9580     88.75608     83.5305    102.1600    196.808   100
           func_David_B(p)    854.835   1067.1160   1220.68932   1150.1960   1261.5205   3934.944   100
            func_akrun1(p)    524.319    748.9200    830.18763    811.5670    896.2995   1549.519   100
            func_akrun2(p)  12986.877  17372.2235  34010.07038  21836.1435  52173.1590  58796.699   100
          func_RHertel1(p)  76813.429 107942.6315 112380.30785 115049.1765 119579.6505 163399.316   100
          func_RHertel2(p) 280275.495 297188.4505 307531.70976 304330.0005 314177.5760 360689.445   100
          func_RHertel3(p)  45957.354  85348.1045 103999.44879 113351.6765 118847.8575 170738.875   100
           func_Tomas.H(p)    154.742    212.4325    263.66812    260.8075    295.0610    536.037   100
 func_Arun.kumar.mahesh(p)    972.619   1072.5250   1192.35206   1152.4500   1238.9850   2483.979   100

Answer 2

嗨，如果需要的结果是数据框，那么试试这个

p <- c(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47)

# Index of the second member of every neighbouring pair that differs by 2.
# diff() is used instead of lag(): without dplyr attached, lag() is
# stats::lag(), which only shifts the time base of a series and leaves the
# values of a plain vector in place, so p - lag(p) would be 0 everywhere and no
# pair would be found.
a <- which(diff(p) == 2) + 1L
# Index of the first member of every pair.
b <- a - 1L

# One row per pair: smaller neighbour in pair1, larger neighbour in pair2.
df <- data.frame(pair1 = p[b],
                 pair2 = p[a])

如果你想要一个向量，那么这应该可以工作

# `a` holds the index of the second member of each pair. Interleave the two
# members of every pair (first, second, first, second, ...) in one vectorised
# step instead of growing `res` with c() inside a loop. rbind() puts one pair
# in each column and as.vector() reads the matrix column by column.
res <- as.vector(rbind(p[a - 1], p[a]))

Answer 3

应该有比这更好的方法，但这里是一个使用expand.grid的思路：

# All ordered combinations (Var1, Var2) of the elements of p.
df <- expand.grid(p, p)
# Keep the rows whose second value is exactly 2 larger than the first and join
# each pair with a comma. paste() is vectorised over the two columns, so the
# data frame does not have to be coerced to a matrix by apply().
pairs <- df[df[, 1] - df[, 2] == -2, ]
paste(pairs[[1]], pairs[[2]], sep = ",")
#[1] "3,5"   "5,7"   "11,13" "17,19" "29,31" "41,43"

如果你想要一个数据框，那么，

# Same filter in the other direction: rows where Var1 is exactly 2 larger than
# Var2, returned as a data frame.
larger_by_two <- (df[[1]] - df[[2]]) == 2
df[larger_by_two, ]
#    Var1 Var2
#18     5    3
#34     7    5
#66    13   11
#98    19   17
#146   31   29
#194   43   41

Answer 4

您可以使用dplyr执行此操作，这将返回数据框中的对：

> library(dplyr)
> data.frame(p) %>% mutate(lagp = lag(p))  %>% filter(p - lagp == 2)
   p lagp
1  5    3
2  7    5
3 13   11
4 19   17
5 31   29
6 43   41

Answer 5

这是另一个使用data.table

library(data.table) 
# Put p in a data.table, add the previous element as column p1 (shift() lags by
# one position by default), then keep the rows where p exceeds p1 by exactly 2.
dt <- as.data.table(list(p = p))
dt[, p1 := shift(p)]
dt[p - p1 == 2]
#    p p1
#1:  5  3
#2:  7  5
#3: 13 11
#4: 19 17
#5: 31 29
#6: 43 41

如果向量p事先没有排好序，请在执行操作之前先用order对其排序。

# Same idea for an unsorted p: order the rows by p first so that shift()
# compares every value with the next smaller one.
dt_sorted <- as.data.table(list(p = p))[order(p)]
dt_sorted[, p1 := shift(p)]
dt_sorted[p - p1 == 2]

更新

使用@RHertel提供的新向量

# New example vector: it additionally contains 4, so 2 and 4 are 2 apart
# although they are not neighbours (3 lies between them).
p <- c(2, 3, 4, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47)
# CJ() builds every combination of p with itself; the first filter keeps the
# combinations that are exactly 2 apart, in either order.
# NOTE(review): the .(p = pmin(p, p1), p1 = pmax(p, p1)) list is passed in the
# i position, not in j — presumably it acts as a lookup against the CJ result
# so that unique() can drop the mirrored duplicates; confirm against the
# data.table `[` documentation before restructuring this line.
unique(CJ(p=p, p1=p)[abs(p-p1)==2][.(p=pmin(p,p1), p1=pmax(p, p1))])
#    p p1
#1:  2  4
#2:  3  5
#3:  5  7
#4: 11 13
#5: 17 19
#6: 29 31
#7: 41 43

Answer 6

有点hacky，但这是另一种方式。

# Row/column positions of every entry of the pairwise distance matrix that
# equals 2 (each pair shows up twice, once per triangle).
pairwise_dist <- as.matrix(dist(p))
d2_mat <- which(pairwise_dist == 2, arr.ind = TRUE)
# Map the positions back to values, one pair per row.
pair_values <- cbind(p[d2_mat[, 1]], p[d2_mat[, 2]])
# Sort every pair ascending, then drop the mirrored duplicates.
sorted_pairs <- t(apply(pair_values, 1, sort))
unique(sorted_pairs)
#     [,1] [,2]
#[1,]    3    5
#[2,]    5    7
#[3,]   11   13
#[4,]   17   19
#[5,]   29   31
#[6,]   41   43

与其他一些答案相反，这不需要矢量p中数字的任何特定顺序。

同样的矢量化版本可能是：

# Pairwise distances between all elements of p.
d2 <- as.matrix(dist(p))
# Blank out everything below the diagonal so that each pair is seen only once.
d2[row(d2) > col(d2)] <- 0
# Positions of the remaining entries that equal 2 ...
idx <- which(d2 == 2, arr.ind = TRUE)
# ... translated back into the values themselves, one pair per row.
cbind(p[idx[, 1]], p[idx[, 2]])

在最后一行，也可以使用paste()代替cbind()，具体取决于所需的输出：

# Alternative last step: return every pair as one "a,b" string.
left <- p[idx[, 1]]
right <- p[idx[, 2]]
paste(left, right, sep = ",")
#[1] "3,5"   "5,7"   "11,13" "17,19" "29,31" "41,43"

以下变体比我之前的建议更简单，也可能更快。

# Every unordered pair of the sorted values, one pair per row.
m2 <- t(combn(sort(p), 2))
# Keep the pairs that are exactly 2 apart. drop = FALSE keeps the result a
# two-column matrix even when only a single pair matches; without it the
# result would collapse to a plain vector in that case.
m2[abs(m2[, 1] - m2[, 2]) == 2, , drop = FALSE]

此版本也会查找在任何整数向量内相隔2个单位的所有值对。

以下是一个例子：

p <- c(13, 19, 43, 29, 47, 17, 7, 37, 2, 41, 3, 4, 31, 11, 5, 23)
#     [,1] [,2]
#[1,]    2    4
#[2,]    3    5
#[3,]    5    7
#[4,]   11   13
#[5,]   17   19
#[6,]   29   31
#[7,]   41   43

如果需要，可以使用以下方法修改输出：

# Every unordered pair of the sorted values, one pair per row.
m2 <- t(combn(sort(p), 2))
# Keep the pairs that are exactly 2 apart. drop = FALSE keeps m2 a matrix even
# when only one pair matches; otherwise m2[, 1] below would fail with
# "incorrect number of dimensions".
m2 <- m2[abs(m2[, 1] - m2[, 2]) == 2, , drop = FALSE]
# Format every pair as one "a,b" string.
paste(m2[, 1], m2[, 2], sep = ",")
#[1] "2,4"   "3,5"   "5,7"   "11,13" "17,19" "29,31" "41,43"

Answer 7

   Use length function instead of seq to get desired output

    # Collect every neighbouring pair whose difference is 2 as an "a,b" string.
    # Slot i stays NA unless p[i] and p[i + 1] form such a pair; the vector is
    # allocated once instead of being grown from an empty c().
    j <- rep(NA_character_, length(p))
    for (i in seq_along(p)) {
      # On the last pass p[i + 1] is NA; sum(..., na.rm = TRUE) turns that NA
      # into 0, so the comparison is FALSE instead of raising an error.
      if (sum(p[i] - p[i + 1], na.rm = TRUE) == -2) {
        j[i] <- paste(p[i], p[i + 1], sep = ",")
      }
    }
    # Drop the slots that were never filled.
    j <- j[!is.na(j)]
    print(j)
    [1] "3,5"   "5,7"   "11,13" "17,19" "29,31" "41,43"

选择矢量中的数字对，其差值等于预定值

7 个答案:

更新