一种计算行之间差异的有效方法

时间:2019-01-31 22:01:00

标签: r dplyr apply

假设我的数据集如下:

 Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
          5.7         2.5          5.0         2.0  virginica
          7.7         3.0          6.1         2.3  virginica
          6.7         3.3          5.7         2.1  virginica
          4.8         3.0          1.4         0.1     setosa
          5.5         4.2          1.4         0.2     setosa
          4.9         3.6          1.4         0.1     setosa
          6.3         3.3          4.7         1.6 versicolor
          5.6         2.9          3.6         1.3 versicolor
          5.9         3.0          4.2         1.5 versicolor


# Nine-row excerpt of the iris measurements: three flowers each of virginica,
# setosa and versicolor (in that row order). Species is a factor whose levels
# are ordered alphabetically, matching the table printed above.
df <- data.frame(
  Sepal.Length = c(5.7, 7.7, 6.7, 4.8, 5.5, 4.9, 6.3, 5.6, 5.9),
  Sepal.Width = c(2.5, 3, 3.3, 3, 4.2, 3.6, 3.3, 2.9, 3),
  Petal.Length = c(5, 6.1, 5.7, 1.4, 1.4, 1.4, 4.7, 3.6, 4.2),
  Petal.Width = c(2, 2.3, 2.1, 0.1, 0.2, 0.1, 1.6, 1.3, 1.5),
  Species = factor(
    rep(c("virginica", "setosa", "versicolor"), each = 3L),
    levels = c("setosa", "versicolor", "virginica")
  )
)

我的目标是

  1. 用 Species == "virginica" 的第一行的 Sepal.Length、Sepal.Width、Petal.Length、Petal.Width 值,分别减去 Species == "setosa" 的每一行的对应值。

    我目前的做法如下:

    # Step 1: take the first virginica row and subtract each of the three
    # setosa rows from it, using only the four numeric columns (df[1:4]).
    # The line shown after each object name is its printed difference row.
    Virginia1_vs_Setosa1a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][1,]
    Virginia1_vs_Setosa1a 
        0.9        -0.5          3.6         1.9
    Virginia1_vs_Setosa2a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][2,]
    Virginia1_vs_Setosa2a
        0.2        -1.7          3.6         1.8
    Virginia1_vs_Setosa3a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][3,]
    Virginia1_vs_Setosa3a
        0.8        -1.1          3.6         1.9
    
  2. 将每个差值行中的四个元素相乘,得到一个数值

      # Step 2: collapse each difference row from step 1 into a single number
      # by multiplying its four entries. The line following each assignment is
      # the same product worked out by hand (illustration only, not R code).
      Virginia1_vs_Setosa1 <-  as.numeric(
                         Virginia1_vs_Setosa1a[1]*Virginia1_vs_Setosa1a[2]*
                          Virginia1_vs_Setosa1a[3]*Virginia1_vs_Setosa1a[4])
                          0.9*-0.5*3.6*1.9 = -3.078
      Virginia1_vs_Setosa2  <- as.numeric(
                         Virginia1_vs_Setosa2a[1]*Virginia1_vs_Setosa2a[2]*
                          Virginia1_vs_Setosa2a[3]*Virginia1_vs_Setosa2a[4])
                          0.2*-1.7*3.6*1.8 = -2.2032
      Virginia1_vs_Setosa3  <- as.numeric(
                         Virginia1_vs_Setosa3a[1]*Virginia1_vs_Setosa3a[2]*
                          Virginia1_vs_Setosa3a[3]*Virginia1_vs_Setosa3a[4])
                          0.8*-1.1*3.6*1.9 = -6.0192
    

类似地,用 virginica 的第二行分别减去 setosa 的每一行,再求乘积。

      # Same two steps for the second virginica row: first the difference
      # against each of the three setosa rows (four numeric columns only) ...
      Virginia2_vs_Setosa1a <- df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][1,]

      Virginia2_vs_Setosa2a <- df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][2,]

      Virginia2_vs_Setosa3a <-  df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][3,]

      # ... then the product of the four entries of each difference row.
      Virginia2_vs_Setosa1 <-  as.numeric(
              Virginia2_vs_Setosa1a[1]*Virginia2_vs_Setosa1a[2]*
              Virginia2_vs_Setosa1a[3]*Virginia2_vs_Setosa1a[4])

      Virginia2_vs_Setosa2  <- as.numeric(
              Virginia2_vs_Setosa2a[1]*Virginia2_vs_Setosa2a[2]*
              Virginia2_vs_Setosa2a[3]*Virginia2_vs_Setosa2a[4])

      Virginia2_vs_Setosa3  <- as.numeric(
              Virginia2_vs_Setosa3a[1]*Virginia2_vs_Setosa3a[2]*
              Virginia2_vs_Setosa3a[3]*Virginia2_vs_Setosa3a[4])

              # Remove the intermediate difference rows; only the products are kept.
              rm(Virginia2_vs_Setosa1a, Virginia2_vs_Setosa2a, 
              Virginia2_vs_Setosa3a)

同样地,用 virginica 的第三行分别减去 setosa 的每一行,再求乘积。

       # Same two steps for the third virginica row: first the difference
       # against each of the three setosa rows (four numeric columns only) ...
       Virginia3_vs_Setosa1a <- df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][1,]

       Virginia3_vs_Setosa2a <- df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][2,]

       Virginia3_vs_Setosa3a <-  df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][3,]

       # ... then the product of the four entries of each difference row.
       Virginia3_vs_Setosa1 <-  as.numeric(
                 Virginia3_vs_Setosa1a[1]*Virginia3_vs_Setosa1a[2]*
                 Virginia3_vs_Setosa1a[3]*Virginia3_vs_Setosa1a[4])

       Virginia3_vs_Setosa2  <- as.numeric(
                 Virginia3_vs_Setosa2a[1]*Virginia3_vs_Setosa2a[2]*
                 Virginia3_vs_Setosa2a[3]*Virginia3_vs_Setosa2a[4])


       Virginia3_vs_Setosa3  <- as.numeric(
                 Virginia3_vs_Setosa3a[1]*Virginia3_vs_Setosa3a[2]*
                 Virginia3_vs_Setosa3a[3]*Virginia3_vs_Setosa3a[4])

         # Remove the intermediate difference rows; only the products are kept.
         rm(Virginia3_vs_Setosa1a, Virginia3_vs_Setosa2a, 
            Virginia3_vs_Setosa3a)

最后,用这九个乘积构造如下的 3 × 3 矩阵:

# Assemble the nine products into a 3 x 3 matrix. matrix() fills column by
# column by default, so column j holds the products for virginica row j and
# row i holds those for setosa row i.
matrix(c(Virginia1_vs_Setosa1, Virginia1_vs_Setosa2, Virginia1_vs_Setosa3, Virginia2_vs_Setosa1, Virginia2_vs_Setosa2, Virginia2_vs_Setosa3,
  Virginia3_vs_Setosa1, Virginia3_vs_Setosa2, Virginia3_vs_Setosa3), nrow=3, ncol=3)


       [,1]     [,2]    [,3]
[1,] -3.0780   0.0000  4.9020
[2,] -2.2032 -26.0568 -8.8236
[3,] -6.0192 -17.3712 -4.6440

如您所见,我的解决方案非常笨拙且效率低下。如果有人可以向我展示一种实现相同结果的有效方法,我将非常感激。

2 个答案:

答案 0 :(得分:1)

您可以使用双重 for 循环来完成此操作。*apply 函数族也许能给出别的解法,但下面这种方法是可行的。

# Keep only the two species being compared and split their rows by species.
# droplevels() removes the now-unused "versicolor" level, so split() returns
# exactly two pieces in level order: sp[[1]] is setosa, sp[[2]] is virginica.
keep <- df$Species != "versicolor"
f <- droplevels(df$Species[keep])
sp <- split(df[keep, ], f)

# Size the result from the data rather than hard-coding 3 x 3:
# one row per setosa flower, one column per virginica flower.
n_setosa <- nrow(sp[[1]])
n_virginica <- nrow(sp[[2]])
res <- matrix(0, n_setosa, n_virginica)

# seq_len() yields an empty sequence for zero rows, whereas 1:0 would
# iterate over c(1, 0).
for (i in seq_len(n_setosa)) {
  for (j in seq_len(n_virginica)) {
    # Product of the four measurement differences, virginica row j minus
    # setosa row i; column 5 (Species) is excluded.
    res[i, j] <- prod(sp[[2]][j, -5] - sp[[1]][i, -5])
  }
}

res
#        [,1]     [,2]    [,3]
#[1,] -3.0780   0.0000  4.9020
#[2,] -2.2032 -26.0568 -8.8236
#[3,] -6.0192 -17.3712 -4.6440

答案 1 :(得分:0)

对于这种特殊情况,您可以从outer借用一些想法

# Measurement columns (1:4) of the two species being compared.
virginica_rows <- df[df$Species == "virginica", 1:4]
setosa_rows <- df[df$Species == "setosa", 1:4]

# One plain numeric vector per flower. Splitting on seq_len(nrow(.)) instead
# of a hard-coded 1:3 keeps one row per piece for any number of rows.
X <- lapply(split(virginica_rows, seq_len(nrow(virginica_rows))), unlist)
Y <- lapply(split(setosa_rows, seq_len(nrow(setosa_rows))), unlist)
n_virginica <- length(X)

# Product of the element-wise differences for each aligned pair (v, w).
FUN <- function(l1, l2) mapply(function(v, w) prod(v - w), l1, l2)

# Expand both lists so that every virginica vector meets every setosa vector,
# using the same rep() scheme as base R's outer(): each Y element is repeated
# length(X) times in a row, and X is cycled to the same total length.
Y <- rep(Y, rep.int(length(X), length(Y)))
if (length(X) > 0) {
  X <- rep(X, times = ceiling(length(Y) / length(X)))
}

# X varies fastest, so filling by row gives one row per setosa flower and one
# column per virginica flower.
matrix(FUN(X, Y), ncol = n_virginica, byrow = TRUE)

在更一般的情况下,您需要先生成所有可能的行对,然后按您的公式逐对计算。使用 data.table 的写法大致如下:

library(data.table)
# Convert df to a data.table in place.
setDT(df)
# Sort by Species, then add numid: a running row counter within each species.
setorder(df, Species)[, numid := rowid(Species)]

# One piece per (Species, numid) combination, i.e. one piece per original row.
parts <- split(df, by=c("Species", "numid"))
# Cross join of the pieces with themselves, kept in input order (sorted=FALSE).
# NOTE(review): CJ() is given lists of tables here -- confirm that the
# installed data.table version accepts list input.
combis <- CJ(parts, parts, sorted=FALSE)
# Handle one pair per group (by = row index): record the species and numid of
# both sides and the product of the differences of their first four columns.
# The trailing [...] keeps only pairs from different species and drops the
# first column (presumably the row-index column added by `by` -- verify).
combis[, .(
        Species1=V1[[1]][,Species], 
        numid1=V1[[1]][,numid],
        Species2=V2[[1]][,Species], 
        numid2=V2[[1]][,numid],
        differ=prod(V1[[1]][, 1:4] - V2[[1]][, 1:4])), 
    by=seq_len(combis[,.N])][
        Species1!=Species2, -1L]