Question

我一直在检查，但我没有找到答案，让我了解如何做到这一点....提前感谢您的帮助。

我需要根据ID对矩阵中的值进行平均...例如（但我有4000乘4000 ......）：

        [,1]     [,2]     [,3]     [,4]      [,5]
[1,]    NA         A         B        C       B
[2,]     A       11.0     10.0      8.0      4.0
[3,]     B        6.0     10.0     15.0      5.0
[4,]     C       12.0     11.0     10.0      4.0
[5,]     B       12.0     11.0     10.0      5.0

我希望结果如下：

        [,1]    [,2]      [,3]    [,4]
[1,]     NA       A        B        C  
[2,]     A       11.0     7.0      8.0      
[3,]     B        9.0     7.8     12.5         
[4,]     C       12.0     7.5     10.0

非常感谢。

Answer 1

正如@akrun指出的那样，用矩阵中的实际行和列标记行和列并不是一个好主意。原因是您无法在矩阵中混合数据类型（有关详细信息，请参阅this）。相反，您可以使用rownames/colnames标记矩阵。下面是一个非常简单的基础R解决方案。我确信使用data.table或dplyr会有更快的方法，但这样做会有。请注意，如果唯一ID的数量大于1000左右，这将非常慢。

AverageMatVals <- function(mat) {  ## This way is very natural but highly inefficient
    uniRow <- unique(rownames(mat))
    uniCol <- unique(colnames(mat))
    newmat <- matrix(numeric(0), nrow=length(uniRow), ncol=length(uniCol))
    rownames(newmat) <- uniRow
    colnames(newmat) <- uniCol

    for (i in 1:nrow(newmat)) {
        rowMatch <- which(rownames(mat)==uniRow[i])
        for (j in 1:ncol(newmat)) {
            colMatch <- which(colnames(mat)==uniCol[j])
            newmat[i,j] <- round(mean(mat[rowMatch,colMatch]), 1)
        }
    }
    newmat
}    

mat <- matrix(c(11,6,12,12,10,10,11,11,8,15,10,10,4,5,4,5), nrow=4)
rownames(mat) <- c("A","B","C","B")
colnames(mat) <- c("A","B","C","B")

AverageMatVals(mat)
   A    B    C
A 11  7.0  8.0
B  9  7.8 12.5
C 12  7.5 10.0

下面是一个更快的方法，应该在超过5,000行/列左右的矩阵上表现良好。

AverageMatValsFast <- function(mat) {
    uniRow <- unique(rownames(mat))
    uniCol <- unique(colnames(mat))
    lenRow <- length(uniRow)
    v1 <- rep(1, ncol(mat))
    v2 <- rep(1, lenRow)

    tempMat <- t(vapply(1:lenRow, function(x) {
            rowMatch <- which(rownames(mat)==uniRow[x])
            if (length(rowMatch)>1) {
                colMeans(mat[rowMatch,])
            } else {
                mat[rowMatch,]
            }}, v1))

    meanMat <- vapply(1:length(uniCol), function(x) {
                colMatch <- which(colnames(mat)==uniCol[x])
                if (length(colMatch)>1) {
                    round(rowMeans(tempMat[,colMatch]), 1)
                } else {
                    round(tempMat[,colMatch], 1)
                }}, v2)

    remove(tempMat)    ## This could be a very large 
    gc()               ## object thus we need to clean it up
    rownames(meanMat) <- uniRow
    colnames(meanMat) <- uniCol

    meanMat
}

以下是一些时间安排：

set.seed(13379)
matTest1 <- matrix(sample(10^6, 4000^2, replace = TRUE), nrow = 4000, ncol = 4000)
myLetters <- expand.grid(LETTERS, LETTERS, stringsAsFactors = FALSE)
myLetters <- sapply(1:nrow(myLetters), function(x) paste(myLetters[x, ],collapse=""))
rownames(matTest1) <- sample(myLetters, 4000, replace = TRUE)
colnames(matTest1) <- sample(myLetters, 4000, replace = TRUE)

system.time(a <- AverageMatValFast(matTest1))
 user  system elapsed 
 0.77    0.00    0.77

system.time(b <- AverageMatVal(matTest1))
  user  system elapsed 
 59.50    0.02   59.56

all(sapply(1:nrow(a), function(x) all(abs(a[x,]-b[x,])<0.2)))   ## can't test equality b/c of rounding
[1]  TRUE

这是一个非常大的例子：

set.seed(11)
matTest2 <- matrix(sample(10^6, 6000^2, replace = TRUE), nrow = 6000, ncol = 6000)
myLetters <- expand.grid(LETTERS, LETTERS, LETTERS[sample(26,5)], stringsAsFactors = FALSE)
myLetters <- sapply(1:nrow(myLetters), function(x) paste(myLetters[x, ],collapse=""))
rownames(matTest2) <- sample(myLetters, 6000, replace = TRUE)
colnames(matTest2) <- sample(myLetters, 6000, replace = TRUE)

system.time(t1 <- AverageMatValFast(matTest2))
 user  system elapsed 
 3.54    0.04    3.58 

dim(t1)
[1]  2836  2831

的更新

以下是评论中OP建议的示例。这些名字是自由获得的here。

set.seed(333) myNames <- read.csv("http://www.quietaffiliate.com/Files/CSV_Database_of_First_Names.csv", stringsAsFactors = FALSE) myNames <- tolower(myNames$firstname) length(myNames) [1] 5494 head(myNames) [1] "aaron" "aaron" "abbey" "abbie" "abby" "abdul" sampNames1 <- sample(myNames, 4000, replace = TRUE) sampNames2 <- sample(myNames, 4000, replace = TRUE) mat1 <- matrix(sample(10^6, 4000^2, replace = TRUE), nrow = 4000, ncol = 4000) rownames(mat1) <- sampNames1 colnames(mat1) <- sampNames2 system.time(t2 <- AverageMatValsFast(mat1)) user system elapsed 2.32 0.19 2.51 t2[1:10, 1:5] wen cristen sherell sona denna jovan 624688.0 141679.5 551442.5 568128.8 405943.2 benjamin 662494.2 658096.5 435062.5 521144.0 424704.8 wendolyn 869093.5 856608.0 446543.5 715201.0 234873.5 liane 495856.0 615054.0 456647.5 304897.0 509781.5 alexia 430558.0 369075.0 724121.0 617018.0 404110.5 nobuko 302176.5 249807.0 664577.0 458983.5 416712.5 lynsey 583306.0 247513.7 466308.2 384851.2 569038.0 eunice 503505.3 410133.0 304032.3 354720.7 415618.0 arnita 667288.5 388770.0 661687.0 368347.0 495238.5 eugenia 572900.2 568346.5 613246.2 525411.1 482589.8

将矩阵中的值平均为R中的新矩阵

1 个答案: