修改

Question

这是我的树。第一列是分支的标识符，其中0是主干，L是左边的第一个分支，R是右边的第一个分支。 LL是第二次分叉后最左边的分支等。变量length包含每个分支的长度。

> tree
  branch length
1      0     20
2      L     12
3     LL     19
4      R     19
5     RL     12
6    RLL     10
7    RLR     12
8     RR     17

这是这棵树的示意图

enter image description here

这棵树上有两个位置

pos1 = tree[3,]; pos1$length = 12
pos2 = tree[6,]; pos2$length = 3

我编写了下面这个算法，用来计算树上任意两点之间沿分支的最短距离。

#' Distance along the branches between two positions on a tree
#'
#' @param tree Data frame with one row per branch: `branch` holds the label
#'   ("0" for the trunk, then one "L"/"R" character per fork, e.g. "RLL") and
#'   `length` holds the branch length.
#' @param pos1,pos2 One-row data frames with the same columns, where `branch`
#'   is the branch the position sits on and `length` is the offset of the
#'   position along that branch, measured from the branch's root-side end.
#' @return A single number.
#'   * Same branch: `pos1$length - pos2$length` (signed).
#'   * One branch is an ancestor of the other: the path length, negated when
#'     `pos1` is the position on the ancestor branch.
#'   * Otherwise (the branches diverge at a fork): the unsigned path length.
distance <- function(tree, pos1, pos2) {
  if (identical(pos1$branch, pos2$branch)) {
    return(pos1$length - pos2$length)
  }

  # Split a branch label into single characters, always starting at the trunk.
  branch_path <- function(branch) {
    path <- strsplit(as.character(branch), "")[[1]]
    if (path[1] != "0") {
      path <- c("0", path)
    }
    path
  }

  # Length of the branch called `label`; fail loudly instead of silently
  # propagating numeric(0) when the label is missing or duplicated.
  branch_length <- function(label) {
    len <- tree$length[tree$branch == label]
    if (length(len) != 1L) {
      stop("branch '", label, "' must occur exactly once in `tree`",
           call. = FALSE)
    }
    as.numeric(len)
  }

  # Total length of the branches path[from], ..., path[length(path) - 1],
  # i.e. every branch that is crossed entirely on the way up `path`, excluding
  # the final branch (only partly travelled). path[1] is the "0" trunk marker,
  # so the label of the branch at depth i is path[2:i] pasted together.
  inner_length <- function(path, from) {
    last <- length(path) - 1L
    if (from > last) {
      return(0)
    }
    labels <- vapply(
      from:last,
      function(i) paste(path[2:i], collapse = ""),
      character(1)
    )
    sum(vapply(labels, branch_length, numeric(1)))
  }

  path1 <- branch_path(pos1$branch)
  # The original code built this path from pos1's path by mistake.
  path2 <- branch_path(pos2$branch)

  n_shared <- min(length(path1), length(path2))
  mismatch <- which(path1[seq_len(n_shared)] != path2[seq_len(n_shared)])

  if (length(mismatch) == 0L) {
    # One path is a prefix of the other: the shorter one is an ancestor branch.
    pos1_is_ancestor <- length(path1) <= length(path2)
    if (pos1_is_ancestor) {
      near <- pos1
      far <- pos2
      far_path <- path2
    } else {
      near <- pos2
      far <- pos1
      far_path <- path1
    }
    # Remainder of the ancestor branch + whole branches in between + offset
    # on the descendant branch.
    dist <- inner_length(far_path, n_shared + 1L) +
      far$length +
      (branch_length(near$branch) - near$length)
    if (pos1_is_ancestor) {
      dist <- -dist
    }
    return(dist)
  }

  # The paths diverge: walk from each position down to the common fork.
  n_common <- mismatch[1] - 1L
  inner_length(path1, n_common + 1L) +
    inner_length(path2, n_common + 1L) +
    pos1$length + pos2$length
}

我认为算法运行正常。然后我遍历所有感兴趣的位置。

# Fill the pairwise matrix by calling distance() once per ordered pair of
# positions, i.e. length(allpositions)^2 calls in total.
# NOTE(review): `allpositions` and `mat` are not defined in this snippet; the
# loop variables are used both as matrix indices and as the position arguments
# of distance(), so this reads as pseudo-code -- confirm against the real code.
for (i in allpositions){
   for (j in allpositions){
      mat[i,j] = distance(tree, i, j)
   }
}

问题是我的树非常大，大约有50000个位置，而我想计算任意两个位置之间的距离，也就是说需要计算的距离数量是50000 ^ 2的好几倍。这要花非常长的时间！你能帮我改进我的代码吗？

Answer 1

这是一个临时答案，旨在帮助提问者找出其算法中的问题。

我在每一层循环中都添加了cat输出；运行代码并查看新生成的tree_cat.txt文件，它会提示你问题出在哪里。 m矩阵中的各个单元（例如m[1, 1]）被重复写入了多次。所以需要检查一下索引。

好消息是在矩阵单元中有121 * 121 = 14641次写入操作。所以问题实际上是在分配新的矩阵值时使用的索引。

# Diagnostic script: runs the nested patch-to-patch loops and logs every loop
# variable and every matrix write to "tree_cat.txt", so that the faulty matrix
# indexing can be seen in the trace.

# Example tree: one row per branch with its label and its length.
tree <- read.table(text="branch length
1      0     20
2      L     12
3     LL     19
4      R     19
5     RL     12
6    RLL     10
7    RLR     12
8     RR     17", header=TRUE)

# Square zero matrix with one row/column per unit of branch length in the tree.
m = matrix(0, ncol=sum(tree$length), nrow=sum(tree$length))
# cat() that terminates the output with a newline.
catn <- function(...) cat(..., "\n")
# Everything printed inside the loops is redirected to the file given below.
capture.output(
for (originbranch in 1:nrow(tree)) {
    catn("originbranch = ", originbranch)
    for (originpatch in 1:tree$length[originbranch]) {
        catn("  originpatch = ", originpatch)
        for (destinationbranch in 1:nrow(tree)) {
            catn("    destinationbranch = ", destinationbranch)
            for (destinationpatch in 1:tree$length[destinationbranch]){
                catn("      destinationpatch = ", destinationpatch)
                # Branch labels split into single characters.
                split_dest = unlist(strsplit(tree$branch[destinationbranch], ""))
                split_orig =  unlist(strsplit(tree$branch[originbranch], ""))
                # depth = number of leading characters the two labels share.
                depth = 0
                for (i in 1:min(c(length(split_orig), length(split_dest)))) {
                    catn("        i = ", i)
                    if (split_dest[i] == split_orig[i]){
                        depth = depth + 1
                    } else {
                        break
                    }
                }
                distdest = 0
                distorig = 0
                # Add up the lengths of the branches named by the label prefixes
                # of length depth..length(label), for origin and destination.
                for (upperbranch in depth:length(split_orig)){
                    catn("        upperbranch_orig = ", upperbranch)
                    distorig = distorig + tree$length[tree$branch == paste(split_orig[1:upperbranch], collapse="")]
                }
                for (upperbranch in depth:length(split_dest)){
                    catn("        upperbranch_dest = ", upperbranch)
                    distdest = distdest + tree$length[tree$branch == paste(split_dest[1:upperbranch], collapse="")]
                }
                # NOTE(review): both corrections use destinationpatch; originpatch
                # is not used in the distance itself -- confirm this is intended.
                distorig = distorig + destinationpatch - tree$length[originbranch]
                distdest = distdest + destinationpatch - tree$length[destinationbranch]
                dist = distorig + distdest
                # The indices are patch numbers within a branch, not global patch
                # numbers, so different branch pairs overwrite the same cells.
                m[originpatch, destinationpatch] = dist ## PROBLEMATIC INDEXING!!
                catn(sprintf("        ----->   Matrix element written: m[%d, %d] = %d", originpatch, destinationpatch, dist))
            }
        }
    }
}, file = "tree_cat.txt")

Answer 2

我对你的像素距离概念并不完全清楚，但根据我的理解，下面的代码提供了一个函数pixel_dist，它计算沿树枝指定的两个像素点之间的像素距离。

我使用igraph把树映射为一个图：分支是图的边，分支的交叉点是图的顶点，然后用图函数完成基本的顶点间距离计算。

library(igraph)

# Assign vertex names to the branch intersections.
# A branch label is read as a binary number (L = 0, R = 1) and prefixed with a
# leading 1 bit, so the vertex at the far end of a branch with an n-character
# label gets the id 2^n + <binary value>; the vertex at its near end is that
# id halved (integer division).
temp <- gsub("R", "1", gsub("L", "0", tree$branch))
temp <- strsplit(temp, split = character(0))
tree$upper_vert <- vapply(
  temp,
  function(bits) {
    n <- length(bits)
    2^n + sum(2^((n - 1):0) * as.numeric(bits))
  },
  numeric(1)
)
tree$lower_vert <- as.integer(tree$upper_vert / 2)
# The trunk is special-cased: it is renamed and runs from vertex 0 to vertex 1.
tree$branch[tree$branch == "0"] <- "trunk"
tree[tree$branch == "trunk", c("lower_vert", "upper_vert")] <- c(0, 1)

# Create the graph of the tree: one directed edge per branch, pointing from
# the lower (root-side) vertex to the upper vertex.
tree_graph <- graph.data.frame(tree[, c("lower_vert", "upper_vert")],
                               directed = TRUE)
E(tree_graph)$label <- paste(tree$branch, tree$length, sep = "-")
E(tree_graph)$branch <- tree$branch
E(tree_graph)$length <- tree$length
E(tree_graph)$weight <- tree$length

# Assign x & y positions for plotting.
# The level of vertex id v is floor(log2(v)) + 1. The original used sqrt(v),
# which agrees for small ids but puts e.g. vertex 8 one level too low.
# pmax() keeps log2() finite for the root vertex "0", which is overridden below.
vert_id <- as.numeric(V(tree_graph)$name)
V(tree_graph)$y <- floor(log2(pmax(vert_id, 1))) + 1
V(tree_graph)["0"]$y <- 0
V(tree_graph)["1"]$y <- 1
# Centre each level around x = 0.
V(tree_graph)$x <- vert_id - 3 * (2^(V(tree_graph)$y - 2)) + .5
V(tree_graph)["0"]$x <- 0
V(tree_graph)["1"]$x <- 0
plot(tree_graph)

# Distances between vertices, weighted by branch length.
# The lengths are stored on the edges. The original passed V(tree_graph)$length,
# a vertex attribute that is never set, and only worked because igraph then
# falls back to the "weight" edge attribute.
branch_weights <- E(tree_graph)$length
# Undirected distances between all vertex pairs.
vert_dist <- shortest.paths(tree_graph, weights = branch_weights, mode = "all")
# Distances along directed edges only; Inf when no directed path exists.
vert_dist_dir <- shortest.paths(tree_graph, weights = branch_weights,
                                mode = "in")

# Distances between the end (upper) vertices of every pair of branches.
edge_node <- get.edges(tree_graph, E(tree_graph))  # vertices of each edge
end_vert <- edge_node[, 2]
branch_names <- E(tree_graph)$branch

# Element [i, j] is vert_dist[end_vert[j], end_vert[i]], the same orientation
# the original sapply() construction produced; drop = FALSE keeps a matrix even
# for a tree with a single branch.
brnch_dist <- t(vert_dist[end_vert, end_vert, drop = FALSE])
dimnames(brnch_dist) <- list(branch_names, branch_names)

# Directed counterpart: finite in [b1, b2] only when a directed path exists
# between the two end vertices.
brnch_dist_dir <- t(vert_dist_dir[end_vert, end_vert, drop = FALSE])
dimnames(brnch_dist_dir) <- list(branch_names, branch_names)
#' Pixel distance between two positions given as (branch, pixel offset)
#'
#' The arithmetic treats `pix1`/`pix2` as offsets measured from the root-side
#' end of the respective branch.
#'
#' @param b1,b2 Branch names, as used in the dimnames of `brnch_dist`.
#' @param pix1,pix2 Pixel offsets of the positions along `b1` and `b2`.
#' @param brnch_dist Matrix of undirected distances between the end vertices
#'   of the branches.
#' @param brnch_dist_dir Matrix of directed distances between the same
#'   vertices; `Inf` where no directed path exists.
#' @param graph igraph tree whose edges carry `branch` and `length`
#'   attributes. Defaults to the script-level `tree_graph`, which the original
#'   function read directly.
#' @return A single number. When `b1 == b2` this is `pix2 - pix1` and can be
#'   negative; otherwise it is the path length between the two positions.
pixel_dist <- function(b1, pix1, b2, pix2, brnch_dist, brnch_dist_dir,
                       graph = tree_graph) {
  edges <- E(graph)
  branch_len <- function(b) edges$length[edges$branch == b]
  len1 <- branch_len(b1)
  len2 <- branch_len(b2)
  tip_dist <- brnch_dist[b1, b2]  # distance between the branches' end vertices

  if (!is.infinite(brnch_dist_dir[b1, b2])) {
    # Directed path from b1's end vertex to b2's (this includes b1 == b2):
    # (len1 - pix1) up to b1's end, tip_dist across, (len2 - pix2) back down.
    tip_dist - len2 + len1 + pix2 - pix1
  } else if (!is.infinite(brnch_dist_dir[b2, b1])) {
    # Mirror image of the case above with the roles of b1 and b2 swapped.
    # The original had "+ pix2 - pix1" here, which e.g. gave 22 instead of 18
    # for L/pixel 3 -> trunk/pixel 5 in the example tree.
    tip_dist + len2 - len1 - pix2 + pix1
  } else {
    # The branches diverge at a fork: tip_dist contains both full branch
    # lengths, which are replaced by the two pixel offsets.
    tip_dist - len2 - len1 + pix2 + pix1
  }
}

# Example: distance between pixel 3 on branch "L" and pixel 5 on branch "R".
pixel_dist(b1="L",pix1=3, b2="R", pix2=5, brnch_dist=brnch_dist, brnch_dist_dir=brnch_dist_dir)

具有分支名称，长度和方向的树图的图

enter image description here

我还不清楚你打算如何把像素距离放进像素矩阵，但你可以用pixel_dist函数或类似的函数来计算矩阵中的各个值。

修改

上面的代码已被修改，以便在计算像素距离时正确考虑边缘方向。

树上的像素距离

2 个答案:

修改