使用椭球(Vincenty)距离查找多个数据集之间的最近点

时间:2019-04-18 22:55:32

标签: r dataframe geospatial

注意:此问题是上一个问题「r - Finding closest coordinates between two large data sets」的后续问题。

我希望根据两个数据集中的坐标,为数据集1中的每个条目找出数据集2中距离最近的条目。数据集1包含180,000行(其中仅有1,800个唯一坐标),数据集2包含4,500行(4,500个坐标全部唯一)。

上面引用的帖子中给出了一种解决方法,但它使用 RANN::nn2,计算的是欧几里得距离,而不是椭球(Vincenty)距离。

当前代码:

# For every df1 point (columns 2:3) look up its single nearest neighbour
# (k = 1) among the df2 points (columns 2:3) with RANN::nn2, which measures
# plain Euclidean distance on the raw coordinate values. The result is
# written into columns 4 and 5 of df1.
df1[, c(4, 5)] <- as.data.frame(
  RANN::nn2(data = df2[, c(2, 3)], query = df1[, c(2, 3)], k = 1)
)
# Column 4 currently holds df2 row numbers; swap each one for the value in
# the first column of df2 (SRC_ID) at that row.
df1[, 4] <- df2[[1]][df1[[4]]]

    #    id HIGH_PRCN_LAT HIGH_PRCN_LON SRC_ID   distance
    # 1   1      52.88144     -2.873778     44  0.7990743
    # 2   2      57.80945     -2.234544   5688  2.1676868
    # 3   4      34.02335     -3.098445  61114  1.4758202
    # 4   5      63.80879     -2.439163     23  4.2415854
    # 5   6      53.68881     -7.396112     54  3.6445416
    # 6   7      63.44628     -5.162345     23  2.3577811
    # 7   8      21.60755     -8.633113    440  8.2123762
    # 8   9      78.32444      3.813290     76 11.4936496
    # 9  10      66.85533     -3.994326     55  1.9296370
    # 10  3      51.62354     -8.906553     54  3.2180026

我怀疑该解决方案将涉及geosphere::distVincentyEllipsoid,但不确定如何将其集成到现有代码中。

数据:

R 版本信息

platform        x86_64-w64-mingw32
version.string  R version 3.5.3 (2019-03-11)

数据集1输入(未去重,保留重复坐标)

# Sample of data set 1: ten query points with an id, a latitude and a
# longitude. SRC_ID and distance start as NA placeholders (columns 4 and 5).
# Every id is written with the L suffix so the column stays integer; a single
# bare number in the vector would silently coerce the whole column to double.
df1 <- structure(list(
    id = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 3L),
    HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529,
    63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207,
    78.324442654172, 66.85532539759495, 51.623544596),
    HIGH_PRCN_LON = c(-2.87377812157822, -2.23454414781635, -3.0984448341,
    -2.439163178635, -7.396111601421454, -5.162345043546359,
    -8.63311254098095, 3.813289888829932, -3.994325961186105,
    -8.9065532453272409),
    SRC_ID = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
    distance = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)),
    row.names = c(NA, 10L), class = "data.frame")

数据集2输入

# Sample of data set 2: ten candidate points, each with a SRC_ID identifier,
# a latitude and a longitude.
df2 <- structure(
  list(
    SRC_ID = c(
      55L, 54L, 23L, 11L, 44L,
      21L, 76L, 5688L, 440L, 61114L
    ),
    HIGH_PRCN_LAT = c(
      68.46506, 50.34127, 61.16432, 42.57807, 52.29879,
      68.52132, 87.83912, 55.67825, 29.74444, 34.33228
    ),
    HIGH_PRCN_LON = c(
      -5.0584, -5.95506, -5.75546, -5.47801, -3.42062,
      -6.99441, -2.63457, -2.63057, -7.52216, -1.65532
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

1 个答案:

答案 0 :(得分:1)

使用 distVincentyEllipsoid 函数:

library(geosphere)

# Inner apply: for each df1 row, taken as (longitude, latitude) = columns 3
# and 2, compute the Vincenty ellipsoid distance to every df2 point. This
# yields a matrix with one row per df2 point and one column per df1 point.
# Outer apply: for each df1 column pick the smallest distance and the SRC_ID
# (first column of df2) of the df2 row that produced it.
# t(): flip the 2 x nrow(df1) result so that each df1 point becomes one row
# with the columns SRC_ID and distance.
t(apply(
  apply(
    df1[, c(3, 2)],
    1,
    function(pt) distVincentyEllipsoid(pt, df2[, c(3, 2)])
  ),
  2,
  function(d) c(SRC_ID = df2[which.min(d), 1], distance = min(d))
))
       SRC_ID   distance
    1      44   74680.48
    2    5688  238553.51
    3   61114  137385.18
    4      23  340642.70
    5      44  308458.73
    6      23  256176.88
    7     440  908292.28
    8      76 1064419.47
    9      55  185119.29
    10     54  251580.45

只需使用 df1[, c(4, 5)] <- t(apply(...)) 即可将结果赋值给 df1 的对应列。



旧方案:使用 rgeos::gDistance。它计算的是笛卡尔(平面)距离,而不是椭球距离;上面更新后的答案正是在下面这个方案的基础上改进得到的:

library(sp)
library(rgeos)

# Convert both data frames to spatial point objects. Coordinates are passed
# as (longitude, latitude), i.e. columns 3 and 2; the remaining df1 columns
# and the df2 SRC_ID are kept as attribute data.
df1rgsp <- SpatialPointsDataFrame(df1[, c(3, 2)], df1[, -c(3, 2)])
df2rgsp <- SpatialPointsDataFrame(df2[, c(3, 2)], data.frame(SRC_ID = df2[, 1]))

# gDistance(df1rgsp, df2rgsp, byid = TRUE) returns a matrix with one ROW per
# df2 point and one COLUMN per df1 point. To get the nearest df2 point for
# each df1 point we must therefore scan the columns (MARGIN = 2). Scanning
# the rows instead answers the opposite question (nearest df1 point for each
# df2 point) and only runs at all when both data sets have the same number
# of rows.
# For each df1 point keep the row number of the closest df2 point and the
# distance to it, then transpose so each df1 point is one row.
df1[, c(4, 5)] <- t(apply(
  gDistance(df1rgsp, df2rgsp, byid = TRUE),
  2,
  function(x) c(SRC_ID = which.min(x), distance = min(x))
))

# Replace the df2 row numbers with the corresponding `SRC_ID`
df1[, 4] <- df2[as.integer(df1[, 4]), 1] # same as what you have in the Q

    #    id HIGH_PRCN_LAT HIGH_PRCN_LON SRC_ID   distance
    # 1   1      52.88144     -2.873778     44  0.7990743
    # 2   2      57.80945     -2.234544   5688  2.1676868
    # 3   4      34.02335     -3.098445  61114  1.4758202
    # 4   5      63.80879     -2.439163     23  4.2415854
    # 5   6      53.68881     -7.396112     54  3.6445416
    # 6   7      63.44628     -5.162345     23  2.3577811
    # 7   8      21.60755     -8.633113    440  8.2123762
    # 8   9      78.32444      3.813290     76 11.4936496
    # 9  10      66.85533     -3.994326     55  1.9296370
    # 10  3      51.62354     -8.906553     54  3.2180026

相关问题