R string distance on large data sets

Time: 2015-08-28 13:38:03

Tags: r dplyr levenshtein-distance fuzzy-logic large-data

I am trying to compare two sets of addresses, one small (pams: ~30,000 rows) and one large (nppes: ~4.5M rows). I am looking for close matches on first and last name, followed by the best subsequent match on address (so far I have only used subsets of the zip code). Below is the code I have written so far.

The problems are:

- It is slow. The code below is somewhat more optimized than my last run, but it still ran for several hours without finishing.
- It restricts matching to last names that start with the same first letter. I added that restriction to try to get the code to run at all, but I would rather not have the additional constraint.

I would greatly appreciate any suggestions on how to improve this.

library(dplyr)
library(stringdist)

# Keep names as plain character vectors (not factors) so substr/stringdist behave as expected
npi_1 <- data.frame(npi = nppes$npi,
                    first_name = tolower(nppes$first_name),
                    last_name = tolower(nppes$last_name),
                    zip = nppes$b_post_code,
                    stringsAsFactors = FALSE)
pams_1 <- data.frame(pams_id = pams$ID,
                     npi = pams$NPI,
                     first_name = tolower(pams$First.Name),
                     last_name = tolower(pams$Last.Name),
                     zip = gsub("-", "", pams$Zip),
                     stringsAsFactors = FALSE)

result <- data.frame(pams_id = "", npi = "", match = "", stringsAsFactors = FALSE)

for (i in seq_len(nrow(pams_1)))
{
  pams_2 <- pams_1[i, ]
  # Restrict candidates to last names starting with the same letter
  npi_2 <- npi_1 %>% filter(substr(last_name, 1, 1) == substr(pams_2$last_name, 1, 1))
  # Jaro-Winkler distance on last and first name, keeping only close matches
  npi_2 <- npi_2 %>% mutate(match_last = stringdist(last_name, pams_2$last_name, method = "jw")) %>%
    filter(match_last <= 0.1)
  npi_2 <- npi_2 %>% mutate(match_first = stringdist(first_name, pams_2$first_name, method = "jw")) %>%
    filter(match_first <= 0.1)
  # Zip code similarity at 3-digit, 5-digit and full length
  npi_2 <- npi_2 %>% mutate(match_zip3 = stringdist(substr(zip, 1, 3), substr(pams_2$zip, 1, 3), method = "jw"))
  npi_2 <- npi_2 %>% mutate(match_zip5 = stringdist(substr(zip, 1, 5), substr(pams_2$zip, 1, 5), method = "jw"))
  npi_2 <- npi_2 %>% mutate(match_zip = stringdist(zip, pams_2$zip, method = "jw"))
  # Weighted composite score; lower is better
  npi_2 <- npi_2 %>% mutate(match_score = ((match_last + match_first) * 1.4) + (match_zip3 * 1.2) + match_zip5 + (match_zip * 0.8))
  npi_2 <- npi_2 %>% arrange(match_score)
  # Keep the single best candidate for this pams row
  pair_1 <- c(pams_2$pams_id[1], npi_2$npi[1], npi_2$match_score[1])
  result[i, ] <- pair_1
}
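One direction I have been considering for the speed problem (untested sketch below) is to build the first-letter blocks of npi_1 once, outside the loop, instead of filtering the ~4.5M-row frame on every one of the ~28,000 iterations. The names npi_blocks, block_key and result_list are just placeholders, the zip scoring is collapsed to a single full-zip distance for brevity, and the sketch assumes the data frames were built with stringsAsFactors = FALSE as above.

# Untested sketch: pre-split npi_1 by first letter of last_name so the big
# filter happens once instead of once per pams row.
library(dplyr)
library(stringdist)

npi_1$block_key <- substr(npi_1$last_name, 1, 1)
npi_blocks <- split(npi_1, npi_1$block_key)        # one data frame per first letter

result_list <- vector("list", nrow(pams_1))

for (i in seq_len(nrow(pams_1))) {
  pams_2 <- pams_1[i, ]
  npi_2 <- npi_blocks[[substr(pams_2$last_name, 1, 1)]]
  if (is.null(npi_2)) next                         # no candidates with this initial

  npi_2 <- npi_2 %>%
    mutate(match_last  = stringdist(last_name,  pams_2$last_name,  method = "jw"),
           match_first = stringdist(first_name, pams_2$first_name, method = "jw")) %>%
    filter(match_last <= 0.1, match_first <= 0.1)
  if (nrow(npi_2) == 0) next                       # no close name match

  npi_2 <- npi_2 %>%
    mutate(match_zip   = stringdist(zip, pams_2$zip, method = "jw"),
           match_score = (match_last + match_first) * 1.4 + match_zip) %>%
    arrange(match_score)

  result_list[[i]] <- data.frame(pams_id = pams_2$pams_id,
                                 npi     = npi_2$npi[1],
                                 match   = npi_2$match_score[1],
                                 stringsAsFactors = FALSE)
}
result <- bind_rows(result_list)                   # drops the NULL entries

This keeps the same first-letter restriction, so it does not address the second issue, but it should cut the per-iteration cost considerably.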

0 Answers