在R中联接(大型、复杂的)表时,我常常很难检查该操作的结果。请参见下面这个最小可重现示例:
library(data.table)
# Minimal reproducible example: two small tables that share the key `id`.
table1 <- data.table(id = c("A", "B", "C"), price = c(12, 11, 10))
table2 <- data.table(
  id = c("A", "C", "C", "D"),
  wharehouse = c("Colorado", "Texas", "New York", "Oregon")
)

# Full outer join on `id`. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned, which would silently change the join type.
table_join <- merge(table1, table2, by = "id", all.x = TRUE, all.y = TRUE)
预期输出不是火箭科学:
# Literal (dput-style) definition of the desired 5-row result: columns id,
# price and wharehouse plus a `join` column labelling how each row matched.
structure(list(id = c("A", "B", "C", "C", "D"), price = c(12,
11, 10, 10, NA), wharehouse = c("Colorado", NA, "Texas", "New York",
"Oregon"), join = c("INNER JOIN", "LEFT JOIN", "INNER JOIN. MULTIPLE RIGHT JOIN",
"INNER JOIN. MULTIPLE RIGHT JOIN", "RIGHT JOIN")), row.names = c(NA,
-5L), class = c("data.table", "data.frame"))
但是我想知道有多少行在另一张表中没有匹配、恰好有一个匹配、有多个匹配……
我想要一些信息(也许是新的一列)来描述合并的结果。请看下面一个可能的示例:
# Hand-label the five merged rows to illustrate the desired `join` column.
table_join[1:5, join := c(
  "INNER JOIN",
  "LEFT JOIN",
  "INNER JOIN. MULTIPLE RIGHT JOIN",
  "INNER JOIN. MULTIPLE RIGHT JOIN",
  "RIGHT JOIN"
)]
这里是“预期的输出”
# Literal (dput-style) definition of the expected output: the merged rows
# (id, price, wharehouse) together with the `join` label for every row.
structure(list(id = c("A", "B", "C", "C", "D"), price = c(12,
11, 10, 10, NA), wharehouse = c("Colorado", NA, "Texas", "New York",
"Oregon"), join = c("INNER JOIN", "LEFT JOIN", "INNER JOIN. MULTIPLE RIGHT JOIN",
"INNER JOIN. MULTIPLE RIGHT JOIN", "RIGHT JOIN")), row.names = c(NA,
-5L), class = c("data.table", "data.frame"))
可以肯定的是,在大表中可能会出现新的情况:完全笛卡尔联接,或者 id 在另一张表中存在、
但匹配到的值是 NA(在我的示例中,假设另一张表中存在 id 为 D 的行,但其价格为 NA)。
此外,这将有助于我在合并多个表时跟踪复杂的情况
R中是否存在能执行这种操作的合并包装器?我记得当我还是一名年轻、怀揣梦想的研究助理时,Stata能够做一些类似的事情,但是我不知道如何在R中自动完成它。
答案 0 :(得分:1)
问题本身很容易,可以通过跟踪每个表的ID频率来解决。下面是我的解决方案,但可能需要对大型表进行一些优化。
EDIT1:
错误修复:类别曾被覆盖;现在在条件中使用 is.na(eval(parse(text = track.col)))
代替 eval(track.col)。
此外,现在可以为 join 列分配自定义名称。
EDIT2
同一功能的可读性强的版本
library(data.table)
#' Full outer merge of two data.tables with a column describing each match
#'
#' @param x,y data.tables to merge. Neither input is modified.
#' @param on Character vector with the names of the key columns, present in
#'   both `x` and `y`.
#' @param track.col Single string: name of the column that receives the label.
#' @return The merged data.table (all rows of `x` and `y`) with an extra
#'   character column named `track.col`. Its values are "INNER JOIN",
#'   "LEFT JOIN", "RIGHT JOIN", "INNER JOIN. MULTIPLE LEFT JOIN",
#'   "INNER JOIN. MULTIPLE RIGHT JOIN" or
#'   "INNER JOIN. MULTIPLE LEFT RIGHT JOIN".
track.merge <- function(x, y, on, track.col) {
  stopifnot(
    inherits(x, "data.table"), inherits(y, "data.table"),
    is.character(on), length(on) > 0L,
    is.character(track.col), length(track.col) == 1L
  )

  # Count rows per key in separate summary tables. Adding a temporary column
  # to `x`/`y` with `:=` would modify the caller's tables by reference and
  # destroy any column they already have under that name.
  keys_x <- x[, .N, by = on]
  keys_y <- y[, .N, by = on]

  table_join <- merge(x, y, by = on, all.x = TRUE, all.y = TRUE)

  # Look up, for every merged row, how often its key occurs on each side.
  # NA means the key is absent from that side. `x.N` explicitly refers to the
  # count column of the summary table, even if `table_join` has its own `N`.
  n_x <- keys_x[table_join, on = on, x.N]
  n_y <- keys_y[table_join, on = on, x.N]

  multi_x <- !is.na(n_x) & n_x > 1L
  multi_y <- !is.na(n_y) & n_y > 1L

  # Later assignments take precedence over earlier ones.
  status <- rep("INNER JOIN", nrow(table_join))
  status[multi_x] <- "INNER JOIN. MULTIPLE LEFT JOIN"
  status[multi_y] <- "INNER JOIN. MULTIPLE RIGHT JOIN"
  status[multi_x & multi_y] <- "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"
  status[is.na(n_x)] <- "RIGHT JOIN"
  status[is.na(n_y)] <- "LEFT JOIN"

  # `(track.col) :=` uses the string held in `track.col` as the column name;
  # no eval(parse()) is needed.
  table_join[, (track.col) := status]
  table_join[]
}
测试:
#' Full outer merge with a match-description column (vector-based variant)
#'
#' @param x,y data.tables to merge. Neither input is modified.
#' @param on Character vector with the names of the key columns, present in
#'   both `x` and `y`.
#' @param track.col Single string: name of the column that receives the label.
#' @return The merged data.table (all rows of `x` and `y`) with an extra
#'   character column named `track.col` describing how each row was matched.
track.merge2 <- function(x, y, on, track.col) {
  stopifnot(
    inherits(x, "data.table"), inherits(y, "data.table"),
    is.character(on), length(on) > 0L,
    is.character(track.col), length(track.col) == 1L
  )

  table_join <- merge(x, y, by = on, all.x = TRUE, all.y = TRUE)

  # Per-key row counts live in throw-away summary tables, so the caller's
  # `x` and `y` are never touched (no temporary `N` column added by
  # reference). NA means the key does not occur on that side.
  n_x <- x[, .N, by = on][table_join, on = on, x.N]
  n_y <- y[, .N, by = on][table_join, on = on, x.N]

  # "" marks rows that are still unclassified. which() drops the NA entries
  # produced by comparing a missing count.
  track_ids <- character(nrow(table_join))
  track_ids[which(n_x > 1 & n_y > 1)] <- "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"
  track_ids[which(n_x > 1 & track_ids == "")] <- "INNER JOIN. MULTIPLE LEFT JOIN"
  track_ids[which(n_y > 1 & track_ids == "")] <- "INNER JOIN. MULTIPLE RIGHT JOIN"
  track_ids[is.na(n_x)] <- "RIGHT JOIN"
  track_ids[is.na(n_y)] <- "LEFT JOIN"
  track_ids[track_ids == ""] <- "INNER JOIN"

  # set() adds the column by reference; `[[<-` would copy the merged table.
  data.table::set(table_join, j = track.col, value = track_ids)
  table_join[]
}
考虑具有1000k行和两列合并的表,速度要慢约2.5倍:
# Test data: ids that occur once, several times, or in only one of the tables.
table1 <- data.table(
  id = c("A", "C", "C", "B", "F", "H", "H"),
  price = c(12, 11, 10, 13, 10, 15, 3)
)
table2 <- data.table(
  id = c("A", "C", "C", "F", "F", "H", "L"),
  wharehouse = c(
    "Colorado", "Texas", "New York", "Washington", "Illinois", "Florida",
    "Kansas"
  )
)
> table1
id price
1: A 12
2: C 11
3: C 10
4: B 13
5: F 10
6: H 15
7: H 3
> table2
id wharehouse
1: A Colorado
2: C Texas
3: C New York
4: F Washington
5: F Illinois
6: H Florida
7: L Kansas
> track.merge(table1, table2, "id", "join")
id price wharehouse join
1: A 12 Colorado INNER JOIN
2: B 13 <NA> LEFT JOIN
3: C 11 Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
4: C 11 New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
5: C 10 Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
6: C 10 New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
7: F 10 Washington INNER JOIN. MULTIPLE RIGHT JOIN
8: F 10 Illinois INNER JOIN. MULTIPLE RIGHT JOIN
9: H 15 Florida INNER JOIN. MULTIPLE LEFT JOIN
10: H 3 Florida INNER JOIN. MULTIPLE LEFT JOIN
11: L NA Kansas RIGHT JOIN
> all.equal(track.merge2(x, y, on = "id", "join"), track.merge(x, y, on = "id", "join"))
[1] TRUE
有关丢失ID的评论
从数据库角度来看,使用NA作为id并没有多大意义。id是关系数据库中用来关联各表的键。如果存在缺少id的记录,将它们与其他表相关联是没有意义的,所以我要么将它们过滤掉,要么在合并表之前尝试修复它们。
答案 1 :(得分:1)
这是我使用dplyr
的解决方案。就像@Gerald T所说的那样,您可以通过查看合并表来获取所有信息。
您可以使用此代码获得频率表。
library(tidyverse)
# For every id of table1, count the rows of table2 that matched it
# (a left-join row without a match has wharehouse = NA and counts as 0).
table1 %>%
  left_join(table2) %>%
  group_by(id) %>%
  summarise(num_wharehouse = sum(!is.na(wharehouse)))
Joining, by = "id" # A tibble: 3 x 2 id num_wharehouse <chr> <int> 1 A 1 2 B 0 3 C 2
然后,您可以获得所需的统计信息。
# Step 1: matches per id, as in the frequency table.
# Step 2: collapse those counts into one row of summary statistics.
table1 %>%
  left_join(table2) %>%
  group_by(id) %>%
  summarise(num_wharehouse = sum(!is.na(wharehouse))) %>%
  summarise(
    merged = sum(num_wharehouse > 0),
    not_merged = sum(num_wharehouse == 0),
    single_match = sum(num_wharehouse == 1),
    multi_match = sum(num_wharehouse > 1)
  )
Joining, by = "id" # A tibble: 1 x 4 merged not_merged single_match multi_match <int> <int> <int> <int> 1 2 1 1 1
答案 2 :(得分:1)
我的软件包safejoin旨在解决更广泛的联接检查问题。它并不能完全满足您的要求,但希望已足够接近,甚至可能更好,因为它在检查方面更进了一步。
# Install the package from GitHub if needed:
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Full join of table1 and table2. According to the accompanying explanation,
# each lowercase letter in `check` ("u", "v", "m", "n") enables one check
# that raises a warning instead of an error.
safe_full_join(table1, table2, check="uvmn")
id price wharehouse
1 A 12 Colorado
2 B 11 <NA>
3 C 10 Texas
4 C 10 New York
5 D NA Oregon
Warning messages:
1: x has unmatched sets of joining values:
id
1: B
2: y has unmatched sets of joining values:
id
1: D
3: y is not unique on id
检查由单个字符串参数(即一个字符序列)控制:大写字母触发失败,小写字母触发警告,以 ~ 为前缀的字母触发消息。此处使用的代码(原文此处的链接已丢失)如下:
"u":如同 unique,检查联接列是否在x上形成唯一键
"v":检查联接列是否在y上形成唯一键
"m":如同 match,检查x的所有行是否都匹配
"n":检查y的所有行是否都匹配
答案 3 :(得分:0)
可以在联接之后应用一个包装函数来分析联接结果,如下所示。以下是 OP 及其评论中提到的场景:
# Four alternative inputs. Every scenario reassigns table1 and table2, so
# run only the scenario you want to inspect before the merge below.

# Scenario 1
table1 <- data.table(id = c("A", "B", "C"), price = c(12, 11, 10))
table2 <- data.table(
  id = c("A", "C", "C", "D"),
  wharehouse = c("Colorado", "Texas", "New York", "Oregon")
)

# Scenario 2
table1 <- data.table(id = c("C", "C", "C"), price = c(12, 11, 10))
table2 <- data.table(
  id = c("A", "C", "C", "D"),
  wharehouse = c("Colorado", "Texas", "New York", "Oregon")
)

# Scenario 3
table1 <- data.table(id = c(NA, "C", "C"), price = c(12, 11, 10))
table2 <- data.table(
  id = c("A", "C", "C", NA),
  wharehouse = c("Colorado", "Texas", "New York", "Oregon")
)

# Scenario 4
table1 <- data.table(id = c("A", "A", "C"), price = c(12, 11, 10))
table2 <- data.table(
  id = c("B", "C", "C", "D"),
  wharehouse = c("Colorado", "Texas", "New York", "Oregon")
)

setkeyv(table1, "id")
setkeyv(table2, "id")

# Full outer join. TRUE is spelled out because `T` can be reassigned.
table_join <- merge(table1, table2, by = "id", all.x = TRUE, all.y = TRUE)
# Build a textual description of how the rows of one key group were joined.
#
# p: vector of values coming from the left table (NA = no left value)
# w: vector of values coming from the right table (NA = no right value)
# n: number of rows in the group
#
# Returns a character vector made of the concatenated tags "INNER JOIN ",
# "MULTIPLE ", "LEFT JOIN " and "RIGHT JOIN " (each with a trailing space).
write_description <- function(p, w, n) {
  has_left <- !is.na(p)
  has_right <- !is.na(w)
  is_multi <- n > 1

  is_inner <- has_left & has_right
  is_left_only <- has_left & !has_right
  # A row counts as "right" when it has no left value, or when the group has
  # several rows and a right value is present.
  is_right <- (!has_left & has_right) | (is_multi & has_right)

  # Emit the tag where the flag is TRUE, an empty string otherwise.
  tag_if <- function(flag, tag) ifelse(flag, tag, "")

  paste0(
    tag_if(is_inner, "INNER JOIN "),
    tag_if(is_multi, "MULTIPLE "),
    tag_if(is_left_only, "LEFT JOIN "),
    tag_if(is_right, "RIGHT JOIN ")
  )
}
# Describe every id group; .N is the number of merged rows in that group.
table_join[
  ,
  `:=`(description = write_description(price, wharehouse, .N)),
  by = "id"
]
结果场景1:
> table_join
id price wharehouse description
1: A 12 Colorado INNER JOIN
2: B 11 NA LEFT JOIN
3: C 10 Texas INNER JOIN MULTIPLE RIGHT JOIN
4: C 10 New York INNER JOIN MULTIPLE RIGHT JOIN
5: D NA Oregon RIGHT JOIN
结果:场景2
> table_join
id price wharehouse description
1: A NA Colorado RIGHT JOIN
2: C 12 Texas INNER JOIN MULTIPLE RIGHT JOIN
3: C 12 New York INNER JOIN MULTIPLE RIGHT JOIN
4: C 11 Texas INNER JOIN MULTIPLE RIGHT JOIN
5: C 11 New York INNER JOIN MULTIPLE RIGHT JOIN
6: C 10 Texas INNER JOIN MULTIPLE RIGHT JOIN
7: C 10 New York INNER JOIN MULTIPLE RIGHT JOIN
8: D NA Oregon RIGHT JOIN
结果:场景3
> table_join
id price wharehouse description
1: NA 12 Oregon INNER JOIN
2: A NA Colorado RIGHT JOIN
3: C 11 Texas INNER JOIN MULTIPLE RIGHT JOIN
4: C 11 New York INNER JOIN MULTIPLE RIGHT JOIN
5: C 10 Texas INNER JOIN MULTIPLE RIGHT JOIN
6: C 10 New York INNER JOIN MULTIPLE RIGHT JOIN
结果:场景4
> table_join
id price wharehouse description
1: A 12 NA MULTIPLE LEFT JOIN
2: A 11 NA MULTIPLE LEFT JOIN
3: B NA Colorado RIGHT JOIN
4: C 10 Texas INNER JOIN MULTIPLE RIGHT JOIN
5: C 10 New York INNER JOIN MULTIPLE RIGHT JOIN
6: D NA Oregon RIGHT JOIN