在多个表中查找重复项

时间:2012-08-19 21:37:13

标签: mysql sql

我有一个表(T1)和一个带属性的表(T2)。我希望找到与提供id的记录具有相同属性的记录。

这是一个例子。给定1我想找到2(确保属性也匹配)。

T1
ID | A | B
----------
1  | k | l
2  | k | l


T2
IDFK | C | D
-------------    
1    | w | x
1    | y | z
2    | w | x
2    | y | z

这是我到目前为止的SQL:

-- Pair record 1 with every T1 row that has the same A and B values, then
-- join the attribute rows (T2) of both sides on identical (C, D) values.
-- NOTE: the inner join on T2 only requires ONE common (C, D) pair, so a
-- T1COPY row is still returned when its remaining attributes differ.
SELECT * FROM T1 
JOIN T1 AS T1COPY ON T1.A = T1COPY.A AND T1.B = T1COPY.B 
JOIN T2 ON T1.ID = T2.IDFK 
JOIN T2 AS T2COPY ON T1COPY.ID = T2COPY.IDFK 
   AND T2.C = T2COPY.C 
   AND T2.D = T2COPY.D
WHERE T1.ID = 1

但它无法正常工作,因为即使属性不同,它也会匹配到2。

7 个答案:

答案 0 :(得分:1)

以下是MySQL的答案:http://www.sqlfiddle.com/#!2/ec4fa/2

select h.* 
from 
(
    select x.*
    from t join t x using(a,b)
    where t.id = 1 and x.id <> 1  
) h
join 
(

    select coalesce(x.cpIdFk, x.uIdFk) as idFk  
    from
    (
      select cp.idFk as cpIdFk, u.idFk as uIdFk
      from 
      (
        select t.id as idFk, x.*
        from t cross join (select c, d from u where idFk = 1) as x
        where t.id <> 1      
      ) cp
      left join (select * from u where idFk <> 1) u using(idfk,c,d)

      union

      select cp.idFk,u.idFk
      from 
      (
        select t.id as idFk, x.*
        from t cross join (select c, d from u where idFk = 1) as x
        where t.id <> 1      
      ) cp
      right join (select * from u where idFk <> 1) u using(idfk,c,d)

    ) as x

    group by idFk
    having bit_and(cpidFk is not null and uIdFk is not null)

) d on d.idFk = h.id 
order by h.id;

过滤器ID的输出== 1:

| ID | A | B |
--------------
|  2 | k | l |
|  5 | k | l |

从这些输入中:

CREATE TABLE t
    (ID int, A varchar(1), B varchar(1));

INSERT INTO t
    (ID, A, B)
VALUES
    (1, 'k', 'l'),
    (2, 'k', 'l'),
    (3, 'k', 'l'),
    (4, 'k', 'l'),
    (5, 'k', 'l'),
    (6, 'k', 'j');


CREATE TABLE u
    (IDFK int, C varchar(1), D varchar(1));

INSERT INTO u
    (IDFK, C, D)
VALUES
    (1, 'w', 'x'),
    (1, 'y', 'z'),

    (2, 'w', 'x'),
    (2, 'y', 'z'),

    (3, 'w', 'x'),
    (3, 'y', 'z'),
    (3, 'm', 'z'),

    (4, 'w', 'x'),

    (5, 'w', 'x'),
    (5, 'y', 'z'),

    (6, 'w', 'x'),
    (6, 'y', 'z');

此处的说明:Find duplicates across multiple tables

MySQL查询看起来有点复杂,因为它不支持FULL JOIN,也没有CTE。我们通过合并LEFT JOIN和RIGHT JOIN的结果来模拟FULL JOIN。

答案 1 :(得分:0)

第二次修订答案

由于注释表明T2中可能存在重复行,因此需要更复杂的解决方案。以下是我认为可以生成正确数据的查询。

-- Query 8B
SELECT x.id
  FROM (SELECT d2.id, d2.c, d2.d
          FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS d2
          JOIN (SELECT id
                  FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x
                 GROUP BY id
                HAVING COUNT(*) = (SELECT COUNT(*)
                                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                                    GROUP BY id)
               ) AS j2
            ON j2.id = d2.id
       ) AS x
  JOIN (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS y
    ON x.c = y.c AND x.d = y.d
 GROUP BY x.id
HAVING COUNT(*) = (SELECT COUNT(*)
                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                    GROUP BY id);

我怀疑这是否是最简单的,但它是之前修订过的答案的合理延续。

运行示例

这是查询的跟踪输出,显示了开发时的步骤。 DBMS是在Mac OS X 10.7.4上运行的IBM Informix Dynamic Server 11.70.FC2,使用SQLCMD v88.00作为SQL命令解释器(不,不是Microsoft的johnny-come-lately;我二十年前写的那个)

+ BEGIN;
+ CREATE TABLE T1
(ID INTEGER NOT NULL PRIMARY KEY, a CHAR(1) NOT NULL, b CHAR(1) NOT  NULL);
+ INSERT INTO T1 VALUES(1, 'k', 'l');
+ INSERT INTO T1 VALUES(2, 'k', 'l');
+ INSERT INTO T1 VALUES(3, 'a', 'b');
+ INSERT INTO T1 VALUES(4, 'p', 'q');
+ INSERT INTO T1 VALUES(5, 't', 'v');
+ CREATE TABLE T2
(IDFK INTEGER NOT NULL REFERENCES T1, c CHAR(1) NOT NULL, d CHAR(1) NOT  NULL);
+ INSERT INTO T2 VALUES(1, 'w', 'x');
+ INSERT INTO T2 VALUES(1, 'y', 'z');
+ INSERT INTO T2 VALUES(2, 'w', 'x');
+ INSERT INTO T2 VALUES(2, 'w', 'x');
+ INSERT INTO T2 VALUES(2, 'y', 'z');
+ INSERT INTO T2 VALUES(3, 'w', 'x');
+ INSERT INTO T2 VALUES(3, 'y', 'b');
+ INSERT INTO T2 VALUES(3, 'y', 'z');
+ INSERT INTO T2 VALUES(4, 'w', 'x');
+ INSERT INTO T2 VALUES(5, 'w', 'x');
+ INSERT INTO T2 VALUES(5, 'y', 'z');
+ INSERT INTO T2 VALUES(5, 'w', 'x');
+ INSERT INTO T2 VALUES(5, 'y', 'z');
+ SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1;
2|w|x
2|y|z
3|w|x
3|y|b
3|y|z
4|w|x
5|w|x
5|y|z
+ SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1;
1|w|x
1|y|z
+ SELECT id, COUNT(*) FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x GROUP BY id;
2|2
5|2
3|3
4|1
+ SELECT id, COUNT(*) FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x GROUP BY id;
1|2
+ -- Query 5B - IDs having same count of distinct rows as ID = 1
SELECT id
  FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x
 GROUP BY id
HAVING COUNT(*) = (SELECT COUNT(*)
                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                    GROUP BY id);
2
5
+ -- Query 6B
SELECT d2.id, d2.c, d2.d
  FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS d2
  JOIN (SELECT id
          FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x
         GROUP BY id
        HAVING COUNT(*) = (SELECT COUNT(*)
                             FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                            GROUP BY id)
       ) AS j2
    ON j2.id = d2.id
 ORDER BY id;
2|w|x
2|y|z
5|w|x
5|y|z
+ -- Query 7B
SELECT x.id, y.id, x.c, y.c, x.d, y.d
  FROM (SELECT d2.id, d2.c, d2.d
          FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS d2
          JOIN (SELECT id
                  FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x
                 GROUP BY id
                HAVING COUNT(*) = (SELECT COUNT(*)
                                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                                    GROUP BY id)
               ) AS j2
            ON j2.id = d2.id
       ) AS x
  JOIN (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS y
    ON x.c = y.c AND x.d = y.d
 ORDER BY x.id, y.id, x.c, x.d;
2|1|w|w|x|x
2|1|y|y|z|z
5|1|w|w|x|x
5|1|y|y|z|z
+ -- Query 8B
SELECT x.id
  FROM (SELECT d2.id, d2.c, d2.d
          FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS d2
          JOIN (SELECT id
                  FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk != 1) AS x
                 GROUP BY id
                HAVING COUNT(*) = (SELECT COUNT(*)
                                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                                    GROUP BY id)
               ) AS j2
            ON j2.id = d2.id
       ) AS x
  JOIN (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS y
    ON x.c = y.c AND x.d = y.d
 GROUP BY x.id
HAVING COUNT(*) = (SELECT COUNT(*)
                     FROM (SELECT DISTINCT idfk AS id, c, d FROM t2 WHERE idfk  = 1) AS x
                    GROUP BY id);
2
5
+ ROLLBACK;

第一次修订答案

步骤1:找出与ID = 1具有相同行数的ID

SELECT idfk AS id -- Query 5
  FROM t2
 WHERE idfk != 1
 GROUP BY idfk
HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1);

步骤2:数据对应查询5

SELECT idfk AS id, c, d -- Query 6
  FROM t2
  JOIN (SELECT idfk AS id
          FROM t2
         WHERE idfk != 1
         GROUP BY idfk
        HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
       ) AS j2
    ON j2.id = t2.idfk
 ORDER BY id;

步骤3:将查询6中的行与ID = 1的行连接

SELECT x.id, y.id, x.c, y.c, x.d, y.d -- Query 7
  FROM (SELECT idfk AS id, c, d
          FROM t2
          JOIN (SELECT idfk AS id
                  FROM t2
                 WHERE idfk != 1
                 GROUP BY idfk
                HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
               ) AS j2
            ON j2.id = t2.idfk
       ) AS x
  JOIN (SELECT idfk AS id, c, d
          FROM t2 WHERE idfk = 1
       ) AS y
    ON x.c = y.c AND x.d = y.d
 ORDER BY x.id, y.id, x.c, x.d;

步骤4:从查询7的结果中选出计数与ID = 1的计数相同的ID

SELECT x.id
  FROM (SELECT idfk AS id, c, d
          FROM t2
          JOIN (SELECT idfk AS id
                  FROM t2
                 WHERE idfk != 1
                 GROUP BY idfk
                HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
               ) AS j2
            ON j2.id = t2.idfk
       ) AS x
  JOIN (SELECT idfk AS id, c, d
          FROM t2 WHERE idfk = 1
       ) AS y
    ON x.c = y.c AND x.d = y.d
 GROUP BY x.id
HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1);

运行示例

DBMS是在Mac OS X 10.7.4上运行的IBM Informix Dynamic Server 11.70.FC2,使用SQLCMD v88.00作为SQL命令解释器(不,不是Microsoft那个后来者;是我二十多年前最初编写的那个)。

+ BEGIN;
+ CREATE TABLE T1
(ID INTEGER NOT NULL PRIMARY KEY, a CHAR(1) NOT NULL, b CHAR(1) NOT  NULL);
+ INSERT INTO T1 VALUES(1, 'k', 'l');
+ INSERT INTO T1 VALUES(2, 'k', 'l');
+ INSERT INTO T1 VALUES(3, 'a', 'b');
+ INSERT INTO T1 VALUES(4, 'p', 'q');
+ CREATE TABLE T2
(IDFK INTEGER NOT NULL REFERENCES T1, c CHAR(1) NOT NULL, d CHAR(1) NOT  NULL);
+ INSERT INTO T2 VALUES(1, 'w', 'x');
+ INSERT INTO T2 VALUES(1, 'y', 'z');
+ INSERT INTO T2 VALUES(2, 'w', 'x');
+ INSERT INTO T2 VALUES(2, 'y', 'z');
+ INSERT INTO T2 VALUES(3, 'w', 'x');
+ INSERT INTO T2 VALUES(3, 'y', 'b');
+ INSERT INTO T2 VALUES(3, 'y', 'z');
+ INSERT INTO T2 VALUES(4, 'w', 'x');
+ SELECT t1.id AS id, t2.c, t2.d -- Query 1
  FROM t1
  JOIN t2 ON t1.id = t2.idfk;
1|w|x
1|y|z
2|w|x
2|y|z
3|w|x
3|y|b
3|y|z
4|w|x
+ -- Query 5 - IDs having same count of rows as ID = 1

SELECT idfk AS id
  FROM t2
 WHERE idfk != 1
 GROUP BY idfk
HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1);
2
+ SELECT idfk AS id, c, d
  FROM t2
  JOIN (SELECT idfk AS id
          FROM t2
         WHERE idfk != 1
         GROUP BY idfk
        HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
       ) AS j2
    ON j2.id = t2.idfk
 ORDER BY id;
2|w|x
2|y|z
+ SELECT x.id, y.id, x.c, y.c, x.d, y.d
  FROM (SELECT idfk AS id, c, d
          FROM t2
          JOIN (SELECT idfk AS id
                  FROM t2
                 WHERE idfk != 1
                 GROUP BY idfk
                HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
               ) AS j2
            ON j2.id = t2.idfk
       ) AS x
  JOIN (SELECT idfk AS id, c, d
          FROM t2 WHERE idfk = 1
       ) AS y
    ON x.c = y.c AND x.d = y.d
 ORDER BY x.id, y.id, x.c, x.d;
2|1|w|w|x|x
2|1|y|y|z|z
+ SELECT x.id
  FROM (SELECT idfk AS id, c, d
          FROM t2
          JOIN (SELECT idfk AS id
                  FROM t2
                 WHERE idfk != 1
                 GROUP BY idfk
                HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1)
               ) AS j2
            ON j2.id = t2.idfk
       ) AS x
  JOIN (SELECT idfk AS id, c, d
          FROM t2 WHERE idfk = 1
       ) AS y
    ON x.c = y.c AND x.d = y.d
 GROUP BY x.id
HAVING COUNT(*) = (SELECT COUNT(*) FROM t2 WHERE t2.idfk = 1);
2
+ ROLLBACK;

原始答案

这至少引起了对问题的充分澄清。

据我所知,如果你有一个子查询,如:

SELECT t1.id AS id, t2.c, t2.d  -- Query 1
  FROM t1
  JOIN t2 ON t1.id = t2.idfk

然后您要在结果集中查找这样的行对:c和d中的值相同,但id值不同。因此,我们基于以下内容编写主查询:

SELECT j1.id, j2.id  -- Query 2
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d AND j1.id != j2.id

您可以通过将!=条件更改为<或>来确保不会同时获得“1,2”和“2,1”。

如果您想要与T1中的特定ID值匹配的行,则可以在WHERE子句中指定它:

SELECT j2.id  -- Query 3
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d AND j1.id != j2.id
 WHERE j1.id = 1;  -- 1 is the ID for which matches are sought

如果您愿意,可以在子查询中添加条件(尽管优秀的优化程序可能会为您做到这一点):

SELECT j2.id  -- Query 4
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk AND t1.id = 1
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk AND t1.id != 1
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d
 WHERE j1.id = 1;  -- 1 is the ID for which matches are sought

主ON子句中的第三个条件是多余的,因为按照构造,j1子查询中的ID值都是1,而j2子查询中的ID值都不是1。


我在SQL中修复了t2.id与t2.idfk混用的问题,并运行了上面的4个查询,每个都产生了我期望的答案。例如,查询4的结果集中有两行,因为存在两组(c, d)值,使得行{1, c, d}和行{2, c, d}同时存在于T2中。如果您希望2只出现一次(尽管有多个匹配的行),则需要在SELECT中使用DISTINCT。

在评论中,您说:

  

不幸的是,即使其中一个属性不匹配,它仍会返回结果。如何匹配T2中的每个属性?

这需要扩展数据集来演示。当我添加:

INSERT INTO T1 VALUES(3, 'a', 'b');
INSERT INTO T2 VALUES(3, 'a', 'z');
INSERT INTO T2 VALUES(3, 'y', 'b');

值3仅出现在查询1的结果中,这是它应该出现的唯一位置。

请说明您所看到的错误行为,并显示示例数据。我使用以下SQL和交错查询结果测试了上面的查询。 DBMS是在Mac OS X 10.7.4上运行的IBM Informix Dynamic Server 11.70.FC2,使用SQLCMD v88.00作为SQL命令解释器。

+ BEGIN;
+ CREATE TEMP TABLE T1
(ID INTEGER NOT NULL PRIMARY KEY, A CHAR(1) NOT NULL, B CHAR(1) NOT  NULL);
+ INSERT INTO T1 VALUES(1, 'k', 'l');
+ INSERT INTO T1 VALUES(2, 'k', 'l');
+ INSERT INTO T1 VALUES(3, 'a', 'b');
+ CREATE TEMP TABLE T2
(IDFK INTEGER NOT NULL, C CHAR(1) NOT NULL, D CHAR(1) NOT  NULL);
+ INSERT INTO T2 VALUES(1, 'w', 'x');
+ INSERT INTO T2 VALUES(1, 'y', 'z');
+ INSERT INTO T2 VALUES(2, 'w', 'x');
+ INSERT INTO T2 VALUES(2, 'y', 'z');
+ INSERT INTO T2 VALUES(3, 'a', 'z');
+ INSERT INTO T2 VALUES(3, 'y', 'b');
+ SELECT t1.id AS id, t2.c, t2.d -- Query 1
  FROM t1
  JOIN t2 ON t1.id = t2.idfk;
1|w|x
1|y|z
2|w|x
2|y|z
3|a|z
3|y|b
+ SELECT j1.id, j2.id -- Query 2
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d AND j1.id != j2.id;
1|2
1|2
2|1
2|1
+ SELECT j2.id -- Query 3
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d AND j1.id != j2.id
 WHERE j1.id = 1;
2
2
+ SELECT j2.id  -- Query 4
  FROM (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk AND t1.id = 1
       ) AS j1
  JOIN (SELECT t1.id AS id, t2.c, t2.d
          FROM t1
          JOIN t2 ON t1.id = t2.idfk AND t1.id != 1
       ) AS j2
    ON j1.c = j2.c AND j1.d = j2.d
 WHERE j1.id = 1;
2
2
+ ROLLBACK;

答案 2 :(得分:0)

我的方法是使用group_concat将两列属性合并为单个值。然后,我可以轻松找到具有相同属性的所有ID,并将这些属性作为属性返回。

-- Build, per idfk, one ordered ';'-separated list of its c values and one of
-- its d values, then return every idfk whose two lists equal those of idfk = 1.
-- In MySQL's GROUP_CONCAT the ORDER BY clause must precede SEPARATOR.
-- NOTE: the c list and the d list are compared independently, so the pairing
-- of c with d inside a single row is not verified. idfk = 1 itself is also
-- returned because it is not excluded from the second derived table.
select allts.idfk as id
from (select group_concat(c order by c separator ';') as allcs,
             group_concat(d order by d separator ';') as allds
      from t2
      where t2.idfk = 1
     ) t2_1 join
     (select t2.idfk, group_concat(c order by c separator ';') as allcs,
             group_concat(d order by d separator ';') as allds
      from t2
      group by t2.idfk
     ) allts
     on t2_1.allcs = allts.allcs and t2_1.allds = allts.allds;

此版本未考虑t1中的任何信息。你的问题只提到了t2中的属性。

答案 3 :(得分:0)

让我们首先定义代表输入ID的属性数据:

select idfk, c, d
from t2
where idfk = @ID

现在我们可以使用这些信息来选择T2中存在的潜在匹配,其中IDFK不是@ID:

-- For each attribute row of @ID (x), list every other IDFK (exposed as id2)
-- that owns an identical (c, d) row.
-- The WHERE filter on y.idfk discards the NULL-extended rows of the LEFT JOIN,
-- so only attribute rows that actually found a match are returned.
select x.idfk, x.c, x.d, y.idfk as id2 from (
    select idfk, c, d
    from t2
    where idfk = @ID
) x left join t2 y on x.c = y.c and x.d = y.d
where y.idfk <> @ID 
  and y.idfk is not null

如果第二个查询中某个id2值对应的行数与第一个查询返回的行数相同,则该id2的数据与第一个查询中的数据匹配。

因此:

-- Keep each id2 whose number of matched attribute rows equals the number of
-- attribute rows returned for @ID by the first query.
-- <second query> and <first query> are placeholders for the queries shown earlier.
select id2 from ( 
    select id2, count(*) as rowcount from (
        <second query>
    ) z
    group by id2  -- one count per candidate id2
) rowsByID
where rowcount = (select count(*) from (<first query>) IDattributes)

我不确定您是否要求返回的行在A和B上也必须匹配,还是只需匹配表T2中的数据;如果假设它们也必须在A和B上匹配,那么:

select ID from t1 
    join <third query> m on t1.id = m.id2
    join (select a, b from t1 where id = @id) prime_row on t1.a = prime_row.a and t1.b = prime_row.b

如果你不需要A和B匹配,删除第二个连接即可。

这是怎么回事?

答案 4 :(得分:0)

在大桌子上这可能会非常慢。 (编辑:我现在知道mysql没有完整的连接;但是第一个查询对其他系统仍然有效,并且可能更容易理解。如果你不关心它,请跳到第二个。)

我使用问号作为参数标记。所有参数标记都应传入同一个值,即要查找其“重复项”的id。添加条件and T.id <> ?可以从结果集中排除被匹配的行本身。(我原以为OP希望第1行和第2行都返回。)tX代表搜索空间,因此也可以在那里排除它,从而在处理过程的早期将其消除。

select *
from T1 as T
where T.id in (
    select coalesce(attrR.idfk, tX.id)
    from
        T1 as tX
        cross join
        (select * from T2 where T2.idfk = ?) as attrL
        full outer join T2 as attrR
            on      attrR.idfk = tX.id
                and attrR.c = attrL.c
                and attrR.d = attrL.d
    group by coalesce(attrR.idfk, tX.id)
    having count(*) =
        sum(case
                when attrR.c = attrL.c and attrR.d = attrL.d
                then 1 else 0
            end
        )
);

这避免了full outer join的缺失。

select *
from T1 as T
where T.id in (
    select attrR.idfk
    from
        T1 as tX
        cross join
        (select * from T2 where idfk = ?) as attrL
        right outer join
        T2 as attrR
            on      attrR.idfk = tX.id
                and attrR.c = attrL.c
                and attrR.d = attrL.d
        cross join
        (select count(*) as cnt from T2 where idfk = ?) as tC
    group by attrR.idfk
    having
        sum(case
                when attrR.c = attrL.c and attrR.d = attrL.d
                then 1 else 1000000
            end
        ) = min(tC.cnt)
);

此复合检查等同于sum(case...)表达式。有人可能比另一个感觉好。

    having
            count(attrL.idfk) = min(tC.cnt)
        and count(*) = min(tC.cnt)

我提供的第一个和第二个查询确实有效,但前提是每个T1在T2中至少有一个属性。这是一个通过添加虚拟属性来补偿的版本,以防止中间结果中出现空集。这是更丑陋的,所以如果没有必要,不要使用它。 (完整的连接版本需要进行类似的调整。)

select *
from T1 as T
where T.id in (
    select attrR.idfk
    from
        T1 as tX
        cross join
        (
            select c, d from T2 where idfk = ?
            union all
            select '!@#$%', '' -- add a dummy attribute
        ) as attrL
        right outer join
        (
            select idfk, c, d from T2
            union all
            select id, '!@#$%', '' from T1
        ) as attrR
            on      attrR.idfk = tX.id
                and attrR.c = attrL.c
                and attrR.d = attrL.d
        cross join
        (select count(*)+1 as cnt from T2 where idfk = ?) as tC -- note the +1
    group by attrR.idfk
    having
            count(tX.id) = min(tC.cnt)
        and count(*) = min(tC.cnt)
);

答案 5 :(得分:0)

如果你偶然将系统移植到Postgresql,你可以使用FULL JOIN:http://www.sqlfiddle.com/#!1/1f0ef/1

with headers_matches as
(
    select x.*
    from t join t x using(a,b)
    where t.id = 1 and x.id <> 1
)
,cp as
(
    select t.id as idFk, x.*
    from t cross join (select c, d from u where idFk = 1) as x
    where t.id <> 1
)
,details_matches as
(
    select coalesce(cp.idFk,u.idFk) as idFk
    from cp
    full join (select * from u where idFk <> 1) u using(idfk,c,d)
    group by idFk
    having every(cp.idFk is not null and u.idFk is not null)
)
select h.* 
from headers_matches h
join details_matches d on d.idFk = h.id 
order by h.id;

过滤器ID的输出== 1:

| ID | A | B |
--------------
|  2 | k | l |
|  5 | k | l |

从这些输入中:

CREATE TABLE t
    (ID int, A varchar(1), B varchar(1));

INSERT INTO t
    (ID, A, B)
VALUES
    (1, 'k', 'l'),
    (2, 'k', 'l'),
    (3, 'k', 'l'),
    (4, 'k', 'l'),
    (5, 'k', 'l'),
    (6, 'k', 'j');



CREATE TABLE u
    (IDFK int, C varchar(1), D varchar(1));

INSERT INTO u
    (IDFK, C, D)
VALUES
    (1, 'w', 'x'),
    (1, 'y', 'z'),

    (2, 'w', 'x'),
    (2, 'y', 'z'),

    (3, 'w', 'x'),
    (3, 'y', 'z'),
    (3, 'm', 'z'),

    (4, 'w', 'x'),

    (5, 'w', 'x'),
    (5, 'y', 'z'),

    (6, 'w', 'x'),
    (6, 'y', 'z');

如何运作

我们首先做最难的部分,这是细节。我们将在这个答案的后半部分做标题。

它是如何工作的,首先我们需要交叉填充细节,以便我们可以对细节进行适当的完全连接,这样可以在以后检测到间隙:

with cp as -- cross populate
(
    select t.id as idFk, x.*
    from t cross join (select c, d from u where idFk = 1) as x
    where t.id <> 1
)
select *
from cp;

输出:

| IDFK | C | D |
----------------
|    2 | w | x |
|    2 | y | z |
|    3 | w | x |
|    3 | y | z |
|    4 | w | x |
|    4 | y | z |
|    5 | w | x |
|    5 | y | z |
|    6 | w | x |
|    6 | y | z |

然后从这个交叉填充的细节,我们可以做正确的全加入:

with cp as 
(
    select t.id as idFk, x.*
    from t cross join (select c, d from u where idFk = 1) as x
    where t.id <> 1
)
select 
    cp.idFk as cpIdFk, cp.c as cpC, cp.d as cpD,
    u.idFk as uFk, u.c as uC, u.d as Ud
from cp
full join (select * from u where idFk <> 1) u using(idfk,c,d);

输出:

| CPIDFK |    CPC |    CPD |    UFK |     UC |     UD |
-------------------------------------------------------
|      2 |      w |      x |      2 |      w |      x |
|      2 |      y |      z |      2 |      y |      z |
| (null) | (null) | (null) |      3 |      m |      z |
|      3 |      w |      x |      3 |      w |      x |
|      3 |      y |      z |      3 |      y |      z |
|      4 |      w |      x |      4 |      w |      x |
|      4 |      y |      z | (null) | (null) | (null) |
|      5 |      w |      x |      5 |      w |      x |
|      5 |      y |      z |      5 |      y |      z |
|      6 |      w |      x |      6 |      w |      x |
|      6 |      y |      z |      6 |      y |      z |

有了这些信息,我们现在可以做正确的逻辑来检测两组之间是否存在差距,从上面的集合中,我们可以看到那些没有差距的是#2,#5和#6 。为此,我们执行此查询:

with cp as
(
    select t.id as idFk, x.*
    from t cross join (select c, d from u where idFk = 1) as x
    where t.id <> 1
)
,details_matches as
(
    select coalesce(cp.idFk,u.idFk) as idFk
    from cp
    full join (select * from u where idFk <> 1) u using(idfk,c,d)
    group by idFk
    having every(cp.idFk is not null and u.idFk is not null)
)
select * from details_matches
order by idFk;

输出:

| IDFK |
--------
|    2 |
|    5 |
|    6 |

然后现在我们进行标题比较,这更容易:

with headers_matches as
(
    select x.*
    from t join t x using(a,b)
    where t.id = 1 and x.id <> 1
)
select * from headers_matches;

那应该返回标题#2,#3,#4,#5,因为它们与#1的标题值相同:

输出:

| ID | A | B |
--------------
|  2 | k | l |
|  3 | k | l |
|  4 | k | l |
|  5 | k | l |

最后,我们将两个查询结合起来:

with headers_matches as
(
    select x.*
    from t join t x using(a,b)
    where t.id = 1 and x.id <> 1
)
,cp as
(
    select t.id as idFk, x.*
    from t cross join (select c, d from u where idFk = 1) as x
    where t.id <> 1
)
,details_matches as
(
    select coalesce(cp.idFk,u.idFk) as idFk
    from cp
    full join (select * from u where idFk <> 1) u using(idfk,c,d)
    group by idFk
    having every(cp.idFk is not null and u.idFk is not null)
)
select h.* 
from headers_matches h
join details_matches d on d.idFk = h.id 
order by h.id;

输出:

| ID | A | B |
--------------
|  2 | k | l |
|  5 | k | l |

请在此处查看查询进度:http://www.sqlfiddle.com/#!1/1f0ef/1

我稍后会将Postgresql查询转换为Mysql。

更新

这是MySQL版本:Find duplicates across multiple tables

答案 6 :(得分:0)

我已经通过你的评论考虑了我之前的回答,并提出了一种不同的方法。

select idfk, c, d from t2 where idfk = @ID 

此查询找出@ID的全部属性集。假设我们将其放入临时表中,然后对于该表中的每一行,找出T2中所有满足IDFK <> @ID且属性值与该源行完全匹配的IDFK;将所有这些行放入新表中。

我的sql会这样做:(你可能需要为mysql调整这个)

-- Snapshot the attribute rows of @ID into a temp table.
-- SELECT ... INTO creates #attribs with the column types of T2.c and T2.d.
select c, d
into #attribs
from T2
where idfk = @ID;

create table #matchedattrib (idfk int);

-- For every source attribute row, record each other IDFK that owns an
-- identical (c, d) row. One row is inserted per (source row, matching T2 row)
-- pair, so an IDFK appears once for each attribute it shares with @ID.
-- A single set-based join is used instead of a row-by-row WHILE loop;
-- #attribs is left populated afterwards.
insert into #matchedattrib (idfk)
select T2.idfk
from #attribs as a
join T2
    on T2.c = a.c and T2.d = a.d
where T2.idfk <> @ID;

完成此操作后,在这个新表中,行数与@ID的属性集(第一个查询)行数相同的任何IDFK,都具有@ID的所有属性。

select idfk, count(*) as tot_attribs
into #counts
from #matchedattrib
group by idfk
having count(*) = (select count(*) from (select idfk from T2 where idfk = @ID) x);

但是,正如您在前面的回答中指出的那样,这些IDFK也可以有其他属性,因此对于第二个表中具有正确行数的IDFK,您需要计算T2中存在的行数是相同的数字 - 验证这些匹配的属性实际上是该IDFK的所有属性 - 意味着属性的完全匹配。

select idfk from #counts
where tot_attribs = (select count(*) from T2 where idfk = #counts.idfk)

如果你还需要在A + B上匹配,你必须自己填写!