Question

假设我正在实现带标签的文章功能。我使用的是 SQL Server 2008。

TABLE Articles
ArtID INT
...

TABLE Tags
TagID INT
TagText VARCHAR(10)

TABLE ArticleTags
ArtID INT
TagID INT

我正在尝试找出查询带有特定标签的所有文章的最高效方法。下面有两个方案，我读到的说法是它们各自都被称为最高效的做法。

方法A：

-- Method A: correlated EXISTS (semi-join). Each article is returned at most
-- once, no matter how many of the listed tags it carries.
-- The outer table is aliased as "a" and correlated on ArtID, the key column
-- declared in the schema above.
SELECT a.*
FROM Articles a
WHERE EXISTS (
    SELECT *
    FROM ArticleTags at
    INNER JOIN Tags t ON at.TagID = t.TagID
    WHERE at.ArtID = a.ArtID
    AND t.TagText IN ('abc', 'def')
)

方法B：

-- Method B: join Articles to Tags through ArticleTags, then group by ArtID to
-- collapse the duplicate rows produced when an article matches both tags.
-- NOTE(review): selecting a.* while grouping only by a.ArtID is not accepted
-- by SQL Server; every selected column must be grouped or aggregated.
SELECT a.* FROM Articles a
INNER JOIN ArticleTags at ON a.ArtID = at.ArtID
INNER JOIN Tags t ON at.TagID = t.TagID
WHERE t.TagText IN ('abc', 'def')
GROUP BY a.ArtID

任何SQL专家都可以建议哪个更有效，为什么？或者我可能走错了路。

Answer 1

与几乎所有SQL性能问题一样，答案不是查询，答案是数据模式。您拥有哪些索引，这就是推动查询性能的因素。

通常，多对多关系需要两个互补索引，一个作为(ID1, ID2)，另一个作为(ID2, ID1)。其中一个是聚集的，哪一个并不重要。因此，我们创建一个测试数据库（100k文章，1K标签，每篇文章1-10个标签）：

-- SQLCMD scripting variables that name the test database and size its data.
-- Run this script in SQLCMD mode so the $(...) references are substituted.
:setvar dbname testdb
-- Number of rows inserted into Articles (100k, as described for this test).
:setvar articles 100000
-- Number of rows inserted into Tags.
:setvar tags 1000
-- Upper bound on the number of tags attached to a single article.
:setvar articletags 10
-- Abort the whole script on the first error.
:on error exit

-- Recreate the test database from scratch and switch to it.
SET XACT_ABORT ON;
GO

USE master;
GO

-- If the database already exists, disconnect other sessions and drop it.
IF DB_ID('$(dbname)') IS NOT NULL
BEGIN
    ALTER DATABASE [$(dbname)]
        SET SINGLE_USER
        WITH ROLLBACK IMMEDIATE;
    DROP DATABASE [$(dbname)];
END
GO

CREATE DATABASE [$(dbname)];
GO

USE [$(dbname)];
GO

-- Articles: identity key plus a 500-character filler column that pads each row.
CREATE TABLE Articles (
    ArtID  INT          NOT NULL IDENTITY(1,1),
    name   VARCHAR(100) NOT NULL,
    filler CHAR(500)    NOT NULL DEFAULT REPLICATE('X', 500),
    CONSTRAINT pk_Articles PRIMARY KEY CLUSTERED (ArtID)
);
GO

-- Tags: identity key; tag text is unique.
CREATE TABLE Tags (
    TagID   INT         NOT NULL IDENTITY(1,1),
    TagText VARCHAR(10) NOT NULL,
    CONSTRAINT pk_Tags PRIMARY KEY CLUSTERED (TagID),
    CONSTRAINT unq_Tags_Text UNIQUE (TagText)
);
GO

-- ArticleTags: many-to-many link table, clustered on (ArtID, TagID).
CREATE TABLE ArticleTags (
    ArtID INT NOT NULL,
    TagID INT NOT NULL,
    CONSTRAINT fk_Articles FOREIGN KEY (ArtID) REFERENCES Articles (ArtID),
    CONSTRAINT fk_Tags FOREIGN KEY (TagID) REFERENCES Tags (TagID),
    CONSTRAINT pk_ArticleTags PRIMARY KEY CLUSTERED (ArtID, TagID)
);
GO

-- Complementary index in the opposite column order, (TagID, ArtID).
CREATE NONCLUSTERED INDEX ndxArticleTags_TagID
    ON ArticleTags (TagID, ArtID);
GO

-- populate articles
-- Inserts $(articles) rows named 'Name 0', 'Name 1', ... and commits after
-- every 1000 rows, printing a progress message at each commit.
SET NOCOUNT ON;
DECLARE @inserted INT = 0;
BEGIN TRANSACTION;
WHILE @inserted < $(articles)
BEGIN
    INSERT INTO Articles (name)
    VALUES ('Name ' + CAST(@inserted AS VARCHAR(10)));

    SET @inserted = @inserted + 1;

    IF @inserted % 1000 = 0
    BEGIN
        COMMIT;
        RAISERROR (N'Inserted %d articles', 0, 1, @inserted);
        BEGIN TRANSACTION;
    END
END
-- Close the transaction left open by the last iteration.
COMMIT;
GO


-- populate tags
-- Inserts $(tags) rows with text 'Tag 0', 'Tag 1', ... using the same
-- commit-every-1000-rows pattern and progress message.
SET NOCOUNT ON;
DECLARE @n INT = 0, @label VARCHAR(100);
BEGIN TRANSACTION;
WHILE @n < $(tags)
BEGIN
    SET @label = 'Tag ' + CAST(@n AS VARCHAR(10));
    INSERT INTO Tags (TagText) VALUES (@label);
    SET @n = @n + 1;

    IF @n % 1000 = 0
    BEGIN
        COMMIT;
        RAISERROR (N'Inserted %d tags', 0, 1, @n);
        BEGIN TRANSACTION;
    END
END
-- Close the transaction left open by the last iteration.
COMMIT;
GO

-- populate article-tags
-- Walks the articles in ArtID order and gives each one a random number of
-- tags (1..$(articletags)). The first tag of an article is picked at random;
-- each following tag is 1..10 IDs further on, wrapping back to TagID 1.
-- Commits every 1000 inserted rows and prints a progress message.
-- NOTE(review): with a very small $(tags) the wrap-around could revisit a tag
-- already used for the same article and violate pk_ArticleTags; the values
-- used here (1000 tags, at most 10 per article) leave ample room.
set nocount on;
declare @i int =0, @a int = 1, @cnt int, @tag int;
set @cnt = rand() * $(articletags) + 1;
set @tag = rand() * $(tags) + 1;
begin transaction
-- ArtID is identity(1,1), so the IDs run 1..$(articles): the upper bound is
-- inclusive so that the last article also receives tags.
while @a <= $(articles)
begin
    insert into ArticleTags (ArtID, TagID) values (@a, @tag);
    set @cnt -= 1;
    set @tag += rand()*10+1;
    -- TagID values run 1..$(tags); wrap only once the last one is passed.
    if @tag > $(tags)
    begin
        set @tag = 1;
    end
    -- Current article has all its tags: draw a new count and starting tag,
    -- then move on to the next article.
    if @cnt = 0
    begin
        set @cnt = rand() * $(articletags) + 1;
        set @tag = rand() * $(tags) + 1;
        set @a += 1;
    end
    set @i += 1;
    if @i %1000 = 0
    begin
        commit;
        raiserror (N'Inserted %d article-tags', 0, 1, @i);
        begin transaction;
    end
end
commit
raiserror (N'Final: %d article-tags', 0, 1, @i);
go

现在让我们比较两个查询：

-- Report I/O and timing figures for every statement that follows.
SET STATISTICS IO ON;
SET STATISTICS TIME ON;

-- Form 1: correlated EXISTS.
SELECT a.ArtID
FROM Articles a
WHERE EXISTS (
    SELECT *
    FROM ArticleTags at
    INNER JOIN Tags t
        ON at.TagID = t.TagID
    WHERE at.ArtID = a.ArtID
      AND t.TagText IN ('Tag 10', 'Tag 12')
);

-- Form 2: inner joins with GROUP BY on the article key.
SELECT a.ArtID
FROM Articles a
INNER JOIN ArticleTags at
    ON a.ArtID = at.ArtID
INNER JOIN Tags t
    ON at.TagID = t.TagID
WHERE t.TagText IN ('Tag 10', 'Tag 12')
GROUP BY a.ArtID;

结果：

Table 'Articles'. Scan count 0, logical reads 3561, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'ArticleTags'. Scan count 2, logical reads 13, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Tags'. Scan count 2, logical reads 4, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

Table 'Articles'. Scan count 0, logical reads 3561, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'ArticleTags'. Scan count 2, logical reads 13, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Tags'. Scan count 2, logical reads 4, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

惊喜！（好吧，其实并不意外。）两者的结果完全相同（IDENTICAL）。事实上，它们的执行计划也完全一样。

Answer 2

您的方法B有一个GROUP BY子句，但您从文章返回所有列，甚至可能是不可聚合的列。这会引发错误。 GROUP BY可能是不必要的。

如果没有GROUP BY，查询的执行计划大致相同。但是，方法B是更标准的SQL查询语句。

编辑：在这种情况下，DISTINCT通常优于GROUP BY，并具有相同的功能

-- Join through the link table; DISTINCT removes the duplicate article rows
-- produced when one article carries more than one of the listed tags.
SELECT DISTINCT a.*
FROM Articles a
INNER JOIN ArticleTags at ON a.ArtID = at.ArtID
INNER JOIN Tags t ON at.TagID = t.TagID
WHERE t.TagText IN ('abc', 'def')

Answer 3

我会基于这3个表创建一个索引视图，并在 artID 和 TagText 列上建立索引。这样你就可以使用：

-- Look up matching article IDs through ArticleTagTextView (the proposed
-- indexed view; its definition is not shown here), then fetch those articles.
SELECT *
FROM Articles
WHERE artID IN (
    SELECT artID
    FROM ArticleTagTextView
    WHERE TagText IN ('abc', 'def')
)

Answer 4

很快：没有区别。两者都将被翻译成相同的执行计划。

编辑：之前没有注意到GROUP BY。这样写的查询很可能无法编译。请删除GROUP BY子句，或者在GROUP BY中列出表的所有字段，例如 GROUP BY Id, Name, ...

最有效的多对多查询

4 个答案: