麻烦使用ROW_NUMBER()OVER(PARTITION BY ...)

时间:2013-11-12 05:10:30

标签: sql sql-server sql-server-2008 row-number gaps-and-islands

我正在使用SQL Server 2008 R2。我有一个名为EmployeeHistory的表,其中包含以下结构和示例数据:

EmployeeID Date      DepartmentID SupervisorID
10001      20130101  001          10009
10001      20130909  001          10019
10001      20131201  002          10018
10001      20140501  002          10017
10001      20141001  001          10015
10001      20141201  001          10014

请注意,随着时间的推移,员工10001已经更换了2个部门和几个主管。我想要做的是列出按日期字段排序的每个部门中该员工的工作的开始和结束日期。因此,输出将如下所示:

EmployeeID DateStart DateEnd  DepartmentID 
10001      20130101  20131201 001
10001      20131201  20141001 002
10001      20141001  NULL     001

我打算使用以下查询对数据进行分区,但失败了。部门从001变为002然后又变回001.显然我不能通过DepartmentID进行分区......我确信我忽略了显而易见的事情。有帮助吗?提前谢谢你。

SELECT * ,ROW_NUMBER() OVER (PARTITION BY EmployeeID, DepartmentID
ORDER BY [Date]) RN FROM EmployeeHistory

3 个答案:

答案 0 :(得分:9)

我会做这样的事情:

;WITH x 
 AS (SELECT *, 
            Row_number() 
              OVER( 
                partition BY employeeid 
                ORDER BY datestart) rn 
     FROM   employeehistory) 
SELECT * 
FROM   x x1 
   LEFT OUTER JOIN x x2 
                ON x1.rn = x2.rn + 1 

或许它可能是x2.rn - 1.你必须看到。无论如何,你明白了。一旦你自己加入了表格,就可以过滤,分组,排序等,以获得你需要的东西。

答案 1 :(得分:5)

有点牵扯。最简单的是引用我为您创建的this SQL Fiddle产生确切结果。有一些方法可以改善它的性能或其他考虑因素,但这有望至少比某些替代方案更清晰。

要点是,首先获得数据的规范排名,然后使用它将数据分组,然后找到每个组的结束日期,然后消除任何中间行。 ROW_NUMBER()和CROSS APPLY在可读性方面做了很多帮助。


编辑2019:

事实上,由于某种原因,SQL Fiddle似乎确实被破坏了,但它似乎是SQL Fiddle网站上的一个问题。这是一个完整的版本,刚刚在SQL Server 2016上进行了测试:

CREATE TABLE Source
(
  EmployeeID int,
  DateStarted date,
  DepartmentID int
)

INSERT INTO Source
VALUES
(10001,'2013-01-01',001),
(10001,'2013-09-09',001),
(10001,'2013-12-01',002),
(10001,'2014-05-01',002),
(10001,'2014-10-01',001),
(10001,'2014-12-01',001)


SELECT *, 
  ROW_NUMBER() OVER (PARTITION BY EmployeeID ORDER BY DateStarted) AS EntryRank,
  newid() as GroupKey,
  CAST(NULL AS date) AS EndDate
INTO #RankedData
FROM Source
;

UPDATE #RankedData
SET GroupKey = beginDate.GroupKey
FROM #RankedData sup
  CROSS APPLY 
  (
    SELECT TOP 1 GroupKey
    FROM #RankedData sub 
    WHERE sub.EmployeeID = sup.EmployeeID AND
      sub.DepartmentID = sup.DepartmentID AND
      NOT EXISTS 
        (
          SELECT * 
          FROM #RankedData bot 
          WHERE bot.EmployeeID = sup.EmployeeID AND
            bot.EntryRank BETWEEN sub.EntryRank AND sup.EntryRank AND
            bot.DepartmentID <> sup.DepartmentID
        )
      ORDER BY DateStarted ASC
    ) beginDate (GroupKey);

UPDATE #RankedData
SET EndDate = nextGroup.DateStarted
FROM #RankedData sup
  CROSS APPLY 
  (
    SELECT TOP 1 DateStarted
    FROM #RankedData sub
    WHERE sub.EmployeeID = sup.EmployeeID AND
      sub.DepartmentID <> sup.DepartmentID AND
      sub.EntryRank > sup.EntryRank
    ORDER BY EntryRank ASC
  ) nextGroup (DateStarted);

SELECT * FROM 
(
SELECT *, ROW_NUMBER() OVER (PARTITION BY GroupKey ORDER BY EntryRank ASC) AS GroupRank FROM #RankedData
) FinalRanking
WHERE GroupRank = 1
ORDER BY EntryRank;

DROP TABLE #RankedData
DROP TABLE Source

答案 2 :(得分:0)

这似乎是一个常见的“隔岛”问题。行号rn1rn2的两个序列之间的差异给出了“组”号。

逐个CTE运行此查询,并检查中间结果以了解其工作原理。

样本数据

我从问题中扩展了样本数据。

DECLARE @Source TABLE
(
    EmployeeID int,
    DateStarted date,
    DepartmentID int
)

INSERT INTO @Source
VALUES
(10001,'2013-01-01',001),
(10001,'2013-09-09',001),
(10001,'2013-12-01',002),
(10001,'2014-05-01',002),
(10001,'2014-10-01',001),
(10001,'2014-12-01',001),

(10005,'2013-05-01',001),
(10005,'2013-11-09',001),
(10005,'2013-12-01',002),
(10005,'2014-10-01',001),
(10005,'2016-12-01',001);

查询SQL Server 2008

SQL Server 2008中没有LEAD函数,因此我不得不通过OUTER APPLY使用自联接来获取DateEnd的“ next”行的值。 / p>

WITH
CTE
AS
(
    SELECT
        EmployeeID
        ,DateStarted
        ,DepartmentID
        ,ROW_NUMBER() OVER (PARTITION BY EmployeeID ORDER BY DateStarted) AS rn1
        ,ROW_NUMBER() OVER (PARTITION BY EmployeeID, DepartmentID ORDER BY DateStarted) AS rn2
    FROM @Source
)
,CTE_Groups
AS
(
    SELECT
        EmployeeID
        ,MIN(DateStarted) AS DateStart
        ,DepartmentID
    FROM CTE
    GROUP BY
        EmployeeID
        ,DepartmentID
        ,rn1 - rn2
)
SELECT
    CTE_Groups.EmployeeID
    ,CTE_Groups.DepartmentID
    ,CTE_Groups.DateStart
    ,A.DateEnd
FROM
    CTE_Groups
    OUTER APPLY
    (
        SELECT TOP(1) G2.DateStart AS DateEnd
        FROM CTE_Groups AS G2
        WHERE
            G2.EmployeeID = CTE_Groups.EmployeeID
            AND G2.DateStart > CTE_Groups.DateStart
        ORDER BY G2.DateStart
    ) AS A
ORDER BY
    EmployeeID
    ,DateStart
;

查询SQL Server 2012 +

从SQL Server 2012开始,有一个LEAD函数使此任务更加高效。

WITH
CTE
AS
(
    SELECT
        EmployeeID
        ,DateStarted
        ,DepartmentID
        ,ROW_NUMBER() OVER (PARTITION BY EmployeeID ORDER BY DateStarted) AS rn1
        ,ROW_NUMBER() OVER (PARTITION BY EmployeeID, DepartmentID ORDER BY DateStarted) AS rn2
    FROM @Source
)
,CTE_Groups
AS
(
    SELECT
        EmployeeID
        ,MIN(DateStarted) AS DateStart
        ,DepartmentID
    FROM CTE
    GROUP BY
        EmployeeID
        ,DepartmentID
        ,rn1 - rn2
)
SELECT
    CTE_Groups.EmployeeID
    ,CTE_Groups.DepartmentID
    ,CTE_Groups.DateStart
    ,LEAD(CTE_Groups.DateStart) OVER (PARTITION BY CTE_Groups.EmployeeID ORDER BY CTE_Groups.DateStart) AS DateEnd
FROM
    CTE_Groups
ORDER BY
    EmployeeID
    ,DateStart
;

结果

+------------+--------------+------------+------------+
| EmployeeID | DepartmentID | DateStart  |  DateEnd   |
+------------+--------------+------------+------------+
|      10001 |            1 | 2013-01-01 | 2013-12-01 |
|      10001 |            2 | 2013-12-01 | 2014-10-01 |
|      10001 |            1 | 2014-10-01 | NULL       |
|      10005 |            1 | 2013-05-01 | 2013-12-01 |
|      10005 |            2 | 2013-12-01 | 2014-10-01 |
|      10005 |            1 | 2014-10-01 | NULL       |
+------------+--------------+------------+------------+