2010-10-05 64 views
3

我正面臨一個概念性問題,我很難克服。我希望這些人能夠幫助我以正確的方向輕輕克服它。需要使用大型數據集的小型子集的方法

我正在做一些ETL工作,源數據非常相似和非常大。我將它加載到一個用於複製的表中,我只想要這個目標表中最基本的信息。

我的源表看起來是這樣的:

alt text

我需要我的目標表,以反映它是這樣:

alt text

正如你可以看到我不重複InTransit狀態,它在源表中被複制。我試圖找出如何實現的步驟是:

  1. 獲取自上次運行查詢以來輸入的任何新的不同行。 (Easy)
  2. 對於每個TrackingId,我需要檢查每個新狀態是否已經是目標中的最新狀態,如果不是這樣,否則請繼續並插入它。這意味着我必須在最早的狀態下開始並從那裏開始。 (我沒有*(#!在線索我怎麼會做這個)
  3. 做到這一點,每15分鐘,這樣的狀態保持非常近所以第2步必須是高性能的。

我的源表可以很容易由100k +行組成,但需要每隔15分鐘運行一次,這要求我確保這是非常高效的,因此我真的試圖避免使用遊標。

現在我唯一能看到的方法是使用CLR sproc,但我認爲可能有更好的方法,因此我希望你們可以推動我朝着正確的方向發展。

I我確定我可能會遺漏一些你可能需要的東西,請讓我知道你可能需要什麼信息,我會很樂意提供。

預先感謝您!

編輯: 好的我沒有明確的在我的問題。我的源表將包含多個跟蹤ID。它可能高達100k +行,包含多個TrackingId和每個trackingId的多個狀態。我必須針對每個單獨的跟蹤ID更新目標表,但我的來源將是trackingId的汞合金。

回答

1

你在這裏。我會讓你清理它並進行優化。其中一個子查詢可以進入視圖並且可以清除混亂的日期比較。如果您使用的是SQL 2008 R2,那麼請使用CAST作爲DATE。

-- Demo script: list the source rows (@tbl1) that are candidates for insertion
-- into the target (@tbl2).
--
-- A source row is returned when ALL of the following hold:
--   1. the next row (by statusdate) for the same trackingid on the same
--      calendar day does not carry the same status;
--   2. the exact (trackingid, statusdate, status) row is not already in @tbl2;
--   3. its status differs from the latest target status for that trackingid,
--      OR from the latest target status dated on or before the row itself.
declare @tbl1 table(
id int, Trackingid int, Status varchar(50), StatusDate datetime
)

declare @tbl2 table(
id int, Trackingid int, Status varchar(50), StatusDate datetime
)

----Source data (rows marked "in target" also appear in the target data below)
insert into @tbl1 (id, trackingid, status, statusdate) values(1,1,'PickedUp','10/01/10 1:00') -- in target
insert into @tbl1 (id, trackingid, status, statusdate) values(2,1,'InTransit','10/02/10 1:00') -- in target
insert into @tbl1 (id, trackingid, status, statusdate) values(8,1,'InTransit','10/02/10 3:00')
insert into @tbl1 (id, trackingid, status, statusdate) values(4,1,'Delayed','10/03/10 1:00')
insert into @tbl1 (id, trackingid, status, statusdate) values(5,1,'InTransit','10/03/10 1:01')
insert into @tbl1 (id, trackingid, status, statusdate) values(6,1,'AtDest','10/03/10 2:00')
insert into @tbl1 (id, trackingid, status, statusdate) values(7,1,'Deliv','10/03/10 3:00') -- in target
insert into @tbl1 (id, trackingid, status, statusdate) values(3,2,'InTransit','10/03/10 1:00')
insert into @tbl1 (id, trackingid, status, statusdate) values(9,2,'AtDest','10/04/10 1:00')
insert into @tbl1 (id, trackingid, status, statusdate) values(10,2,'Deliv','10/04/10 1:05')
insert into @tbl1 (id, trackingid, status, statusdate) values(11,1,'Delayed','10/02/10 2:05')

----Target data
insert into @tbl2 (id, trackingid, status, statusdate) values(1,1,'PickedUp','10/01/10 1:00')
insert into @tbl2 (id, trackingid, status, statusdate) values(2,1,'InTransit','10/02/10 1:00')
insert into @tbl2 (id, trackingid, status, statusdate) values(3,1,'Deliv','10/03/10 3:00')

-- Number the source rows once, per trackingid and calendar day.
-- The day is obtained with DATEADD/DATEDIFF (midnight of statusdate) instead of
-- building a string with STR() and casting it back, which depends on the
-- session's language / date-format settings.
;with numbered as
(
    select
        s.id,
        s.Trackingid,
        s.Status,
        s.StatusDate,
        dateadd(day, datediff(day, 0, s.StatusDate), 0) as StatusDay,
        row_number() over (
            partition by s.Trackingid, dateadd(day, datediff(day, 0, s.StatusDate), 0)
            order by s.StatusDate
        ) as RN
    from @tbl1 s
)
select
    d.id,
    d.Trackingid,
    d.Status,
    d.StatusDate,
    d.RN
from numbered d
where
    -- (1) the following row of the same trackingid/day must not repeat this status
    not exists
    (
        select 1
        from numbered f
        where f.Trackingid = d.Trackingid
          and f.StatusDay  = d.StatusDay
          and f.RN         = d.RN + 1
          and f.Status     = d.Status
    )
    -- (2) the identical row must not already be in the target
    and not exists
    (
        select 1
        from @tbl2 t2
        where t2.Trackingid = d.Trackingid
          and t2.StatusDate = d.StatusDate
          and t2.Status     = d.Status
    )
    -- (3) status differs from the latest target status overall, or from the
    --     latest target status at or before this row's date
    and (
        not exists
        (
            select 1
            from
            (
                select top 1 t2.Status
                from @tbl2 t2
                where t2.Trackingid = d.Trackingid
                order by t2.StatusDate desc
            ) g
            where g.Status = d.Status
        )
        or not exists
        (
            select 1
            from
            (
                select top 1 t2.Status
                from @tbl2 t2
                where t2.Trackingid = d.Trackingid
                  and t2.StatusDate <= d.StatusDate
                order by t2.StatusDate desc
            ) g
            where g.Status = d.Status
        )
    )
order by d.Trackingid, d.StatusDate
+0

這實現了我所需要的一切,非常感謝! – joshlrogers 2010-10-05 17:44:50

1

如何好這個執行將取決於指標,特別是如果你的目標在一次一個TrackingID,但是這是使用CTE和單向自聯接,以獲得所需的結果:

-- Sample data for one shipment, then a query that removes consecutive
-- repeats of the same status. A row is kept when it has no successor or when
-- its successor (next row by StatusDate) carries a different status.
CREATE TABLE #foo
(
    TrackingID INT,
    [Status]   VARCHAR(32),
    StatusDate SMALLDATETIME
);

INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'PickedUp',  '2010-10-01 08:15');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-02 03:07');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-02 10:28');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'Delayed',   '2010-10-03 09:52');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-03 20:09');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'AtDest',    '2010-10-04 13:42');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'Deliv',     '2010-10-04 17:05');

WITH ordered AS
(
    -- Rows of TrackingID 1 numbered in chronological order.
    SELECT
        TrackingID,
        [Status],
        StatusDate,
        ROW_NUMBER() OVER (ORDER BY StatusDate) AS seq
    FROM #foo
    WHERE TrackingID = 1
),
paired AS
(
    -- Each row together with its immediate successor (NULLs for the last row).
    SELECT
        cur.TrackingID,
        cur.[Status]   AS cur_status,
        cur.StatusDate AS cur_date,
        nxt.seq        AS next_seq,
        nxt.[Status]   AS next_status
    FROM ordered AS cur
    LEFT JOIN ordered AS nxt
        ON nxt.seq = cur.seq + 1
)
SELECT
    ROW_NUMBER() OVER (ORDER BY cur_date) AS Id,
    TrackingID,
    cur_status AS [Status],
    cur_date   AS [StatusDate]
FROM paired
WHERE next_seq IS NULL
   OR cur_status <> next_status
ORDER BY
    [StatusDate];
GO
DROP TABLE #foo;

如果您需要支持在同一查詢多個TrackingIDs:

-- Sample data for several shipments, then a query that removes consecutive
-- repeats of the same status within each TrackingID. A row is kept when it has
-- no successor for its TrackingID or when that successor has a different status.
CREATE TABLE #foo
(
    TrackingID INT,
    [Status] VARCHAR(32),
    StatusDate SMALLDATETIME
);

INSERT #foo SELECT 1, 'PickedUp', '2010-10-01 08:15';
INSERT #foo SELECT 1, 'InTransit', '2010-10-02 03:07';
INSERT #foo SELECT 1, 'InTransit', '2010-10-02 10:28';
INSERT #foo SELECT 1, 'Delayed', '2010-10-03 09:52';
INSERT #foo SELECT 1, 'InTransit', '2010-10-03 20:09';
INSERT #foo SELECT 1, 'AtDest', '2010-10-04 13:42';
INSERT #foo SELECT 1, 'Deliv',  '2010-10-04 17:05';
INSERT #foo SELECT 2, 'InTransit', '2010-10-02 10:28';
INSERT #foo SELECT 2, 'Delayed', '2010-10-03 09:52';
INSERT #foo SELECT 2, 'InTransit', '2010-10-03 20:09';
INSERT #foo SELECT 2, 'AtDest', '2010-10-04 13:42';

WITH src AS
(
    SELECT
     TrackingID,
     [Status],
     StatusDate,
     -- Number rows per TrackingID. A single global numbering would let rows of
     -- other TrackingIDs fall between two rows of the same shipment, so the
     -- "ab = ab - 1 AND same TrackingID" join below would find no successor
     -- and consecutive duplicates would not be collapsed.
     ab = ROW_NUMBER() OVER (PARTITION BY TrackingID ORDER BY [StatusDate])
    FROM #foo
),
realsrc AS
(
    -- Pair every row with the next row of the same TrackingID (NULLs if none).
    SELECT
     a.TrackingID,
     leftrow   = a.ab,
     rightrow  = b.ab,
     leftstatus  = a.[Status],
     leftstatusdate = a.StatusDate,
     rightstatus  = b.[Status],
     rightstatusdate = b.StatusDate
    FROM src AS a
    LEFT OUTER JOIN src AS b
    ON a.ab = b.ab - 1
    AND a.TrackingID = b.TrackingID
)
SELECT
    Id = ROW_NUMBER() OVER (ORDER BY TrackingID, [leftstatusdate]),
    TrackingID,
    [Status] = leftstatus,
    [StatusDate] = leftstatusdate
FROM
    realsrc
WHERE
    rightrow IS NULL
    OR (leftrow = rightrow - 1 AND leftstatus <> rightstatus)
ORDER BY
    TrackingID,
    [StatusDate];
GO
DROP TABLE #foo;
+0

這適用於我的一半問題。您可以從源頭中篩選出順序狀態,但最終無法解決任何與目標相關的問題。正如我在步驟2中的原始問題中所說的,我需要擔心在我的目標中插入重複的順序狀態。這並不能保證這不會發生。我爲您提供+1的優雅解決方案來檢索源數據。 – joshlrogers 2010-10-05 20:08:40

0

如果這是SQL 2005,那麼你可以使用ROW_NUMBER與子查詢或CTE: 如果數據集確實是巨大的,雖然和性能是那麼問題之一就是上面那個當我試圖讓代碼塊工作時,粘貼會更有效率。

/**
* This is just to create a sample table to use in the test query
**/

DECLARE @test TABLE(ID INT, TrackingID INT, Status VARCHAR(20), StatusDate DATETIME)
INSERT @test (ID, TrackingID, Status, StatusDate)
SELECT 1,1,'PickedUp', '01 jan 2010 08:00' UNION ALL
SELECT 2,1,'InTransit', '01 jan 2010 08:01' UNION ALL
SELECT 3,1,'InTransit', '01 jan 2010 08:02' UNION ALL
SELECT 4,1,'Delayed', '01 jan 2010 08:03' UNION ALL
SELECT 5,1,'InTransit', '01 jan 2010 08:04' UNION ALL
SELECT 6,1,'AtDest', '01 jan 2010 08:05' UNION ALL
SELECT 7,1,'Deliv', '01 jan 2010 08:06'


/**
* This would be the select code to exclude the duplicate entries.
* Sorting desc in the final row_number would get latest instead of first.
*
* Numbering per (TrackingID, Status) alone keeps only the first occurrence of
* a status ever seen, so a status that recurs after a different one (ID 5,
* 'InTransit' after 'Delayed') would be lost. Rows are therefore first grouped
* into runs of consecutive identical statuses, and the first row of each run
* is returned.
**/
;WITH runs AS
(
    SELECT ID,
           TrackingID,
           Status,
           StatusDate,
           --Position within the tracking ID minus position within the
           --(tracking ID, status) pair is constant inside one consecutive run.
           --Ordered by ID (could use date but 2 may be the same)
           ROW_NUMBER() OVER(PARTITION BY TrackingID ORDER BY ID)
         - ROW_NUMBER() OVER(PARTITION BY TrackingID, Status ORDER BY ID) AS [RunGroup]
    FROM @test
),
n AS
(
    SELECT ID,
           TrackingID,
           Status,
           StatusDate,
           --Number the rows of each run; Status is part of the key because two
           --different statuses can share the same RunGroup value
           ROW_NUMBER() OVER(PARTITION BY TrackingID, Status, RunGroup ORDER BY ID) AS [StatusNumber]
    FROM runs
)
SELECT ID,
       TrackingID,
       Status,
       StatusDate
FROM n
WHERE StatusNumber = 1
ORDER BY ID
+0

您查詢不會從原始表中返回條目'5'。 – Quassnoi 2010-10-05 15:06:04

2

這裏有沒有一個解決方案自聯接:

-- Return the first row of every run of consecutive identical statuses for the
-- tracking id given in @id, without a self-join.
WITH q AS
     (
     SELECT *,
       -- rn : position of the row in the whole timeline
       -- rns: position of the row among rows with the same status
       -- rn - rns stays constant while the same status repeats consecutively
       ROW_NUMBER() OVER (ORDER BY statusDate) AS rn,
       ROW_NUMBER() OVER (PARTITION BY status ORDER BY statusDate) AS rns
     FROM tracking
     WHERE trackingId = @id   -- was misspelled "tackingId"
     ),
     qs AS
     (
     SELECT *,
       -- status must be part of the partition: runs of two different statuses
       -- can produce the same rn - rns value and would otherwise be merged,
       -- hiding the first row of the later run
       ROW_NUMBER() OVER (PARTITION BY status, rn - rns ORDER BY statusDate) AS rnn
     FROM q
     )
SELECT *
FROM qs
WHERE rnn = 1
ORDER BY
     statusDate

這裏有一個腳本來檢查:

-- Verification script: builds a small timeline for one tracking id and returns
-- the first row of every run of consecutive identical statuses.
-- For the sample below the rows with id 1, 2, 5, 7, 8 and 10 start a run.
DECLARE @tracking TABLE
     (
     id INT NOT NULL PRIMARY KEY,
     trackingId INT NOT NULL,
     status INT,
     statusDate DATETIME
     )

INSERT
INTO @tracking
SELECT 1, 1, 1, DATEADD(d, 1, '2010-01-01')
UNION ALL
SELECT 2, 1, 2, DATEADD(d, 2, '2010-01-01')
UNION ALL
SELECT 3, 1, 2, DATEADD(d, 3, '2010-01-01')
UNION ALL
SELECT 4, 1, 2, DATEADD(d, 4, '2010-01-01')
UNION ALL
SELECT 5, 1, 3, DATEADD(d, 5, '2010-01-01')
UNION ALL
SELECT 6, 1, 3, DATEADD(d, 6, '2010-01-01')
UNION ALL
SELECT 7, 1, 4, DATEADD(d, 7, '2010-01-01')
UNION ALL
SELECT 8, 1, 2, DATEADD(d, 8, '2010-01-01')
UNION ALL
SELECT 9, 1, 2, DATEADD(d, 9, '2010-01-01')
UNION ALL
SELECT 10, 1, 1, DATEADD(d, 10, '2010-01-01')
;
WITH q AS
     (
     SELECT *,
       -- rn : position of the row in its tracking id's timeline
       -- rns: position among rows of the same tracking id and status
       ROW_NUMBER() OVER (PARTITION BY trackingId ORDER BY statusDate) AS rn,
       ROW_NUMBER() OVER (PARTITION BY trackingId, status ORDER BY statusDate) AS rns
     FROM @tracking
     ),
     qs AS
     (
     SELECT *,
       -- status is required in the partition: ids 5-6 (status 3) and ids 8-9
       -- (status 2) both yield rn - rns = 4, so partitioning by the difference
       -- alone merges the two runs and drops id 8 from the result
       ROW_NUMBER() OVER (PARTITION BY trackingId, status, rn - rns ORDER BY statusDate) AS rnn
     FROM q
     )
SELECT *
FROM qs
WHERE rnn = 1
ORDER BY
     trackingId,
     statusDate
+0

您是否認爲這在性能上與自連接相當,還是您希望它運行得更快? – JNK 2010-10-05 15:08:27

+0

+1 - 我不能讓我的工作! – JNK 2010-10-05 16:12:25

0

我覺得這個例子會做你要找的內容:

-- Example: copy status changes from dbo.srcStatus into dbo.tgtStatus, inserting
-- only the first row of each run of consecutive identical statuses per
-- TrackingId and skipping rows that already exist in the target.
CREATE TABLE dbo.srcStatus (
Id INT IDENTITY(1,1),
TrackingId INT NOT NULL,
[Status] VARCHAR(10) NOT NULL,
StatusDate DATETIME NOT NULL
);

CREATE TABLE dbo.tgtStatus (
Id INT IDENTITY(1,1),
TrackingId INT NOT NULL,
[Status] VARCHAR(10) NOT NULL,
StatusDate DATETIME NOT NULL
);

INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'PickedUp','10/1/2010 8:15 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'InTransit','10/2/2010 3:07 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'InTransit','10/2/2010 10:28 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'PickedUp','10/1/2010 8:15 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'InTransit','10/2/2010 3:07 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'Delayed','10/2/2010 10:28 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'Delayed','10/3/2010 9:52 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'InTransit','10/3/2010 8:09 PM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'AtDest','10/4/2010 1:42 PM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (1,'Deliv','10/4/2010 5:05 PM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'InTransit','10/3/2010 9:52 AM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'InTransit','10/3/2010 8:09 PM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'AtDest','10/4/2010 1:42 PM');
INSERT INTO dbo.srcStatus (TrackingId, [Status], StatusDate) VALUES (2,'Deliv','10/4/2010 5:05 PM');

WITH cteSrcTrackingIds
      -- TrackingIds that occur in the source
      AS (SELECT DISTINCT
                 TrackingId
          FROM   dbo.srcStatus
      ),
     cteAllTrackingIds
      -- Source rows plus the existing target rows of those TrackingIds.
      -- UNION (not UNION ALL) so a row present in both tables appears once.
      AS (SELECT TrackingId ,
                 [Status] ,
                 StatusDate
          FROM   dbo.srcStatus
          UNION
          SELECT tgtStatus.TrackingId ,
                 tgtStatus.[Status] ,
                 tgtStatus.StatusDate
          FROM   cteSrcTrackingIds
                 INNER JOIN dbo.tgtStatus ON cteSrcTrackingIds.TrackingId = tgtStatus.TrackingId
      ),
     cteAllTrackingIdsWithRownums
      -- Chronological position of every row within its TrackingId
      AS (SELECT TrackingId ,
                 [Status] ,
                 StatusDate ,
                 ROW_NUMBER() OVER (PARTITION BY TrackingId ORDER BY StatusDate) AS rownum
          FROM   cteAllTrackingIds
      ),
     cteTrackingIdsWorkingSet
      -- Pair each row (id) with every later row of a different status (id2)
      -- and rank those candidates so that rownum = 1 is the NEAREST one.
      -- The ranking must be ordered by src2.rownum: ordering by src.StatusDate
      -- is constant inside the partition and would pick an arbitrary later
      -- row, so rows of the same run could end up with different id2 values.
      AS (SELECT src.rownum AS [id] ,
                 src2.rownum AS [id2] ,
                 src.TrackingId ,
                 src.[Status] ,
                 src.StatusDate ,
                 ROW_NUMBER() OVER (PARTITION BY src.TrackingId,
                                                 src.rownum ORDER BY src2.rownum) AS rownum
          FROM   cteAllTrackingIdsWithRownums AS [src]
                 LEFT OUTER JOIN cteAllTrackingIdsWithRownums AS [src2] ON src.TrackingId = src2.TrackingId
                                                                       AND src.rownum < src2.rownum
                                                                       AND src.[Status] != src2.[Status]
      ),
     cteTrackingIdsSubset
      -- All rows of one run share the same id2 (the row that ends the run, or
      -- NULL for the final run); number them so rownum = 1 is the run's first row.
      AS (SELECT id ,
                 TrackingId ,
                 [Status] ,
                 StatusDate ,
                 ROW_NUMBER() OVER (PARTITION BY TrackingId, id2 ORDER BY id) AS rownum
          FROM   cteTrackingIdsWorkingSet
          WHERE  rownum = 1
      )
    INSERT INTO dbo.tgtStatus
            (TrackingId ,
             [status] ,
             StatusDate
            )
            SELECT cteTrackingIdsSubset.TrackingId ,
                   cteTrackingIdsSubset.[status] ,
                   cteTrackingIdsSubset.StatusDate
            FROM   cteTrackingIdsSubset
                   -- anti-join: keep only rows that are not yet in the target
                   LEFT OUTER JOIN dbo.tgtStatus ON cteTrackingIdsSubset.TrackingId = tgtStatus.TrackingId
                                                AND cteTrackingIdsSubset.[status] = tgtStatus.[status]
                                                AND cteTrackingIdsSubset.StatusDate = tgtStatus.StatusDate
            WHERE  cteTrackingIdsSubset.rownum = 1
                   AND tgtStatus.id IS NULL
            ORDER BY cteTrackingIdsSubset.TrackingId ,
                   cteTrackingIdsSubset.StatusDate;