2013-05-22 47 views
1

SQL Server 2008:用維度(Dim)填充間隙

我有一個如下的數據表:

#data 
--------------- 
Account AccountType 
--------------- 
1  2 
2  0 
3  5 
4  2 
5  1 
6  5 

AccountType 2 是標題(header),5 是總計(total)。也就是說,類型 2 的帳戶必須向後查找下一個類型爲 1 或 0 的帳戶,以確定其 Dim 值是 1 還是 0;類型 5 的總計帳戶必須向前查找最近的類型爲 1 或 0 的帳戶來確定其 Dim 值。類型 1 或 0 的帳戶以自身的類型作爲 Dim。

類型 2 的帳戶會以連續的「島」(island)形式出現,所以只檢查 RowNumber + 1 是不夠的;類型 5 同理。

我已經用 CTE 得到了下面這張表(T3),但找不到一種快速的方法,從這裏得到最終結果:所有帳戶的 Account、AccountType 和 Dim。

T3 
------------------- 
StartRow EndRow AccountType Dim 
------------------- 
1   1   2  0 
2   2   0  0 
3   3   5  0 
4   4   2  1 
5   5   0  1 
6   6   5  1 

下面的代碼是 MS T-SQL,可以整段複製粘貼並直接運行。CTE 查詢中最後的那個連接非常慢,即使只有 500 行也需要 30 秒,而我需要處理 100,000 行。我已經寫了一個基於遊標的解決方案,可以在 10–20 秒內完成;還有一個很快的遞歸 CTE 解決方案,能在 5 秒內處理 100,000 行,但它依賴於 #data 表的碎片化程度。需要補充的是,這是對實際問題的簡化,實際問題中要考慮的維度更多;但對這個簡化版本有效的方法,同樣適用於實際問題。

總之,有沒有一種使用連接或其他基於集合(set-based)的快速解決方案?

-- Suppress the "n rows affected" messages for the statements below.
SET NOCOUNT ON;

-- Drop any #data left over from a previous run in this session so the
-- script can be executed repeatedly.
IF OBJECT_ID('tempdb..#data') IS NOT NULL
BEGIN
    DROP TABLE #data;
END

-- Account is auto-numbered starting at 1 in steps of 1; AccountType holds
-- the account's type code.
CREATE TABLE #data
(
    Account     INTEGER IDENTITY(1, 1),
    AccountType INTEGER
);

BEGIN -- TEST DATA
    -- Fills #data with @MaxDataRows rows laid out in repeating groups of ten:
    --   slots 0-2 -> AccountType 2
    --   slots 3-7 -> AccountType 0 (first half of all rows) or 1 (second half)
    --   slots 8-9 -> AccountType 5
    -- Account itself is assigned by the IDENTITY column on #data.
    DECLARE @Counter INTEGER = 0
    DECLARE @MaxDataRows INTEGER = 50 -- Change here to check performance
    DECLARE @Type INTEGER
    DECLARE @Slot INTEGER -- position of the current row inside its group of ten

    WHILE (@Counter < @MaxDataRows)
    BEGIN
        SET @Slot = @Counter % 10

        -- Once slots < 3 and >= 8 are handled, only slots 3-7 remain, so no
        -- further slot test (and no catch-all branch) is needed.
        SET @Type = CASE
                        WHEN @Slot < 3 THEN 2
                        WHEN @Slot >= 8 THEN 5
                        WHEN @Counter < @MaxDataRows / 2.0 THEN 0
                        ELSE 1
                    END

        -- Explicit column list: the insert keeps working if #data gains columns.
        INSERT INTO #data (AccountType) VALUES (@Type)

        SET @Counter = @Counter + 1
    END
END -- TEST DATA END



-- Returns Account, AccountType and Dim for every row of #data that belongs
-- to an island for which a Dim can be determined.
--
-- The result is obtained by joining each row to its island on the island key
-- (AccountType, GroupId). That is an equality join, so it avoids the
-- row-by-island range comparison (Account BETWEEN StartRow AND EndRow).
-- Both forms return the same rows as long as Account is unique in #data,
-- because an island then contains exactly the accounts StartRow..EndRow.
;WITH groupIds_cte AS
(
    -- Consecutive accounts of the same type share one GroupId: within a run
    -- the per-type row number and Account grow in step, so their difference
    -- stays constant.
    SELECT
        Account,
        AccountType,
        ROW_NUMBER() OVER (PARTITION BY AccountType ORDER BY Account) - Account AS GroupId
    FROM #data
),

islandRanges_cte AS
(
    -- One row per island: its first and last account plus the island key.
    SELECT
        MIN(Account) AS StartRow,
        MAX(Account) AS EndRow,
        AccountType,
        GroupId
    FROM groupIds_cte
    GROUP BY GroupId, AccountType
),

T3 AS
(
    -- Type 2 islands take the type of the island that starts right after them.
    SELECT I.StartRow, I.EndRow, I.AccountType, I.GroupId, J.AccountType AS Dim
    FROM islandRanges_cte AS I
    INNER JOIN islandRanges_cte AS J
        ON I.EndRow + 1 = J.StartRow
    WHERE I.AccountType = 2

    UNION ALL

    -- Type 5 islands take the type of the island that ends right before them.
    SELECT I.StartRow, I.EndRow, I.AccountType, I.GroupId, J.AccountType AS Dim
    FROM islandRanges_cte AS I
    INNER JOIN islandRanges_cte AS J
        ON I.StartRow - 1 = J.EndRow
    WHERE I.AccountType = 5

    UNION ALL

    -- Type 0 and type 1 islands use their own type as Dim.
    SELECT I.StartRow, I.EndRow, I.AccountType, I.GroupId, I.AccountType AS Dim
    FROM islandRanges_cte AS I
    WHERE I.AccountType IN (0, 1)
)

SELECT G.Account, G.AccountType, I.Dim
FROM groupIds_cte AS G
INNER JOIN T3 AS I
    ON I.AccountType = G.AccountType
   AND I.GroupId = G.GroupId
ORDER BY G.Account

編輯:加入了計時測試

SET NOCOUNT ON

-- #times collects one row per timed method and run. It is created only when
-- missing, so results from earlier runs in the same session are kept.
-- The existence check must look at #times itself: checking another table
-- would skip the CREATE (or attempt it twice) depending on session state.
IF OBJECT_ID('tempdb..#times') IS NULL
CREATE TABLE #times
(
RecId INTEGER IDENTITY(1,1),
Batch INTEGER,
Method NVARCHAR(255),
MethodDescription NVARCHAR(255),
RunTime INTEGER -- elapsed milliseconds
)

-- #batch hands out a run number: each execution inserts one row, and the
-- timing inserts read MAX(Batch) to tag their rows with the current run.
IF OBJECT_ID('tempdb..#batch') IS NULL
CREATE TABLE #batch
(
Batch INTEGER IDENTITY(1,1),
Bit BIT
)

INSERT INTO #batch (Bit) VALUES (0)

-- #data and #islands are rebuilt from scratch on every run.
IF OBJECT_ID('tempdb..#data') IS NOT NULL
    DROP TABLE #data

CREATE TABLE #data
(
Account INTEGER
)

CREATE NONCLUSTERED INDEX data_account_index ON #data (Account)

IF OBJECT_ID('tempdb..#islands') IS NOT NULL
    DROP TABLE #islands

-- One row per island: the inclusive account range and the Dim it maps to.
CREATE TABLE #islands
(
AccountFrom INTEGER,
AccountTo INTEGER,
Dim INTEGER
)

CREATE NONCLUSTERED INDEX islands_from_index ON #islands (AccountFrom, AccountTo, Dim)

BEGIN -- TEST DATA
    -- Account numbers 1..100000, produced by numbering the rows of a cross
    -- join of master..spt_values with itself.
    INSERT INTO #data (Account)
    SELECT TOP 100000 ROW_NUMBER() OVER (ORDER BY v1.number) AS N
    FROM master..spt_values AS v1
    CROSS JOIN master..spt_values AS v2

    -- Split the accounts into ten buckets in Account order; each bucket
    -- becomes one island, and the bucket number is stored as its Dim.
    ;WITH bucketed AS
    (
        SELECT Account, NTILE(10) OVER (ORDER BY Account) AS Grp
        FROM #data
    )
    INSERT INTO #islands (AccountFrom, AccountTo, Dim)
    SELECT MIN(Account) AS Start, MAX(Account), Grp
    FROM bucketed
    GROUP BY Grp
    ORDER BY Start
END -- TEST DATA END

--SELECT * FROM #data 
--SELECT * FROM #islands 

-- Benchmark: four ways of mapping every #data.Account to the Dim of the
-- #islands row whose range contains it. After each method the elapsed
-- milliseconds are logged to #times, tagged with the newest #batch number,
-- and the timer is restarted.
-- Alternative to logging into #times: print the elapsed time instead, e.g.
--PRINT CONVERT(varchar(20),DATEDIFF(MS,@RunDate,GETDATE()))+' ms Sub Query' 
DECLARE @RunDate datetime 
SET @RunDate=GETDATE() 

-- Method 1: correlated scalar subquery. Account inside the subquery refers
-- to the outer #data row, since #islands has no Account column.
SELECT Account, (SELECT Dim From #islands WHERE Account BETWEEN AccountFrom AND AccountTo) AS Dim 
FROM #data 

INSERT INTO #times VALUES ((SELECT MAX(Batch) FROM #batch) ,'subquery','',DATEDIFF(MS,@RunDate,GETDATE())) 
SET @RunDate=GETDATE() 

-- Method 2: CROSS APPLY of the matching island rows to each account.
SELECT D.Account, V.Dim 
FROM #data D 
CROSS APPLY 
(
SELECT Dim From #islands V 
WHERE D.Account BETWEEN V.AccountFrom AND V.AccountTo 
) V 

INSERT INTO #times VALUES ((SELECT MAX(Batch) FROM #batch) ,'crossapply','',DATEDIFF(MS,@RunDate,GETDATE())) 
SET @RunDate=GETDATE() 

-- Method 3: plain inner join on the range predicate.
SELECT D.Account, I.Dim 
FROM #data D 
JOIN #islands I 
ON D.Account BETWEEN I.AccountFrom AND I.AccountTo 

INSERT INTO #times VALUES ((SELECT MAX(Batch) FROM #batch) ,'join','',DATEDIFF(MS,@RunDate,GETDATE())) 
SET @RunDate=GETDATE() 

-- Method 4: recursive CTE that expands each island into its accounts
-- without reading #data at all.
;WITH cte AS 
(
-- Anchor: two seed rows per island, one at AccountFrom and one at AccountTo.
SELECT Account, AccountFrom, AccountTo, Dim, 1 AS Counting 
FROM #islands 
CROSS APPLY (VALUES(AccountFrom),(AccountTo)) V (Account) 
UNION ALL 
-- Recursive step: emit Account + 1 while it lies strictly between the two
-- endpoints. The AccountFrom seed therefore walks up to AccountTo - 1; the
-- AccountTo seed never recurses. Counting is the step number along the walk.
-- NOTE(review): an island with AccountFrom = AccountTo would be seeded twice
-- and so appear twice in the output - confirm whether such islands can occur.
SELECT Account + 1 ,AccountFrom, AccountTo, Dim, Counting + 1 
FROM cte 
WHERE (Account + 1) > AccountFrom AND (Account + 1) < AccountTo 
) 
-- MAXRECURSION caps the recursion depth, so an island needing more than
-- 32767 steps would make this statement fail.
SELECT Account, Dim, Counting FROM cte OPTION(MAXRECURSION 32767) 

INSERT INTO #times VALUES ((SELECT MAX(Batch) FROM #batch) ,'recursivecte','',DATEDIFF(MS,@RunDate,GETDATE())) 

可以從 #times 表中查詢,查看各種方法的運行時間 :)

+0

。 。如果您有工作代碼,請將其發佈在您的問題中。 –

+0

我試着發佈最低工作示例!謝謝 – CodeMonkey

+0

我編輯了這個問題,以更好地反映我在找什麼,幷包含示例代碼。謝謝! – CodeMonkey

回答

0

我想你需要的是一個連接(join),只不過連接條件使用不等式而不是等式:

-- Range join: pair every tally row with the island whose inclusive
-- "from".."to" interval contains its id.
select
    tt.id,
    tt.dim1,
    it.dim2
from TallyTable as tt
inner join IslandsTable as it
    on tt.id >= it."from"
   and tt.id <= it."to"

這適用於您在問題中提供的數據。

這是另一個可能工作的想法。以下是查詢:

-- For every account, look up the AccountType of the next account (by
-- Account number) whose type is neither 2 nor 5.
-- The ORDER BY inside the subquery is what makes TOP 1 mean "the nearest
-- following account"; without it TOP 1 may return any qualifying row.
-- nextAccountType is NULL when no such account follows.
select d.Account,
       d.AccountType,
       (select top 1 d2.AccountType
        from #data d2
        where d2.Account > d.Account
          and d2.AccountType not in (2, 5)
        order by d2.Account
       ) as nextAccountType
from #data d
order by d.Account;

我剛剛在 50,000 行上運行了這個查詢,在我的系統上花了 17 秒。把表改成下面這樣之後:

-- Same #data table as in the question, with Account declared as the
-- primary key so that lookups by Account can use the key's index.
CREATE TABLE #data
(
    Account     INTEGER IDENTITY(1, 1) PRIMARY KEY,
    AccountType INTEGER
);

實際上它反而變慢到大約 1 分 33 秒——這讓我很意外。也許這兩種寫法中的某一種會對你有幫助。

+0

給出正確的結果,但它很慢 – CodeMonkey

+0

找到原因了:是我的 ORDER BY 子句導致變慢的,但不知道爲什麼會這麼慢。 – CodeMonkey