這樣做行不通嗎?
-- Demo 1: fill a table variable with 10,000 rows of random integers, then
-- list every row that repeats another row across all six columns.
DECLARE @transactions TABLE (
    ColA INT,
    ColB INT,
    ColC INT,
    ColD INT,
    ColE INT,
    ColF INT
);

-- Number of rows generated so far; the loop adds exactly one row per pass.
DECLARE @RowsInserted INT = 0;

WHILE @RowsInserted < 10000
BEGIN
    -- ROUND(RAND() * 10, 0) produces a whole number in the range 0..10.
    INSERT INTO @transactions (ColA, ColB, ColC, ColD, ColE, ColF)
    SELECT
        ROUND(RAND() * 10, 0),
        ROUND(RAND() * 10, 0),
        ROUND(RAND() * 10, 0),
        ROUND(RAND() * 10, 0),
        ROUND(RAND() * 10, 0),
        ROUND(RAND() * 10, 0);

    SET @RowsInserted += 1;
END;

-- Number the rows inside each group of identical six-column values.
-- Any row numbered 2 or higher is a repeat of another row in its group.
WITH Dupe AS (
    SELECT
        ColA,
        ColB,
        ColC,
        ColD,
        ColE,
        ColF,
        ROW_NUMBER() OVER (
            PARTITION BY ColA, ColB, ColC, ColD, ColE, ColF
            ORDER BY ColA, ColB, ColC, ColD, ColE, ColF
        ) AS rn
    FROM @transactions
)
SELECT
    ColA,
    ColB,
    ColC,
    ColD,
    ColE,
    ColF,
    rn
FROM Dupe
WHERE rn > 1;
對於任何需要比較可能爲空值(NULL)的值的情況,您都可以使用ISNULL。請注意,我編寫的大部分代碼只是用來生成有用的數據集。在6列、10,000行的情況下,我在不到一秒的時間內找到了42個重複行,其中沒有出現三次的行。將數據量提高到100,000行後,我得到了3,489個重複行,其中包括一些出現三次的行,耗時3秒。
下面是使用文本數據的示例。整個腳本在100,000條記錄上運行了25秒,不過我的計時顯示,其中只有不到4秒用於查找重複記錄,其餘時間都花在填充表格上。
-- Demo 2 setup: a six-column text table to search for duplicates, a lookup
-- table of sample names, and the scalar variables used while generating rows.
DECLARE @transactions2 TABLE (
    ColA NVARCHAR(30),
    ColB NVARCHAR(30),
    ColC NVARCHAR(30),
    ColD NVARCHAR(30),
    ColE NVARCHAR(30),
    ColF NVARCHAR(30)
);

-- ID is generated automatically, so each seeded name gets a sequential ID.
DECLARE @names TABLE (
    ID INT IDENTITY,
    Name NVARCHAR(30)
);

-- Loop counter plus one holding variable per target column.
DECLARE @Counter2 INT = 0;
DECLARE @ColA NVARCHAR(30);
DECLARE @ColB NVARCHAR(30);
DECLARE @ColC NVARCHAR(30);
DECLARE @ColD NVARCHAR(30);
DECLARE @ColE NVARCHAR(30);
DECLARE @ColF NVARCHAR(30);

-- 26 sample names (one per letter) plus a NULL entry, so the generated data
-- also contains NULL values.
INSERT INTO @names (Name)
VALUES
    ('Anderson, Arthur'),
    ('Broberg, Bruce'),
    ('Chan, Charles'),
    ('Davidson, Darwin'),
    ('Eggert, Emily'),
    ('Fox, Francesca'),
    ('Garbo, Greta'),
    ('Hollande, Hortense'),
    ('Iguadolla, Ignacio'),
    ('Jackson, Jurimbo'),
    ('Katana, Ken'),
    ('Lawrence, Larry'),
    ('McDonald, Michael'),
    ('Nyugen, Nathan'),
    ('O''Dell, Oliver'),
    ('Peterson, Phillip'),
    ('Quigley, Quentin'),
    ('Ramallah, Rodolfo'),
    ('Smith, Samuel'),
    ('Turner, Theodore'),
    ('Uno, Umberto'),
    ('Victor, Victoria'),
    ('Wallace, William'),
    ('Xing, Xiopan'),
    ('Young, Yvette'),
    ('Zapata, Zorro'),
    (NULL);
-- Generate 100,000 rows for @transactions2, one row per iteration.
-- Each column value is a name picked at random from @names; the lookup
-- expression ROUND(RAND()*27 +.5,0) yields an ID from 1 to 27, matching the
-- 27 rows seeded into @names (26 names plus one NULL).
WHILE @Counter2 < 100000
BEGIN
    SET @Counter2 += 1;

    SET @ColA = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));
    SET @ColB = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));
    SET @ColC = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));
    SET @ColD = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));
    SET @ColE = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));
    SET @ColF = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0));

    -- The sixth value must be @ColF. Inserting @ColD a second time made ColF
    -- a copy of ColD, so rows varied in only five columns and the duplicate
    -- count was inflated.
    INSERT INTO @transactions2 (ColA, ColB, ColC, ColD, ColE, ColF)
    SELECT @ColA, @ColB, @ColC, @ColD, @ColE, @ColF;
END
-- Print a timestamp before and after the duplicate search so its cost can be
-- read separately from the time spent generating the data.
PRINT CAST(GETDATE() AS DateTime2 (3));

-- Number the rows inside each group of identical six-column values and return
-- every row numbered 2 or higher, i.e. each repeat of another row.
--
-- The columns are partitioned directly, without ISNULL(col, ''): PARTITION BY
-- already places NULLs in the same partition, whereas ISNULL(col, '') would
-- also report an empty string and a NULL as duplicates of each other.
--
-- ROW_NUMBER() requires an ORDER BY, but every row in a partition is identical,
-- so no ordering is meaningful; ORDER BY (SELECT NULL) states that explicitly.
WITH Dupe AS (
    SELECT
        ColA,
        ColB,
        ColC,
        ColD,
        ColE,
        ColF,
        ROW_NUMBER() OVER (
            PARTITION BY ColA, ColB, ColC, ColD, ColE, ColF
            ORDER BY (SELECT NULL)
        ) AS rn
    FROM @transactions2
)
SELECT
    ColA,
    ColB,
    ColC,
    ColD,
    ColE,
    ColF,
    rn
FROM Dupe
WHERE rn > 1
ORDER BY rn;

PRINT CAST(GETDATE() AS DateTime2 (3));
請從WHERE子句中刪除COALESCE,它們是不必要的 –
你的表有唯一值的列嗎?如果有,我會把它放進EXISTS的分組中,並添加 x.uniqueid <> y.uniqueid。此外,EXISTS實際上不會返回任何值,因此我只會寫 SELECT 1 或其他常量值。我也會留意你的COALESCE匹配:按現在的寫法,它會把''和NULL視爲相等。 –