2013-05-07 79 views
1

我有兩個數據表,其中包含的數據類似如下(TSQL:將鏈接數據拆分為匹配表和不匹配表):

| id | name | dob |   | name | dob | 
|-------|------|----------|   |------|----------| 
| 12345 | ABC | 20010301 |   | ABC | 20010301 | - matching record 
| 45678 | DEF | 20010425 |   | XYZ | 20010301 | - unmatched record 

是否有可能寫一個查詢來比較這兩個表,然後創建一個匹配表和一個不匹配表,同時保持原始(original)表的結構/數據不變?

Match Table  Unmatched Table 
| id | rank |  | id | rank | 
|-------|------|  |-------|------| 
| 12345 | 1 |  | 45678 | NULL | 

我嘗試過使用MERGE,但它要求我必須對其中一個源表執行插入/更新,而這已經到了我TSQL水平的極限——而且我要處理的數據集超過3000萬行——有什麼意見/建議嗎?
我目前寫出的SQL如下所示(字段並不完全對應,但原理是一樣的):

-- Sample fixtures for the question: a cohort table with four fixed IDs and a
-- link table in which only 'aaaa' is deliberately shared with the cohort.
CREATE TABLE #Cohort
(
    [ID]         varchar(4),
    [match rank] int
);

INSERT INTO #Cohort ([ID], [match rank])
VALUES
    ('aaaa', NULL),
    ('bbbb', NULL),
    ('cccc', NULL),
    ('dddd', NULL);

CREATE TABLE #link
(
    [ID]         varchar(4),
    [match rank] int
);

-- The other three link IDs are the first four characters of a fresh NEWID().
INSERT INTO #link ([ID], [match rank])
VALUES
    (LEFT(NEWID(), 4), NULL),
    (LEFT(NEWID(), 4), NULL),
    ('aaaa', NULL),
    (LEFT(NEWID(), 4), NULL);

-- Empty result tables with the same two columns as the inputs.
CREATE TABLE #Matches
(
    [ID]         varchar(4),
    [match rank] int
);

CREATE TABLE #Unmatched
(
    [ID]         varchar(4),
    [match rank] int
);

-- Asker's attempt: MERGE the set of matching cohort IDs back into #Cohort.
-- The source (sc) is the distinct list of #Cohort IDs that also appear in
-- #link, each tagged with a constant match rank of 1.
MERGE #Cohort tg 
USING (SELECT distinct c.[ID], 1 as [match rank] 
     from #Cohort c 
     INNER JOIN #link as h on c.[ID]=h.[ID]) sc 
ON (tg.[ID] = sc.[ID]) 
-- Every source ID is read from #Cohort itself, so each one already has a
-- target row; this INSERT branch has nothing to act on.
WHEN NOT MATCHED BY TARGET 
    THEN INSERT([ID],[match rank]) VALUES(sc.[ID],sc.[match rank]) 
-- Cohort rows with no counterpart in #link are removed from #Cohort ...
WHEN NOT MATCHED BY SOURCE 
    THEN DELETE 
-- ... and the removed rows are captured in #Unmatched. #Matches is not
-- written by this statement, and #Cohort itself is modified.
OUTPUT Deleted.* INTO #Unmatched; 

回答

0

使用CTE,最後#Matches中將是匹配的行,#Unmatched中將是不匹配的行。就目前而言,您的MERGE語句會刪除#Cohort表中的行,只留下值為aaaa的那一行。

-- Rebuild the question's fixtures: four fixed cohort IDs, and a link table
-- holding 'aaaa' plus three IDs cut from freshly generated GUIDs.
CREATE TABLE #Cohort
(
    [ID]         VARCHAR(4),
    [MATCH RANK] INT
);

INSERT INTO #Cohort ([ID], [MATCH RANK])
VALUES
    ('aaaa', NULL),
    ('bbbb', NULL),
    ('cccc', NULL),
    ('dddd', NULL);

CREATE TABLE #link
(
    [ID]         VARCHAR(4),
    [MATCH RANK] INT
);

INSERT INTO #link ([ID], [MATCH RANK])
VALUES
    (LEFT(NEWID(), 4), NULL),
    (LEFT(NEWID(), 4), NULL),
    ('aaaa', NULL),
    (LEFT(NEWID(), 4), NULL);

-- Destination tables; both start empty and are filled by the inserts that follow.
CREATE TABLE #Matches
(
    [ID]         VARCHAR(4),
    [MATCH RANK] INT
);

CREATE TABLE #Unmatched
(
    [ID]         VARCHAR(4),
    [MATCH RANK] INT
);

-- Fill #Matches with each distinct (ID, MATCH RANK) pair from #Cohort whose ID
-- also occurs in #link. #Cohort and #link are only read, never modified.
;WITH CohortHits AS
(
    SELECT DISTINCT
        coh.[ID],
        coh.[MATCH RANK]
    FROM #Cohort AS coh
    WHERE EXISTS
    (
        SELECT 1
        FROM #link AS lnk
        WHERE lnk.[ID] = coh.[ID]
    )
)
INSERT INTO #Matches ([ID], [MATCH RANK])
SELECT
    hit.[ID],
    hit.[MATCH RANK]
FROM CohortHits AS hit;

-- Fill #Unmatched with each distinct #link row whose ID has no counterpart in
-- #Cohort.
-- NOT EXISTS is used instead of NOT IN because [ID] is declared without
-- NOT NULL: a single NULL ID in #Cohort would make NOT IN reject every row.
-- A #link row whose own ID is NULL is treated as unmatched.
-- NOTE(review): this reports link-side rows; if the cohort rows lacking a link
-- are what is wanted, swap the two tables - confirm against the requirement.
;WITH NonMatchedTbl AS
(
    SELECT DISTINCT
        l.[ID],
        l.[MATCH RANK]
    FROM #link AS l
    WHERE NOT EXISTS
    (
        SELECT 1
        FROM #Cohort AS c
        WHERE c.[ID] = l.[ID]
    )
)
INSERT INTO #Unmatched ([ID], [MATCH RANK])
SELECT
    [ID],
    [MATCH RANK]
FROM NonMatchedTbl;

-- Show the two input tables followed by the two derived tables.
-- Table names are spelled exactly as created (#link, not #Link) so the script
-- also runs where tempdb uses a case-sensitive collation.
SELECT [ID], [MATCH RANK] FROM #Cohort;
SELECT [ID], [MATCH RANK] FROM #link;
SELECT [ID], [MATCH RANK] FROM #Matches;
SELECT [ID], [MATCH RANK] FROM #Unmatched;

-- Remove the temp tables so the script can be re-run in the same session.
DROP TABLE #Cohort;
DROP TABLE #link;
DROP TABLE #Matches;
DROP TABLE #Unmatched;
0

查找匹配/不匹配記錄的標準方法是執行左連接並在左連接表中查找空值。

-- Matched rows: every Table1 id that has at least one Table2 row with the same
-- name; rank is the number of such Table2 rows.
-- Written as an INNER JOIN because a LEFT JOIN filtered by
-- "t2.name IS NOT NULL" keeps exactly the same rows.
-- No ORDER BY: row order in a table created by SELECT ... INTO is not guaranteed.
-- NOTE(review): rows are paired on name only; the question's sample data
-- suggests dob may need to match as well - confirm.
SELECT
    t1.id,
    COUNT(t2.name) AS rank
INTO #MatchedTable
FROM Table1 AS t1
INNER JOIN Table2 AS t2
    ON t2.name = t1.name
GROUP BY t1.id;

和:

-- Unmatched rows: each distinct Table1 id for which the left join found no
-- Table2 row with the same name; rank is left NULL.
SELECT DISTINCT
    src.id,
    NULL AS rank
INTO #UnmatchedTable
FROM Table1 AS src
LEFT JOIN Table2 AS probe
    ON probe.name = src.name
WHERE probe.name IS NULL
ORDER BY src.id

希望這對您有所幫助。

0

如果您要處理海量(MASSIVE)數據,可以嘗試兩種做法。如果您仍想使用MERGE語句,可以嘗試分批(BATCHES)執行,而不是一次處理全部數據。或者,您可以先分配批次,然後直接插入。無論採用哪種方式,我都建議使用一個暫存區域(staging area),在其上創建索引,然後再插入。可以使用NTILE函數來分配批次。下面是一個自包含的示例,可在SQL Server 2008或更高版本上運行:

-- Demo data: nine people, and fifteen orders that reference person ids 1 to 7.
-- personID and OrderID are IDENTITY columns, so they are not listed in the inserts.
DECLARE @Person TABLE
(
    personID int IDENTITY,
    person   varchar(8)
);

INSERT INTO @Person (person)
VALUES ('Brett'), ('Sean'), ('Chad'), ('Michael'), ('Ray'), ('Erik'), ('Quyen'), ('John'), ('Tim');

DECLARE @Orders TABLE
(
    OrderID    int IDENTITY,
    PersonID   int,
    Desciption varchar(32),
    Amount     int
);

INSERT INTO @Orders (PersonID, Desciption, Amount)
VALUES
    (1, 'Shirt', 20),
    (1, 'Shoes', 50),
    (2, 'Shirt', 22),
    (2, 'Shoes', 52),
    (3, 'Shirt', 20),
    (3, 'Shoes', 50),
    (3, 'Hat', 20),
    (4, 'Shirt', 20),
    (5, 'Shirt', 20),
    (5, 'Pants', 30),
    (6, 'Shirt', 20),
    (6, 'RunningShoes', 70),
    (7, 'Shirt', 22),
    (7, 'Shoes', 40),
    (7, 'Coat', 80);

-- Staging area: every person joined to their orders (if any), with each
-- resulting row assigned to one of five batches.
DECLARE @Storage TABLE
(
    batch      int,
    personid   int,
    person     varchar(8),
    orderid    int,
    Desciption varchar(32),
    amount     int
);

INSERT INTO @Storage (batch, personid, person, orderid, Desciption, amount)
SELECT
    -- NTILE(5) splits the whole ordered result into five groups of near-equal
    -- size (500 rows would give 100 rows per batch). OrderID is added as a
    -- tie-breaker so rows that share a person always get the same batch number.
    NTILE(5) OVER (ORDER BY p.personID, o.OrderID),
    p.personID,
    p.person,
    o.OrderID,
    o.Desciption,
    o.Amount
FROM @Person AS p
-- LEFT JOIN keeps people who have no orders; their order columns are NULL.
LEFT JOIN @Orders AS o
    ON o.PersonID = p.personID;

-- Batch selector: only @Storage rows whose batch equals this value are copied
-- by the inserts that follow.
DECLARE @Cursor int;
SET @Cursor = 5;

-- Stand-in destination tables for the matched and unmatched rows.
DECLARE @Matched TABLE
(
    personid   int,
    person     varchar(8),
    orderid    int,
    Desciption varchar(32),
    amount     int
);

DECLARE @UnMatched TABLE
(
    personid    int,
    person      varchar(8),
    orderid     int,
    description varchar(32),
    amount      int
);


-- Copy the selected batch's rows that have an order into @Matched.
-- Columns are listed explicitly and spelled as declared on @Storage and
-- @Matched, so the insert does not depend on column position or on a
-- case-insensitive collation.
INSERT INTO @Matched (personid, person, orderid, Desciption, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Desciption,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NOT NULL;


-- Copy the selected batch's rows that have no order (orderid is NULL) into
-- @UnMatched.
-- Columns are listed explicitly and spelled as declared; note the target
-- column is "description" while the staging column is "Desciption".
INSERT INTO @UnMatched (personid, person, orderid, description, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Desciption,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NULL;

-- Display what the selected batch produced in each destination table.
SELECT personid, person, orderid, Desciption, amount
FROM @Matched;

SELECT personid, person, orderid, description, amount
FROM @UnMatched;

我的例子很簡單,但您可以更改「遊標」(@Cursor)變量的值,看到從暫存表中取出的結果會隨之不同。由於採用了批處理,我不必一次處理整個數據集;我可以先把數據存放在暫存表中,然後編寫一個存儲過程,根據一個不斷變化的遊標或整數值來執行插入操作。您甚至可以添加一個bit類型的列,用來標記數據是否已被處理。