找到確切的FK匹配

有一個非常大的表（超過200萬行）
SID INT，wordID的INT（PK SID，wordID的）找到確切的FK匹配

要查找的SID的具有完全相同的wordID的（沒有額外）
對於SID超過100的wordID完全匹配的機率下降，從而願意將其限制在100
（但想進入1000）

如果這是學校和SID是類和wordID的我們是學生。
然後，我想找到具有完全相同學生的課程。

SID，的wordID
1,1
1,2
1,3
2,2
2,3
3,1
3,4
5,1
5 ，2
6,2
6,3
7,1
7,2
8,1
8,1

SID 6和2具有完全相同的的wordID的
SID 5，圖7和8有完全相同的的wordID的

這是我迄今爲止
我想，以消除兩個刪除＃temp3_sID1_sID2和照顧，在插入件之上
但我會嘗試任何想法
它不喜歡你可以很容易地創建一個200個百萬行的表

測試

drop table #temp_sID_wordCount 
    drop table #temp_count_wordID_sID 
    drop table #temp3_wordID_sID_forThatCount 
    drop table #temp3_sID1_sID2 
    drop table #temp3_sID1_sID2_keep 
    create table #temp_sID_wordCount (sID int primary key, ccount int not null) 
    create table #temp_count_wordID_sID (ccount int not null, wordID int not null, sID int not null, primary key (ccount, wordID, sID)) 
    create table #temp3_wordID_sID_forThatCount (wordID int not null, sID int not null, primary key(wordID, sID)) 
    create table #temp3_sID1_sID2_keep (sID1 int not null, sID2 int not null, primary key(sID1, sID2)) 
    create table #temp3_sID1_sID2 (sID1 int not null, sID2 int not null, primary key(sID1, sID2)) 
    insert into #temp_sID_wordCount 
    select sID, count(*) as ccount 
    FROM [FTSindexWordOnce] with (nolock) 
    group by sID 
    order by sID; 
    select count(*) from #temp_sID_wordCount where ccount <= 100; -- 701,966 
    truncate table #temp_count_wordID_sID 
    insert into #temp_count_wordID_sID 
    select #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID 
    from #temp_sID_wordCount 
    join [FTSindexWordOnce] with (nolock) 
     on [FTSindexWordOnce].sID = #temp_sID_wordCount.sID 
    and ccount >= 1 and ccount <= 10 
    order by #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID; 
    select count(*) from #temp_sID_wordCount; -- 34,860,090 

    truncate table #temp3_sID1_sID2_keep 
    declare cur cursor for 
    select top 10 ccount from #temp_count_wordID_sID group by ccount order by ccount 

    open cur 
    declare @count int, @sIDcur int 
    fetch next from cur into @count 
    while (@@FETCH_STATUS = 0) 
    begin 
     --print (@count) 
     --select count(*), @count from #temp_sID_wordCount where #temp_sID_wordCount.ccount = @count 
     truncate table #temp3_wordID_sID_forThatCount 
     truncate table #temp3_sID1_sID2 

     -- wordID and sID for that unique word count 
     -- they can only be exact if they have the same word count 
     insert into #temp3_wordID_sID_forThatCount 
     select  #temp_count_wordID_sID.wordID 
       , #temp_count_wordID_sID.sID 
     from #temp_count_wordID_sID 
     where #temp_count_wordID_sID.ccount = @count 
     order by #temp_count_wordID_sID.wordID, #temp_count_wordID_sID.sID 

     -- select count(*) from #temp3_wordID_sID_forThatCount 

     -- this has some duplicates 
     -- sID1 is the group 
     insert into #temp3_sID1_sID2 
     select w1.sID, w2.sID 
     from #temp3_wordID_sID_forThatCount as w1 with (nolock) 
     join #temp3_wordID_sID_forThatCount as w2 with (nolock) 
      on w1.wordID = w2.wordID 
     and w1.sID <= w2.sID   
     group by w1.sID, w2.sID 
     having count(*) = @count 
     order by w1.sID, w2.sID 

     -- get rid of the goups of 1  
     delete #temp3_sID1_sID2 
     where sID1 in (select sID1 from #temp3_sID1_sID2 group by sID1 having count(*) = 1) 

     -- get rid of the double dips   
     delete #temp3_sID1_sID2 
     where #temp3_sID1_sID2.sID1 in 
       (select distinct s1del.sID1 -- these are the double dips 
       from #temp3_sID1_sID2 as s1base with (nolock) 
       join #temp3_sID1_sID2 as s1del with (nolock) 
        on s1del.sID1 > s1base.sID1 
       and s1Del.sID1 = s1base.sID2) 

     insert into #temp3_sID1_sID2_keep  
     select #temp3_sID1_sID2.sID1 
      , #temp3_sID1_sID2.sID2 
     from #temp3_sID1_sID2 with (nolock) 
     order by #temp3_sID1_sID2.sID1, #temp3_sID1_sID2.sID2 

    fetch next from cur into @count 
    end 
    close cur 
    deallocate cur 

select * 
FROM #temp3_sID1_sID2_keep with (nolock) 
order by 1,2

來源

2013-10-24 Paparazzi

所以，正如我所看到的，任務是找到相等的子集。

首先，我們可以找到對等的子集：

;with tmp1 as (select sID, cnt = count(wordID) from [Table] group by sID) 
select s1.sID, s2.sID 
from tmp1 s1 
    cross join tmp1 s2 
    cross apply (
     select count(1) 
     from [Table] d1 
      join [Table] d2 on d2.wordID = d1.wordID 
     where d1.sID = s1.sID and d2.sID = s2.sID 
    ) c(cnt) 
where s1.cnt = s2.cnt 
    and s1.sID > s2.sID 
    and s1.cnt = c.cnt

輸出是：

sID  sID 
----------- ----------- 
6   2 
7   5 
8   5 
8   7

然後對可以合併成組，如果必要的話：

sID   gNum 
----------- ----------- 
2   1 
6   1 
5   2 
7   2 
8   2

見下面的SqlFiddle示例中的詳細信息。

SqlFiddle Sample

另一種方法是計算哈希函數爲每個子集數據：

;with a as (
    select distinct sID from [Table] 
) 
select sID, 
    hashbytes('sha1', (
     select cast(wordID as varchar(10)) + '|' 
     from [Table] 
     where sID = a.sID 
     order by wordID 
     for xml path(''))) 
from a

然後子集可以基於哈希值進行分組。

SqlFiddle Sample

最後一個花了不到一分鐘，我的機器上的約1000萬行（20K SID值多達一千wordID的每個）的測試數據。您也可以通過排除沒有任何其他wordID計數匹配的sID來優化它。

來源

2013-10-24 22:29:24

它適用於示例數據，但在大型表格上它沒有在7小時內完成，所以我無法驗證。 – Paparazzi

@Blam，增加了其他的方法，請看到更新的答案。 –

令人敬畏的方法，但我有一個輕微的問題。我需要去最大字數600或我得到字符串或二進制數據將被截斷錯誤。如果我拿出哈希字節，我仍然得到錯誤，所以它是在XML中。但是在600的極限時間裏，它運行時間不到兩分鐘。 – Paparazzi

找到確切的FK匹配

回答

相關問題