2012-08-06 35 views
1

我使用SQL Server 2008拆分和匹配行

我的表看起來像這樣:

ID  Column 
---------------------------------- 
1  This is a Sample Text 
2  Sample Text is typed here 
3  Here the sample text is 
4  Typing a sample 

,我需要的輸出是這樣的:

ID Column      MostCommon Common1 Common2 NonCommon 
--------------------------------------------------------------------------------------- 
1 This is a Sample Text  Sample  Text  is  This a 
2 Sample Text is typed here Sample  Text  is  typed here 
3 Here the sample text is Sample  Text  is  Here the 
4 Typing a sample   sample  NULL  NULL  Typing A 

任何人都可以幫助我在SQL Server 2008中編寫sp /函數/查詢

'sample' 出現在所有行中,所以我可以把它作爲最常見的單詞;'Text' 和 'is' 是其次常見的單詞,可以在第 1、2、3 行中找到。所有其他單詞都無法與其他行匹配,因此歸入非常見類別。

+1

能否請你**用文字說明**你想要實現什麼?只給我們輸入和輸出,讓我們**猜測**你想做什麼並不是很有用…… – 2012-08-06 06:56:52

+0

爲什麼 'Text' 優先於 'is' 成爲 'Common1'?(是基於長度嗎?)定義「常見」的規則是什麼?(出現在 75% 的行中,還是出現在除 1 行之外的所有行中?)是否會有行數 <> 4 的情況? – 2012-08-06 06:57:23

+0

查找行之間的常見模式。 – mvskumar 2012-08-06 07:04:33

回答

0

以下是您可以如何操作的方法,首先必須創建一個函數,用於分割字符串,然後計算出現次數並按照需要進行顯示。這使得它更復雜,因爲你想顯示可變數量的列:

-- One-time prerequisite: dbo.SplitStrings_XML, an inline table-valued function
-- that splits @List on @Delimiter and returns one row per piece in a single
-- column named Item (nvarchar(4000)).
--
-- How to install: copy the statement out of the comment below and execute it on
-- its own. CREATE FUNCTION must be the only statement in its batch.
-- There is deliberately no GO inside the comment: SSMS/sqlcmd split batches on a
-- line starting with GO even when it sits between the comment delimiters, which
-- leaves an unmatched comment delimiter in each batch and raises a syntax error.
--
-- NOTE: @List is spliced into the XML text without escaping, so input containing
-- '&' or '<' will make CONVERT(XML, ...) fail.
/*
CREATE FUNCTION dbo.SplitStrings_XML
(
    @List      NVARCHAR(MAX),
    @Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
    RETURN
    (
        SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
        FROM
        (
            SELECT x = CONVERT(XML, '<i>' +
                REPLACE(@List, @Delimiter, '</i><i>') + '</i>').query('.')
        ) AS a CROSS APPLY x.nodes('i') AS y(i)
    );
*/

-- Demo script: for every row, report the words it shares with other rows
-- ("common" words, one output column per word) and the words that are not
-- shared (concatenated into a single NonCommon column).
-- Requires dbo.SplitStrings_XML to exist in the current database.

-- Sample input.
CREATE TABLE #t (ID INT, Col VARCHAR(1000));

INSERT INTO #t (ID, Col)
VALUES
    (1, 'This is a Sample Text'),
    (2, 'Sample Text is typed here'),
    (3, 'Here the sample text is'),
    (4, 'Typing a sample');

-- A word counts as "common" only when its total number of occurrences is
-- strictly greater than this value (the comparisons below use >, not >=).
DECLARE @MinimumNumberOfOccurances INT = 2;

-- One row per (source row, word).
SELECT
    src.ID,
    src.Col,
    parts.Item
INTO #SplitedStrings
FROM #t AS src
CROSS APPLY dbo.SplitStrings_XML(src.Col, N' ') AS parts;

-- Total occurrences of each word across all rows.
SELECT
    Item,
    COUNT(*) AS cnt
INTO #SplitedStringsGrouped
FROM #SplitedStrings
GROUP BY Item;

-- Every word of every row; cnt is filled in only for common words and is
-- NULL for the rest (the threshold lives in the ON clause on purpose so the
-- outer join keeps the non-common words).
SELECT
    s.ID,
    s.Col,
    s.Item,
    g.cnt
INTO #ResultTable
FROM #SplitedStrings AS s
LEFT JOIN #SplitedStringsGrouped AS g
    ON g.Item = s.Item
   AND g.cnt > @MinimumNumberOfOccurances;

-- Comma-separated, bracket-quoted list of the common words, used as the
-- PIVOT column list. QUOTENAME escapes any ']' inside a word; TYPE/.value()
-- stops FOR XML from turning characters such as '>' into entities.
DECLARE @ColumnNames NVARCHAR(MAX) = STUFF(
(
    SELECT N',' + QUOTENAME(Item)
    FROM #SplitedStringsGrouped
    WHERE cnt > @MinimumNumberOfOccurances
    FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
, 1, 1, N'');

-- SELECT-list fragment "MAX([word]) AS [CommonN]", numbered from 0 with the
-- most frequent (then longest) word first; Item is only a tie-breaker that
-- makes the numbering deterministic.
DECLARE @TableHeader NVARCHAR(MAX) = STUFF(
(
    SELECT N',MAX(' + QUOTENAME(Item) + N') AS '
         + QUOTENAME(N'Common' + CAST(
               ROW_NUMBER() OVER (ORDER BY cnt DESC, LEN(Item) DESC, Item) - 1
               AS NVARCHAR(10)))
    FROM #SplitedStringsGrouped
    WHERE cnt > @MinimumNumberOfOccurances
    ORDER BY cnt DESC, LEN(Item) DESC, Item
    FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
, 1, 1, N'');

-- Words that did not reach the threshold.
SELECT
    ID,
    Item
INTO #NonCommon
FROM #ResultTable
WHERE cnt IS NULL;

IF @ColumnNames IS NULL
BEGIN
    -- No word is common, so there is nothing to pivot: return each row with
    -- all of its words in NonCommon instead of silently returning nothing.
    SELECT
        src.ID,
        src.Col AS [Column],
        RTRIM((
            SELECT n.Item + ' '
            FROM #NonCommon AS n
            WHERE n.ID = src.ID
            FOR XML PATH(''), TYPE
        ).value('.', 'NVARCHAR(MAX)')) AS NonCommon
    FROM #t AS src
    ORDER BY src.ID;
END
ELSE
BEGIN
    -- The number of CommonN columns depends on the data, hence dynamic SQL.
    -- PIVOT groups by the remaining columns (ID, Col, cnt), which yields
    -- several rows per ID; the outer GROUP BY collapses them back to one.
    -- The order of the words inside NonCommon is not guaranteed because the
    -- splitter does not expose word positions.
    DECLARE @sql NVARCHAR(MAX) = N'
SELECT pvt.ID,
       MAX(pvt.Col) AS [Column],
       ' + @TableHeader + N',
       RTRIM((
           SELECT n.Item + '' ''
           FROM #NonCommon AS n
           WHERE n.ID = pvt.ID
           FOR XML PATH(''''), TYPE
       ).value(''.'', ''NVARCHAR(MAX)'')) AS NonCommon
FROM #ResultTable AS r
PIVOT (
    MAX(Item) FOR Item IN (' + @ColumnNames + N')
) AS pvt
GROUP BY pvt.ID
ORDER BY pvt.ID;';

    EXEC (@sql);
END

DROP TABLE #t;
DROP TABLE #SplitedStringsGrouped;
DROP TABLE #SplitedStrings;
DROP TABLE #ResultTable;
DROP TABLE #NonCommon;
+0

謝謝伊萬。你救了我 – mvskumar 2012-08-09 05:48:44

+0

不客氣,我很高興它能用。我希望有一個更簡單的解決方案,但這是我能想到的唯一解決方案。您可以爲 '@MinimumNumberOfOccurances' 設置不同的值,來定義一個單詞的出現次數必須超過多少才被視爲常見。 – 2012-08-09 05:54:05