2011-12-07 116 views
2

我試圖用JavaScript客戶端來實現模糊搜索,以搜索包含在SQL數據庫中的記錄的大型數據庫(大約300個項目)。我的約束是,無法對數據庫執行實時查詢 - 我必須在夜間批處理作業中生成「索引」作爲平面文件。所以,從一個看起來像這樣的數據庫(db)表開始(標題:使用SQL查詢生成n-gram表):

ID. NAME 
1.  The Rain Man 
2.  The Electric Slide 
3.  Transformers 

我需要用一個單一的查詢創建類似下面這樣的結果:

Trigram ID 
the  1 
the  2 
he_  1 
he_  2 
e_r  1 
_ra  1 
rai  1 
ain  1 
in_  1 
n_m  1 
_ma  1 
man  1 
e_e  2 
_el  2 
ele  2 
lec  2 

等等,如有錯別字請見諒。這裏的規則是:'n'是第一列中字符串的長度,只有a-z和_是有效字符,任何其他字符被歸一化爲小寫字母,或映射到_,表示由n-gram條款可能適用於表格。因此,我希望獲得一張表格,以便我能夠快速查找特定的n-gram,並獲取包含該序列的所有行的列表。我不是一個足夠聰明的SQL cookie來解決這個問題。你可以嗎?

+0

什麼版本的SQL?我認爲這會更好地在SQL之外實現,但是 – Sparky

+0

我不知道哪個版本。我所知道的是,我有一個在asp應用程序中使用SQL的字段。您可能會不寒而慄 – Breton

+0

您打算在ASP中編寫BATCH流程嗎?如果不知道SQL的版本,就很難有SQL查詢生成一個平面文件... – Sparky

回答

0

你必須重複執行這樣一條語句:

-- Inserts, for every row of db_table, the :N-character gram that starts at
-- 1-based position :X of the normalized name, together with the row's ID.
-- Run once per :X, from 1 up to (longest name length) + 1 - :N.
-- Normalization shown here: lower-case the name and map blanks to '_';
-- extend the translate() character lists for any other characters that
-- must become '_'.
insert into trigram_table (Trigram, ID)
select substr(translate(lower(Name), ' ', '_'), :X, :N),
     ID
    from db_table
    -- Skip names too short to hold a full :N-gram starting at :X; without
    -- this guard substr() yields truncated or empty/NULL grams (the exact
    -- result depends on the SQL dialect).
    where length(Name) >= :X + :N - 1

對所有 :X 重複執行,:X 從 1 到 len(Name) + 1 - :N

您還必須擴展 translate 函數,以涵蓋所有其他需要轉換爲下劃線的特殊字符。現在它只是將空白轉換爲下劃線。

爲了提高性能,您可以在最後一次傳遞中對 trigram_table 的 Trigram 列統一執行 translate 和 lower 函數,這樣就不必爲每個 :X 都執行這些函數。

3

我創建了一個非常好用的T-SQL NGrams;請注意關於如何使用示例的註釋部分

CREATE FUNCTION dbo.nGrams8K
(
    @string VARCHAR(8000),
    @n TINYINT,
    @pad BIT
)
/*
Created by: Alan Burstein
Created on: 3/10/2014
Updated on: 5/20/2014 changed the logic to use an "inline tally table"
      9/10/2014 Added some more code examples in the comment section
      9/30/2014 Added more code examples
      10/27/2014 Small bug fix regarding padding

Use: Outputs a stream of tokens based on an input string.
     Works just like mdq.nGrams; see http://msdn.microsoft.com/en-us/library/ff487027(v=sql.105).aspx.

n-gram defined:
    In the fields of computational linguistics and probability,
    an n-gram is a contiguous sequence of n items from a given
    sequence of text or speech. The items can be phonemes, syllables,
    letters, words or base pairs according to the application.

    To better understand N-Grams see: http://en.wikipedia.org/wiki/N-gram

Parameters:
    @string - text to split into tokens
    @n      - token length in characters
    @pad    - 1: surround @string with (@n-1) spaces on each side before
                 tokenizing; 0: tokenize @string as-is

Returns (one row per start position):
    [sequence] - 1-based start position within the (optionally padded) string
    token      - SUBSTRING of up to @n characters starting at [sequence]

Notes:
    * LEN(@string)+@n-1 rows are returned regardless of @pad. With @pad = 0
      and @n > 1 the last (@n-1) tokens are therefore shorter than @n (the
      final ones may be empty); filter them out if only full n-grams are wanted.
    * LEN() does not count trailing spaces, so trailing blanks in @string do
      not contribute to the row count.
*/
RETURNS TABLE
WITH SCHEMABINDING
AS

RETURN
    WITH
    -- E1 = 10 rows, E2 = 100 rows; E2 x E2 below yields up to 10,000 rows,
    -- enough for the 8000-character input plus the largest TINYINT @n.
    E1(n) AS (SELECT 1 FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(n)),
    E2(n) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b),
    -- inline tally table: the integers 1 .. LEN(@string)+@n
    iTally(n) AS
    (
     SELECT TOP (LEN(@string)+@n) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
     FROM E2 a CROSS JOIN E2 b
    ),
    -- the input string, wrapped in (@n-1) spaces on both sides when @pad = 1
    NewString(NewString) AS
    (
     SELECT REPLICATE(CASE @pad WHEN 0 THEN '' ELSE ' ' END,@n-1)+@string+
       REPLICATE(CASE @pad WHEN 0 THEN '' ELSE ' ' END,@n-1)
    )
    SELECT TOP ((@n)+LEN(@string))
      n AS [sequence],
      SUBSTRING(NewString,n,@n) AS token
    FROM iTally
    CROSS APPLY NewString
    WHERE n < ((@n)+LEN(@string));

/* 
------------------------------------------------------------ 
-- (1) Basic Use 
------------------------------------------------------------- 

;-- (A)basic "string to table": 
SELECT [sequence], token 
FROM dbo.nGrams8K('abcdefg',1,1); 

-- (b) create "bi-grams" (pad bit off) 
SELECT [sequence], token 
FROM dbo.nGrams8K('abcdefg',2,0); 

-- (c) create "tri-grams" (pad bit on) 
SELECT [sequence], token 
FROM dbo.nGrams8K('abcdefg',3,1); 

-- (d) filter for only "tri-grams" 
SELECT [sequence], token 
FROM dbo.nGrams8K('abcdefg',3,1) 
WHERE len(ltrim(token)) = 3; 

-- note the query plan for each. The power is coming from an index 
-- also note how many rows are produced: len(@string)+(@n-1)
-- lastly, you can trim as needed when padding=1 

------------------------------------------------------------ 
-- (2) With a variable 
------------------------------------------------------------ 

-- note, in this example I am getting only the stuff that has three letters 
DECLARE @string varchar(20) = 'abcdefg', 
     @tokenLen tinyint = 3; 

SELECT [sequence], token
FROM dbo.nGrams8K(@string,@tokenLen,1)
WHERE len(ltrim(token)) = @tokenLen;
GO 

------------------------------------------------------------ 
-- (3) An on-the-fly alphabet (this will come in handy in a moment) 
------------------------------------------------------------ 
DECLARE @alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ'; 

SELECT [sequence], token 
FROM dbo.nGrams8K(@alphabet,1,0); 
GO 

------------------------------------------------------------ 
-- (4) Character Count 
------------------------------------------------------------ 
DECLARE @string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.', 

@alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ'; 

SELECT a.token, COUNT(b.token) ttl 
FROM dbo.nGrams8K(@alphabet,1,0) a 
LEFT JOIN dbo.nGrams8K(@string,1,0) b ON a.token=b.token 
GROUP BY a.token 
ORDER BY a.token; 
GO 

------------------------------------------------------------ 
-- (5) Locate the start position of a search pattern 
------------------------------------------------------------ 
;-- (A) note these queries: 
    DECLARE @string varchar(100)='THE QUICK Green FOX JUMPED OVER THE LAZY DOGS BACK'; 
    -- (i) 
     SELECT * FROM dbo.nGrams8K(@string,1,0) a; 
    -- (ii) note this query: 
     SELECT * FROM dbo.nGrams8K(@string,1,0) a WHERE [token]=' '; 

-- (B) and now the word count (@string included for presentation) 
    SELECT @string AS string, 
      count(*)+1 AS words 
    FROM dbo.nGrams8K(@string,1,0) a 
    WHERE [token]=' ' 
    GO 

------------------------------------------------------------ 
-- (6) search for the number of occurrences of a word
------------------------------------------------------------ 
DECLARE @string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.', 
     @alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ', 
     @searchString VARCHAR(100)='The'; 

-- (6a) by location
SELECT sequence-(LEN(@searchstring)) AS location, 
     token AS searchString 
FROM dbo.nGrams8K(@string,LEN(@searchstring+' ')+1,0) b 
WHERE token=@searchString;

-- (6b) get total
SELECT @string AS string, 
     @searchString AS searchString, 
     COUNT(*) AS ttl 
FROM dbo.nGrams8K(@string,LEN(@searchstring+' ')+1,0) b 
WHERE token=@searchString;

------------------------------------------------------------ 
-- (7) Special SubstringBefore and SubstringAfter 
------------------------------------------------------------ 

-- (7a) SubstringBeforeSSI (note: SSI = substringIndex) 
    ALTER FUNCTION dbo.SubstringBeforeSSI 
    (
     @string varchar(1000), 
     @substring varchar(100), 
     @substring_index tinyint 
    ) 
    RETURNS TABLE 
    WITH SCHEMABINDING 
    AS 
    RETURN 
     WITH get_pos AS 
     (
      SELECT rn = row_number() over (order by sequence), substring_index = sequence 
      FROM dbo.nGrams8K(@string,len(@substring),1) 
      WHERE token=@substring
     ) 
     SELECT newstring = substring(@string,1,substring_index-len(@substring)) 
     FROM get_pos 
     WHERE rn=@substring_index;
    GO 

    DECLARE @string varchar(1000)='10.0.1600.22', 
      @searchPattern varchar(100)='.', 
      @substring_index tinyint = 3; 

    SELECT * FROM dbo.SubstringBeforeSSI(@string,@searchPattern,@substring_index); 
    GO 

-- (7b) SubstringAfterSSI (note: SSI = substringIndex)
    ALTER FUNCTION dbo.SubstringAfterSSI 
    (
     @string varchar(1000), 
     @substring varchar(100), 
     @substring_index tinyint 
    ) 
    RETURNS TABLE 
    WITH SCHEMABINDING 
    AS 
    RETURN 
     WITH get_pos AS 
     (
      SELECT rn = row_number() over (order by sequence), substring_index = sequence 
      FROM dbo.nGrams8K(@string,len(@substring),1) 
      WHERE token=@substring
     ) 
     SELECT newstring = substring(@string,substring_index+1,8000) 
     FROM get_pos 
     WHERE rn=@substring_index;
    GO 

    DECLARE @string varchar(1000)='<notes id="1">blah, blah, blah</notes><notes id="2">More Notes</notes>', 
    @searchPattern varchar(100)='</notes>', 
    @substring_index tinyint = 1; 

    SELECT @string, * 
    FROM dbo.SubstringAfterSSI(@string,@searchPattern,@substring_index); 

------------------------------------------------------------ 
-- (8) Strip non-numeric characters from a string 
------------------------------------------------------------ 

-- (8a) create the function 
ALTER FUNCTION StripNonNumeric_itvf(@OriginalText VARCHAR(8000)) 
RETURNS TABLE 
--WITH SCHEMABINDING 
AS 
return 
    WITH ngrams AS 
    (
     SELECT n = [sequence], c = token    
     FROM dbo.nGrams8K(@OriginalText,1,1) 
    ), 
    clean_txt(CleanedText) AS 
    (
     SELECT c+'' 
     FROM ngrams 
     WHERE ascii(substring(@OriginalText,n,1)) BETWEEN 48 AND 57 
     FOR XML PATH('') 
    ) 
    SELECT CleanedText 
    FROM clean_txt; 
GO 

-- (8b) use against a value or variable 
SELECT CleanedText 
FROM dbo.StripNonNumeric_itvf('value123'); 

-- (8c) use against a table 

-- test harness: 
IF OBJECT_ID('tempdb..#strings') IS NOT NULL DROP TABLE #strings; 

WITH strings AS 
(
    SELECT TOP (100000) string = newid() 
    FROM sys.all_columns a CROSS JOIN sys.all_columns b 
) 
SELECT * 
INTO #strings 
FROM strings; 
GO 

-- query (returns 100K rows every 3 seconds on my pc): 
SELECT CleanedText 
FROM #strings 
CROSS APPLY dbo.StripNonNumeric_itvf(string); 

------------------------------------------------------------ 
-- (9) A couple complex String Algorithms 
------------------------------------------------------------ 

-- (9a) hamming distance between two strings: 
DECLARE @string1 varchar(8000) = 'xxxxyyyzzz', 
     @string2 varchar(8000) = 'xxxxyyzzzz'; 


    SELECT string1 = @string1, 
      string2 = @string2, 
      hamming_distance = count(*) 
    FROM dbo.nGrams8K(@string1,1,0) s1 
    CROSS APPLY dbo.nGrams8K(@string2,1,0) s2 
    WHERE s1.sequence = s2.sequence 
    AND s1.token <> s2.token 
GO 

-- (9b) inner join between 2 strings 
    --(can be used to speed up other string metrics such as the longest common subsequence) 
DECLARE @string1 varchar(100)='xxxx123yyyy456zzzz', 
     @string2 varchar(100)='xx789yy000zz'; 

WITH 
    s1(string1) AS 
    ( 
     SELECT [token]+'' 
     FROM dbo.nGrams8K(@string1,1,0) 
     WHERE charindex([token],@string2)<>0 
     ORDER BY [sequence] 
     FOR XML PATH('') 
    ), 
    s2(string2) AS 
    ( 
     SELECT [token]+'' 
     FROM dbo.nGrams8K(@string2,1,0) 
     WHERE charindex([token],@string1)<>0 
     ORDER BY [sequence] 
     FOR XML PATH('') 
    ) 
    SELECT string1, string2 
    FROM s1 
    CROSS APPLY s2; 

------------------------------------------------------------ 
-- (10) Advanced Substring Metrics 
------------------------------------------------------------ 

-- (10a) Identify common substrings and their location 

DECLARE @string1 varchar(100) = 'xxx yyy zzz', 
     @string2 varchar(100) = 'xx yyy zz'; 

-- (i) review the two strings 
SELECT str1 = @string1, 
     str2 = @string2; 

-- (ii) the results 
WITH 
iTally AS 
(
    SELECT n 
    FROM dbo.tally t 
    WHERE n<= len(@string1) 
), 
distinct_tokens AS 
(
    SELECT ng1 = ng1.token, ng2 = ng2.token --= ltrim(ng1.token), ng2 = ltrim(ng2.token) 
    FROM itally 
    CROSS APPLY dbo.nGrams8K(@string1,n,1) ng1 
    CROSS APPLY dbo.nGrams8K(@string2,n,1) ng2 
    WHERE ng1.token=ng2.token 
) 
SELECT ss_txt = ng1, 
     ss_len = len(ng1), 
     str1_loc = charindex(ng1,@string1), 
     str2_loc = charindex(ng2,@string2) 
FROM distinct_tokens 
WHERE ng1<>'' AND charindex(ng1,@string1)+charindex(ng2,@string2)<>0 
GROUP BY ng1, ng2 
ORDER BY charindex(ng1,@string1), charindex(ng2,@string2), len(ng1); 

-- (10b) Longest common substring function 

-- (i) function 
    IF EXISTS 
    ( SELECT * FROM INFORMATION_SCHEMA.ROUTINES 
     WHERE ROUTINE_SCHEMA='dbo' AND ROUTINE_NAME = 'lcss') 
    DROP FUNCTION dbo.lcss; 
    GO 

    CREATE FUNCTION dbo.lcss(@string1 varchar(100), @string2 varchar(100)) 
    RETURNS TABLE 
    AS 
    RETURN 
     SELECT TOP (1) with ties token 
     FROM dbo.tally 
     CROSS APPLY dbo.nGrams8K(@string1,n,1) 
     WHERE n <= len(@string1) 
     AND charindex(token, @string2) > 0 
     ORDER BY len(token) DESC; 
    GO 

-- (ii) example of use 
    DECLARE @string1 varchar(100) = '000xxxyyyzzz', 
      @string2 varchar(100) = '999xxyyyzaa'; 

    SELECT string1 = @string1, 
      string2 = @string2, 
      token 
    FROM dbo.lcss(@string1, @string2); 
*/ 
GO