2013-08-20 117 views
2

我正在編譯一個啓用了 UTF-8 標誌的 PCRE 模式,並試圖用它匹配一個 UTF-8 編碼的 char* 字符串,但匹配失敗,pcre_exec 返回負值。我傳給 pcre_exec 的主題長度是 65,也就是字符串中的字符數。我認爲它期望的是字節數,所以嘗試把該參數增加到 70,但結果仍然相同。我不知道還有什麼原因會導致匹配失敗。在我抓狂之前請幫幫忙。 PCRE不匹配utf8字符

(不過,如果不使用 PCRE_UTF8 標誌,它可以匹配,但偏移向量 [1] 的值是 30,也就是輸入字符串中緊接在 Unicode 字符之前的那個字符的索引。)

#include "stdafx.h" 
#include "pcre.h" 
#include <pcre.h>    /* PCRE lib  NONE */ 
#include <stdio.h>    /* I/O lib   C89 */ 
#include <stdlib.h>    /* Standard Lib C89 */ 
#include <string.h>    /* Strings   C89 */ 
#include <iostream> 

int main(int argc, char *argv[]) 
{ 
    pcre *reCompiled; 

    int pcreExecRet; 
    int subStrVec[30]; 
    const char *pcreErrorStr; 
    int pcreErrorOffset; 
    char* aStrRegex = "(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)(" 
            // params can be an empty pair of parenthesis or have parameters inside them as well. 
            "\\(\\s*(?<params>[?\\w,]+)\\s*\\)" 
            // paramList along with its parenthesis is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters. 
            ")?"; 
    reCompiled = pcre_compile(aStrRegex, 0, &pcreErrorStr, &pcreErrorOffset, NULL); 
    if(reCompiled == NULL) { 
     printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr); 
     exit(1); 
    } 

    char* line = "?rt?=call SqlTxFunctionTesting(?înFîéld?,?outField?,?inOutField?)"; 
    pcreExecRet = pcre_exec(reCompiled, 
          NULL, 
          line, 
          65, // length of string 
          0,      // Start looking at this point 
          0,      // OPTIONS 
          subStrVec, 
          30);     // Length of subStrVec 

    printf("\nret=%d",pcreExecRet); 

    //int substrLen = pcre_get_substring(line, subStrVec, pcreExecRet, 1, &mantissa); 

} 

回答

1

1)

// Print the numeric value of the literal's first char, then the literal itself.
const char *sample = "î";
printf("%d, %s", sample[0], sample);

輸出:
63?

2)您必須使用PCRE_BUILD_PCRE16(或32)和PCRE_SUPPORT_UTF重建PCRE。並使用pcre16.lib和/或pcre16.dll。那麼你可以試試這個代碼:

// Example for a PCRE build that includes the 16-bit library (pcre16_* API).
// NOTE(review): the casts to PCRE_SPTR16 assume wchar_t is a 16-bit type on
// the target platform -- confirm before reusing this elsewhere.
pcre16 *reCompiled;
int pcreExecRet;
int subStrVec[30];              // offset vector; size is a multiple of 3
const char *pcreErrorStr;
int pcreErrorOffset;
const wchar_t *aStrRegex = L"(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
        // params can be an empty pair of parentheses or have parameters inside them as well.
        L"\\(\\s*(?<params>[?,\\w\\p{L}]+)\\s*\\)"
        // paramList along with its parentheses is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
        L")?";

// PCRE_UTF16 is the 16-bit library's name for the UTF option.
reCompiled = pcre16_compile((PCRE_SPTR16)aStrRegex, PCRE_UTF16, &pcreErrorStr, &pcreErrorOffset, NULL);
if (reCompiled == NULL) {
    // The pattern is a wide string, so it needs %ls; the PCRE message is narrow.
    printf("ERROR: Could not compile '%ls': %s\n", aStrRegex, pcreErrorStr);
    exit(1);
}

const wchar_t *line = L"?rt?=call SqlTxFunctionTesting( ?inField?,?outField?,?inOutField?,?fd? )";

pcreExecRet = pcre16_exec(reCompiled,
                          NULL,                    // no study data
                          (PCRE_SPTR16)line,
                          (int)wcslen(line),       // subject length in 16-bit units
                          0,                       // start offset
                          0,                       // options
                          subStrVec,
                          (int)(sizeof subStrVec / sizeof subStrVec[0]));

printf("\nret=%d",pcreExecRet);

// A negative or zero return value yields no iterations here.
for (int i = 0; i < pcreExecRet; i++) {
    // pcre16_get_substring() allocates the result buffer itself, so no
    // buffer is pre-allocated here (doing so would only leak it).
    PCRE_SPTR16 captured = NULL;
    int substrLen = pcre16_get_substring((PCRE_SPTR16)line, subStrVec, pcreExecRet, i, &captured);
    if (substrLen < 0) {
        // Negative values are PCRE error codes; nothing was allocated.
        continue;
    }
    wprintf(L"\nret string=%ls, length=%i\n", (const wchar_t *)captured, substrLen);
    pcre16_free_substring(captured);
}

// Release the compiled pattern obtained from pcre16_compile().
pcre16_free(reCompiled);

3)\w = [0-9A-Z_a-z]。它不包含 Unicode 字符。
4)這真的可以幫助:http://answers.oreilly.com/topic/215-how-to-use-unicode-code-points-properties-blocks-and-scripts-in-regular-expressions/
5)從PCRE 8.33源(pcre_exec.c:2251)

/* Find out if the previous and current characters are "word" characters. 
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 
be "non-word" characters. Remember the earliest consulted character for 
partial matching. */