2014-10-12 47 views
0

問題:我給定大小爲n的字符串s和我有類型
[L,R]
的許多查詢和每個查詢我必須退回s中等於s [L..R]並且在s中的索引L之前開始的子字符串的數量。計數一個子串的特殊ocuurences在字符串

約束:
Ñ< = 2 * 10^6
查詢< = 10^5

一種強力方法是使用後綴數組,其中我們遍歷LCP陣列找到答案每個查詢。

我認爲O(q log n)方法對這個問題很好。請建議任何?

預先感謝..

回答

1

我建議構建一個排序後綴數組並用二進制搜索來查詢它。另一種方法是建立查詢字符串並通過字符串進行單次傳遞以累積計數:

  • 閱讀所有查詢。查詢具有左右兩個索引以及一個計數。
  • 爲所有查詢構建一個特里結構。指向查詢的指針用作每個查詢的結束標記。
  • 遍歷一次字符串。對於每個char,遍歷trie並在具有指向查詢的指針的節點上累積計數。

這種方法似乎更快。它無法處理重複查詢。當事先知道所有查詢時,它也是有意義的。 (不幸的是,我的示例實現表明,我對對方的回答代碼中有一個嚴重的錯誤,可能是在遠程二進制搜索,它誤算一些查詢。)

編輯:我已經添加了我的例子實現用C - 不在C++中,對不起 - 在下面。

這個實現是浪費的,因爲它分配了除'\0'以外的所有可能字符的子節點,其中絕大多數將是NULL。更好的實現可以將查詢的字符映射到更緊密的字母表。

必須在讀取整個數組之後,在特殊循環中構建trie,以便重新分配不會使參考指針無效到該數組中。

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 

#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \ 
    putc(10, stderr), 1)) 



typedef struct Query Query; 
typedef struct Trie Trie; 
typedef struct Trienode Trienode; 

struct Query { 
    char *p;    /* starting point in buffer */ 
    size_t len;    /* length of query in chars */ 
    size_t count;   /* number of occurrences */ 
}; 

struct Trie { 
    Trienode *head; 
}; 

struct Trienode { 
    Query *query;   /* query reference */ 
    Trienode *next[255]; /* child nodes */ 
}; 

/* 
*  Read whole file to buffer and return size in n. 
*/ 
char *slurp(const char *fn, size_t *n) 
{ 
    FILE *f = fopen(fn, "r"); 
    size_t size; 
    char *buf; 

    if (f == NULL) die("Couldn't open %s", fn); 

    fseek(f, 0, SEEK_END); 
    size = ftell(f); 
    fseek(f, 0, SEEK_SET); 

    buf = malloc(size + 1); 

    if (buf == NULL) die("Allocation failed"); 

    fread(buf, 1, size, f); 
    buf[size] = '\0'; 
    fclose(f); 

    if (n) *n = size; 
    return buf; 
} 

/* 
*  Insert query string and reference into trie. 
*/ 
void trie_insert(Trie *trie, Query *q) 
{ 
    char *str = q->p; 
    Trienode **n = &trie->head; 
    size_t i; 

    for (i = 0; i < q->len; i++) {  
     if (*n == NULL) { 
      *n = malloc(sizeof(**n)); 
      if (*n == NULL) die("Coudn't allocate node"); 
      memset(*n, 0, sizeof(**n)); 
     } 

     n = &(*n)->next[(unsigned char) str[i] - 1]; 
    } 

    if (*n == NULL) { 
     *n = malloc(sizeof(**n)); 
     if (*n == NULL) die("Coudn't allocate node"); 
     memset(*n, 0, sizeof(**n)); 
    } 

    (*n)->query = q; 
} 

static void trie_delete_node(Trienode *n) 
{ 
    size_t i; 

    for (i = 0; i < 255; i++) { 
     if (n->next[i]) trie_delete_node(n->next[i]); 
    } 

    free(n); 
} 

/* 
*  Destroy trie and its nodes. 
*/ 
void trie_delete(Trie *trie) 
{ 
    if (trie->head) trie_delete_node(trie->head); 
} 

/* 
*  Find occurrences of all queries. The count member of all 
*  queries must be 0 before calling this routine. 
*/ 
void search(char *buf, Trie *trie) 
{ 
    while (*buf) { 
     Trienode *n = trie->head; 
     char *p = buf; 

     while (n && *p) { 
      if (n->query) { 
       Query *q = n->query; 

       if (buf < q->p) q->count++; 
      } 
      n = n->next[(unsigned char) *p - 1]; 
      p++; 
     } 

     buf++; 
    } 
} 

/* 
*  Example client code 
*/ 
int main(int argc, char **argv) 
{ 
    Query *query = NULL; 
    size_t nquery = 0; 
    size_t squery = 0; 

    char *buf; 
    size_t nbuf; 

    Trie trie = {NULL}; 
    FILE *f; 
    size_t i; 

    if (argc != 3) die("Usage: %s string queries", *argv); 

    // Read string buffer from file 
    buf = slurp(argv[1], &nbuf); 

    // Read query array 
    f = fopen(argv[2], "r"); 
    if (f == NULL) die("Can't open %s.", argv[1]); 
    for (;;) { 
     char str[80]; 
     size_t p, len; 

     if (fgets(str, sizeof(str), f) == NULL) break; 
     if (sscanf(str, "%zu %zu", &p, &len) < 2) continue; 

     if (nquery >= squery) { 
      squery *= 2; 
      if (squery == 0) squery = 0x400; 
      query = realloc(query, squery * sizeof(*query)); 
      if (query == NULL) die("Reallocation failed."); 
     } 

     query[nquery].p = buf + p; 
     query[nquery].len = len; 
     query[nquery].count = 0; 
     nquery++; 
    } 
    fclose(f); 

    // Build tree from queries 
    for (i = 0; i < nquery; i++) { 
     Query *q = query + i; 
     trie_insert(&trie, q); 
    } 

    // Assign the query counts 
    search(buf, &trie); 

    // Print results 
    for (i = 0; i < nquery; i++) { 
     Query *q = query + i; 
     printf("%8zu %.*s\n", q->count, (int) q->len, q->p); 
    } 

    // Clean up  
    trie_delete(&trie); 
    free(buf); 
    free(query); 

    return 0; 
} 
+0

是的,所有的查詢事先知道 – v78 2014-10-12 15:24:47

+0

然後,這種方法應該爲你工作。我在C中添加了示例代碼(對不起,我對C++不太好)。 – 2014-10-12 17:11:47

1

一個可能的解決方案可能是類似於solution that I provided for fast string search with many queries to the same string。該解決方案在C中有一個示例實現。

  • 在您的字符串的每個字符中創建一個指針數組。把它分類。
  • 使用二分查找查找排序的auffix數組中的查詢的第一個匹配項。
  • 查找查詢的最後一次出現。您可以通過增加最後一個字符來實現,以便查找「abd」而不是「abc」來查找第一個不匹配。
  • 計算在L之前開始的兩場比賽之間的所有比賽。

該解決方案,但是,是不是O(q日誌ñ),因爲排序已經是O(ñ日誌ñ)和q查詢查找起坐O( q log n)。

我已經重新編寫了我爲您的問題鏈接的示例。它是C,而不是C++,因爲使用了malloc,所以它不會用C++編譯器編譯。用慣用的C++重寫它不應該太難。

該解決方案需要大量的後綴數組內存。它可以在一分多鐘內處理大約130,000個查詢和一個1.3兆字節的文件。

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 

#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \ 
    putc(10, stderr), 1)) 



typedef struct Haystack Haystack;  

struct Haystack { 
    size_t size; /* Number of chars in string */ 
    char *buf;  /* Null-terminated char buffer */ 
    char **ptr;  /* Pointers into char buffer */ 
}; 

/* 
*  Count occurrence of c in zero-terminated string p. 
*/ 
size_t strcount(const char *p, int c) 
{ 
    size_t n = 0; 

    for (;;) { 
     p = strchr(p, c); 
     if (p == NULL) return n; 
     p++; 
     n++;   
    } 

    return 0; 
} 

/* 
*  String comparison via pointers to strings. 
*/ 
int pstrcmp(const void *a, const void *b) 
{ 
    const char *const *aa = a; 
    const char *const *bb = b; 

    return strcmp(*aa, *bb); 
} 

/* 
*  Create and prepare a hayst, i.e. a text file to search. 
*/ 
Haystack *hayst_new(const char *fn) 
{ 
    Haystack *hayst; 
    FILE *f = fopen(fn, "r"); 
    char *p; 
    char **pp; 

    if (f == NULL) die("Couldn't open %s", fn); 

    hayst = malloc(sizeof(*hayst)); 
    if (hayst == NULL) die("Allocation failed"); 

    fseek(f, 0, SEEK_END); 
    hayst->size = ftell(f); 
    fseek(f, 0, SEEK_SET); 

    hayst->buf = malloc(hayst->size + 1); 
    hayst->ptr = malloc(hayst->size * sizeof(*hayst->ptr)); 

    if (hayst->buf == NULL) die("Allocation failed"); 
    if (hayst->ptr == NULL) die("Allocation failed"); 

    fread(hayst->buf, 1, hayst->size, f); 
    hayst->buf[hayst->size] = '\0'; 
    fclose(f); 

    p = hayst->buf; 
    pp = hayst->ptr; 
    while (*p) *pp++ = p++; 

    qsort(hayst->ptr, hayst->size, sizeof(*hayst->ptr), pstrcmp); 

    return hayst; 
} 

/* 
*  Clean up hayst. 
*/ 
void hayst_delete(Haystack *hayst) 
{ 
    free(hayst->buf); 
    free(hayst->ptr); 
    free(hayst); 
} 

/* 
*  Binary range search for string pointers. 
*/ 
static char **pstr_bsearch(const char *key, size_t len, 
    char **arr, size_t high) 
{ 
    size_t low = 0; 

    while (low < high) { 
     size_t mid = (low + high)/2; 
     int diff = strncmp(key, arr[mid], len); 

     if (diff <= 0) high = mid; 
     else low = mid + 1; 
    } 

    return arr + low; 
} 

/* 
*  Count occurrences of the string key in the haystack. 
*/ 
size_t hayst_find(Haystack *hayst, size_t offset, size_t len) 
{ 
    char *key = hayst->buf + offset; 
    char **begin, **end; 
    size_t n = 0; 

    if (offset + len > hayst->size) return 0; 

    begin = pstr_bsearch(key, len, hayst->ptr, hayst->size); 
    if (begin == NULL) return 0; 

    key[len - 1]++; 
    end = pstr_bsearch(key, len, hayst->ptr, hayst->size); 
    key[len - 1]--; 
    if (end == NULL) return 0; 
    if (end == begin) return 0; 

    while (begin < end) { 
     if (*begin < key) n++; 
     begin++; 
    } 

    return n; 
} 

/* 
*  Example client code 
*/ 
int main(int argc, char **argv) 
{ 
    Haystack *hayst; 
    FILE *f; 

    if (argc != 3) die("Usage: %s string queries", *argv); 

    hayst = hayst_new(argv[1]); 

    f = fopen(argv[2], "r"); 
    if (f == NULL) die("Can't open %s.", argv[1]); 
    for (;;) { 
     char str[80]; 
     size_t p, len; 
     size_t n; 

     if (fgets(str, sizeof(str), f) == NULL) break; 
     if (sscanf(str, "%zu %zu", &p, &len) < 2) continue; 
     n = hayst_find(hayst, p, len); 
     printf("%8zu %.*s\n", n, (int) len, hayst->buf + p); 
    } 
    fclose(f); 

    hayst_delete(hayst); 

    return 0; 
}