2011-11-09 64 views
4

Gperf在我的環境中始終不如Judy數組,我想知道是否還有專門爲整數鍵構建的另一個完美哈希庫。我事先知道這組鍵,並且我想將其用於性能/尺寸優勢。查找已知的整數密鑰集

有〜1000個鍵,並且檢索不是按順序排列的。密鑰對都是整數。密鑰是32位,檢索值是8位。大小是最重要的因素。

如果有一種方法來調整整數鍵的Gperf,或者只是一般的另一種方法,我也都是耳朵。 :)

(旁註:...在輸入這個問題時,我意識到二進制搜索可能需要的蛋糕,我只是想到了這個問題,我仍然想聽聽你可能有的想法爲了學習,儘管!)

編輯:密鑰不均勻分佈。大多數是隨機聚集在整個可能的範圍內。

編輯2:最糟糕的二進制搜索對我來說太慢了,所以我最終玩弄了這些密鑰,直到我找到8位來使用它們來創建256個分佈良好的存儲桶。我保存了每個存儲桶的最大值(每個存儲桶入口24位),併爲密鑰對創建了一個大的結構數組。相比我測試的所有其他情況,我的測試速度更快,體積更小,所以我想我現在正在爲此付出努力。 :)

+0

你可以使用Lua作爲哈希表庫。但是對於1000個條目,您可能會更好地搜索排序的數組,如您所述。 – lhf

+0

你是否需要插入/刪除,還是靜態查找表? – wildplasser

+0

@wildplasser靜態表。 :) – user962158

回答

3

保持您的密鑰排序,並使用M樹來檢索任何密鑰。

M樹的每個節點有M個入口,而不是2個用於二進制。 這對性能有很大的幫助。 使用高速緩存行大小作爲節點大小的基礎,因此使用64字節。 您可以在此大小中存儲16個32位值。

由於您有1000個值,因此3個級別將足以檢索正確的密鑰(而不是二進制樹的10個級別)。

另一個想法是將你的密鑰散列成一個小的散列表,比如12位的一個(4K條目),並用一個簡單的鏈來解決潛在的衝突。您很可能在一次搜索中獲得大部分密鑰。

+0

接受這一點。感謝您的好解釋。我會修改這個想法。 :) – user962158

1

你試過judy arrays?可能正是你所需要的。

+0

不幸的是,朱迪陣列是我試圖用靜態桌子擊敗!我不需要插入/刪除,所以我想擺脫所有需要的開銷。 :( – user962158

2
/* 
** Proof of concept for constructing a {fixed-size,lookup-only} hashtable 
** needing only (2*N* sizeof(int)) additional storage for storing N num=value pairs. 
** The key is supposed to be an int, 
** the 'value' is a char. 
** Note: some people would like to include <stdint.h> and replace all the ints by {uint32_t,uint16_t,uint8_t}. 
** 
** (c)opyright Wildplasser, 2011-11-12 
** href = http://stackoverflow.com/questions/8059591/lookups-on-known-set-of-integer-keys 
*/ 

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <assert.h> 

struct tinyhash { 
    unsigned key; 
    unsigned short link; 
    unsigned char value; 
    unsigned is_linked :1; 
    }; 
#define LINK_DEAD ((unsigned short)-1) 

    /* A hashtable with N slots for N entries (100% full) */ 
#define THE_SIZE 1000 
struct tinyhash table[THE_SIZE] ; 

void tiny_fill(void); 
void tiny_init(void); 
int tiny_find(unsigned key); 

void tiny_print(void); 
void tiny_count(void); 

static int tiny_cmp(const void *l, const void *r); 
static unsigned short * tiny_hnd(unsigned key); 
static unsigned tiny_hash(unsigned key); 

int main (void) 
{ 

assert(sizeof table == 2 * THE_SIZE * sizeof(int)); 

fprintf (stderr, "Size=%u\n", (unsigned int) sizeof table); 

tiny_fill(); 
tiny_init(); 
tiny_print(); 
tiny_count(); 

return 0; 
} 
    /* Perform a table lookup. 
    ** Return the "value" that belongs to "key".. 
    ** If not found -1 is returned. 
    */ 
int tiny_find(unsigned key) 
{ 
unsigned short *ptr; 
ptr = tiny_hnd(key); 
return *ptr==LINK_DEAD ? -1 : table[*ptr].value ; 
} 

    /* Find the place where key is located, or 
    ** (if not found) where it should be appendend. 
    ** The returned value is a pointer to the parent's .link field. 
    */ 
static unsigned short * tiny_hnd(unsigned key) 
{ 
unsigned hash; 
unsigned short slot, *ptr; 

hash = tiny_hash(key); 
slot = hash % THE_SIZE; 
for (ptr = &table[slot].link; *ptr != LINK_DEAD; ptr = &table[*ptr].link) { 
    if (!table[*ptr].is_linked) break; 
    if (table[*ptr].key == key) break; 
    } 
return ptr; 
} 

    /* For testing: fill hashtable with garbage */ 
void tiny_fill(void) 
{ 
unsigned idx; 
for (idx=0; idx < THE_SIZE; idx++) { 
    table[idx].key = rand() + 543 * rand(); 
    table[idx].value = rand() ; 
     table[idx].link = LINK_DEAD; 
     table[idx].is_linked = 0; 
    } 
} 
    /* Build hashtable, that is: 
    ** shuffle the entries and build linked list. 
    */ 
void tiny_init(void) 
{ 
unsigned idx; 

    /* Phase 0: set all to unused. 
    ** The link field is temporaly abused to store the intended 
    ** slotnumber. 
    */ 
for (idx=0; idx < THE_SIZE; idx++) { 
     table[idx].link = tiny_hash(table[idx].key) % THE_SIZE; 
     table[idx].is_linked = 0; 
    } 

    /* Phase 0a: sort on (intended) slotnumber. */ 
qsort (table, THE_SIZE, sizeof table[0] , tiny_cmp); 

    /* Phase 1: put enties at their intended slotnumber 
    ** but only if the entry that lives there does not belong there 
    ** (is uninitialized). 
    */ 
for (idx=0; idx < THE_SIZE; idx++) { 
    unsigned slot; 
     /* [idx] has allready been placed */ 
    if (table[idx].is_linked) continue; 
    slot = table[idx].link; 
     /* [idx] belongs here. freeze it */ 
    if (slot==idx) { 
     table[idx].link = LINK_DEAD; 
     table[idx].is_linked = 1; 
     } 
     /* try to swap [idx] with the item at its intended slot */ 
    else { 
     struct tinyhash tmp; 
      /* item[slot] belongs there: keep off */ 
     if (table[slot].is_linked) continue; 
     tmp = table[idx]; 
     table[idx] = table[slot]; 
     table[slot] = tmp; 
     table[slot].is_linked = 1; 
     table[slot].link = LINK_DEAD; 
      /* Don't bump idx: [idx] and [slot] have been swapped, 
      ** we need to inspect the new item[idx] at the next cycle. 
      */ 
     idx--; /* idx will be re-incremented in the loop; */ 
     } 
    } 

    /* Phase 2: link any remaining unplaced item to its 
    ** parent in the LL-chain. 
    */ 
for (idx=0; idx < THE_SIZE; idx++) { 
    unsigned short *parent; 
    if (table[idx].is_linked) continue; 
    parent = tiny_hnd(table[idx].key); 
    if (*parent != LINK_DEAD) continue; /* duplicate */ 
    *parent = idx; 
    table[idx].is_linked = 1; 
    table[idx].link = LINK_DEAD; 
    } 
} 
    /* Compare function for qsort() 
    */ 
static int tiny_cmp(const void *vl, const void *vr) 
{ 
struct tinyhash *l = (struct tinyhash *)vl; 
struct tinyhash *r = (struct tinyhash *)vr; 

#if 0 
unsigned slot_l, slot_r; 
slot_l = tiny_hash(l->key) %THE_SIZE; 
slot_r = tiny_hash(r->key) %THE_SIZE; 
if (slot_l < slot_r) return -3; 
if (slot_l > slot_r) return 3; 
#else 
if (l->link < r->link) return -3; 
if (l->link > r->link) return 3; 
#endif 

if (l->key < r->key) return -2; 
if (l->key > r->key) return 2; 

if (l < r) return -1; 
if (l > r) return 1; 

return 0; 
} 

    /* Stupid hashfunction, to be replaced with something usefull.. 
    ** (works good for random ints) Use at your own risk. 
    */ 
static unsigned tiny_hash(unsigned key) 
{ 
return key * 98765; 
} 

void tiny_print(void) 
{ 
unsigned idx; 

for (idx=0; idx < THE_SIZE; idx++) { 
    unsigned slot; 
    int dir; 
    slot = tiny_hash(table[idx].key) % THE_SIZE; 
    dir = (slot==idx) ? '=' : (slot>idx) ? '<': '>'; 
    fprintf(stderr, "[%4x] %c %4x: %4x %c %10u->%3u\n" 
    , idx, dir, 0xffff & slot 
    , 0xffff & table[idx].link 
    , table[idx].is_linked ? '*' : '.' 
    , table[idx].key,table[idx].value 
    ); 
    } 
} 
    /* For testing: print the hashchains, construct a histogram of chainlengths, 
    ** and calculate the "total cost of retrieval". 
    */ 
void tiny_count(void) 
{ 
unsigned idx, cnt, len, tothops, slot; 
unsigned histogram[THE_SIZE]; 

memset(histogram, 0, sizeof histogram); 

cnt=tothops=0; 
for (slot =0; slot < THE_SIZE; slot++) { 
    idx = tiny_hash(table[slot].key) % THE_SIZE; 
    if (slot!=idx) continue; /* this is not the start of a chain */ 
    for (len=0 ; idx != LINK_DEAD; idx = table[idx].link) { 
     if (!table[idx].is_linked) continue; 
     if (len==0) fprintf(stderr, "[%u]:", slot); 
     fprintf(stderr, " %u", table[idx].key); 
     len++; 
     } 
    fprintf(stderr, "(=%u)\n", len); 
    cnt += len; 
    histogram[len] += 1; 
    tothops += (((len+1) *len)/2); 
    } 

fprintf(stderr, "Histogram of chainlengths:\n"); 
for (len=0; len < THE_SIZE; len++) { 
    if (!histogram[len]) continue; 
    fprintf(stderr, "[%u]: %u\n", len, histogram[len]); 
    } 
fprintf(stderr, "tothops=%u/%u (%f hops per node)\n" 
    , tothops, cnt, (double)tothops/cnt); 
} 

結果:(好:它的一些

.... 
[978]: 1794172570(=1) 
[980]: 642121828(=1) 
[983]: 2674104091(=1) 
[985]: 547527125(=1) 
[986]: 2383911594(=1) 
[988]: 4254837348(=1) 
[989]: 1860851825 1990214465 1766290905(=3) 
[990]: 3793608270 469685686(=2) 
[992]: 1189958296 872917240(=2) 
[994]: 1999781290 1501026482(=2) 
[995]: 520334159 211600319(=2) 
[997]: 177583921(=1) 
[998]: 1173163646 1013572158(=2) 
[999]: 1705614211 3460318251(=2) 
Histogram of chainlengths: 
[1]: 369 
[2]: 190 
[3]: 57 
[4]: 15 
[5]: 4 
tothops=1491/1000 (1.491000 hops per node) 

注:因爲排序的同時初始化哈希表,條目非常接近到他們預期的地方。這增加了參考的地點。