我想玩C數據結構(哈希表)。我沒有使用任何預先構建的哈希表庫(如STL),因爲我想更好地瞭解它的工作原理。 所以我在這裏創建一個哈希表,其中包含元素列表,其中每個元素包含一個鍵和一個字符串元素數據(chars數組),以及字符串元素數據的長度。有效實現Hashtable,具有緩存感知的局部性屬性(局部敏感哈希表)
我的實現能夠正常工作,但是在與我的一位同事討論之後,我被告知我的實現效率不高,特別是在上下文中:我的實現無法識別緩存,導致我的哈希表查找無效。我並不是很瞭解他的解釋。
所以我想知道,緩存感知的局部性實現實際上意味着什麼?
我怎樣才能使我的哈希表實現變得更有效率通過使用該緩存感知locality屬性時進行查找?有沒有更好的方法來構建這個結構,以及組織元素的更好方法(查找)?
這是我迄今所做的:
HASHTBL.h
struct hashelt {
struct hashelt* he_next; /* One way linked list pointer */
char* he_key; /* Pointer to element's key */
char* he_data; /* Pointer to element's data */
int he_data_len; /* Length of element's data */
};
typedef struct hashelt hashelt;
struct hashtbl {
hashelt** ht_links; /* Array of pointers to elemsnts */
int ht_prime; /* Length of hash table - it is a prime number */
};
typedef struct hashtbl hashtbl;
int prime_check(int num);
hashtbl* hashtbl_init(int tbl_size);
int hashtbl_key_convert(hashtbl* htable, char* hkey);
hashelt* hashtbl_lookup(hashtbl* htable, char* hkey, char* hdata);
int hashtbl_add(hashtbl* htable, char* hkey, char* hdata);
void hashtbl_print(hashtbl* htable);
HASHTBL.c
/*
* prime_check
* Check if num is a prime or not.
* num Number to use as an upper bound.
*
* The functions returns 0 for a prime and 1 otherwise.
*/
int prime_check(int num) {
/* Local declarations */
int i; /* Loop counter */
/* Search through the first sqrt(num) integers */
for(i = 2; i*i < num; i++)
if ((num % i) == 0)
return 1;
/* It is a prime */
return 0;
}
/*
* hashtbl_init
* Create and initialize a hash table.
* The input variable are
* tbl_size Suggested size of the table, which should be a prime or 0.
* The functions returns a pointer to a hashtbl or NULL for failure.
* The hash table is the size of the largest prime found for the suggested
* table size of the default size if 0 is given as a suggestion.
*/
hashtbl* hashtbl_init(int tbl_size) {
/* Local declarations */
hashtbl* hp; /* Hash table pointer */
int i; /* Loop counter */
/* Initial values */
hp = NULL;
/* Determine the actual table size */
if (tbl_size <= 0)
tbl_size = DEFAULT_HASHTBL_LENGTH;
else if (prime_check(tbl_size))
return NULL;
if (tbl_size <= 0)
return NULL;
/* Allocate the hash table */
if((hp = (hashtbl*)malloc(sizeof(hashtbl))) == NULL)
return hp;
/* Allocate the linked list vector */
if((hp->ht_links = (hashelt**)malloc(tbl_size * sizeof(hashelt*))) == NULL) {
free(hp);
return NULL;
}
/* Fill in the hash table with no entries */
for(i = 0 ; i < tbl_size; i++)
hp->ht_links[i] = NULL;
/* Fill in the hash table size */
hp->ht_prime = tbl_size;
/* All done */
return hp;
}
/*
* hashtbl_key_convert
* Make an index into the key chain in the hash table from a key:
* kindex = sum of the characters in hkey modulo ht_prime
* The input variable are
* htable A pointer to a hash table.
* hkey A pointer to a a key.
* The functions returns an index into the key chain.
*/
int hashtbl_key_convert(hashtbl* htable, char* hkey) {
/* Local declarations */
int i; /* Loop counter */
int klen; /* Length of the key */
int kindex; /* Key index to return */
/* Compute the key index */
klen = strlen(hkey);
for(kindex = 0, i = 0 ; i < klen; i++)
kindex += hkey[i];
kindex = kindex % htable->ht_prime;
/* All done */
return kindex;
}
/*
* hashtbl_lookup
* Lookup a (key,data) in a hash table.
* The input variable are
* htable A pointer to a hash table.
* hkey A key.
* hdata A pointer to the data.
* The functions returns 0 for found and 1 for not found.
*/
hashelt* hashtbl_lookup(hashtbl* htable, char* hkey, char* hdata) {
/* Local declarations */
hashelt* hep; /* Hash element pointer */
int i; /* Loop counter */
int key; /* Hash table key index */
/* Get key index from hkey */
key = hashtbl_key_convert(htable, hkey);
/* Search through the hash table, including collicions */
for(hep = htable->ht_links[key] ; hep != NULL ; hep = hep->he_next)
if (strcmp(hep->he_key, hkey) == 0)
return hep;
/* Not found */
return NULL;
}
/*
* hashtbl_add
* Add a (key,data) to a hash table.
* The input variable are
* htable A pointer to a hash table.
* hkey A key.
* hdata A pointer to the data.
* The functions returns 0 for success and 1-4 for failure.
*
*/
int hashtbl_add(hashtbl* htable, char* hkey, char* hdata) {
/* Local declarations */
hashelt* hep; /* Element in linked list */
hashelt* newelt; /* New element in hash table */
int i; /* Loop counter */
int dlen; /* Length of data */
int key; /* Hash table key index */
/* Get key index from hkey */
key = hashtbl_key_convert(htable, hkey);
/* Check if the (key,data) is already in the hash table */
if ((hep = hashtbl_lookup(htable, hkey, hdata)) != NULL)
if (strcmp(hep->he_data, hdata) == 0)
return 1;
/* Add the data */
dlen = strlen(hdata);
hep = htable->ht_links[key];
if ((newelt = (hashelt*)malloc(sizeof(hashelt))) == NULL) {
fprintf(stderr, "hashtbl_add: Cannot allocate new hash element\n");
return 3;
}
newelt->he_next = hep;
newelt->he_data_len = dlen;
newelt->he_key = (char*)malloc((strlen(hkey)+1) * sizeof(char));
newelt->he_data = (char*)malloc((dlen+1) * sizeof(char));
if (newelt->he_key == NULL || newelt->he_data == NULL) {
fprintf(stderr, "hashtbl_add: Cannot allocate new hash element contents\n");
return 4;
}
strcpy(newelt->he_key, hkey);
strcpy(newelt->he_data, hdata);
htable->ht_links[key] = newelt;
/* All done */
return 0;
}
/*
* hashtbl_print
* Print an entire hash table.
* The input variable are
* htable A pointer to a hash table.
* The functions returns 0 for success and 1-4 for failure.
*/
void hashtbl_print(hashtbl* htable) {
/* Local declarations */
hashelt* hep; /* Element in linked list */
int i; /* Loop counter */
int l; /* Link count */
/* Initial printing */
printf("\nHash table contents\n");
/* Print a has tbale out, one key bucket at a time */
for(i = 0 ; i < htable->ht_prime ; i++) {
printf("\nkey index %i\n", i);
hep = htable->ht_links[i];
if (hep == NULL)
printf(" No entries in the linked list\n");
else {
l = 0;
while(hep != NULL) {
printf(" Element %d\n", l);
printf(" key: %s\n", hep->he_key);
printf(" data: %s\n", hep->he_data);
printf(" data_len: %d\n", hep->he_data_len);
hep = hep->he_next;
}
}
}
/* All done */
}
主文件
void make_string(char* buffer, int blen) {
/* Local declarations */
char viable_chars[] = { "abcdefghijklmnopqrstuvwxyz-_ABCDEFGHIJKLMNOPQRSTUVWXYZ" };
char* bp; /* Pointer into buffer */
int i; /* Loop counter */
int c; /* Index into viable_chars */
static int vc_len = 0; /* Length of viable_chars */
/* Do once only */
if (vc_len == 0)
vc_len = strlen(viable_chars);
/* Do the fill operation on a subset of buffer */
blen = rand() % blen;
if (blen == 0) blen++;
for(bp = buffer, i = 0; i < blen ; i++) {
c = rand() % vc_len;
*bp++ = viable_chars[c];
}
*bp++ = '\0';
}
int main(int argc, char** argv) {
/* Local declarations */
hashtbl* htable; /* Hash table pointer */
char* kstrings; /* Pointer to key vector */
char* dstrings; /* Pointer to data vector */
int tblsize = 0; /* Hash table size */
int nkeys = 0; /* Number of keys */
int dlen = 0; /* Maximum data length for (keys,data) */
int i; /* Loop counter */
double t1; /* Time keeper */
double t2; /* Time keeper */
double mrun(); /* Get timing information */
#ifdef GOOD_SEED
/* Get a good random number seed */
struct timeval t1; /* holder for time of day (seconds, microseconds) */
gettimeofday(&t1, NULL);
srand(t1.tv_usec * t1.tv_sec);
#endif
/* Get hash table size */
printf("Hash table size (pick a prime or bound for one): ");
scanf("%d", &tblsize);
fflush(stdin);
/* Initialize hash table */
if((htable = hashtbl_init(tblsize)) == NULL) {
fprintf(stderr, "Oops... hashtbl_init returned NULL\n");
return 1;
}
printf("Actual size of hash table is %d\n", htable->ht_prime);
/* Now fill it up */
while (nkeys <= 0) {
printf("How many keys do you want? ");
scanf("%d", &nkeys);
fflush(stdin);
}
while (dlen <= 0) {
printf("What is the maximum character string length? ");
scanf("%d", &dlen);
fflush(stdin);
}
/* Allocate vectors for (key,data) */
kstrings = (char*)malloc((dlen+1) * sizeof(char));
dstrings = (char*)malloc((dlen+1) * sizeof(char));
if (kstrings == NULL || dstrings == NULL) {
fprintf(stderr, "main: Could not allocate string vectors for (key,data)\n");
return 2;
}
/* Now fill it up */
for(i = 0 ; i < nkeys ; i++) {
make_string(kstrings, dlen);
make_string(dstrings, dlen);
hashtbl_add(htable, kstrings, dstrings);
}
/* Print it out */
hashtbl_print(htable);
/* All done */
return 0;
}
謝謝,如果我使用數組而不是鏈接列表會更好嗎?讓我們說一個數組來存儲散列表元素的鍵(也可能是該元素中的字符串元素的長度)。然後我可以使用這些信息找到正確的元素數據。除了使用鏈表之外,我還可以使用另一個數組來存儲元素數據。這會產生更好的表現嗎?這種方法的優缺點是什麼?這只是一個想法,突然出現在我的腦海裏。有沒有其他的方式來加快散列表查找,可能有更好的散列表結構? –
不,同一散列bin中多個條目的鏈接列表沒有問題,這在行業中是相當標準的,因爲預計同一個散列bin中的多個條目很少。 –
我添加了信息,'hashelt * elements'最好保存在結構中的最後一項,否則下面的字段將根據你的elemCount具有地址。順便說一句,您可以通過將它們視爲結構體'pHashTbl-> elements [0]'中的普通數組來訪問這些元素。 –