Hashtable的高效实现,具有缓存感知的位置属性(位置敏感的哈希表)

时间:2011-07-16 13:05:12

标签: c hashtable locality-sensitive-hash

我正在尝试使用C数据结构(哈希表)。我没有使用任何预先构建的哈希表库(如STL),因为我想更好地理解它是如何工作的。 所以在这里我创建一个哈希表,其中包含元素列表,其中每个元素包含一个键和一个字符串元素数据(字符数组),以及字符串元素数据的长度。

我的实现工作正常,但在与我的一位同事讨论之后,我被告知我的实现效率不高,特别是在上下文中:我的实现不支持缓存,导致我的哈希表查找效率低下。我真的不明白他的解释。

所以我想知道,缓存感知的位置实现实际意味着什么?

如何在执行查找时使用该缓存感知的locality属性,从而提高哈希表的实现效率?有没有更好的方法为此构建结构,以及组织元素的更好方法(进行查找)?

这是我到目前为止所做的事情:

HASHTBL.h

struct hashelt {
    struct hashelt*     he_next;        /* One way linked list pointer      */
    char*               he_key;         /* Pointer to element's key         */
    char*               he_data;        /* Pointer to element's data        */
    int                 he_data_len;    /* Length of element's data         */
};

typedef struct hashelt      hashelt;

struct hashtbl {
    hashelt**           ht_links;       /* Array of pointers to elemsnts                */
    int                 ht_prime;       /* Length of hash table - it is a prime number  */

};

typedef struct hashtbl      hashtbl;

int prime_check( int num );

hashtbl* hashtbl_init( int tbl_size );

int hashtbl_key_convert( hashtbl* htable, char* hkey );

hashelt* hashtbl_lookup( hashtbl* htable, char* hkey, char* hdata );

int hashtbl_add( hashtbl* htable, char* hkey, char* hdata );

void hashtbl_print( hashtbl* htable );

HASHTBL.c

/*
 *                               prime_check
 *  Check if num is a prime or not.
 *      num         Number to use as an upper bound.
 *
 *  The functions returns 0 for a prime and 1 otherwise.
 */

int prime_check( int num ) {

    /* Local declarations */
    int         i;              /* Loop counter */

    /* Search through the first sqrt(num) integers */
    for( i = 2; i*i < num; i++ )
        if ( (num % i) == 0 )
            return 1;

    /* It is a prime */
    return 0;
}

/*
 *                               hashtbl_init
 *  Create and initialize a hash table.
 *  The input variable are
 *      tbl_size    Suggested size of the table, which should be a prime or 0.
 *  The functions returns a pointer to a hashtbl or NULL for failure.
 *  The hash table is the size of the largest prime found for the suggested
 *  table size of the default size if 0 is given as a suggestion.
 */

hashtbl* hashtbl_init( int tbl_size ) {

    /* Local declarations */
    hashtbl*    hp;             /* Hash table pointer */
    int         i;              /* Loop counter */

    /* Initial values */
    hp = NULL;

    /* Determine the actual table size */
    if ( tbl_size <= 0 )
        tbl_size = DEFAULT_HASHTBL_LENGTH;
    else if ( prime_check( tbl_size ) )
        return NULL;
    if ( tbl_size <= 0 )
        return NULL;

    /* Allocate the hash table */
    if( ( hp = (hashtbl*)malloc( sizeof(hashtbl) ) ) == NULL )
        return hp;

    /* Allocate the linked list vector */
    if( ( hp->ht_links = (hashelt**)malloc( tbl_size * sizeof(hashelt*) ) ) == NULL ) {
        free( hp );
        return NULL;
    }

    /* Fill in the hash table with no entries */
    for( i = 0 ; i < tbl_size; i++ )
        hp->ht_links[i] = NULL;

    /* Fill in the hash table size */
    hp->ht_prime = tbl_size;

    /* All done */
    return hp;
    }


/*
 *                           hashtbl_key_convert
 *  Make an index into the key chain in the hash table from a key:
 *      kindex = sum of the characters in hkey modulo ht_prime
 *  The input variable are
 *      htable      A pointer to a hash table.
 *      hkey        A pointer to a a key.
 *  The functions returns an index into the key chain.
 */

int hashtbl_key_convert( hashtbl* htable, char* hkey ) {

    /* Local declarations */
    int         i;              /* Loop counter */
    int         klen;           /* Length of the key */
    int         kindex;         /* Key index to return */

    /* Compute the key index */
    klen = strlen( hkey );
    for( kindex = 0, i = 0 ; i < klen; i++ )
        kindex += hkey[i];
    kindex = kindex % htable->ht_prime;

    /* All done */
    return kindex;
    }


/*
 *                              hashtbl_lookup
 *  Lookup a (key,data) in a hash table.
 *  The input variable are
 *      htable      A pointer to a hash table.
 *      hkey        A key.
 *      hdata       A pointer to the data.
 *  The functions returns 0 for found and 1 for not found.
 */

hashelt* hashtbl_lookup( hashtbl* htable, char* hkey, char* hdata ) {

    /* Local declarations */
    hashelt*    hep;            /* Hash element pointer */
    int         i;              /* Loop counter */
    int         key;            /* Hash table key index */

    /* Get key index from hkey */
    key = hashtbl_key_convert( htable, hkey );

    /* Search through the hash table, including collicions */
    for( hep = htable->ht_links[key] ; hep != NULL ; hep = hep->he_next )
        if ( strcmp( hep->he_key, hkey ) == 0 )
            return hep;

    /* Not found */
    return NULL;
}

/*
 *                               hashtbl_add
 *  Add a (key,data) to a hash table.
 *  The input variable are
 *      htable      A pointer to a hash table.
 *      hkey        A key.
 *      hdata       A pointer to the data.
 *  The functions returns 0 for success and 1-4 for failure.
 *
 */

int hashtbl_add( hashtbl* htable, char* hkey, char* hdata ) {

    /* Local declarations */
    hashelt*    hep;            /* Element in linked list */
    hashelt*    newelt;         /* New element in hash table */
    int         i;              /* Loop counter */
    int         dlen;           /* Length of data */
    int         key;            /* Hash table key index */

    /* Get key index from hkey */
    key = hashtbl_key_convert( htable, hkey );

    /* Check if the (key,data) is already in the hash table */
    if ( ( hep = hashtbl_lookup( htable, hkey, hdata ) ) != NULL )
        if ( strcmp( hep->he_data, hdata ) == 0 )
            return 1;

    /* Add the data */
    dlen = strlen( hdata );
    hep = htable->ht_links[key];
    if ( ( newelt = (hashelt*)malloc( sizeof( hashelt ) ) ) == NULL ) {
        fprintf( stderr, "hashtbl_add: Cannot allocate new hash element\n" );
        return 3;
        }
    newelt->he_next = hep;
    newelt->he_data_len = dlen;
    newelt->he_key = (char*)malloc( (strlen(hkey)+1) * sizeof(char) );
    newelt->he_data = (char*)malloc( (dlen+1) * sizeof(char) );
    if ( newelt->he_key == NULL || newelt->he_data == NULL ) {
        fprintf( stderr, "hashtbl_add: Cannot allocate new hash element contents\n" );
        return 4;
        }
    strcpy( newelt->he_key, hkey );
    strcpy( newelt->he_data, hdata );
    htable->ht_links[key] = newelt;

    /* All done */
    return 0;
    }

/*
 *                              hashtbl_print
 *  Print an entire hash table.
 *  The input variable are
 *      htable      A pointer to a hash table.
 *  The functions returns 0 for success and 1-4 for failure.
 */

void hashtbl_print( hashtbl* htable ) {

    /* Local declarations */
    hashelt*    hep;            /* Element in linked list */
    int         i;              /* Loop counter */
    int         l;              /* Link count */

    /* Initial printing */
    printf( "\nHash table contents\n" );

    /* Print a has tbale out, one key bucket at a time */
    for( i = 0 ; i < htable->ht_prime ; i++ ) {
        printf( "\nkey index %i\n", i );
        hep = htable->ht_links[i];
        if ( hep == NULL )
            printf( "    No entries in the linked list\n" );
        else {
            l = 0;
            while( hep != NULL ) {
                printf( "    Element %d\n", l );
                printf( "        key:       %s\n", hep->he_key );
                printf( "        data:      %s\n", hep->he_data );
                printf( "        data_len:  %d\n", hep->he_data_len );
                hep = hep->he_next;
                }
            }
        }

    /* All done */
}

主要文件

void make_string( char* buffer, int blen ) {

    /* Local declarations */
    char        viable_chars[] = { "abcdefghijklmnopqrstuvwxyz-0123456789_ABCDEFGHIJKLMNOPQRSTUVWXYZ" };
    char*       bp;             /* Pointer into buffer */
    int         i;              /* Loop counter */
    int         c;              /* Index into viable_chars */
    static int  vc_len = 0;     /* Length of viable_chars */

    /* Do once only */
    if ( vc_len == 0 )
        vc_len = strlen( viable_chars );

    /* Do the fill operation on a subset of buffer */
    blen = rand() % blen;
    if ( blen == 0 ) blen++;
    for( bp = buffer, i = 0; i < blen ; i++ ) {
        c = rand() % vc_len;
        *bp++ = viable_chars[c];
        }
    *bp++ = '\0';

    }

int main( int argc, char** argv ) {

    /* Local declarations */
    hashtbl*    htable;         /* Hash table pointer */
    char*       kstrings;       /* Pointer to key vector */
    char*       dstrings;       /* Pointer to data vector */

    int         tblsize = 0;    /* Hash table size */
    int         nkeys = 0;      /* Number of keys */
    int         dlen = 0;       /* Maximum data length for (keys,data) */
    int         i;              /* Loop counter */

    double      t1;             /* Time keeper */
    double      t2;             /* Time keeper */
    double      mrun();         /* Get timing information */

#ifdef GOOD_SEED
    /* Get a good random number seed */
    struct timeval t1;  /* holder for time of day (seconds, microseconds) */
    gettimeofday( &t1, NULL );
    srand( t1.tv_usec * t1.tv_sec );
#endif

    /* Get hash table size */
    printf( "Hash table size (pick a prime or bound for one): " );
    scanf( "%d", &tblsize );
    fflush( stdin );

    /* Initialize hash table */
    if( ( htable = hashtbl_init( tblsize ) ) == NULL ) {
        fprintf( stderr, "Oops... hashtbl_init returned NULL\n" );
        return 1;
        }
    printf( "Actual size of hash table is %d\n", htable->ht_prime );

    /* Now fill it up */
    while ( nkeys <= 0 ) {
        printf( "How many keys do you want? " );
        scanf( "%d", &nkeys );
        fflush( stdin );
        }
    while ( dlen <= 0 ) {
        printf( "What is the maximum character string length? " );
        scanf( "%d", &dlen );
        fflush( stdin );
        }

    /* Allocate vectors for (key,data) */
    kstrings = (char*)malloc( (dlen+1) * sizeof( char ) );
    dstrings = (char*)malloc( (dlen+1) * sizeof( char ) );
    if ( kstrings == NULL || dstrings == NULL ) {
        fprintf( stderr, "main: Could not allocate string vectors for (key,data)\n" );
        return 2;
        }

    /* Now fill it up */
    for( i = 0 ; i < nkeys ; i++ ) {
        make_string( kstrings, dlen );
        make_string( dstrings, dlen );
        hashtbl_add( htable, kstrings, dstrings );
        }

    /* Print it out */
    hashtbl_print( htable );

    /* All done */
    return 0;
   }

1 个答案:

答案 0 :(得分:1)

如果你改变了

,那么地方会更好
struct hashtbl {
    hashelt**           ht_links;       /* Array of pointers to elemsnts    
    ...

是:

struct hashtbl {
    ...
    hashelt*   elements;       /* Array of elements (as last entry of struct)

考虑一下hash-table的样子,在你的版本中,表中包含指向元素的指针,在我的修改中,表实际上包含了一个接一个地打包的结构!当然有一个不足之处:一个空的哈希bin不仅包含一个空指针(就像你的版本一样),而是一个整体sizeof(hashelt)。此外,您必须确保分配大小为sizeof(hashtbl)的hashtbl-struct,因为您的hashtbl 包含 hashelt:您必须分配ht_prime*sizeof(hashelt)+sizeof(int)(第一个加号)是你所包含的ht_prime hashelt,第二个summand用于存储ht_prime本身。)