The following code is trying to count word frequency in a document, by using hashset
and vector
.
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/** ==================================== VECTOR ======================================= */
typedef enum {
true, false
} bool;
typedef int (*VectorCompareFunction)(const void *elemAddr1, const void *elemAddr2);
typedef void (*VectorFreeFunction)(void *elemAddr);
typedef struct {
int elemSize; //how many byte for each element
int elemNum; //number of current element in vector
int capacity; //maximum number of element vector can hold
void *elems; //pointer to data memory
VectorFreeFunction freefn; //pointer to the function used to free each element
} vector;
/**
* Reallocate a new memory of twice of original size
* return 1 if reallocation success, otherwise return -1.
*/
static void DoubleMemory(vector *v) {
void *tmp = realloc(v->elems, v->capacity * v->elemSize * 2);
assert(tmp != NULL);
v->elems = tmp;
v->capacity *= 2;
}
/**
* Constructor
*/
void VectorNew(vector *v, int elemSize, VectorFreeFunction freefn, int initialAllocation) {
v->elems = malloc(initialAllocation * elemSize);
assert(v->elems != NULL);
v->elemSize = elemSize;
v->elemNum = 0;
v->capacity = initialAllocation;
v->freefn = freefn;
}
/**
* Appends a new element to the end of the specified vector.
*/
void VectorAppend(vector *v, const void *elemAddr) {
/* double size if neccessary */
if (v->elemNum == v->capacity) DoubleMemory(v);
memcpy((char *)v->elems + v->elemNum * v->elemSize, elemAddr, v->elemSize);
v->elemNum++;
}
/**
* Search the specified vector for an element whose contents match the element passed as the key.
*/
int VectorSearch(const vector *v, const void *key, VectorCompareFunction searchfn, int startIndex, bool isSorted) {
assert(key && searchfn);
if (v->elemNum == 0) return -1;
assert(startIndex >= 0 && startIndex < v->elemNum);
if (isSorted == true) {
/* binary search */
void *startAddr = (char *)v->elems + startIndex * v->elemSize;
int size = v->elemNum - startIndex;
void *resAddr = bsearch(key, startAddr, size, v->elemSize, searchfn);
return (resAddr != NULL)? ((char *)resAddr - (char *)v->elems) / v->elemSize : -1;
} else {
/* linear search */
for (int i = 0; i < v->elemNum; i++) {
if (searchfn((char *)v->elems + i * v->elemSize, key) == 0) {
return i;
}
}
return -1;
}
}
/**
* Overwrites the element at the specified position.
*/
void VectorReplace(vector *v, const void *elemAddr, int position) {
assert(position >= 0 && position < v->elemNum);
void *posAddr = (char *)v->elems + position * v->elemSize;
/* free the memory of old element first */
if (v->freefn != NULL) v->freefn(posAddr);
memcpy(posAddr, elemAddr, v->elemSize);
}
/** ==================================== HASHSET ======================================= */
typedef int (*HashSetHashFunction)(const void *elemAddr, int numBuckets);
typedef int (*HashSetCompareFunction)(const void *elemAddr1, const void *elemAddr2);
typedef void (*HashSetFreeFunction)(void *elemAddr);
typedef struct {
int elemNum; //current element number
int bucketNum; //number of hash bucket
int elemSize; //how many byte each element has
vector *buckets; //array of vector
HashSetHashFunction hashfn;
HashSetCompareFunction compfn;
HashSetFreeFunction freefn;
} hashset;
void HashSetNew(hashset *h, int elemSize, int numBuckets,
HashSetHashFunction hashfn, HashSetCompareFunction comparefn, HashSetFreeFunction freefn) {
assert(elemSize > 0 && numBuckets > 0 && hashfn != NULL && comparefn != NULL);
h->buckets = (vector *)malloc(numBuckets * sizeof(vector));
assert(h->buckets != NULL);
for (int i = 0; i < numBuckets; i++) {
vector *bucket = (vector *)((char *)h->buckets + i * sizeof(vector));
VectorNew(bucket, elemSize, freefn, 4);
}
h->bucketNum = numBuckets;
h->elemSize = elemSize;
h->elemNum = 0;
h->hashfn = hashfn;
h->compfn = comparefn;
h->freefn = freefn;
}
void HashSetEnter(hashset *h, const void *elemAddr) {
int hash = h->hashfn(elemAddr, h->bucketNum);
vector *bucket = (vector *)((char *)h->buckets + hash * sizeof(vector));
// search in the hash set first
int pos = VectorSearch(bucket, elemAddr, h->compfn, 0, false);
if (pos != -1) {
// replace the old one if find a match
VectorReplace(bucket, elemAddr, pos);
} else {
// otherwise insert the new one
VectorAppend(bucket, elemAddr);
h->elemNum++;
}
}
/** ==================================== DOC_FREQ & WORD_INDEX ======================================= */
/****************************************************************
*
* doc_freq is a key-value pair of [documentid, frequency]
* It's not supposed to be exposed to user or search engine.
* -----------------------------------------------------------
* It looks like:
* [1611742826915764000] [4 ]
* |-------------------| |-------|
* docid freq
***************************************************************/
typedef struct {
long docid;
int freq;
} doc_freq;
static void new_docfreq(doc_freq *df, long docid, int freq) {
df->docid = docid;
df->freq = freq;
}
/**
* HashSetHashFunction<doc_freq>
*/
static int hash_docfreq(const void *elemAddr, int numBuckets) {
doc_freq *df = (doc_freq *)elemAddr;
return (int)(df->docid % numBuckets);
}
/**
* HashSetCompareFunction<doc_freq>
*/
static int comp_docfreq(const void *elemAddr1, const void *elemAddr2) {
long id1 = ((doc_freq *)elemAddr1)->docid;
long id2 = ((doc_freq *)elemAddr2)->docid;
if (id1 < id2) {
return -1;
} else if (id1 > id2) {
return 1;
} else { // id1 == id2
return 0;
}
}
/**
* word_index is a index of a single word.
* ---------------------------------------
* A typical word_index looks like:
* [apple]: [doc1, 5], [doc3, 10], [doc5, 7]
* |-----| |------------------------------|
* word freqs
*/
typedef struct {
char *word;
hashset *freqs; // hashset<doc_freq>
} word_index;
static const size_t kWordIndexHashSetBuckets = 64;
static void new_wordindex(word_index *wi, const char *word) {
hashset h;
HashSetNew(&h, sizeof(doc_freq), kWordIndexHashSetBuckets, hash_docfreq, comp_docfreq, NULL);
wi->freqs = &h;
size_t wordlen = strlen(word);
wi->word = (char *)malloc(wordlen + 1); // +1 for null-termination
strcpy(wi->word, word);
(wi->word)[wordlen] = '\0';
}
/**
* Mainly used to build a word_index.
*/
void add_docfreq(word_index *wi, const long docid, const int frequency) {
doc_freq df;
new_docfreq(&df, docid, frequency);
HashSetEnter(wi->freqs, &df);
}
/** ==================================== UNIT-TEST ======================================= */
int main(void) {
/* apple: [1611742826915764000, 5][1611742826915538000, 10] */
word_index *apple = (word_index *)malloc(sizeof(word_index));
new_wordindex(apple, "apple");
add_docfreq(apple, 1611742826915764000L, 5);
add_docfreq(apple, 1611742826915538000L, 10);
}
It gave me a segmentation fault
:
[1] 84309 segmentation fault testindexer
lldb find the problem occured when hashset
try to callback the given pointer of function hashfn
. I don't quite understand what is EXC_BAD_ACCESS (code=EXC_I386_GPFLT)
here. I have done several unit test on hashset before, the HashSetEnter()
function worked well with hashfn
. Another unit test was conducted on hash_docfreq()
function, it can also calculate correctly the hash number. I'm a little bit confused. Anyone can help? Thanks!
Process 89962 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=EXC_I386_GPFLT)
frame #0: 0x0000000100003b83 testnothing`HashSetEnter(h=0x00007ffeefbff620, elemAddr=0x00007ffeefbff638) at test_nothing.c:130:13
127 }
128
129 void HashSetEnter(hashset *h, const void *elemAddr) {
-> 130 int hash = h->hashfn(elemAddr, h->bucketNum);
131 vector *bucket = (vector *)((char *)h->buckets + hash * sizeof(vector));
132 // search in the hash set first
133 int pos = VectorSearch(bucket, elemAddr, h->compfn, 0, false);
Target 0: (testnothing) stopped.
(lldb) bt
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=EXC_I386_GPFLT)
* frame #0: 0x0000000100003b83 testnothing`HashSetEnter(h=0x00007ffeefbff620, elemAddr=0x00007ffeefbff638) at test_nothing.c:130:13
frame #1: 0x0000000100003c37 testnothing`add_docfreq(wi=0x0000000100306060, docid=1611742826915764000, frequency=5) at test_nothing.c:222:2
frame #2: 0x0000000100003cae testnothing`main at test_nothing.c:235:2
frame #3: 0x00007fff70df0cc9 libdyld.dylib`start + 1
(lldb)
Running under gdb
, after the fault, doing a tb
command to get a stack traceback, we see:
#0 0x00000005004016e6 in ?? ()
#1 0x000000000040163a in HashSetEnter (h=0x7fffffffdc10,
elemAddr=0x7fffffffdc40) at orig.c:150
#2 0x0000000000401834 in add_docfreq (wi=0x405260, docid=1611742826915764000,
frequency=5) at orig.c:266
#3 0x0000000000401879 in main () at orig.c:278
(gdb) frame 1
#1 0x000000000040163a in HashSetEnter (h=0x7fffffffdc10,
elemAddr=0x7fffffffdc40) at orig.c:150
150 int hash = h->hashfn(elemAddr, h->bucketNum);
You are segfaulting in HashSetEnter
, at the line:
int hash = h->hashfn(elemAddr, h->bucketNum);
This is because h
is not valid at this point.
Examinining the source, the place that sets the value that is ultimately invalid, it is set in new_wordindex
.
In new_wordindex
, you are saving [and returning] the address of h
.
h
is a function scoped variable here, so it is no longer valid after the function returns.
You have to use malloc
for this. And, later, you need to be able to free
this pointer during cleanup.
Here's the refactored code for the incorrect function.
Note that to show old/original code vs. new/corrected code, I'm using preprocessor conditionals:
#if 0
// old/original code
// NOTE: this is _not_ compiled in
#else
// new/corrected code
// NOTE: this _is_ compiled in
#endif
The code under #if 0
can be elided/removed, leaving just the #else
code.
static void
new_wordindex(word_index * wi, const char *word)
{
// NOTE/BUG: h is function scoped -- this can _not_ be saved and returned
// because it ceases to be valid when we return
#if 0
hashset h;
HashSetNew(&h, sizeof(doc_freq), kWordIndexHashSetBuckets, hash_docfreq, comp_docfreq, NULL);
wi->freqs = &h;
#else
hashset *h = malloc(sizeof(*h));
HashSetNew(h, sizeof(doc_freq), kWordIndexHashSetBuckets, hash_docfreq, comp_docfreq, NULL);
wi->freqs = h;
#endif
size_t wordlen = strlen(word);
wi->word = (char *) malloc(wordlen + 1); // +1 for null-termination
strcpy(wi->word, word);
(wi->word)[wordlen] = '\0';
}