1 : /*
2 : Copyright (C) 2007, Bruce Ediger
3 :
4 : This file is part of cl.
5 :
6 : cl is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 2 of the License, or
9 : (at your option) any later version.
10 :
11 : cl is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with cl; if not, write to the Free Software
18 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 :
20 : */
21 : #include <stdio.h>
22 : #include <stdlib.h>
23 : #include <string.h>
24 :
25 : #include <hashtable.h>
26 :
27 : #include <node.h>
28 : #include <abbreviations.h> /* free_graph() */
29 :
30 : unsigned int hash_djb2(const char *s);
31 : void rehash_hashtable(struct hashtable *h);
32 : void reallocate_buckets(struct hashtable *h);
33 : int sparsebit(long i);
34 : int msbbit(long i);
35 :
36 : /* chain is the dummy node at head of hash chain */
37 : void
38 : insert_node(struct hashnode *chain, struct hashnode *n)
39 0 : {
40 0 : n->next = chain->next;
41 0 : chain->next = n;
42 0 : n->next->prev = n;
43 0 : n->prev = chain;
44 0 : }
45 :
46 : struct hashtable *
47 : init_hashtable(int buckets, int maxload)
48 56 : {
49 : int i;
50 : int f, z;
51 :
52 : struct hashtable *h;
53 :
54 56 : h = malloc(sizeof(*h));
55 :
56 56 : h->flags = 0;
57 :
58 : /* Insure that bucket array size is always a multiple of 2.
59 : * This amounted to about 10% run-time savings for small files
60 : * by avoiding the '%' operator. */
61 :
62 56 : f = msbbit(buckets);
63 56 : z = 1 << f;
64 :
65 56 : if (z < buckets)
66 0 : z <<= 1;
67 :
68 56 : buckets = z;
69 :
70 : /* buckets and sentinels */
71 56 : h->buckets =
72 : (struct hashnode **)malloc(buckets*sizeof(*h->buckets));
73 56 : memset((char *)h->buckets, '\0', buckets*sizeof(*h->buckets));
74 :
75 56 : h->sentinels =
76 : (struct hashnode **)malloc(buckets*sizeof(*h->sentinels));
77 56 : memset((char *)h->sentinels, '\0', buckets*sizeof(*h->sentinels));
78 :
79 3640 : for (i = 0; i < buckets; ++i)
80 : {
81 : /* first node is a dummy - for convenience and to hold chain stats */
82 :
83 3584 : h->buckets[i] = (struct hashnode *)malloc(sizeof(*h->buckets[i]));
84 3584 : h->buckets[i]->data = NULL;
85 3584 : h->buckets[i]->string = NULL;
86 3584 : h->buckets[i]->string_length = 0;
87 :
88 3584 : h->sentinels[i] = (struct hashnode *)malloc(sizeof(*h->sentinels[i]));
89 3584 : h->sentinels[i]->data = NULL;
90 :
91 3584 : h->buckets[i]->next = h->sentinels[i];
92 3584 : h->buckets[i]->prev = h->buckets[i];
93 3584 : h->sentinels[i]->next = h->sentinels[i];
94 3584 : h->sentinels[i]->prev = h->buckets[i];
95 :
96 3584 : h->buckets[i]->string = NULL;
97 3584 : h->buckets[i]->value = 0; /* split count of chain */
98 3584 : h->buckets[i]->nodes_in_chain = 0; /* split count of chain */
99 : }
100 :
101 : /* bucket allocation */
102 56 : h->currentsize = buckets;
103 56 : h->allocated = buckets;
104 :
105 56 : h->maxload = maxload;
106 :
107 : /* bucket splitting */
108 56 : h->p = 0;
109 56 : h->maxp = buckets;
110 :
111 56 : h->node_cnt = 0;
112 56 : h->rehash_cnt = 0;
113 :
114 56 : return h;
115 : }
116 :
117 : void
118 : rehash_hashtable(struct hashtable *h)
119 0 : {
120 : struct hashnode *l;
121 : struct hashnode *oldbucket, *oldtail;
122 : int newindex;
123 :
124 0 : ++h->rehash_cnt;
125 :
126 0 : if (h->currentsize >= h->allocated)
127 0 : reallocate_buckets(h);
128 :
129 0 : oldbucket = h->buckets[h->p];
130 0 : oldtail = h->sentinels[h->p];
131 0 : l = oldbucket->next;
132 :
133 0 : oldbucket->next = oldtail;
134 0 : oldtail->prev = oldbucket;
135 :
136 0 : newindex = h->p + h->maxp;
137 :
138 0 : if (h->flags > 1)
139 : {
140 0 : printf("# %d rehashes, %d total nodes, old bucket %d, new bucket %d\n",
141 : h->rehash_cnt, h->node_cnt, h->p, newindex);
142 0 : printf("# %d nodes in old bucket\n", oldbucket->nodes_in_chain);
143 : }
144 :
145 0 : oldbucket->nodes_in_chain = 0; /* may not be any lines in chain after rehash */
146 0 : ++oldbucket->value; /* number of splits */
147 :
148 0 : ++h->p;
149 :
150 0 : if (h->p == h->maxp)
151 : {
152 0 : h->maxp *= 2;
153 0 : h->p = 0;
154 : }
155 :
156 0 : ++h->currentsize;
157 :
158 0 : while (oldtail != l)
159 : {
160 0 : struct hashnode *t = l->next;
161 0 : int idx = MOD(l->value, h->maxp);
162 :
163 0 : if (idx < h->p)
164 0 : idx = MOD(l->value, (2*h->maxp));
165 :
166 0 : if (idx == newindex)
167 : {
168 0 : insert_node(h->buckets[newindex], l);
169 0 : ++h->buckets[newindex]->nodes_in_chain;
170 : } else {
171 0 : insert_node(oldbucket, l);
172 0 : ++oldbucket->nodes_in_chain;
173 : }
174 :
175 0 : l = t;
176 : }
177 :
178 0 : if (h->flags > 1)
179 : {
180 0 : printf("# split: %d in old chain, %d in new chain\n",
181 : oldbucket->nodes_in_chain, h->buckets[newindex]->nodes_in_chain);
182 : }
183 0 : }
184 :
185 : void
186 : reallocate_buckets(struct hashtable *h)
187 0 : {
188 : int i;
189 : int newallocated;
190 :
191 0 : newallocated = h->allocated * 2;
192 :
193 0 : h->buckets = (struct hashnode **)realloc(
194 : h->buckets,
195 : sizeof(struct hashnode *)*newallocated
196 : );
197 :
198 0 : h->sentinels = (struct hashnode **)realloc(
199 : h->sentinels,
200 : sizeof(struct hashnode *)*newallocated
201 : );
202 :
203 0 : for (i = h->allocated; i < newallocated; ++i)
204 : {
205 0 : h->buckets[i] = (struct hashnode *)malloc(sizeof(*h->buckets[i]));
206 0 : h->sentinels[i] = (struct hashnode *)malloc(sizeof(*h->sentinels[i]));
207 :
208 0 : h->buckets[i]->next = h->sentinels[i];
209 0 : h->sentinels[i]->prev = h->buckets[i];
210 0 : h->sentinels[i]->next = h->sentinels[i];
211 0 : h->buckets[i]->prev = h->buckets[i];
212 :
213 0 : h->buckets[i]->string = NULL;
214 0 : h->buckets[i]->data = NULL;
215 0 : h->sentinels[i]->string = NULL;
216 0 : h->sentinels[i]->data = NULL;
217 :
218 0 : h->buckets[i]->nodes_in_chain = 0;
219 0 : h->buckets[i]->value = 0; /* number of times chain splits, too */
220 : }
221 :
222 0 : h->allocated = newallocated;
223 0 : }
224 :
225 : const char *
226 : add_string(struct hashtable *h, const char *string)
227 1398 : {
228 1398 : return (const char *)add_data(h, string, NULL);
229 : }
230 :
231 : void *
232 : add_data(struct hashtable *h, const char *string, void *data)
233 1398 : {
234 : struct hashnode *n, *head;
235 : int bucket_index;
236 : unsigned int hv;
237 : int hash_mod;
238 :
239 1398 : if((n = node_lookup(h, string, &hv)))
240 : {
241 1029 : void *r = NULL;
242 1029 : if (data)
243 : {
244 0 : r = (void *)n->data;
245 0 : n->data = data;
246 : } else {
247 1029 : r = (void *)n->string;
248 : }
249 1029 : return r;
250 : }
251 :
252 369 : bucket_index = MOD(hv, h->maxp);
253 369 : hash_mod = h->maxp;
254 369 : if (bucket_index < h->p)
255 : {
256 0 : bucket_index = MOD(hv, (2*h->maxp));
257 0 : hash_mod = 2*h->maxp;
258 : }
259 :
260 369 : if (h->flags > 1)
261 0 : printf("# \"%s\": hash value %u, base %d, bucket %d\n", string, hv, hash_mod, bucket_index);
262 :
263 : /* allocate new node */
264 369 : n = (struct hashnode *)malloc(sizeof(*n));
265 :
266 : /* fill in newly allocated node */
267 369 : n->value = hv;
268 369 : n->string_length = strlen(string);
269 369 : n->string = malloc(n->string_length+1);
270 369 : memcpy(n->string, string, n->string_length+1);
271 369 : n->data = data;
272 :
273 : /* add newly allocated node to appropriate chain */
274 369 : head = h->buckets[bucket_index];
275 :
276 : /* just push it on the chain */
277 369 : n->next = head->next;
278 369 : head->next = n;
279 369 : n->next->prev = n;
280 369 : n->prev = head;
281 :
282 369 : ++h->buckets[bucket_index]->nodes_in_chain;
283 :
284 369 : ++h->node_cnt;
285 :
286 : /* "load" on hashtable too high? */
287 369 : if (h->node_cnt/h->currentsize > h->maxload)
288 0 : rehash_hashtable(h);
289 :
290 369 : return (NULL == data)? (void *)n->string: NULL;
291 : }
292 :
293 : void free_hashtable(struct hashtable *h)
294 56 : {
295 : unsigned int i;
296 :
297 3640 : for (i = 0; i < h->currentsize; ++i)
298 : {
299 3584 : struct hashnode *chain = h->buckets[i];
300 11121 : while (chain != h->sentinels[i])
301 : {
302 3953 : struct hashnode *tmp = chain->next;
303 3953 : if (chain->string)
304 : {
305 369 : free(chain->string);
306 369 : chain->string = NULL;
307 : }
308 3953 : free_graph(chain->data);
309 3953 : chain->data = NULL;
310 3953 : free(chain);
311 3953 : chain = tmp;
312 : }
313 3584 : free(h->sentinels[i]);
314 3584 : h->sentinels[i] = NULL;
315 : }
316 :
317 : /* free the dummy heads and sentinels that the table
318 : * doesn't currently use - it doubles bucket count
319 : * every time the "load" gets too high */
320 56 : for (i = h->currentsize; i < h->allocated; ++i)
321 : {
322 0 : free(h->buckets[i]);
323 0 : h->buckets[i] = NULL;
324 0 : free(h->sentinels[i]);
325 0 : h->sentinels[i] = NULL;
326 : }
327 :
328 56 : free(h->buckets);
329 56 : free(h->sentinels);
330 56 : h->buckets = NULL;
331 56 : h->sentinels = NULL;
332 :
333 56 : free(h);
334 56 : h = NULL;
335 56 : }
336 :
337 : const char *
338 : string_lookup(struct hashtable *h, const char *string_to_lookup)
339 0 : {
340 0 : struct hashnode *hn = NULL;
341 0 : char *r = NULL;
342 : unsigned int hv; /* dummy - not used in this function */
343 :
344 0 : if (NULL != (hn = node_lookup(h, string_to_lookup, &hv)))
345 0 : r = hn->string;
346 :
347 0 : return r;
348 : }
349 :
350 : void *
351 : data_lookup(struct hashtable *h, const char *string_to_lookup)
352 1017 : {
353 1017 : struct hashnode *hn = NULL;
354 1017 : void *r = NULL;
355 : unsigned int hv; /* dummy - not used in this function */
356 :
357 1017 : if (NULL != (hn = node_lookup(h, string_to_lookup, &hv)))
358 1017 : r = hn->data;
359 :
360 1017 : return r;
361 : }
362 :
363 : /* note that 3rd argument must evaluate to non-NULL - returns hash value
364 : * of string_to_lookup to add_data() to allow it to skip doing a hash
365 : */
366 : struct hashnode *
367 : node_lookup(struct hashtable *h, const char *string_to_lookup, unsigned int *rhv)
368 2538 : {
369 2538 : struct hashnode *r = NULL;
370 2538 : unsigned int hashval = hash_djb2(string_to_lookup);
371 2538 : int idx = MOD(hashval, h->maxp);
372 : struct hashnode *chain;
373 :
374 2538 : *rhv = hashval;
375 :
376 2538 : if (idx < h->p)
377 0 : idx = MOD(hashval, (2*h->maxp));
378 :
379 2538 : chain = h->buckets[idx]->next;
380 :
381 5268 : while (chain != h->sentinels[idx])
382 : {
383 : /* checking the equivalence of hash value first prevents
384 : * expensive byte-by-byte strcmp(). Seems to improve performance. */
385 2361 : if (chain->value == hashval &&
386 : !strcmp(chain->string, string_to_lookup))
387 : {
388 2169 : r = chain;
389 2169 : break;
390 : }
391 :
392 192 : chain = chain->next;
393 : }
394 :
395 2538 : return r;
396 : }
397 :
398 : unsigned int
399 : hash_djb2(const char *str)
400 2538 : {
401 2538 : unsigned long hash = 5381;
402 : int c;
403 :
404 12254 : while ((c = *str++))
405 7178 : hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
406 :
407 2538 : return hash;
408 : }
409 :
410 : int
411 : sparsebit(long i)
412 56 : {
413 56 : int p = 0;
414 :
415 56 : if (i == 0) return(-1); /* no bits set */
416 :
417 56 : if ((i & (-i)) != i) return(-1); /* two or more bits set */
418 :
419 56 : if (i & 0xAAAAAAAA) p++;
420 56 : if (i & 0xCCCCCCCC) p += 2;
421 56 : if (i & 0xF0F0F0F0) p += 4;
422 56 : if (i & 0xFF00FF00) p += 8;
423 56 : if (i & 0xFFFF0000) p += 16;
424 :
425 56 : return p;
426 : }
427 :
428 : int
429 : msbbit(long i)
430 56 : {
431 : long i2;
432 :
433 56 : if (i<0) return(31); /* speed-up assumes a 'long's msb is at pos 31 */
434 :
435 56 : while ((i2 = (i & (i-1)))) i = i2;
436 :
437 56 : return(sparsebit(i));
438 : }
|