strcache2.c@ 1921

Last change on this file since 1921 was 1921, checked in by bird, 16 years ago
strcache2: disabled the experimental stuff because I'm unsure about the actual gain on all CPUs.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 38.2 KB

Line
1	/* $Id: strcache2.c 1921 2008-10-24 00:30:58Z bird $ */
2	/** @file
3	* strcache2 - New string cache.
4	*/
5
6	/*
7	* Copyright (c) 2008 knut st. osmundsen <[email protected]>
8	*
9	* This file is part of kBuild.
10	*
11	* kBuild is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* kBuild is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with kBuild; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24	*
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include "make.h"
32	#include "strcache2.h"
33
34	#include <assert.h>
35
36	#include "debug.h"
37
38	#ifdef _MSC_VER
39	typedef unsigned char uint8_t;
40	typedef unsigned short uint16_t;
41	typedef unsigned int uint32_t;
42	typedef signed char int8_t;
43	typedef signed short int16_t;
44	typedef signed int int32_t;
45	#else
46	# include <stdint.h>
47	#endif
48
49	#ifdef WINDOWS32
50	# include <io.h>
51	# include <process.h>
52	# include <Windows.h>
53	# define PARSE_IN_WORKER
54	#endif
55
56	#ifdef __OS2__
57	# include <sys/fmutex.h>
58	#endif
59
60	#ifdef HAVE_PTHREAD
61	# include <pthread.h>
62	#endif
63
64
65	/*******************************************************************************
66	* Defined Constants And Macros *
67	*******************************************************************************/
68	/* The default size of a memory segment (1MB). */
69	#define STRCACHE2_SEG_SIZE (1024U*1024U)
70	/* The default hash table shift (hash size give as a power of two). */
71	#define STRCACHE2_HASH_SHIFT 16
72	/** Does the modding / masking of a hash number into an index. */
73	#ifdef STRCACHE2_USE_MASK
74	# define STRCACHE2_MOD_IT(cache, hash) ((hash) & (cache)->hash_mask)
75	#else
76	# define STRCACHE2_MOD_IT(cache, hash) ((hash) % (cache)->hash_div)
77	#endif
78
79	# if defined(__amd64__) \|\| defined(__x86_64__) \|\| defined(__AMD64__) \|\| defined(_M_X64) \|\| defined(__amd64) \
80	\|\| defined(__i386__) \|\| defined(__x86__) \|\| defined(__X86__) \|\| defined(_M_IX86) \|\| defined(__i386)
81	# define strcache2_get_unaligned_16bits(ptr) ( ((const uint16_t )(ptr)))
82	# else
83	/* (endian doesn't matter) */
84	# define strcache2_get_unaligned_16bits(ptr) ( (((const uint8_t *)(ptr))[0] << 8) \
85	\| (((const uint8_t *)(ptr))[1]) )
86	# endif
87
88
89	/*******************************************************************************
90	* Global Variables *
91	*******************************************************************************/
92	/* List of initialized string caches. */
93	static struct strcache2 *strcache_head;
94
95
96	/** Finds the closest primary number for power of two value (or something else
97	* useful if not support). */
98	MY_INLINE unsigned int strcache2_find_prime(unsigned int shift)
99	{
100	switch (shift)
101	{
102	case 5: return 31;
103	case 6: return 61;
104	case 7: return 127;
105	case 8: return 251;
106	case 9: return 509;
107	case 10: return 1021;
108	case 11: return 2039;
109	case 12: return 4093;
110	case 13: return 8191;
111	case 14: return 16381;
112	case 15: return 32749;
113	case 16: return 65521;
114	case 17: return 131063;
115	case 18: return 262139;
116	case 19: return 524269;
117	case 20: return 1048573;
118	case 21: return 2097143;
119	case 22: return 4194301;
120	case 23: return 8388593;
121
122	default:
123	assert (0);
124	return (1 << shift) - 1;
125	}
126	}
127
128	/* The following is a bit experiment. It produces longer chains, i.e. worse
129	distribution of the strings in the table, however the actual make
130	performances is better (<time). The explanation is probably that the
131	collisions only really increase for entries that aren't looked up that
132	much and that it actually improoves the situation for those that is. Or
133	that we spend so much less time hashing that it makes up (and more) for
134	the pentalty we suffer from the longer chains and worse distribution.
135
136	XXX: Check how this works out with different path lengths. I suspect it
137	might depend on the length of PATH_ROOT and the depth of the files
138	in the project as well. If it does, this might make matters worse
139	for some and better for others which isn't very cool... */
140
141	#if 0
142	# define BIG_HASH_SIZE 32 /* kinda fast */
143	# define BIG_HASH_HEAD 16
144	# define BIG_HASH_TAIL 12
145	#elif 0
146	# define BIG_HASH_SIZE 68 /* kinda safe */
147	# define BIG_HASH_HEAD 24
148	# define BIG_HASH_TAIL 24
149	#elif 0
150	# define BIG_HASH_SIZE 128 /* safe */
151	# define BIG_HASH_HEAD 32
152	# define BIG_HASH_TAIL 32
153	#endif
154
155	#ifdef BIG_HASH_SIZE
156	/* long string: hash head and tail, drop the middle. */
157	MY_INLINE unsigned int
158	strcache2_case_sensitive_hash_big (const char *str, unsigned int len)
159	{
160	uint32_t hash = len;
161	uint32_t tmp;
162	unsigned int head;
163
164	/* head BIG_HASH_HEAD bytes */
165	head = (BIG_HASH_HEAD >> 2);
166	while (head-- > 0)
167	{
168	hash += strcache2_get_unaligned_16bits (str);
169	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
170	hash = (hash << 16) ^ tmp;
171	str += 2 * sizeof (uint16_t);
172	hash += hash >> 11;
173	}
174
175	/* tail BIG_HASH_TAIL bytes (minus the odd ones) */
176	str += (len - BIG_HASH_HEAD - BIG_HASH_TAIL) & ~3U;
177	head = (BIG_HASH_TAIL >> 2);
178	while (head-- > 0)
179	{
180	hash += strcache2_get_unaligned_16bits (str);
181	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
182	hash = (hash << 16) ^ tmp;
183	str += 2 * sizeof (uint16_t);
184	hash += hash >> 11;
185	}
186
187	/* force "avalanching" of final 127 bits. */
188	hash ^= hash << 3;
189	hash += hash >> 5;
190	hash ^= hash << 4;
191	hash += hash >> 17;
192	hash ^= hash << 25;
193	hash += hash >> 6;
194
195	return hash;
196	}
197	#endif /* BIG_HASH_SIZE */
198
199	MY_INLINE unsigned int
200	strcache2_case_sensitive_hash (const char *str, unsigned int len)
201	{
202	#if 1
203	/* Paul Hsieh hash SuperFast function:
204	http://www.azillionmonkeys.com/qed/hash.html
205
206	This performs very good and as a sligtly better distribution than
207	STRING_N_HASH_1 on a typical kBuild run.
208
209	It is also 37% faster than return_STRING_N_HASH_1 when running the
210	two 100 times over typical kBuild strings that end up here (did a
211	fprintf here and built kBuild). Compiler was 32-bit gcc 4.0.1, darwin,
212	with -O2.
213
214	FIXME: A path for well aligned data should be added to speed up
215	execution on alignment sensitive systems. */
216	unsigned int rem;
217	uint32_t hash;
218	uint32_t tmp;
219
220	assert (sizeof (uint8_t) == sizeof (char));
221
222	# ifdef BIG_HASH_SIZE
223	/* long string? */
224	# if 0 /BIG_HASH_SIZE > 128/
225	if (MY_PREDICT_FALSE(len >= BIG_HASH_SIZE))
226	# else
227	if (len >= BIG_HASH_SIZE)
228	# endif
229	return strcache2_case_sensitive_hash_big (str, len);
230	# endif
231
232	/* short string: main loop, walking on 2 x uint16_t */
233	hash = len;
234	rem = len & 3;
235	len >>= 2;
236	while (len > 0)
237	{
238	hash += strcache2_get_unaligned_16bits (str);
239	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
240	hash = (hash << 16) ^ tmp;
241	str += 2 * sizeof (uint16_t);
242	hash += hash >> 11;
243	len--;
244	}
245
246	/* the remainder */
247	switch (rem)
248	{
249	case 3:
250	hash += strcache2_get_unaligned_16bits (str);
251	hash ^= hash << 16;
252	hash ^= str[sizeof (uint16_t)] << 18;
253	hash += hash >> 11;
254	break;
255	case 2:
256	hash += strcache2_get_unaligned_16bits (str);
257	hash ^= hash << 11;
258	hash += hash >> 17;
259	break;
260	case 1:
261	hash += *str;
262	hash ^= hash << 10;
263	hash += hash >> 1;
264	break;
265	}
266
267	/* force "avalanching" of final 127 bits. */
268	hash ^= hash << 3;
269	hash += hash >> 5;
270	hash ^= hash << 4;
271	hash += hash >> 17;
272	hash ^= hash << 25;
273	hash += hash >> 6;
274
275	return hash;
276
277	#elif 1
278	/* Note! This implementation is 18% faster than return_STRING_N_HASH_1
279	when running the two 100 times over typical kBuild strings that
280	end up here (did a fprintf here and built kBuild).
281	Compiler was 32-bit gcc 4.0.1, darwin, with -O2. */
282
283	unsigned int hash = 0;
284	if (MY_PREDICT_TRUE(len >= 2))
285	{
286	unsigned int ch0 = *str++;
287	hash = 0;
288	len--;
289	while (len >= 2)
290	{
291	unsigned int ch1 = *str++;
292	hash += ch0 << (ch1 & 0xf);
293
294	ch0 = *str++;
295	hash += ch1 << (ch0 & 0xf);
296
297	len -= 2;
298	}
299	if (len == 1)
300	{
301	unsigned ch1 = *str;
302	hash += ch0 << (ch1 & 0xf);
303
304	hash += ch1;
305	}
306	else
307	hash += ch0;
308	}
309	else if (len)
310	{
311	hash = *str;
312	hash += hash << (hash & 0xf);
313	}
314	else
315	hash = 0;
316	return hash;
317
318	#elif 1
319	# if 0
320	/* This is SDBM. This specific form/unroll was benchmarked to be 28% faster
321	than return_STRING_N_HASH_1. (Now the weird thing is that putting the (ch)
322	first in the assignment made it noticably slower.)
323
324	However, it is noticably slower in practice, most likely because of more
325	collisions. Hrmpf. */
326
327	# define UPDATE_HASH(ch) hash = (hash << 6) + (hash << 16) - hash + (ch)
328	unsigned int hash = 0;
329
330	# else
331	/* This is DJB2. This specific form/unroll was benchmarked to be 27%
332	fast than return_STRING_N_HASH_1.
333
334	Ditto. */
335
336	# define UPDATE_HASH(ch) hash = (hash << 5) + hash + (ch)
337	unsigned int hash = 5381;
338	# endif
339
340
341	while (len >= 4)
342	{
343	UPDATE_HASH (str[0]);
344	UPDATE_HASH (str[1]);
345	UPDATE_HASH (str[2]);
346	UPDATE_HASH (str[3]);
347	str += 4;
348	len -= 4;
349	}
350	switch (len)
351	{
352	default:
353	case 0:
354	return hash;
355	case 1:
356	UPDATE_HASH (str[0]);
357	return hash;
358	case 2:
359	UPDATE_HASH (str[0]);
360	UPDATE_HASH (str[1]);
361	return hash;
362	case 3:
363	UPDATE_HASH (str[0]);
364	UPDATE_HASH (str[1]);
365	UPDATE_HASH (str[2]);
366	return hash;
367	}
368	#endif
369	}
370
371	MY_INLINE unsigned int
372	strcache2_case_insensitive_hash (const char *str, unsigned int len)
373	{
374	unsigned int hash = 0;
375	if (MY_PREDICT_TRUE(len >= 2))
376	{
377	unsigned int ch0 = *str++;
378	ch0 = tolower (ch0);
379	hash = 0;
380	len--;
381	while (len >= 2)
382	{
383	unsigned int ch1 = *str++;
384	ch1 = tolower (ch1);
385	hash += ch0 << (ch1 & 0xf);
386
387	ch0 = *str++;
388	ch0 = tolower (ch0);
389	hash += ch1 << (ch0 & 0xf);
390
391	len -= 2;
392	}
393	if (len == 1)
394	{
395	unsigned ch1 = *str;
396	ch1 = tolower (ch1);
397	hash += ch0 << (ch1 & 0xf);
398
399	hash += ch1;
400	}
401	else
402	hash += ch0;
403	}
404	else if (len)
405	{
406	hash = *str;
407	hash += hash << (hash & 0xf);
408	}
409	else
410	hash = 0;
411	return hash;
412	}
413
414	MY_INLINE int
415	strcache2_memcmp_inline_short (const char xs, const char ys, unsigned int length)
416	{
417	if (length <= 8)
418	{
419	/* short string compare - ~50% of the kBuild calls. */
420	assert ( !((size_t)ys & 3) );
421	if (!((size_t)xs & 3))
422	{
423	/* aligned */
424	int result;
425	switch (length)
426	{
427	default: /* memcmp for longer strings */
428	return memcmp (xs, ys, length);
429	case 8:
430	result = (int32_t)(xs + 4) - (int32_t)(ys + 4);
431	result \|= (int32_t)xs - (int32_t)ys;
432	return result;
433	case 7:
434	result = xs[6] - ys[6];
435	result \|= xs[5] - ys[5];
436	result \|= xs[4] - ys[4];
437	result \|= (int32_t)xs - (int32_t)ys;
438	return result;
439	case 6:
440	result = xs[5] - ys[5];
441	result \|= xs[4] - ys[4];
442	result \|= (int32_t)xs - (int32_t)ys;
443	return result;
444	case 5:
445	result = xs[4] - ys[4];
446	result \|= (int32_t)xs - (int32_t)ys;
447	return result;
448	case 4:
449	return (int32_t)xs - (int32_t)ys;
450	case 3:
451	result = xs[2] - ys[2];
452	result \|= xs[1] - ys[1];
453	result \|= xs[0] - ys[0];
454	return result;
455	case 2:
456	result = xs[1] - ys[1];
457	result \|= xs[0] - ys[0];
458	return result;
459	case 1:
460	return xs - ys;
461	case 0:
462	return 0;
463	}
464	}
465	else
466	{
467	/* unaligned */
468	int result = 0;
469	switch (length)
470	{
471	case 8: result \|= xs[7] - ys[7];
472	case 7: result \|= xs[6] - ys[6];
473	case 6: result \|= xs[5] - ys[5];
474	case 5: result \|= xs[4] - ys[4];
475	case 4: result \|= xs[3] - ys[3];
476	case 3: result \|= xs[2] - ys[2];
477	case 2: result \|= xs[1] - ys[1];
478	case 1: result \|= xs[0] - ys[0];
479	case 0:
480	return result;
481	}
482	}
483	}
484
485	/* memcmp for longer strings */
486	return memcmp (xs, ys, length);
487	}
488
489	MY_INLINE int
490	strcache2_memcmp_inlined (const char xs, const char ys, unsigned int length)
491	{
492	#ifndef ELECTRIC_HEAP
493	assert ( !((size_t)ys & 3) );
494	#endif
495	if (!((size_t)xs & 3))
496	{
497	int result;
498	/* aligned */
499	while (length >= 8)
500	{
501	result = (int32_t)xs - (int32_t)ys;
502	result \|= (int32_t)(xs + 4) - (int32_t)(ys + 4);
503	if (MY_PREDICT_FALSE(result))
504	return result;
505	xs += 8;
506	ys += 8;
507	length -= 8;
508	}
509	switch (length)
510	{
511	case 7:
512	result = (int32_t)xs - (int32_t)ys;
513	result \|= xs[6] - ys[6];
514	result \|= xs[5] - ys[5];
515	result \|= xs[4] - ys[4];
516	return result;
517	case 6:
518	result = (int32_t)xs - (int32_t)ys;
519	result \|= xs[5] - ys[5];
520	result \|= xs[4] - ys[4];
521	return result;
522	case 5:
523	result = (int32_t)xs - (int32_t)ys;
524	result \|= xs[4] - ys[4];
525	return result;
526	case 4:
527	return (int32_t)xs - (int32_t)ys;
528	case 3:
529	result = xs[2] - ys[2];
530	result \|= xs[1] - ys[1];
531	result \|= xs[0] - ys[0];
532	return result;
533	case 2:
534	result = xs[1] - ys[1];
535	result \|= xs[0] - ys[0];
536	return result;
537	case 1:
538	return xs - ys;
539	default:
540	case 0:
541	return 0;
542	}
543	}
544	else
545	{
546	/* unaligned */
547	int result;
548	while (length >= 8)
549	{
550	#if defined(__i386__) \|\| defined(__x86_64__)
551	result = ( ((int32_t)xs[3] << 24)
552	\| ((int32_t)xs[2] << 16)
553	\| ((int32_t)xs[1] << 8)
554	\| xs[0] )
555	- (int32_t)ys;
556	result \|= ( ((int32_t)xs[7] << 24)
557	\| ((int32_t)xs[6] << 16)
558	\| ((int32_t)xs[5] << 8)
559	\| xs[4] )
560	- (int32_t)(ys + 4);
561	#else
562	result = xs[3] - ys[3];
563	result \|= xs[2] - ys[2];
564	result \|= xs[1] - ys[1];
565	result \|= xs[0] - ys[0];
566	result \|= xs[7] - ys[7];
567	result \|= xs[6] - ys[6];
568	result \|= xs[5] - ys[5];
569	result \|= xs[4] - ys[4];
570	#endif
571	if (MY_PREDICT_FALSE(result))
572	return result;
573	xs += 8;
574	ys += 8;
575	length -= 8;
576	}
577	result = 0;
578	switch (length)
579	{
580	case 7: result \|= xs[6] - ys[6];
581	case 6: result \|= xs[5] - ys[5];
582	case 5: result \|= xs[4] - ys[4];
583	case 4: result \|= xs[3] - ys[3];
584	case 3: result \|= xs[2] - ys[2];
585	case 2: result \|= xs[1] - ys[1];
586	case 1: result \|= xs[0] - ys[0];
587	return result;
588	default:
589	case 0:
590	return 0;
591	}
592	}
593	}
594
595	MY_INLINE int
596	strcache2_is_equal (struct strcache2 cache, struct strcache2_entry const entry,
597	const char *str, unsigned int length, unsigned int hash)
598	{
599	assert (!cache->case_insensitive);
600
601	/* the simple stuff first. */
602	if ( entry->hash != hash
603	\|\| entry->length != length)
604	return 0;
605
606	#if 0
607	return memcmp (str, entry + 1, length) == 0;
608	#elif 1
609	return strcache2_memcmp_inlined (str, (const char *)(entry + 1), length) == 0;
610	#else
611	return strcache2_memcmp_inline_short (str, (const char *)(entry + 1), length) == 0;
612	#endif
613	}
614
615	MY_INLINE int
616	strcache2_is_iequal (struct strcache2 cache, struct strcache2_entry const entry,
617	const char *str, unsigned int length, unsigned int hash)
618	{
619	assert (cache->case_insensitive);
620
621	/* the simple stuff first. */
622	if ( entry->hash != hash
623	\|\| entry->length != length)
624	return 0;
625
626	#if defined(_MSC_VER) \|\| defined(__OS2__)
627	return _memicmp (entry + 1, str, length) == 0;
628	#else
629	return strncasecmp ((const char *)(entry + 1), str, length) == 0;
630	#endif
631	}
632
633	static void
634	strcache2_rehash (struct strcache2 *cache)
635	{
636	unsigned int src = cache->hash_size;
637	struct strcache2_entry **src_tab = cache->hash_tab;
638	struct strcache2_entry **dst_tab;
639	#ifndef STRCACHE2_USE_MASK
640	unsigned int hash_shift;
641	#endif
642
643	/* Allocate a new hash table twice the size of the current. */
644	cache->hash_size <<= 1;
645	#ifdef STRCACHE2_USE_MASK
646	cache->hash_mask <<= 1;
647	cache->hash_mask \|= 1;
648	#else
649	for (hash_shift = 1; (1U << hash_shift) < cache->hash_size; hash_shift++)
650	/* nothing */;
651	cache->hash_div = strcache2_find_prime (hash_shift);
652	#endif
653	cache->rehash_count <<= 1;
654	cache->hash_tab = dst_tab = (struct strcache2_entry **)
655	xmalloc (cache->hash_size * sizeof (struct strcache2_entry *));
656	memset (dst_tab, '\0', cache->hash_size * sizeof (struct strcache2_entry *));
657
658	/* Copy the entries from the old to the new hash table. */
659	cache->collision_count = 0;
660	while (src-- > 0)
661	{
662	struct strcache2_entry *entry = src_tab[src];
663	while (entry)
664	{
665	struct strcache2_entry *next = entry->next;
666	unsigned int dst = STRCACHE2_MOD_IT (cache, entry->hash);
667	if ((entry->next = dst_tab[dst]) != 0)
668	cache->collision_count++;
669	dst_tab[dst] = entry;
670
671	entry = next;
672	}
673	}
674
675	/* That's it, just free the old table and we're done. */
676	free (src_tab);
677	}
678
679	static struct strcache2_seg *
680	strcache2_new_seg (struct strcache2 *cache, unsigned int minlen)
681	{
682	struct strcache2_seg *seg;
683	size_t size;
684	size_t off;
685
686	size = cache->def_seg_size;
687	if (size < (size_t)minlen + sizeof (struct strcache2_seg) + STRCACHE2_ENTRY_ALIGNMENT)
688	{
689	size = (size_t)minlen * 2;
690	size = (size + 0xfff) & ~(size_t)0xfff;
691	}
692
693	seg = xmalloc (size);
694	seg->start = (char *)(seg + 1);
695	seg->size = size - sizeof (struct strcache2_seg);
696	off = (size_t)seg->start & (STRCACHE2_ENTRY_ALIGNMENT - 1);
697	if (off)
698	{
699	seg->start += off;
700	seg->size -= off;
701	}
702	assert (seg->size > minlen);
703	seg->cursor = seg->start;
704	seg->avail = seg->size;
705
706	seg->next = cache->seg_head;
707	cache->seg_head = seg;
708
709	return seg;
710	}
711
712	/* Internal worker that enters a new string into the cache. */
713	static const char *
714	strcache2_enter_string (struct strcache2 *cache, unsigned int idx,
715	const char *str, unsigned int length,
716	unsigned int hash)
717	{
718	struct strcache2_entry *entry;
719	struct strcache2_seg *seg;
720	unsigned int size;
721	char *str_copy;
722
723	/* Allocate space for the string. */
724
725	size = length + 1 + sizeof (struct strcache2_entry);
726	size = (size + STRCACHE2_ENTRY_ALIGNMENT - 1) & ~(STRCACHE2_ENTRY_ALIGNMENT - 1U);
727
728	seg = cache->seg_head;
729	if (MY_PREDICT_FALSE(seg->avail < size))
730	{
731	do
732	seg = seg->next;
733	while (seg && seg->avail < size);
734	if (!seg)
735	seg = strcache2_new_seg (cache, size);
736	}
737
738	entry = (struct strcache2_entry *) seg->cursor;
739	assert ((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1));
740	seg->cursor += size;
741	seg->avail -= size;
742
743	/* Setup the entry, copy the string and insert it into the hash table. */
744
745	entry->user = NULL;
746	entry->length = length;
747	entry->hash = hash;
748	str_copy = (char *) memcpy (entry + 1, str, length);
749	str_copy[length] = '\0';
750
751	if ((entry->next = cache->hash_tab[idx]) != 0)
752	cache->collision_count++;
753	cache->hash_tab[idx] = entry;
754	cache->count++;
755	if (cache->count >= cache->rehash_count)
756	strcache2_rehash (cache);
757
758	return str_copy;
759	}
760
761	/* The public add string interface. */
762	const char *
763	strcache2_add (struct strcache2 cache, const char str, unsigned int length)
764	{
765	struct strcache2_entry const *entry;
766	unsigned int hash = strcache2_case_sensitive_hash (str, length);
767	unsigned int idx;
768
769	assert (!cache->case_insensitive);
770
771	MAKE_STATS (cache->lookup_count++);
772
773	/* Lookup the entry in the hash table, hoping for an
774	early match. If not found, enter the string at IDX. */
775	idx = STRCACHE2_MOD_IT (cache, hash);
776	entry = cache->hash_tab[idx];
777	if (!entry)
778	return strcache2_enter_string (cache, idx, str, length, hash);
779	if (strcache2_is_equal (cache, entry, str, length, hash))
780	return (const char *)(entry + 1);
781	MAKE_STATS (cache->collision_1st_count++);
782
783	entry = entry->next;
784	if (!entry)
785	return strcache2_enter_string (cache, idx, str, length, hash);
786	if (strcache2_is_equal (cache, entry, str, length, hash))
787	return (const char *)(entry + 1);
788	MAKE_STATS (cache->collision_2nd_count++);
789
790	/* Loop the rest. */
791	for (;;)
792	{
793	entry = entry->next;
794	if (!entry)
795	return strcache2_enter_string (cache, idx, str, length, hash);
796	if (strcache2_is_equal (cache, entry, str, length, hash))
797	return (const char *)(entry + 1);
798	MAKE_STATS (cache->collision_3rd_count++);
799	}
800	/* not reached */
801	}
802
803	/* The public add string interface for prehashed strings.
804	Use strcache2_hash_str to calculate the hash of a string. */
805	const char *
806	strcache2_add_hashed (struct strcache2 cache, const char str,
807	unsigned int length, unsigned int hash)
808	{
809	struct strcache2_entry const *entry;
810	unsigned int idx;
811	#ifndef NDEBUG
812	unsigned correct_hash;
813
814	assert (!cache->case_insensitive);
815	correct_hash = strcache2_case_sensitive_hash (str, length);
816	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
817	#endif /* NDEBUG */
818
819	MAKE_STATS (cache->lookup_count++);
820
821	/* Lookup the entry in the hash table, hoping for an
822	early match. If not found, enter the string at IDX. */
823	idx = STRCACHE2_MOD_IT (cache, hash);
824	entry = cache->hash_tab[idx];
825	if (!entry)
826	return strcache2_enter_string (cache, idx, str, length, hash);
827	if (strcache2_is_equal (cache, entry, str, length, hash))
828	return (const char *)(entry + 1);
829	MAKE_STATS (cache->collision_1st_count++);
830
831	entry = entry->next;
832	if (!entry)
833	return strcache2_enter_string (cache, idx, str, length, hash);
834	if (strcache2_is_equal (cache, entry, str, length, hash))
835	return (const char *)(entry + 1);
836	MAKE_STATS (cache->collision_2nd_count++);
837
838	/* Loop the rest. */
839	for (;;)
840	{
841	entry = entry->next;
842	if (!entry)
843	return strcache2_enter_string (cache, idx, str, length, hash);
844	if (strcache2_is_equal (cache, entry, str, length, hash))
845	return (const char *)(entry + 1);
846	MAKE_STATS (cache->collision_3rd_count++);
847	}
848	/* not reached */
849	}
850
851	/* The public lookup (case sensitive) string interface. */
852	const char *
853	strcache2_lookup (struct strcache2 cache, const char str, unsigned int length)
854	{
855	struct strcache2_entry const *entry;
856	unsigned int hash = strcache2_case_sensitive_hash (str, length);
857	unsigned int idx;
858
859	assert (!cache->case_insensitive);
860
861	MAKE_STATS (cache->lookup_count++);
862
863	/* Lookup the entry in the hash table, hoping for an
864	early match. */
865	idx = STRCACHE2_MOD_IT (cache, hash);
866	entry = cache->hash_tab[idx];
867	if (!entry)
868	return NULL;
869	if (strcache2_is_equal (cache, entry, str, length, hash))
870	return (const char *)(entry + 1);
871	MAKE_STATS (cache->collision_1st_count++);
872
873	entry = entry->next;
874	if (!entry)
875	return NULL;
876	if (strcache2_is_equal (cache, entry, str, length, hash))
877	return (const char *)(entry + 1);
878	MAKE_STATS (cache->collision_2nd_count++);
879
880	/* Loop the rest. */
881	for (;;)
882	{
883	entry = entry->next;
884	if (!entry)
885	return NULL;
886	if (strcache2_is_equal (cache, entry, str, length, hash))
887	return (const char *)(entry + 1);
888	MAKE_STATS (cache->collision_3rd_count++);
889	}
890	/* not reached */
891	}
892
893	#if defined(HAVE_CASE_INSENSITIVE_FS)
894
895	/* The public add string interface for case insensitive strings. */
896	const char *
897	strcache2_iadd (struct strcache2 cache, const char str, unsigned int length)
898	{
899	struct strcache2_entry const *entry;
900	unsigned int hash = strcache2_case_insensitive_hash (str, length);
901	unsigned int idx;
902
903	assert (!cache->case_insensitive);
904
905	MAKE_STATS (cache->lookup_count++);
906
907	/* Lookup the entry in the hash table, hoping for an
908	early match. If not found, enter the string at IDX. */
909	idx = STRCACHE2_MOD_IT (cache, hash);
910	entry = cache->hash_tab[idx];
911	if (!entry)
912	return strcache2_enter_string (cache, idx, str, length, hash);
913	if (strcache2_is_equal (cache, entry, str, length, hash))
914	return (const char *)(entry + 1);
915	MAKE_STATS (cache->collision_1st_count++);
916
917	entry = entry->next;
918	if (!entry)
919	return strcache2_enter_string (cache, idx, str, length, hash);
920	if (strcache2_is_equal (cache, entry, str, length, hash))
921	return (const char *)(entry + 1);
922	MAKE_STATS (cache->collision_2nd_count++);
923
924	/* Loop the rest. */
925	for (;;)
926	{
927	entry = entry->next;
928	if (!entry)
929	return strcache2_enter_string (cache, idx, str, length, hash);
930	if (strcache2_is_equal (cache, entry, str, length, hash))
931	return (const char *)(entry + 1);
932	MAKE_STATS (cache->collision_3rd_count++);
933	}
934	/* not reached */
935	}
936
937	/* The public add string interface for prehashed case insensitive strings.
938	Use strcache2_hash_istr to calculate the hash of a string. */
939	const char *
940	strcache2_iadd_hashed (struct strcache2 cache, const char str,
941	unsigned int length, unsigned int hash)
942	{
943	struct strcache2_entry const *entry;
944	unsigned int idx;
945	#ifndef NDEBUG
946	unsigned correct_hash;
947
948	assert (!cache->case_insensitive);
949	correct_hash = strcache2_case_insensitive_hash (str, length);
950	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
951	#endif /* NDEBUG */
952
953	MAKE_STATS (cache->lookup_count++);
954
955	/* Lookup the entry in the hash table, hoping for an
956	early match. If not found, enter the string at IDX. */
957	idx = STRCACHE2_MOD_IT (cache, hash);
958	entry = cache->hash_tab[idx];
959	if (!entry)
960	return strcache2_enter_string (cache, idx, str, length, hash);
961	if (strcache2_is_equal (cache, entry, str, length, hash))
962	return (const char *)(entry + 1);
963	MAKE_STATS (cache->collision_1st_count++);
964
965	entry = entry->next;
966	if (!entry)
967	return strcache2_enter_string (cache, idx, str, length, hash);
968	if (strcache2_is_equal (cache, entry, str, length, hash))
969	return (const char *)(entry + 1);
970	MAKE_STATS (cache->collision_2nd_count++);
971
972	/* Loop the rest. */
973	for (;;)
974	{
975	entry = entry->next;
976	if (!entry)
977	return strcache2_enter_string (cache, idx, str, length, hash);
978	if (strcache2_is_equal (cache, entry, str, length, hash))
979	return (const char *)(entry + 1);
980	MAKE_STATS (cache->collision_3rd_count++);
981	}
982	/* not reached */
983	}
984
985	/* The public lookup (case insensitive) string interface. */
986	const char *
987	strcache2_ilookup (struct strcache2 cache, const char str, unsigned int length)
988	{
989	struct strcache2_entry const *entry;
990	unsigned int hash = strcache2_case_insensitive_hash (str, length);
991	unsigned int idx;
992
993	assert (!cache->case_insensitive);
994
995	MAKE_STATS (cache->lookup_count++);
996
997	/* Lookup the entry in the hash table, hoping for an
998	early match. */
999	idx = STRCACHE2_MOD_IT (cache, hash);
1000	entry = cache->hash_tab[idx];
1001	if (!entry)
1002	return NULL;
1003	if (strcache2_is_equal (cache, entry, str, length, hash))
1004	return (const char *)(entry + 1);
1005	MAKE_STATS (cache->collision_1st_count++);
1006
1007	entry = entry->next;
1008	if (!entry)
1009	return NULL;
1010	if (strcache2_is_equal (cache, entry, str, length, hash))
1011	return (const char *)(entry + 1);
1012	MAKE_STATS (cache->collision_2nd_count++);
1013
1014	/* Loop the rest. */
1015	for (;;)
1016	{
1017	entry = entry->next;
1018	if (!entry)
1019	return NULL;
1020	if (strcache2_is_equal (cache, entry, str, length, hash))
1021	return (const char *)(entry + 1);
1022	MAKE_STATS (cache->collision_3rd_count++);
1023	}
1024	/* not reached */
1025	}
1026
1027	#endif /* HAVE_CASE_INSENSITIVE_FS */
1028
1029	/* Is the given string cached? returns 1 if it is, 0 if it isn't. */
1030	int
1031	strcache2_is_cached (struct strcache2 cache, const char str)
1032	{
1033	/* Check mandatory alignment first. */
1034	if (!((size_t)str & (sizeof (void *) - 1)))
1035	{
1036	/* Check the segment list and consider the question answered if the
1037	string is within one of them. (Could check it more thoroughly...) */
1038	struct strcache2_seg const *seg;
1039	for (seg = cache->seg_head; seg; seg = seg->next)
1040	if ((size_t)(str - seg->start) < seg->size)
1041	return 1;
1042	}
1043
1044	return 0;
1045	}
1046
1047
1048	/* Verify the integrity of the specified string, returning 0 if OK. */
1049	int
1050	strcache2_verify_entry (struct strcache2 cache, const char str)
1051	{
1052	struct strcache2_entry const *entry;
1053	unsigned hash;
1054	const char *end;
1055
1056	if ((size_t)str & (sizeof (void *) - 1))
1057	{
1058	fprintf (stderr,
1059	"strcache2[%s]: missaligned string %p\nstring: %s\n",
1060	cache->name, (void *)str, str);
1061	return -1;
1062	}
1063
1064	entry = (struct strcache2_entry const *)str - 1;
1065	end = memchr (str, '\0', entry->length);
1066	if ((size_t)(end - str) == entry->length)
1067	{
1068	fprintf (stderr,
1069	"strcache2[%s]: corrupt entry %p, length: %lu, expected %u;\nstring: %s\n",
1070	cache->name, (void *)entry, (unsigned long)(end - str), entry->length, str);
1071	return -1;
1072	}
1073
1074	hash = cache->case_insensitive
1075	? strcache2_case_insensitive_hash (str, entry->length)
1076	: strcache2_case_sensitive_hash (str, entry->length);
1077	if (hash != entry->hash)
1078	{
1079	fprintf (stderr,
1080	"strcache2[%s]: corrupt entry %p, hash#1: %x, expected %x;\nstring: %s\n",
1081	cache->name, (void *)entry, hash, entry->hash, str);
1082	return -1;
1083	}
1084
1085	return 0;
1086	}
1087
1088
1089	/* Calculates the case sensitive hash values of the string.
1090	The first hash is returned, the other is put at HASH2P. */
1091	unsigned int strcache2_hash_str (const char str, unsigned int length, unsigned int hash2p)
1092	{
1093	*hash2p = 1;
1094	return strcache2_case_sensitive_hash (str, length);
1095	}
1096
1097	/* Calculates the case insensitive hash values of the string.
1098	The first hash is returned, the other is put at HASH2P. */
1099	unsigned int strcache2_hash_istr (const char str, unsigned int length, unsigned int hash2p)
1100	{
1101	*hash2p = 1;
1102	return strcache2_case_insensitive_hash (str, length);
1103	}
1104
1105
1106
1107	/* Initalizes a new cache. */
1108	void
1109	strcache2_init (struct strcache2 cache, const char name, unsigned int size,
1110	unsigned int def_seg_size, int case_insensitive, int thread_safe)
1111	{
1112	unsigned hash_shift;
1113	assert (!thread_safe);
1114
1115	/* calc the size as a power of two */
1116	if (!size)
1117	hash_shift = STRCACHE2_HASH_SHIFT;
1118	else
1119	{
1120	assert (size <= (~0U / 2 + 1));
1121	for (hash_shift = 8; (1U << hash_shift) < size; hash_shift++)
1122	/* nothing */;
1123	}
1124
1125	/* adjust the default segment size */
1126	if (!def_seg_size)
1127	def_seg_size = STRCACHE2_SEG_SIZE;
1128	else if (def_seg_size < sizeof (struct strcache2_seg) * 10)
1129	def_seg_size = sizeof (struct strcache2_seg) * 10;
1130	else if ((def_seg_size & 0xfff) < 0xf00)
1131	def_seg_size = ((def_seg_size + 0xfff) & ~0xfffU);
1132
1133
1134	/* init the structure. */
1135	cache->case_insensitive = case_insensitive;
1136	#ifdef STRCACHE2_USE_MASK
1137	cache->hash_mask = (1U << hash_shift) - 1U;
1138	#else
1139	cache->hash_div = strcache2_find_prime(hash_shift);
1140	#endif
1141	cache->count = 0;
1142	cache->collision_count = 0;
1143	cache->lookup_count = 0;
1144	cache->collision_1st_count = 0;
1145	cache->collision_2nd_count = 0;
1146	cache->collision_3rd_count = 0;
1147	cache->rehash_count = (1U << hash_shift) / 4 * 3; /* rehash at 75% */
1148	cache->init_size = 1U << hash_shift;
1149	cache->hash_size = 1U << hash_shift;
1150	cache->def_seg_size = def_seg_size;
1151	cache->lock = NULL;
1152	cache->name = name;
1153
1154	/* allocate the hash table and first segment. */
1155	cache->hash_tab = (struct strcache2_entry **)
1156	xmalloc (cache->init_size * sizeof (struct strcache2_entry *));
1157	memset (cache->hash_tab, '\0', cache->init_size * sizeof (struct strcache2_entry *));
1158	strcache2_new_seg (cache, 0);
1159
1160	/* link it */
1161	cache->next = strcache_head;
1162	strcache_head = cache;
1163	}
1164
1165
1166	/* Terminates a string cache, freeing all memory and other
1167	associated resources. */
1168	void
1169	strcache2_term (struct strcache2 *cache)
1170	{
1171	/* unlink it */
1172	if (strcache_head == cache)
1173	strcache_head = cache->next;
1174	else
1175	{
1176	struct strcache2 *prev = strcache_head;
1177	while (prev->next != cache)
1178	prev = prev->next;
1179	assert (prev);
1180	prev->next = cache->next;
1181	}
1182
1183	/* free the memory segments */
1184	do
1185	{
1186	void *free_it = cache->seg_head;
1187	cache->seg_head = cache->seg_head->next;
1188	free (free_it);
1189	}
1190	while (cache->seg_head);
1191
1192	/* free the hash and clear the structure. */
1193	free (cache->hash_tab);
1194	memset (cache, '\0', sizeof (struct strcache2));
1195	}
1196
1197	/* Print statistics a string cache. */
1198	void
1199	strcache2_print_stats (struct strcache2 cache, const char prefix)
1200	{
1201	unsigned int seg_count = 0;
1202	unsigned long seg_total_bytes = 0;
1203	unsigned long seg_avg_bytes;
1204	unsigned long seg_avail_bytes = 0;
1205	unsigned long seg_max_bytes = 0;
1206	struct strcache2_seg *seg;
1207	unsigned int str_count = 0;
1208	unsigned long str_total_len = 0;
1209	unsigned int str_avg_len;
1210	unsigned int str_min_len = ~0U;
1211	unsigned int str_max_len = 0;
1212	unsigned int idx;
1213	unsigned int rehashes;
1214	unsigned int chain_depths[32];
1215
1216	printf (_("\n%s strcache2: %s\n"), prefix, cache->name);
1217
1218	/* Segment statistics. */
1219	for (seg = cache->seg_head; seg; seg = seg->next)
1220	{
1221	seg_count++;
1222	seg_total_bytes += seg->size;
1223	seg_avail_bytes += seg->avail;
1224	if (seg->size > seg_max_bytes)
1225	seg_max_bytes = seg->size;
1226	}
1227	seg_avg_bytes = seg_total_bytes / seg_count;
1228	printf (_("%s %u segments: total = %lu / max = %lu / avg = %lu / def = %u avail = %lu\n"),
1229	prefix, seg_count, seg_total_bytes, seg_max_bytes, seg_avg_bytes,
1230	cache->def_seg_size, seg_avail_bytes);
1231
1232	/* String statistics. */
1233	memset (chain_depths, '\0', sizeof (chain_depths));
1234	idx = cache->hash_size;
1235	while (idx-- > 0)
1236	{
1237	struct strcache2_entry const *entry = cache->hash_tab[idx];
1238	unsigned int depth = 0;
1239	for (; entry != 0; entry = entry->next, depth++)
1240	{
1241	unsigned int length = entry->length;
1242	str_total_len += length;
1243	if (length > str_max_len)
1244	str_max_len = length;
1245	if (length < str_min_len)
1246	str_min_len = length;
1247	str_count++;
1248	}
1249	chain_depths[depth >= 32 ? 31 : depth]++;
1250	}
1251	str_avg_len = cache->count ? str_total_len / cache->count : 0;
1252	printf (_("%s %u strings: total len = %lu / max = %u / avg = %u / min = %u\n"),
1253	prefix, cache->count, str_total_len, str_max_len, str_avg_len, str_min_len);
1254	if (str_count != cache->count)
1255	printf (_("%s string count mismatch! cache->count=%u, actual count is %u\n"), prefix,
1256	cache->count, str_count);
1257
1258	/* Hash statistics. */
1259	idx = cache->init_size;
1260	rehashes = 0;
1261	while (idx < cache->hash_size)
1262	{
1263	idx *= 2;
1264	rehashes++;
1265	}
1266
1267	#ifdef STRCACHE2_USE_MASK
1268	printf (_("%s hash size = %u mask = %#x rehashed %u times lookups = %lu\n"),
1269	prefix, cache->hash_size, cache->hash_mask, rehashes, cache->lookup_count);
1270	#else
1271	printf (_("%s hash size = %u div = %#x rehashed %u times lookups = %lu\n"),
1272	prefix, cache->hash_size, cache->hash_div, rehashes, cache->lookup_count);
1273	#endif
1274	if (cache->lookup_count)
1275	printf (_("%s hash collisions 1st = %lu (%u%%) 2nd = %lu (%u%%) 3rd = %lu (%u%%)\n"),
1276	prefix,
1277	cache->collision_1st_count, (unsigned int)((100.0 * cache->collision_1st_count) / cache->lookup_count),
1278	cache->collision_2nd_count, (unsigned int)((100.0 * cache->collision_2nd_count) / cache->lookup_count),
1279	cache->collision_3rd_count, (unsigned int)((100.0 * cache->collision_3rd_count) / cache->lookup_count));
1280	printf (_("%s hash insert collisions = %u (%u%%)\n"),
1281	prefix, cache->collision_count,(unsigned int)((100.0 * cache->collision_count) / cache->count));
1282	printf (_("%s %5u (%u%%) empty hash table slots\n"),
1283	prefix, chain_depths[0], (unsigned int)((100.0 * chain_depths[0]) / cache->hash_size));
1284	printf (_("%s %5u (%u%%) occupied hash table slots\n"),
1285	prefix, chain_depths[1], (unsigned int)((100.0 * chain_depths[1]) / cache->hash_size));
1286	for (idx = 2; idx < 32; idx++)
1287	{
1288	unsigned strs_at_this_depth = chain_depths[idx];
1289	unsigned i;
1290	for (i = idx + 1; i < 32; i++)
1291	strs_at_this_depth += chain_depths[i];
1292	if (strs_at_this_depth)
1293	printf (_("%s %5u (%2u%%) with %u string%s chained on; %5u (%2u%%) strings at depth %u.\n"),
1294	prefix, chain_depths[idx], (unsigned int)((100.0 * chain_depths[idx]) / (cache->count - cache->collision_count)),
1295	idx - 1, idx == 2 ? " " : "s",
1296	strs_at_this_depth, (unsigned int)((100.0 * strs_at_this_depth) / cache->count), idx - 1);
1297	}
1298	}
1299
1300	/* Print statistics for all string caches. */
1301	void
1302	strcache2_print_stats_all (const char *prefix)
1303	{
1304	struct strcache2 *cur;
1305	for (cur = strcache_head; cur; cur = cur->next)
1306	strcache2_print_stats (cur, prefix);
1307	}
1308

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/kmk/strcache2.c@ 1921

Download in other formats: