cpmap.pl@ 33085

Last change on this file since 33085 was 28475, checked in by vboxsync, 15 years ago
crOpenGL: update to wine 1.1.43
Property svn:eol-style set to `native`
File size: 46.8 KB

Line
1	#!/usr/bin/perl -w
2	#
3	# Generate code page .c files from ftp.unicode.org descriptions
4	#
5	# Copyright 2000 Alexandre Julliard
6	#
7	# This library is free software; you can redistribute it and/or
8	# modify it under the terms of the GNU Lesser General Public
9	# License as published by the Free Software Foundation; either
10	# version 2.1 of the License, or (at your option) any later version.
11	#
12	# This library is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	# Lesser General Public License for more details.
16	#
17	# You should have received a copy of the GNU Lesser General Public
18	# License along with this library; if not, write to the Free Software
19	# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20	#
21
22	#
23	# Sun LGPL Disclaimer: For the avoidance of doubt, except that if any license choice
24	# other than GPL or LGPL is available it will apply instead, Sun elects to use only
25	# the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where
26	# a choice of LGPL license versions is made available with the language indicating
27	# that LGPLv2 or any later version may be used, or where a choice of which version
28	# of the LGPL is applied is otherwise unspecified.
29	#
30
31	use strict;
32
33	# base directory for ftp.unicode.org files
# Base directory of the ftp.unicode.org files; the mapping tables live
# in its MAPPINGS/ subdirectory.
my $BASEDIR   = 'ftp.unicode.org/Public/';
my $MAPPREFIX = "${BASEDIR}MAPPINGS/";

# UnicodeData file
my $UNICODEDATA = "${BASEDIR}UNIDATA/UnicodeData.txt";

# Sort keys file
my $SORTKEYS = 'www.unicode.org/reports/tr10/allkeys.txt';

# Defaults mapping file
my $DEFAULTS = './defaults';

# Character used for undefined mappings
my $DEF_CHAR = ord('?');
48
# Table of code pages to generate.  Each row holds:
#   [ code page number, mapping file (undef when there is none),
#     0/1 flag, human readable name ]
# NOTE(review): the flag presumably selects the extra glyph table (see the
# $has_glyphs argument of dump_sbcs_table) -- confirm against the caller.
my @allfiles =
(
    [    37, 'VENDORS/MICSFT/EBCDIC/CP037.TXT',                0, 'IBM EBCDIC US Canada' ],
    [   424, 'VENDORS/MISC/CP424.TXT',                         0, 'IBM EBCDIC Hebrew' ],
    [   437, 'VENDORS/MICSFT/PC/CP437.TXT',                    1, 'OEM United States' ],
    [   500, 'VENDORS/MICSFT/EBCDIC/CP500.TXT',                0, 'IBM EBCDIC International' ],
    [   737, 'VENDORS/MICSFT/PC/CP737.TXT',                    1, 'OEM Greek 437G' ],
    [   775, 'VENDORS/MICSFT/PC/CP775.TXT',                    1, 'OEM Baltic' ],
    [   850, 'VENDORS/MICSFT/PC/CP850.TXT',                    1, 'OEM Multilingual Latin 1' ],
    [   852, 'VENDORS/MICSFT/PC/CP852.TXT',                    1, 'OEM Slovak Latin 2' ],
    [   855, 'VENDORS/MICSFT/PC/CP855.TXT',                    1, 'OEM Cyrillic' ],
    [   856, 'VENDORS/MISC/CP856.TXT',                         0, 'Hebrew PC' ],
    [   857, 'VENDORS/MICSFT/PC/CP857.TXT',                    1, 'OEM Turkish' ],
    [   860, 'VENDORS/MICSFT/PC/CP860.TXT',                    1, 'OEM Portuguese' ],
    [   861, 'VENDORS/MICSFT/PC/CP861.TXT',                    1, 'OEM Icelandic' ],
    [   862, 'VENDORS/MICSFT/PC/CP862.TXT',                    1, 'OEM Hebrew' ],
    [   863, 'VENDORS/MICSFT/PC/CP863.TXT',                    1, 'OEM Canadian French' ],
    [   864, 'VENDORS/MICSFT/PC/CP864.TXT',                    0, 'OEM Arabic' ],
    [   865, 'VENDORS/MICSFT/PC/CP865.TXT',                    1, 'OEM Nordic' ],
    [   866, 'VENDORS/MICSFT/PC/CP866.TXT',                    1, 'OEM Russian' ],
    [   869, 'VENDORS/MICSFT/PC/CP869.TXT',                    1, 'OEM Greek' ],
    [   874, 'VENDORS/MICSFT/WindowsBestFit/bestfit874.txt',   1, 'ANSI/OEM Thai' ],
    [   875, 'VENDORS/MICSFT/EBCDIC/CP875.TXT',                0, 'IBM EBCDIC Greek' ],
    [   878, 'VENDORS/MISC/KOI8-R.TXT',                        0, 'Russian KOI8' ],
    [   932, 'VENDORS/MICSFT/WindowsBestFit/bestfit932.txt',   0, 'ANSI/OEM Japanese Shift-JIS' ],
    [   936, 'VENDORS/MICSFT/WindowsBestFit/bestfit936.txt',   0, 'ANSI/OEM Simplified Chinese GBK' ],
    [   949, 'VENDORS/MICSFT/WindowsBestFit/bestfit949.txt',   0, 'ANSI/OEM Korean Unified Hangul' ],
    [   950, 'VENDORS/MICSFT/WindowsBestFit/bestfit950.txt',   0, 'ANSI/OEM Traditional Chinese Big5' ],
    [  1006, 'VENDORS/MISC/CP1006.TXT',                        0, 'IBM Arabic' ],
    [  1026, 'VENDORS/MICSFT/EBCDIC/CP1026.TXT',               0, 'IBM EBCDIC Latin 5 Turkish' ],
    [  1250, 'VENDORS/MICSFT/WindowsBestFit/bestfit1250.txt',  0, 'ANSI Eastern Europe' ],
    [  1251, 'VENDORS/MICSFT/WindowsBestFit/bestfit1251.txt',  0, 'ANSI Cyrillic' ],
    [  1252, 'VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt',  0, 'ANSI Latin 1' ],
    [  1253, 'VENDORS/MICSFT/WindowsBestFit/bestfit1253.txt',  0, 'ANSI Greek' ],
    [  1254, 'VENDORS/MICSFT/WindowsBestFit/bestfit1254.txt',  0, 'ANSI Turkish' ],
    [  1255, 'VENDORS/MICSFT/WindowsBestFit/bestfit1255.txt',  0, 'ANSI Hebrew' ],
    [  1256, 'VENDORS/MICSFT/WindowsBestFit/bestfit1256.txt',  0, 'ANSI Arabic' ],
    [  1257, 'VENDORS/MICSFT/WindowsBestFit/bestfit1257.txt',  0, 'ANSI Baltic' ],
    [  1258, 'VENDORS/MICSFT/WindowsBestFit/bestfit1258.txt',  0, 'ANSI/OEM Viet Nam' ],
    [  1361, 'OBSOLETE/EASTASIA/KSC/JOHAB.TXT',                0, 'Korean Johab' ],
    [ 10000, 'VENDORS/MICSFT/MAC/ROMAN.TXT',                   0, 'Mac Roman' ],
    [ 10006, 'VENDORS/MICSFT/MAC/GREEK.TXT',                   0, 'Mac Greek' ],
    [ 10007, 'VENDORS/MICSFT/MAC/CYRILLIC.TXT',                0, 'Mac Cyrillic' ],
    [ 10029, 'VENDORS/MICSFT/MAC/LATIN2.TXT',                  0, 'Mac Latin 2' ],
    [ 10079, 'VENDORS/MICSFT/MAC/ICELAND.TXT',                 0, 'Mac Icelandic' ],
    [ 10081, 'VENDORS/MICSFT/MAC/TURKISH.TXT',                 0, 'Mac Turkish' ],
    [ 20127, undef,                                            0, 'US-ASCII (7bit)' ],
    [ 20866, 'VENDORS/MISC/KOI8-R.TXT',                        0, 'Russian KOI8' ],
    [ 20932, 'OBSOLETE/EASTASIA/JIS/JIS0208.TXT',              0, 'EUC-JP' ],
    [ 21866, 'VENDORS/MISC/KOI8-U.TXT',                        0, 'Ukrainian KOI8' ],
    [ 28591, 'ISO8859/8859-1.TXT',                             0, 'ISO 8859-1 Latin 1' ],
    [ 28592, 'ISO8859/8859-2.TXT',                             0, 'ISO 8859-2 Latin 2 (East European)' ],
    [ 28593, 'ISO8859/8859-3.TXT',                             0, 'ISO 8859-3 Latin 3 (South European)' ],
    [ 28594, 'ISO8859/8859-4.TXT',                             0, 'ISO 8859-4 Latin 4 (Baltic old)' ],
    [ 28595, 'ISO8859/8859-5.TXT',                             0, 'ISO 8859-5 Cyrillic' ],
    [ 28596, 'ISO8859/8859-6.TXT',                             0, 'ISO 8859-6 Arabic' ],
    [ 28597, 'ISO8859/8859-7.TXT',                             0, 'ISO 8859-7 Greek' ],
    [ 28598, 'ISO8859/8859-8.TXT',                             0, 'ISO 8859-8 Hebrew' ],
    [ 28599, 'ISO8859/8859-9.TXT',                             0, 'ISO 8859-9 Latin 5 (Turkish)' ],
    [ 28600, 'ISO8859/8859-10.TXT',                            0, 'ISO 8859-10 Latin 6 (Nordic)' ],
    [ 28603, 'ISO8859/8859-13.TXT',                            0, 'ISO 8859-13 Latin 7 (Baltic)' ],
    [ 28604, 'ISO8859/8859-14.TXT',                            0, 'ISO 8859-14 Latin 8 (Celtic)' ],
    [ 28605, 'ISO8859/8859-15.TXT',                            0, 'ISO 8859-15 Latin 9 (Euro)' ],
    [ 28606, 'ISO8859/8859-16.TXT',                            0, 'ISO 8859-16 Latin 10 (Balkan)' ],
);
114
115
# Bit flags making up a character type value; one bit per class so that
# several classes can be OR-ed together for a single character.
my %ctype =
(
    upper  => 1 << 0,    # 0x0001
    lower  => 1 << 1,    # 0x0002
    digit  => 1 << 2,    # 0x0004
    space  => 1 << 3,    # 0x0008
    punct  => 1 << 4,    # 0x0010
    cntrl  => 1 << 5,    # 0x0020
    blank  => 1 << 6,    # 0x0040
    xdigit => 1 << 7,    # 0x0080
    alpha  => 1 << 8,    # 0x0100
);
128
# Map each Unicode general category (third field of UnicodeData.txt) to
# the %ctype flags assigned to characters of that category.  Categories
# that map to 0 get no flags from this table.
my %categories =
(
    "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"alpha"},    # Letter, Titlecase
    "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
    "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
    "Me" => $ctype{"punct"},    # Mark, Enclosing
    "Nd" => $ctype{"digit"},    # Number, Decimal Digit
    "Nl" => $ctype{"punct"},    # Number, Letter
    "No" => $ctype{"punct"},    # Number, Other
    "Zs" => $ctype{"space"},    # Separator, Space
    "Zl" => $ctype{"space"},    # Separator, Line
    "Zp" => $ctype{"space"},    # Separator, Paragraph
    "Cc" => $ctype{"cntrl"},    # Other, Control
    "Cf" => 0,                  # Other, Format
    "Cs" => 0,                  # Other, Surrogate
    "Co" => 0,                  # Other, Private Use
    "Cn" => 0,                  # Other, Not Assigned
    "Lm" => $ctype{"punct"},    # Letter, Modifier
    "Lo" => $ctype{"alpha"},    # Letter, Other
    "Pc" => $ctype{"punct"},    # Punctuation, Connector
    "Pd" => $ctype{"punct"},    # Punctuation, Dash
    "Ps" => $ctype{"punct"},    # Punctuation, Open
    "Pe" => $ctype{"punct"},    # Punctuation, Close
    "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
    "Pf" => $ctype{"punct"},    # Punctuation, Final quote
    "Po" => $ctype{"punct"},    # Punctuation, Other
    "Sm" => $ctype{"punct"},    # Symbol, Math
    "Sc" => $ctype{"punct"},    # Symbol, Currency
    "Sk" => $ctype{"punct"},    # Symbol, Modifier
    "So" => $ctype{"punct"}     # Symbol, Other
);
162
163	# a few characters need additional categories that cannot be determined automatically
# Extra %ctype flags, keyed by flag name, for code points whose class
# cannot be derived automatically from the Unicode database.
my %special_categories =
(
    # ASCII '0'-'9', 'A'-'F', 'a'-'f' followed by 0xff10.. counterparts
    xdigit => [ 0x30 .. 0x39, 0x41 .. 0x46, 0x61 .. 0x66,
                0xff10 .. 0xff19, 0xff21 .. 0xff26, 0xff41 .. 0xff46 ],
    space  => [ 0x09 .. 0x0d, 0x85 ],
    blank  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    cntrl  => [ 0x070f,
                0x180b .. 0x180e,
                0x200c .. 0x200f,
                0x202a .. 0x202e,
                0x206a .. 0x206f,
                0xfeff,
                0xfff9 .. 0xfffb ],
);
175
# Map each bidirectional class name (fifth field of UnicodeData.txt) to
# the numeric value stored in the direction table.
my %directions =
(
    # strong types
    L   => 1,     # Left-to-Right
    R   => 2,     # Right-to-Left
    AL  => 12,    # Right-to-Left Arabic

    # explicit embedding / override controls all share one value
    LRE => 15,    # Left-to-Right Embedding
    LRO => 15,    # Left-to-Right Override
    RLE => 15,    # Right-to-Left Embedding
    RLO => 15,    # Right-to-Left Override
    PDF => 15,    # Pop Directional Format

    # weak types
    EN  => 3,     # European Number
    ES  => 4,     # European Number Separator
    ET  => 5,     # European Number Terminator
    AN  => 6,     # Arabic Number
    CS  => 7,     # Common Number Separator
    NSM => 13,    # Non-Spacing Mark
    BN  => 14,    # Boundary Neutral

    # neutral types
    B   => 8,     # Paragraph Separator
    S   => 9,     # Segment Separator
    WS  => 10,    # Whitespace
    ON  => 11,    # Other Neutrals
);
198
# File-wide work tables shared by the reader and dumper routines.

# per code page: code page value -> Unicode, Unicode -> code page value,
# and the list of DBCS lead bytes
my (@cp2uni, @uni2cp, @lead_bytes);

# fallback mappings and alias groups, indexed by Unicode code point
my (@unicode_defaults, @unicode_aliases);

# tables indexed by Unicode code point
my (@tolower_table, @toupper_table, @digitmap_table, @compatmap_table);
my @category_table = (0) x 65536;    # one pre-zeroed entry per 16-bit code point
my @direction_table;

# canonical decompositions and the matching composition triples
my (@decomp_table, @compose_table);
212
213
214	################################################################
215	# read in the defaults file
# Load the defaults file given as argument, then scan $UNICODEDATA to fill
# the global character tables (@category_table, @direction_table,
# @tolower_table, @toupper_table, @digitmap_table, @compatmap_table,
# @decomp_table, @compose_table) and to derive additional fallback
# mappings in @unicode_defaults from the decomposition field.
#
# Defaults file lines have the form
#     <hex>[,<hex>...]  <hex>|'c'|none  [# comment]
# Several comma separated sources form an alias group (@unicode_aliases).
#
# Dies on unreadable files, unparsable defaults lines, duplicate defaults,
# unknown category/bidi names, a range end without a range start, or a
# fallback mapping loop.
sub READ_DEFAULTS($)
{
    my $filename = shift;
    my $start;    # first code point of a pending "<..., First>" range

    # first setup a few default mappings

    open my $defaults, '<', $filename or die "Cannot open $filename: $!";
    print "Loading $filename\n";
    while (<$defaults>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            my @src = map hex, split /,/,$1;
            my $dst = $4;
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            # the destination is either a quoted literal character or a hex value
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach my $src (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$src]);
                $unicode_defaults[$src] = $dst;
            }
            next;
        }
        die "Unrecognized line $_\n";
    }
    close $defaults;

    # now build mappings from the decomposition field of the Unicode database

    open my $unidata, '<', $UNICODEDATA or die "Cannot open $UNICODEDATA: $!";
    print "Loading $UNICODEDATA\n";
    while (<$unidata>)
    {
        # Decode the fields ...
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/;
        my $dst;
        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        # a char with a lowercase mapping counts as upper case, and vice versa
        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            $digitmap_table[$src] = ord $dig;
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            # without this check an undefined $start would fill from 0 upwards
            die sprintf("range end %04x without range start", $src) unless defined $start;
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $start++;
            }
            undef $start;
        }

        next if $decomp eq "";  # no decomposition, skip it

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            my ($type, $char) = ($1, hex $2);
            if (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = $char;
            }
            next unless ($type eq "font" ||
                         $type eq "noBreak" ||
                         $type eq "circle" ||
                         $type eq "super" ||
                         $type eq "sub" ||
                         $type eq "wide" ||
                         $type eq "narrow" ||
                         $type eq "compat" ||
                         $type eq "small");
            $dst = $char;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst] if defined $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
            elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ &&
                   (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd)))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops: follow the existing chain starting at $dst and
        # make sure it never leads back to $src
        for (my $i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close $unidata;

    # patch the category of some special characters

    foreach my $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
}
365
366
367	################################################################
368	# parse the input file
# Parse the code page mapping file given as argument into the global
# @cp2uni / @uni2cp tables, recording DBCS lead bytes in @lead_bytes.
#
# Recognized lines:
#   0xNN        #UNDEFINED        -> ignored
#   0xNN        #DBCS LEAD BYTE   -> NN is a lead byte
#   0xNNNN      0xUUUU [#comment] -> code page value NNNN maps to U+UUUU
# The first mapping seen for a given value wins in either direction.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_FILE($)
{
    my $name = shift;
    # three-argument open: the file name can never be taken as an open mode
    open my $input, '<', $name or die "Cannot open $name: $!";

    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char

        if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            my $cp = hex $1;
            push @lead_bytes,$cp;
            $cp2uni[$cp] = 0;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my $cp = hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            # a double byte value implies its high byte is a lead byte,
            # even if the file did not declare it explicitly
            if ($cp > 0xff && !defined($cp2uni[$cp >> 8]))
            {
                push @lead_bytes,$cp >> 8;
                $cp2uni[$cp >> 8] = 0;
            }
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
404
405
406	################################################################
407	# fill input data for the 20127 (us-ascii) codepage
# Populate the global @cp2uni / @uni2cp tables for code page 20127:
# bytes 0..127 map to themselves in both directions, and bytes 128..255
# map (one way only) to the same byte with its high bit cleared.
sub fill_20127_codepage()
{
    foreach my $ascii (0 .. 127)
    {
        $uni2cp[$ascii] = $ascii;
        $cp2uni[$ascii] = $ascii;
    }
    foreach my $high (128 .. 255)
    {
        $cp2uni[$high] = $high & 0x7f;
    }
}
413
414	################################################################
415	# get a mapping including glyph chars for MB_USEGLYPHCHARS
416
# Return a copy of the given byte -> Unicode table in which entries
# 0x01..0x1f and 0x7f are replaced by their glyph characters (the table
# used for MB_USEGLYPHCHARS).  All other entries are returned unchanged.
#
# The elements of @_ alias the caller's variables (including the elements
# of an array slice), so the work is done on a private copy; the caller's
# table is left untouched.
sub get_glyphs_mapping(@)
{
    my @mapping = @_;

    @mapping[0x01 .. 0x1f] =
    (
        0x263a,  # 0x01 (WHITE SMILING FACE)
        0x263b,  # 0x02 (BLACK SMILING FACE)
        0x2665,  # 0x03 (BLACK HEART SUIT)
        0x2666,  # 0x04 (BLACK DIAMOND SUIT)
        0x2663,  # 0x05 (BLACK CLUB SUIT)
        0x2660,  # 0x06 (BLACK SPADE SUIT)
        0x2022,  # 0x07 (BULLET)
        0x25d8,  # 0x08 (INVERSE BULLET)
        0x25cb,  # 0x09 (WHITE CIRCLE)
        0x25d9,  # 0x0a (INVERSE WHITE CIRCLE)
        0x2642,  # 0x0b (MALE SIGN)
        0x2640,  # 0x0c (FEMALE SIGN)
        0x266a,  # 0x0d (EIGHTH NOTE)
        0x266b,  # 0x0e (BEAMED EIGHTH NOTES)
        0x263c,  # 0x0f (WHITE SUN WITH RAYS)
        0x25ba,  # 0x10 (BLACK RIGHT-POINTING POINTER)
        0x25c4,  # 0x11 (BLACK LEFT-POINTING POINTER)
        0x2195,  # 0x12 (UP DOWN ARROW)
        0x203c,  # 0x13 (DOUBLE EXCLAMATION MARK)
        0x00b6,  # 0x14 (PILCROW SIGN)
        0x00a7,  # 0x15 (SECTION SIGN)
        0x25ac,  # 0x16 (BLACK RECTANGLE)
        0x21a8,  # 0x17 (UP DOWN ARROW WITH BASE)
        0x2191,  # 0x18 (UPWARDS ARROW)
        0x2193,  # 0x19 (DOWNWARDS ARROW)
        0x2192,  # 0x1a (RIGHTWARDS ARROW)
        0x2190,  # 0x1b (LEFTWARDS ARROW)
        0x221f,  # 0x1c (RIGHT ANGLE)
        0x2194,  # 0x1d (LEFT RIGHT ARROW)
        0x25b2,  # 0x1e (BLACK UP-POINTING TRIANGLE)
        0x25bc,  # 0x1f (BLACK DOWN-POINTING TRIANGLE)
    );
    $mapping[0x7f] = 0x2302;  # (HOUSE)

    return @mapping;
}
453
454	################################################################
455	# build EUC-JP table from the JIS 0208 file
456	# FIXME: for proper EUC-JP we should probably read JIS 0212 too
457	# but this would require 3-byte DBCS characters
# Build the EUC-JP tables in the global @cp2uni / @uni2cp / @lead_bytes
# from the JIS 0208 mapping file given as argument.  The fixed parts
# (ASCII, JIS X 0201 right plane, lead bytes, undefined bytes and two
# compatibility fixups) are set first, so they take precedence over
# entries read from the file.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_JIS0208_FILE($)
{
    my $name = shift;

    # ASCII chars
    for (my $i = 0x00; $i <= 0x7f; $i++)
    {
        $cp2uni[$i] = $i;
        $uni2cp[$i] = $i;
    }

    # JIS X 0201 right plane
    for (my $i = 0xa1; $i <= 0xdf; $i++)
    {
        $cp2uni[0x8e00 + $i] = 0xfec0 + $i;
        $uni2cp[0xfec0 + $i] = 0x8e00 + $i;
    }

    # lead bytes
    foreach my $i (0x8e, 0x8f, 0xa1 .. 0xfe)
    {
        push @lead_bytes,$i;
        $cp2uni[$i] = 0;
    }

    # undefined chars
    foreach my $i (0x80 .. 0x8d, 0x90 .. 0xa0, 0xff)
    {
        $cp2uni[$i] = $DEF_CHAR;
    }

    # Shift-JIS compatibility
    $uni2cp[0x00a5] = 0x5c;
    $uni2cp[0x203e] = 0x7e;

    # Fix backslash conversion
    $cp2uni[0xa1c0] = 0xff3c;
    $uni2cp[0xff3c] = 0xa1c0;

    open my $input, '<', $name or die "Cannot open $name: $!";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        # three hex columns; the first one is not used here
        if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            # setting the high bit of both bytes gives the EUC-JP value
            my $cp = 0x8080 + hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
514
515
516	################################################################
517	# build the sort keys table
# Read $SORTKEYS and return a list, indexed by Unicode code point
# (0..65535), of collation keys packed into one integer each.  Code points
# without a single-character entry in the file stay undefined.
#
# Packing: key 1 is kept as is in bits 16 and up; key 2 and key 3 are
# replaced by a running rank (rank of key 2 within equal key 1, rank of
# key 3 within equal keys 1 and 2) stored in 8 and 4 bits; key 4 becomes
# a single bit.  A key 2/3/4 of zero stays zero.
#
# Dies if the file cannot be opened, a line is not recognized, or a rank
# does not fit in its bit field.
sub READ_SORTKEYS_FILE()
{
    my @sortkeys = ();
    # entries are [ code point, key 1, key 2, key 3, key 4 ]; -1 = unused
    for (my $i = 0; $i < 65536; $i++) { $sortkeys[$i] = [ -1, 0, 0, 0, 0 ] };

    open my $input, '<', $SORTKEYS or die "Cannot open $SORTKEYS: $!";
    print "Loading $SORTKEYS\n";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if /^\@version/;  # skip @version header
        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $input;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # Entries with four equal keys are ordered by code point so the sort is
    # reproducible; the packed values do not depend on this tie-break.
    @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
                       ${$a}[2] <=> ${$b}[2] or
                       ${$a}[3] <=> ${$b}[3] or
                       ${$a}[4] <=> ${$b}[4] or
                       ${$a}[0] <=> ${$b}[0]; } @sortkeys;

    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    for (my $i = 0; $i < 65536; $i++)
    {
        my @current = @{$sortkeys[$i]};
        next if $current[0] == -1;
        if ($current[1] == $keys[1])
        {
            if ($current[2] == $keys[2])
            {
                if ($current[3] == $keys[3])
                {
                    # nothing
                }
                else
                {
                    $keys[3] = $current[3];
                    $n3++;
                    die "$SORTKEYS: too many distinct key 3 values" if ($n3 >= 16);
                }
            }
            else
            {
                $keys[2] = $current[2];
                $keys[3] = $current[3];
                $n2++;
                $n3 = 1;
                die "$SORTKEYS: too many distinct key 2 values" if ($n2 >= 256);
            }
        }
        else
        {
            $keys[1] = $current[1];
            $keys[2] = $current[2];
            $keys[3] = $current[3];
            $n2 = 1;
            $n3 = 1;
        }

        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }
    return @flatkeys;
}
605
606
607	################################################################
608	# dump the sort keys table
# Write the collation table as a C array to "$filename.new", then hand
# $filename to save_file().
#
# Arguments: the output file name, followed by the packed keys indexed by
# Unicode code point (undefined entries mean "no key").
#
# Layout of the emitted array, in blocks of 256 entries: an index block
# (offset of the block holding each high byte), a block of default values,
# then one block per 256-code-point range that has at least one key.
sub DUMP_SORTKEYS($@)
{
    my ($filename, @keys) = @_;

    # count the number of 256-key ranges that contain something

    my @offsets = ();
    my $ranges = 2;    # the index and defaults blocks are always present
    for (my $i = 0; $i < 256; $i++) { $offsets[$i] = 256; }
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $keys[$i];
        $offsets[$i >> 8] = $ranges * 256;
        $ranges++;
        $i |= 255;    # jump to the end of this range; $i++ moves to the next one
    }

    # output the range offsets

    open OUTPUT, '>', "$filename.new" or die "Cannot create $filename.new: $!";
    # plain print: the file name must not be interpreted as a format string
    print "Building $filename\n";
    printf OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", $SORTKEYS;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";

    printf OUTPUT "const unsigned int collation_table[%d] =\n{\n", $ranges*256;
    printf OUTPUT " /* index */\n";
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%08x", 0, @offsets );

    # output the default values

    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0, (0xffffffff) x 256 );

    # output all the key ranges

    for (my $i = 0; $i < 256; $i++)
    {
        next if $offsets[$i] == 256;
        printf OUTPUT ",\n /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0xffffffff, @keys[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT "\n};\n";
    close OUTPUT;
    save_file($filename);
}
655
656
657	################################################################
658	# add default mappings once the file has been read
# Complete the global @uni2cp / @cp2uni tables after a mapping file was
# read, in three passes:
#   1. within each alias group, members without a mapping inherit the
#      mapping of the first member that has one;
#   2. every unmapped code point with an entry in @unicode_defaults
#      inherits the mapping of the first mapped char along its chain;
#   3. byte values still unused in both directions map to themselves.
sub ADD_DEFAULT_MAPPINGS()
{
    # Pass 1: aliases

    foreach my $group (@unicode_aliases)
    {
        my ($mapped) = grep { defined($uni2cp[$_]) } @$group;
        next unless defined($mapped);

        # At least one char of the alias set is defined, set the others to the same value
        my $shared = $uni2cp[$mapped];
        foreach my $member (@$group)
        {
            next if defined($uni2cp[$member]);
            $uni2cp[$member] = $shared;
        }
    }

    # Pass 2: for every src -> target entry of the defaults table,
    # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined

    foreach my $uni (0 .. 65535)
    {
        next if defined($uni2cp[$uni]);  # source has a definition already
        my $fallback = $unicode_defaults[$uni];
        next unless defined($fallback);  # no default for this char

        # walk the chain of defaults until a mapped char or the chain end
        until (defined($uni2cp[$fallback]) || !defined($unicode_defaults[$fallback]))
        {
            $fallback = $unicode_defaults[$fallback];
        }

        $uni2cp[$uni] = $uni2cp[$fallback] if defined($uni2cp[$fallback]);
    }

    # Pass 3: identity mapping for all undefined chars

    foreach my $byte (0 .. 255)
    {
        next if defined($cp2uni[$byte]) || defined($uni2cp[$byte]);
        $cp2uni[$byte] = $uni2cp[$byte] = $byte;
    }
}
708
709	################################################################
710	# dump an array of integers
# Format a list of integers as the text of a C initializer and return it.
#
# Arguments: a sprintf format for one element, the value substituted for
# undefined elements, then the elements themselves.
# The result starts with " "; elements are separated by ", ", except that
# every eighth element is followed by ",\n " instead.  Nothing follows the
# last element.  An empty list yields a single default element.
sub DUMP_ARRAY($$@)
{
    my ($format,$default,@array) = @_;

    # index of the final element; 0 when the list is empty, so that one
    # (default) element is still produced
    my $last = ($#array < 0) ? 0 : $#array;

    my $ret = " ";
    foreach my $idx (0 .. $last)
    {
        my $value = $array[$idx];
        $value = $default unless defined $value;
        $ret .= sprintf($format, $value);
        next if $idx == $last;
        $ret .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    return $ret;
}
724
725	################################################################
726	# dump an SBCS mapping table
# Print the C tables and descriptor for a single byte code page to the
# already opened OUTPUT handle, using the global @cp2uni and @uni2cp.
#
# Arguments: code page number, true to also emit the glyph variant of the
# byte -> Unicode table, code page name, default byte for unmapped Unicode
# chars, default Unicode char for unmapped bytes.
#
# Emits cp2uni (256 entries, or 512 with glyphs), uni2cp_low (one 256
# entry subtable per populated high byte plus a defaults subtable),
# uni2cp_high (subtable offset for each high byte) and the sbcs_table
# descriptor.
sub dump_sbcs_table($$$$$)
{
    my ($codepage, $has_glyphs, $name, $def, $defw) = @_;

    # output the ascii->unicode table

    if ($has_glyphs)
    {
        printf OUTPUT "static const WCHAR cp2uni[512] =\n";
        printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
        printf OUTPUT ",\n /* glyphs */\n%s\n};\n\n",
               DUMP_ARRAY( "0x%04x", $defw, get_glyphs_mapping(@cp2uni[0 .. 255]) );
    }
    else
    {
        printf OUTPUT "static const WCHAR cp2uni[256] =\n";
        printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
    }

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;    # the defaults subtable is always present
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;    # jump to the end of this subtable; $i++ moves to the next one
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $def, @uni2cp[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($def) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @offsets = ();
    for (my $i = 0; $i < 256; $i++)
    {
        if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
        # unpopulated high bytes point at the defaults subtable, which is last
        else { push @offsets, ($subtables-1) * 256; }
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $def, $defw, $name;
    printf OUTPUT " cp2uni,\n";
    if ($has_glyphs) { printf OUTPUT " cp2uni + 256,\n"; }
    else { printf OUTPUT " cp2uni,\n"; }
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high\n};\n";
}
794
795
796	################################################################
797	# dump a DBCS mapping table
# Print the C tables and descriptor for a double byte code page to the
# already opened OUTPUT handle, using the global @cp2uni, @uni2cp and
# @lead_bytes.
#
# Arguments: code page number, code page name, default code page value for
# unmapped Unicode chars, default Unicode char for unmapped values, then
# the list of lead byte range bounds to store in the descriptor.
#
# Emits cp2uni (single byte table, an optional shared table for lead bytes
# without any trail byte mapping, then one table per used lead byte),
# cp2uni_leadbytes (table number for each byte), uni2cp_low / uni2cp_high
# (Unicode -> code page subtables and their offsets) and the dbcs_table
# descriptor.
sub dump_dbcs_table($$$$@)
{
    my ($codepage, $name, $def, $defw, @lb_ranges) = @_;

    # build a list of lead bytes that are actually used

    my @lblist = ();
    LBLOOP: for (my $y = 0; $y <= $#lead_bytes; $y++)
    {
        my $base = $lead_bytes[$y] << 8;
        for (my $x = 0; $x < 256; $x++)
        {
            if (defined $cp2uni[$base+$x])
            {
                push @lblist,$lead_bytes[$y];
                next LBLOOP;
            }
        }
    }
    # true (numerically 1) when at least one lead byte has no mapping at all
    my $unused = ($#lead_bytes > $#lblist);

    # output the ascii->unicode table for the single byte chars

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
    printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );

    # output the default table for unused lead bytes

    if ($unused)
    {
        printf OUTPUT " /* unused lead bytes */\n";
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($defw) x 256 );
    }

    # output the ascii->unicode table for each DBCS lead byte

    for (my $y = 0; $y <= $#lblist; $y++)
    {
        my $base = $lblist[$y] << 8;
        printf OUTPUT " /* lead byte %02x */\n", $lblist[$y];
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[$base .. $base+255] );
        # the last table closes the array instead of adding a separator
        printf OUTPUT ($y < $#lblist) ? ",\n" : "\n};\n\n";
    }

    # output the lead byte subtables offsets

    my @offsets = ();
    for (my $x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
    for (my $x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        for (my $x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @offsets );

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;    # the defaults subtable is always present
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;    # jump to the end of this subtable; $i++ moves to the next one
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $y = 0; $y < 256; $y++)
    {
        next unless $filled[$y];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $def, @uni2cp[($y<<8) .. ($y<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($def) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    @offsets = ();
    for (my $y = 0; $y < 256; $y++)
    {
        if ($filled[$y]) { push @offsets, $pos; $pos += 256; }
        # unpopulated high bytes point at the defaults subtable, which is last
        else { push @offsets, ($subtables-1) * 256; }
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $def, $defw, $name;
    printf OUTPUT " cp2uni,\n";
    printf OUTPUT " cp2uni_leadbytes,\n";
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high,\n";
    printf OUTPUT " {\n %s\n }\n", DUMP_ARRAY( "0x%02x", 0, @lb_ranges, 0, 0 );
    printf OUTPUT "};\n";
}
903
904
905	################################################################
906	# get the list of defined lead byte ranges
# Collapse the global @lead_bytes list into contiguous ranges and return
# them as a flat list of bounds: (first1, last1, first2, last2, ...).
# Only byte values 0..255 are considered.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @bounds = ();
    my $inside = 0;    # true while scanning through a run of lead bytes

    foreach my $byte (0 .. 255)
    {
        if ($inside)
        {
            # the run ended on the previous byte
            unless ($is_lead{$byte}) { push @bounds, $byte - 1; $inside = 0; }
        }
        elsif ($is_lead{$byte})
        {
            # a new run starts here
            push @bounds, $byte;
            $inside = 1;
        }
    }
    # a run still open at the end of the scan is closed at 0xff
    push @bounds, 0xff if $inside;
    return @bounds;
}
928
929
930	################################################################
931	# dump the case mapping tables
# Write the lower case, upper case, digit and compatibility mapping tables
# as C arrays to "$filename.new", then hand $filename to save_file().
# The tables themselves are printed by DUMP_CASE_TABLE, which writes to
# the OUTPUT handle opened here.
sub DUMP_CASE_MAPPINGS($)
{
    my $filename = shift;
    open OUTPUT, '>', "$filename.new" or die "Cannot create $filename.new: $!";
    # plain print: the file name must not be interpreted as a format string
    print "Building $filename\n";
    printf OUTPUT "/* Unicode case mappings */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    DUMP_CASE_TABLE( "wine_casemap_lower", @tolower_table );
    DUMP_CASE_TABLE( "wine_casemap_upper", @toupper_table );
    DUMP_CASE_TABLE( "wine_digitmap", @digitmap_table );
    DUMP_CASE_TABLE( "wine_compatmap", @compatmap_table );
    close OUTPUT;
    save_file($filename);
}
948
949
950	################################################################
951	# dump a case mapping table
# Print one mapping table as a C array of WCHAR to the already opened
# OUTPUT handle.
#
# Arguments: the C name of the array, then the mapping indexed by Unicode
# code point (undefined entries mean "no mapping").
#
# Each stored value is the 16-bit difference (target - source).  The array
# starts with a 256 entry index (one offset per high byte) and a 256 entry
# block of zeros used for unpopulated high bytes.  Populated blocks follow;
# each is dumped from its first populated entry up to 0xff, and its offset
# in the index is lowered accordingly.
sub DUMP_CASE_TABLE($@)
{
    my ($name,@table) = @_;

    # count the number of sub tables that contain something
    # also compute the low and upper populated bounds

    # per populated block, in dump order:
    #   lowerbounds = low byte of its first populated entry
    #   upperbounds = 0xff minus the low byte of its last populated entry
    my @lowerbounds = ( 0, 0 );
    my @upperbounds = ( 0, 255 );
    my $index = 0;
    my @filled = ();
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $table[$i];
        if (!defined $filled[$i >> 8])
        {
            $lowerbounds[$index] = $i & 0xff;
            $upperbounds[$index] = 0xff - $lowerbounds[$index];
            $filled[$i >> 8] = $index * 256 + 512;
            $index++;
        }
        else
        {
            $upperbounds[$index-1] = 0xff - ($i & 0xff);
        }
        # store the mapping as an offset from the source char
        $table[$i] = ($table[$i] - $i) & 0xffff;
    }

    # Collapse blocks upwards if possible
    my $removed = 0;
    $index = 0;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless defined $filled[$i];
        # NOTE(review): for the first block $index - 1 is -1, so this reads
        # the LAST element of @upperbounds -- confirm that is intended.
        if ($upperbounds[$index - 1] > $lowerbounds[$index])
        {
            $removed = $removed + $lowerbounds[$index];
        }
        else
        {
            $removed = $removed + $upperbounds[$index - 1];
            $lowerbounds[$index] = $upperbounds[$index - 1];
        }
        $filled[$i] = $filled[$i] - $removed;
        $index++;
    }

    # dump the table

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $index * 256 + 512 - $removed;
    printf OUTPUT "{\n /* index */\n";
    # @filled only extends up to the highest populated block; always emit
    # all 256 index entries so the data matches the declared array size
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled[0 .. 255] );
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
    $index = 0;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT ",\n /* 0x%02x%02x .. 0x%02xff */\n", $i, $lowerbounds[$index], $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0,
                                       @table[($i<<8) + $lowerbounds[$index] .. ($i<<8)+255] );
        $index++;
    }
    printf OUTPUT "\n};\n";
}
1017
1018
1019	################################################################
1020	# dump the ctype tables
# Write the character type table (wine_wctype_table) into
# "$filename.new" and hand the result over to save_file().
#
# Parameters:
#   $filename - name of the C file to generate
#
# The generated array starts with 256 offsets (one per high byte)
# followed by the 256-entry rows they point to; identical rows are
# stored only once.  Note that the direction bits are merged into the
# global @category_table in place.
#
# Dies when the temporary file cannot be created or completely written.
sub DUMP_CTYPE_TABLES($)
{
    my $filename = shift;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename.new: $!";

    # pass the name as an argument so that a '%' in it is not taken as a format
    printf "Building %s\n", $filename;
    printf OUTPUT "/* Unicode ctype tables */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    my @array = (0) x 256;
    my %sequences;

    # add the direction in the high 4 bits of the category
    for (my $i = 0; $i < 65536; $i++)
    {
        $category_table[$i] |= $direction_table[$i] << 12 if defined $direction_table[$i];
    }

    # try to merge table rows
    for (my $row = 0; $row < 256; $row++)
    {
        my @values = @category_table[($row<<8)..($row<<8)+255];

        # the hex dump of the row serves as the key to spot duplicates
        my $rowtxt = sprintf "%04x" x 256, @values;
        if (defined($sequences{$rowtxt}))
        {
            # reuse an existing row
            $array[$row] = $sequences{$rowtxt};
        }
        else
        {
            # create a new row, appended at the current end of the array
            $sequences{$rowtxt} = $array[$row] = scalar @array;
            push @array, @values;
        }
    }

    printf OUTPUT "const unsigned short wine_wctype_table[%d] =\n{\n", scalar @array;
    printf OUTPUT " /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT " /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}
1063
1064
1065	################################################################
1066	# dump the char composition tables
# Write the character composition and decomposition tables into
# "$filename.new" and hand the result over to save_file().
#
# Parameters:
#   $filename - name of the C file to generate
#
# Dies when the temporary file cannot be created or completely written.
sub DUMP_COMPOSE_TABLES($)
{
    my $filename = shift;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename.new: $!";

    # pass the name as an argument so that a '%' in it is not taken as a format
    printf "Building %s\n", $filename;
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    _dump_compose_table();
    _dump_decompose_table();

    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}

# Print unicode_compose_table and unicode_compose_table_size to OUTPUT,
# built from the global @compose_table.
sub _dump_compose_table
{
    # group the entries by their second field (the second char):
    # $by_second[second] = list of [ first char, mapping ] pairs
    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my @comp = @$entry;
        push @{$by_second[$comp[1]]}, [ $comp[0], $comp[2] ];
    }

    # count how many different second chars we have

    my $count = 0;
    for (my $i = 0; $i < 65536; $i++)
    {
        $count++ if defined $by_second[$i];
    }

    # build the table of second chars and offsets;
    # positions are counted in pairs of WCHARs

    my $pos = $count + 1;
    my @index = ();
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $by_second[$i];
        push @index, $i, $pos;
        $pos += @{$by_second[$i]};
    }
    # terminator with last position
    push @index, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT " /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @index );

    # build the table of first chars and mappings, sorted by first char

    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $by_second[$i];
        my @pairs = ();
        foreach my $pair (sort { $a->[0] <=> $b->[0] } @{$by_second[$i]})
        {
            push @pairs, $pair->[0], $pair->[1];
        }
        printf OUTPUT ",\n /* 0x%04x */\n%s", $i, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;
}

# Print unicode_decompose_table to OUTPUT, built from the global
# @decomp_table.  The array is a three-level structure: a 256-entry
# main index, 16-entry sub-indexes, and 16-char subsets that hold two
# WCHARs per character.
sub _dump_decompose_table
{
    # first determine all the 16-char subsets that contain something

    my @subset_pos = (0) x 4096;
    my $pos = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $pos;
        $pos += 16*2;
        $i |= 15;  # jump to the last char of this subset
    }
    my $total = $pos;

    # now count the 256-char subsets that contain something

    my @filled_idx = (256) x 256;
    $pos = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $pos;
        $pos += 16;
        $i |= 15;  # jump to the last subset of this 256-char range
    }
    my $null_offset = $pos;  # null mapping
    $total += $pos;

    # add the index offsets to the subsets positions

    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $subset_pos[$i] += $null_offset;
    }

    # dump the main index

    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes

    for (my $i = 0; $i < 256; $i++)
    {
        next unless ($filled_idx[$i] > 256);
        my @table = @subset_pos[($i<<4)..($i<<4)+15];
        # empty subsets point to the null mapping
        for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
        printf OUTPUT ",\n /* sub-index %02x */\n", $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
    }

    # dump the 16-char subsets

    printf OUTPUT ",\n /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        my @table = (0) x 32;
        for (my $j = 0; $j < 16; $j++)
        {
            my $decomp = $decomp_table[($i << 4) + $j];
            next unless defined $decomp;
            $table[2 * $j] = ${$decomp}[0];
            $table[2 * $j + 1] = ${$decomp}[1];
        }
        printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
    }

    printf OUTPUT "\n};\n";
}
1205
1206
1207	################################################################
1208	# handle a "bestfit" Windows mapping file
1209
# Read a Windows "bestfit" mapping file and generate the matching
# c_NNN.c code page file.
#
# Parameters:
#   $filename   - mapping file name, relative to $MAPPREFIX
#   $has_glyphs - passed through to dump_sbcs_table()
#   $comment    - description printed in the generated header comment
#
# The file is parsed section by section (MBTABLE, WCTABLE, DBCSRANGE,
# DBCSTABLE); the mappings are stored in the global @cp2uni, @uni2cp
# and @lead_bytes arrays.  An existing mapping is never overwritten.
#
# Dies on an unreadable input file, on a line that is not recognized,
# when the CODEPAGE or CPINFO line is missing, or when the output file
# cannot be written.
sub handle_bestfit_file($$$)
{
    my ($filename, $has_glyphs, $comment) = @_;
    my $state = "";
    my ($codepage, $width, $def, $defw, $count);
    my ($lb_cur, $lb_end);
    my @lb_ranges = ();

    open my $input, "<", $MAPPREFIX . $filename or die "Cannot open $filename: $!";

    while (my $line = <$input>)
    {
        next if $line =~ /^;/;  # skip comments
        next if $line =~ /^\s*$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        last if $line =~ /^ENDCODEPAGE/;

        if ($line =~ /^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        if ($line =~ /^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $def = hex $2;
            $defw = hex $3;
            next;
        }
        if ($line =~ /^(MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            # section header: remember the section and its entry count
            $state = $1;
            $count = $2;
            next;
        }
        if ($line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            my $first = hex($1);
            my $second = hex($2);

            if ($state eq "MBTABLE")
            {
                # code page char -> Unicode
                $cp2uni[$first] = $second unless defined($cp2uni[$first]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                # Unicode -> code page char
                $uni2cp[$first] = $second unless defined($uni2cp[$first]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # inclusive range of lead bytes
                push @lb_ranges, $first, $second;
                for (my $i = $first; $i <= $second; $i++)
                {
                    push @lead_bytes, $i;
                    $cp2uni[$i] = 0;
                }
                $lb_cur = $first;
                $lb_end = $second;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                # trail byte -> Unicode for the current lead byte
                my $cp = ($lb_cur << 8) | $first;
                $cp2uni[$cp] = $second unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # table complete: after the last lead byte of the
                    # range a new range line is expected
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $line\n";
    }
    close $input;

    # without these two values no meaningful output can be produced
    die "$filename: no CODEPAGE line found\n" unless defined $codepage;
    die "$filename: no CPINFO line found\n" unless defined $width;

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT, ">", "$output.new" or die "Cannot create $output.new: $!";

    printf "Building %s from %s (%s)\n", $output, $filename, $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    if ($width == 1) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $def, $defw ); }
    else { dump_dbcs_table( $codepage, $comment, $def, $defw, @lb_ranges ); }

    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $output.new: $!";
    save_file($output);
}
1309
1310
1311	################################################################
1312	# read an input file and generate the corresponding .c file
# Read one input file and generate the corresponding c_NNN.c file.
#
# Parameters (one entry of @allfiles, flattened):
#   $codepage   - code page number
#   $filename   - mapping file name, relative to $MAPPREFIX; may be
#                 empty, in which case the header says "hardcoded data"
#   $has_glyphs - passed through to dump_sbcs_table()
#   $comment    - description printed in the generated header comment
#
# The global @cp2uni, @lead_bytes and @uni2cp arrays are reset first.
#
# Dies when the output file cannot be created or completely written.
sub HANDLE_FILE(@)
{
    my ($codepage,$filename,$has_glyphs,$comment) = @_;

    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    # code pages 20932 and 20127 and the "bestfit" files each have a
    # dedicated reader; everything else goes through READ_FILE
    if ($codepage == 20932) { READ_JIS0208_FILE($MAPPREFIX . $filename); }
    elsif ($codepage == 20127) { fill_20127_codepage(); }
    elsif ($filename =~ /\/bestfit/)
    {
        # bestfit files write their own output file
        handle_bestfit_file( $filename, $has_glyphs, $comment );
        return;
    }
    else { READ_FILE($MAPPREFIX . $filename); }

    ADD_DEFAULT_MAPPINGS();

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT, ">", "$output.new" or die "Cannot create $output.new: $!";

    printf "Building %s from %s (%s)\n", $output, $filename || "hardcoded data", $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    if ($filename)
    {
        printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
        printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    }
    else
    {
        printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    }
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # a code page without lead bytes is a single-byte one
    if (!@lead_bytes) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $DEF_CHAR, $DEF_CHAR ); }
    else { dump_dbcs_table( $codepage, $comment, $DEF_CHAR, $DEF_CHAR, get_lb_ranges() ); }

    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $output.new: $!";
    save_file($output);
}
1357
1358
1359	################################################################
1360	# save a file if modified
# Install "$file.new" as $file, but only if the contents differ.
#
# Parameters:
#   $file - name of the final file; the new contents are expected in
#           "$file.new"
#
# When $file exists and is identical to "$file.new", the new file is
# removed and $file is left untouched.  Otherwise "$file.new" is
# renamed to $file.
#
# Dies when the new file cannot be removed or renamed.
sub save_file($)
{
    my $file = shift;

    # compare inside perl instead of running "cmp" through the shell, so
    # that names with spaces or shell metacharacters are handled properly
    require File::Compare;

    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or die "Cannot remove $file.new: $!";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}
1373
1374
1375	################################################################
1376	# output the list of codepage tables into the cptable.c file
# Regenerate the list of code page tables in cptable.c: one extern
# declaration per entry of @allfiles, followed by the cptables[] array
# holding a pointer to each of them, in the same order.
sub OUTPUT_CPTABLE()
{
    # the code page number is the first field of every entry
    my @codepages = ();
    foreach my $entry (@allfiles)
    {
        push @codepages, $entry->[0];
    }

    my @tables_decl = ();
    foreach my $cp (@codepages)
    {
        push @tables_decl, sprintf("extern union cptable cptable_%03d;\n", $cp);
    }

    push @tables_decl,
         sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n", scalar @allfiles);
    foreach my $cp (@codepages)
    {
        push @tables_decl, sprintf(" &cptable_%03d,\n", $cp);
    }
    push @tables_decl, "};";

    REPLACE_IN_FILE( "cptable.c", @tables_decl );
}
1396
1397	################################################################
1398	# replace the contents of a file between ### cpmap ### marks
1399
# Replace the contents of a file between the "### cpmap begin ###" and
# "### cpmap end ###" marks.
#
# Parameters:
#   $name - file to update
#   @data - lines to put between the two marker lines
#
# Everything up to and including the begin marker line and everything
# from the end marker line onwards is kept; an empty line is inserted
# in front of the end marker line.  The result is written to
# "$name.new" and installed through save_file().
#
# Dies when the file cannot be read or written, or when one of the
# markers is missing (nothing is written in that case).
sub REPLACE_IN_FILE($@)
{
    my $name = shift;
    my @data = @_;
    my @lines = ();
    my $seen_begin = 0;
    my $seen_end = 0;

    open(my $in, "<", $name) or die "Can't open $name: $!";

    # keep everything up to the begin marker
    while (my $line = <$in>)
    {
        push @lines, $line;
        if ($line =~ /\#\#\# cpmap begin \#\#\#/) { $seen_begin = 1; last; }
    }
    die "$name: '### cpmap begin ###' marker not found\n" unless $seen_begin;

    push @lines, @data;

    # drop the old contents up to the end marker
    while (my $line = <$in>)
    {
        if ($line =~ /\#\#\# cpmap end \#\#\#/)
        {
            push @lines, "\n", $line;
            $seen_end = 1;
            last;
        }
    }
    die "$name: '### cpmap end ###' marker not found\n" unless $seen_end;

    # keep the rest of the file
    push @lines, <$in>;
    close($in);

    open(my $out, ">", "$name.new") or die "Can't modify $name: $!";
    print $out @lines;
    # buffered write errors only show up here
    close($out) or die "Can't write $name.new: $!";
    save_file($name);
}
1422
1423	################################################################
1424	# main routine
1425
# load the default mappings
READ_DEFAULTS( $DEFAULTS );

# generate the Unicode tables
DUMP_CASE_MAPPINGS( "casemap.c" );
DUMP_SORTKEYS( "collation.c", READ_SORTKEYS_FILE() );
DUMP_COMPOSE_TABLES( "compose.c" );
DUMP_CTYPE_TABLES( "wctype.c" );

# generate one c_NNN.c file per code page.
# The loop variable is a named lexical on purpose: the readers use
# "while (<INPUT>)", which assigns to $_, so iterating with $_ would
# overwrite the entries of @allfiles.
for my $entry (@allfiles)
{
    HANDLE_FILE( @$entry );
}

# finally update the list of code page tables in cptable.c
OUTPUT_CPTABLE();

exit 0;
1437
1438	# Local Variables:
1439	# compile-command: "./cpmap.pl && make -k"
1440	# End:

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Additions/WINNT/Graphics/Wine/libWine/cpmap.pl@ 33085

Download in other formats: