cpmap.pl@ 33876

Last change on this file since 33876 was 33656, checked in by vboxsync, 14 years ago
*: rebrand Sun (L)GPL disclaimers
Property svn:eol-style set to `native`
File size: 46.8 KB

Line
1	#!/usr/bin/perl -w
2	#
3	# Generate code page .c files from ftp.unicode.org descriptions
4	#
5	# Copyright 2000 Alexandre Julliard
6	#
7	# This library is free software; you can redistribute it and/or
8	# modify it under the terms of the GNU Lesser General Public
9	# License as published by the Free Software Foundation; either
10	# version 2.1 of the License, or (at your option) any later version.
11	#
12	# This library is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	# Lesser General Public License for more details.
16	#
17	# You should have received a copy of the GNU Lesser General Public
18	# License along with this library; if not, write to the Free Software
19	# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20	#
21
22
23	# Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice
24	# other than GPL or LGPL is available it will apply instead, Oracle elects to use only
25	# the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where
26	# a choice of LGPL license versions is made available with the language indicating
27	# that LGPLv2 or any later version may be used, or where a choice of which version
28	# of the LGPL is applied is otherwise unspecified.
29
30	#
31	#
32
33	use strict;
34
# Directory holding the ftp.unicode.org files, and the mappings below it
my $BASEDIR   = 'ftp.unicode.org/Public/';
my $MAPPREFIX = "${BASEDIR}MAPPINGS/";

# Path of the UnicodeData file
my $UNICODEDATA = "${BASEDIR}UNIDATA/UnicodeData.txt";

# Path of the sort keys file
my $SORTKEYS = 'www.unicode.org/reports/tr10/allkeys.txt';

# Path of the defaults mapping file
my $DEFAULTS = './defaults';

# Character used wherever a mapping is undefined
my $DEF_CHAR = ord('?');
50
# One row per generated code page:
#   [ code page number, mapping file (undef when there is none), 0/1 flag, description ]
# NOTE(review): the consumer of this table is not in view here; the flag is
# presumably the has_glyphs argument of dump_sbcs_table and the path is
# presumably relative to $MAPPREFIX -- confirm against the main loop.
my @allfiles =
(
    [    37, 'VENDORS/MICSFT/EBCDIC/CP037.TXT',               0, 'IBM EBCDIC US Canada' ],
    [   424, 'VENDORS/MISC/CP424.TXT',                        0, 'IBM EBCDIC Hebrew' ],
    [   437, 'VENDORS/MICSFT/PC/CP437.TXT',                   1, 'OEM United States' ],
    [   500, 'VENDORS/MICSFT/EBCDIC/CP500.TXT',               0, 'IBM EBCDIC International' ],
    [   737, 'VENDORS/MICSFT/PC/CP737.TXT',                   1, 'OEM Greek 437G' ],
    [   775, 'VENDORS/MICSFT/PC/CP775.TXT',                   1, 'OEM Baltic' ],
    [   850, 'VENDORS/MICSFT/PC/CP850.TXT',                   1, 'OEM Multilingual Latin 1' ],
    [   852, 'VENDORS/MICSFT/PC/CP852.TXT',                   1, 'OEM Slovak Latin 2' ],
    [   855, 'VENDORS/MICSFT/PC/CP855.TXT',                   1, 'OEM Cyrillic' ],
    [   856, 'VENDORS/MISC/CP856.TXT',                        0, 'Hebrew PC' ],
    [   857, 'VENDORS/MICSFT/PC/CP857.TXT',                   1, 'OEM Turkish' ],
    [   860, 'VENDORS/MICSFT/PC/CP860.TXT',                   1, 'OEM Portuguese' ],
    [   861, 'VENDORS/MICSFT/PC/CP861.TXT',                   1, 'OEM Icelandic' ],
    [   862, 'VENDORS/MICSFT/PC/CP862.TXT',                   1, 'OEM Hebrew' ],
    [   863, 'VENDORS/MICSFT/PC/CP863.TXT',                   1, 'OEM Canadian French' ],
    [   864, 'VENDORS/MICSFT/PC/CP864.TXT',                   0, 'OEM Arabic' ],
    [   865, 'VENDORS/MICSFT/PC/CP865.TXT',                   1, 'OEM Nordic' ],
    [   866, 'VENDORS/MICSFT/PC/CP866.TXT',                   1, 'OEM Russian' ],
    [   869, 'VENDORS/MICSFT/PC/CP869.TXT',                   1, 'OEM Greek' ],
    [   874, 'VENDORS/MICSFT/WindowsBestFit/bestfit874.txt',  1, 'ANSI/OEM Thai' ],
    [   875, 'VENDORS/MICSFT/EBCDIC/CP875.TXT',               0, 'IBM EBCDIC Greek' ],
    [   878, 'VENDORS/MISC/KOI8-R.TXT',                       0, 'Russian KOI8' ],
    [   932, 'VENDORS/MICSFT/WindowsBestFit/bestfit932.txt',  0, 'ANSI/OEM Japanese Shift-JIS' ],
    [   936, 'VENDORS/MICSFT/WindowsBestFit/bestfit936.txt',  0, 'ANSI/OEM Simplified Chinese GBK' ],
    [   949, 'VENDORS/MICSFT/WindowsBestFit/bestfit949.txt',  0, 'ANSI/OEM Korean Unified Hangul' ],
    [   950, 'VENDORS/MICSFT/WindowsBestFit/bestfit950.txt',  0, 'ANSI/OEM Traditional Chinese Big5' ],
    [  1006, 'VENDORS/MISC/CP1006.TXT',                       0, 'IBM Arabic' ],
    [  1026, 'VENDORS/MICSFT/EBCDIC/CP1026.TXT',              0, 'IBM EBCDIC Latin 5 Turkish' ],
    [  1250, 'VENDORS/MICSFT/WindowsBestFit/bestfit1250.txt', 0, 'ANSI Eastern Europe' ],
    [  1251, 'VENDORS/MICSFT/WindowsBestFit/bestfit1251.txt', 0, 'ANSI Cyrillic' ],
    [  1252, 'VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt', 0, 'ANSI Latin 1' ],
    [  1253, 'VENDORS/MICSFT/WindowsBestFit/bestfit1253.txt', 0, 'ANSI Greek' ],
    [  1254, 'VENDORS/MICSFT/WindowsBestFit/bestfit1254.txt', 0, 'ANSI Turkish' ],
    [  1255, 'VENDORS/MICSFT/WindowsBestFit/bestfit1255.txt', 0, 'ANSI Hebrew' ],
    [  1256, 'VENDORS/MICSFT/WindowsBestFit/bestfit1256.txt', 0, 'ANSI Arabic' ],
    [  1257, 'VENDORS/MICSFT/WindowsBestFit/bestfit1257.txt', 0, 'ANSI Baltic' ],
    [  1258, 'VENDORS/MICSFT/WindowsBestFit/bestfit1258.txt', 0, 'ANSI/OEM Viet Nam' ],
    [  1361, 'OBSOLETE/EASTASIA/KSC/JOHAB.TXT',               0, 'Korean Johab' ],
    [ 10000, 'VENDORS/MICSFT/MAC/ROMAN.TXT',                  0, 'Mac Roman' ],
    [ 10006, 'VENDORS/MICSFT/MAC/GREEK.TXT',                  0, 'Mac Greek' ],
    [ 10007, 'VENDORS/MICSFT/MAC/CYRILLIC.TXT',               0, 'Mac Cyrillic' ],
    [ 10029, 'VENDORS/MICSFT/MAC/LATIN2.TXT',                 0, 'Mac Latin 2' ],
    [ 10079, 'VENDORS/MICSFT/MAC/ICELAND.TXT',                0, 'Mac Icelandic' ],
    [ 10081, 'VENDORS/MICSFT/MAC/TURKISH.TXT',                0, 'Mac Turkish' ],
    [ 20127, undef,                                           0, 'US-ASCII (7bit)' ],
    [ 20866, 'VENDORS/MISC/KOI8-R.TXT',                       0, 'Russian KOI8' ],
    [ 20932, 'OBSOLETE/EASTASIA/JIS/JIS0208.TXT',             0, 'EUC-JP' ],
    [ 21866, 'VENDORS/MISC/KOI8-U.TXT',                       0, 'Ukrainian KOI8' ],
    [ 28591, 'ISO8859/8859-1.TXT',                            0, 'ISO 8859-1 Latin 1' ],
    [ 28592, 'ISO8859/8859-2.TXT',                            0, 'ISO 8859-2 Latin 2 (East European)' ],
    [ 28593, 'ISO8859/8859-3.TXT',                            0, 'ISO 8859-3 Latin 3 (South European)' ],
    [ 28594, 'ISO8859/8859-4.TXT',                            0, 'ISO 8859-4 Latin 4 (Baltic old)' ],
    [ 28595, 'ISO8859/8859-5.TXT',                            0, 'ISO 8859-5 Cyrillic' ],
    [ 28596, 'ISO8859/8859-6.TXT',                            0, 'ISO 8859-6 Arabic' ],
    [ 28597, 'ISO8859/8859-7.TXT',                            0, 'ISO 8859-7 Greek' ],
    [ 28598, 'ISO8859/8859-8.TXT',                            0, 'ISO 8859-8 Hebrew' ],
    [ 28599, 'ISO8859/8859-9.TXT',                            0, 'ISO 8859-9 Latin 5 (Turkish)' ],
    [ 28600, 'ISO8859/8859-10.TXT',                           0, 'ISO 8859-10 Latin 6 (Nordic)' ],
    [ 28603, 'ISO8859/8859-13.TXT',                           0, 'ISO 8859-13 Latin 7 (Baltic)' ],
    [ 28604, 'ISO8859/8859-14.TXT',                           0, 'ISO 8859-14 Latin 8 (Celtic)' ],
    [ 28605, 'ISO8859/8859-15.TXT',                           0, 'ISO 8859-15 Latin 9 (Euro)' ],
    [ 28606, 'ISO8859/8859-16.TXT',                           0, 'ISO 8859-16 Latin 10 (Balkan)' ],
);
116
117
# Character type flags, one bit per class
my %ctype =
(
    upper  => 1 << 0,
    lower  => 1 << 1,
    digit  => 1 << 2,
    space  => 1 << 3,
    punct  => 1 << 4,
    cntrl  => 1 << 5,
    blank  => 1 << 6,
    xdigit => 1 << 7,
    alpha  => 1 << 8,
);
130
# Maps each Unicode general category code (third field of UnicodeData.txt)
# to the %ctype flag bits assigned to characters of that category.
my %categories =
(
    "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"alpha"},    # Letter, Titlecase
    "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
    "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
    "Me" => $ctype{"punct"},    # Mark, Enclosing
    "Nd" => $ctype{"digit"},    # Number, Decimal Digit
    "Nl" => $ctype{"punct"},    # Number, Letter
    "No" => $ctype{"punct"},    # Number, Other
    "Zs" => $ctype{"space"},    # Separator, Space
    "Zl" => $ctype{"space"},    # Separator, Line
    "Zp" => $ctype{"space"},    # Separator, Paragraph
    "Cc" => $ctype{"cntrl"},    # Other, Control
    "Cf" => 0,                  # Other, Format
    "Cs" => 0,                  # Other, Surrogate
    "Co" => 0,                  # Other, Private Use
    "Cn" => 0,                  # Other, Not Assigned
    "Lm" => $ctype{"punct"},    # Letter, Modifier
    "Lo" => $ctype{"alpha"},    # Letter, Other
    "Pc" => $ctype{"punct"},    # Punctuation, Connector
    "Pd" => $ctype{"punct"},    # Punctuation, Dash
    "Ps" => $ctype{"punct"},    # Punctuation, Open
    "Pe" => $ctype{"punct"},    # Punctuation, Close
    "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
    "Pf" => $ctype{"punct"},    # Punctuation, Final quote
    "Po" => $ctype{"punct"},    # Punctuation, Other
    "Sm" => $ctype{"punct"},    # Symbol, Math
    "Sc" => $ctype{"punct"},    # Symbol, Currency
    "Sk" => $ctype{"punct"},    # Symbol, Modifier
    "So" => $ctype{"punct"}     # Symbol, Other
);
164
# Extra %ctype flags for characters whose class cannot be derived from the
# Unicode general category alone; keyed by flag name, each value lists the
# code points that additionally receive that flag.
my %special_categories =
(
    "xdigit" => [ 0x30 .. 0x39,             # '0' .. '9'
                  0x41 .. 0x46,             # 'A' .. 'F'
                  0x61 .. 0x66,             # 'a' .. 'f'
                  0xff10 .. 0xff19,
                  0xff21 .. 0xff26,
                  0xff41 .. 0xff46 ],
    "space"  => [ 0x09 .. 0x0d, 0x85 ],
    "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl"  => [ 0x070f,
                  0x180b .. 0x180e,
                  0x200c .. 0x200f,
                  0x202a .. 0x202e,
                  0x206a .. 0x206f,
                  0xfeff,
                  0xfff9 .. 0xfffb ],
);
177
# Numeric code stored in @direction_table for each bidirectional category
# name found in the fifth field of UnicodeData.txt
my %directions =
(
    L   => 1,     # Left-to-Right
    R   => 2,     # Right-to-Left
    EN  => 3,     # European Number
    ES  => 4,     # European Number Separator
    ET  => 5,     # European Number Terminator
    AN  => 6,     # Arabic Number
    CS  => 7,     # Common Number Separator
    B   => 8,     # Paragraph Separator
    S   => 9,     # Segment Separator
    WS  => 10,    # Whitespace
    ON  => 11,    # Other Neutrals
    AL  => 12,    # Right-to-Left Arabic
    NSM => 13,    # Non-Spacing Mark
    BN  => 14,    # Boundary Neutral
    LRE => 15,    # Left-to-Right Embedding
    LRO => 15,    # Left-to-Right Override
    RLE => 15,    # Right-to-Left Embedding
    RLO => 15,    # Right-to-Left Override
    PDF => 15,    # Pop Directional Format
);
200
# Tables shared by the reader and dump routines; all start out empty except
# the category table, which is pre-filled with 0 for all 65536 entries.
my (@cp2uni, @lead_bytes, @uni2cp);
my (@unicode_defaults, @unicode_aliases);
my (@tolower_table, @toupper_table);
my (@digitmap_table, @compatmap_table);
my @category_table = (0) x 65536;
my @direction_table;
my (@decomp_table, @compose_table);
214
215
################################################################
# read in the defaults file
#
# Argument: path of the defaults file.  Every line that is not a comment or
# empty must look like
#     <hex>[,<hex>...]  <hex> | '<char>' | none   [# comment]
# All listed source values get the given target in @unicode_defaults; a list
# of two or more source values is also recorded in @unicode_aliases.
# $UNICODEDATA is then scanned to fill the category, direction, case, digit,
# compatibility and (de)composition tables and to add further defaults
# derived from the decomposition field.  Finally the %special_categories
# flags are merged into @category_table.
# Dies on unreadable files, unparsable lines, duplicate defaults, unknown
# categories or directions, and mapping loops.
sub READ_DEFAULTS($)
{
    my $filename = shift;
    my $start;   # first code point of a pending "<..., First>" range

    # first setup a few default mappings

    open my $defaults, '<', $filename or die "Cannot open $filename: $!";
    print "Loading $filename\n";
    while (<$defaults>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            my @src = map hex, split /,/,$1;
            my $dst = $4;
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach my $src (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$src]);
                $unicode_defaults[$src] = $dst;
            }
            next;
        }
        die "Unrecognized line $_\n";
    }
    close $defaults;

    # now build mappings from the decomposition field of the Unicode database

    open my $unidata, '<', $UNICODEDATA or die "Cannot open $UNICODEDATA: $!";
    print "Loading $UNICODEDATA\n";
    while (<$unidata>)
    {
        # Decode the fields ...
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/;
        my $dst;
        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        # a character with a lowercase mapping counts as uppercase, and
        # the other way around
        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # stores the character code of the digit field's first character
            $digitmap_table[$src] = ord $dig;
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            # without this check an unmatched Last entry would fill from index 0
            die "$UNICODEDATA: range end $code without a First entry\n" unless defined $start;
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $start++;
            }
            undef $start;
        }

        next if $decomp eq "";  # no decomposition, skip it

        # code points for which single char decompositions go to the compat map
        my $in_compat_range = (($src >= 0xf900 && $src < 0xfb00) ||
                               ($src >= 0xfe30 && $src < 0xfffd));

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            my ($type, $char) = ($1, hex $2);
            if ($in_compat_range)
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = $char;
            }
            next unless ($type eq "font" ||
                         $type eq "noBreak" ||
                         $type eq "circle" ||
                         $type eq "super" ||
                         $type eq "sub" ||
                         $type eq "wide" ||
                         $type eq "narrow" ||
                         $type eq "compat" ||
                         $type eq "small");
            $dst = $char;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst] if defined $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
            elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ && $in_compat_range)
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops
        for (my $i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close $unidata;

    # patch the category of some special characters

    foreach my $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
}
367
368
################################################################
# parse the input file
#
# Argument: path of a code page mapping file.  Lines have the form
# "0x<cp> 0x<unicode> [# comment]"; "#UNDEFINED" entries are skipped and
# "#DBCS LEAD BYTE" entries are recorded in @lead_bytes.  The first mapping
# seen for a value wins in both @cp2uni and @uni2cp.  A code point above
# 0xff also registers its high byte as a lead byte if that byte has no
# entry yet.  Dies if the file cannot be opened or a line is not recognized.
sub READ_FILE($)
{
    my $name = shift;
    open my $input, '<', $name or die "Cannot open $name: $!";

    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char

        if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            my $cp = hex $1;
            push @lead_bytes,$cp;
            $cp2uni[$cp] = 0;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my $cp = hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            if ($cp > 0xff && !defined($cp2uni[$cp >> 8]))
            {
                # double byte char whose lead byte was not declared explicitly
                push @lead_bytes,$cp >> 8;
                $cp2uni[$cp >> 8] = 0;
            }
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
406
407
################################################################
# fill input data for the 20127 (us-ascii) codepage
#
# Bytes 0..127 map to themselves in both directions; bytes 128..255 map
# one-way to the Unicode value obtained by clearing the top bit.
sub fill_20127_codepage()
{
    foreach my $byte (0 .. 255)
    {
        if ($byte < 128)
        {
            $uni2cp[$byte] = $byte;
            $cp2uni[$byte] = $byte;
        }
        else
        {
            $cp2uni[$byte] = $byte & 0x7f;
        }
    }
}
415
################################################################
# get a mapping including glyph chars for MB_USEGLYPHCHARS
#
# Takes a cp->unicode mapping as a flat list and returns a copy of it in
# which entries 0x01..0x1f and 0x7f are replaced by glyph characters.
# The argument list itself is left untouched: assigning to $_[n] would
# write through to the caller's array, because @_ aliases the arguments.
sub get_glyphs_mapping(@)
{
    my @mapping = @_;

    # glyphs for 0x01 .. 0x1f, in order
    my @low_glyphs =
    (
        0x263a,  # 0x01 (WHITE SMILING FACE)
        0x263b,  # 0x02 (BLACK SMILING FACE)
        0x2665,  # 0x03 (BLACK HEART SUIT)
        0x2666,  # 0x04 (BLACK DIAMOND SUIT)
        0x2663,  # 0x05 (BLACK CLUB SUIT)
        0x2660,  # 0x06 (BLACK SPADE SUIT)
        0x2022,  # 0x07 (BULLET)
        0x25d8,  # 0x08 (INVERSE BULLET)
        0x25cb,  # 0x09 (WHITE CIRCLE)
        0x25d9,  # 0x0a (INVERSE WHITE CIRCLE)
        0x2642,  # 0x0b (MALE SIGN)
        0x2640,  # 0x0c (FEMALE SIGN)
        0x266a,  # 0x0d (EIGHTH NOTE)
        0x266b,  # 0x0e (BEAMED EIGHTH NOTES)
        0x263c,  # 0x0f (WHITE SUN WITH RAYS)
        0x25ba,  # 0x10 (BLACK RIGHT-POINTING POINTER)
        0x25c4,  # 0x11 (BLACK LEFT-POINTING POINTER)
        0x2195,  # 0x12 (UP DOWN ARROW)
        0x203c,  # 0x13 (DOUBLE EXCLAMATION MARK)
        0x00b6,  # 0x14 (PILCROW SIGN)
        0x00a7,  # 0x15 (SECTION SIGN)
        0x25ac,  # 0x16 (BLACK RECTANGLE)
        0x21a8,  # 0x17 (UP DOWN ARROW WITH BASE)
        0x2191,  # 0x18 (UPWARDS ARROW)
        0x2193,  # 0x19 (DOWNWARDS ARROW)
        0x2192,  # 0x1a (RIGHTWARDS ARROW)
        0x2190,  # 0x1b (LEFTWARDS ARROW)
        0x221f,  # 0x1c (RIGHT ANGLE)
        0x2194,  # 0x1d (LEFT RIGHT ARROW)
        0x25b2,  # 0x1e (BLACK UP-POINTING TRIANGLE)
        0x25bc,  # 0x1f (BLACK DOWN-POINTING TRIANGLE)
    );
    @mapping[0x01 .. 0x1f] = @low_glyphs;
    $mapping[0x7f] = 0x2302;  # (HOUSE)
    return @mapping;
}
455
################################################################
# build EUC-JP table from the JIS 0208 file
# FIXME: for proper EUC-JP we should probably read JIS 0212 too
# but this would require 3-byte DBCS characters
#
# Argument: path of the JIS 0208 mapping file.  Each data line carries three
# 0x-prefixed hex values; the second one plus 0x8080 is used as the code
# page value and the third one as the Unicode value.  The fixed single byte
# entries, lead bytes and compatibility mappings are set up before the file
# is read, so they take precedence over file entries for the same value.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_JIS0208_FILE($)
{
    my $name = shift;

    # ASCII chars
    for (my $i = 0x00; $i <= 0x7f; $i++)
    {
        $cp2uni[$i] = $i;
        $uni2cp[$i] = $i;
    }

    # JIS X 0201 right plane
    for (my $i = 0xa1; $i <= 0xdf; $i++)
    {
        $cp2uni[0x8e00 + $i] = 0xfec0 + $i;
        $uni2cp[0xfec0 + $i] = 0x8e00 + $i;
    }

    # lead bytes
    foreach my $i (0x8e, 0x8f, 0xa1 .. 0xfe)
    {
        push @lead_bytes,$i;
        $cp2uni[$i] = 0;
    }

    # undefined chars
    foreach my $i (0x80 .. 0x8d, 0x90 .. 0xa0, 0xff)
    {
        $cp2uni[$i] = $DEF_CHAR;
    }

    # Shift-JIS compatibility
    $uni2cp[0x00a5] = 0x5c;
    $uni2cp[0x203e] = 0x7e;

    # Fix backslash conversion
    $cp2uni[0xa1c0] = 0xff3c;
    $uni2cp[0xff3c] = 0xa1c0;

    open my $input, '<', $name or die "Cannot open $name: $!";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my $cp = 0x8080 + hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
516
517
################################################################
# build the sort keys table
#
# Reads $SORTKEYS and returns a list indexed by Unicode value (entries for
# characters without a key stay undefined).  Only single character entries
# at or below 0xffff are used.  Each returned value packs the four weights
# of the entry into one integer, see the compression comment below.
# Dies if the file cannot be opened, a line is not recognized, or the
# renumbered second/third weights do not fit in their bit fields.
sub READ_SORTKEYS_FILE()
{
    # one [ unicode value, key 1, key 2, key 3, key 4 ] record per character;
    # a unicode value of -1 marks a character without sort key
    my @sortkeys = map { [ -1, 0, 0, 0, 0 ] } 0 .. 65535;

    open my $input, '<', $SORTKEYS or die "Cannot open $SORTKEYS: $!";
    print "Loading $SORTKEYS\n";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if /^\@version/;  # skip @version header
        if (/^([0-9a-fA-F]+)\s+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $2, hex $3, hex $4, hex $5 ];
            next;
        }
        if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $input;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # records with identical keys are ordered by unicode value so that the
    # sort is reproducible; their relative order does not affect the result
    @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
                       ${$a}[2] <=> ${$b}[2] or
                       ${$a}[3] <=> ${$b}[3] or
                       ${$a}[4] <=> ${$b}[4] or
                       ${$a}[0] <=> ${$b}[0]; } @sortkeys;

    # $n2/$n3 number the distinct key 2 / key 3 values seen so far under the
    # current key 1 / key 2; they replace the original weights
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    for (my $i = 0; $i < 65536; $i++)
    {
        my @current = @{$sortkeys[$i]};
        next if $current[0] == -1;
        if ($current[1] == $keys[1])
        {
            if ($current[2] == $keys[2])
            {
                if ($current[3] != $keys[3])
                {
                    $keys[3] = $current[3];
                    $n3++;
                    die "$SORTKEYS: too many distinct key 3 values to fit in 4 bits\n" if ($n3 >= 16);
                }
            }
            else
            {
                $keys[2] = $current[2];
                $keys[3] = $current[3];
                $n2++;
                $n3 = 1;
                die "$SORTKEYS: too many distinct key 2 values to fit in 8 bits\n" if ($n2 >= 256);
            }
        }
        else
        {
            $keys[1] = $current[1];
            $keys[2] = $current[2];
            $keys[3] = $current[3];
            $n2 = 1;
            $n3 = 1;
        }

        # zero weights stay zero, everything else gets its compressed value
        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }
    return @flatkeys;
}
607
608
################################################################
# dump the sort keys table
#
# Arguments: output file name, followed by the key list indexed by Unicode
# value (undefined entries mean "no key").  Writes "<filename>.new" as a C
# array made of a 256 entry index, a 256 entry block of default keys and
# one 256 entry block for every range that holds at least one key, then
# hands the file name to save_file().
sub DUMP_SORTKEYS($@)
{
    my ($filename, @keys) = @_;

    # count the number of 256-key ranges that contain something

    my @offsets = (256) x 256;   # 256 is the offset of the defaults block
    my $ranges = 2;              # index and defaults blocks come first
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $keys[$i];
        $offsets[$i >> 8] = $ranges * 256;
        $ranges++;
        $i |= 255;  # jump to the end of this range
    }

    # output the range offsets

    open OUTPUT, '>', "$filename.new" or die "Cannot create $filename.new: $!";
    print "Building $filename\n";
    print OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", $SORTKEYS;
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    printf OUTPUT "const unsigned int collation_table[%d] =\n{\n", $ranges*256;
    print OUTPUT " /* index */\n";
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%08x", 0, @offsets );

    # output the default values

    print OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0, (0xffffffff) x 256 );

    # output all the key ranges

    for (my $i = 0; $i < 256; $i++)
    {
        next if $offsets[$i] == 256;
        printf OUTPUT ",\n /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0xffffffff, @keys[($i<<8) .. ($i<<8)+255] );
    }
    print OUTPUT "\n};\n";
    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}
657
658
################################################################
# add default mappings once the file had been read
#
# Completes @uni2cp (and, for single bytes, @cp2uni) in three passes:
# alias groups, the defaults table, and finally identity mappings.
# Existing entries are never overwritten.
sub ADD_DEFAULT_MAPPINGS()
{
    # Pass 1: inside each alias group, every member without a mapping gets
    # the mapping of the first member that has one

    foreach my $group (@unicode_aliases)
    {
        my ($mapped) = grep { defined($uni2cp[$_]) } @$group;
        next unless defined($mapped);

        my $cp = $uni2cp[$mapped];
        foreach my $member (@$group)
        {
            next if defined($uni2cp[$member]);
            $uni2cp[$member] = $cp;
        }
    }

    # Pass 2: a char without mapping takes the mapping of its default; the
    # chain of defaults is followed until a mapped char is reached or the
    # chain ends

    foreach my $uni (0 .. 65535)
    {
        next if defined($uni2cp[$uni]);
        next unless defined($unicode_defaults[$uni]);

        my $hop = $unicode_defaults[$uni];
        until (defined($uni2cp[$hop]) || !defined($unicode_defaults[$hop]))
        {
            $hop = $unicode_defaults[$hop];
        }
        $uni2cp[$uni] = $uni2cp[$hop] if defined($uni2cp[$hop]);
    }

    # Pass 3: single byte values unused in both directions map to themselves

    foreach my $byte (0 .. 255)
    {
        next if defined($cp2uni[$byte]) || defined($uni2cp[$byte]);
        $uni2cp[$byte] = $byte;
        $cp2uni[$byte] = $byte;
    }
}
710
################################################################
# dump an array of integers
#
# Arguments: sprintf format for one value, value to print in place of
# undefined entries, then the values.  Returns them as one string, comma
# separated, with a line break after every eighth value and a leading space
# on each line.  An empty list yields a single default value.
sub DUMP_ARRAY($$@)
{
    my ($format,$default,@array) = @_;

    # the final cell is always emitted, even when the list is empty
    my $last = ($#array < 0) ? 0 : $#array;
    my @cells = map { sprintf($format, defined $array[$_] ? $array[$_] : $default) } 0 .. $last;

    my $ret = " ";
    foreach my $pos (0 .. $last - 1)
    {
        my $separator = (($pos % 8) == 7) ? ",\n " : ", ";
        $ret .= $cells[$pos] . $separator;
    }
    return $ret . $cells[$last];
}
726
################################################################
# dump an SBCS mapping table
#
# Arguments: code page number, glyph table flag, code page name, default
# code page char and default unicode char.  Prints to the already opened
# OUTPUT handle: the cp2uni array (doubled in size by the glyph variant when
# the flag is set), the uni2cp_low subtables with their uni2cp_high index,
# and the sbcs_table descriptor referencing them.
sub dump_sbcs_table($$$$$)
{
    my ($codepage, $has_glyphs, $name, $def, $defw) = @_;

    # output the ascii->unicode table

    if ($has_glyphs)
    {
        # pass a private copy: arguments are aliased through @_, so handing
        # over a slice of @cp2uni would let the callee write into @cp2uni
        my @with_glyphs = @cp2uni[0 .. 255];
        printf OUTPUT "static const WCHAR cp2uni[512] =\n";
        printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
        printf OUTPUT ",\n /* glyphs */\n%s\n};\n\n",
               DUMP_ARRAY( "0x%04x", $defw, get_glyphs_mapping(@with_glyphs) );
    }
    else
    {
        printf OUTPUT "static const WCHAR cp2uni[256] =\n";
        printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
    }

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;   # the shared defaults subtable
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # jump to the end of this 256 char block
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $def, @uni2cp[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($def) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @offsets = ();
    for (my $i = 0; $i < 256; $i++)
    {
        if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
        else { push @offsets, ($subtables-1) * 256; }  # defaults subtable comes last
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $def, $defw, $name;
    printf OUTPUT " cp2uni,\n";
    if ($has_glyphs) { printf OUTPUT " cp2uni + 256,\n"; }
    else { printf OUTPUT " cp2uni,\n"; }
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high\n};\n";
}
796
797
################################################################
# dump a DBCS mapping table
#
# Arguments: code page number, code page name, default code page char,
# default unicode char, then the lead byte ranges as a flat list.  Prints to
# the already opened OUTPUT handle: the cp2uni array (single byte table, an
# optional shared table for unused lead bytes, one table per used lead
# byte), the cp2uni_leadbytes index, the uni2cp_low subtables with their
# uni2cp_high index, and the dbcs_table descriptor referencing them.
sub dump_dbcs_table($$$$@)
{
    my ($codepage, $name, $def, $defw, @lb_ranges) = @_;

    # build a list of lead bytes that are actually used

    my @lblist = ();
    LBLOOP: for (my $y = 0; $y <= $#lead_bytes; $y++)
    {
        my $base = $lead_bytes[$y] << 8;
        for (my $x = 0; $x < 256; $x++)
        {
            if (defined $cp2uni[$base+$x])
            {
                push @lblist,$lead_bytes[$y];
                next LBLOOP;
            }
        }
    }
    my $unused = ($#lead_bytes > $#lblist);

    # output the ascii->unicode table for the single byte chars
    # (every following table starts with the separating comma, so that the
    # array is properly terminated even when no lead byte table follows)

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
    printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );

    # output the default table for unused lead bytes

    if ($unused)
    {
        printf OUTPUT ",\n /* unused lead bytes */\n";
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, ($defw) x 256 );
    }

    # output the ascii->unicode table for each DBCS lead byte

    for (my $y = 0; $y <= $#lblist; $y++)
    {
        my $base = $lblist[$y] << 8;
        printf OUTPUT ",\n /* lead byte %02x */\n", $lblist[$y];
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[$base .. $base+255] );
    }
    print OUTPUT "\n};\n\n";

    # output the lead byte subtables offsets

    my @offsets = ();
    for (my $x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
    for (my $x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        for (my $x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @offsets );

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;   # the shared defaults subtable
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # jump to the end of this 256 char block
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $y = 0; $y < 256; $y++)
    {
        next unless $filled[$y];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $def, @uni2cp[($y<<8) .. ($y<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($def) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    @offsets = ();
    for (my $y = 0; $y < 256; $y++)
    {
        if ($filled[$y]) { push @offsets, $pos; $pos += 256; }
        else { push @offsets, ($subtables-1) * 256; }  # defaults subtable comes last
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $def, $defw, $name;
    printf OUTPUT " cp2uni,\n";
    printf OUTPUT " cp2uni_leadbytes,\n";
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high,\n";
    # the range list is terminated by a 0, 0 pair
    printf OUTPUT " {\n %s\n }\n", DUMP_ARRAY( "0x%02x", 0, @lb_ranges, 0, 0 );
    printf OUTPUT "};\n";
}
905
906
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs, one pair for each run of
# consecutive byte values 0..255 that appear in @lead_bytes.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges;
    my $in_range = 0;

    foreach my $byte (0 .. 255)
    {
        my $lead = exists $is_lead{$byte};
        if ($in_range && !$lead)
        {
            # the previous byte closed the current run
            push @ranges, $byte - 1;
            $in_range = 0;
        }
        elsif (!$in_range && $lead)
        {
            push @ranges, $byte;
            $in_range = 1;
        }
    }
    # a run reaching the last byte value is closed here
    push @ranges, 0xff if $in_range;
    return @ranges;
}
930
931
################################################################
# dump the case mapping tables
#
# Argument: output file name.  Writes "<filename>.new" containing the
# lower case, upper case, digit and compatibility mapping tables, then hands
# the file name to save_file().
sub DUMP_CASE_MAPPINGS($)
{
    my $filename = shift;
    open OUTPUT, '>', "$filename.new" or die "Cannot create $filename.new: $!";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode case mappings */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"wine/unicode.h\"\n\n";

    DUMP_CASE_TABLE( "wine_casemap_lower", @tolower_table );
    DUMP_CASE_TABLE( "wine_casemap_upper", @toupper_table );
    DUMP_CASE_TABLE( "wine_digitmap", @digitmap_table );
    DUMP_CASE_TABLE( "wine_compatmap", @compatmap_table );
    # buffered write errors only show up here
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}
950
951
################################################################
# dump a case mapping table
#
# Arguments: C identifier of the table, then the mapping indexed by Unicode
# value (undefined entries mean "no mapping").  Prints to the already opened
# OUTPUT handle a WCHAR array made of a 256 entry index, a 256 entry block
# of zeroes used for unmapped ranges, and one block per 256 char range that
# holds a mapping.  Mappings are stored as 16 bit offsets from the source
# char.  Blocks are overlapped with their predecessor where the bounds
# computed below allow it.
sub DUMP_CASE_TABLE($@)
{
    my ($name,@table) = @_;

    # count the number of sub tables that contain something
    # also compute the low and upper populated bounds

    my @lowerbounds = ( 0, 0 );
    my @upperbounds = ( 0, 255 );
    my $index = 0;
    my @filled = ();
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $table[$i];
        if (!defined $filled[$i >> 8])
        {
            # first entry of a new block: 512 skips the index and defaults
            $lowerbounds[$index] = $i & 0xff;
            $upperbounds[$index] = 0xff - $lowerbounds[$index];
            $filled[$i >> 8] = $index * 256 + 512;
            $index++;
        }
        else
        {
            $upperbounds[$index-1] = 0xff - ($i & 0xff);
        }
        $table[$i] = ($table[$i] - $i) & 0xffff;
    }

    # Collapse blocks upwards if possible
    my $removed = 0;
    $index = 0;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless defined $filled[$i];
        if ($upperbounds[$index - 1] > $lowerbounds[$index])
        {
            $removed = $removed + $lowerbounds[$index];
        }
        else
        {
            $removed = $removed + $upperbounds[$index - 1];
            $lowerbounds[$index] = $upperbounds[$index - 1];
        }
        $filled[$i] = $filled[$i] - $removed;
        $index++;
    }

    # dump the table

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $index * 256 + 512 - $removed;
    printf OUTPUT "{\n /* index */\n";
    # @filled ends at the highest populated block; the slice pads the index
    # to the 256 entries that the offsets and the array size above assume
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled[0 .. 255] );
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
    $index = 0;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT ",\n /* 0x%02x%02x .. 0x%02xff */\n", $i, $lowerbounds[$index], $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0,
                                        @table[($i<<8) + $lowerbounds[$index] .. ($i<<8)+255] );
        $index++;
    }
    printf OUTPUT "\n};\n";
}
1019
1020
################################################################
# dump the ctype tables
#
# Writes "$filename.new" with a single array wine_wctype_table made of
# 256 row offsets followed by the distinct 256-entry rows of
# @category_table, then hands the result to save_file().
# Side effect: the direction values are or'ed into @category_table.
sub DUMP_CTYPE_TABLES($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    printf OUTPUT "/* Unicode ctype tables */\n" .
                  "/* Automatically generated; DO NOT EDIT!! */\n\n" .
                  "#include \"wine/unicode.h\"\n\n";

    # add the direction in the high 4 bits of the category
    foreach my $ch (0 .. 65535)
    {
        next unless defined $direction_table[$ch];
        $category_table[$ch] |= $direction_table[$ch] << 12;
    }

    # the first 256 entries are the per-row offsets, the rows follow
    my @array = (0) x 256;
    my %offset_for;    # textual form of a row => offset of its stored copy

    # try to merge table rows
    foreach my $row (0 .. 255)
    {
        my @values = @category_table[($row<<8)..($row<<8)+255];
        my $rowtxt = sprintf "%04x" x 256, @values;

        unless (defined $offset_for{$rowtxt})
        {
            # first time this row content is seen: append it
            $offset_for{$rowtxt} = scalar @array;
            push @array, @values;
        }
        $array[$row] = $offset_for{$rowtxt};
    }

    printf OUTPUT "const unsigned short wine_wctype_table[%d] =\n{\n", scalar @array;
    printf OUTPUT " /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT " /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    close OUTPUT;
    save_file($filename);
}
1065
1066
################################################################
# dump the char composition tables
#
# Writes "$filename.new" with two arrays, then hands it to save_file():
#   unicode_compose_table   - built from @compose_table, keyed by the
#                             second char of each entry
#   unicode_decompose_table - three-level lookup built from @decomp_table
#                             (256-entry index, 16-entry sub-indexes,
#                             16-char subsets of two values per char)
sub DUMP_COMPOSE_TABLES($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    printf OUTPUT "/* Unicode char composition */\n" .
                  "/* Automatically generated; DO NOT EDIT!! */\n\n" .
                  "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the (first char, mapping) pairs by second char
    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my ($first, $second, $mapping) = @$entry;
        push @{$by_second[$second]}, [ $first, $mapping ];
    }

    # the different second chars we have, in increasing order
    my @seconds = grep { defined $by_second[$_] } 0 .. 65535;
    my $count = scalar @seconds;

    # build the table of second chars and offsets; offsets are counted
    # in pairs and start right after this header and its terminator
    my $pos = $count + 1;
    my @header = ();
    foreach my $second (@seconds)
    {
        push @header, $second, $pos;
        $pos += @{$by_second[$second]};
    }
    # terminator with last position
    push @header, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT " /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @header );

    # build the table of first chars and mappings, sorted by first char

    foreach my $second (@seconds)
    {
        my @pairs = map  { @{$_}[0, 1] }
                    sort { $a->[0] <=> $b->[0] } @{$by_second[$second]};
        printf OUTPUT ",\n /* 0x%04x */\n%s", $second, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something;
    # each subset takes 32 entries, the first 32 being the null subset

    my @subset_pos = (0) x 4096;
    $pos = 16*2; # for the null subset
    foreach my $chunk (0 .. 4095)
    {
        my $base = $chunk << 4;
        next unless grep { defined $decomp_table[$base + $_] } 0 .. 15;
        $subset_pos[$chunk] = $pos;
        $pos += 16*2;
    }
    my $total = $pos;

    # now count the 256-char subsets that contain something; unused
    # ones keep pointing at offset 256 (the null sub-index)

    my @filled_idx = (256) x 256;
    $pos = 256 + 16;
    foreach my $block (0 .. 255)
    {
        my $base = $block << 4;
        next unless grep { $subset_pos[$base + $_] } 0 .. 15;
        $filled_idx[$block] = $pos;
        $pos += 16;
    }
    my $null_offset = $pos; # null mapping
    $total += $pos;

    # add the index offsets to the subsets positions

    foreach my $subset (@subset_pos)
    {
        $subset += $null_offset if $subset;
    }

    # dump the main index

    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes; empty slots point at the null mapping

    foreach my $block (0 .. 255)
    {
        next unless ($filled_idx[$block] > 256);
        my @sub_index = map { $_ || $null_offset }
                        @subset_pos[($block<<4)..($block<<4)+15];
        printf OUTPUT ",\n /* sub-index %02x */\n%s", $block,
                      DUMP_ARRAY( "0x%04x", 0, @sub_index );
    }

    # dump the 16-char subsets

    printf OUTPUT ",\n /* null mapping */\n%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    foreach my $chunk (0 .. 4095)
    {
        next unless $subset_pos[$chunk];

        # two values per char, (0, 0) for chars without decomposition
        my @values = map {
            my $decomp = $decomp_table[($chunk << 4) + $_];
            defined $decomp ? @{$decomp}[0, 1] : (0, 0);
        } 0 .. 15;

        printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n%s", $chunk, $chunk,
                      DUMP_ARRAY( "0x%04x", 0, @values );
    }

    printf OUTPUT "\n};\n";
    close OUTPUT;
    save_file($filename);
}
1207
1208
################################################################
# handle a "bestfit" Windows mapping file
#
# Reads $MAPPREFIX . $filename, fills @cp2uni, @uni2cp and @lead_bytes
# from it and writes the matching c_NNN.c file (through save_file).
#
#   $filename   - mapping file name, relative to $MAPPREFIX
#   $has_glyphs - passed through to dump_sbcs_table
#   $comment    - code page description used in the generated comments
#
# Lines understood by the parser:
#   CODEPAGE <n>                   code page number
#   CPINFO <width> 0x<def> 0x<defw>
#   MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE <count>
#                                  selects how the following
#                                  "0x.. 0x.." lines are interpreted
#   ENDCODEPAGE                    stops parsing
# Comment lines (leading ';'), blank lines and lines containing ^Z are
# skipped; anything else is a fatal error.

sub handle_bestfit_file($$$)
{
    my ($filename, $has_glyphs, $comment) = @_;
    my $state = "";
    my ($codepage, $width, $def, $defw, $count);
    my ($lb_cur, $lb_end);
    my @lb_ranges = ();

    open my $input, '<', $MAPPREFIX . $filename or die "Cannot open $filename";

    while (<$input>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # both values must be plain hex digits (the class used to read
        # "A-f", which also let G-Z and some punctuation through)
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $def = hex $2;
            $defw = hex $3;
            next;
        }
        if (/^(MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            if ($state eq "MBTABLE")
            {
                # code page char -> Unicode; the first mapping wins
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                # Unicode -> code page char; the first mapping wins
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # inclusive range of lead bytes
                my $start = hex $1;
                my $end = hex $2;
                push @lb_ranges, $start, $end;
                foreach my $lead ($start .. $end)
                {
                    push @lead_bytes, $lead;
                    $cp2uni[$lead] = 0;
                }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                # trail byte -> Unicode for the current lead byte
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # table complete: move on to the next lead byte, or
                    # expect a new range once this one is exhausted
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $input;

    # without these the output name and table type cannot be determined
    die "$filename: missing CODEPAGE or CPINFO line\n"
        unless defined $codepage && defined $width;

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s from %s (%s)\n", $output, $filename, $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    if ($width == 1) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $def, $defw ); }
    else { dump_dbcs_table( $codepage, $comment, $def, $defw, @lb_ranges ); }
    close OUTPUT;
    save_file($output);
}
1311
1312
################################################################
# read an input file and generate the corresponding .c file
#
# Arguments: code page number, mapping file name (relative to
# $MAPPREFIX; false when the data is hardcoded), glyph flag passed to
# dump_sbcs_table, and a descriptive comment.
# Resets @cp2uni, @lead_bytes and @uni2cp before loading.
sub HANDLE_FILE(@)
{
    my ($codepage, $filename, $has_glyphs, $comment) = @_;

    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    # pick the loader: two code pages have dedicated ones, "bestfit"
    # files are parsed and dumped entirely by their own handler
    if ($codepage == 20932)
    {
        READ_JIS0208_FILE($MAPPREFIX . $filename);
    }
    elsif ($codepage == 20127)
    {
        fill_20127_codepage();
    }
    elsif ($filename =~ /\/bestfit/)
    {
        handle_bestfit_file( $filename, $has_glyphs, $comment );
        return;
    }
    else
    {
        READ_FILE($MAPPREFIX . $filename);
    }

    ADD_DEFAULT_MAPPINGS();

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s from %s (%s)\n", $output, $filename || "hardcoded data", $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    if ($filename)
    {
        printf OUTPUT "/* generated from %s */\n/* DO NOT EDIT!! */\n\n", $MAPPREFIX . $filename;
    }
    else
    {
        printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    }
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # any lead byte makes this a double-byte code page
    if (@lead_bytes)
    {
        dump_dbcs_table( $codepage, $comment, $DEF_CHAR, $DEF_CHAR, get_lb_ranges() );
    }
    else
    {
        dump_sbcs_table( $codepage, $has_glyphs, $comment, $DEF_CHAR, $DEF_CHAR );
    }
    close OUTPUT;
    save_file($output);
}
1359
1360
################################################################
# save a file if modified
#
# Expects the freshly generated content in "$file.new".  If $file
# already exists with identical content, "$file.new" is removed and
# $file is left untouched; otherwise "$file.new" is renamed to $file.
# Dies if the rename fails, warns if the leftover cannot be removed.
sub save_file($)
{
    my $file = shift;

    # compare in-process: no external "cmp", and no shell that could
    # trip over spaces or metacharacters in the file name
    require File::Compare;

    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!\n";
    }
}
1375
1376
################################################################
# output the list of codepage tables into the cptable.c file
#
# Builds one extern declaration per entry of @allfiles, followed by
# the cptables[] array referencing them in the same order, and passes
# the text to REPLACE_IN_FILE for "cptable.c".
sub OUTPUT_CPTABLE()
{
    # only the code page number (first field of each entry) is needed
    my @codepages = map { $_->[0] } @allfiles;

    my @tables_decl = map { sprintf("extern union cptable cptable_%03d;\n", $_) } @codepages;

    push @tables_decl,
         sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n", scalar @allfiles);
    push @tables_decl, map { sprintf(" &cptable_%03d,\n", $_) } @codepages;
    push @tables_decl, "};";

    REPLACE_IN_FILE( "cptable.c", @tables_decl );
}
1398
################################################################
# replace the contents of a file between ### cpmap ### marks
#
#   $name - file to update
#   @data - lines to put between the marks
#
# Everything up to and including the line containing
# "### cpmap begin ###" is kept, followed by @data, an empty line, the
# line containing "### cpmap end ###" and the rest of the file.  The
# result is written to "$name.new" and installed by save_file().
# Dies if the file cannot be read or written, or if either mark is
# missing (the file would otherwise be silently mangled).

sub REPLACE_IN_FILE($@)
{
    my $name = shift;
    my @data = @_;
    my @lines = ();

    open my $in, '<', $name or die "Can't open $name";

    # copy the head of the file, begin mark included
    my $found_begin = 0;
    while (<$in>)
    {
        push @lines, $_;
        if (/\#\#\# cpmap begin \#\#\#/) { $found_begin = 1; last; }
    }
    die "$name: no '### cpmap begin ###' mark found\n" unless $found_begin;

    push @lines, @data;

    # drop the old contents up to the end mark
    my $found_end = 0;
    while (<$in>)
    {
        if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; $found_end = 1; last; }
    }
    die "$name: no '### cpmap end ###' mark found\n" unless $found_end;

    # keep whatever follows the end mark
    push @lines, <$in>;
    close $in;

    open my $out, '>', "$name.new" or die "Can't modify $name";
    print {$out} @lines;
    # buffered write errors only show up here
    close $out or die "Can't write $name.new: $!\n";
    save_file($name);
}
1424
################################################################
# main routine
#
# Loads the defaults, generates the Unicode support files, then one
# c_NNN.c file per entry of @allfiles, and finally updates cptable.c.

READ_DEFAULTS( $DEFAULTS );
DUMP_CASE_MAPPINGS( "casemap.c" );
DUMP_SORTKEYS( "collation.c", READ_SORTKEYS_FILE() );
DUMP_COMPOSE_TABLES( "compose.c" );
DUMP_CTYPE_TABLES( "wctype.c" );

# keep a named loop variable here: the handlers read their input with
# while (<INPUT>), which assigns to $_ and would overwrite the
# @allfiles entries if $_ were aliased to them
foreach my $entry (@allfiles)
{
    HANDLE_FILE( @$entry );
}

OUTPUT_CPTABLE();

exit 0;
1439
1440	# Local Variables:
1441	# compile-command: "./cpmap.pl && make -k"
1442	# End:

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Additions/WINNT/Graphics/Wine/libWine/cpmap.pl@ 33876

Download in other formats: