width data generation

2018-03-07 23:55:52 +01:00 · 2018-03-07 23:55:52 +01:00 · 37132125bc
commit 37132125bc
parent 8e8fd6c849
5 changed files with 1388 additions and 0 deletions
--- a/newlib/libc/string/WIDTH-A
+++ b/newlib/libc/string/WIDTH-A
@ -0,0 +1,569 @@
 # UAX #11: East Asian Ambiguous
 # Plane 00
 # Rows	Positions (Cells)
  00	A1 A4 A7-A8 AA AD-AE B0-B4 B6-BA BC-BF C6 D0 D7-D8 DE-E1 E6 E8-EA
  00	EC-ED F0 F2-F3 F7-FA FC FE
  01	01 11 13 1B 26-27 2B 31-33 38 3F-42 44 48-4B 4D 52-53 66-67 6B
  01	CE D0 D2 D4 D6 D8 DA DC
  02	51 61 C4 C7 C9-CB CD D0 D8-DB DD DF
  03	00-6F 91-A1 A3-A9 B1-C1 C3-C9
  04	01 10-4F 51
  20	10 13-16 18-19 1C-1D 20-22 24-27 30 32-33 35 3B 3E 74 7F 81-84
  20	AC
  21	03 05 09 13 16 21-22 26 2B 53-54 5B-5E 60-6B 70-79 89 90-99 B8-B9
  21	D2 D4 E7
  22	00 02-03 07-08 0B 0F 11 15 1A 1D-20 23 25 27-2C 2E 34-37 3C-3D
  22	48 4C 52 60-61 64-67 6A-6B 6E-6F 82-83 86-87 95 99 A5 BF
  23	12
  24	60-E9 EB-FF
  25	00-4B 50-73 80-8F 92-95 A0-A1 A3-A9 B2-B3 B6-B7 BC-BD C0-C1 C6-C8
  25	CB CE-D1 E2-E5 EF
  26	05-06 09 0E-0F 1C 1E 40 42 60-61 63-65 67-6A 6C-6D 6F 9E-9F BF
  26	C6-CD CF-D3 D5-E1 E3 E8-E9 EB-F1 F4 F6-F9 FB-FC FE-FF
  27	3D 76-7F
  2B	56-59
  32	48-4F
  E0	00-FF
  E1	00-FF
  E2	00-FF
  E3	00-FF
  E4	00-FF
  E5	00-FF
  E6	00-FF
  E7	00-FF
  E8	00-FF
  E9	00-FF
  EA	00-FF
  EB	00-FF
  EC	00-FF
  ED	00-FF
  EE	00-FF
  EF	00-FF
  F0	00-FF
  F1	00-FF
  F2	00-FF
  F3	00-FF
  F4	00-FF
  F5	00-FF
  F6	00-FF
  F7	00-FF
  F8	00-FF
  FE	00-0F
  FF	FD
  1F1	00-0A 10-2D 30-69 70-8D 8F-90 9B-AC
  E01	00-EF
  F00	00-FF
  F01	00-FF
  F02	00-FF
  F03	00-FF
  F04	00-FF
  F05	00-FF
  F06	00-FF
  F07	00-FF
  F08	00-FF
  F09	00-FF
  F0A	00-FF
  F0B	00-FF
  F0C	00-FF
  F0D	00-FF
  F0E	00-FF
  F0F	00-FF
  F10	00-FF
  F11	00-FF
  F12	00-FF
  F13	00-FF
  F14	00-FF
  F15	00-FF
  F16	00-FF
  F17	00-FF
  F18	00-FF
  F19	00-FF
  F1A	00-FF
  F1B	00-FF
  F1C	00-FF
  F1D	00-FF
  F1E	00-FF
  F1F	00-FF
  F20	00-FF
  F21	00-FF
  F22	00-FF
  F23	00-FF
  F24	00-FF
  F25	00-FF
  F26	00-FF
  F27	00-FF
  F28	00-FF
  F29	00-FF
  F2A	00-FF
  F2B	00-FF
  F2C	00-FF
  F2D	00-FF
  F2E	00-FF
  F2F	00-FF
  F30	00-FF
  F31	00-FF
  F32	00-FF
  F33	00-FF
  F34	00-FF
  F35	00-FF
  F36	00-FF
  F37	00-FF
  F38	00-FF
  F39	00-FF
  F3A	00-FF
  F3B	00-FF
  F3C	00-FF
  F3D	00-FF
  F3E	00-FF
  F3F	00-FF
  F40	00-FF
  F41	00-FF
  F42	00-FF
  F43	00-FF
  F44	00-FF
  F45	00-FF
  F46	00-FF
  F47	00-FF
  F48	00-FF
  F49	00-FF
  F4A	00-FF
  F4B	00-FF
  F4C	00-FF
  F4D	00-FF
  F4E	00-FF
  F4F	00-FF
  F50	00-FF
  F51	00-FF
  F52	00-FF
  F53	00-FF
  F54	00-FF
  F55	00-FF
  F56	00-FF
  F57	00-FF
  F58	00-FF
  F59	00-FF
  F5A	00-FF
  F5B	00-FF
  F5C	00-FF
  F5D	00-FF
  F5E	00-FF
  F5F	00-FF
  F60	00-FF
  F61	00-FF
  F62	00-FF
  F63	00-FF
  F64	00-FF
  F65	00-FF
  F66	00-FF
  F67	00-FF
  F68	00-FF
  F69	00-FF
  F6A	00-FF
  F6B	00-FF
  F6C	00-FF
  F6D	00-FF
  F6E	00-FF
  F6F	00-FF
  F70	00-FF
  F71	00-FF
  F72	00-FF
  F73	00-FF
  F74	00-FF
  F75	00-FF
  F76	00-FF
  F77	00-FF
  F78	00-FF
  F79	00-FF
  F7A	00-FF
  F7B	00-FF
  F7C	00-FF
  F7D	00-FF
  F7E	00-FF
  F7F	00-FF
  F80	00-FF
  F81	00-FF
  F82	00-FF
  F83	00-FF
  F84	00-FF
  F85	00-FF
  F86	00-FF
  F87	00-FF
  F88	00-FF
  F89	00-FF
  F8A	00-FF
  F8B	00-FF
  F8C	00-FF
  F8D	00-FF
  F8E	00-FF
  F8F	00-FF
  F90	00-FF
  F91	00-FF
  F92	00-FF
  F93	00-FF
  F94	00-FF
  F95	00-FF
  F96	00-FF
  F97	00-FF
  F98	00-FF
  F99	00-FF
  F9A	00-FF
  F9B	00-FF
  F9C	00-FF
  F9D	00-FF
  F9E	00-FF
  F9F	00-FF
  FA0	00-FF
  FA1	00-FF
  FA2	00-FF
  FA3	00-FF
  FA4	00-FF
  FA5	00-FF
  FA6	00-FF
  FA7	00-FF
  FA8	00-FF
  FA9	00-FF
  FAA	00-FF
  FAB	00-FF
  FAC	00-FF
  FAD	00-FF
  FAE	00-FF
  FAF	00-FF
  FB0	00-FF
  FB1	00-FF
  FB2	00-FF
  FB3	00-FF
  FB4	00-FF
  FB5	00-FF
  FB6	00-FF
  FB7	00-FF
  FB8	00-FF
  FB9	00-FF
  FBA	00-FF
  FBB	00-FF
  FBC	00-FF
  FBD	00-FF
  FBE	00-FF
  FBF	00-FF
  FC0	00-FF
  FC1	00-FF
  FC2	00-FF
  FC3	00-FF
  FC4	00-FF
  FC5	00-FF
  FC6	00-FF
  FC7	00-FF
  FC8	00-FF
  FC9	00-FF
  FCA	00-FF
  FCB	00-FF
  FCC	00-FF
  FCD	00-FF
  FCE	00-FF
  FCF	00-FF
  FD0	00-FF
  FD1	00-FF
  FD2	00-FF
  FD3	00-FF
  FD4	00-FF
  FD5	00-FF
  FD6	00-FF
  FD7	00-FF
  FD8	00-FF
  FD9	00-FF
  FDA	00-FF
  FDB	00-FF
  FDC	00-FF
  FDD	00-FF
  FDE	00-FF
  FDF	00-FF
  FE0	00-FF
  FE1	00-FF
  FE2	00-FF
  FE3	00-FF
  FE4	00-FF
  FE5	00-FF
  FE6	00-FF
  FE7	00-FF
  FE8	00-FF
  FE9	00-FF
  FEA	00-FF
  FEB	00-FF
  FEC	00-FF
  FED	00-FF
  FEE	00-FF
  FEF	00-FF
  FF0	00-FF
  FF1	00-FF
  FF2	00-FF
  FF3	00-FF
  FF4	00-FF
  FF5	00-FF
  FF6	00-FF
  FF7	00-FF
  FF8	00-FF
  FF9	00-FF
  FFA	00-FF
  FFB	00-FF
  FFC	00-FF
  FFD	00-FF
  FFE	00-FF
  FFF	00-FD
  1000	00-FF
  1001	00-FF
  1002	00-FF
  1003	00-FF
  1004	00-FF
  1005	00-FF
  1006	00-FF
  1007	00-FF
  1008	00-FF
  1009	00-FF
  100A	00-FF
  100B	00-FF
  100C	00-FF
  100D	00-FF
  100E	00-FF
  100F	00-FF
  1010	00-FF
  1011	00-FF
  1012	00-FF
  1013	00-FF
  1014	00-FF
  1015	00-FF
  1016	00-FF
  1017	00-FF
  1018	00-FF
  1019	00-FF
  101A	00-FF
  101B	00-FF
  101C	00-FF
  101D	00-FF
  101E	00-FF
  101F	00-FF
  1020	00-FF
  1021	00-FF
  1022	00-FF
  1023	00-FF
  1024	00-FF
  1025	00-FF
  1026	00-FF
  1027	00-FF
  1028	00-FF
  1029	00-FF
  102A	00-FF
  102B	00-FF
  102C	00-FF
  102D	00-FF
  102E	00-FF
  102F	00-FF
  1030	00-FF
  1031	00-FF
  1032	00-FF
  1033	00-FF
  1034	00-FF
  1035	00-FF
  1036	00-FF
  1037	00-FF
  1038	00-FF
  1039	00-FF
  103A	00-FF
  103B	00-FF
  103C	00-FF
  103D	00-FF
  103E	00-FF
  103F	00-FF
  1040	00-FF
  1041	00-FF
  1042	00-FF
  1043	00-FF
  1044	00-FF
  1045	00-FF
  1046	00-FF
  1047	00-FF
  1048	00-FF
  1049	00-FF
  104A	00-FF
  104B	00-FF
  104C	00-FF
  104D	00-FF
  104E	00-FF
  104F	00-FF
  1050	00-FF
  1051	00-FF
  1052	00-FF
  1053	00-FF
  1054	00-FF
  1055	00-FF
  1056	00-FF
  1057	00-FF
  1058	00-FF
  1059	00-FF
  105A	00-FF
  105B	00-FF
  105C	00-FF
  105D	00-FF
  105E	00-FF
  105F	00-FF
  1060	00-FF
  1061	00-FF
  1062	00-FF
  1063	00-FF
  1064	00-FF
  1065	00-FF
  1066	00-FF
  1067	00-FF
  1068	00-FF
  1069	00-FF
  106A	00-FF
  106B	00-FF
  106C	00-FF
  106D	00-FF
  106E	00-FF
  106F	00-FF
  1070	00-FF
  1071	00-FF
  1072	00-FF
  1073	00-FF
  1074	00-FF
  1075	00-FF
  1076	00-FF
  1077	00-FF
  1078	00-FF
  1079	00-FF
  107A	00-FF
  107B	00-FF
  107C	00-FF
  107D	00-FF
  107E	00-FF
  107F	00-FF
  1080	00-FF
  1081	00-FF
  1082	00-FF
  1083	00-FF
  1084	00-FF
  1085	00-FF
  1086	00-FF
  1087	00-FF
  1088	00-FF
  1089	00-FF
  108A	00-FF
  108B	00-FF
  108C	00-FF
  108D	00-FF
  108E	00-FF
  108F	00-FF
  1090	00-FF
  1091	00-FF
  1092	00-FF
  1093	00-FF
  1094	00-FF
  1095	00-FF
  1096	00-FF
  1097	00-FF
  1098	00-FF
  1099	00-FF
  109A	00-FF
  109B	00-FF
  109C	00-FF
  109D	00-FF
  109E	00-FF
  109F	00-FF
  10A0	00-FF
  10A1	00-FF
  10A2	00-FF
  10A3	00-FF
  10A4	00-FF
  10A5	00-FF
  10A6	00-FF
  10A7	00-FF
  10A8	00-FF
  10A9	00-FF
  10AA	00-FF
  10AB	00-FF
  10AC	00-FF
  10AD	00-FF
  10AE	00-FF
  10AF	00-FF
  10B0	00-FF
  10B1	00-FF
  10B2	00-FF
  10B3	00-FF
  10B4	00-FF
  10B5	00-FF
  10B6	00-FF
  10B7	00-FF
  10B8	00-FF
  10B9	00-FF
  10BA	00-FF
  10BB	00-FF
  10BC	00-FF
  10BD	00-FF
  10BE	00-FF
  10BF	00-FF
  10C0	00-FF
  10C1	00-FF
  10C2	00-FF
  10C3	00-FF
  10C4	00-FF
  10C5	00-FF
  10C6	00-FF
  10C7	00-FF
  10C8	00-FF
  10C9	00-FF
  10CA	00-FF
  10CB	00-FF
  10CC	00-FF
  10CD	00-FF
  10CE	00-FF
  10CF	00-FF
  10D0	00-FF
  10D1	00-FF
  10D2	00-FF
  10D3	00-FF
  10D4	00-FF
  10D5	00-FF
  10D6	00-FF
  10D7	00-FF
  10D8	00-FF
  10D9	00-FF
  10DA	00-FF
  10DB	00-FF
  10DC	00-FF
  10DD	00-FF
  10DE	00-FF
  10DF	00-FF
  10E0	00-FF
  10E1	00-FF
  10E2	00-FF
  10E3	00-FF
  10E4	00-FF
  10E5	00-FF
  10E6	00-FF
  10E7	00-FF
  10E8	00-FF
  10E9	00-FF
  10EA	00-FF
  10EB	00-FF
  10EC	00-FF
  10ED	00-FF
  10EE	00-FF
  10EF	00-FF
  10F0	00-FF
  10F1	00-FF
  10F2	00-FF
  10F3	00-FF
  10F4	00-FF
  10F5	00-FF
  10F6	00-FF
  10F7	00-FF
  10F8	00-FF
  10F9	00-FF
  10FA	00-FF
  10FB	00-FF
  10FC	00-FF
  10FD	00-FF
  10FE	00-FF
  10FF	00-FD
--- a/newlib/libc/string/mkunidata
+++ b/newlib/libc/string/mkunidata
@ -0,0 +1,54 @@
 #! /bin/sh
 # Regenerate the Unicode width tables for newlib/libc/string/wcwidth.c:
 # combining.t and ambiguous.t (the latter via mkwidthA) and, through
 # mkwide, wide.t.
 #
 # usage: mkunidata [-u]
 #   -u	download the uniset tool and the Unicode data files first;
 #	without -u the package unicode-ucd must be listed in
 #	/etc/setup/installed.db
 echo generating Unicode width data for newlib/libc/string/wcwidth.c
 # the helper scripts and data files are used relative to this script
 cd "$(dirname "$0")" || exit 9
 PATH="$PATH":.	# ensure access to uniset tool
 #############################################################################
 # checks and (with option -u) downloads
 # fetch URL: download URL into the current directory under its remote
 # file name (-O), keeping the remote time stamp (-R); -z FILE skips the
 # transfer when the local copy is not older than the remote file
 fetch () {
 	#wget -N -t 1 --timeout=55 "$1"
 	curl -R -O --connect-timeout 55 -z "${1##*/}" "$1"
 }
 case "$1" in
 -u)
 	echo downloading uniset tool
 	fetch http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
 	gzip -dc uniset.tar.gz | tar xvf - uniset
 	echo downloading data from unicode.org
 	for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
 	do	fetch "http://unicode.org/Public/UNIDATA/$data"
 	done
 	;;
 *)	echo checking package unicode-ucd
 	grep unicode-ucd /etc/setup/installed.db || exit 9
 	;;
 esac
 echo checking uniset tool
 type uniset || exit 9
 # fall back to the system copy of every data file that is not present here
 for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
 do	test -r "$data" || ln -s "/usr/share/unicode/ucd/$data" . || exit 9
 done
 echo generating from Unicode version `sed -e 's,[^.0-9],,g' -e 1q Blocks.txt`
 #############################################################################
 # table generation
 echo generating combining characters table
 uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B +D7B0-D7C6 +D7CB-D7FB c > combining.t
 echo generating ambiguous width characters table
 sh ./mkwidthA && uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c > ambiguous.t
 echo generating wide characters table
 sh ./mkwide
 #############################################################################
 # end
--- a/newlib/libc/string/mkwide
+++ b/newlib/libc/string/mkwide
@ -0,0 +1,49 @@
 #! /bin/sh
 # generate list of wide characters, with convex closure
 #
 # Writes wide.t: the first line of EastAsianWidth.txt and of Blocks.txt
 # as // comments, followed by a C interval array (uniset "c" output) of
 # all characters with width property W or F plus every checked block
 # that contains such characters but none with property N, Na, A or H.
 # Exits with the status of the final uniset run.
 #
 # use the system copy of every data file that is not present here
 for data in EastAsianWidth.txt UnicodeData.txt Blocks.txt
 do	test -r "$data" || ln -s "/usr/share/unicode/ucd/$data" . || exit 1
 done
 # Split the property list.  "Na" has to be removed as a whole: stripping
 # only "N" would leave an "a" glued to the code point, which uniset
 # then reads as one more hex digit.
 sed -e "s,^\([^;]*\);Na,\1," -e t -e "s,^\([^;]*\);[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na
 sed -e "s,^\([^;]*\);[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw
 PATH="$PATH:." # for uniset
 # count ARGS...: print the size of the set that uniset builds from ARGS,
 # i.e. the text after the colon of its "nr" report, without blanks
 count () {
 	uniset "$@" nr | sed -e 's,.*: *,,'
 }
 nrfw=`count +wide.fw`
 echo FW $nrfw
 nrna=`count +wide.na`
 echo NAH $nrna
 # without both totals every comparison below would be meaningless
 test -n "$nrfw" && test -n "$nrna" || exit 1
 extrablocks="2E80-303E"
 # check all blocks
 # includes RANGE TAG TOTAL: succeed if removing RANGE from the set in
 # file wide.TAG changes its size from TOTAL, i.e. if that set contains
 # at least one character of RANGE
 includes () {
 	nr=`count "+wide.$2" "-$1"`
 	test "$nr" != "$3"
 }
 echo "adding compact closure of wide ranges, this may take ~10min"
 for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt`
 do	range=$b
 	echo checking $range $* >&2
 	if includes "$range" fw "$nrfw" && ! includes "$range" na "$nrna"
 	then	echo "$range"
 	fi
 done > wide.blocks
 (
 sed -e "s,^,//," -e 1q EastAsianWidth.txt
 sed -e "s,^,//," -e 1q Blocks.txt
 uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c
 ) > wide.t
 # remember the uniset result; the clean-up below must not mask it
 status=$?
 rm -f wide.na wide.fw wide.blocks
 exit $status
--- a/newlib/libc/string/mkwidthA
+++ b/newlib/libc/string/mkwidthA
@ -0,0 +1,20 @@
 #! /bin/sh
 # generate WIDTH-A file, listing Unicode characters with width property
 # Ambiguous, from EastAsianWidth.txt
 #
 # Exits non-zero, leaving an existing WIDTH-A untouched, when uniset
 # fails.
 #
 # use the system copy of every data file that is not present here
 for data in EastAsianWidth.txt UnicodeData.txt Blocks.txt
 do	test -r "$data" || ln -s "/usr/share/unicode/ucd/$data" . || exit 1
 done
 # keep only the code points (or ranges) whose property field starts with A
 sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new
 # Assemble the result under a temporary name and install it only if
 # uniset succeeded; the status of the brace group is that of uniset.
 {
 	echo "# UAX #11: East Asian Ambiguous"
 	PATH="$PATH:." uniset +width-a-new compact
 } > WIDTH-A.new
 status=$?
 if [ $status -eq 0 ]
 then	rm -f WIDTH-A
 	mv WIDTH-A.new WIDTH-A || status=1
 else	rm -f WIDTH-A.new
 fi
 rm -f width-a-new
 exit $status
--- a/newlib/libc/string/uniset
+++ b/newlib/libc/string/uniset
@ -0,0 +1,696 @@
 #!/usr/bin/perl
 # Uniset -- Unicode subset manager -- Markus Kuhn
 # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
 require 5.008;
 use open ':utf8';
 use FindBin qw($RealBin);  # to find directory where this file is located
 binmode(STDOUT, ":utf8");
 binmode(STDIN, ":utf8");
 my (%name, %invname, %category, %comment);
 # Called without any argument: print the built-in manual on standard
 # output and leave with exit status 1.
 unless (@ARGV) {
    print <<End;
 Uniset -- Unicode subset manager -- Markus Kuhn
 Uniset merges and subtracts Unicode subsets. It can output and
 analyse the resulting character set in various formats.
 Uniset understand the following command-line arguments:
 Commands to define a set of characters:
  + filename   add the character set described in the file to the set
  - filename   remove the character set described in the file from the set
  +: filename  add the characters in the UTF-8 file to the set
  -: filename  remove the characters in the UTF-8 file from the set
  +xxxx..yyyy  add the range to the set (xxxx and yyyy are hex numbers)
  -xxxx..yyyy  remove the range from the set (xxxx and yyyy are hex numbers)
  +cat=Xx      add all Unicode characters with category code Xx
  -cat=Xx      remove all Unicode characters with category code Xx
  -cat!=Xx     remove all Unicode characters without category code Xx
  clean        remove any elements that do not appear in the Unicode database
  unknown      remove any elements that do appear in the Unicode database
 Command to output descriptions of the constructed set of characters:
  table        write a full table with one line per character
  compact      output the set in compact MES format
  c            output the set as C interval array
  nr           output the number of characters
  sources      output a table that shows the number of characters contributed
               by the various combinations of input sets added with +.
  utf8-list    output a list of all characters encoded in UTF-8
 Commands to tailor the following output commands:
  html         write HTML tables instead of plain text
  ucs          add the unicode character itself to the table (UTF-8 in
               plain table, numeric character reference in HTML)
 Formats of character set input files read by the + and - command:
 Empty lines, white space at the start and end of the line and any
 comment text following a \# are ignored. The following formats are
 recognized
 xx yyyy             xx is the hex code in an 8-bit character set and yyyy
                     is the corresponding Unicode value. Both can optionally
                     be prefixed by 0x. This is the format used in the
                     files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
 yyyy                yyyy (optionally prefixed with 0x) is a Unicode character
                     belonging to the specified subset.
 yyyy-yyyy           a range of Unicode characters belonging to
 yyyy..yyyy          the specified subset.
 xx yy yy yy-yy yy   xx denotes a row (high-byte) and the yy specify
                     corresponding low bytes or with a hyphen also ranges of
                     low bytes in the Unicode values that belong to this
                     subset. This is also the format that is generated by
                     the compact command.
 End
    exit 1;
 }
 # Tell whether the ISO 10646/Unicode character code given as argument
 # falls into one of the ranges listed below, which stand for the East
 # Asian Wide (W) and East Asian FullWidth (F) categories of Unicode
 # Technical Report #11.  Returns 1 if it does and the false value of a
 # failed comparison otherwise.
 sub iswide ($) {
    my ($code) = @_;
    # nothing below U+1100 is wide; U+303F is the one hole in "CJK .. Yi"
    return !1 if $code < 0x1100 || $code == 0x303f;
    my @wide = (
        [ 0x1100,  0x115f  ],    # Hangul Jamo
        [ 0x2329,  0x2329  ],
        [ 0x232a,  0x232a  ],
        [ 0x2e80,  0xa4cf  ],    # CJK .. Yi
        [ 0xac00,  0xd7a3  ],    # Hangul Syllables
        [ 0xf900,  0xfaff  ],    # CJK Comp. Ideographs
        [ 0xfe30,  0xfe6f  ],    # CJK Comp. Forms
        [ 0xff00,  0xff60  ],    # Fullwidth Forms
        [ 0xffe0,  0xffe6  ],
        [ 0x20000, 0x2fffd ],
        [ 0x30000, 0x3fffd ],
    );
    for my $range (@wide) {
        my ($first, $last) = @$range;
        return 1 if $code >= $first && $code <= $last;
    }
    return !1;
 }
 # Jamo short names, see Unicode 3.0, table 4-4, page 86
 my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
              'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
 my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
              'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
              'EU', 'YI', 'I'); # 1161..1175
 my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
              'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
              'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
 # Return the Unicode name that belongs to a given character code.
 # Names of CJK unified ideographs and Hangul syllables are computed;
 # every other name is looked up in %name, so the result is undef for
 # a code that has no entry there.
 sub name {
    my ($ucs) = @_;
    # The intervals used here reflect Unicode Version 3.2
    if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
        ($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
        ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
        return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
    }
    if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
        # syllable index = (leading * 21 + vowel) * 28 + trailing
        my $s = $ucs - 0xac00;
        my $leading  = int($s / (21 * 28));
        my $vowel    = int(($s % (21 * 28)) / 28);
        my $trailing = $s % 28;
        # trailing index 0 means "no final consonant"; @tname starts at
        # index 1, and $tname[-1] would wrongly append 'H'
        my $final = $trailing ? $tname[$trailing - 1] : '';
        return "HANGUL SYLLABLE " .
            ($lname[$leading] . $vname[$vowel] . $final);
    }
    return $name{$ucs};
 }
 # Tell whether a character code is known: it either lies in one of the
 # ranges below, whose members are not looked up individually, or it has
 # an entry in %name.
 sub is_unicode {
    my ($code) = @_;
    # The intervals used here reflect Unicode Version 3.2
    my @implicit = (
        [ 0x3400,  0x4db5  ],
        [ 0x4e00,  0x9fa5  ],
        [ 0xac00,  0xd7a3  ],
        [ 0x20000, 0x2a6d6 ],
    );
    for my $range (@implicit) {
        return 1 if $code >= $range->[0] && $code <= $range->[1];
    }
    return exists $name{$code};
 }
 # Directories in which search_open() looks for a file that cannot be
 # opened under the name given: two uniset data directories (only if they
 # exist) and the directory of this script, unless that starts with
 # /usr/bin.
 my @search_path = grep { -d $_ }
    ("$ENV{HOME}/local/share/uniset", "/usr/share/uniset");
 push @search_path, $RealBin if $RealBin !~ m|^/usr/bin|;
 # Open file $fn with open() mode $mode and return the file handle, or
 # undef if that fails.  The name is first tried as it stands; a name
 # without a slash is then also tried in each @search_path directory.
 sub search_open {
    my ($mode, $fn) = @_;
    my @candidates = ($fn);
    push @candidates, map { "$_/$fn" } @search_path unless $fn =~ m|/|;
    for my $candidate (@candidates) {
        my $handle;
        return $handle if open($handle, $mode, $candidate);
    }
    return undef;
 }
 # output options, switched on by the html, img and ucs commands
 my $html = 0;
 my $image = 0;
 my $adducs = 0;
 my $unicodedata = "UnicodeData.txt";
 my $blockdata = "Blocks.txt";
 # read list of all Unicode names
 my $data = search_open('<', $unicodedata);
 unless ($data) {
    die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
         "Please make sure that you have downloaded the file\n" .
         "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
 }
 while (<$data>) {
    # each line must consist of 15 fields separated by semicolons, the
    # first one being the code as 4 to 8 hex digits
    if (/^([0-9A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
        # of the names in angle brackets only '<control>' is recorded
        next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
        my $ucs = hex($1);
        $name{$ucs} = $2;
        $invname{$2} = $ucs;
        $category{$ucs} = $3;
        $comment{$ucs} = $12;
    } else {
        die("Syntax error in line '$_' in file '$unicodedata'");
    }
 }
 close($data);
 # read list of all Unicode blocks
 $data = search_open('<', $blockdata);
 unless ($data) {
    die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
         "Please make sure that you have downloaded the file\n" .
         "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
 }
 my $blocks = 0;
 my (@blockstart, @blockend, @blockname);
 while (<$data>) {
    # "first..last; name"
    if (/^\s*([0-9A-F]{4,8})\s*\.\.\s*([0-9A-F]{4,8})\s*;\s*(.*)$/) {
        $blockstart[$blocks] = hex($1);
        $blockend  [$blocks] = hex($2);
        $blockname [$blocks] = $3;
        $blocks++;
    } elsif (/^\s*\#/ || /^\s*$/) {
        # ignore comments and empty lines
    } else {
        die("Syntax error in line '$_' in file '$blockdata'");
    }
 }
 close($data);
 # append a catch-all block if the list ends below U+110000
 if ($blockend[$blocks-1] < 0x110000) {
    $blockstart[$blocks] = 0x110000;
    $blockend  [$blocks] = 0x7FFFFFFF;
    $blockname [$blocks] = "Beyond Plane 16";
    $blocks++;
 }
 # process command line arguments, one command per iteration
 # (testing with defined() keeps an argument "0" or "" from silently
 # ending the loop; such an argument is rejected as unknown command)
 while (defined($_ = shift(@ARGV))) {
    if (/^html$/) {
        # later output commands write HTML
        $html = 1;
    } elsif (/^ucs$/) {
        # later table output includes the character itself
        $adducs = 1;
    } elsif (/^img$/) {
        # HTML output with a glyph image per character
        $html = 1;
        $image = 1;
    } elsif (/^template$/) {
 	# "template FILE": copy FILE to standard output, expanding the
 	# two directives handled below
 	$template = shift(@ARGV);
 	open(TEMPLATE, $template) || die("Can't open template file '$template': '$!'");
 	while (<TEMPLATE>) {
 	    if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
 		# #include "file": insert the file unchanged
 		open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
 		while (<INCLUDE>) {
 		    print $_;
 		}
 		close(INCLUDE);
 	    } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
 		# #quote "file": insert the file with & and < replaced
 		# by their HTML entities
 		open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
 		while (<INCLUDE>) {
 		    s/&/&amp;/g;
 		    s/</&lt;/g;
 		    print $_;
 		}
 		close(INCLUDE);
 	    } else {
 		# any other line is passed through
 		print $_;
 	    }
 	}
 	close(TEMPLATE);
    } elsif (/^\+cat=(.+)$/) {
        # add characters with given category; the source tag of a
        # character that is in the set already is overwritten
        my $wanted = $1;
        for my $code (grep { $category{$_} eq $wanted } keys %category) {
            $used{$code} = "[${wanted}]";
        }
    } elsif (/^\-cat=(.+)$/) {
        # remove characters with given category
        my $unwanted = $1;
        delete @used{ grep { $category{$_} eq $unwanted } keys %category };
    } elsif (/^\-cat!=(.+)$/) {
        # remove characters without given category; only characters
        # that have an entry in %category are looked at
        my $kept = $1;
        delete @used{ grep { $category{$_} ne $kept } keys %category };
    } elsif (/^([+-]):(.*)/) {
        # "+:file" / "-:file" (the name may also follow as the next
        # argument): add or remove every character found in the file,
        # line terminators included
        $remove = $1 eq "-";
        $setfile = $2;
        $setfile = shift(@ARGV) if $setfile eq "";
        push(@SETS, $setfile);
        # three-argument open: the name is never taken as a mode or pipe
        open(my $set, '<', $setfile) || die("Can't open set file '$setfile': '$!'");
        $setname = $setfile;
        while (my $line = <$set>) {
            # go through the characters one by one; consuming the line
            # while it is "true" would stop in front of a final "0"
            for my $char (split(//, $line)) {
                my $code = ord($char);
                if ($remove) {
                    delete $used{$code};
                } else {
                    $used{$code} .= "[${setname}]";
                }
            }
        }
        close $set;
    } elsif (/^([+-])(.*)/) {
 	# "+xxx" / "-xxx": xxx is a hex code, a hex range, or the name of
 	# a set file in one of the formats recognized below
 	$remove = $1 eq "-";
 	$setfile = $2;
 	# a single hex code is treated as a range of one element
 	$setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
 	if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
 	    # handle interval specification on command line
 	    $first = hex($1);
 	    $last = hex($3);
 	    for ($i = $first; $i <= $last; $i++) {
 		$used{$i} .= "[ARG]" unless $remove;
 		delete $used{$i} if $remove;
 	    }
 	    next;
 	}
 	# "+ file": the file name is the next argument
 	$setfile = shift(@ARGV) if $setfile eq "";
 	push(@SETS, $setfile);
 	my $setf = search_open('<', $setfile);
 	die("Can't open set file '$setfile': '$!'") unless $setf;
 	$cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
 	# the source tag is the file name up to its first dot
 	$setname = $setfile;
 	$setname =~ s/([^.\[\]]*)\..*/$1/;
 	while (<$setf>) {
 	    if (/^<code_set_name>/) {
 		# handle ISO 15897 (POSIX registry) charset mapping format;
 		# the inner loop reads the rest of the file
 		undef $comment_char;
 		undef $escape_char;
 		while (<$setf>) {
 		    # NOTE(review): $comment_char is interpolated into the
 		    # pattern unquoted - confirm that is intended
 		    if ($comment_char && /^$comment_char/) {
 			# remove comments
 			$_ = $`;
 		    }
 		    next if (/^\032?\s*$/);                                             # skip empty lines
 		    if (/^<comment_char> (\S)$/) {
 			$comment_char = $1;
 		    } elsif (/^<escape_char> (\S)$/) {
 			$escape_char = $1;
 		    } elsif (/^(END )?CHARMAP$/) {
 			#ignore
 		    } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
 			# <name> /xHH <Uxxxx>: byte HH maps to U+xxxx
 			$used{hex($2)} .= "[${setname}{$1}]" unless $remove;
 			delete $used{hex($2)} if $remove;
 		    } else {
 			die("Syntax error in line $. in file '$setfile':\n'$_'\n");
 		    }
 		}
 		next;
 	    } elsif (/^STARTFONT /) {
 		# handle X11 BDF file: the number after ENCODING is read
 		# as a decimal character code
 		while (<$setf>) {
 		    if (/^ENCODING\s+([0-9]+)/) {
 			$used{$1} .= "[${setname}]" unless $remove;
 			delete $used{$1} if $remove;
 		    }
 		}
 		next;
 	    }
 	    tr/a-z/A-Z/;           # make input uppercase
 	    if ($cedf) {
 		# the first four lines of a CEDF file are skipped
 		if ($. > 4) {
 		    if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
 			# handle Kosta Kosti's trans CEDF format
 			# bytes 00-1F and 7F-9F are not recorded
 			next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
 			# the character is given by name, looked up in %invname
 			$ucs = $invname{$2};
 			die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
 			$used{$ucs} .= "[${setname}{$1}]" unless $remove;
 			delete $used{$ucs} if $remove;
 		    } else {
 			die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
 		    }
 		}
 		next;
 	    }
 	    if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
 		# ignore ftp.unicode.org mapping file lines with #UNDEFINED
 		next;
 	    }
 	    s/^([^\#]*)\#.*$/$1/;  # remove comments
 	    next if (/^\032?\s*$/);     # skip empty lines
 	    # the order of the following tests matters: the first
 	    # pattern that matches decides how the line is read
 	    # NOTE(review): the class [0-9A-F-] below also accepts a
 	    # hyphen in the two-character code - confirm that is intended
 	    if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 		# handle entry from a ftp.unicode.org mapping file
 		$used{hex($4)} .= "[${setname}{$2}]" unless $remove;
 		delete $used{hex($4)} if $remove;
 	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 		# handle interval specification
 		$first = hex($2);
 		$last = hex($5);
 		for ($i = $first; $i <= $last; $i++) {
 		    $used{$i} .= "[${setname}]" unless $remove;
 		    delete $used{$i} if $remove;
 		}
 	    } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
 		# handle lines from P10 MES draft: a row number followed
 		# by cells or cell ranges; row and cell are concatenated
 		# to form the hex character code
 		$row = $1;
 		$cols = $_;
 		$cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
 		$cols =~ tr/,//d;
 		@cols = split(/\s+/, $cols);
 		for (@cols) {
 		    if (/^(..)$/) {
 			$first = hex("$row$1");
 			$last  = $first;
 		    } elsif (/^(..)-(..)$/) {
 			$first = hex("$row$1");
 			$last  = hex("$row$2");
 		    } else {
 			die ("this should never happen '$_'");
 		    }
 		    for ($i = $first; $i <= $last; $i++) {
 			$used{$i} .= "[${setname}]" unless $remove;
 			delete $used{$i} if $remove;
 		    }
 		}
 	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
 		# handle single character
 		$used{hex($2)} .= "[${setname}]" unless $remove;
 		delete $used{hex($2)} if $remove;
 	    } else {
 		die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
 	    }
 	}
 	close $setf;
    } elsif (/^loadimages$/ || /^loadbigimages$/) {
 	# start a download of the glyph image of every character in the
 	# set, then exit; the two commands differ in the directory used
 	if (/^loadimages$/) {
 	    $prefix = "Small.Glyphs";
 	} else {
 	    $prefix = "Glyphs";
 	}
 	# number of characters to fetch, for the progress display
 	$total = 0;
 	for $i (keys(%used)) {
 	    next if ($name{$i} eq "<control>");
 	    $total++;
 	}
 	$count = 0;
 	$| = 1;    # unbuffered output, so that the progress line shows up
 	for $i (sort({$a <=> $b} keys(%used))) {
 	    next if ($name{$i} eq "<control>");
 	    $count++;
 	    $j = sprintf("%04X", $i);
 	    # $1 = first two hex digits, used as subdirectory name
 	    $j =~ /(..)(..)/;
 	    $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
 	    print("\r$count/$total: $gif");
 	    # run webcopy in the background inside $prefix/$1 ...
 	    system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
 	    # ... and wait 0.2 seconds before starting the next one
 	    select(undef, undef, undef, 0.2);
 	}
 	print("\n");
 	exit 0;
    } elsif (/^giftable/) {
 	# form a table of glyphs (requires pbmtools installed)
 	# result: table.gif, built from the images below Small.Glyphs/
 	$count = 0;
 	for $i (keys(%used)) {
 	    $count++ unless $name{$i} eq "<control>";
 	}
 	# default number of columns: sqrt(count / sqrt(2)), rounded;
 	# "giftableN" asks for N columns instead
 	$width = int(sqrt($count/sqrt(2)) + 0.5);
 	$width = $1 if /^giftable([0-9]+)$/;
 	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
 	$col = 0;
 	$row = 0;
 	for $i (sort({$a <=> $b} keys(%used))) {
 	    next if ($name{$i} eq "<control>");
 	    $j = sprintf("%04X", $i);
 	    # $1 = first two hex digits, the subdirectory of the image
 	    $j =~ /(..)(..)/;
 	    $gif = "Small.Glyphs/$1/U$j.gif";
 	    # one temporary file per column, named by column number
 	    $pnm = sprintf("tmp-%02x.pnm", $col);
 	    $fallback = "Small.Glyphs/FF/UFFFD.gif";
 	    # convert the image; if that fails, use the U+FFFD image
 	    system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
 	    if (++$col == $width) {
 		# row complete: join its cells left to right and
 		# append the row below the table built so far
 		system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 		if ($row == 0) {
 		    system("mv tmp-row.pnm table.pnm");
 		} else {
 		    system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
 		}
 		$row++;
 		$col = 0;
 		system("rm -f tmp-*.pnm table.pnm~");
 	    }
 	}
 	if ($col > 0) {
 	    # incomplete last row: appended left-aligned on black
 	    system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 	    if ($row == 0) {
 		system("mv tmp-row.pnm table.pnm");
 	    } else {
 		system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
 	    }
 	}
 	system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
 	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
    } elsif (/^table$/) {
        # one line (or HTML table row) per character of the set, in
        # ascending order; control characters are left out
        print "<TABLE border=2>\n" if $html;
        for my $code (sort { $a <=> $b } keys %used) {
            next if $name{$code} eq "<control>";
            unless ($html) {
                # plain text: hex code, optionally the character, name
                printf("%04X \# ", $code);
                print pack("U", $code) . " " if $adducs;
                print name($code) ."\n";
                next;
            }
            # turn the source tags "[a{xx}][b]" into "a<SUB>xx</SUB>, b"
            (my $origin = $used{$code}) =~ s/\]\[/, /g;
            $origin =~ s/^\[//g;
            $origin =~ s/\]$//g;
            $origin =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
            my $hex = sprintf("%04X", $code);
            # images are filed under the first two hex digits
            my ($dir) = $hex =~ /(..)(..)/;
            my $picture = "Small.Glyphs/$dir/U$hex.gif";
            print "<TR>";
            print "<TD><img width=32 height=32 src=\"$picture\">" if $image;
            printf("<TD>&#%d;", $code) if $adducs;
            print "<TD><SAMP>$hex</SAMP><TD><SAMP>" . name($code);
            print " ($comment{$code})" if $comment{$code};
            print "</SAMP><TD><SMALL>$origin</SMALL>\n";
        }
        print "</TABLE>\n" if $html;
    } elsif (/^imgblock([0-9]*)$/) {
        # HTML table of glyph images: every row of images is followed
        # by a row showing the hex codes.  "imgblock" puts 16 images
        # into a row, "imgblockN" puts N (like "giftableN").
        my $per_row = ($1 ne '' && $1 > 0) ? $1 : 16;
        my $filled = 0;     # images in the current row so far
        my $codes = "";     # cells of the pending hex code row
        print "\n<P><TABLE cellspacing=0 cellpadding=0>";
        for my $code (sort { $a <=> $b } keys %used) {
            print "<TR>" if $filled == 0;
            my $hex = sprintf("%04X", $code);
            # images are filed under the first two hex digits
            my ($dir) = $hex =~ /(..)(..)/;
            my $picture = "Small.Glyphs/$dir/U$hex.gif";
            my $label = name($code);
            print "<TD><img width=32 height=32 src=\"$picture\" alt=\"$label\">";
            $codes .= "<TD><SMALL><SAMP>$hex</SAMP></SMALL>";
            if (++$filled == $per_row) {
                print "<TR align=center>$codes";
                $filled = 0;
                $codes = "";
            }
        }
        # hex codes of an incomplete last row
        print "<TR align=center>$codes" if ($filled > 0);
        print "</TABLE>\n";
    } elsif (/^sources$/) {
        # count how many characters are attributed to the various source set combinations
        my %contribs;    # fresh for every "sources" command
        for my $code (keys %used) {
            next if $name{$code} eq "<control>";
            # "[a{xx}][b]" becomes "a, b"
            (my $origin = $used{$code}) =~ s/\]\[/, /g;
            $origin =~ s/^\[//g;
            $origin =~ s/\]$//g;
            $origin =~ s/\{(..)\}//g;
            $contribs{$origin} += 1;
        }
        print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
        # plain text uses comment lines, like the "nr" command
        print "# Characters per combination of source sets:\n" unless $html;
        # sorted, so that the output does not depend on hash order
        for my $combination (sort keys %contribs) {
            if ($html) {
                print "<TR><TD>$contribs{$combination}<TD>$combination\n";
            } else {
                print "# $contribs{$combination}\t$combination\n";
            }
        }
        print "</TABLE>\n" if $html;
    } elsif (/^compact$/) {
 	# print compact table in P10 MES format: one line per row
 	# (character code >> 8) listing the cells (low bytes) of that row,
 	# consecutive cells collapsed into "first-last"
 	print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
 	print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
 	print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
 	# state: row being printed, first and last cell of the run of
 	# consecutive cells that is currently open ('' = none)
 	$current_row = '';
 	$start_col = '';
 	$last_col = '';
 	for $i (sort({$a <=> $b} keys(%used))) {
 	    next if ($name{$i} eq "<control>");
 	    $row = sprintf("%02X", $i >> 8);
 	    $col = sprintf("%02X", $i & 0xff);
 	    if ($row ne $current_row) {
 		# new row: finish the open run of the previous row first
 		if (($last_col ne '') and ($last_col ne $start_col)) {
 		    print "-$last_col";
 		    print "</SAMP>" if $html;
 		}
 		print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
 		print "\n  $row\t" unless $html;
 		# $len counts the characters printed after the row label
 		$len = 0;
 		$current_row = $row;
 		$start_col = '';
 	    }
 	    if ($start_col eq '') {
 		# first cell of the row opens a run
 		print "$col";
 		$len += 2;
 		$start_col = $col;
 		$last_col = $col;
 	    } elsif (hex($col) == hex($last_col) + 1) {
 		# cell extends the open run; nothing is printed yet
 		$last_col = $col;
 	    } else {
 		# gap: close the open run and start a new one
 		if ($last_col ne $start_col) {
 		    print "-$last_col";
 		    $len += 3;
 		}
 		# plain text: continue a long row on a new line that
 		# repeats the row label
 		if ($len > 60 && !$html) {
 		    print "\n  $row\t";
 		    $len = 0;
 		};
 		print " " if $len;
 		print "$col";
 		# two characters for the cell, one more for the blank
 		$len += 2 + !! $len;
 		$start_col = $col;
 		$last_col = $col;
 	    }
 	}
 	# finish the run that is still open after the last character
 	if (($last_col ne '') and ($last_col ne $start_col)) {
 	    print "-$last_col";
 	    print "</SAMP>" if $html;
 	}
 	print "\n" if ($current_row ne '');
 	print "</TABLE>\n" if $html;
 	print "\n";
    } elsif (/^c$/) {
        # print table as C interval array: merge the sorted character
        # codes into runs of consecutive values first, then write the
        # runs as "{ first, last }" pairs, three to a line
        my @runs;
        for my $code (sort { $a <=> $b } keys %used) {
            next if $name{$code} eq "<control>";
            if (@runs && $code == $runs[-1][1] + 1) {
                $runs[-1][1] = $code;
            } else {
                push @runs, [ $code, $code ];
            }
        }
        print "{";
        for my $n (0 .. $#runs) {
            print "\n " if $n % 3 == 0;
            printf(" { 0x%04X, ", $runs[$n][0]);
            printf("0x%04X }", $runs[$n][1]);
            # no comma after the last pair
            print "," if $n < $#runs;
        }
        print "\n};\n";
    } elsif (/^utf8-list$/) {
        # print the characters themselves, in ascending order, under a
        # heading per block, starting a new line once 64 columns are used
        my %carrier = (
            'Mn' => "\x{25CC}",  # non-spacing character: prefix U+25CC DOTTED CIRCLE
            'Me' => " ",         # enclosing non-spacing character: prefix a space
        );
        my ($column, $blk, $previous) = (0, 0, -1);
        for my $code (sort { $a <=> $b } keys %used) {
            next if $name{$code} eq "<control>";
            # move on to the first block that does not end before $code
            # (at most to the last block of the list)
            $blk++ while $blockend[$blk] < $code && $blk < $blocks - 1;
            if ($previous <= $blockend[$blk-1] && $code < $blockstart[$blk]) {
                # $code lies in front of block $blk: heading for the gap
                print "\n" if $column;
                printf "\nFree block (U+%04X-U+%04X):\n\n",
                    $blockend[$blk-1] + 1, $blockstart[$blk] - 1;
                $column = 0;
            }
            if ($previous < $blockstart[$blk] && $code >= $blockstart[$blk]) {
                # first character printed from block $blk: block heading
                print "\n" if $column;
                printf "\n$blockname[$blk] (U+%04X-U+%04X):\n\n",
                    $blockstart[$blk], $blockend[$blk];
                $column = 0;
            }
            my $cat = $category{$code};
            print $carrier{$cat} if defined($cat) && exists $carrier{$cat};
            print pack("U", $code);
            # a wide character takes two columns
            $column += 1 + iswide($code);
            if ($column >= 64) {
                print "\n";
                $column = 0;
            }
            $previous = $code;
        }
        print "\n" if $column;
    } elsif (/^collections$/) {
        # list the blocks in which characters of the set are found: the
        # block name, padded with blanks to 40 columns, and its range
        my ($blk, $previous) = (0, -1);
        for my $code (sort { $a <=> $b } keys %used) {
            next if $name{$code} eq "<control>";
            # move on to the first block that does not end before $code
            # (at most to the last block of the list)
            $blk++ while $blockend[$blk] < $code && $blk < $blocks - 1;
            if ($previous < $blockstart[$blk] && $code >= $blockstart[$blk]) {
                # first character seen inside block $blk
                my $title = $blockname[$blk];
                print $title, " " x (40 - length($title));
                printf "%04X-%04X\n", $blockstart[$blk], $blockend[$blk];
            }
            $previous = $code;
        }
    } elsif (/^nr$/) {
        # print the size of the set, control characters not counted; the
        # line is an HTML paragraph or, in plain text, a "#" comment
        my $total = grep { $name{$_} ne "<control>" } keys %used;
        print $html ? "<P>" : "# ";
        print "Number of characters in above table: ", $total, "\n";
    } elsif (/^clean$/) {
        # remove characters from set that are not in $unicodedata
        delete @used{ grep { !is_unicode($_) } keys %used };
    } elsif (/^unknown$/) {
        # remove characters from set that are in $unicodedata
        delete @used{ grep { is_unicode($_) } keys %used };
    } else {
        # nothing above recognized the argument
        die("Unknown command line command '$_'");
    };
 }