2003-09-26 20:55:01 +02:00
|
|
|
# uni_parse.tcl --
|
|
|
|
#
|
|
|
|
# This program parses the stringprep tables of RFC 3454 and generates the
|
|
|
|
# corresponding uni_data.c file with compressed character
|
|
|
|
# data tables. The input to this program should be rfc3454.txt
|
|
|
|
#
|
|
|
|
# Copyright (c) 1998-1999 by Scriptics Corporation.
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Modified for ejabberd by Alexey Shchepin
|
|
|
|
#
|
|
|
|
# RCS: @(#) $Id$
|
|
|
|
|
|
|
|
|
|
|
|
namespace eval uni {
    # Number of low-order bits of a code point that select an entry
    # within a page; the remaining high bits select the page.  This
    # value can be adjusted to find the best split to minimize table
    # size.  It is declared with "variable" (not "set") so that the
    # assignment always creates uni::shift, even if a global variable
    # named "shift" already exists when this script is evaluated.
    variable shift 8

    # Map from page number to page index: each entry is an index into
    # the pages table, and the list itself is indexed by page number.
    variable pMap

    # Map from page index to page info: each entry is a list of indices
    # into the groups table, and that list is indexed by the offset of
    # the character within the page.
    variable pages

    # List of character info values, indexed by group number.  It is
    # reset to the empty list by uni::buildTables and grows as new
    # distinct values are encountered.
    variable groups
}
|
|
|
|
|
2003-10-06 22:12:11 +02:00
|
|
|
# uni::getValue --
#
#	Compute the packed character-info value for one code point from
#	the tablemap, casemap and casemap2 arrays.
#
#	Bit layout of the result:
#	    bit 0	A.1 or any C.* table other than C.1.1 / C.2.1
#	    bit 1	C.1.1
#	    bit 2	C.2.1
#	    bit 3	B.1
#	    bit 4	D.1
#	    bit 5	D.2
#	    bit 6	XNP
#	    bit 7	multicase flag (code point has a casemap2 entry)
#	    bits 11+	delta: the casemap2 value when the multicase flag
#			is set, otherwise the casemap value, or 0 when
#			neither array has an entry
#
# Arguments:
#	i	Code point (integer) used as the key into the arrays.
#
# Results:
#	Returns the packed integer.  If the delta does not fit in the
#	field, a message is written to stderr and the script exits with
#	status 1 so that a calling build notices the failure.

proc uni::getValue {i} {
    variable casemap
    variable casemap2
    variable tablemap

    if {[info exists tablemap($i)]} {
        set tables $tablemap($i)
    } else {
        set tables {}
    }

    # A casemap2 entry takes precedence over a plain casemap entry.
    if {[info exists casemap2($i)]} {
        set multicase 1
        set delta $casemap2($i)
    } else {
        set multicase 0
        if {[info exists casemap($i)]} {
            set delta $casemap($i)
        } else {
            set delta 0
        }
    }

    if {abs($delta) > 0xFFFFF} {
        puts stderr "delta must be less than 22 bits wide"
        exit 1
    }

    set ac 0
    set c11 0
    set c21 0
    set b1 0
    set d1 0
    set d2 0
    set xnp 0

    # switch runs only the first matching arm, so C.1.1 and C.2.1 set
    # their own flag and never fall into the generic C.* arm.
    foreach tab $tables {
        switch -glob -- $tab {
            C.1.1 {set c11 1}
            C.2.1 {set c21 1}
            C.*   {set ac 1}
            A.1   {set ac 1}
            B.1   {set b1 1}
            D.1   {set d1 1}
            D.2   {set d2 1}
            XNP   {set xnp 1}
        }
    }

    return [expr {
        ($ac << 0) |
        ($c11 << 1) |
        ($c21 << 2) |
        ($b1 << 3) |
        ($d1 << 4) |
        ($d2 << 5) |
        ($xnp << 6) |
        ($multicase << 7) |
        ($delta << 11)
    }]
}
|
|
|
|
|
|
|
|
# uni::getGroup --
#
#	Look up a character-info value in the groups list, appending it
#	when it is not there yet.
#
# Arguments:
#	value	The value to look up (compared with lsearch -exact).
#
# Results:
#	Returns the index of the value within the groups list.

proc uni::getGroup {value} {
    variable groups

    set found [lsearch -exact $groups $value]
    if {$found >= 0} {
        return $found
    }

    # Not seen before: the new entry lands at the end of the list.
    lappend groups $value
    return [expr {[llength $groups] - 1}]
}
|
|
|
|
|
|
|
|
# uni::addPage --
#
#	Record one finished page.  Pages with identical contents share a
#	single entry in the pages list; the pages_map array remembers,
#	keyed by page contents, which index each distinct page received.
#
# Arguments:
#	info	The page contents (list of group indices).
#
# Results:
#	None.  Appends the page's index to pMap, and appends the page to
#	pages when its contents have not been seen before.

proc uni::addPage {info} {
    variable pMap
    variable pages
    variable pages_map

    if {![info exists pages_map($info)]} {
        # First occurrence: store the page and remember its index.
        set pages_map($info) [llength $pages]
        lappend pages $info
    }
    lappend pMap $pages_map($info)
    return
}
|
|
|
|
|
2005-05-07 03:21:39 +02:00
|
|
|
|
2003-09-26 20:55:01 +02:00
|
|
|
# uni::load_tables --
#
#	Scan the text of the input document for the sections delimited by
#	"----- Start Table X -----" / "----- End Table X -----" markers
#	and fill the namespace variables from the rows found in them:
#
#	    tablemap(cp)	list of table names that mention cp
#	    casemap(cp)		target - cp, for B.2 rows with one target
#	    casemap2(cp)	index into multicasemap, for B.2 rows with
#				two or three targets
#	    multicasemap	list of target lists for those rows
#
#	Table B.1 rows only add "B.1" to tablemap, table B.3 is skipped,
#	and every other table contributes single code points or ranges to
#	tablemap.  Code points above 0x10ffff are ignored.  Finally the
#	characters " & ' / : < > @ are tagged with the pseudo table XNP
#	(XMPP nodeprep prohibited).
#
#	Rows are recognised after any number of leading spaces, so the
#	parser does not depend on the exact indentation of the document
#	(RFC 3454 indents its table rows by three spaces).
#
# Arguments:
#	data	Complete text of the input file.
#
# Results:
#	None.  multicasemap is reset; the arrays are added to.

proc uni::load_tables {data} {
    variable casemap
    variable casemap2
    variable multicasemap
    variable tablemap

    set multicasemap {}
    set table ""

    foreach line [split $data \n] {
        if {$table eq ""} {
            # Outside any table: only a start marker matters.  When the
            # line is not a marker, table is left empty.
            regexp { ----- Start Table (.*) -----} $line -> table
            continue
        }
        if {[regexp { ----- End Table (.*) -----} $line]} {
            set table ""
            continue
        }

        switch -exact -- $table {
            B.1 {
                # "<cp>; ;" -- the code point maps to nothing.
                if {[regexp {^ +([[:xdigit:]]+); ;} $line -> val]} {
                    scan $val %x val
                    if {$val <= 0x10ffff} {
                        lappend tablemap($val) $table
                    }
                }
            }
            B.2 {
                # "<cp>; <t1>[ <t2>[ <t3>]];" -- one to three targets.
                if {![regexp \
                        {^ +([[:xdigit:]]+); ([[:xdigit:]]+(?: [[:xdigit:]]+){0,2});} \
                        $line -> from targets]} {
                    #puts "missed: $line"
                    continue
                }
                scan $from %x from
                set inRange [expr {$from <= 0x10ffff}]
                set codes {}
                foreach target [split $targets " "] {
                    scan $target %x code
                    lappend codes $code
                    if {$code > 0x10ffff} {
                        set inRange 0
                    }
                }
                if {!$inRange} {
                    continue
                }
                if {[llength $codes] == 1} {
                    # Simple mapping: store the signed distance.
                    set casemap($from) [expr {[lindex $codes 0] - $from}]
                } else {
                    # Multi-character mapping: store the row index.
                    set casemap2($from) [llength $multicasemap]
                    lappend multicasemap $codes
                }
            }
            B.3 {
                # Rows of table B.3 are not used.
            }
            default {
                if {[regexp {^ +([[:xdigit:]]+)-([[:xdigit:]]+)} $line \
                        -> from to]} {
                    scan $from %x from
                    scan $to %x to
                    for {set i $from} {$i <= $to && $i <= 0x10ffff} {incr i} {
                        lappend tablemap($i) $table
                    }
                } elseif {[regexp {^ +([[:xdigit:]]+)} $line -> val]} {
                    scan $val %x val
                    if {$val <= 0x10ffff} {
                        lappend tablemap($val) $table
                    }
                }
            }
        }
    }

    # XMPP nodeprep prohibited
    foreach val {22 26 27 2f 3a 3c 3e 40} {
        scan $val %x val
        lappend tablemap($val) XNP
    }
}
|
|
|
|
|
|
|
|
# uni::buildTables --
#
#	Build the pMap, pages and groups tables for every code point
#	from 0 through 0x10ffff.  Each code point is reduced to a group
#	index (getGroup of getValue); the indices are collected into
#	pages of (1 << shift) entries, and each completed page is handed
#	to addPage.
#
#	pMap, pages and groups are reset on entry.  The pages_map lookup
#	array used by addPage is cleared as well, because its entries are
#	indices into the pages list and would be stale after the reset.
#
# Arguments:
#	None.
#
# Results:
#	None.

proc uni::buildTables {} {
    variable shift
    variable pMap {}
    variable pages {}
    variable groups {}
    variable pages_map

    # Harmless when the array does not exist yet.
    array unset pages_map

    set mask [expr {(1 << $shift) - 1}]
    set info {}			;# group indices of the page being filled

    for {set i 0} {$i <= 0x10ffff} {incr i} {
        # Add the group index to the info for the current page
        lappend info [getGroup [getValue $i]]

        # The low $shift bits are the offset inside the page; when they
        # are all ones this was the last entry of the page.
        if {($i & $mask) == $mask} {
            addPage $info
            set info {}
        }
    }
    return
}
|
|
|
|
|
|
|
|
# uni::WrapList --
#
#	Private helper for uni::main: format a list of values as the body
#	of a C array initializer.  Values are separated by ", " and a new
#	output line is started as soon as the current one grows longer
#	than the given width.
#
# Arguments:
#	items	List of values to format.
#	width	Line length after which the current line is ended.
#
# Results:
#	Returns the formatted lines joined with newlines (no trailing
#	newline).  The last line is always included, even when it holds
#	nothing but the leading indent.

proc uni::WrapList {items width} {
    set lines {}
    set line " "
    set remaining [llength $items]
    foreach item $items {
        append line $item
        if {[incr remaining -1] > 0} {
            append line ", "
        }
        if {[string length $line] > $width} {
            lappend lines $line
            set line " "
        }
    }
    lappend lines $line
    return [join $lines \n]
}

# uni::main --
#
#	Entry point of the generator.  Reads the data file named by the
#	first command-line argument, builds the lookup tables and writes
#	uni_data.c into the directory named by the second argument.  A
#	short size summary is printed to stdout.
#
#	pageMap is emitted as "unsigned char" when every page index fits
#	in eight bits and as "unsigned short" otherwise, so that changing
#	the shift value cannot silently truncate page indices.
#
# Arguments:
#	None (uses the global argc, argv0 and argv).
#
# Results:
#	None.  Exits with status 1 after printing a usage message when
#	the number of arguments is not two.

proc uni::main {} {
    global argc argv0 argv
    variable pMap
    variable pages
    variable groups
    variable shift
    variable multicasemap

    if {$argc != 2} {
        puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
        exit 1
    }
    set f [open [lindex $argv 0] r]
    set data [read $f]
    close $f

    load_tables $data
    buildTables
    puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
    puts "shift = $shift, space = $size"

    # pMap entries are indices into pages, so the largest value stored
    # in pageMap is [llength $pages] - 1.
    if {[llength $pages] > 256} {
        set pageMapType "unsigned short"
    } else {
        set pageMapType "unsigned char"
    }

    # groupMap is the concatenation of all pages.
    set groupIndices {}
    foreach page $pages {
        foreach gIndex $page {
            lappend groupIndices $gIndex
        }
    }

    set groupValues {}
    foreach val $groups {
        lappend groupValues [format "%d" $val]
    }

    set f [open [file join [lindex $argv 1] uni_data.c] w]
    fconfigure $f -translation lf
    puts $f "/*
 * uni_data.c --
 *
 * Declarations of Unicode character information tables. This file is
 * automatically generated by the uni_parse.tcl script. Do not
 * modify this file by hand.
 *
 * Copyright (c) 1998 by Scriptics Corporation.
 * All rights reserved.
 *
 * Modified for ejabberd by Alexey Shchepin
 *
 * RCS: @(#) \$Id\$
 */

/*
 * A 16-bit Unicode character is split into two parts in order to index
 * into the following tables. The lower OFFSET_BITS comprise an offset
 * into a page of characters. The upper bits comprise the page number.
 */

#define OFFSET_BITS $shift

/*
 * The pageMap is indexed by page number and returns an alternate page number
 * that identifies a unique page of characters. Many Unicode characters map
 * to the same alternate page number.
 */

static $pageMapType pageMap\[\] = {"
    puts $f [WrapList $pMap 70]
    puts $f "};

/*
 * The groupMap is indexed by combining the alternate page number with
 * the page offset and returns a group number that identifies a unique
 * set of character attributes.
 */

static unsigned short int groupMap\[\] = {"
    puts $f [WrapList $groupIndices 70]
    puts $f "};

/*
 * Each group represents a unique set of character attributes. The attributes
 * are encoded into a 32-bit value as follows:
 *
 * Bit 0 A.1 | C.1.2 | C.2.2 | C.3 -- C.9
 *
 * Bit 1 C.1.1
 *
 * Bit 2 C.2.1
 *
 * Bit 3 B.1
 *
 * Bit 4 D.1
 *
 * Bit 5 D.2
 *
 * Bit 6 XNP
 *
 * Bit 7 Case maps to several characters
 *
 * Bits 8-10 Reserved for future use.
 *
 * Bits 11-31 Case delta: delta for case conversions. This should be the
 * highest field so we can easily sign extend.
 */

static int groups\[\] = {"
    puts $f [WrapList $groupValues 65]
    puts $f "};

/*
 * Table for characters that lowercased to multiple ones
 */

static int multiCaseTable\[\]\[4\] = {"
    # One row per entry: the number of targets followed by the targets.
    set remaining [llength $multicasemap]
    foreach val $multicasemap {
        set line " "
        append line [format "{%d, %s}" [llength $val] [join $val ", "]]
        if {[incr remaining -1] > 0} {
            append line ", "
        }
        puts $f $line
    }
    puts $f "};

/*
 * The following constants are used to determine the category of a
 * Unicode character.
 */

#define ACMask (1 << 0)
#define C11Mask (1 << 1)
#define C21Mask (1 << 2)
#define B1Mask (1 << 3)
#define D1Mask (1 << 4)
#define D2Mask (1 << 5)
#define XNPMask (1 << 6)
#define MCMask (1 << 7)

/*
 * The following macros extract the fields of the character info. The
 * GetDelta() macro is complicated because we can't rely on the C compiler
 * to do sign extension on right shifts.
 */

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(info) ((info) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 11) : (~(~((info)) >> 11)))
#define GetMC(info) (multiCaseTable\[GetDelta(info)\])

/*
 * This macro extracts the information about a character from the
 * Unicode character tables.
 */

#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0x1fffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
"

    close $f
}
|
|
|
|
|
|
|
|
# Run the generator: uni::main reads the command-line arguments itself.
uni::main
|
|
|
|
|
|
|
|
# Stop evaluating this script here.
return
|