* src/stringprep/stringprep_drv.c: Added check for bidi

* src/stringprep/uni_parse.tcl: Now handle all Unicode code points
up to U+10FFFF
* src/stringprep/uni_parse2.tcl: Likewise
* src/stringprep/uni_data.c: Regenerated
* src/stringprep/uni_norm.c: Likewise

SVN Revision: 343
This commit is contained in:
Alexey Shchepin 2005-05-07 01:21:39 +00:00
parent 1a4c851c7c
commit 4eca697b2d
6 changed files with 2907 additions and 1282 deletions

View File

@ -1,3 +1,13 @@
2005-05-07 Alexey Shchepin <alexey@sevcom.net>
* src/stringprep/stringprep_drv.c: Added check for bidi
* src/stringprep/uni_parse.tcl: Now handle all Unicode code points
up to U+10FFFF
* src/stringprep/uni_parse2.tcl: Likewise
* src/stringprep/uni_data.c: Regenerated
* src/stringprep/uni_norm.c: Likewise
2005-05-06 Alexey Shchepin <alexey@sevcom.net>
* src/stringprep/uni_norm.c: Regenerated with Unicode 3.2 tables

View File

@ -60,7 +60,7 @@ static void canonical_ordering(int *str, int len)
next = GetUniCharCClass(str[i + 1]);
if (next != 0 && last > next)
{
for(j = i; j >= 0; j--)
for (j = i; j >= 0; j--)
{
if (GetUniCharCClass(str[j]) <= next)
break;
@ -118,23 +118,23 @@ static int compose(int ch1, int ch2)
#define ADD_UCHAR(ruc) \
if(ruc < 0x80) { \
if(pos >= size) { \
if (ruc <= 0x7F) { \
if (pos >= size) { \
size = 2*size + 1; \
rstring = driver_realloc(rstring, size); \
} \
rstring[pos] = (char) ruc; \
pos++; \
} else if(ruc < 0x7FF) { \
if(pos + 1 >= size) { \
} else if (ruc <= 0x7FF) { \
if (pos + 1 >= size) { \
size = 2*size + 2; \
rstring = driver_realloc(rstring, size); \
} \
rstring[pos] = (char) ((ruc >> 6) | 0xC0); \
rstring[pos+1] = (char) ((ruc | 0x80) & 0xBF); \
pos += 2; \
} else if(ruc < 0xFFFF) { \
if(pos + 2 >= size) { \
} else if (ruc <= 0xFFFF) { \
if (pos + 2 >= size) { \
size = 2*size + 3; \
rstring = driver_realloc(rstring, size); \
} \
@ -142,10 +142,20 @@ static int compose(int ch1, int ch2)
rstring[pos+1] = (char) (((ruc >> 6) | 0x80) & 0xBF); \
rstring[pos+2] = (char) ((ruc | 0x80) & 0xBF); \
pos += 3; \
} else if (ruc <= 0x1FFFFF) { \
if (pos + 2 >= size) { \
size = 2*size + 4; \
rstring = driver_realloc(rstring, size); \
} \
rstring[pos] = (char) ((ruc >> 18) | 0xF0); \
rstring[pos+1] = (char) (((ruc >> 12) | 0x80) & 0xBF); \
rstring[pos+2] = (char) (((ruc >> 6) | 0x80) & 0xBF); \
rstring[pos+3] = (char) ((ruc | 0x80) & 0xBF); \
pos += 4; \
}
#define ADD_UCHAR32(str, pos, len, ch) \
if(pos >= len) { \
if (pos >= len) { \
len = 2*len + 1; \
str = driver_realloc(str, len * sizeof(int)); \
} \
@ -155,10 +165,10 @@ static int compose(int ch1, int ch2)
#define ADD_DECOMP(ruc) \
info = GetUniCharDecompInfo(ruc); \
if(info >= 0) { \
if (info >= 0) { \
decomp_len = GetDecompLen(info); \
decomp_shift = GetDecompShift(info); \
for(j = 0; j < decomp_len; j++) { \
for (j = 0; j < decomp_len; j++) { \
ADD_UCHAR32(str32, str32pos, str32len, \
decompList[decomp_shift + j]); \
} \
@ -188,6 +198,7 @@ static int stringprep_erl_control(ErlDrvData drv_data,
int comp_pos, comp_starter_pos;
int cclass_prev, cclass2;
int ch1, ch2;
int first_ral, last_ral, have_ral, have_l;
size = len + 1;
@ -221,35 +232,50 @@ static int stringprep_erl_control(ErlDrvData drv_data,
break;
}
for(i = 0; i < len; i++)
for (i = 0; i < len; i++)
{
c = buf[i];
if(c < 0x80) {
if (c < 0x80) {
uc = c;
} else if(c < 0xC0) {
} else if (c < 0xC0) {
bad = 1;
} else if(c < 0xE0) {
if(i+1 < len && (buf[i+1] & 0xC0) == 0x80) {
} else if (c < 0xE0) {
if (i+1 < len && (buf[i+1] & 0xC0) == 0x80) {
uc = ((c & 0x1F) << 6) | (buf[i+1] & 0x3F);
i++;
} else {
bad = 1;
}
} else if(c < 0xF0) {
if(i+2 < len && (buf[i+1] & 0xC0) == 0x80 &&
(buf[i+2] & 0xC0) == 0x80) {
uc = ((c & 0x0F) << 12) | ((buf[i+1] & 0x3F) << 6)
} else if (c < 0xF0) {
if (i+2 < len && (buf[i+1] & 0xC0) == 0x80 &&
(buf[i+2] & 0xC0) == 0x80) {
uc = ((c & 0x0F) << 12)
| ((buf[i+1] & 0x3F) << 6)
| (buf[i+2] & 0x3F);
i += 2;
} else {
bad = 1;
}
} else if (c < 0xF8) {
if (i+3 < len &&
(buf[i+1] & 0xC0) == 0x80 &&
(buf[i+2] & 0xC0) == 0x80 &&
(buf[i+3] & 0xC0) == 0x80) {
uc = ((c & 0x07) << 18)
| ((buf[i+1] & 0x3F) << 12)
| ((buf[i+2] & 0x3F) << 6)
| (buf[i+3] & 0x3F);
i += 3;
if (uc > 0x10FFFF)
bad = 1;
} else {
bad = 1;
}
} else {
// TODO
bad = 1;
}
if(bad) {
if (bad) {
*rbuf = rstring;
driver_free(str32);
return 1;
@ -257,16 +283,16 @@ static int stringprep_erl_control(ErlDrvData drv_data,
info = GetUniCharInfo(uc);
if(!(info & B1Mask))
if (!(info & B1Mask))
{
if(tolower) {
if(!(info & MCMask))
if (tolower) {
if (!(info & MCMask))
{
ruc = uc + GetDelta(info);
ADD_DECOMP(ruc);
} else {
mc = GetMC(info);
for(j = 1; j <= mc[0]; j++) {
for (j = 1; j <= mc[0]; j++) {
ruc = mc[j];
ADD_DECOMP(ruc);
}
@ -313,18 +339,30 @@ static int stringprep_erl_control(ErlDrvData drv_data,
str32[comp_starter_pos] = ch1;
str32pos = comp_pos;
for(i = 0; i < str32pos; i++)
last_ral = have_ral = have_l = 0;
info = GetUniCharInfo(str32[0]);
first_ral = info & D1Mask;
for (i = 0; i < str32pos; i++)
{
ruc = str32[i];
info = GetUniCharInfo(ruc);
if(info & prohibit) {
if (info & prohibit) {
*rbuf = rstring;
driver_free(str32);
return 1;
}
last_ral = info & D1Mask;
have_ral = have_ral || last_ral;
have_l = info & D2Mask;
ADD_UCHAR(ruc);
}
if (have_ral && (!first_ral || !last_ral || have_l)) {
*rbuf = rstring;
driver_free(str32);
return 1;
}
rstring[0] = 1;
*rbuf = rstring;
driver_free(str32);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,7 @@
namespace eval uni {
set shift 5; # number of bits of data within a page
set shift 8; # number of bits of data within a page
# This value can be adjusted to find the
# best split to minimize table size
@ -33,13 +33,27 @@ proc uni::getValue {i} {
variable casemap2
variable tablemap
set tables $tablemap($i)
if {[info exists tablemap($i)]} {
set tables $tablemap($i)
} else {
set tables {}
}
if {[info exists casemap2($i)]} {
set multicase 1
set delta $casemap2($i)
} else {
set multicase 0
set delta $casemap($i)
if {[info exists casemap($i)]} {
set delta $casemap($i)
} else {
set delta 0
}
}
if {abs($delta) > 0xFFFFF} {
puts "delta must be less than 22 bits wide"
exit
}
set ac 0
@ -71,7 +85,7 @@ proc uni::getValue {i} {
($d2 << 5) |
($xnp << 6) |
($multicase << 7) |
($delta << 16)}]
($delta << 11)}]
return $val
}
@ -90,27 +104,26 @@ proc uni::getGroup {value} {
proc uni::addPage {info} {
variable pMap
variable pages
variable pages_map
set pIndex [lsearch -exact $pages $info]
if {$pIndex == -1} {
if {[info exists pages_map($info)]} {
lappend pMap $pages_map($info)
} else {
set pIndex [llength $pages]
lappend pages $info
set pages_map($info) $pIndex
lappend pMap $pIndex
}
lappend pMap $pIndex
return
}
proc uni::load_tables {data} {
variable casemap
variable casemap2
variable multicasemap
variable tablemap
for {set i 0} {$i <= 0xffff} {incr i} {
set casemap($i) 0
set tablemap($i) {}
}
set multicasemap {}
set table ""
@ -127,7 +140,7 @@ proc uni::load_tables {data} {
if {[regexp {^ ([[:xdigit:]]+); ;} $line \
temp val]} {
scan $val %x val
if {$val <= 0xffff} {
if {$val <= 0x10ffff} {
lappend tablemap($val) $table
}
}
@ -136,7 +149,7 @@ proc uni::load_tables {data} {
temp from to]} {
scan $from %x from
scan $to %x to
if {$from <= 0xffff && $to <= 0xffff} {
if {$from <= 0x10ffff && $to <= 0x10ffff} {
set casemap($from) [expr {$to - $from}]
}
} elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \
@ -144,8 +157,8 @@ proc uni::load_tables {data} {
scan $from %x from
scan $to1 %x to1
scan $to2 %x to2
if {$from <= 0xffff && \
$to1 <= 0xffff && $to2 <= 0xffff} {
if {$from <= 0x10ffff && \
$to1 <= 0x10ffff && $to2 <= 0x10ffff} {
set casemap2($from) [llength $multicasemap]
lappend multicasemap [list $to1 $to2]
}
@ -155,9 +168,9 @@ proc uni::load_tables {data} {
scan $to1 %x to1
scan $to2 %x to2
scan $to3 %x to3
if {$from <= 0xffff && \
$to1 <= 0xffff && $to2 <= 0xffff && \
$to3 <= 0xffff} {
if {$from <= 0x10ffff && \
$to1 <= 0x10ffff && $to2 <= 0x10ffff && \
$to3 <= 0x10ffff} {
set casemap2($from) [llength $multicasemap]
lappend multicasemap [list $to1 $to2 $to3]
}
@ -170,13 +183,13 @@ proc uni::load_tables {data} {
temp from to]} {
scan $from %x from
scan $to %x to
for {set i $from} {$i <= $to && $i <= 0xffff} {incr i} {
for {set i $from} {$i <= $to && $i <= 0x10ffff} {incr i} {
lappend tablemap($i) $table
}
} elseif {[regexp {^ ([[:xdigit:]]+)} $line \
temp val]} {
scan $val %x val
if {$val <= 0xffff} {
if {$val <= 0x10ffff} {
lappend tablemap($val) $table
}
}
@ -207,7 +220,7 @@ proc uni::buildTables {} {
set next 0
for {set i 0} {$i <= 0xffff} {incr i} {
for {set i 0} {$i <= 0x10ffff} {incr i} {
set gIndex [getGroup [getValue $i]]
# Split character index into offset and page number
@ -246,7 +259,7 @@ proc uni::main {} {
buildTables
puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
puts "shift = 6, space = $size"
puts "shift = $shift, space = $size"
set f [open [file join [lindex $argv 1] uni_data.c] w]
fconfigure $f -translation lf
@ -301,7 +314,7 @@ static unsigned char pageMap\[\] = {"
* set of character attributes.
*/
static unsigned char groupMap\[\] = {"
static unsigned short int groupMap\[\] = {"
set line " "
set lasti [expr {[llength $pages] - 1}]
for {set i 0} {$i <= $lasti} {incr i} {
@ -333,17 +346,17 @@ static unsigned char groupMap\[\] = {"
*
* Bit 3 B.1
*
* Bit 4 B.1
* Bit 4 D.1
*
* Bit 5 D.1
* Bit 5 D.2
*
* Bit 6 D.2
* Bit 6 XNP
*
* Bit 7 Case maps to several characters
*
* Bits 8-15 Reserved for future use.
* Bits 8-10 Reserved for future use.
*
* Bits 16-31 Case delta: delta for case conversions. This should be the
* Bits 11-31 Case delta: delta for case conversions. This should be the
* highest field so we can easily sign extend.
*/
@ -405,7 +418,7 @@ static int multiCaseTable\[\]\[4\] = {"
#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(info) ((info) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 16) : (~(~((info)) >> 16)))
#define GetDelta(info) (((info) > 0) ? ((info) >> 11) : (~(~((info)) >> 11)))
#define GetMC(info) (multiCaseTable\[GetDelta(info)\])
/*
@ -413,7 +426,7 @@ static int multiCaseTable\[\]\[4\] = {"
* Unicode character tables.
*/
#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0x1fffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
"
close $f

View File

@ -2,8 +2,8 @@
#
# This program parses the UnicodeData file and generates the
# corresponding uni_norm.c file with compressed character
# data tables. The input to this program should be the latest
# UnicodeData.txt and CompositionExclusions.txt files from:
# data tables. The input to this program should be the
# UnicodeData-3.2.0.txt and CompositionExclusions-3.2.0.txt files from:
# ftp://ftp.unicode.org/Public/UNIDATA/
#
# Copyright (c) 1998-1999 by Scriptics Corporation.
@ -15,9 +15,9 @@
namespace eval uni {
set cclass_shift 6
set decomp_shift 5
set comp_shift 5
set cclass_shift 8
set decomp_shift 8
set comp_shift 8
set shift 5; # number of bits of data within a page
# This value can be adjusted to find the
# best split to minimize table size
@ -278,7 +278,7 @@ proc uni::buildTables {} {
set next 0
for {set i 0} {$i <= 0xffff} {incr i} {
for {set i 0} {$i <= 0x10ffff} {incr i} {
#set gIndex [getGroup [getValue $i]]
set cclass_offset [expr {$i & $cclass_mask}]
@ -473,7 +473,7 @@ static unsigned char cclassGroupMap\[\] = {"
puts $f $line
puts $f "};
#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0xffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\])
#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0x1fffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\])
#define DECOMP_OFFSET_BITS $decomp_shift
@ -554,7 +554,7 @@ static int decompList\[\] = {"
* Unicode character tables.
*/
#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0xffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\])
#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0x1fffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\])
#define GetDecompShift(info) ((info) & 0xffff)
#define GetDecompLen(info) ((info) >> 16)
@ -687,7 +687,7 @@ static int compBothList\[[llength $comp_x_list]\]\[[llength $comp_y_list]\] = {"
puts $f "};
#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0xffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\])
#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0x1fffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\])
#define CompSingleMask (1 << 16)
#define CompMask ((1 << 16) - 1)