#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <inttypes.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#include <time.h>
#include "cutils.h"
#ifdef USE_TEST
#include "libunicode.c"
#endif
#define CHARCODE_MAX 0x10ffff
#define CC_LEN_MAX 3
/*
 * Allocate `size` bytes and clear them to zero.
 *
 * Returns the new buffer, or NULL when the allocation fails. The caller
 * releases the buffer with free().
 */
void *mallocz(size_t size)
{
    void *ptr;

    ptr = malloc(size);
    /* memset() on a NULL pointer is undefined behaviour, so only clear a
       buffer that was really obtained */
    if (ptr != NULL)
        memset(ptr, 0, size);
    return ptr;
}
/*
 * Return a pointer to the first character of field number `n` (0 based)
 * of the ';' separated record `p`. NULL is returned when the record ends
 * before `n` separators have been seen.
 */
const char *get_field(const char *p, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        /* locate the separator that closes the current field */
        p = strchr(p, ';');
        if (p == NULL)
            return NULL;
        p++;    /* the next field starts right after the ';' */
    }
    return p;
}
/*
 * Copy field number `n` (0 based) of the ';' separated record `p` into
 * `buf` as a NUL terminated string and return `buf`.
 *
 * At most buf_size - 1 characters are stored; a longer field is silently
 * truncated. When the record has no field `n`, `buf` receives the empty
 * string. When buf_size is 0 nothing is written at all.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    char *q;

    /* with an empty buffer there is no room even for the terminator, and
       buf_size - 1 below would wrap around */
    if (buf_size == 0)
        return buf;

    q = buf;
    p = get_field(p, n);
    if (p != NULL) {    /* get_field() returns NULL for a missing field */
        while (*p != ';' && *p != '\0') {
            if ((size_t)(q - buf) < buf_size - 1)
                *q++ = *p;
            p++;
        }
    }
    *q = '\0';
    return buf;
}
/*
 * Append `c` to the growable int array described by (*pbuf, *psize, *plen).
 *
 * pbuf:  address of the array pointer (may point to NULL when empty)
 * psize: address of the allocated capacity, in elements
 * plen:  address of the number of elements in use
 *
 * The array grows by a factor of 1.5 (and by at least one element) when it
 * is full. The program terminates with an error message if memory cannot
 * be obtained.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len, size, *buf;

    buf = *pbuf;
    size = *psize;
    len = *plen;
    if (len >= size) {
        int *new_buf;

        size = size * 3 / 2;
        if (size < len + 1)
            size = len + 1;
        /* keep the old pointer until realloc() is known to have succeeded */
        new_buf = realloc(buf, sizeof(buf[0]) * size);
        if (new_buf == NULL) {
            fprintf(stderr, "out of memory\n");
            exit(1);
        }
        buf = new_buf;
        *pbuf = buf;
        *psize = size;
    }
    buf[len++] = c;
    *plen = len;
}
/*
 * Parse field number `n` (0 based) of the ';' separated record `str` as a
 * whitespace separated list of hexadecimal numbers.
 *
 * Returns an array built with add_char() and stores its length in *plen.
 * When the field does not exist, or contains no number, NULL is returned
 * and *plen is set to 0.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf, len, size;

    p = get_field(str, n);
    if (!p) {
        *plen = 0;
        return NULL;
    }
    len = 0;
    size = 0;
    buf = NULL;
    for (;;) {
        /* the <ctype.h> functions need an unsigned char value; a plain
           char may be negative */
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
/*
 * Read the next line of `f` into `buf` (capacity `buf_size`) with fgets()
 * and remove one trailing '\n' if present.
 *
 * Returns `buf`, or NULL when fgets() returns NULL.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    int n;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;

    n = strlen(buf);
    if (n > 0) {
        char *last = &buf[n - 1];
        if (*last == '\n')
            *last = '\0';
    }
    return buf;
}
/* General category tables. While UNICODE_GENERAL_CATEGORY is defined, each
   inclusion of "unicode_gen_def.h" below is expanded with the DEF(id, str)
   macro that is current at that point (the header itself is not part of
   this file -- presumably it holds one DEF() per general category). */
#define UNICODE_GENERAL_CATEGORY
typedef
enum
{
/* DEF(id, str) becomes the enumerator GCAT_<id> */
#define DEF(id, str) GCAT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
/* trailing enumerator; used as the number of categories by
   build_general_category_table() */
GCAT_COUNT,
} UnicodeGCEnum1;
/* DEF(id, str) becomes the identifier `id` spelled as a string */
static
const
char
*unicode_gc_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* DEF(id, str) becomes its second argument `str`; dump_name_table() treats
   an empty string as "no short name" */
static
const
char
*unicode_gc_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
#undef UNICODE_GENERAL_CATEGORY
/* Script tables. While UNICODE_SCRIPT is defined, each inclusion of
   "unicode_gen_def.h" below is expanded with the DEF(id, str) macro that is
   current at that point (the header itself is not part of this file). */
#define UNICODE_SCRIPT
typedef
enum
{
/* DEF(id, str) becomes the enumerator SCRIPT_<id> */
#define DEF(id, str) SCRIPT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
/* trailing enumerator; used as the number of scripts by
   build_script_table() */
SCRIPT_COUNT,
} UnicodeScriptEnum1;
/* DEF(id, str) becomes the identifier `id` spelled as a string; searched by
   parse_scripts() */
static
const
char
*unicode_script_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* DEF(id, str) becomes its second argument `str`; searched by
   parse_script_extensions() through find_name(), which accepts several
   ','-separated spellings inside one entry.
   NOTE(review): unlike the other name tables this one is not `static` --
   confirm whether another translation unit relies on that. */
const
char
*unicode_script_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
#undef UNICODE_SCRIPT
/* Property tables. While UNICODE_PROP_LIST is defined, each inclusion of
   "unicode_gen_def.h" below is expanded with the DEF(id, str) macro that is
   current at that point (the header itself is not part of this file). */
#define UNICODE_PROP_LIST
typedef
enum
{
/* DEF(id, str) becomes the enumerator PROP_<id>; these values are the
   prop_idx bit numbers used by get_prop() and set_prop() */
#define DEF(id, str) PROP_ ## id,
#include "unicode_gen_def.h"
#undef DEF
/* trailing enumerator, i.e. the number of properties */
PROP_COUNT,
} UnicodePropEnum1;
/* DEF(id, str) becomes the identifier `id` spelled as a string; searched by
   parse_derived_core_properties() and parse_prop_list() */
static
const
char
*unicode_prop_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* DEF(id, str) becomes its second argument `str`; dump_name_table() treats
   an empty string as "no short name" */
static
const
char
*unicode_prop_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
#undef UNICODE_PROP_LIST
/* Per code point record. The parsers in this file fill one CCInfo for each
   code point, indexed by the code point value. */
typedef
struct
{
uint8_t u_len; /* number of valid entries in u_data */
uint8_t l_len; /* number of valid entries in l_data */
uint8_t f_len; /* number of valid entries in f_data */
/* set from field 12 of the unicode data file and from field 3 of the
   special casing file */
int
u_data[CC_LEN_MAX];
/* set from field 13 of the unicode data file and from field 1 of the
   special casing file */
int
l_data[CC_LEN_MAX];
/* set from field 2 of the case folding file */
int
f_data[CC_LEN_MAX];
uint8_t combining_class; /* field 3 of the unicode data file */
uint8_t is_compat:1; /* set when the decomposition field starts with a <tag> */
uint8_t is_excluded:1; /* set by parse_composition_exclusions() */
uint8_t general_category; /* index into unicode_gc_name[] */
uint8_t script; /* index into unicode_script_name[] */
uint8_t script_ext_len; /* number of entries in script_ext */
uint8_t *script_ext; /* malloc()ed by parse_script_extensions() */
/* one bit per property: bit (prop_idx & 0x1f) of word (prop_idx >> 5),
   see get_prop() and set_prop() */
uint32_t prop_bitmap_tab[3];
/* number of entries in decomp_data */
int
decomp_len;
/* decomposition code points, grown with add_char() */
int
*decomp_data;
} CCInfo;
/* the table of all code points; allocated outside the part of the file
   shown here */
CCInfo *unicode_db;
/*
 * Search `name` in the table `tab` of `tab_len` strings. Every table entry
 * may hold several alternative spellings separated by ','.
 *
 * Returns the index of the first entry with a spelling exactly equal to
 * `name`, or -1 if there is none.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    int idx, alias_len, name_len;
    const char *alias, *comma;

    name_len = strlen(name);
    for (idx = 0; idx < tab_len; idx++) {
        alias = tab[idx];
        while (alias != NULL) {
            comma = strchr(alias, ',');
            alias_len = comma ? comma - alias : strlen(alias);
            if (alias_len == name_len && memcmp(alias, name, alias_len) == 0)
                return idx;
            /* go on with the spelling after the ',' if there is one */
            alias = comma ? comma + 1 : NULL;
        }
    }
    return -1;
}
/* Return the bit (0 or 1) of property `prop_idx` stored for code point `c`
   in unicode_db. */
static int get_prop(uint32_t c, int prop_idx)
{
    const uint32_t word = unicode_db[c].prop_bitmap_tab[prop_idx >> 5];

    return (word >> (prop_idx & 0x1f)) & 1;
}
/* Set (val != 0) or clear (val == 0) the bit of property `prop_idx` for
   code point `c` in unicode_db. */
static void set_prop(uint32_t c, int prop_idx, int val)
{
    uint32_t *word = &unicode_db[c].prop_bitmap_tab[prop_idx >> 5];
    const uint32_t mask = 1U << (prop_idx & 0x1f);

    if (val)
        *word |= mask;
    else
        *word &= ~mask;
}
/*
 * Read the ';' separated unicode data file `filename` into unicode_db.
 *
 * For every record (field 0 is the hexadecimal code point) this stores:
 *   - fields 12 and 13 as the single entry of u_data and l_data,
 *   - field 2 as the general category (looked up in unicode_gc_name),
 *   - field 3 as the combining class,
 *   - field 5 as the decomposition (is_compat is set when it starts with
 *     a <tag>),
 *   - field 9 == 'Y' as the Bidi_Mirrored property.
 * A record whose name (field 1) contains " Last>" is copied to every code
 * point located between the previous record and this one.
 *
 * Lines starting with '#' and blank lines are ignored. The program exits
 * if the file cannot be opened or a general category is unknown.
 */
void parse_unicode_data(const char *filename)
{
    FILE *f;
    char line[1024];
    char buf1[256];
    const char *p;
    char *end;
    int code, lc, uc, last_code;
    CCInfo *ci, *tab = unicode_db;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    last_code = 0;
    for (;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        /* a blank line has no fields: skip it like a comment */
        if (*p == '#' || *p == '\0')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        /* every path below writes to tab[code], so the range is checked
           once here instead of only on some of the paths */
        assert(code >= 0 && code <= CHARCODE_MAX);
        lc = 0;
        uc = 0;

        p = get_field(line, 12);
        if (p && *p != ';') {
            uc = strtoul(p, NULL, 16);
        }

        p = get_field(line, 13);
        if (p && *p != ';') {
            lc = strtoul(p, NULL, 16);
        }
        ci = &tab[code];
        if (uc > 0 || lc > 0) {
            if (uc > 0) {
                assert(ci->u_len == 0);
                ci->u_len = 1;
                ci->u_data[0] = uc;
            }
            if (lc > 0) {
                assert(ci->l_len == 0);
                ci->l_len = 1;
                ci->l_data[0] = lc;
            }
        }

        {
            int i;
            get_field_buf(buf1, sizeof(buf1), line, 2);
            i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
            if (i < 0) {
                fprintf(stderr, "General category '%s' not found\n",
                        buf1);
                exit(1);
            }
            ci->general_category = i;
        }

        p = get_field(line, 3);
        if (p && *p != ';' && *p != '\0') {
            int cc;
            cc = strtoul(p, NULL, 0);
            if (cc != 0) {
                ci->combining_class = cc;
            }
        }

        p = get_field(line, 5);
        if (p && *p != ';' && *p != '\0') {
            int size;
            ci->is_compat = 0;
            if (*p == '<') {
                /* skip the <tag> in front of the code points */
                while (*p != '\0' && *p != '>')
                    p++;
                if (*p == '>')
                    p++;
                ci->is_compat = 1;
            }
            size = 0;
            for (;;) {
                while (isspace((unsigned char)*p))
                    p++;
                if (!isxdigit((unsigned char)*p))
                    break;
                add_char(&ci->decomp_data, &size, &ci->decomp_len,
                         strtoul(p, &end, 16));
                p = end;
            }
#if 0
            {
                int i;
                static int count, d_count;

                printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
                for(i = 0; i < ci->decomp_len; i++)
                    printf(" %05x", ci->decomp_data[i]);
                printf("\n");
                count++;
                d_count += ci->decomp_len;
            }
#endif
        }

        p = get_field(line, 9);
        if (p && *p == 'Y') {
            set_prop(code, PROP_Bidi_Mirrored, 1);
        }

        /* handle ranges: the " Last>" record is replicated downwards */
        get_field_buf(buf1, sizeof(buf1), line, 1);
        if (strstr(buf1, " Last>")) {
            int i;
            assert(ci->decomp_len == 0);
            assert(ci->script_ext_len == 0);
            for (i = last_code + 1; i < code; i++) {
                unicode_db[i] = *ci;
            }
        }
        last_code = code;
    }

    fclose(f);
}
/*
 * Read the ';' separated special casing file `filename` and update `tab`.
 *
 * Field 0 is the hexadecimal code point. A record whose field 4 holds
 * anything but a comment or nothing is skipped. Otherwise the code point
 * list of field 1 replaces l_data/l_len and the list of field 3 replaces
 * u_data/u_len; a list made only of the code point itself is stored as an
 * empty list.
 */
void parse_special_casing(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    int code, k;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (p == NULL)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        /* field 4: only records without a condition are used */
        p = get_field(line, 4);
        if (p != NULL) {
            while (isspace(*p))
                p++;
            if (*p != '#' && *p != '\0')
                continue;
        }

        /* k == 0: field 1 -> l_data, k == 1: field 3 -> u_data */
        for (k = 0; k < 2; k++) {
            uint8_t *plen = (k == 0) ? &ci->l_len : &ci->u_len;
            int *data = (k == 0) ? ci->l_data : ci->u_data;

            p = get_field(line, (k == 0) ? 1 : 3);
            if (p == NULL || *p == ';')
                continue;
            *plen = 0;
            for (;;) {
                while (isspace(*p))
                    p++;
                if (*p == ';')
                    break;
                assert(*plen < CC_LEN_MAX);
                data[(*plen)++] = strtoul(p, (char **)&p, 16);
            }
            /* a mapping to itself is no mapping */
            if (*plen == 1 && data[0] == code)
                *plen = 0;
        }
    }

    fclose(f);
}
/*
 * Read the ';' separated case folding file `filename` and fill the
 * f_data/f_len members of `tab`.
 *
 * Field 0 is the hexadecimal code point, field 1 the status and field 2
 * the list of folded code points. Only the statuses 'C', 'S' and 'F' are
 * used. An 'S' record must follow a record that stored at least two code
 * points for the same code point and replaces it; for the other statuses
 * no folding may be stored yet.
 */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    char *end;
    int code, status;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    for (;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 1);
        if (!p)
            continue;
        /* locate status */
        while (isspace((unsigned char)*p))
            p++;
        status = *p;
        if (status != 'C' && status != 'S' && status != 'F')
            continue;

        p = get_field(line, 2);
        assert(p != NULL);
        if (status == 'S') {
            /* the simple folding overrides the full one stored before */
            assert(ci->f_len >= 2);
            ci->f_len = 0;
        } else {
            assert(ci->f_len == 0);
        }
        for (;;) {
            while (isspace((unsigned char)*p))
                p++;
            if (*p == ';')
                break;
            /* the bound must be checked on f_len, the index used for the
               store below (the check used to look at l_len) */
            assert(ci->f_len < CC_LEN_MAX);
            ci->f_data[ci->f_len++] = strtoul(p, &end, 16);
            p = end;
        }
    }

    fclose(f);
}
/*
 * Read the composition exclusion file `filename`: every line that is not
 * blank and does not start with '#' or '@' begins with a hexadecimal code
 * point, whose is_excluded flag is set in unicode_db.
 */
void parse_composition_exclusions(const char *filename)
{
    FILE *f;
    char line[4096], *p;
    uint32_t c0;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        assert(c0 > 0 && c0 <= CHARCODE_MAX);
        unicode_db[c0].is_excluded = TRUE;
    }
    fclose(f);
}
/*
 * Read the derived core property file `filename`.
 *
 * Each data line has the form "c0[..c1] ; Name ...". The property Name is
 * looked up in unicode_prop_name and its bit is set for every code point
 * of the range. "Grapheme_Link" is ignored when it is not in the table;
 * any other unknown name terminates the program.
 */
void parse_derived_core_properties(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256];
    uint32_t c0, c1, c;
    size_t tok_len, copy_len;
    int i;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;

        /* code point or range of code points */
        c0 = strtoul(p, &p, 16);
        c1 = c0;
        if (p[0] == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        }
        assert(c1 <= CHARCODE_MAX);

        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");

        /* the name ends at a blank, a '#' or the end of the line; a name
           that does not fit in buf is truncated */
        tok_len = strcspn(p, " \t#");
        copy_len = sizeof(buf) - 1;
        if (tok_len < copy_len)
            copy_len = tok_len;
        memcpy(buf, p, copy_len);
        buf[copy_len] = '\0';

        i = find_name(unicode_prop_name,
                      countof(unicode_prop_name), buf);
        if (i < 0) {
            if (strcmp(buf, "Grapheme_Link") != 0) {
                fprintf(stderr, "Property not found: %s\n", buf);
                exit(1);
            }
            continue;   /* Grapheme_Link is deliberately not stored */
        }
        for (c = c0; c <= c1; c++)
            set_prop(c, i, 1);
    }
    fclose(f);
}
/*
 * Read the derived normalization property file `filename`.
 *
 * Each data line has the form "c0[..c1] ; Name ...". Only the name
 * "Changes_When_NFKC_Casefolded" is used: the corresponding property bit
 * is set for every code point of the range. Other names are ignored.
 */
void parse_derived_norm_properties(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256];
    uint32_t c0, c1, c;
    size_t tok_len, copy_len;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;

        /* code point or range of code points */
        c0 = strtoul(p, &p, 16);
        c1 = c0;
        if (p[0] == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        }
        assert(c1 <= CHARCODE_MAX);

        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");

        /* the name ends at a blank, a '#' or the end of the line; a name
           that does not fit in buf is truncated */
        tok_len = strcspn(p, " \t#");
        copy_len = sizeof(buf) - 1;
        if (tok_len < copy_len)
            copy_len = tok_len;
        memcpy(buf, p, copy_len);
        buf[copy_len] = '\0';

        if (strcmp(buf, "Changes_When_NFKC_Casefolded") != 0)
            continue;
        for (c = c0; c <= c1; c++)
            set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
    }
    fclose(f);
}
/*
 * Read the property list file `filename`.
 *
 * Each data line has the form "c0[..c1] ; Name ...". The property Name is
 * looked up in unicode_prop_name and its bit is set for every code point
 * of the range. An unknown name terminates the program.
 */
void parse_prop_list(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256];
    uint32_t c0, c1, c;
    size_t tok_len, copy_len;
    int i;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;

        /* code point or range of code points */
        c0 = strtoul(p, &p, 16);
        c1 = c0;
        if (p[0] == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        }
        assert(c1 <= CHARCODE_MAX);

        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");

        /* the name ends at a blank, a '#' or the end of the line; a name
           that does not fit in buf is truncated */
        tok_len = strcspn(p, " \t#");
        copy_len = sizeof(buf) - 1;
        if (tok_len < copy_len)
            copy_len = tok_len;
        memcpy(buf, p, copy_len);
        buf[copy_len] = '\0';

        i = find_name(unicode_prop_name,
                      countof(unicode_prop_name), buf);
        if (i < 0) {
            fprintf(stderr, "Property not found: %s\n", buf);
            exit(1);
        }
        for (c = c0; c <= c1; c++)
            set_prop(c, i, 1);
    }
    fclose(f);
}
/*
 * Read the script file `filename`.
 *
 * Each data line has the form "c0[..c1] ; Name ...". The script Name is
 * looked up in unicode_script_name and its index is stored in the `script`
 * member of every code point of the range. An unknown name terminates the
 * program.
 */
void parse_scripts(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256];
    uint32_t c0, c1, c;
    size_t tok_len, copy_len;
    int i;

    f = fopen(filename, "rb");
    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        for (p = line; isspace(*p); p++)
            ;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;

        /* code point or range of code points */
        c0 = strtoul(p, &p, 16);
        c1 = c0;
        if (p[0] == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        }
        assert(c1 <= CHARCODE_MAX);

        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");

        /* the name ends at a blank, a '#' or the end of the line; a name
           that does not fit in buf is truncated */
        tok_len = strcspn(p, " \t#");
        copy_len = sizeof(buf) - 1;
        if (tok_len < copy_len)
            copy_len = tok_len;
        memcpy(buf, p, copy_len);
        buf[copy_len] = '\0';

        i = find_name(unicode_script_name,
                      countof(unicode_script_name), buf);
        if (i < 0) {
            fprintf(stderr, "Unknown script: '%s'\n", buf);
            exit(1);
        }
        for (c = c0; c <= c1; c++)
            unicode_db[c].script = i;
    }
    fclose(f);
}
/*
 * Read the script extension file `filename`.
 *
 * Each data line has the form "c0[..c1] ; Name Name ... # comment". Every
 * Name is looked up in unicode_script_short_name; the resulting list of
 * script indexes is copied into a freshly allocated script_ext array for
 * every code point of the range.
 *
 * The program terminates if the file cannot be opened, a name is unknown
 * or memory cannot be obtained.
 */
void parse_script_extensions(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;
    int i;
    uint8_t script_ext[255];
    int script_ext_len;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    for (;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        /* the <ctype.h> functions need an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, (char **)&p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, (char **)&p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        script_ext_len = 0;
        if (*p == ';') {
            p++;
            for (;;) {
                p += strspn(p, " \t");
                q = buf;
                while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
                    if ((q - buf) < sizeof(buf) - 1)
                        *q++ = *p;
                    p++;
                }
                *q = '\0';
                /* an empty token means end of line or start of comment */
                if (buf[0] == '\0')
                    break;
                i = find_name(unicode_script_short_name,
                              countof(unicode_script_short_name), buf);
                if (i < 0) {
                    fprintf(stderr, "Script not found: %s\n", buf);
                    exit(1);
                }
                assert(script_ext_len < sizeof(script_ext));
                script_ext[script_ext_len++] = i;
            }
            for (c = c0; c <= c1; c++) {
                CCInfo *ci = &unicode_db[c];
                ci->script_ext_len = script_ext_len;
                ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
                /* malloc(0) is allowed to return NULL, so a NULL result is
                   only an error when something has to be stored */
                if (!ci->script_ext && script_ext_len > 0) {
                    fprintf(stderr, "out of memory\n");
                    exit(1);
                }
                for (i = 0; i < script_ext_len; i++)
                    ci->script_ext[i] = script_ext[i];
            }
        }
    }
    fclose(f);
}
/*
 * Print one line on stdout for the record `ci` of code point `i`: the code
 * point followed by the non empty u_data (" U:"), l_data (" L:") and
 * f_data (" F:") lists.
 */
void dump_cc_info(CCInfo *ci, int i)
{
    const struct {
        const char *label;
        const int *data;
        int len;
    } lists[3] = {
        { " U:", ci->u_data, ci->u_len },
        { " L:", ci->l_data, ci->l_len },
        { " F:", ci->f_data, ci->f_len },
    };
    int k, j;

    printf("%05x:", i);
    for (k = 0; k < 3; k++) {
        if (lists[k].len == 0)
            continue;
        fputs(lists[k].label, stdout);
        for (j = 0; j < lists[k].len; j++)
            printf(" %05x", lists[k].data[j]);
    }
    printf("\n");
}
/* Print with dump_cc_info() every code point of `tab` that has at least
   one entry in u_data, l_data or f_data. */
void dump_unicode_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *ci = &tab[code];

        if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
            continue;
        dump_cc_info(ci, code);
    }
}
/*
 * Return non zero when the record is not a "simple" case mapping, i.e.
 * when one of these holds:
 *   - u_data or l_data has more than one entry,
 *   - both u_data and l_data have an entry,
 *   - f_data differs from l_data (in length or content).
 */
BOOL is_complicated_case(const CCInfo *ci)
{
    if (ci->u_len > 1 || ci->l_len > 1)
        return 1;
    if (ci->u_len > 0 && ci->l_len > 0)
        return 1;
    if (ci->f_len != ci->l_len)
        return 1;
    return memcmp(ci->f_data, ci->l_data,
                  ci->f_len * sizeof(ci->f_data[0])) != 0;
}
/* Kinds of table entries produced by find_run_type() and build_conv_table().
   The enumerator order must match run_type_str[], which is indexed by these
   values. The enum is only defined here when USE_TEST is not set; with
   USE_TEST "libunicode.c" is included at the top of the file (presumably
   it provides the same names -- confirm). */
#ifndef USE_TEST
enum
{
RUN_TYPE_U,
RUN_TYPE_L,
RUN_TYPE_UF,
RUN_TYPE_LF,
RUN_TYPE_UL,
RUN_TYPE_LSU,
RUN_TYPE_U2L_399_EXT2,
RUN_TYPE_UF_D20,
RUN_TYPE_UF_D1_EXT,
RUN_TYPE_U_EXT, /* replaces RUN_TYPE_U in build_conv_table() when no entry starts at te->data */
RUN_TYPE_LF_EXT, /* replaces RUN_TYPE_LF in build_conv_table() when no entry starts at te->data */
RUN_TYPE_UF_EXT2,
RUN_TYPE_LF_EXT2,
RUN_TYPE_UF_EXT3,
};
#endif
/* Printable name of every RUN_TYPE_xxx value, indexed by the value; used
   by dump_case_conv_table1(). */
const
char
*run_type_str[] = {
"U"
,
"L"
,
"UF"
,
"LF"
,
"UL"
,
"LSU"
,
"U2L_399_EXT2"
,
"UF_D20"
,
"UF_D1_EXT"
,
"U_EXT"
,
"LF_EXT"
,
"UF_EXT2"
,
"LF_EXT2"
,
"UF_EXT3"
,
};
/* One entry of the case conversion table: a run of consecutive code points
   sharing the same kind of mapping. */
typedef
struct
{
/* first code point of the run */
int
code;
/* number of code points in the run (build_conv_table() asserts <= 127) */
int
len;
/* one of the RUN_TYPE_xxx values */
int
type;
/* value chosen by find_run_type(); its meaning depends on `type` */
int
data;
/* number of valid entries in ext_data */
int
ext_len;
/* code points registered in the global ext_data[] by build_conv_table() */
int
ext_data[3];
/* value written by dump_case_conv_table(): bits above the low 8 go to
   case_conv_table1, the low 8 bits to case_conv_table2 */
int
data_index;
} TableEntry;
/* Return the single l_data entry of code point `c`, or `c` itself when
   l_data does not hold exactly one entry. */
static int simple_to_lower(CCInfo *tab, int c)
{
    const CCInfo *ci = &tab[c];

    return (ci->l_len == 1) ? ci->l_data[0] : c;
}
/*
 * Classify the case mapping that starts at code point `code` and describe
 * it in `te` (code, len, type and, depending on the type, data or
 * ext_data/ext_len).
 *
 * `tab` must have valid records for the code points 0..CHARCODE_MAX; no
 * record outside this range is addressed. The function aborts when the
 * record does not match any supported encoding.
 *
 * Members of `te` that a given type does not use are left untouched.
 */
void find_run_type(TableEntry *te, CCInfo *tab, int code)
{
    int is_lower, len;
    CCInfo *ci, *ci1, *ci2;

    ci = &tab[code];
    te->code = code;

    /* three consecutive code points whose first two map (l and f) to
       code + 2 and whose last two map (u) to code; the two following
       records are only looked at when they exist */
    if (code + 2 <= CHARCODE_MAX) {
        ci1 = &tab[code + 1];
        ci2 = &tab[code + 2];
        if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
            ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
            ci->u_len == 0 &&

            ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
            ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
            ci1->u_len == 1 && ci1->u_data[0] == code &&

            ci2->l_len == 0 &&
            ci2->f_len == 0 &&
            ci2->u_len == 1 && ci2->u_data[0] == code) {
            te->len = 3;
            te->data = 0;
            te->type = RUN_TYPE_LSU;
            return;
        }
    }

    if (is_complicated_case(ci)) {
        /* run of records with u == f, continuing ci->u_data[0] */
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (ci1->u_len != 1 ||
                ci1->u_data[0] != ci->u_data[0] + len ||
                ci1->l_len != 0 ||
                ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
                break;
            len++;
        }
        if (len > 1) {
            te->len = len;
            te->type = RUN_TYPE_UF;
            te->data = ci->u_data[0];
            return;
        }

        /* run of two code point mappings ending with 0x399 (u) and
           0x3B9 (f) */
        if (ci->l_len == 0 &&
            ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
            ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == ci->u_data[1] &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->f_len == 2 &&
                      ci1->f_data[1] == ci->f_data[1] &&
                      ci1->f_data[0] == ci->f_data[0] + len &&
                      ci1->l_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_UF_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
            return;
        }

        /* run with a two code point u mapping ending with 0x399 and a
           single l == f mapping */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->l_len == 1 &&
            ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U2L_399_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->l_data[0];
            te->ext_len = 2;
            return;
        }

        /* run with only an l mapping */
        if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->u_len == 0 && ci1->f_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_L;
            te->data = ci->l_data[0];
            return;
        }

        /* the remaining encodings always describe a single code point */
        if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->u_data[0] < 0x1000 &&
            ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D20;
            te->data = ci->u_data[0];
        } else if (ci->l_len == 0 &&
                   ci->u_len == 1 &&
                   ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D1_EXT;
            te->ext_data[0] = ci->u_data[0];
            te->ext_len = 1;
        } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
                   ci->l_data[0] == ci->f_data[0] &&
                   ci->l_data[1] == ci->f_data[1]) {
            te->len = 1;
            te->type = RUN_TYPE_LF_EXT2;
            te->ext_data[0] = ci->l_data[0];
            te->ext_data[1] = ci->l_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
                   ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
                   ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
            te->len = 1;
            te->type = RUN_TYPE_UF_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
                   ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
                   ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
                   ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
            te->len = 1;
            te->type = RUN_TYPE_UF_EXT3;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_data[2] = ci->u_data[2];
            te->ext_len = 3;
        } else {
            printf("unsupported encoding case:\n");
            dump_cc_info(ci, code);
            abort();
        }
    } else {
        /* look for a run of pairs (x, x + 1) where x maps (l) to x + 1 and
           x + 1 maps (u) to x */
        len = 0;
        for (;;) {
            /* tab[code + len + 1] is read below, so the test has to be
               made on code + len and not on code alone */
            if (code + len >= CHARCODE_MAX || len >= 126)
                break;
            ci = &tab[code + len];
            ci1 = &tab[code + len + 1];
            if (is_complicated_case(ci) || is_complicated_case(ci1)) {
                break;
            }
            if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
                break;
            if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
                break;
            len += 2;
        }
        if (len > 0) {
            te->len = len;
            te->type = RUN_TYPE_UL;
            te->data = 0;
            return;
        }

        /* run of simple mappings with a constant offset */
        ci = &tab[code];
        is_lower = ci->l_len > 0;
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (is_complicated_case(ci1))
                break;
            if (is_lower) {
                if (ci1->l_len != 1 ||
                    ci1->l_data[0] != ci->l_data[0] + len)
                    break;
            } else {
                if (ci1->u_len != 1 ||
                    ci1->u_data[0] != ci->u_data[0] + len)
                    break;
            }
            len++;
        }
        te->len = len;
        if (is_lower) {
            te->type = RUN_TYPE_LF;
            te->data = ci->l_data[0];
        } else {
            te->type = RUN_TYPE_U;
            te->data = ci->u_data[0];
        }
    }
}
/* case conversion entries filled by build_conv_table(); conv_table_len is
   the number of entries in use */
TableEntry conv_table[1000];
int
conv_table_len;
/* distinct values registered by find_ext_data_index(); ext_data_len is the
   number of entries in use */
int
ext_data[1000];
int
ext_data_len;
/* Print conv_table in readable form on stdout, one entry per line,
   followed by the sizes of conv_table and ext_data. */
void dump_case_conv_table1(void)
{
    const TableEntry *te;
    const TableEntry *end = conv_table + conv_table_len;
    int k;

    for (te = conv_table; te < end; te++) {
        printf("%05x %02x %-10s %05x",
               te->code, te->len, run_type_str[te->type], te->data);
        for (k = 0; k < te->ext_len; k++)
            printf(" %05x", te->ext_data[k]);
        printf("\n");
    }
    printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
}
/* Return the index of the first of the `len` entries of `conv_table` whose
   `code` member equals `data`, or -1 if there is none. */
int find_data_index(const TableEntry *conv_table, int len, int data)
{
    int idx = 0;

    while (idx < len) {
        if (conv_table[idx].code == data)
            return idx;
        idx++;
    }
    return -1;
}
/* Return the position of `data` in the global ext_data[] array; the value
   is appended first when it is not present yet. */
int find_ext_data_index(int data)
{
    int pos = 0;

    while (pos < ext_data_len) {
        if (ext_data[pos] == data)
            return pos;
        pos++;
    }
    /* not found: pos == ext_data_len is the slot of the new value */
    assert(ext_data_len < countof(ext_data));
    ext_data[pos] = data;
    ext_data_len++;
    return pos;
}
/*
 * Build the global conv_table from the case mappings of `tab` and compute
 * the data_index of every entry.
 *
 * The values of ext_data[] are registered in three separate passes (3 value
 * entries, then 2 value entries, then 1 value entries) because the index
 * of a value must fit in 4, 6 and 8 bits respectively.
 */
void build_conv_table(CCInfo *tab)
{
    int code, j;
    CCInfo *ci;
    TableEntry *te, *end;

    /* pass 1: split the code points having a mapping into runs */
    te = conv_table;
    for (code = 0; code <= CHARCODE_MAX; code++) {
        ci = &tab[code];
        if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
            continue;
        assert(te - conv_table < countof(conv_table));
        find_run_type(te, tab, code);
#if 0
        if (te->type == RUN_TYPE_TODO) {
            printf("TODO: ");
            dump_cc_info(ci, code);
        }
#endif
        assert(te->len <= 127);
        code += te->len - 1;    /* the loop increment skips the last one */
        te++;
    }
    conv_table_len = te - conv_table;
    end = conv_table + conv_table_len;

    /* pass 2: entries referring to another entry by its first code point */
    for (te = conv_table; te < end; te++) {
        int data_index;

        if (te->type == RUN_TYPE_U || te->type == RUN_TYPE_L ||
            te->type == RUN_TYPE_UF || te->type == RUN_TYPE_LF) {
            data_index = find_data_index(conv_table, conv_table_len, te->data);
            if (data_index >= 0) {
                te->data_index = data_index;
            } else if (te->type == RUN_TYPE_U) {
                /* no entry starts at te->data: store the value itself */
                te->type = RUN_TYPE_U_EXT;
                te->ext_len = 1;
                te->ext_data[0] = te->data;
            } else if (te->type == RUN_TYPE_LF) {
                te->type = RUN_TYPE_LF_EXT;
                te->ext_len = 1;
                te->ext_data[0] = te->data;
            } else {
                printf("%05x: index not found\n", te->code);
                exit(1);
            }
        } else if (te->type == RUN_TYPE_UF_D20) {
            te->data_index = te->data;
        }
    }

    /* pass 3: three 4 bit indexes */
    for (te = conv_table; te < end; te++) {
        if (te->type == RUN_TYPE_UF_EXT3) {
            int p, v;

            v = 0;
            for (j = 0; j < 3; j++) {
                p = find_ext_data_index(te->ext_data[j]);
                assert(p < 16);
                v = (v << 4) | p;
            }
            te->data_index = v;
        }
    }

    /* pass 4: two 6 bit indexes */
    for (te = conv_table; te < end; te++) {
        if (te->type == RUN_TYPE_LF_EXT2 ||
            te->type == RUN_TYPE_UF_EXT2 ||
            te->type == RUN_TYPE_U2L_399_EXT2) {
            int p, v;

            v = 0;
            for (j = 0; j < 2; j++) {
                p = find_ext_data_index(te->ext_data[j]);
                assert(p < 64);
                v = (v << 6) | p;
            }
            te->data_index = v;
        }
    }

    /* pass 5: a single index */
    for (te = conv_table; te < end; te++) {
        if (te->type == RUN_TYPE_UF_D1_EXT ||
            te->type == RUN_TYPE_U_EXT ||
            te->type == RUN_TYPE_LF_EXT) {
            te->data_index = find_ext_data_index(te->ext_data[0]);
        }
    }
#ifdef DUMP_CASE_CONV_TABLE
    dump_case_conv_table1();
#endif
}
/*
 * Write the case conversion tables as C source to `f`:
 *   - case_conv_table1: one 32 bit word per conv_table entry holding, from
 *     the top, the code (17 bits), the length (7 bits), the type (4 bits)
 *     and the bits of data_index above the low 8,
 *   - case_conv_table2: the low 8 bits of data_index of every entry,
 *   - case_conv_ext: the content of ext_data[].
 */
void dump_case_conv_table(FILE *f)
{
    int i;
    uint32_t v;
    const TableEntry *te;

    fprintf(f, "static const uint32_t case_conv_table1[%u] = {",
            (unsigned)conv_table_len);
    for (i = 0; i < conv_table_len; i++) {
        if (i % 4 == 0)
            fprintf(f, "\n ");
        te = &conv_table[i];
        /* the fields are combined as unsigned values: shifting a code
           point >= 0x10000 left by 15 does not fit in a signed int */
        v = (uint32_t)te->code << (32 - 17);
        v |= (uint32_t)te->len << (32 - 17 - 7);
        v |= (uint32_t)te->type << (32 - 17 - 7 - 4);
        v |= (uint32_t)(te->data_index >> 8);
        fprintf(f, " 0x%08x,", (unsigned)v);
    }
    fprintf(f, "\n};\n\n");

    fprintf(f, "static const uint8_t case_conv_table2[%u] = {",
            (unsigned)conv_table_len);
    for (i = 0; i < conv_table_len; i++) {
        if (i % 8 == 0)
            fprintf(f, "\n ");
        te = &conv_table[i];
        fprintf(f, " 0x%02x,", (unsigned)(te->data_index & 0xff));
    }
    fprintf(f, "\n};\n\n");

    fprintf(f, "static const uint16_t case_conv_ext[%u] = {",
            (unsigned)ext_data_len);
    for (i = 0; i < ext_data_len; i++) {
        if (i % 8 == 0)
            fprintf(f, "\n ");
        fprintf(f, " 0x%04x,", (unsigned)ext_data[i]);
    }
    fprintf(f, "\n};\n\n");
}
/* table used by sp_cc_cmp(), since qsort() cannot pass a context */
static CCInfo *global_tab;

/* qsort() comparison of two code points (given as pointers to int): order
   by f_len first, then by the bytes of the f_data entries in use. */
static int sp_cc_cmp(const void *p1, const void *p2)
{
    const int code1 = *(const int *)p1;
    const int code2 = *(const int *)p2;
    CCInfo *c1 = &global_tab[code1];
    CCInfo *c2 = &global_tab[code2];

    if (c1->f_len != c2->f_len)
        return (c1->f_len < c2->f_len) ? -1 : 1;
    return memcmp(c1->f_data, c2->f_data,
                  sizeof(c1->f_data[0]) * c1->f_len);
}
/*
 * Print with dump_cc_info() the groups of at least two code points of
 * `tab` that share the same f_data list of more than one code point.
 *
 * The code points are sorted with sp_cc_cmp() so that identical lists
 * become adjacent. The program terminates if the temporary array cannot
 * be allocated.
 */
void dump_case_folding_special_cases(CCInfo *tab)
{
    int i, len, j;
    int *perm;

    perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
    if (!perm) {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    for (i = 0; i <= CHARCODE_MAX; i++)
        perm[i] = i;
    global_tab = tab;
    qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
    for (i = 0; i <= CHARCODE_MAX;) {
        if (tab[perm[i]].f_len <= 1) {
            i++;
        } else {
            /* count the following code points having the same list */
            len = 1;
            while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
                len++;

            if (len > 1) {
                for (j = i; j < i + len; j++)
                    dump_cc_info(&tab[perm[j]], perm[j]);
            }
            i += len;
        }
    }
    free(perm);
    global_tab = NULL;
}
/* Compare the first `n` ints of `tab1` and `tab2`: return 0 when they are
   all equal and -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    const int *end = tab1 + (n > 0 ? n : 0);

    while (tab1 < end) {
        if (*tab1++ != *tab2++)
            return -1;
    }
    return 0;
}
/* Print "<str>=" followed by the `len` values of `buf` in hexadecimal and
   a newline on stdout. */
void dump_str(const char *str, const int *buf, int len)
{
    const int *p;

    printf("%s=", str);
    for (p = buf; p < buf + len; p++)
        printf(" %05x", *p);
    printf("\n");
}
/*
 * Compute, for every code point of unicode_db, the internal properties
 * (the PROP_xxx1 ones) from the properties read from the data files and
 * from the presence of case mappings.
 */
void compute_internal_props(void)
{
    int c;
    BOOL has_ul;

    for (c = 0; c <= CHARCODE_MAX; c++) {
        CCInfo *ci = &unicode_db[c];

        has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
        if (!has_ul) {
            /* Cased1 repeats Cased for code points without any mapping */
            set_prop(c, PROP_Cased1, get_prop(c, PROP_Cased));
        } else {
            assert(get_prop(c, PROP_Cased));
        }

        /* ID_Continue without ID_Start */
        set_prop(c, PROP_ID_Continue1,
                 get_prop(c, PROP_ID_Continue) & (get_prop(c, PROP_ID_Start) ^ 1));

        /* the next properties store the difference (xor) with another
           property or with the presence of a mapping */
        set_prop(c, PROP_XID_Start1,
                 get_prop(c, PROP_ID_Start) ^ get_prop(c, PROP_XID_Start));
        set_prop(c, PROP_XID_Continue1,
                 get_prop(c, PROP_ID_Continue) ^ get_prop(c, PROP_XID_Continue));
        set_prop(c, PROP_Changes_When_Titlecased1,
                 get_prop(c, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
        set_prop(c, PROP_Changes_When_Casefolded1,
                 get_prop(c, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
        set_prop(c, PROP_Changes_When_NFKC_Casefolded1,
                 get_prop(c, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
#if 0
#define M(x) (1U << GCAT_ ## x)
        {
            int b;
            b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
                 unicode_db[c].general_category) & 1;
            set_prop(c, PROP_Cased1,
                     get_prop(c, PROP_Case_Ignorable) ^ b);
        }
#undef M
#endif
    }
}
/* Write the `len` bytes of `tab` to `f` as the C definition of a
   "static const uint8_t" array named `cname`, 8 values per line. */
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    int pos = 0;

    fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
    while (pos < len) {
        if ((pos % 8) == 0)
            fputs("\n ", f);    /* start a new line every 8 values */
        fprintf(f, " 0x%02x,", tab[pos]);
        pos++;
    }
    fputs("\n};\n\n", f);
}
/* number of table bytes between two entries of the index */
#define PROP_BLOCK_LEN 32

/*
 * Write to `f` the compressed table of the property `prop_index`
 * ("unicode_prop_<name>_table") and, when `add_index` is set, its index
 * ("unicode_prop_<name>_index").
 *
 * The property is first converted to the lengths of the alternating runs
 * of code points without / with the property (the first run must be
 * "without"). Each length minus one is then stored as:
 *   - one byte (a << 3) | b for two consecutive values a, b < 8,
 *   - one byte 0x80 + v for v < 128,
 *   - two bytes 0x40 + (v >> 8), v for v < 2^13,
 *   - three bytes 0x60 + (v >> 16), v >> 8, v otherwise.
 * An index entry is 3 bytes (little endian): the current code point with
 * the byte offset inside the block in the bits from 21 up.
 */
void build_prop_table(FILE *f, int prop_index, BOOL add_index)
{
    int i, j, n, v, offset, code;
    DynBuf table_s, *dbuf = &table_s;
    DynBuf runs_s, *dbuf1 = &runs_s;
    DynBuf index_s, *dbuf2 = &index_s;
    const uint32_t *buf;
    int buf_len, block_end_pos, bit;
    char cname[128];

    /* step 1: run lengths (minus one) as 32 bit values in dbuf1; a final
       run without the property is not stored */
    dbuf_init(dbuf1);
    i = 0;
    while (i <= CHARCODE_MAX) {
        v = get_prop(i, prop_index);
        for (j = i + 1; j <= CHARCODE_MAX && get_prop(j, prop_index) == v; j++)
            ;
        n = j - i;
        if (j == (CHARCODE_MAX + 1) && v == 0)
            break;
        dbuf_put_u32(dbuf1, n - 1);
        i += n;
    }

    /* step 2: byte encoding of the run lengths */
    dbuf_init(dbuf);
    dbuf_init(dbuf2);
    buf = (uint32_t *)dbuf1->buf;
    buf_len = dbuf1->size / sizeof(buf[0]);

    /* the first value is assumed to be 0 */
    assert(get_prop(0, prop_index) == 0);

    block_end_pos = PROP_BLOCK_LEN;
    i = 0;
    code = 0;
    bit = 0;
    while (i < buf_len) {
        /* an index entry is only emitted in front of a "without" run */
        if (add_index && dbuf->size >= block_end_pos && bit == 0) {
            offset = (dbuf->size - block_end_pos);
            assert(offset <= 7);
            v = code | (offset << 21);
            dbuf_putc(dbuf2, v);
            dbuf_putc(dbuf2, v >> 8);
            dbuf_putc(dbuf2, v >> 16);
            block_end_pos += PROP_BLOCK_LEN;
        }

        v = buf[i];
        code += v + 1;
        bit ^= 1;
        if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
            code += buf[i + 1] + 1;
            bit ^= 1;
            dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
            i += 2;
        } else if (v < 128) {
            dbuf_putc(dbuf, 0x80 + v);
            i++;
        } else if (v < (1 << 13)) {
            dbuf_putc(dbuf, 0x40 + (v >> 8));
            dbuf_putc(dbuf, v);
            i++;
        } else {
            assert(v < (1 << 21));
            dbuf_putc(dbuf, 0x60 + (v >> 16));
            dbuf_putc(dbuf, v >> 8);
            dbuf_putc(dbuf, v);
            i++;
        }
    }

    if (add_index) {
        /* last index entry: the code point following the last run */
        v = code;
        dbuf_putc(dbuf2, v);
        dbuf_putc(dbuf2, v >> 8);
        dbuf_putc(dbuf2, v >> 16);
    }

#ifdef DUMP_TABLE_SIZE
    printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
           (int)(dbuf->size + dbuf2->size));
#endif
    /* step 3: output as C arrays */
    snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
    dump_byte_table(f, cname, dbuf->buf, dbuf->size);
    if (add_index) {
        snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
        dump_byte_table(f, cname, dbuf2->buf, dbuf2->size);
    }

    dbuf_free(dbuf);
    dbuf_free(dbuf1);
    dbuf_free(dbuf2);
}
/* Write to `f` the tables with an index of the four properties Cased1,
   Case_Ignorable, ID_Start and ID_Continue1, in this order. */
void build_flags_tables(FILE *f)
{
    static const int indexed_props[] = {
        PROP_Cased1,
        PROP_Case_Ignorable,
        PROP_ID_Start,
        PROP_ID_Continue1,
    };
    size_t k;

    for (k = 0; k < sizeof(indexed_props) / sizeof(indexed_props[0]); k++)
        build_prop_table(f, indexed_props[k], TRUE);
}
/*
 * Write to `f` the C definition of the string `cname` made of the `len`
 * names of `tab_name`, each one followed by ",<short name>" when the
 * matching entry of `tab_short_name` is not empty and terminated by "\0".
 * One name is written per line and the "\0" parts are aligned in a column.
 */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, const char **tab_short_name)
{
    int idx, width, max_width;
    const char *short_name;

    /* width of the longest "name[,short name]" text */
    max_width = 0;
    for (idx = 0; idx < len; idx++) {
        short_name = tab_short_name[idx];
        width = strlen(tab_name[idx]);
        if (*short_name != '\0')
            width += 1 + strlen(short_name);
        if (width > max_width)
            max_width = width;
    }

    fprintf(f, "static const char %s[] =\n", cname);
    for (idx = 0; idx < len; idx++) {
        short_name = tab_short_name[idx];
        fprintf(f, " \"");
        /* fprintf() returns the number of characters written */
        width = fprintf(f, "%s", tab_name[idx]);
        if (*short_name != '\0')
            width += fprintf(f, ",%s", short_name);
        /* pad with blanks up to the common column */
        fprintf(f, "\"%*s\"\\0\"\n", 1 + max_width - width, "");
    }
    fprintf(f, ";\n\n");
}
/*
 * Write to `f` the UnicodeGCEnum enumeration, the general category name
 * table and the compressed table "unicode_gc_table".
 *
 * The table is a list of runs of code points having the same general
 * category `v`. A run starting with GCAT_Lu in which the category
 * alternates between GCAT_Lu and the next enumerator is stored with the
 * special value 31 when this gives a longer run. With n = run length - 1,
 * a run is stored as:
 *   - (n << 5) | v for n < 7,
 *   - (0xf << 5) | v followed by 1, 2 or 3 length bytes otherwise.
 */
void build_general_category_table(FILE *f)
{
    int i, v, j, n, n1;
    DynBuf table_s, *dbuf = &table_s;
    int cw_count, cw_len_count[4], cw_start;

    fprintf(f, "typedef enum {\n");
    for (i = 0; i < GCAT_COUNT; i++)
        fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]);
    fprintf(f, " UNICODE_GC_COUNT,\n");
    fprintf(f, "} UnicodeGCEnum;\n\n");

    dump_name_table(f, "unicode_gc_name_table",
                    unicode_gc_name, GCAT_COUNT,
                    unicode_gc_short_name);

    dbuf_init(dbuf);
    cw_count = 0;
    memset(cw_len_count, 0, sizeof(cw_len_count));
    i = 0;
    while (i <= CHARCODE_MAX) {
        v = unicode_db[i].general_category;
        for (j = i + 1; j <= CHARCODE_MAX && unicode_db[j].general_category == v; j++)
            ;
        n = j - i;
        /* compress Lu/Ll runs */
        if (v == GCAT_Lu) {
            n1 = 1;
            while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
                n1++;
            }
            if (n1 > n) {
                v = 31;
                n = n1;
            }
        }
        cw_count++;
        n--;
        cw_start = dbuf->size;
        if (n < 7) {
            dbuf_putc(dbuf, (n << 5) | v);
        } else if (n < 7 + 128) {
            n1 = n - 7;
            assert(n1 < 128);
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, n1);
        } else if (n < 7 + 128 + (1 << 14)) {
            n1 = n - (7 + 128);
            assert(n1 < (1 << 14));
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, (n1 >> 8) + 128);
            dbuf_putc(dbuf, n1);
        } else {
            n1 = n - (7 + 128 + (1 << 14));
            assert(n1 < (1 << 22));
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        /* statistics: number of runs stored in 1, 2, 3 and 4 bytes */
        cw_len_count[dbuf->size - cw_start - 1]++;
        i += n + 1;
    }
#ifdef DUMP_TABLE_SIZE
    printf("general category: %d entries [",
           cw_count);
    for (i = 0; i < 4; i++)
        printf(" %d", cw_len_count[i]);
    printf(" ], length=%d bytes\n", (int)dbuf->size);
#endif

    dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);

    dbuf_free(dbuf);
}
/*
 * Write to `f` the UnicodeScriptEnum enumeration, the script name table
 * (without the first script) and the compressed table
 * "unicode_script_table".
 *
 * The table is a list of runs of code points having the same script `v`;
 * a final run with script 0 is not stored. With n = run length - 1 and
 * type = (v != 0), a run is stored as 1, 2 or 3 length bytes whose first
 * byte has `type` in bit 7, followed by the byte `v` when type is set.
 */
void build_script_table(FILE *f)
{
    int i, v, j, n, n1, type;
    DynBuf table_s, *dbuf = &table_s;
    int cw_count, cw_len_count[4], cw_start;

    fprintf(f, "typedef enum {\n");
    for (i = 0; i < SCRIPT_COUNT; i++)
        fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
    fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
    fprintf(f, "} UnicodeScriptEnum;\n\n");

    /* the first script is not present in the name table */
    i = 1;
    dump_name_table(f, "unicode_script_name_table",
                    unicode_script_name + i, SCRIPT_COUNT - i,
                    unicode_script_short_name + i);

    dbuf_init(dbuf);
    cw_count = 0;
    memset(cw_len_count, 0, sizeof(cw_len_count));
    i = 0;
    while (i <= CHARCODE_MAX) {
        v = unicode_db[i].script;
        for (j = i + 1; j <= CHARCODE_MAX && unicode_db[j].script == v; j++)
            ;
        n = j - i;
        if (v == 0 && j == (CHARCODE_MAX + 1))
            break;
        cw_count++;
        n--;
        cw_start = dbuf->size;
        type = (v == 0) ? 0 : 1;
        if (n < 96) {
            dbuf_putc(dbuf, n | (type << 7));
        } else if (n < 96 + (1 << 12)) {
            n1 = n - 96;
            assert(n1 < (1 << 12));
            dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
            dbuf_putc(dbuf, n1);
        } else {
            n1 = n - (96 + (1 << 12));
            assert(n1 < (1 << 20));
            dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        if (type != 0)
            dbuf_putc(dbuf, v);

        /* statistics: number of runs stored in 1, 2, 3 and 4 bytes */
        cw_len_count[dbuf->size - cw_start - 1]++;
        i += n + 1;
    }
#if defined(DUMP_TABLE_SIZE)
    printf("script: %d entries [",
           cw_count);
    for (i = 0; i < 4; i++)
        printf(" %d", cw_len_count[i]);
    printf(" ], length=%d bytes\n", (int)dbuf->size);
#endif

    dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);

    dbuf_free(dbuf);
}
/* Write to 'f' the run-length encoded "unicode_script_ext_table".

   Consecutive code points having the same script extension list are
   merged into one entry made of:
     - the run length minus one (1 byte when < 128, otherwise 2 or 3
       bytes with the high bits of the first byte used as marker),
     - the number of script extensions,
     - the script extension bytes themselves. */
void build_script_ext_table(FILE *f)
{
    DynBuf out_s, *out = &out_s;
    int code, next, run, extra, ext_len, k;
    int entry_count;

    dbuf_init(out);
    entry_count = 0;
    code = 0;
    while (code <= CHARCODE_MAX) {
        ext_len = unicode_db[code].script_ext_len;
        /* extend the run while the extension list is identical */
        next = code + 1;
        while (next <= CHARCODE_MAX &&
               unicode_db[next].script_ext_len == ext_len &&
               !memcmp(unicode_db[next].script_ext,
                       unicode_db[code].script_ext, ext_len)) {
            next++;
        }
        entry_count++;
        run = next - code - 1;      /* stored as "length - 1" */
        if (run < 128) {
            dbuf_putc(out, run);
        } else if (run < 128 + (1 << 14)) {
            extra = run - 128;
            assert(extra < (1 << 14));
            dbuf_putc(out, (extra >> 8) + 128);
            dbuf_putc(out, extra);
        } else {
            extra = run - (128 + (1 << 14));
            assert(extra < (1 << 22));
            dbuf_putc(out, (extra >> 16) + 128 + 64);
            dbuf_putc(out, extra >> 8);
            dbuf_putc(out, extra);
        }
        dbuf_putc(out, ext_len);
        for (k = 0; k < ext_len; k++)
            dbuf_putc(out, unicode_db[code].script_ext[k]);
        code = next;
    }
#ifdef DUMP_TABLE_SIZE
    printf("script_ext: %d entries", entry_count);
    printf(", length=%d bytes\n", (int)out->size);
#endif

    dump_byte_table(f, "unicode_script_ext_table", out->buf, out->size);
    dbuf_free(out);
}
/* number of properties having a generated table: all the properties
   located before PROP_ASCII in the property enumeration */
#define PROP_TABLE_COUNT PROP_ASCII

/* Write to 'f' the property tables, the UnicodePropertyEnum declaration,
   the property name table, and the two arrays giving for each property
   its table pointer and its table length. */
void build_prop_list_table(FILE *f)
{
    int prop;

    for (prop = 0; prop < PROP_TABLE_COUNT; prop++) {
        /* No table is built here for these three properties. The pointer
           array emitted below still references a table for each of them,
           so they are presumably generated by another function. */
        if (prop == PROP_ID_Start ||
            prop == PROP_Case_Ignorable ||
            prop == PROP_ID_Continue1)
            continue;
        build_prop_table(f, prop, FALSE);
    }

    fprintf(f, "typedef enum {\n");
    for (prop = 0; prop < PROP_COUNT; prop++) {
        fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[prop]);
    }
    fprintf(f, " UNICODE_PROP_COUNT,\n");
    fprintf(f, "} UnicodePropertyEnum;\n\n");

    /* only the names from ASCII_Hex_Digit to XID_Start (inclusive) are
       put in the name table */
    prop = PROP_ASCII_Hex_Digit;
    dump_name_table(f, "unicode_prop_name_table",
                    unicode_prop_name + prop, PROP_XID_Start - prop + 1,
                    unicode_prop_short_name + prop);

    fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
    for (prop = 0; prop < PROP_TABLE_COUNT; prop++)
        fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[prop]);
    fprintf(f, "};\n\n");

    fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
    for (prop = 0; prop < PROP_TABLE_COUNT; prop++)
        fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[prop]);
    fprintf(f, "};\n\n");
}
#ifdef USE_TEST
/* Test helper: run the library case conversion lre_case_conv() on the
   code point 'c' with conversion type 'conv_type', store the converted
   code points in 'res' and return the value returned by the library. */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int out_len;

    out_len = lre_case_conv(res, c, conv_type);
    return out_len;
}
/* Self test: for every code point, compare the result of the library
   case conversion with the reference data of unicode_db.

   Conversion type 0 is validated against the upper case data, type 1
   against the lower case data and type 2 against the case folding data.
   A reference entry of length 0 means that the character maps to
   itself. On the first mismatching code point, the reference entry is
   dumped and the program exits with status 1. */
void check_case_conv(void)
{
    CCInfo *tab = unicode_db;
    uint32_t res[3];
    int l, error;
    CCInfo ci_s, *ci = &ci_s;
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        /* work on a copy so that the identity mappings can be filled
           without modifying the database */
        *ci = tab[code];
        if (ci->l_len == 0) {
            ci->l_len = 1;
            ci->l_data[0] = code;
        }
        if (ci->u_len == 0) {
            ci->u_len = 1;
            ci->u_data[0] = code;
        }
        if (ci->f_len == 0) {
            ci->f_len = 1;
            ci->f_data[0] = code;
        }

        error = 0;
        /* the labels name the reference data actually compared: they
           were previously swapped between the two first checks */
        l = check_conv(res, code, 0);
        if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
            printf("ERROR: U\n");
            error++;
        }
        l = check_conv(res, code, 1);
        if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
            printf("ERROR: L\n");
            error++;
        }
        l = check_conv(res, code, 2);
        if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
            printf("ERROR: F\n");
            error++;
        }
        if (error) {
            dump_cc_info(ci, code);
            exit(1);
        }
    }
}
#ifdef PROFILE
/* Return the current value of the CLOCK_MONOTONIC clock in nanoseconds
   (used by the profiling code only). */
static int64_t get_time_ns(void)
{
    struct timespec now;
    int64_t ns;

    clock_gettime(CLOCK_MONOTONIC, &now);
    ns = (int64_t)now.tv_sec * 1000000000;
    ns += now.tv_nsec;
    return ns;
}
#endif
/* Self test: for every code point, check that the library predicates
   (cased, case ignorable, identifier start, identifier continue) return
   the same value as the corresponding property of the database. The
   program exits with status 1 on the first difference. When PROFILE is
   defined, the identifier start predicate is also timed. */
void check_flags(void)
{
    int code;
    BOOL expected, actual;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        expected = get_prop(code, PROP_Cased);
        actual = lre_is_cased(code);
        if (actual != expected) {
            printf("ERROR: c=%05x cased=%d ref=%d\n",
                   code, actual, expected);
            exit(1);
        }

        expected = get_prop(code, PROP_Case_Ignorable);
        actual = lre_is_case_ignorable(code);
        if (actual != expected) {
            printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
                   code, actual, expected);
            exit(1);
        }

        expected = get_prop(code, PROP_ID_Start);
        actual = lre_is_id_start(code);
        if (actual != expected) {
            printf("ERROR: c=%05x id_start=%d ref=%d\n",
                   code, actual, expected);
            exit(1);
        }

        expected = get_prop(code, PROP_ID_Continue);
        actual = lre_is_id_continue(code);
        if (actual != expected) {
            printf("ERROR: c=%05x id_cont=%d ref=%d\n",
                   code, actual, expected);
            exit(1);
        }
    }
#ifdef PROFILE
    {
        int64_t elapsed, count;

        elapsed = get_time_ns();
        count = 0;
        for (code = 0x20; code <= 0xffff; code++) {
            expected = get_prop(code, PROP_ID_Start);
            actual = lre_is_id_start(code);
            assert(actual == expected);
            count++;
        }
        elapsed = get_time_ns() - elapsed;
        printf("flags time=%0.1f ns/char\n",
               (double)elapsed / count);
    }
#endif
}
#endif
/* a new index entry is emitted each time the size of the combining
   class table crosses a multiple of CC_BLOCK_LEN bytes */
#define CC_BLOCK_LEN 32

/* Write to 'f' the compressed combining class table "unicode_cc_table"
   and its index "unicode_cc_index".

   The code points are split in runs of one of four kinds:
     0: constant combining class, the class is stored after the header
     1: combining class incremented by one at each code point, the first
        class is stored after the header
     2: constant combining class 0 (no class byte)
     3: constant combining class 230 (no class byte)
   The header holds the kind in bits 6-7 and the run length minus one in
   the remaining bits, extended to 2 or 3 bytes for long runs. A final
   run of class 0 reaching CHARCODE_MAX is not emitted.

   Each index entry is a 24 bit little endian value: the first code
   point of the entry in the low 21 bits and, above, the number of table
   bytes written past the block boundary. The index is terminated by
   the code point at which the encoding stopped. */
void build_cc_table(FILE *f)
{
    DynBuf table_s, *table = &table_s;
    DynBuf index_s, *index_buf = &index_s;
    int code, cc, run, kind, extra, k;
    int entry_count, len_hist[3], entry_start, block_end_pos;
    uint32_t v;

    dbuf_init(table);
    dbuf_init(index_buf);
    entry_count = 0;
    for (k = 0; k < countof(len_hist); k++)
        len_hist[k] = 0;
    block_end_pos = CC_BLOCK_LEN;

    code = 0;
    while (code <= CHARCODE_MAX) {
        cc = unicode_db[code].combining_class;
        assert(cc <= 255);
        /* first try an incrementing run: cc, cc + 1, cc + 2, ... */
        run = 1;
        while ((code + run) <= CHARCODE_MAX &&
               unicode_db[code + run].combining_class == (cc + run))
            run++;
        if (run >= 2) {
            kind = 1;
        } else {
            /* otherwise use a constant run */
            kind = 0;
            run = 1;
            while ((code + run) <= CHARCODE_MAX &&
                   unicode_db[code + run].combining_class == cc)
                run++;
        }
        /* no need to encode the last run if it has the class 0 */
        if (cc == 0 && (code + run - 1) == CHARCODE_MAX)
            break;
#ifdef DUMP_CC_TABLE
        printf("%05x %6d %d %d\n", code, run, kind, cc);
#endif
        /* dedicated kinds for the two classes handled without class byte */
        if (kind == 0) {
            if (cc == 0)
                kind = 2;
            else if (cc == 230)
                kind = 3;
        }
        extra = run - 1;

        if (table->size >= block_end_pos) {
            v = code | ((table->size - block_end_pos) << 21);
            dbuf_putc(index_buf, v);
            dbuf_putc(index_buf, v >> 8);
            dbuf_putc(index_buf, v >> 16);
            block_end_pos += CC_BLOCK_LEN;
        }

        entry_start = table->size;
        if (extra < 48) {
            dbuf_putc(table, extra | (kind << 6));
        } else if (extra < 48 + (1 << 11)) {
            extra -= 48;
            dbuf_putc(table, ((extra >> 8) + 48) | (kind << 6));
            dbuf_putc(table, extra);
        } else {
            extra -= 48 + (1 << 11);
            assert(extra < (1 << 20));
            dbuf_putc(table, ((extra >> 16) + 56) | (kind << 6));
            dbuf_putc(table, extra >> 8);
            dbuf_putc(table, extra);
        }
        /* histogram of the header sizes (1 to 3 bytes) */
        len_hist[table->size - entry_start - 1]++;
        if (kind == 0 || kind == 1)
            dbuf_putc(table, cc);
        entry_count++;
        code += run;
    }

    /* final index entry */
    v = code;
    dbuf_putc(index_buf, v);
    dbuf_putc(index_buf, v >> 8);
    dbuf_putc(index_buf, v >> 16);

    dump_byte_table(f, "unicode_cc_table", table->buf, table->size);
    dump_byte_table(f, "unicode_cc_index", index_buf->buf, index_buf->size);

#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
    printf("CC table: size=%d (%d entries) [",
           (int)(table->size + index_buf->size),
           entry_count);
    for (k = 0; k < countof(len_hist); k++)
        printf(" %d", len_hist[k]);
    printf(" ]\n");
#endif
    dbuf_free(table);
    dbuf_free(index_buf);
}
#ifndef USE_TEST
typedef
enum
{
DECOMP_TYPE_C1,
DECOMP_TYPE_L1,
DECOMP_TYPE_L2,
DECOMP_TYPE_L3,
DECOMP_TYPE_L4,
DECOMP_TYPE_L5,
DECOMP_TYPE_L6,
DECOMP_TYPE_L7,
DECOMP_TYPE_LL1,
DECOMP_TYPE_LL2,
DECOMP_TYPE_S1,
DECOMP_TYPE_S2,
DECOMP_TYPE_S3,
DECOMP_TYPE_S4,
DECOMP_TYPE_S5,
DECOMP_TYPE_I1,
DECOMP_TYPE_I2_0,
DECOMP_TYPE_I2_1,
DECOMP_TYPE_I3_1,
DECOMP_TYPE_I3_2,
DECOMP_TYPE_I4_1,
DECOMP_TYPE_I4_2,
DECOMP_TYPE_B1,
DECOMP_TYPE_B2,
DECOMP_TYPE_B3,
DECOMP_TYPE_B4,
DECOMP_TYPE_B5,
DECOMP_TYPE_B6,
DECOMP_TYPE_B7,
DECOMP_TYPE_B8,
DECOMP_TYPE_B18,
DECOMP_TYPE_LS2,
DECOMP_TYPE_PAT3,
DECOMP_TYPE_S2_UL,
DECOMP_TYPE_LS2_UL,
} DecompTypeEnum;
#endif
/* Printable names of the decomposition run types, in the same order as
   the DECOMP_TYPE_x enumeration (used by the debug dump). */
const char *decomp_type_str[] = {
    "C1",
    "L1", "L2", "L3", "L4", "L5", "L6", "L7",
    "LL1", "LL2",
    "S1", "S2", "S3", "S4", "S5",
    "I1",
    "I2_0", "I2_1",
    "I3_1", "I3_2",
    "I4_1", "I4_2",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8",
    "B18",
    "LS2",
    "PAT3",
    "S2_UL",
    "LS2_UL",
};
/* Description of the "incrementing" run types, indexed by the
   decomposition length minus one. In each row, the first element is the
   first run type for this length and the following elements are the
   positions of the incremented code point, terminated by -1 (the run
   type of the n-th listed position is the first type plus n). */
const int decomp_incr_tab[4][4] = {
    { DECOMP_TYPE_I1,   0, -1,  0 },
    { DECOMP_TYPE_I2_0, 0,  1, -1 },
    { DECOMP_TYPE_I3_1, 1,  2, -1 },
    { DECOMP_TYPE_I4_1, 1,  2, -1 },
};
/* Description of one run of the decomposition table. The array of
   DecompEntry is indexed by code point: an entry with len != 0 is the
   start of a run. */
typedef
struct
{
/* first code point of the run */
int
code;
/* number of consecutive code points covered by the run */
uint8_t len;
/* encoding of the run (DECOMP_TYPE_x value) */
uint8_t type;
/* number of code points in each decomposition of the run */
uint8_t c_len;
/* base code point subtracted from the stored bytes (only set for the
   DECOMP_TYPE_Bx encodings) */
uint16_t c_min;
/* offset of the run data in the data buffer, or the decomposed code
   point itself for DECOMP_TYPE_C1 */
uint16_t data_index;
/* size estimate of this run plus the 'cost' of the entry following the
   run; find_decomp_run() keeps the candidate with the smallest value */
int
cost;
} DecompEntry;
/* Return the size estimate in bytes of the run 'de': a fixed overhead
   of 6 bytes plus the size of the data stored for its encoding. Abort
   if the run type is unknown. */
int get_decomp_run_size(const DecompEntry *de)
{
    int type = de->type;
    int count = de->len;     /* code points in the run */
    int clen = de->c_len;    /* code points per decomposition */
    int payload;

    if (type <= DECOMP_TYPE_C1)
        payload = 0;                            /* stored in the index */
    else if (type <= DECOMP_TYPE_L7)
        payload = count * clen * 2;             /* 16 bits per code point */
    else if (type <= DECOMP_TYPE_LL2)
        payload = (count * clen * 18 + 7) / 8;  /* 18 bits per code point */
    else if (type <= DECOMP_TYPE_S5)
        payload = count * clen;                 /* 1 byte per code point */
    else if (type <= DECOMP_TYPE_I4_2)
        payload = clen * 2;                     /* first decomposition only */
    else if (type <= DECOMP_TYPE_B18)
        payload = 2 + count * clen;             /* base + 1 byte offsets */
    else if (type <= DECOMP_TYPE_LS2)
        payload = count * 3;                    /* 16 bits + 1 byte */
    else if (type <= DECOMP_TYPE_PAT3)
        payload = 4 + count * 2;                /* 2 common + 1 per entry */
    else if (type <= DECOMP_TYPE_S2_UL)
        payload = count;                        /* 2 bytes per pair */
    else if (type <= DECOMP_TYPE_LS2_UL)
        payload = (count / 2) * 3;              /* 3 bytes per pair */
    else
        abort();
    return 6 + payload;
}
/* additional code points having a short code */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Return the one byte "short code" of the code point 'c' or -1 if it
   has none:
     c < 0x80               -> c
     0x300 <= c < 0x350     -> 0x80 + (c - 0x300)
     unicode_short_table[k] -> 0x80 + 0x50 + k */
int get_short_code(int c)
{
    int k;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return c - 0x300 + 0x80;
    for (k = 0; k < countof(unicode_short_table); k++) {
        if (unicode_short_table[k] == c)
            return k + 0x80 + 0x50;
    }
    return -1;
}
/* Return TRUE if get_short_code() gives a non negative code for 'code'. */
static BOOL is_short(int code)
{
    int short_code;

    short_code = get_short_code(code);
    return short_code >= 0;
}
/* Return TRUE if each of the 'len' code points of 'tab' satisfies
   is_short() (always TRUE when len <= 0). */
static BOOL is_short_tab(const int *tab, int len)
{
    const int *p = tab;
    const int *end = tab + len;

    while (p < end) {
        if (!is_short(*p))
            return FALSE;
        p++;
    }
    return TRUE;
}
/* Return TRUE if none of the 'len' code points of 'tab' is larger than
   0xffff (always TRUE when len <= 0). */
static BOOL is_16bit(const int *tab, int len)
{
    int k = 0;

    while (k < len) {
        if (tab[k] > 0xffff)
            return FALSE;
        k++;
    }
    return TRUE;
}
/* Simplified lower case mapping used to detect upper/lower pairs: add
   0x20 to the code points below 0x100 and to the range 0x410..0x42f,
   add 1 to all the other ones. */
static uint32_t to_lower_simple(uint32_t c)
{
    uint32_t delta;

    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
        delta = 0x20;
    else
        delta = 1;
    return c + delta;
}
/* Evaluate one candidate run: 'n' code points starting at 'i', encoded
   with 'type', with 'c_len' code points per decomposition. 'de' is a
   scratch entry (its other fields are left untouched). The candidate
   replaces tab_de[i] only when its cost is strictly smaller. */
static void decomp_run_try(DecompEntry *tab_de, DecompEntry *de,
                           int i, int n, int type, int c_len)
{
    de->code = i;
    de->len = n;
    de->type = type;
    de->c_len = c_len;
    de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
    if (de->cost < tab_de[i].cost)
        tab_de[i] = *de;
}

/* Find the cheapest run starting at the code point 'i' and store it in
   tab_de[i]. The entries after 'i' must already be computed because the
   cost of a run includes the cost of the entry following it.

   When 'i' has no decomposition, only the cost of the next entry is
   propagated. Otherwise every possible encoding is tried with every
   possible run length, in a fixed order; as a candidate only wins with
   a strictly smaller cost, the first one found is kept on a tie. */
void find_decomp_run(DecompEntry *tab_de, int i)
{
    DecompEntry cand_s, *cand = &cand_s;
    CCInfo *ci, *nci, *nci2;
    int l, j, n, len_max;

    ci = &unicode_db[i];
    l = ci->decomp_len;
    if (l == 0) {
        tab_de[i].cost = tab_de[i + 1].cost;
        return;
    }

    /* Maximum run length. NOTE(review): the limit of 64 for the
       canonical, non excluded pairs presumably comes from the 6 bit
       offset used by the composition table - confirm. */
    if (!ci->is_compat && !ci->is_excluded && l == 2)
        len_max = 64;
    else
        len_max = 127;

    tab_de[i].cost = 0x7fffffff;

    /* code points above 0xffff: only the 18 bit encoding is possible */
    if (!is_16bit(ci->decomp_data, l)) {
        assert(l <= 2);
        for (n = 1; ; n++) {
            decomp_run_try(tab_de, cand, i, n, DECOMP_TYPE_LL1 + l - 1, l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            /* code points without decomposition can be part of the run */
            if (!(nci->decomp_len == 0 ||
                  (nci->decomp_len == l &&
                   nci->is_compat == ci->is_compat)))
                break;
        }
        return;
    }

    /* 16 bit encoding (or direct encoding for a single code point) */
    if (l <= 7) {
        for (n = 1; ; n++) {
            int type;

            if (l == 1 && n == 1) {
                type = DECOMP_TYPE_C1;
            } else {
                assert(l <= 8);
                type = DECOMP_TYPE_L1 + l - 1;
            }
            decomp_run_try(tab_de, cand, i, n, type, l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == 0 ||
                  (nci->decomp_len == l &&
                   nci->is_compat == ci->is_compat &&
                   is_16bit(nci->decomp_data, l))))
                break;
        }
    }

    /* byte offsets from a common base */
    if (l <= 8 || l == 18) {
        int c_min, c_max, c;

        c_min = c_max = -1;
        n = 1;
        for (;;) {
            /* add the code points of the last entry of the run to the
               range (the space has a dedicated value and is ignored) */
            nci = &unicode_db[i + n - 1];
            for (j = 0; j < l; j++) {
                c = nci->decomp_data[j];
                if (c == 0x20)
                    continue;
                if (c_min == -1) {
                    c_min = c_max = c;
                } else {
                    c_min = min_int(c_min, c);
                    c_max = max_int(c_max, c);
                }
            }
            if ((c_max - c_min) > 254)
                break;
            cand->c_min = c_min;
            decomp_run_try(tab_de, cand, i, n,
                           (l == 18) ? DECOMP_TYPE_B18 : DECOMP_TYPE_B1 + l - 1,
                           l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == l &&
                  nci->is_compat == ci->is_compat))
                break;
            n++;
        }
    }

    /* short codes */
    if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
        for (n = 1; ; n++) {
            decomp_run_try(tab_de, cand, i, n, DECOMP_TYPE_S1 + l - 1, l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == 0 ||
                  (nci->decomp_len == l &&
                   nci->is_compat == ci->is_compat &&
                   is_short_tab(nci->decomp_data, l))))
                break;
        }
    }

    /* one code point incremented at each entry, the other ones constant */
    if (l <= 4) {
        int idx1, idx, same;

        for (idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
            for (n = 1; ; n++) {
                decomp_run_try(tab_de, cand, i, n,
                               decomp_incr_tab[l - 1][0] + idx1 - 1, l);
                if (!((i + n) <= CHARCODE_MAX && n < len_max))
                    break;
                nci = &unicode_db[i + n];
                if (!(nci->decomp_len == l &&
                      nci->is_compat == ci->is_compat))
                    break;
                same = TRUE;
                for (j = 0; j < l; j++) {
                    if (j == idx) {
                        if (nci->decomp_data[j] != ci->decomp_data[j] + n)
                            same = FALSE;
                    } else {
                        if (nci->decomp_data[j] != ci->decomp_data[j])
                            same = FALSE;
                    }
                    if (!same)
                        break;
                }
                if (!same)
                    break;
            }
        }
    }

    /* 3 code points with identical first and last code points */
    if (l == 3) {
        for (n = 1; ; n++) {
            decomp_run_try(tab_de, cand, i, n, DECOMP_TYPE_PAT3, l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == l &&
                  nci->is_compat == ci->is_compat &&
                  nci->decomp_data[1] <= 0xffff &&
                  nci->decomp_data[0] == ci->decomp_data[0] &&
                  nci->decomp_data[l - 1] == ci->decomp_data[l - 1]))
                break;
        }
    }

    /* 2 code points: 16 bit value followed by a short code */
    if (l == 2 && is_short(ci->decomp_data[1])) {
        for (n = 1; ; n++) {
            decomp_run_try(tab_de, cand, i, n, DECOMP_TYPE_LS2, l);
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == 0 ||
                  (nci->decomp_len == l &&
                   nci->is_compat == ci->is_compat &&
                   nci->decomp_data[0] <= 0xffff &&
                   is_short(nci->decomp_data[1]))))
                break;
        }
    }

    /* pairs of entries: the first code point of the second entry is
       to_lower_simple() of the first code point of the first entry */
    if (l == 2) {
        BOOL wide;  /* TRUE once a first code point has no short code */

        n = 0;
        wide = FALSE;
        for (;;) {
            if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
                break;
            nci = &unicode_db[i + n];
            if (!(nci->decomp_len == l &&
                  nci->is_compat == ci->is_compat &&
                  is_short(nci->decomp_data[1])))
                break;
            if (!wide && !is_short(nci->decomp_data[0]))
                wide = TRUE;
            nci2 = &unicode_db[i + n + 1];
            if (!(nci2->decomp_len == l &&
                  nci2->is_compat == ci->is_compat &&
                  nci2->decomp_data[0] == to_lower_simple(nci->decomp_data[0]) &&
                  nci2->decomp_data[1] == nci->decomp_data[1]))
                break;
            n += 2;
            decomp_run_try(tab_de, cand, i, n, DECOMP_TYPE_S2_UL + wide, l);
        }
    }
}
/* Store the 16 bit value 'c' in little endian order at data_buf[*pidx]
   and advance *pidx by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *dst = data_buf + *pidx;

    dst[0] = (uint8_t)(c & 0xff);
    dst[1] = (uint8_t)(c >> 8);
    *pidx += 2;
}
/* Append to 'data_buf', at the offset *pidx, the data of the run 'de'
   using the encoding selected by de->type, set de->data_index and
   update *pidx. The number of bytes written for each encoding is the
   data size counted by get_decomp_run_size().

   In the encodings accepting them, the code points of the run having no
   decomposition are stored as zeros. For DECOMP_TYPE_C1 nothing is
   written: data_index receives the decomposed code point instead of the
   offset. Abort if the run type is unknown. */
void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
{
    int n, pos, out, cp;
    CCInfo *ci;

    out = *pidx;
    de->data_index = out;
    if (de->type <= DECOMP_TYPE_C1) {
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 1);
        de->data_index = ci->decomp_data[0];
    } else if (de->type <= DECOMP_TYPE_L7) {
        /* 16 bits per code point */
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            for (pos = 0; pos < de->c_len; pos++) {
                cp = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                put16(data_buf, &out, cp);
            }
        }
    } else if (de->type <= DECOMP_TYPE_LL2) {
        /* 18 bits per code point: all the low 16 bit parts first, then
           the 2 high bits of each code point packed 4 per byte */
        int total_bytes, high_base, slot;

        total_bytes = (de->len * de->c_len * 18 + 7) / 8;
        high_base = de->len * de->c_len * 2;
        memset(data_buf + out, 0, total_bytes);
        slot = 0;
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            for (pos = 0; pos < de->c_len; pos++) {
                cp = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                data_buf[out + slot * 2] = cp;
                data_buf[out + slot * 2 + 1] = cp >> 8;
                data_buf[out + high_base + (slot / 4)] |=
                    (cp >> 16) << ((slot % 4) * 2);
                slot++;
            }
        }
        out += total_bytes;
    } else if (de->type <= DECOMP_TYPE_S5) {
        /* one short code per code point */
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            for (pos = 0; pos < de->c_len; pos++) {
                cp = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                cp = get_short_code(cp);
                assert(cp >= 0);
                data_buf[out++] = cp;
            }
        }
    } else if (de->type <= DECOMP_TYPE_I4_2) {
        /* only the decomposition of the first code point is stored */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == de->c_len);
        for (pos = 0; pos < de->c_len; pos++)
            put16(data_buf, &out, ci->decomp_data[pos]);
    } else if (de->type <= DECOMP_TYPE_B18) {
        /* 16 bit base followed by one byte per code point: the offset
           from the base, or 0xff for the space character */
        cp = de->c_min;
        data_buf[out++] = cp;
        data_buf[out++] = cp >> 8;
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            for (pos = 0; pos < de->c_len; pos++) {
                assert(ci->decomp_len == de->c_len);
                cp = ci->decomp_data[pos];
                if (cp == 0x20) {
                    cp = 0xff;
                } else {
                    cp -= de->c_min;
                    assert((uint32_t)cp <= 254);
                }
                data_buf[out++] = cp;
            }
        }
    } else if (de->type <= DECOMP_TYPE_LS2) {
        /* 16 bit value followed by a short code */
        int first, second;

        assert(de->c_len == 2);
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            if (ci->decomp_len == 0) {
                first = 0;
                second = 0;
            } else {
                first = ci->decomp_data[0];
                second = ci->decomp_data[1];
            }
            put16(data_buf, &out, first);
            second = get_short_code(second);
            assert(second >= 0);
            data_buf[out++] = second;
        }
    } else if (de->type <= DECOMP_TYPE_PAT3) {
        /* common first and last code points, then the middle code
           point of each entry */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 3);
        put16(data_buf, &out, ci->decomp_data[0]);
        put16(data_buf, &out, ci->decomp_data[2]);
        for (n = 0; n < de->len; n++) {
            ci = &unicode_db[de->code + n];
            assert(ci->decomp_len == 3);
            put16(data_buf, &out, ci->decomp_data[1]);
        }
    } else if (de->type <= DECOMP_TYPE_S2_UL) {
        /* two short codes for the first entry of each pair */
        for (n = 0; n < de->len; n += 2) {
            ci = &unicode_db[de->code + n];
            cp = get_short_code(ci->decomp_data[0]);
            assert(cp >= 0);
            data_buf[out++] = cp;
            cp = get_short_code(ci->decomp_data[1]);
            assert(cp >= 0);
            data_buf[out++] = cp;
        }
    } else if (de->type <= DECOMP_TYPE_LS2_UL) {
        /* 16 bit value and short code for the first entry of each pair */
        for (n = 0; n < de->len; n += 2) {
            ci = &unicode_db[de->code + n];
            cp = ci->decomp_data[0];
            put16(data_buf, &out, cp);
            cp = get_short_code(ci->decomp_data[1]);
            assert(cp >= 0);
            data_buf[out++] = cp;
        }
    } else {
        abort();
    }
    *pidx = out;
}
#if 0
/* Debug helper (disabled): print every decomposition code point larger
   than 0xffff. */
void dump_large_char(void)
{
    int code, pos;
    CCInfo *ci;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        ci = &unicode_db[code];
        for (pos = 0; pos < ci->decomp_len; pos++) {
            if (ci->decomp_data[pos] <= 0xffff)
                continue;
            printf("%05x\n", ci->decomp_data[pos]);
        }
    }
}
#endif
/* Forward declaration: the function is defined after
   build_decompose_table(), which calls it with its table of runs. */
void
build_compose_table(
FILE
*f,
const
DecompEntry *tab_de);
void
build_decompose_table(
FILE
*f)
{
int
i, array_len, code_max, data_len, count;
DecompEntry *tab_de, de_s, *de = &de_s;
uint8_t *data_buf;
code_max = CHARCODE_MAX;
tab_de = mallocz((code_max + 2) *
sizeof
(*tab_de));
for
(i = code_max; i >= 0; i--) {
find_decomp_run(tab_de, i);
}
data_buf =
malloc
(100000);
data_len = 0;
array_len = 0;
for
(i = 0; i <= code_max; i++) {
de = &tab_de[i];
if
(de->len != 0) {
add_decomp_data(data_buf, &data_len, de);
i += de->len - 1;
array_len++;
}
}
#ifdef DUMP_DECOMP_TABLE
{
int
size, size1;
printf
(
"START LEN TYPE L C SIZE\n"
);
size = 0;
for
(i = 0; i <= code_max; i++) {
de = &tab_de[i];
if
(de->len != 0) {
size1 = get_decomp_run_size(de);
printf
(
"%05x %3d %6s %2d %1d %4d\n"
, i, de->len,
decomp_type_str[de->type], de->c_len,
unicode_db[i].is_compat, size1);
i += de->len - 1;
size += size1;
}
}
printf
(
"array_len=%d estimated size=%d bytes actual=%d bytes\n"
,
array_len, size, array_len * 6 + data_len);
}
#endif
fprintf
(f,
"static const uint32_t unicode_decomp_table1[%u] = {"
,
array_len);
count = 0;
for
(i = 0; i <= code_max; i++) {
de = &tab_de[i];
if
(de->len != 0) {
uint32_t v;
if
(count++ % 4 == 0)
fprintf
(f,
"\n "
);
v = (de->code << (32 - 18)) |
(de->len << (32 - 18 - 7)) |
(de->type << (32 - 18 - 7 - 6)) |
unicode_db[de->code].is_compat;
fprintf
(f,
" 0x%08x,"
, v);
i += de->len - 1;
}
}
fprintf
(f,
"\n};\n\n"
);
fprintf
(f,
"static const uint16_t unicode_decomp_table2[%u] = {"
,
array_len);
count = 0;
for
(i = 0; i <= code_max; i++) {
de = &tab_de[i];
if
(de->len != 0) {
if
(count++ % 8 == 0)
fprintf
(f,
"\n "
);
fprintf
(f,
" 0x%04x,"
, de->data_index);
i += de->len - 1;
}
}
fprintf
(f,
"\n};\n\n"
);
fprintf
(f,
"static const uint8_t unicode_decomp_data[%u] = {"
,
data_len);
for
(i = 0; i < data_len; i++) {
if
(i % 8 == 0)
fprintf
(f,
"\n "
);
fprintf
(f,
" 0x%02x,"
, data_buf[i]);
}
fprintf
(f,
"\n};\n\n"
);
build_compose_table(f, tab_de);
free
(data_buf);
free
(tab_de);
}
/* One composition: the pair of code points c[0], c[1] composes to the
   code point p. */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;

/* capacity of the composition entry array */
#define COMPOSE_LEN_MAX 10000

/* qsort() comparison function for ComposeEntry: order by c[0], then by
   c[1]; the field 'p' is ignored. Returns -1, 0 or 1. */
static int ce_cmp(const void *p1, const void *p2)
{
    const ComposeEntry *a = p1;
    const ComposeEntry *b = p2;

    if (a->c[0] != b->c[0])
        return (a->c[0] < b->c[0]) ? -1 : 1;
    if (a->c[1] != b->c[1])
        return (a->c[1] < b->c[1]) ? -1 : 1;
    return 0;
}
/* Return the position of the code point 'c' in the decomposition
   table: (rank of the run containing it << 6) | offset in the run, or
   -1 if no run contains it. The offset must be smaller than 64 and the
   result must fit in 16 bits. */
static int get_decomp_pos(const DecompEntry *tab_de, int c)
{
    const DecompEntry *de;
    int code, run_rank, offset, packed;

    run_rank = 0;
    code = 0;
    while (code <= CHARCODE_MAX) {
        de = &tab_de[code];
        if (de->len == 0) {
            /* not the start of a run */
            code++;
            continue;
        }
        if (c >= de->code && c < de->code + de->len) {
            offset = c - de->code;
            assert(offset < 64);
            packed = offset | (run_rank << 6);
            assert(packed < 65536);
            return packed;
        }
        /* go to the entry following the run */
        code += de->len;
        run_rank++;
    }
    return -1;
}
/* Write to 'f' the composition table "unicode_comp_table".

   The table has one 16 bit entry per canonical (non compatibility),
   non excluded decomposition of exactly two code points. The entries
   are sorted by the decomposed pair and each one holds the position of
   the composed code point in the decomposition table, as returned by
   get_decomp_pos(). 'tab_de' is the table of decomposition runs.

   The program exits with status 1 if the memory allocation fails or if
   a composed code point has no entry in the decomposition table. */
void build_compose_table(FILE *f, const DecompEntry *tab_de)
{
    int i, v, tab_ce_len;
    ComposeEntry *ce, *tab_ce;

    tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
    if (!tab_ce) {
        fprintf(stderr, "build_compose_table: out of memory\n");
        exit(1);
    }
    tab_ce_len = 0;
    for (i = 0; i <= CHARCODE_MAX; i++) {
        CCInfo *ci = &unicode_db[i];
        if (ci->decomp_len == 2 && !ci->is_compat &&
            !ci->is_excluded) {
            assert(tab_ce_len < COMPOSE_LEN_MAX);
            ce = &tab_ce[tab_ce_len++];
            ce->c[0] = ci->decomp_data[0];
            ce->c[1] = ci->decomp_data[1];
            ce->p = i;
        }
    }
    qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);

#if 0
    {
        printf("tab_ce_len=%d\n", tab_ce_len);
        for (i = 0; i < tab_ce_len; i++) {
            ce = &tab_ce[i];
            printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
        }
    }
#endif

    fprintf(f, "static const uint16_t unicode_comp_table[%u] = {",
            tab_ce_len);
    for (i = 0; i < tab_ce_len; i++) {
        if (i % 8 == 0)
            fprintf(f, "\n ");
        v = get_decomp_pos(tab_de, tab_ce[i].p);
        if (v < 0) {
            printf("ERROR: entry for c=%04x not found\n",
                   tab_ce[i].p);
            exit(1);
        }
        fprintf(f, " 0x%04x,", v);
    }
    fprintf(f, "\n};\n\n");

    free(tab_ce);
}
#ifdef USE_TEST
/* Self test: for every code point, in canonical (is_compat = 0) then in
   compatibility (is_compat = 1) mode, compare the decomposition
   returned by the library with the database. In canonical mode, a
   compatibility decomposition is expected to give a length of 0. The
   program exits with status 1 on the first difference. */
void check_decompose_table(void)
{
    int c;
    CCInfo *ci;
    int res[UNICODE_DECOMP_LEN_MAX], *ref;
    int len, ref_len, is_compat;

    for (is_compat = 0; is_compat <= 1; is_compat++) {
        /* CHARCODE_MAX is included, as in all the other checks (it was
           previously skipped by a strict comparison) */
        for (c = 0; c <= CHARCODE_MAX; c++) {
            ci = &unicode_db[c];
            ref_len = ci->decomp_len;
            ref = ci->decomp_data;
            if (!is_compat && ci->is_compat) {
                ref_len = 0;
            }
            len = unicode_decomp_char((uint32_t *)res, c, is_compat);
            if (len != ref_len ||
                tabcmp(res, ref, ref_len) != 0) {
                printf("ERROR c=%05x compat=%d\n", c, is_compat);
                dump_str("res", res, len);
                dump_str("ref", ref, ref_len);
                exit(1);
            }
        }
    }
}
/* Self test: check that every canonical, non excluded decomposition of
   two code points is composed back to the original code point by the
   library. The program exits with status 1 on the first difference. */
void check_compose_table(void)
{
    int code, composed;
    CCInfo *ci;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        ci = &unicode_db[code];
        if (ci->decomp_len != 2 || ci->is_compat || ci->is_excluded)
            continue;
        composed = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
        if (composed != code) {
            printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
                   ci->decomp_data[0], ci->decomp_data[1], composed, code);
            exit(1);
        }
    }
}
#endif
#ifdef USE_TEST
/* Compare the result 'buf1' (len1 code points) with the reference
   'buf2' (len2 code points). If they differ, print the test number
   'num', the test name 'msg', the input string 'in_buf' (in_len code
   points) and both strings, then exit with status 1. */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    if (len1 == len2 && tabcmp(buf1, buf2, len1) == 0)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
/* Self test: check that the library returns, for every code point, the
   combining class stored in the database. The program exits with
   status 1 on the first difference. When PROFILE is defined, the
   lookup is also timed. */
void check_cc_table(void)
{
    int code, expected, actual;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        expected = unicode_db[code].combining_class;
        actual = unicode_get_cc(code);
        if (actual != expected) {
            printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
                   code, actual, expected);
            exit(1);
        }
    }
#ifdef PROFILE
    {
        int64_t elapsed, count;

        elapsed = get_time_ns();
        count = 0;
        for (code = 0x20; code <= 0xffff; code++) {
            expected = unicode_db[code].combining_class;
            actual = unicode_get_cc(code);
            count++;
        }
        elapsed = get_time_ns() - elapsed;
        printf("cc time=%0.1f ns/char\n",
               (double)elapsed / count);
    }
#endif
}
/* Self test: run the normalization conformance file 'filename'.

   Each line which does not start (after optional spaces) with '#' or
   '@' holds ';' separated fields of hexadecimal code points: the input
   string followed by its expected NFC, NFD, NFKC and NFKD forms. The
   input string is normalized with the library to each form and
   compared with the expected one. The program exits with status 1 if
   the file cannot be opened or on the first difference. */
void normalization_test(const char *filename)
{
    FILE *f;
    char line[4096], *p;
    int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
    int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
    int *buf, buf_len, pos;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    pos = 0;
    for (;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        pos++;
        p = line;
        /* the argument of isspace() must be representable as an
           unsigned char: a plain 'char' may be negative for the non
           ASCII bytes of the file */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@')
            continue;
        in_str = get_field_str(&in_len, p, 0);
        nfc_str = get_field_str(&nfc_len, p, 1);
        nfd_str = get_field_str(&nfd_len, p, 2);
        nfkc_str = get_field_str(&nfkc_len, p, 3);
        nfkd_str = get_field_str(&nfkd_len, p, 4);

        buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
        check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
        free(buf);

        buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
        check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
        free(buf);

        buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
        check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
        free(buf);

        buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
        check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
        free(buf);

        free(in_str);
        free(nfc_str);
        free(nfd_str);
        free(nfkc_str);
        free(nfkd_str);
    }
    fclose(f);
}
#endif
/* Entry point: "unicode_gen unicode_db_path [output_file]".

   The Unicode data files located in 'unicode_db_path' are parsed into
   unicode_db. Without 'output_file', the self tests are run (only
   available when the program is compiled with USE_TEST). Otherwise the
   compressed tables are written to 'output_file'; the program exits
   with status 1 if the file cannot be opened or if writing it fails. */
int main(int argc, char **argv)
{
    const char *unicode_db_path, *outfilename;
    char filename[1024];

    if (argc < 2) {
        printf("usage: %s unicode_db_path [output_file]\n"
               "\n"
               "If no output_file is given, a self test is done using the current unicode library\n",
               argv[0]);
        exit(1);
    }
    unicode_db_path = argv[1];
    outfilename = NULL;
    if (argc >= 3)
        outfilename = argv[2];

    unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));

    snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
    parse_unicode_data(filename);

    snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
    parse_special_casing(unicode_db, filename);

    snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
    parse_case_folding(unicode_db, filename);

    snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
    parse_composition_exclusions(filename);

    snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
    parse_derived_core_properties(filename);

    snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
    parse_derived_norm_properties(filename);

    snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
    parse_prop_list(filename);

    snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
    parse_scripts(filename);

    snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
             unicode_db_path);
    parse_script_extensions(filename);

    /* the emoji data file is read with the property list parser */
    snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
             unicode_db_path);
    parse_prop_list(filename);

    build_conv_table(unicode_db);

#ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
    dump_case_folding_special_cases(unicode_db);
#endif

    if (!outfilename) {
#ifdef USE_TEST
        check_case_conv();
        check_flags();
        check_decompose_table();
        check_compose_table();
        check_cc_table();
        snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
        normalization_test(filename);
#else
        fprintf(stderr, "Tests are not compiled\n");
        exit(1);
#endif
    } else {
        int write_failed;
        FILE *fo = fopen(outfilename, "wb");

        if (!fo) {
            perror(outfilename);
            exit(1);
        }
        fprintf(fo,
                "/* Compressed unicode tables */\n"
                "/* Automatically generated file - do not edit */\n"
                "\n"
                "#include <stdint.h>\n"
                "\n");
        dump_case_conv_table(fo);
        compute_internal_props();
        build_flags_tables(fo);
        fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
        build_cc_table(fo);
        build_decompose_table(fo);
        build_general_category_table(fo);
        build_script_table(fo);
        build_script_ext_table(fo);
        build_prop_list_table(fo);
        fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");

        /* a truncated table file must not be reported as a success:
           check the error indicator of the stream and the final flush
           done by fclose() */
        write_failed = ferror(fo);
        if (fclose(fo) != 0)
            write_failed = 1;
        if (write_failed) {
            fprintf(stderr, "%s: write error\n", outfilename);
            exit(1);
        }
    }
    return 0;
}