#ifdef PERL_EXT_RE_BUILD
#include "re_top.h"
#endif
#include "EXTERN.h"
#define PERL_IN_REGEX_ENGINE
#define PERL_IN_REGCOMP_ANY
#define PERL_IN_REGCOMP_TRIE_C
#include "perl.h"
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
#else
# include "regcomp.h"
#endif
#include "invlist_inline.h"
#include "unicode_constants.h"
#include "regcomp_internal.h"
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
? (TRIE_LIST_CUR( idx ) - 1) \
: 0 )
#ifdef DEBUGGING
STATIC
void
S_dump_trie(pTHX_
const
struct
_reg_trie_data *trie, HV *widecharmap,
AV *revcharmap, U32 depth)
{
U32 state;
SV *sv=sv_newmortal();
int
colwidth= widecharmap ? 6 : 4;
U16 word;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE;
Perl_re_indentf( aTHX_
"Char : %-6s%-6s%-4s "
,
depth+1,
"Match"
,
"Base"
,
"Ofs"
);
for
( state = 0 ; state < trie->uniquecharcount ; state++ ) {
SV **
const
tmp = av_fetch_simple( revcharmap, state, 0);
if
( tmp ) {
Perl_re_printf( aTHX_
"%*s"
,
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
PERL_PV_ESCAPE_FIRSTCHAR
)
);
}
}
Perl_re_printf( aTHX_
"\n"
);
Perl_re_indentf( aTHX_
"State|-----------------------"
, depth+1);
for
( state = 0 ; state < trie->uniquecharcount ; state++ )
Perl_re_printf( aTHX_
"%.*s"
, colwidth,
"--------"
);
Perl_re_printf( aTHX_
"\n"
);
for
( state = 1 ; state < trie->statecount ; state++ ) {
const
U32 base = trie->states[ state ].trans.base;
Perl_re_indentf( aTHX_
"#%4"
UVXf
"|"
, depth+1, (UV)state);
if
( trie->states[ state ].wordnum ) {
Perl_re_printf( aTHX_
" W%4X"
, trie->states[ state ].wordnum );
}
else
{
Perl_re_printf( aTHX_
"%6s"
,
""
);
}
Perl_re_printf( aTHX_
" @%4"
UVXf
" "
, (UV)base );
if
( base ) {
U32 ofs = 0;
while
( ( base + ofs < trie->uniquecharcount ) ||
( base + ofs - trie->uniquecharcount < trie->lasttrans
&& trie->trans[ base + ofs - trie->uniquecharcount ].check
!= state))
ofs++;
Perl_re_printf( aTHX_
"+%2"
UVXf
"[ "
, (UV)ofs);
for
( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
if
( ( base + ofs >= trie->uniquecharcount )
&& ( base + ofs - trie->uniquecharcount
< trie->lasttrans )
&& trie->trans[ base + ofs
- trie->uniquecharcount ].check == state )
{
Perl_re_printf( aTHX_
"%*"
UVXf, colwidth,
(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
);
}
else
{
Perl_re_printf( aTHX_
"%*s"
, colwidth,
" ."
);
}
}
Perl_re_printf( aTHX_
"]"
);
}
Perl_re_printf( aTHX_
"\n"
);
}
Perl_re_indentf( aTHX_
"word_info N:(prev,len)="
,
depth);
for
(word=1; word <= trie->wordcount; word++) {
Perl_re_printf( aTHX_
" %d:(%d,%d)"
,
(
int
)word, (
int
)(trie->wordinfo[word].prev),
(
int
)(trie->wordinfo[word].len));
}
Perl_re_printf( aTHX_
"\n"
);
}
STATIC
void
S_dump_trie_interim_list(pTHX_
const
struct
_reg_trie_data *trie,
HV *widecharmap, AV *revcharmap, U32 next_alloc,
U32 depth)
{
U32 state;
SV *sv=sv_newmortal();
int
colwidth= widecharmap ? 6 : 4;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
Perl_re_indentf( aTHX_
"State :Word | Transition Data\n"
,
depth+1 );
Perl_re_indentf( aTHX_
"%s"
,
depth+1,
"------:-----+-----------------\n"
);
for
( state=1 ; state < next_alloc ; state ++ ) {
U16 charid;
Perl_re_indentf( aTHX_
" %4"
UVXf
" :"
,
depth+1, (UV)state );
if
( ! trie->states[ state ].wordnum ) {
Perl_re_printf( aTHX_
"%5s| "
,
""
);
}
else
{
Perl_re_printf( aTHX_
"W%4x| "
,
trie->states[ state ].wordnum
);
}
for
( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
SV **
const
tmp = av_fetch_simple( revcharmap,
TRIE_LIST_ITEM(state, charid).forid, 0);
if
( tmp ) {
Perl_re_printf( aTHX_
"%*s:%3X=%4"
UVXf
" | "
,
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
colwidth,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
| PERL_PV_ESCAPE_FIRSTCHAR
) ,
TRIE_LIST_ITEM(state, charid).forid,
(UV)TRIE_LIST_ITEM(state, charid).newstate
);
if
(!(charid % 10))
Perl_re_printf( aTHX_
"\n%*s| "
,
(
int
)((depth * 2) + 14),
""
);
}
}
Perl_re_printf( aTHX_
"\n"
);
}
}
STATIC
void
S_dump_trie_interim_table(pTHX_
const
struct
_reg_trie_data *trie,
HV *widecharmap, AV *revcharmap, U32 next_alloc,
U32 depth)
{
U32 state;
U16 charid;
SV *sv=sv_newmortal();
int
colwidth= widecharmap ? 6 : 4;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
Perl_re_indentf( aTHX_
"Char : "
, depth+1 );
for
( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
SV **
const
tmp = av_fetch_simple( revcharmap, charid, 0);
if
( tmp ) {
Perl_re_printf( aTHX_
"%*s"
,
colwidth,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
PERL_PV_ESCAPE_FIRSTCHAR
)
);
}
}
Perl_re_printf( aTHX_
"\n"
);
Perl_re_indentf( aTHX_
"State+-"
, depth+1 );
for
( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
Perl_re_printf( aTHX_
"%.*s"
, colwidth,
"--------"
);
}
Perl_re_printf( aTHX_
"\n"
);
for
( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
Perl_re_indentf( aTHX_
"%4"
UVXf
" : "
,
depth+1,
(UV)TRIE_NODENUM( state ) );
for
( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
if
(v)
Perl_re_printf( aTHX_
"%*"
UVXf, colwidth, v );
else
Perl_re_printf( aTHX_
"%*s"
, colwidth,
"."
);
}
if
( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
Perl_re_printf( aTHX_
" (%4"
UVXf
")\n"
,
(UV)trie->trans[ state ].check );
}
else
{
Perl_re_printf( aTHX_
" (%4"
UVXf
") W%4X\n"
,
(UV)trie->trans[ state ].check,
trie->states[ TRIE_NODENUM( state ) ].wordnum );
}
}
}
#endif
#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if
(UTF) { \
SV *zlopp = newSV(UTF8_MAXBYTES); \
unsigned
char
*flrbbbbb = (unsigned
char
*) SvPVX(zlopp); \
unsigned
char
*
const
kapow = uvchr_to_utf8(flrbbbbb, val); \
*kapow =
'\0'
; \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
av_push_simple(revcharmap, zlopp); \
}
else
{ \
char
ooooff = (
char
)val; \
av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \
} \
} STMT_END
#define TRIE_READ_CHAR STMT_START { \
wordlen++; \
if
( UTF ) { \
\
uvc = valid_utf8_to_uvchr( (
const
U8*) uc, &len); \
} \
else
if
(folder == PL_fold_latin1) { \
\
assert
(*uc != LATIN_SMALL_LETTER_SHARP_S); \
uvc = toLOWER_L1(*uc); \
if
(UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
len = 1; \
}
else
{ \
\
uvc = (U32)*uc; \
len = 1; \
} \
} STMT_END
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
if
( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
U32 ging = TRIE_LIST_LEN( state ) * 2; \
Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
TRIE_LIST_LEN( state ) = ging; \
} \
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
TRIE_LIST_CUR( state )++; \
} STMT_END
#define TRIE_LIST_NEW(state) STMT_START { \
Newx( trie->states[ state ].trans.list, \
4, reg_trie_trans_le ); \
TRIE_LIST_CUR( state ) = 1; \
TRIE_LIST_LEN( state ) = 4; \
} STMT_END
#define TRIE_HANDLE_WORD(state) STMT_START { \
U16 dupe= trie->states[ state ].wordnum; \
regnode *
const
noper_next = regnext( noper ); \
\
DEBUG_r({ \
\
SV* tmp; \
if
(OP(noper) != NOTHING) \
tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
else
\
tmp = newSVpvn_utf8(
""
, 0, UTF ); \
av_push_simple( trie_words, tmp ); \
}); \
\
curword++; \
trie->wordinfo[curword].prev = 0; \
trie->wordinfo[curword].len = wordlen; \
trie->wordinfo[curword].accept = state; \
\
if
( noper_next < tail ) { \
if
(!trie->jump) { \
trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof
(U16) ); \
trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof
(U16) ); \
trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof
(U16) ); \
} \
trie->jump[curword] = (U16)(noper_next - convert); \
U16 set_before_paren; \
U16 set_after_paren; \
if
(OP(cur) == BRANCH) { \
set_before_paren = ARG1a(cur); \
set_after_paren = ARG1b(cur); \
}
else
{ \
set_before_paren = ARG2a(cur); \
set_after_paren = ARG2b(cur); \
} \
trie->j_before_paren[curword] = set_before_paren; \
trie->j_after_paren[curword] = set_after_paren; \
if
(!jumper) \
jumper = noper_next; \
if
(!nextbranch) \
nextbranch= regnext(cur); \
} \
\
if
( dupe ) { \
\
\
\
trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
trie->wordinfo[dupe].prev = curword; \
}
else
{ \
\
trie->states[ state ].wordnum = curword; \
} \
} STMT_END
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
( ( base + charid >= ucharcount \
&& base + charid < ubound \
&& state == trie->trans[ base - ucharcount + charid ].check \
&& trie->trans[ base - ucharcount + charid ].next ) \
? trie->trans[ base - ucharcount + charid ].next \
: ( state==1 ? special : 0 ) \
)
STATIC
void
S_trie_bitmap_set_folded(pTHX_ RExC_state_t *pRExC_state,
reg_trie_data *trie, U8 ch,
const
U8 * folder)
{
TRIE_BITMAP_SET(trie, ch);
if
( folder )
TRIE_BITMAP_SET(trie, folder[ch]);
if
( !UTF ) {
if
(! UVCHR_IS_INVARIANT(ch)) {
U8 hi = UTF8_TWO_BYTE_HI(ch);
TRIE_BITMAP_SET(trie, hi);
}
}
}
I32
Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
regnode *first, regnode *last, regnode *tail,
U32 word_count, U32 flags, U32 depth)
{
reg_trie_data *trie;
HV *widecharmap = NULL;
AV *revcharmap = newAV();
regnode *cur;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
U32 next_alloc = 0;
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *lastbranch = NULL;
regnode *convert = NULL;
U32 *prev_states;
const
U8 * folder = NULL;
#ifdef DEBUGGING
const
U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN(
"tuaa"
));
AV *trie_words = NULL;
#else
const
U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN(
"tu"
));
STRLEN trie_charcount=0;
#endif
SV *re_trie_maxbuff;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_MAKE_TRIE;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
switch
(flags) {
case
EXACT:
case
EXACT_REQ8:
case
EXACTL:
break
;
case
EXACTFAA:
case
EXACTFUP:
case
EXACTFU:
case
EXACTFLU8: folder = PL_fold_latin1;
break
;
case
EXACTF: folder = PL_fold;
break
;
default
: Perl_croak( aTHX_
"panic! In trie construction, unknown node type %u %s"
, (unsigned) flags, REGNODE_NAME(flags) );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1,
sizeof
(reg_trie_data) );
trie->refcount = 1;
trie->startstate = 1;
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (
void
*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256,
sizeof
(U16) );
if
(flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
trie->bitmap = (
char
*) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1,
sizeof
(reg_trie_wordinfo));
DEBUG_r({
trie_words = newAV();
});
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
assert
(re_trie_maxbuff);
if
(!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
DEBUG_TRIE_COMPILE_r({
Perl_re_indentf( aTHX_
"make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n"
,
depth+1,
REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
REG_NODE_NUM(last), REG_NODE_NUM(tail), (
int
)depth);
});
if
( first == startbranch && OP( last ) != BRANCH ) {
convert = first;
}
else
{
convert = REGNODE_AFTER( first );
}
for
( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = REGNODE_AFTER( cur );
const
U8 *uc;
const
U8 *e;
int
foldlen = 0;
U32 wordlen = 0;
STRLEN minchars = 0;
STRLEN maxchars = 0;
bool
set_bit = trie->bitmap ? 1 : 0;
PERL_UNUSED_VAR(wordlen);
lastbranch = cur;
if
(OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
if
(noper_next < tail)
noper= noper_next;
}
if
( noper < tail
&& ( OP(noper) == flags
|| (flags == EXACT && OP(noper) == EXACT_REQ8)
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
}
else
{
trie->minlen= 0;
continue
;
}
if
( set_bit ) {
TRIE_BITMAP_SET(trie,*uc);
if
(OP( noper ) == EXACTFUP) {
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
for
( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
maxchars++;
if
(folder == NULL) {
minchars++;
}
else
if
(foldlen > 0) {
foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
}
else
{
minchars++;
if
(UTF) {
if
((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
foldlen -= UTF8SKIP(uc);
}
}
else
if
((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
foldlen--;
}
}
if
( uvc < 256 ) {
if
( folder ) {
U8 folded= folder[ (U8) uvc ];
if
( !trie->charmap[ folded ] ) {
trie->charmap[ folded ]=( ++trie->uniquecharcount );
TRIE_STORE_REVCHAR( folded );
}
}
if
( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
TRIE_STORE_REVCHAR( uvc );
}
if
( set_bit ) {
S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, uvc, folder);
set_bit = 0;
}
}
else
{
SV** svpp;
if
( !widecharmap )
widecharmap = newHV();
svpp = hv_fetch( widecharmap, (
char
*)&uvc,
sizeof
( UV ), 1 );
if
( !svpp )
Perl_croak( aTHX_
"error creating/fetching widecharmap entry for 0x%"
UVXf, uvc );
if
( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
TRIE_STORE_REVCHAR(uvc);
}
}
}
if
( cur == first ) {
trie->minlen = minchars;
trie->maxlen = maxchars;
}
else
if
(minchars < trie->minlen) {
trie->minlen = minchars;
}
else
if
(maxchars > trie->maxlen) {
trie->maxlen = maxchars;
}
}
trie->before_paren = OP(first) == BRANCH
? ARG1a(first)
: ARG2a(first);
trie->after_paren = OP(lastbranch) == BRANCH
? ARG1b(lastbranch)
: ARG2b(lastbranch);
DEBUG_TRIE_COMPILE_r(
Perl_re_indentf( aTHX_
"TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n"
,
depth+1,
( widecharmap ?
"UTF8"
:
"NATIVE"
), (
int
)word_count,
(
int
)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(
int
)trie->minlen, (
int
)trie->maxlen )
);
Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
prev_states[1] = 0;
if
( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
> SvIV(re_trie_maxbuff) )
{
STRLEN transcount = 1;
DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_
"Compiling trie using list compiler\n"
,
depth+1));
trie->states = (reg_trie_state *)
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
sizeof
(reg_trie_state) );
TRIE_LIST_NEW(1);
next_alloc = 2;
for
( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = REGNODE_AFTER( cur );
U32 state = 1;
U16 charid = 0;
U32 wordlen = 0;
if
(OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
if
(noper_next < tail)
noper= noper_next;
}
if
( noper < tail
&& ( OP(noper) == flags
|| (flags == EXACT && OP(noper) == EXACT_REQ8)
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const
U8 *uc= (U8*)STRING(noper);
const
U8 *e= uc + STR_LEN(noper);
for
( ; uc < e ; uc += len ) {
TRIE_READ_CHAR;
if
( uvc < 256 ) {
charid = trie->charmap[ uvc ];
}
else
{
SV**
const
svpp = hv_fetch( widecharmap,
(
char
*)&uvc,
sizeof
( UV ),
0);
if
( !svpp ) {
charid = 0;
}
else
{
charid=(U16)SvIV( *svpp );
}
}
if
( charid ) {
U16 check;
U32 newstate = 0;
charid--;
if
( !trie->states[ state ].trans.list ) {
TRIE_LIST_NEW( state );
}
for
( check = 1;
check <= TRIE_LIST_USED( state );
check++ )
{
if
( TRIE_LIST_ITEM( state, check ).forid
== charid )
{
newstate = TRIE_LIST_ITEM( state, check ).newstate;
break
;
}
}
if
( ! newstate ) {
newstate = next_alloc++;
prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
state = newstate;
}
else
{
Perl_croak( aTHX_
"panic! In trie construction, no char mapping for %"
IVdf, uvc );
}
}
}
else
{
noper= REGNODE_AFTER(cur);
}
TRIE_HANDLE_WORD(state);
}
trie->statecount = next_alloc;
trie->states = (reg_trie_state *)
PerlMemShared_realloc( trie->states,
next_alloc
*
sizeof
(reg_trie_state) );
DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
revcharmap, next_alloc,
depth+1)
);
trie->trans = (reg_trie_trans *)
PerlMemShared_calloc( transcount,
sizeof
(reg_trie_trans) );
{
U32 state;
U32 tp = 0;
U32 zp = 0;
for
( state=1 ; state < next_alloc ; state ++ ) {
U32 base=0;
if
(trie->states[state].trans.list) {
U16 minid=TRIE_LIST_ITEM( state, 1).forid;
U16 maxid=minid;
U16 idx;
for
( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
const
U16 forid = TRIE_LIST_ITEM( state, idx).forid;
if
( forid < minid ) {
minid=forid;
}
else
if
( forid > maxid ) {
maxid=forid;
}
}
if
( transcount < tp + maxid - minid + 1) {
transcount *= 2;
trie->trans = (reg_trie_trans *)
PerlMemShared_realloc( trie->trans,
transcount
*
sizeof
(reg_trie_trans) );
Zero( trie->trans + (transcount / 2),
transcount / 2,
reg_trie_trans );
}
base = trie->uniquecharcount + tp - minid;
if
( maxid == minid ) {
U32 set = 0;
for
( ; zp < tp ; zp++ ) {
if
( ! trie->trans[ zp ].next ) {
base = trie->uniquecharcount + zp - minid;
trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
1).newstate;
trie->trans[ zp ].check = state;
set = 1;
break
;
}
}
if
( !set ) {
trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
1).newstate;
trie->trans[ tp ].check = state;
tp++;
zp = tp;
}
}
else
{
for
( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
const
U32 tid = base
- trie->uniquecharcount
+ TRIE_LIST_ITEM( state, idx ).forid;
trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
idx ).newstate;
trie->trans[ tid ].check = state;
}
tp += ( maxid - minid + 1 );
}
Safefree(trie->states[ state ].trans.list);
}
trie->states[ state ].trans.base=base;
}
trie->lasttrans = tp + 1;
}
}
else
{
DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_
"Compiling trie using table compiler\n"
,
depth+1));
trie->trans = (reg_trie_trans *)
PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
* trie->uniquecharcount + 1,
sizeof
(reg_trie_trans) );
trie->states = (reg_trie_state *)
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
sizeof
(reg_trie_state) );
next_alloc = trie->uniquecharcount + 1;
for
( cur = first ; cur < last ; cur = regnext( cur ) ) {
regnode *noper = REGNODE_AFTER( cur );
U32 state = 1;
U16 charid = 0;
U32 accept_state = 0;
U32 wordlen = 0;
if
(OP(noper) == NOTHING) {
regnode *noper_next= regnext(noper);
if
(noper_next < tail)
noper= noper_next;
}
if
( noper < tail
&& ( OP(noper) == flags
|| (flags == EXACT && OP(noper) == EXACT_REQ8)
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
|| OP(noper) == EXACTFUP))))
{
const
U8 *uc= (U8*)STRING(noper);
const
U8 *e= uc + STR_LEN(noper);
for
( ; uc < e ; uc += len ) {
TRIE_READ_CHAR;
if
( uvc < 256 ) {
charid = trie->charmap[ uvc ];
}
else
{
SV*
const
*
const
svpp = hv_fetch( widecharmap,
(
char
*)&uvc,
sizeof
( UV ),
0);
charid = svpp ? (U16)SvIV(*svpp) : 0;
}
if
( charid ) {
charid--;
if
( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
prev_states[TRIE_NODENUM(next_alloc)]
= TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
}
else
{
Perl_croak( aTHX_
"panic! In trie construction, no char mapping for %"
IVdf, uvc );
}
}
}
else
{
noper= REGNODE_AFTER(cur);
}
accept_state = TRIE_NODENUM( state );
TRIE_HANDLE_WORD(accept_state);
}
DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
revcharmap,
next_alloc, depth+1));
{
const
U32 laststate = TRIE_NODENUM( next_alloc );
U32 state, charid;
U32 pos = 0, zp=0;
trie->statecount = laststate;
for
( state = 1 ; state < laststate ; state++ ) {
U8 flag = 0;
const
U32 stateidx = TRIE_NODEIDX( state );
const
U32 o_used = trie->trans[ stateidx ].check;
U32 used = trie->trans[ stateidx ].check;
trie->trans[ stateidx ].check = 0;
for
( charid = 0;
used && charid < trie->uniquecharcount;
charid++ )
{
if
( flag || trie->trans[ stateidx + charid ].next ) {
if
( trie->trans[ stateidx + charid ].next ) {
if
(o_used == 1) {
for
( ; zp < pos ; zp++ ) {
if
( ! trie->trans[ zp ].next ) {
break
;
}
}
trie->states[ state ].trans.base
= zp
+ trie->uniquecharcount
- charid ;
trie->trans[ zp ].next
= SAFE_TRIE_NODENUM( trie->trans[ stateidx
+ charid ].next );
trie->trans[ zp ].check = state;
if
( ++zp > pos ) pos = zp;
break
;
}
used--;
}
if
( !flag ) {
flag = 1;
trie->states[ state ].trans.base
= pos + trie->uniquecharcount - charid ;
}
trie->trans[ pos ].next
= SAFE_TRIE_NODENUM(
trie->trans[ stateidx + charid ].next );
trie->trans[ pos ].check = state;
pos++;
}
}
}
trie->lasttrans = pos + 1;
trie->states = (reg_trie_state *)
PerlMemShared_realloc( trie->states, laststate
*
sizeof
(reg_trie_state) );
DEBUG_TRIE_COMPILE_MORE_r(
Perl_re_indentf( aTHX_
"Alloc: %d Orig: %"
IVdf
" elements, Final:%"
IVdf
". Savings of %%%5.2f\n"
,
depth+1,
(
int
)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
+ 1 ),
(IV)next_alloc,
(IV)pos,
( ( next_alloc - pos ) * 100 ) / (
double
)next_alloc );
);
}
}
DEBUG_TRIE_COMPILE_MORE_r(
Perl_re_indentf( aTHX_
"Statecount:%"
UVxf
" Lasttrans:%"
UVxf
"\n"
,
depth+1,
(UV)trie->statecount,
(UV)trie->lasttrans)
);
trie->trans = (reg_trie_trans *)
PerlMemShared_realloc( trie->trans, trie->lasttrans
*
sizeof
(reg_trie_trans) );
{
U8 nodetype =(U8) flags;
char
*str=NULL;
#ifdef DEBUGGING
regnode *optimize = NULL;
#endif /* DEBUGGING */
assert
((!trie->jump) || !trie->jump[1] ||
(trie->jump[1] >= (
sizeof
(tregnode_TRIE)/
sizeof
(
struct
regnode))));
if
( first != startbranch || OP( last ) == BRANCH ) {
NEXT_OFF( first ) = (U16)(last - first);
}
trie->startstate= 1;
if
( trie->bitmap && !widecharmap && !trie->jump ) {
U32 state;
for
( state = 1 ; state < trie->statecount-1 ; state++ ) {
U32 ofs = 0;
I32 first_ofs = -1;
U32 count = 0;
const
U32 base = trie->states[ state ].trans.base;
if
( trie->states[state].wordnum )
count = 1;
for
( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
if
( ( base + ofs >= trie->uniquecharcount ) &&
( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
{
if
( ++count > 1 ) {
SV **tmp;
U8 *ch;
if
( state == 1 )
break
;
tmp = av_fetch_simple( revcharmap, ofs, 0);
ch = (U8*)SvPV_nolen_const( *tmp );
if
( count == 2 ) {
Zero(trie->bitmap, ANYOF_BITMAP_SIZE,
char
);
DEBUG_OPTIMISE_r(
Perl_re_indentf( aTHX_
"New Start State=%"
UVuf
" Class: ["
,
depth+1,
(UV)state));
if
(first_ofs >= 0) {
SV **
const
tmp = av_fetch_simple( revcharmap, first_ofs, 0);
const
U8 *
const
ch = (U8*)SvPV_nolen_const( *tmp );
S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_
"%s"
, (
char
*)ch)
);
}
}
S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_
"%s"
, ch));
}
first_ofs = ofs;
}
}
if
( count == 1 ) {
SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0);
STRLEN len;
char
*ch = SvPV( *tmp, len );
DEBUG_OPTIMISE_r({
SV *sv=sv_newmortal();
Perl_re_indentf( aTHX_
"Prefix State: %"
UVuf
" Ofs:%"
UVuf
" Char='%s'\n"
,
depth+1,
(UV)state, (UV)first_ofs,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
PERL_PV_ESCAPE_FIRSTCHAR
)
);
});
if
( state==1 ) {
OP( convert ) = nodetype;
str=STRING(convert);
setSTR_LEN(convert, 0);
}
assert
( ( STR_LEN(convert) + len ) < 256 );
setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
while
(len--)
*str++ = *ch++;
}
else
{
#ifdef DEBUGGING
if
(state>1)
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_
"]\n"
));
#endif
break
;
}
}
trie->prefixlen = (state-1);
if
(str) {
regnode *n = REGNODE_AFTER(convert);
assert
( n - convert <= U16_MAX );
NEXT_OFF(convert) = n - convert;
trie->startstate = state;
trie->minlen -= (state - 1);
trie->maxlen -= (state - 1);
#ifdef DEBUGGING
if
(
#ifdef PERL_EXT_RE_BUILD
1
#else
DEBUG_r_TEST
#endif
) {
U32 word = trie->wordcount;
while
(word--) {
SV **
const
tmp = av_fetch_simple( trie_words, word, 0 );
if
(tmp) {
if
( STR_LEN(convert) <= SvCUR(*tmp) )
sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
else
sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
}
}
}
#endif
if
(trie->maxlen) {
convert = n;
}
else
{
NEXT_OFF(convert) = (U16)(tail - convert);
DEBUG_r(optimize= n);
}
}
}
if
(!jumper)
jumper = last;
if
( trie->maxlen ) {
NEXT_OFF( convert ) = (U16)(tail - convert);
ARG1u_SET( convert, data_slot );
if
(trie->jump)
trie->jump[0] = (U16)(nextbranch - convert);
if
( !trie->states[trie->startstate].wordnum
&& trie->bitmap
&& ( (
char
*)jumper - (
char
*)convert) >= (
int
)
sizeof
(tregnode_TRIEC) )
{
OP( convert ) = TRIEC;
Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE,
char
);
PerlMemShared_free(trie->bitmap);
trie->bitmap= NULL;
}
else
OP( convert ) = TRIE;
FLAGS(convert) = nodetype;
DEBUG_r({
optimize = convert
+ NODE_STEP_REGNODE
+ REGNODE_ARG_LEN( OP( convert ) );
});
}
DEBUG_r(
if
(optimize) {
while
( optimize < jumper ) {
OP( optimize ) = OPTIMIZED;
optimize++;
}
});
}
{
U16 word;
U32 state;
U16 prev;
for
(word=1; word <= trie->wordcount; word++) {
prev = 0;
if
(trie->wordinfo[word].prev)
continue
;
state = trie->wordinfo[word].accept;
while
(state) {
state = prev_states[state];
if
(!state)
break
;
prev = trie->states[state].wordnum;
if
(prev)
break
;
}
trie->wordinfo[word].prev = prev;
}
Safefree(prev_states);
}
DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
RExC_rxi->data->data[ data_slot + 1 ] = (
void
*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (
void
*)trie_words;
RExC_rxi->data->data[ data_slot + 3 ] = (
void
*)revcharmap;
#else
SvREFCNT_dec_NN(revcharmap);
#endif
return
trie->jump
? MADE_JUMP_TRIE
: trie->startstate>1
? MADE_EXACT_TRIE
: MADE_TRIE;
}
regnode *
Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
{
const
U32 trie_offset = ARG1u(source);
reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
U32 *q;
const
U32 ucharcount = trie->uniquecharcount;
const
U32 numstates = trie->statecount;
const
U32 ubound = trie->lasttrans + ucharcount;
U32 q_read = 0;
U32 q_write = 0;
U32 charid;
U32 base = trie->states[ 1 ].trans.base;
U32 *fail;
reg_ac_data *aho;
const
U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN(
"T"
));
regnode *stclass;
DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
PERL_UNUSED_CONTEXT;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
if
( OP(source) == TRIE ) {
tregnode_TRIE *op = (tregnode_TRIE *)
PerlMemShared_calloc(1,
sizeof
(tregnode_TRIE));
StructCopy(source, op, tregnode_TRIE);
stclass = (regnode *)op;
}
else
{
tregnode_TRIEC *op = (tregnode_TRIEC *)
PerlMemShared_calloc(1,
sizeof
(tregnode_TRIEC));
StructCopy(source, op, tregnode_TRIEC);
stclass = (regnode *)op;
}
OP(stclass)+=2;
ARG1u_SET( stclass, data_slot );
aho = (reg_ac_data *) PerlMemShared_calloc( 1,
sizeof
(reg_ac_data) );
RExC_rxi->data->data[ data_slot ] = (
void
*)aho;
aho->trie=trie_offset;
aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates *
sizeof
(reg_trie_state) );
Copy( trie->states, aho->states, numstates, reg_trie_state );
Newx( q, numstates, U32);
aho->fail = (U32 *) PerlMemShared_calloc( numstates,
sizeof
(U32) );
aho->refcount = 1;
fail = aho->fail;
fail[ 0 ] = fail[ 1 ] = 1;
for
( charid = 0; charid < ucharcount ; charid++ ) {
const
U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
if
( newstate ) {
q[ q_write ] = newstate;
fail[ q[ q_write++ ] ]=1;
}
}
while
( q_read < q_write) {
const
U32 cur = q[ q_read++ % numstates ];
base = trie->states[ cur ].trans.base;
for
( charid = 0 ; charid < ucharcount ; charid++ ) {
const
U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
if
(ch_state) {
U32 fail_state = cur;
U32 fail_base;
do
{
fail_state = fail[ fail_state ];
fail_base = aho->states[ fail_state ].trans.base;
}
while
( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
fail[ ch_state ] = fail_state;
if
( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
{
aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
}
q[ q_write++ % numstates] = ch_state;
}
}
}
fail[ 0 ] = fail[ 1 ] = 0;
DEBUG_TRIE_COMPILE_r({
Perl_re_indentf( aTHX_
"Stclass Failtable (%"
UVuf
" states): 0"
,
depth, (UV)numstates
);
for
( q_read=1; q_read<numstates; q_read++ ) {
Perl_re_printf( aTHX_
", %"
UVuf, (UV)fail[q_read]);
}
Perl_re_printf( aTHX_
"\n"
);
});
Safefree(q);
return
stclass;
}