#include "xmlfast.h"
/*
 * case_wsp - switch labels matching the four whitespace bytes handled by
 * this parser: LF (0xa), TAB (0x9), CR (0xd) and SPACE (0x20).
 *
 * Use it as `case_wsp:` inside a switch on the current byte. The LF label
 * bumps context->line_number and then deliberately falls through to the
 * other labels, so a variable named `context` with a `line_number` member
 * must be in scope at the point of use.
 *
 * Every line but the last must end in a backslash so the whole label list
 * stays inside the macro definition.
 */
#define case_wsp \
	case 0xa: context->line_number++; /* fallthrough */ \
	case 0x9: \
	case 0xd: \
	case 0x20
/*
 * Debug tracing.
 *
 * XML_DEBUG defaults to 0 when the build does not define it. With a non-zero
 * value, debug(...) prints its printf-style arguments to stderr followed by
 * the source file and line of the call site; otherwise debug(...) expands to
 * nothing and its arguments are never evaluated.
 */
#if !defined(XML_DEBUG)
#  define XML_DEBUG 0
#endif

#if !XML_DEBUG
#  define debug(...)
#else
#  define WHERESTR " at %s line %d.\n"
#  define WHEREARG __FILE__, __LINE__
#  define debug(...) do{ fprintf(stderr, __VA_ARGS__); fprintf(stderr, WHERESTR, WHEREARG); } while(0)
#endif
/*
 * Parser states stored in context->state by parse(). TEXT_DATA and TEXT_WSP
 * are additionally used by parse() as values of its local text sub-state.
 * The values are contiguous and index the STATE name table below.
 */
#define DOCUMENT_START   0	/* initial value, nothing scanned yet */
#define LT_OPEN          1	/* a '<' has just been consumed */
#define COMMENT_OPEN     2	/* inside "<!--" */
#define CDATA_OPEN       3	/* inside "<![CDATA[" */
#define PI               4	/* inside "<?" */
#define CONTENT_WAIT     5	/* a construct was completed, waiting for more input */
#define TAG_OPEN         6	/* inside an opening tag */
#define TAG_CLOSE        7	/* inside "</" */
#define TEXT_READ        8	/* reading character data */
#define TEXT_DATA        9	/* text sub-state: last byte was not whitespace */
#define TEXT_INITWSP    10
#define TEXT_WSP        11	/* text sub-state: last byte was whitespace */
#define DOCUMENT_ABORTED 12

/*
 * Printable name for each state, used in diagnostics. The strings are
 * literals, so both the pointers and the characters are const; designated
 * initializers keep every name tied to its constant regardless of order.
 */
static const char *const STATE[DOCUMENT_ABORTED + 1] = {
	[DOCUMENT_START]   = "DOCUMENT_START",
	[LT_OPEN]          = "LT_OPEN",
	[COMMENT_OPEN]     = "COMMENT_OPEN",
	[CDATA_OPEN]       = "CDATA_OPEN",
	[PI]               = "PI",
	[CONTENT_WAIT]     = "CONTENT_WAIT",
	[TAG_OPEN]         = "TAG_OPEN",
	[TAG_CLOSE]        = "TAG_CLOSE",
	[TEXT_READ]        = "TEXT_READ",
	[TEXT_DATA]        = "TEXT_DATA",
	[TEXT_INITWSP]     = "TEXT_INITWSP",
	[TEXT_WSP]         = "TEXT_WSP",
	[DOCUMENT_ABORTED] = "DOCUMENT_ABORTED",
};
/*
 * eat_wsp - skip forward over a run of whitespace.
 *
 * context: parser state; line_number is incremented for every LF skipped
 *          (done by the case_wsp labels).
 * p:       position to start from.
 *
 * Returns the address of the first byte at or after p that is not one of
 * the case_wsp bytes; the terminating NUL also stops the scan.
 */
static inline char *eat_wsp(parser_state *context, char *p) {
	for (;; p++) {
		switch (*p) {
		case_wsp:
			continue;	/* still whitespace: look at the next byte */
		default:
			return p;	/* NUL or any other byte ends the run */
		}
	}
}
/*
 * eatback_wsp - skip backward over a run of whitespace.
 *
 * context: parser state; line_number is incremented for every LF stepped
 *          over (done by the case_wsp labels).
 * p:       position to start from; the scan moves towards lower addresses.
 *
 * Returns the address of the first byte at or before p that is not one of
 * the case_wsp bytes (a NUL byte also stops the scan). There is no lower
 * bound check: the caller must guarantee such a byte exists before p.
 */
static inline char *eatback_wsp(parser_state *context, char *p) {
	while (1) {
		switch (*p) {
		case_wsp:
			p--;
			continue;
		}
		/* Not whitespace (or NUL): this is where the run ends. */
		return p;
	}
}
/*
 * parse_entity - decode one reference starting at the '&' that p points to.
 *
 * Numeric references ("&#NNN;" decimal, "&#xHHH;" hexadecimal):
 *   - the trailing ';' is consumed when present;
 *   - a value in 1..0x2FFFF is reported through cb.uchar;
 *   - anything else triggers cb.warn (the input is NUL-terminated in place
 *     for the duration of that call and restored afterwards) and the raw
 *     text is passed through cb.bytespart.
 *   The value is accumulated in an unsigned long that stops growing once it
 *   exceeds the accepted range, so arbitrarily long digit strings cannot
 *   overflow (the previous wchar_t accumulator is signed on common ABIs).
 *
 * Named references are looked up by walking the `entities` table one
 * character at a time (each node lists its children in more[], keyed by
 * their `c` member). On a match the replacement text (entity/length) is
 * passed to cb.bytespart; otherwise the raw bytes scanned so far are passed
 * through unchanged.
 *
 * context: parser state supplying ctx and the callbacks.
 * p:       address of the '&'.
 *
 * Returns the address of the first byte not consumed, or 0 if a NUL byte is
 * reached while walking the entity table.
 */
static inline char *parse_entity(parser_state *context, char *p) {
	entityref_t *cur_ent;
	char *at = p;
	unsigned int i;

	if (*(p + 1) == '#') {
		const unsigned long max_ref = 0x2FFFF;	/* largest accepted value */
		unsigned long value = 0;

		p += 2;
		if (*p == 'x') {
			p++;
			for (;;) {
				unsigned int digit;

				if (*p >= '0' && *p <= '9')
					digit = (unsigned int)(*p - '0');
				else if (*p >= 'a' && *p <= 'f')
					digit = (unsigned int)(*p - 'a' + 10);
				else if (*p >= 'A' && *p <= 'F')
					digit = (unsigned int)(*p - 'A' + 10);
				else
					break;
				p++;
				/* Saturate: once out of range, keep consuming digits only. */
				if (value <= max_ref)
					value = value * 16 + digit;
			}
		} else {
			while (*p >= '0' && *p <= '9') {
				if (value <= max_ref)
					value = value * 10 + (unsigned int)(*p - '0');
				p++;
			}
		}
		if (*p == ';')
			p++;

		if (value > 0 && value <= max_ref) {
			if (context->cb.uchar)
				context->cb.uchar(context->ctx, (wchar_t)value);
		} else {
			if (context->cb.warn) {
				/* Temporarily terminate the reference so %s prints only it. */
				char back = *p;
				*p = 0;
				context->cb.warn(context->ctx, "Bad entity value %s", at);
				*p = back;
			}
			if (context->cb.bytespart)
				context->cb.bytespart(context->ctx, at, p - at);
		}
		return p;
	}

	cur_ent = entities;
next_ent:
	if (*p == 0)
		return 0;
	p++;
	if (*p == ';') {
		if (cur_ent && cur_ent->entity) {
			p++;
			goto ret;
		}
		goto no_ent;
	}
	/* Descend into the child keyed by the current character, if any. */
	for (i = 0; cur_ent && i < cur_ent->children; i++) {
		if (cur_ent->more[i].c == *p) {
			cur_ent = &cur_ent->more[i];
			goto next_ent;
		}
	}
	/* No child matches: accept the name read so far even without a ';'. */
	if (cur_ent && cur_ent->entity)
		goto ret;

no_ent:
	if (p == at)
		p++;
	if (context->cb.bytespart)
		context->cb.bytespart(context->ctx, at, p - at);
	return p;

ret:
	if (context->cb.bytespart)
		context->cb.bytespart(context->ctx, cur_ent->entity, cur_ent->length);
	return p;
}
/*
 * parse_attrs - scan a list of name="value" attributes.
 *
 * The scan is a three-state machine:
 *   0  between attributes: skip whitespace; '>', '?' or '/' ends the list;
 *   1  reading a name up to '=' (whitespace is allowed before the '=');
 *      the name is reported through cb->attrname;
 *   2  reading a value delimited by ' or ": text is reported through
 *      cb->bytespart for the pieces around references and cb->bytes for the
 *      final piece; '&' is handed to parse_entity().
 *
 * p:       position just after the tag or PI name.
 * context: parser state supplying ctx and the callbacks; line_number is
 *          advanced for skipped newlines.
 *
 * Returns the address of the '>', '?' or '/' that ended the list, or 0 after
 * reporting the problem through cb.die (premature NUL, missing '=', '&'
 * before the opening quote, or parse_entity() failing).
 */
static inline char *parse_attrs(char *p, parser_state *context) {
	void *ctx = context->ctx;
	xml_callbacks *cb = &context->cb;
	char state = 0;
	char wait = 0;		/* quote character that closes the current value */
	char *at = p;		/* start of the piece not yet reported */
	char *end = 0;		/* end of the attribute name */
	char *next;

	p = eat_wsp(context, p);
	for (;;) {
		switch (state) {
		case 0:
			while (state == 0) {
				switch (*p) {
				case 0:
					if (cb->die) cb->die(ctx, "Document aborted");
					return 0;
				case_wsp:
					/* The label already handled *p (and counted an LF),
					 * so resume after it to avoid counting it twice. */
					p = eat_wsp(context, p + 1);
					break;
				case '>':
				case '?':
				case '/':
					return p;
				default:
					state = 1;
				}
			}
			break;

		case 1:
			at = p;
			end = 0;
			while (state == 1) {
				switch (*p) {
				case 0:
					if (cb->die) cb->die(ctx, "Document aborted");
					return 0;
				case_wsp:
					end = p;
					/* Same as above: *p is already accounted for. */
					p = eat_wsp(context, p + 1);
					if (*p != '=') {
						if (cb->die) cb->die(ctx, "No = after whitespace while reading attr name");
						return 0;
					}
					/* fallthrough */
				case '=':
					if (!end) end = p;
					if (cb->attrname) cb->attrname(ctx, at, end - at);
					p = eat_wsp(context, p + 1);
					state = 2;
					break;
				default:
					p++;
				}
			}
			break;

		case 2:
			wait = 0;
			while (state == 2) {
				switch (*p) {
				case 0:
					if (cb->die) cb->die(ctx, "Document aborted");
					return 0;
				case '\'':
				case '"':
					if (!wait) {
						/* Opening quote: remember which one closes the value. */
						wait = *p;
						p++;
						at = p;
					} else if (*p == wait) {
						/* Closing quote: report the final piece. */
						state = 0;
						if (cb->bytes) cb->bytes(ctx, at, p - at);
						p = eat_wsp(context, p + 1);
					} else {
						/* The other quote character is ordinary value text;
						 * it must not be treated as the start of a reference. */
						p++;
					}
					break;
				case '&':
					if (!wait) {
						if (cb->die) cb->die(ctx, "Not waiting for & in state 2");
						return 0;
					}
					if (p > at && cb->bytespart) cb->bytespart(ctx, at, p - at);
					next = parse_entity(context, p);
					if (!next) {
						/* parse_entity() ran into the end of the input. */
						if (cb->die) cb->die(ctx, "Document aborted");
						return 0;
					}
					p = at = next;
					break;
				default:
					p++;
				}
			}
			break;

		default:
			if (cb->warn) cb->warn(ctx, "default, state=%d, char='%c'\n", state, *p);
			return 0;
		}
	}
	return p;
}
/*
 * parse - scan a NUL-terminated XML buffer and report each construct found
 * through the callbacks in context->cb.
 *
 * xml:     buffer to scan; scanning stops at the first NUL byte.
 * context: parser state. line_number is reset to 1, state is updated as the
 *          scan proceeds, and ctx, cb and save_wsp are read.
 *
 * Callbacks used directly here: comment, cdata, piopen, piclose, tagopen,
 * tagclose, bytes, bytespart, warn, die. Each is called only when non-NULL.
 *
 * NOTE(review): xml_error() is defined outside this file. Several branches
 * below fall through to the next case label after calling it, so the code
 * appears to assume that xml_error() transfers control out of the normal
 * flow - confirm against its definition.
 */
void parse ( char * xml, parser_state * context) {
	void * ctx = context->ctx;
	xml_callbacks * cb = &context->cb;
	context->line_number = 1;
	/* NOTE(review): buffer is zeroed below but not otherwise used in this function. */
	char *p, *at, *start, *end, *search, buffer[BUFFER];
	memset (&buffer,0,BUFFER);
	unsigned int state, len;
	unsigned char textstate;
	p = xml;
	context->state = DOCUMENT_START;
	/* Every completed construct jumps back here to dispatch on the next byte. */
	next:
	while (1) {
		switch (*p) {
			case 0:
				goto eod;
			case '<' :
				context->state = LT_OPEN;
				p++;
				/* Dispatch on the byte following '<'. */
				switch (*p) {
					case 0:
						goto eod;
					case '!' :
						p++;
						if (*p == 0) goto eod;
						if ( strncmp ( p, "--" , 2 ) == 0 ) {
							/* Comment: everything up to the next "-->" is the body. */
							context->state = COMMENT_OPEN;
							p+=2;
							search = strstr (p, "-->" );
							if (search) {
								if (cb->comment) {
									cb->comment( ctx, p, search - p );
								}
								p = search + 3;
							}
							else xml_error( "Comment node not terminated" );
							context->state = CONTENT_WAIT;
							goto next;
						}
						else if ( strncmp ( p, "[CDATA[" , 7 ) == 0) {
							/* CDATA section: everything up to the next "]]>" is the body. */
							context->state = CDATA_OPEN;
							p+=7;
							search = strstr (p, "]]>" );
							if (search) {
								if (cb->cdata) {
									cb->cdata( ctx, p, search - p);
								}
								p = search + 3;
							}
							else xml_error( "Cdata node not terminated" );
							context->state = CONTENT_WAIT;
							goto next;
						}
						else if ( strncmp (p, "DOCTYPE" , 7 ) == 0 ) {
							/* DOCTYPE is skipped without any callback.
							 * Local state: 0 = scanning, 1 = saw '[', 2 = saw the closing '>'. */
							p += 7;
							state = 0;
							while (state == 0) {
								switch (*p) {
									case 0 : xml_error( "Doctype not properly terminated" );
										break ;
									case '[' : state = 1; p++;
										break ;
									case '>' : state = 2; p++;
										break ;
									default : p++;
								}
							}
							if (state == 1) {
								/* Internal subset: skip to the first ']' and expect '>' after optional whitespace. */
								search = strchr (p, ']' );
								if (search) {
									p = eat_wsp(context,search+1);
									if (*p == '>' ) {
										p++;
										state = 2;
									}
									else {
										xml_error( "Doctype not properly terminated" );
									}
								}
								else {
									xml_error( "Doctype intSubset not terminated" );
								}
							}
							context->state = CONTENT_WAIT;
							goto next;
						}
						else {
							xml_error( "Malformed document after <!" );
							goto fault;
						}
						break ;
					case '?' :
						/* Processing instruction.
						 * Local state: 0 = reading the name, 1 = attributes follow,
						 * 2 = attributes parsed, 3 = "?>" consumed. */
						context->state = PI;
						state = 0;
						p++;
						at = p;
						while (state == 0) {
							switch (*p) {
								/* No break: reaches the whitespace case unless xml_error leaves this flow. */
								case 0 : xml_error( "Processing instruction not terminated" );
								case_wsp :
									if (p > at) {
										debug( "PI: want attrs" );
										end = p;
										state = 1;
										break ;
									}
									else xml_error( "Bad processing instruction" );
								/* No break above: reaches this case unless xml_error leaves this flow. */
								case '?' :
									end = p;
									p++;
									if (*p == '>' ) {
										p++;
										state = 3;
									}
									else xml_error( "Processing instruction not terminated" );
									break ;
								default : p++;
							}
						}
						if (cb->piopen) cb->piopen( context->ctx, at, end - at );
						if (state == 1) {
							/* at is reused to hold the position returned by parse_attrs. */
							if (!( at = parse_attrs(p,context) )) xml_error( "Error parsing PI attributes" );
							p = at;
							state = 2;
						}
						debug( "CB> Got pi name state=%d next='%c'\n" ,state,*p);
						if (state == 2) {
							if (*p == '?' && *(p+1) == '>' ) {
								debug( "PI correctly closed\n" );
								p+=2;
								state = 3;
							}
							else xml_error( "Processing instruction not terminated" );
						}
						if (state != 3) xml_error( "Internal error: Bad state after processing instruction" );
						/* NOTE(review): when attributes were parsed, at no longer points at the PI name here. */
						if (cb->piclose) cb->piclose( context->ctx, at, end - at );
						context->state = CONTENT_WAIT;
						goto next;
					case '/' :
						/* Closing tag: the name runs from here to the next '>', minus trailing whitespace. */
						context->state = TAG_CLOSE;
						p++;
						at = p;
						search = strchr (p, '>' );
						if (search) {
							p = search + 1;
							search = eatback_wsp(context, search-1)+1;
							len = search - at;
							if (len == 0 ) xml_error( "Empty close tag name" );
							if (cb->tagclose) cb->tagclose(ctx, at, len);
							context->state = CONTENT_WAIT;
							goto next;
						}
						else xml_error( "Close tag not terminated" );
					/* No break above: reaches the default case unless xml_error leaves this flow. */
					default :
						/* Opening tag.
						 * Local state: 0 = reading the name, 1 = attributes follow,
						 * 2 = expecting '/' or '>', 3 = '>' consumed. */
						state = 0;
						context->state = TAG_OPEN;
						debug( "open tag: %.10s..." ,p);
						while (state < 3) {
							switch (state) {
								case 0:
									at = p;
									while (state == 0) {
										switch (*p) {
											/* No break: reaches the whitespace case unless xml_error leaves this flow. */
											case 0: xml_error( "Unterminated node" );
											case_wsp :
												if (p > at) {
													state = 1;
													break ;
												}
												else xml_error( "Bad node open" );
											case '/' :
											case '>' :
												if (p > at) {
													state = 2;
													break ;
												}
												else xml_error( "Bad node open" );
											default : p++;
										}
									}
									/* at/len delimit the tag name; they are reused for the self-closing callback below. */
									len = p - at;
									debug( "opened tag: <%.*s>" , p - at, at);
									if (cb->tagopen) cb->tagopen( ctx, at, p - at );
									break ;
								case 1:
									if (search = parse_attrs(p,context)) {
										p = search;
										state = 2;
									}
									else xml_error( "Error parsing node attributes" );
								/* No break: case 1 continues straight into case 2. */
								case 2:
									while (state == 2) {
										switch (*p) {
											case 0 : xml_error( "Unterminated node" );
											/* NOTE(review): no break here, so whitespace continues into the '/' handling below. */
											case_wsp : p = eat_wsp(context, p);
											case '/' :
												/* Self-closing tag: report the close with the same name. */
												debug( "close tag now: %s -> <%.*s>" , at, len, at);
												if (cb->tagclose) cb->tagclose( ctx, at, len );
												p = eat_wsp(context, p+1);
												break ;
											case '>' : state = 3; p++;
												break ;
											default : xml_error( "Bad char at the end of tag" );
										}
									}
									context->state = CONTENT_WAIT;
									goto next;
							}
						}
				}
				break ;
			default :
				/* Character data up to the next '<' or the end of input.
				 * start   = first byte of the text node (after optional leading whitespace),
				 * at      = first byte not yet reported through a callback,
				 * lastwsp = where the current trailing whitespace run began. */
				context->state = TEXT_READ;
				start = at = p;
				char *lastwsp = 0;
				if (!context->save_wsp) {
					/* Leading whitespace is dropped unless save_wsp is set. */
					p = eat_wsp(context, p);
					if (p > at) start = at = p;
				}
				textstate = TEXT_DATA;
				while (1) {
					switch (*p) {
						case 0 :
						case '<' :
							if (!context->save_wsp && textstate == TEXT_WSP) {
								/* Trailing whitespace is dropped: keep lastwsp where the run began. */
							}
							else {
								lastwsp = p;
							}
							if (cb->bytes) {
								if (lastwsp > at) {
									cb->bytes(ctx, at, lastwsp - at );
								}
								else {
									/* Nothing left to report, but text was seen: send an empty final piece. */
									if (p > start) cb->bytes(ctx, "" , 0 );
								}
							}
							context->state = CONTENT_WAIT;
							if (*p == 0) goto eod;
							goto next;
						case_wsp :
							/* Remember only the first byte of a whitespace run. */
							if (textstate == TEXT_DATA) { lastwsp = p; }
							textstate = TEXT_WSP;
							p++;
							break ;
						default :
							textstate = TEXT_DATA;
							if (*p == '&' ) {
								/* Report the text before the reference, then let parse_entity report the reference. */
								if (p > at && cb->bytespart) cb->bytespart(ctx, at, p - at);
								if ( p = parse_entity(context,p) ) {
									at = p;
									break ;
								}
								else {
									goto fault;
								}
							}
							p++;
					}
				}
				/* Not reached: the loop above is only left through goto. */
				textstate = TEXT_INITWSP;
				break ;
		}
	}
	return ;
	/* End of input: report according to the state the scan was in. */
	eod:
	switch (context->state) {
		case DOCUMENT_START:
			if (context->cb.warn) context->cb.warn(ctx, "Empty document" );
			return ;
		case LT_OPEN:
		case COMMENT_OPEN:
		case CDATA_OPEN:
		case PI:
		case TAG_OPEN:
		case TAG_CLOSE:
			/* Input ended in the middle of a markup construct. */
			if (context->cb.die) context->cb.die(context->ctx, "Bad document end, state = %s" ,STATE[context->state]);
			break ;
		case TEXT_READ:
			if (context->cb.warn) context->cb.warn(ctx, "Need to call text cb at the end of document" );
			break ;
		case CONTENT_WAIT:
			break ;
		default :
			if (context->cb.warn) context->cb.warn(ctx, "Bad context->state %d at the end of document\n" ,context->state);
	}
	fault:
	return ;
}