The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

#include "xmlfast.h"
/*
 * case_wsp expands to the case labels for the four whitespace bytes this
 * parser recognises: LF (0xa), TAB (0x9), CR (0xd) and SPACE (0x20).
 *
 * Usage: `case_wsp : <statements>` inside a switch. The last label is
 * deliberately left without its colon so that the use site supplies it.
 *
 * Side effect: the LF label executes `context->line_number++` before falling
 * through to the remaining labels, so a variable named `context` with a
 * line_number member must be in scope wherever the macro is used.
 */
#define case_wsp \
case 0xa : context->line_number++; \
case 0x9 :\
case 0xd :\
case 0x20
/* Debug tracing is compiled out unless XML_DEBUG is defined to a non-zero value. */
#ifndef XML_DEBUG
#define XML_DEBUG 0
#endif
#if XML_DEBUG
/* Suffix printed after every debug message: file and line of the debug() call site. */
#define WHERESTR " at %s line %d.\n"
#define WHEREARG __FILE__, __LINE__
/* debug(fmt, ...): printf-style message to stderr, followed by the call location. */
#define debug(...) do{ fprintf(stderr, __VA_ARGS__); fprintf(stderr, WHERESTR, WHEREARG); } while(0)
#else
/* Tracing disabled: debug() expands to nothing, so its arguments are never evaluated. */
#define debug(...)
#endif
/*
 * Parser state codes.
 *
 * DOCUMENT_START .. TEXT_READ are stored in context->state by parse() and
 * are inspected when the end of the input is reached.
 * TEXT_DATA, TEXT_INITWSP and TEXT_WSP are used for the local text sub-state
 * while character data is being scanned.
 * DOCUMENT_ABORTED is the highest code and sizes the STATE name table.
 */
#define DOCUMENT_START 0
#define LT_OPEN 1
#define COMMENT_OPEN 2
#define CDATA_OPEN 3
#define PI 4
#define CONTENT_WAIT 5
#define TAG_OPEN 6
#define TAG_CLOSE 7
#define TEXT_READ 8
#define TEXT_DATA 9
#define TEXT_INITWSP 10
#define TEXT_WSP 11
#define DOCUMENT_ABORTED 12

/*
 * Printable name of every state code, indexed by the code itself.
 * The table is read-only; designated initialisers keep each name attached to
 * its code even if the numbering above is changed.
 */
static const char *const STATE[DOCUMENT_ABORTED+1] = {
	[DOCUMENT_START]   = "DOCUMENT_START",
	[LT_OPEN]          = "LT_OPEN",
	[COMMENT_OPEN]     = "COMMENT_OPEN",
	[CDATA_OPEN]       = "CDATA_OPEN",
	[PI]               = "PI",
	[CONTENT_WAIT]     = "CONTENT_WAIT",
	[TAG_OPEN]         = "TAG_OPEN",
	[TAG_CLOSE]        = "TAG_CLOSE",
	[TEXT_READ]        = "TEXT_READ",
	[TEXT_DATA]        = "TEXT_DATA",
	[TEXT_INITWSP]     = "TEXT_INITWSP",
	[TEXT_WSP]         = "TEXT_WSP",
	[DOCUMENT_ABORTED] = "DOCUMENT_ABORTED"
};
/*
 * Skip forward over whitespace.
 *
 * Starting at p, advances past every LF, TAB, CR and SPACE byte and returns
 * the address of the first byte that is not one of those (which may be the
 * terminating NUL). Each LF passed over increments context->line_number
 * (done by the case_wsp macro).
 */
static inline char * eat_wsp(parser_state * context, char *p) {
	for (;; p++) {
		switch (*p) {
			case_wsp :
				/* whitespace: step to the next byte */
				continue;
			default:
				/* NUL or any non-whitespace byte ends the run */
				return p;
		}
	}
}
/*
 * Skip backward over whitespace.
 *
 * Starting at p, steps towards lower addresses past every LF, TAB, CR and
 * SPACE byte and returns the address of the first byte that is not one of
 * those. Each LF passed over increments context->line_number (done by the
 * case_wsp macro).
 *
 * No lower bound is checked here: the caller must guarantee that a
 * non-whitespace byte exists at or before p.
 */
static inline char * eatback_wsp(parser_state * context, char *p) {
	for (;; p--) {
		switch (*p) {
			case_wsp :
				/* whitespace: step to the previous byte */
				continue;
			default:
				/* NUL or any non-whitespace byte ends the run */
				return p;
		}
	}
}
/*
 * Decode one entity reference. On entry p points at the '&'.
 *
 * Numeric references ("&#NNN;" decimal, "&#xHHH;" hexadecimal; the closing
 * ';' is optional):
 *   - a value in 1..0x2FFFF is passed to the uchar callback;
 *   - anything else (no digits, zero, too large) produces a warn callback
 *     and the raw text of the reference is passed to bytespart unchanged.
 *   While the warning is being issued a NUL is temporarily written into the
 *   input at the end of the reference, so the input must be writable.
 *
 * Named references are resolved by walking the `entities` tree one
 * character at a time:
 *   - on a match the replacement (entity, length) is passed to bytespart; a
 *     ';' directly after the name is consumed, but a matched name is also
 *     accepted without it;
 *   - otherwise the bytes scanned so far are passed to bytespart unchanged.
 *
 * Returns the position where the caller should continue scanning, or 0 when
 * the NUL terminator is encountered while stepping through a name.
 */
static inline char *parse_entity (parser_state * context, char *p) {
	char *at = p;                        /* start of the reference (the '&') */
	entityref_t *node;
	unsigned int i;

	if (*(p+1) == '#') {
		/*
		 * The value is accumulated in an unsigned long and stops growing as
		 * soon as it exceeds the accepted maximum. An arbitrarily long digit
		 * string therefore can neither overflow nor wrap back into range.
		 * NOTE(review): Unicode code points extend to 0x10FFFF; the 0x2FFFF
		 * limit is kept unchanged here - confirm whether it is intended.
		 */
		const unsigned long max_code = 0x2FFFF;
		unsigned long code = 0;
		unsigned int digit;

		p += 2;
		if (*p == 'x') {
			p++;
			for (;;) {
				if (*p >= '0' && *p <= '9')      digit = (unsigned int)(*p - '0');
				else if (*p >= 'a' && *p <= 'f') digit = (unsigned int)(*p - 'a' + 10);
				else if (*p >= 'A' && *p <= 'F') digit = (unsigned int)(*p - 'A' + 10);
				else break;
				p++;
				if (code <= max_code) code = code * 16 + digit;
			}
		}
		else {
			while (*p >= '0' && *p <= '9') {
				digit = (unsigned int)(*p++ - '0');
				if (code <= max_code) code = code * 10 + digit;
			}
		}
		if ( *p == ';' ) p++;
		if (code > 0 && code <= max_code) {
			if (context->cb.uchar) context->cb.uchar(context->ctx, (wchar_t) code);
		} else {
			if (context->cb.warn) {
				/* terminate the reference in place so it can be printed with %s */
				char back = *p;
				*p = 0;
				context->cb.warn(context->ctx,"Bad entity value %s",at);
				*p = back;
			}
			if (context->cb.bytespart) context->cb.bytespart(context->ctx, at, p - at);
		}
		return p;
	}

	node = entities;
	for (;;) {
		entityref_t *next = 0;

		if (*p == 0) return 0;
		p++;
		if (*p == ';') {
			if (node && node->entity) {
				p++;                     /* consume the terminating ';' */
				goto found;
			}
			goto unknown;
		}
		if (node) {
			for (i = 0; i < node->children; i++) {
				if (node->more[i].c == *p) {
					next = &node->more[i];
					break;
				}
			}
		}
		if (next) {
			node = next;                 /* matched one more character of the name */
			continue;
		}
		/* no longer name matches: accept what has been matched so far, if complete */
		if (node && node->entity) goto found;
		goto unknown;
	}

unknown:
	/* p is always past the '&' here, so at least one byte is passed through */
	if (context->cb.bytespart) context->cb.bytespart(context->ctx, at, p - at);
	return p;

found:
	if (context->cb.bytespart) context->cb.bytespart(context->ctx, node->entity, node->length);
	return p;
}
/*
static void print_chain (xml_node *chain, int depth) {
int i;
xml_node * node;
printf(":>> ");
for (i=0; i < depth; i++) {
node = &chain[i];
printf("%s",node->name);
if (i < depth-1 )printf(" > ");
}
printf("\n");
}
*/
/*
 * Parse the attribute list of a tag or processing instruction.
 *
 * p points just after the tag/PI name. For every attribute the attrname
 * callback receives the name; the value is delivered through bytespart
 * (pieces preceding an entity reference), parse_entity (the references
 * themselves) and finally bytes (the remaining piece, possibly empty).
 *
 * Returns the address of the '>', '?' or '/' that ends the attribute list,
 * or 0 on error (the die callback has then been invoked).
 *
 * Internal states:
 *   0 - waiting for an attribute name or the end of the list
 *   1 - reading an attribute name, up to '='
 *   2 - reading an attribute value; `wait` holds the opening quote
 *       character, or 0 while the opening quote has not been seen yet
 */
static inline char *parse_attrs(char *p, parser_state * context) {
	void * ctx = context->ctx;
	xml_callbacks * cb = &context->cb;
	char state = 0;
	char wait = 0;
	char *at = p, *end = 0;

	p = eat_wsp(context, p);
	for (;;) {
		switch(state) {
			case 0: /* waiting for attr name */
				switch(*p) {
					case 0 :
						if (cb->die) cb->die(ctx,"Document aborted");
						return 0;
					/* plain labels rather than case_wsp: eat_wsp counts the LFs itself */
					case 0xa : case 0x9 : case 0xd : case 0x20 :
						p = eat_wsp(context, p);
						break;
					case '>' :
					case '?' :
					case '/' :
						return p;
					default :
						at = p;
						end = 0;
						state = 1;
				}
				break;
			case 1: /* reading attr name */
				switch(*p) {
					case 0 :
						if (cb->die) cb->die(ctx,"Document aborted");
						return 0;
					/* plain labels rather than case_wsp: eat_wsp counts the LFs itself */
					case 0xa : case 0x9 : case 0xd : case 0x20 :
						end = p;
						p = eat_wsp(context, p);
						if (*p != '=') {
							if (cb->die) cb->die(ctx,"No = after whitespace while reading attr name");
							return 0;
						}
						/* fallthrough */
					case '=':
						if (!end) end = p;
						if (cb->attrname) cb->attrname( ctx, at, end - at );
						p = eat_wsp(context, p + 1);
						wait = 0;
						state = 2;
						break;
					default:
						p++;
				}
				break;
			case 2: /* reading attr value */
				switch(*p) {
					case 0 :
						if (cb->die) cb->die(ctx,"Document aborted");
						return 0;
					case '\'':
					case '"':
						if (!wait) {
							/* opening quote: the value starts after it */
							wait = *p;
							p++;
							at = p;
						} else if (*p == wait) {
							/* closing quote: deliver the rest of the value */
							state = 0;
							if (cb->bytes) cb->bytes( ctx, at, p - at );
							p = eat_wsp(context, p+1);
						} else {
							/* the other quote character is ordinary data inside the value */
							p++;
						}
						break;
					case '&':
						if (!wait) {
							if (cb->die) cb->die(ctx,"Not waiting for & in state 2");
							return 0;
						}
						/* flush the text collected before the reference */
						if (p > at && cb->bytespart) cb->bytespart( ctx, at, p - at );
						p = parse_entity(context, p);
						if (!p) {
							/* parse_entity reports running into the end of input with 0 */
							if (cb->die) cb->die(ctx,"Document aborted");
							return 0;
						}
						at = p;
						break;
					default:
						p++;
				}
				break;
			default:
				if (cb->warn) cb->warn(ctx, "default, state=%d, char='%c'\n",state, *p);
				return 0;
		}
	}
}
/*
 * Parse the NUL-terminated document `xml` and report its contents through
 * the callbacks in context->cb (a callback whose pointer is null is skipped):
 *   comment, cdata      - body of <!-- ... --> and <![CDATA[ ... ]]>
 *   piopen, piclose     - name of a processing instruction
 *   tagopen, tagclose   - element names; a self-closing tag triggers both
 *   attrname            - attribute names (via parse_attrs)
 *   bytes, bytespart    - character data and attribute values
 *   uchar               - numeric character references (via parse_entity)
 *   warn, die           - diagnostics
 *
 * context->line_number is reset to 1 and context->state tracks the current
 * position; the state decides which diagnostic is issued when the input
 * ends. When context->save_wsp is zero, leading and trailing whitespace of
 * every text run is dropped.
 *
 * Open/close tag balancing is intentionally not performed: names are
 * reported exactly as they appear in the input.
 *
 * The input must be writable (parse_entity temporarily modifies it).
 *
 * NOTE(review): xml_error is defined outside this file. Several branches
 * only make sense if it does not return to the point of call (for example
 * by jumping to the `fault` label); confirm against its definition.
 */
void parse (char * xml, parser_state * context) {
	void * ctx = context->ctx;
	xml_callbacks * cb = &context->cb;
	context->line_number = 1;
	/* NOTE(review): buffer is unused by the code below; it is kept because
	 * the xml_error macro may refer to locals of this function - confirm. */
	char *p, *at, *start, *end, *search, buffer[BUFFER];
	memset(&buffer,0,BUFFER);
	unsigned int state, len;
	unsigned char textstate;
	p = xml;
	context->state = DOCUMENT_START;
	next:
	while (1) {
		switch(*p) {
			case 0: goto eod;
			case '<':
				context->state = LT_OPEN;
				p++;
				switch (*p) {
					case 0: goto eod;
					case '!': /* comment, CDATA section or DOCTYPE */
						p++;
						if(*p == 0) goto eod;
						if ( strncmp( p, "--", 2 ) == 0 ) {
							context->state = COMMENT_OPEN;
							p+=2;
							search = strstr(p,"-->");
							if (search) {
								if (cb->comment) {
									cb->comment( ctx, p, search - p );
								}
								p = search + 3;
							} else xml_error("Comment node not terminated");
							context->state = CONTENT_WAIT;
							goto next;
						} else
						if ( strncmp( p, "[CDATA[", 7 ) == 0) {
							context->state = CDATA_OPEN;
							p+=7;
							search = strstr(p,"]]>");
							if (search) {
								if (cb->cdata) {
									cb->cdata( ctx, p, search - p);
								}
								p = search + 3;
							} else xml_error("Cdata node not terminated");
							context->state = CONTENT_WAIT;
							goto next;
						} else
						if ( strncmp(p, "DOCTYPE", 7 ) == 0 ) {
							/* The doctype is skipped, not reported.
							 * state: 0 = scanning, 1 = inside [ ... ], 2 = closed by '>' */
							p += 7;
							state = 0;
							while(state == 0) {
								switch(*p) {
									case 0 : xml_error("Doctype not properly terminated"); break;
									case '[': state = 1; p++; break;
									case '>': state = 2; p++; break;
									default : p++;
								}
							}
							if (state == 1) {
								/* internal subset: skip to the first ']' and expect '>' after it */
								search = strchr(p,']');
								if (search) {
									p = eat_wsp(context,search+1);
									if (*p == '>') {
										p++;
										state = 2;
									} else {
										xml_error("Doctype not properly terminated");
									}
								} else {
									xml_error("Doctype intSubset not terminated");
								}
							}
							context->state = CONTENT_WAIT;
							goto next;
						} else
						{
							xml_error("Malformed document after <!");
							goto fault;
						}
						break;
					case '?': /* processing instruction */
						/* state: 0 = reading name, 1 = attributes follow,
						 *        2 = attributes parsed, 3 = closed by "?>" */
						context->state = PI;
						state = 0;
						p++;
						at = p;
						while(state == 0) {
							switch(*p) {
								case 0 : xml_error("Processing instruction not terminated");
								/* plain labels rather than case_wsp: p stays on the whitespace
								 * and parse_attrs -> eat_wsp counts the LF, so counting it
								 * here as well would count it twice */
								case 0xa : case 0x9 : case 0xd : case 0x20 :
									if (p > at) {
										debug("PI: want attrs");
										end = p;
										state = 1;
										break;
									} else xml_error("Bad processing instruction");
								case '?':
									end = p;
									p++;
									if (*p == '>') {
										p++;
										state = 3;
									} else xml_error("Processing instruction not terminated");
									break;
								default: p++;
							}
						}
						if (cb->piopen) cb->piopen( context->ctx, at, end - at );
						if (state == 1) {
							/* keep `at`/`end` on the PI name so that piclose below
							 * receives the same name span as piopen */
							search = parse_attrs(p,context);
							if (!search) {
								xml_error("Error parsing PI attributes");
								goto fault;
							}
							p = search;
							state = 2;
						}
						debug("CB> Got pi name state=%d next='%c'\n",state,*p);
						if (state == 2) {
							if (*p == '?' && *(p+1) == '>') {
								debug("PI correctly closed\n");
								p+=2;
								state = 3;
							} else xml_error("Processing instruction not terminated");
						}
						if (state != 3)
							xml_error("Internal error: Bad state after processing instruction");
						if (cb->piclose) cb->piclose( context->ctx, at, end - at );
						context->state = CONTENT_WAIT;
						goto next;
					case '/': /* </node> */
						context->state = TAG_CLOSE;
						p++;
						at = p;
						search = strchr(p,'>');
						if (search) {
							p = search + 1;
							/* drop whitespace between the name and '>' */
							search = eatback_wsp(context, search-1)+1;
							len = search - at;
							if (len == 0 ) xml_error("Empty close tag name");
							if(cb->tagclose) cb->tagclose(ctx, at, len);
							context->state = CONTENT_WAIT;
							goto next;
						} else xml_error("Close tag not terminated");
					default: /* <node ...> */
						/* state: 0 = reading name, 1 = attributes follow,
						 *        2 = expecting '/' or '>', 3 = tag finished */
						state = 0;
						context->state = TAG_OPEN;
						debug("open tag: %.10s...",p);
						while(state < 3) {
							switch(state) {
								case 0:
									at = p;
									while(state == 0) {
										switch(*p) {
											case 0: xml_error("Unterminated node");
											/* plain labels rather than case_wsp: p stays on the
											 * whitespace and parse_attrs -> eat_wsp counts the LF */
											case 0xa : case 0x9 : case 0xd : case 0x20 :
												if (p > at) {
													state = 1;
													break;
												} else xml_error("Bad node open");
											case '/':
											case '>':
												if (p > at) {
													state = 2;
													break;
												} else xml_error("Bad node open");
											default: p++;
										}
									}
									len = p - at;
									debug("opened tag: <%.*s>", (int)(p - at), at);
									if (cb->tagopen) cb->tagopen( ctx, at, p - at );
									break;
								case 1:
									search = parse_attrs(p,context);
									if (search) {
										p = search;
										state = 2;
									} else xml_error("Error parsing node attributes");
								case 2:
									while(state == 2) {
										switch(*p) {
											case 0 : xml_error("Unterminated node");
											case 0xa : case 0x9 : case 0xd : case 0x20 :
												/* whitespace is only skipped; it must not be
												 * treated as a self-closing '/' */
												p = eat_wsp(context, p);
												break;
											case '/' : /* self-closing tag: report the close at once */
												debug("close tag now: %s -> <%.*s>", at, (int)len, at);
												if (cb->tagclose) cb->tagclose( ctx, at, len );
												p = eat_wsp(context, p+1);
												break;
											case '>' : state = 3; p++; break;
											default :
												xml_error("Bad char at the end of tag");
										}
									}
									context->state = CONTENT_WAIT;
									goto next;
							}
						}
				}
				break;
			default: /* character data */
				context->state = TEXT_READ;
				start = at = p;
				char *lastwsp = 0;          /* start of the most recent whitespace run */
				if (!context->save_wsp) {
					/* drop leading whitespace */
					p = eat_wsp(context, p);
					if (p > at) start = at = p;
				}
				textstate = TEXT_DATA;
				while (1) {
					switch(*p) {
						case 0 :
						case '<':
							/* End of the text run. When whitespace is not preserved and
							 * the run ends in whitespace, lastwsp already marks where the
							 * data ends; otherwise the data extends up to p. */
							if (context->save_wsp || textstate != TEXT_WSP) {
								lastwsp = p;
							}
							if(cb->bytes) {
								if (lastwsp > at) {
									cb->bytes(ctx, at, lastwsp - at );
								} else {
									if (p > start) cb->bytes(ctx, "", 0 ); /* explicitly terminate */
								}
							}
							context->state = CONTENT_WAIT;
							if (*p == 0) goto eod;
							goto next;
						case_wsp :
							if (textstate == TEXT_DATA) { lastwsp = p; }
							textstate = TEXT_WSP;
							p++;
							break;
						default:
							textstate = TEXT_DATA;
							if (*p == '&') {
								/* flush the text collected before the reference */
								if (p > at && cb->bytespart) cb->bytespart(ctx, at, p - at);
								p = parse_entity(context,p);
								if (p) {
									at = p;
									break;
								} else {
									goto fault;
								}
							}
							p++;
					}
				}
		}
	}
	return;
	eod:
	/* End of input: what was being parsed decides the diagnostic. */
	switch(context->state) {
		case DOCUMENT_START:
			if (cb->warn) cb->warn(ctx,"Empty document");
			return;
		case LT_OPEN:
		case COMMENT_OPEN:
		case CDATA_OPEN:
		case PI:
		case TAG_OPEN:
		case TAG_CLOSE:
			if (cb->die)
				cb->die(context->ctx,"Bad document end, state = %s",STATE[context->state]);
			break;
		case TEXT_READ:
			if (cb->warn) cb->warn(ctx,"Need to call text cb at the end of document");
			break;
		case CONTENT_WAIT:
			/* normal end of document */
			break;
		default:
			if (cb->warn) cb->warn(ctx,"Bad context->state %d at the end of document\n",context->state);
	}
	fault:
	return;
}