#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "houdini.h"
/**
* XML special characters and the entity each one is replaced with:
*
* & --> &amp;
* < --> &lt;
* > --> &gt;
* " --> &quot;
* ' --> &apos;
*/
/*
 * Replacement strings, indexed by the codes stored in XML_LOOKUP_TABLE.
 *
 * Slots 0..4 are never emitted: those codes mean "copy the byte" or
 * "start of an N-byte UTF-8 sequence" and are handled by the scanner itself.
 * Slot 5 is the substitute for anything that may not appear in the output;
 * slots 6..10 are the five predefined XML entities.
 */
static const char *LOOKUP_CODES[] = {
	"", /* reserved: use literal single character */
	"", /* unused */
	"", /* reserved: 2 character UTF-8 */
	"", /* reserved: 3 character UTF-8 */
	"", /* reserved: 4 character UTF-8 */
	"?", /* invalid UTF-8 character */
	"&quot;", /* 6: '"' */
	"&amp;",  /* 7: '&' */
	"&apos;", /* 8: '\'' */
	"&lt;",   /* 9: '<' */
	"&gt;"    /* 10: '>' */
};
/*
 * Code for a byte or sequence that must not be copied to the output; it is
 * also the LOOKUP_CODES index of the "?" substitute. Every table code at or
 * above this value makes the scanner emit a LOOKUP_CODES string.
 */
static const char CODE_INVALID = 5;
/*
 * Per-byte classification, indexed by the byte value (256 entries):
 *
 *   0      copy the byte through unchanged
 *   2..4   lead byte of a UTF-8 sequence of that many bytes
 *   5      not allowed; replaced with "?" (same value as CODE_INVALID)
 *   6..10  XML special character; the value is its LOOKUP_CODES index
 */
static const char XML_LOOKUP_TABLE[] = {
/* ASCII: 0xxxxxxx */
/* 0x00-0x1f: control characters are rejected except TAB, LF and CR */
5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
/* 0x20-0x2f: '"' (0x22) = 6, '&' (0x26) = 7, '\'' (0x27) = 8 */
0, 0, 6, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30-0x3f: '<' (0x3c) = 9, '>' (0x3e) = 10 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,10, 0,
/* 0x40-0x7f: all copied literally */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* Invalid UTF-8 char start: 10xxxxxx */
/* (a continuation byte that is not preceded by a lead byte) */
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
/* Multibyte UTF-8 */
/* 2 bytes: 110xxxxx */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 3 bytes: 1110xxxx */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 4 bytes: 11110xxx */
4, 4, 4, 4, 4, 4, 4, 4,
/* Invalid UTF-8: 11111xxx */
5, 5, 5, 5, 5, 5, 5, 5,
};
/**
 * Escape a byte string so it can be embedded in XML.
 *
 * The input is scanned as UTF-8. Runs of acceptable bytes are handed to
 * gh_buf_put() unchanged. A byte whose XML_LOOKUP_TABLE code is at or above
 * CODE_INVALID is replaced by the corresponding LOOKUP_CODES string. A
 * multibyte sequence that is truncated, malformed, overlong, a surrogate,
 * above U+FFFD (3-byte form) or above U+10FFFF (4-byte form) is replaced
 * by the CODE_INVALID string.
 *
 * When a multibyte sequence is cut short by a byte that is not a
 * continuation byte, only the bytes before it are replaced; the offending
 * byte is scanned again on its own, so a valid character (or a character
 * that itself needs escaping) following a broken sequence is not lost.
 *
 * @param ob   buffer that receives the output via gh_buf_put()/gh_buf_puts()
 * @param src  input bytes
 * @param size number of bytes readable from src
 * @return always 1
 */
int
houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size)
{
	size_t i = 0;
	unsigned char code = 0;

	gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));

	while (i < size) {
		size_t start, end;

		/* [start, end) is the run of input that can be copied verbatim */
		start = end = i;

		while (i < size) {
			unsigned int byte;

			byte = src[i++];
			code = XML_LOOKUP_TABLE[byte];

			if (!code) {
				/* single character used literally */
			} else if (code >= CODE_INVALID) {
				break; /* insert lookup code string */
			} else if (code > size - end) {
				code = CODE_INVALID; /* truncated UTF-8 character */
				break;
			} else {
				/*
				 * code is the sequence length (2..4); the shift
				 * strips the length marker bits from the lead byte.
				 */
				unsigned int chr = byte & (0xff >> code);

				while (--code) {
					/*
					 * Peek first: a byte that is not a continuation
					 * byte must stay unconsumed so the outer loop
					 * classifies it on its own.
					 */
					byte = src[i];
					if ((byte & 0xc0) != 0x80) {
						code = CODE_INVALID;
						break;
					}
					i++;
					chr = (chr << 6) + (byte & 0x3f);
				}

				/* malformed: chr is incomplete, skip the range checks */
				if (code == CODE_INVALID)
					break;

				/* reject overlong forms and disallowed code points */
				switch (i - end) {
				case 2:
					if (chr < 0x80)
						code = CODE_INVALID;
					break;
				case 3:
					if (chr < 0x800 ||
					    (chr > 0xd7ff && chr < 0xe000) ||
					    chr > 0xfffd)
						code = CODE_INVALID;
					break;
				case 4:
					if (chr < 0x10000 || chr > 0x10ffff)
						code = CODE_INVALID;
					break;
				default:
					break;
				}

				if (code == CODE_INVALID)
					break;
			}

			end = i;
		}

		if (end > start)
			gh_buf_put(ob, src + start, end - start);

		/* the inner loop ran off the end of the input: nothing to escape */
		if (end >= size)
			break;

		/* escaping: bytes [end, i) are replaced by the lookup string */
		gh_buf_puts(ob, LOOKUP_CODES[code]);
	}

	return 1;
}