#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "strip_html.h"
#ifdef _MSC_VER
#define strcasecmp(a,b) stricmp(a,b)
#endif
/* Forward declaration of the file-local helper that reports how many bytes
 * the character starting at `string` occupies; defined later in this file. */
static
int
utf8_char_width(unsigned
char
* string);
/*
 * Strip markup from the NUL-terminated string `raw` and write the remaining
 * text, NUL-terminated, to `output`.
 *
 * The parser is a character-at-a-time state machine whose state lives in
 * `stripper`, so a tag may be split across successive calls unless
 * `o_auto_reset` is set (in which case _reset() is called before returning).
 *
 * Parameters:
 *   stripper   parser state and options (o_debug, o_emit_spaces,
 *              o_emit_newlines, o_striptags, o_auto_reset).
 *   raw        input text.
 *   output     destination buffer.  No size is passed in and no bounds
 *              checking is done here; NOTE(review): callers presumably
 *              allocate at least strlen(raw) + 1 bytes -- confirm.
 *   is_utf8_p  non-zero to advance by UTF-8 character width instead of by
 *              single bytes.
 *
 * NOTE(review): a truncated tag name is terminated by writing to
 * tagname[MAX_TAGNAMELENGTH], so `tagname` needs MAX_TAGNAMELENGTH + 1
 * bytes -- confirm against strip_html.h.
 */
void
_strip_html( Stripper * stripper, char * raw, char * output, int is_utf8_p ) {
    char * p_raw = raw;
    char * raw_end = raw + strlen(raw);
    char * p_output = output;
    int width;

    if( stripper->o_debug ) {
        printf( "[DEBUG] input string: %s\n", p_raw );
    }

    while( p_raw < raw_end ) {
        /* <ctype.h> functions require a value representable as unsigned char;
         * plain char may be negative for bytes >= 0x80. */
        const unsigned char uc = (unsigned char) *p_raw;

        width = is_utf8_p ? utf8_char_width( (unsigned char *) p_raw ) : 1;
        /* A malformed or truncated sequence can claim more bytes than remain.
         * Never read, copy or advance past the terminating NUL. */
        if( width > raw_end - p_raw ) {
            width = (int) (raw_end - p_raw);
        }

        if( stripper->o_debug ) {
            printf( "[DEBUG] char:%c w%i state:%c%c%c tag:%5s last:%c%c%c%c in:%c%c%c quote:%c ",
                    *p_raw,
                    width,
                    (stripper->f_closing ? 'C' : ' '),
                    (stripper->f_in_tag ? 'T' : ' '),
                    (stripper->f_full_tagname ? 'F' : ' '),
                    stripper->tagname,
                    (stripper->f_just_seen_tag ? 'T' : ' '),
                    (stripper->f_outputted_space ? 'S' : ' '),
                    (stripper->f_lastchar_slash ? '/' : ' '),
                    (stripper->f_lastchar_minus ? '-' : ' '),
                    (stripper->f_in_decl ? 'D' : ' '),
                    (stripper->f_in_comment ? 'C' : ' '),
                    (stripper->f_in_striptag ? 'X' : ' '),
                    (stripper->f_in_quote ? stripper->quote : ' ')
                  );
        }

        if( stripper->f_in_tag ) {
            if( !stripper->f_full_tagname && !stripper->f_in_decl ) {
                /* Still collecting the tag name. */
                if( stripper->p_tagname == stripper->tagname && *p_raw == '!' ) {
                    /* "<!" starts a declaration or comment. */
                    stripper->f_in_decl = 1;
                }
                else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) {
                    /* "</" starts a closing tag. */
                    stripper->f_closing = 1;
                }
                else if( !stripper->f_closing && stripper->f_in_striptag &&
                         stripper->p_tagname == stripper->tagname && *p_raw != '/' ) {
                    /* Opening tag inside a stripped block: leave tag mode at
                     * once; the content is being discarded anyway. */
                    stripper->f_in_tag = 0;
                    stripper->f_closing = 0;
                }
                else if( !stripper->f_in_striptag || stripper->f_closing ) {
                    if( ( !isspace( uc ) && *p_raw != '/' && *p_raw != '>' ) &&
                        !( (stripper->p_tagname - stripper->tagname) == MAX_TAGNAMELENGTH ) ) {
                        /* Another character of the tag name. */
                        *stripper->p_tagname++ = *p_raw;
                    }
                    else {
                        /* Name ended (delimiter seen or length limit hit). */
                        *stripper->p_tagname = 0;
                        stripper->f_full_tagname = 1;

                        if( stripper->f_in_striptag && stripper->f_closing ) {
                            /* Closing tag that matches the open strip tag
                             * ends the stripped block. */
                            if( strcasecmp( stripper->tagname, stripper->striptag ) == 0 ) {
                                stripper->f_in_striptag = 0;
                            }
                        }
                        else if( !stripper->f_in_striptag && !stripper->f_closing ) {
                            /* strcasecmp() returns 0 on a match, so compare
                             * against 0: only <p> and <br> yield a newline. */
                            if( strcasecmp( stripper->tagname, "p" ) == 0 ||
                                strcasecmp( stripper->tagname, "br" ) == 0 ) {
                                if( stripper->o_emit_newlines ) {
                                    if( stripper->o_debug ) {
                                        printf( "NEWLINE " );
                                    }
                                    *p_output++ = '\n';
                                    stripper->f_outputted_space = 1;
                                }
                            }
                            /* Opening tag listed in o_striptags starts a
                             * stripped block; remember which tag opened it. */
                            for( int i = 0; i < stripper->numstriptags; i++ ) {
                                if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) {
                                    stripper->f_in_striptag = 1;
                                    strcpy( stripper->striptag, stripper->tagname );
                                    break;
                                }
                            }
                        }
                        check_end( stripper, *p_raw );
                    }
                }
            }
            else {
                /* Tag name complete, or inside a "<!" declaration. */
                if( stripper->f_in_quote ) {
                    /* Only the matching quote character ends a quoted run. */
                    if( *p_raw == stripper->quote ) {
                        stripper->quote = 0;
                        stripper->f_in_quote = 0;
                    }
                }
                else if( !stripper->f_in_comment &&
                         ( *p_raw == '\'' || *p_raw == '\"' ) ) {
                    stripper->f_in_quote = 1;
                    stripper->quote = *p_raw;
                    stripper->f_lastchar_minus = 0;
                    stripper->f_lastchar_slash = 0;
                }
                else if( stripper->f_in_decl ) {
                    /* Two consecutive '-' toggle comment state. */
                    if( stripper->f_lastchar_minus ) {
                        if( *p_raw == '-' ) {
                            stripper->f_in_comment = !stripper->f_in_comment;
                        }
                        stripper->f_lastchar_minus = 0;
                    }
                    else if( *p_raw == '-' ) {
                        stripper->f_lastchar_minus = 1;
                    }
                    /* '>' inside a comment does not end the declaration. */
                    if( !stripper->f_in_comment ) {
                        check_end( stripper, *p_raw );
                    }
                }
                else {
                    check_end( stripper, *p_raw );
                }
            }
        }
        else if( *p_raw == '<' ) {
            /* Start of a new tag: reset the per-tag state. */
            stripper->f_in_tag = 1;
            stripper->tagname[0] = 0;
            stripper->p_tagname = stripper->tagname;
            stripper->f_full_tagname = 0;
            stripper->f_closing = 0;
            stripper->f_just_seen_tag = 1;
        }
        else if( !stripper->f_in_striptag ) {
            /* Ordinary text outside any stripped block: copy it through. */
            if( stripper->o_emit_spaces ) {
                /* Separate text that was divided only by a tag. */
                if( !isspace( uc ) &&
                    !stripper->f_outputted_space &&
                    stripper->f_just_seen_tag ) {
                    if( stripper->o_debug ) {
                        printf( "SPACE " );
                    }
                    *p_output++ = ' ';
                    stripper->f_outputted_space = 1;
                }
            }
            /* width is clamped to the input, so this copies exactly the
             * bytes of the current character and never pads with NULs. */
            memcpy( p_output, p_raw, (size_t) width );
            if( stripper->o_debug ) {
                printf( "CHAR %c", *p_raw );
            }
            p_output += width;
            stripper->f_just_seen_tag = 0;
            stripper->f_outputted_space = isspace( uc ) ? 1 : 0;
        }

        p_raw += width;
        if( stripper->o_debug ) {
            printf( "\n" );
        }
    }

    *p_output = 0;

    if( stripper->o_auto_reset ) {
        _reset( stripper );
    }
}
/*
 * Return how many bytes the character starting at `string` occupies,
 * judged from its first byte only:
 *
 *   0xxxxxxx  -> 1
 *   110xxxxx  -> 2
 *   1110xxxx  -> 3
 *   11110xxx  -> 4
 *   10xxxxxx  -> a stray continuation byte; returns the length of the run of
 *                consecutive continuation bytes starting here, so the caller
 *                skips the whole malformed run in one step
 *   otherwise -> prints a warning to stdout and returns 1
 *
 * Continuation bytes after a 2/3/4-byte lead are not validated, so the
 * returned width may exceed the bytes actually remaining in the string.
 * `string` must point at a readable byte of a NUL-terminated buffer.
 */
static int
utf8_char_width(unsigned char * string) {
    const unsigned char lead = *string;

    if ((lead & 0x80) == 0) {
        return 1;
    }
    if ((lead & 0xC0) == 0x80) {
        /* The first byte is already counted; begin scanning at the next one
         * so the run length is not overstated by one.  The terminating NUL
         * is not a continuation byte, so the scan stops inside the string. */
        int width = 1;
        const unsigned char *p = string + 1;
        while ((*p & 0xC0) == 0x80) {
            width++;
            p++;
        }
        return width;
    }
    if ((lead & 0xE0) == 0xC0) {
        return 2;
    }
    if ((lead & 0xF0) == 0xE0) {
        return 3;
    }
    if ((lead & 0xF8) == 0xF0) {
        return 4;
    }

    printf( "[WARN] invalid utf8 char ord=%i\n", lead );
    return 1;
}
/*
 * Put the parser back into its initial "outside any tag" state.
 *
 * Every parsing flag is cleared except f_outputted_space, which is set so
 * that no separating space is emitted before the first text. The tagname
 * buffer is zero-filled. Options (o_*) and the strip-tag list are untouched.
 */
void
_reset( Stripper * stripper ) {
    /* Wipe the collected tag name. */
    memset( stripper->tagname, 0, sizeof stripper->tagname );

    /* Tag / declaration / comment / quote tracking. */
    stripper->f_in_striptag = 0;
    stripper->f_in_comment = 0;
    stripper->f_in_decl = 0;
    stripper->f_in_quote = 0;
    stripper->f_in_tag = 0;
    stripper->f_full_tagname = 0;
    stripper->f_closing = 0;

    /* Previous-character markers. */
    stripper->f_lastchar_minus = 0;
    stripper->f_lastchar_slash = 0;

    /* Output spacing state. */
    stripper->f_just_seen_tag = 0;
    stripper->f_outputted_space = 1;
}
/*
 * Forget all registered strip tags: the first slot becomes the empty string
 * and the count drops to zero.
 */
void
clear_striptags( Stripper * stripper ) {
    stripper->numstriptags = 0;
    stripper->o_striptags[0][0] = '\0';
}
/*
 * Append `striptag` to the list of tags whose content is discarded.
 *
 * When the list already holds MAX_STRIPTAGS-1 entries nothing is added and
 * a message is written to stderr instead.
 *
 * NOTE(review): the copy is an unbounded strcpy; `striptag` must fit in the
 * destination slot, whose size is not visible from this file.
 */
void
add_striptag( Stripper * stripper, char * striptag ) {
    if( stripper->numstriptags >= MAX_STRIPTAGS-1 ) {
        fprintf( stderr, "Cannot have more than %i strip tags", MAX_STRIPTAGS );
        return;
    }

    strcpy( stripper->o_striptags[stripper->numstriptags], striptag );
    stripper->numstriptags++;
}
/*
 * Examine one character seen inside a tag and update the end-of-tag state.
 *
 *   '/'   is remembered in f_lastchar_slash (candidate for "/>").
 *   '>'   ends the tag: quote, comment, declaration, tag and closing flags
 *         are cleared.  Whitespace has the same effect while reading a
 *         closing tag inside a stripped block.
 *
 * When the tag ends right after a '/' and its name equals the current strip
 * tag, the stripped block ends too. Any character other than '/' clears
 * f_lastchar_slash.
 */
void
check_end( Stripper * stripper, char end ) {
    if( end == '/' ) {
        stripper->f_lastchar_slash = 1;
        return;
    }

    /* isspace() requires a value representable as unsigned char; a plain
     * char holding a byte >= 0x80 may be negative, which is undefined. */
    if( end == '>' ||
        ( stripper->f_in_striptag && stripper->f_closing &&
          isspace( (unsigned char) end ) ) ) {
        stripper->f_in_quote = 0;
        stripper->f_in_comment = 0;
        stripper->f_in_decl = 0;
        stripper->f_in_tag = 0;
        stripper->f_closing = 0;

        if( stripper->f_lastchar_slash &&
            strcasecmp( stripper->striptag, stripper->tagname ) == 0 ) {
            stripper->f_in_striptag = 0;
        }
    }

    stripper->f_lastchar_slash = 0;
}