@ISA
=
qw(Exporter)
;
@EXPORT
=
qw(parse_html parse_htmlfile expand_entities)
;
$VERSION
=
sprintf
(
"%d.%02d"
,
q$Revision: 1.11 $
=~ /(\d+)\.(\d+)/);
sub
Version {
$VERSION
; }
$IMPLICIT_TAGS
= 1;
$IGNORE_UNKNOWN
= 1;
$IGNORE_TEXT
= 0;
for
(
qw(title base link meta isindex nextid)
) {
$isHeadElement
{
$_
} = 1;
}
for
(
qw(h1 h2 h3 h4 h5 h6
p pre address blockquote
xmp listing
a img br hr
ol ul dir menu li
dl dt dd
cite code em kbd samp strong var dfn strike
b i u tt
table tr td th caption
form input select option textarea
)
) {
$isBodyElement
{
$_
} = 1;
}
for
(
qw(wbr nobr center blink font basefont)
) {
$isBodyElement
{
$_
} = 1;
}
for
(
qw(cite code em kbd samp strong var b i u tt
a img br hr
wbr nobr center blink font basefont
table
)
) {
$isPhraseMarkup
{
$_
} = 1;
}
for
(
qw(ul ol dir menu)
) {
$isList
{
$_
} = 1;
}
for
(
qw(tr td th caption)
) {
$isTableElement
{
$_
} = 1;
}
for
(
qw(input select option textarea)
) {
$isFormElement
{
$_
} = 1;
}
sub
parse_html
{
my
$html
=
$_
[1];
$html
= new HTML::Element
'html'
unless
defined
$html
;
my
$buf
= \
$html
->{
'_buf'
};
$$buf
.=
$_
[0];
if
(
$html
->{_comment}) {
if
(
$$buf
=~ s/^.*?-->//s) {
delete
$html
->{_comment};
}
else
{
$$buf
=
''
;
}
}
$$buf
=~ s/<!--.*?-->//sg;
if
(
$$buf
=~ s/<!--.*//s) {
$html
->{_comment} = 1;
}
return
$html
unless
length
$$buf
;
my
@x
=
split
(/(<[^>]+>)/,
$$buf
);
if
(
$x
[-1] =~ m/>/) {
$$buf
=
''
;
}
elsif
(
$x
[-1] =~ s/(<.*)//s) {
$$buf
= $1;
pop
(
@x
)
unless
length
$x
[-1];
}
else
{
$$buf
=
''
;
}
for
(
@x
) {
if
(m:^</\s*(\w+)\s*>$:) {
endtag(
$html
,
lc
$1);
}
elsif
(m/^<\s*\w+/) {
starttag(
$html
,
$_
);
}
elsif
(m/^<!\s
*DOCTYPE
\b/) {
}
else
{
text(
$html
,
$_
);
}
}
$html
;
}
sub
parse_htmlfile
{
my
$file
=
shift
;
open
(F,
$file
) or
return
new HTML::Element
'html'
,
'comment'
=> $!;
my
$html
=
undef
;
my
$chunk
=
''
;
while
(
read
(F,
$chunk
, 1024)) {
$html
= parse_html(
$chunk
,
$html
);
}
close
(F);
$html
;
}
sub
starttag
{
my
$html
=
shift
;
my
$elem
=
shift
;
$elem
=~ s/^<\s*(\w+)\s*//;
my
$tag
= $1;
$elem
=~ s/>$//;
unless
(
defined
$tag
) {
warn
"Illegal start tag $_[0]"
;
}
else
{
$tag
=
lc
$tag
;
my
%attr
;
while
(
$elem
=~ s/^([^\s=]+)\s*(=\s*)?//) {
$key
= $1;
if
(
defined
$2) {
if
(
$elem
=~ s/^
"([^\"]+)"
?\s*//) {
$val
= $1;
}
elsif
(
$elem
=~ s/^
'([^\']+)'
?\s*//) {
$val
= $1;
}
elsif
(
$elem
=~ s/^(\S*)\s*//) {
$val
= $1;
}
else
{
die
"This should not happen"
;
}
HTML::Entities::decode(
$val
);
}
else
{
$val
=
$key
;
}
$attr
{
$key
} =
$val
;
}
my
$pos
=
$html
->{_pos};
$pos
=
$html
unless
defined
$pos
;
my
$ptag
=
$pos
->{_tag};
my
$e
= new HTML::Element
$tag
,
%attr
;
if
(!
$IMPLICIT_TAGS
) {
}
elsif
(
$isBodyElement
{
$tag
}) {
if
(
$pos
->isInside(
'head'
)) {
endtag(
$html
,
'head'
);
$pos
=
$html
->insertElement(
'body'
, 1);
$ptag
=
$pos
->tag;
}
elsif
(!
$pos
->isInside(
'body'
)) {
$pos
=
$html
->insertElement(
'body'
, 1);
$ptag
=
$pos
->tag;
}
if
(
$tag
eq
'p'
||
$tag
=~ /^h[1-6]/) {
endtag(
$html
, [
qw(p h1 h2 h3 h4 h5 h6 pre textarea)
],
'li'
);
}
elsif
(
$tag
=~ /^[oud]l$/) {
if
(
$ptag
=~ /^h[1-6]/) {
endtag(
$html
,
$ptag
);
$pos
=
$html
->insertElement(
'p'
, 1);
$ptag
=
'p'
;
}
}
elsif
(
$tag
eq
'li'
) {
endtag(
$html
,
'li'
,
keys
%isList
);
$ptag
=
$html
->
pos
->tag;
$pos
=
$html
->insertElement(
'ul'
, 1)
unless
$isList
{
$ptag
};
}
elsif
(
$tag
eq
'dt'
||
$tag
eq
'dd'
) {
endtag(
$html
, [
'dt'
,
'dd'
],
'dl'
);
$ptag
=
$html
->
pos
->tag;
$pos
=
$html
->insertElement(
'dl'
, 1)
unless
$ptag
eq
'dl'
;
}
elsif
(
$isFormElement
{
$tag
}) {
return
unless
$pos
->isInside(
'form'
);
if
(
$tag
eq
'option'
) {
endtag(
$html
,
'option'
);
$ptag
=
$html
->
pos
->tag;
$pos
=
$html
->insertElement(
'select'
, 1)
unless
$ptag
eq
'select'
;
}
}
elsif
(
$isTableElement
{
$tag
}) {
endtag(
$html
,
$tag
,
'table'
);
$pos
=
$html
->insertElement(
'table'
, 1)
if
!
$pos
->isInside(
'table'
);
}
elsif
(
$isPhraseMarkup
{
$tag
}) {
if
(
$ptag
eq
'body'
) {
$pos
=
$html
->insertElement(
'p'
, 1);
}
}
}
elsif
(
$isHeadElement
{
$tag
}) {
if
(
$pos
->isInside(
'body'
)) {
warn
"Header element <$tag> in body\n"
;
}
elsif
(!
$pos
->isInside(
'head'
)) {
$pos
=
$html
->insertElement(
'head'
, 1);
}
}
elsif
(
$tag
eq
'html'
) {
if
(
$ptag
eq
'html'
&&
$pos
->isEmpty()) {
for
(
keys
%attr
) {
$html
->attr(
$_
,
$attr
{
$_
});
}
return
;
}
else
{
warn
"Skipping nested html element\n"
;
return
;
}
}
elsif
(
$tag
eq
'head'
) {
if
(
$ptag
ne
'html'
&&
$pos
->isEmpty()) {
warn
"Skipping nested <head> element\n"
;
return
;
}
}
elsif
(
$tag
eq
'body'
) {
if
(
$pos
->isInside(
'head'
)) {
endtag(
$html
,
'head'
);
}
elsif
(
$ptag
ne
'html'
) {
warn
"Skipping nested <body> element\n"
;
return
;
}
}
else
{
if
(
$IGNORE_UNKNOWN
) {
warn
"Skipping $tag\n"
;
return
;
}
}
$html
->insertElement(
$e
);
}
}
sub
endtag
{
my
(
$html
,
$tag
,
@stop
) =
@_
;
my
$p
=
$html
->{_pos};
$p
=
$html
unless
defined
(
$p
);
if
(
ref
$tag
) {
PARENT:
while
(
defined
$p
) {
my
$ptag
=
$p
->{_tag};
for
(
@$tag
) {
last
PARENT
if
$ptag
eq
$_
;
}
for
(
@stop
) {
return
if
$ptag
eq
$_
;
}
$p
=
$p
->{_parent};
}
}
else
{
while
(
defined
$p
) {
my
$ptag
=
$p
->{_tag};
last
if
$ptag
eq
$tag
;
for
(
@stop
) {
return
if
$ptag
eq
$_
;
}
$p
=
$p
->{_parent};
}
}
$html
->{_pos} =
$p
->{_parent}
if
defined
$p
;
}
sub
text
{
my
$html
=
shift
;
my
$pos
=
$html
->{_pos};
$pos
=
$html
unless
defined
(
$pos
);
my
@text
=
@_
;
HTML::Entities::decode(
@text
)
unless
$IGNORE_TEXT
;
if
(
$pos
->isInside(
qw(pre xmp listing)
)) {
return
if
$IGNORE_TEXT
;
$pos
->pushContent(
@text
);
}
else
{
my
$empty
= 1;
for
(
@text
) {
$empty
= 0
if
/\S/;
}
return
if
$empty
;
my
$ptag
=
$pos
->{_tag};
if
(!
$IMPLICIT_TAGS
) {
}
elsif
(
$ptag
eq
'head'
) {
endtag(
$html
,
'head'
);
$html
->insertElement(
'body'
, 1);
$pos
=
$html
->insertElement(
$html
,
'p'
, 1);
}
elsif
(
$ptag
eq
'html'
) {
$html
->insertElement(
$html
,
'body'
, 1);
$pos
=
$html
->insertElement(
'p'
, 1);
}
elsif
(
$ptag
eq
'body'
||
$ptag
eq
'form'
) {
$pos
=
$html
->insertElement(
'p'
, 1);
}
return
if
$IGNORE_TEXT
;
for
(
@text
) {
next
if
/^\s*$/;
s/\s+/ /g;
$pos
->pushContent(
$_
);
}
}
}
1;