#!/usr/bin/perl
use strict;
use warnings;

# Extract link information from an HTML file.
#
# Usage: pass the path of the HTML file as the first command-line argument.
# The handlers defined in this file print an "A <href>" line for every
# <a href=...> start tag and a "T <text>" line when that anchor is closed.

# Load the parser class explicitly: nothing visible in this file pulls it in,
# and without it the constructor call below fails at run time.
require HTML::Parser;

# Test for defined/non-empty rather than plain truth so that a file that
# happens to be named "0" is still accepted.
my $file = shift;
die "Usage: $0 FILE\n"
    unless defined $file && length $file;

my $p = HTML::Parser->new(
    api_version => 3,
    # Every start tag initially goes to a_start_handler, which receives the
    # parser object, the tag name and the attribute hash.
    start_h     => [ \&a_start_handler, "self,tagname,attr" ],
    # Only <a> and <img> tags are of interest to the handlers.
    report_tags => [qw(a img)],
);

# parse_file() yields a false value when the file cannot be opened; $! then
# carries the reason.
$p->parse_file($file)
    or die "Can't parse '$file': $!\n";
# Start-tag handler used while the parser is outside any link.
#
# Arguments: the parser object, the tag name, and a hash ref of attributes.
#
# Start tags other than <a>, and <a> tags without an href attribute, are
# ignored. For a real link the href is printed on an "A ..." line and the
# parser is switched into "inside anchor" mode:
#   * text is accumulated into a fresh array,
#   * start tags are routed to img_handler,
#   * end tags are routed to a_end_handler.
sub a_start_handler {
    my ($parser, $name, $attrs) = @_;

    # Not a link we care about: leave the parser configuration untouched.
    if ($name ne "a" or not exists $attrs->{href}) {
        return;
    }

    print "A $attrs->{href}\n";

    # Collect the anchor's text in an array owned by the parser; it is read
    # back through handler("text") once the anchor ends.
    $parser->handler(text => [], '@{dtext}');

    # No argspec is given here, so the one configured for start events is
    # presumably kept -- confirm against the HTML::Parser documentation.
    $parser->handler(start => \&img_handler);

    $parser->handler(end => \&a_end_handler, "self,tagname");
}
# Start-tag handler used while the parser is inside an anchor.
#
# Arguments: the parser object, the tag name, and a hash ref of attributes.
#
# For an <img> tag, its alt text is appended to the array currently
# registered as the parser's "text" handler, so the image contributes to the
# anchor's text. An image with a missing or empty alt attribute contributes
# the placeholder "[IMG]". Any other tag is ignored.
sub img_handler {
    my ($self, $tag, $attr) = @_;
    return unless $tag eq "img";

    # Test for defined/non-empty rather than plain truth, so that a
    # legitimate alt text of "0" is kept instead of being replaced.
    my $alt = $attr->{alt};
    $alt = "[IMG]" unless defined $alt && length $alt;

    push @{ $self->handler("text") }, $alt;
}
# End-tag handler used while the parser is inside an anchor.
#
# Arguments: the parser object and the tag name.
#
# Joins the text fragments collected in the parser's "text" array, encodes
# the result as utf8, trims and collapses whitespace, and prints it on a
# "T ..." line. The parser is then returned to its "outside anchor" state:
# the text and end handlers are removed and start tags go to a_start_handler
# again.
sub a_end_handler {
    my ($self, $tag) = @_;

    # Load Encode here and call encode() by its full name: nothing visible in
    # this file imports it, so a bare encode() call would die with
    # "Undefined subroutine".
    require Encode;

    my $text = Encode::encode('utf8', join("", @{ $self->handler("text") }));

    # Normalise whitespace after encoding: strip both ends, then squeeze
    # every internal run down to a single space.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    print "T $text\n";

    $self->handler("text", undef);
    # No argspec is given here, so the one configured for start events is
    # presumably kept -- confirm against the HTML::Parser documentation.
    $self->handler("start", \&a_start_handler);
    $self->handler("end", undef);
}