#!/usr/bin/perl
# Extract all plain text from an HTML file
use
strict;
use
warnings;
use
Encode ();
use
HTML::Parser ();
# Per-tag nesting counters, keyed by tag name.  tag() adds +1 on a start
# tag and -1 on an end tag, so a true value means "currently inside".
my %inside;

# tag($tag, $num) -- start/end handler registered with HTML::Parser.
#   $tag : tag name passed by the parser ("tagname" argspec)
#   $num : amount added to that tag's counter (+1 for start, -1 for end)
# Besides updating the counter it writes one space to the currently
# selected output handle, presumably so that text on either side of a
# tag does not run together.
sub tag {
    my ($tag, $num) = @_;
    $inside{$tag} += $num;
    print " ";    # not for all tags (original note: a separator is not wanted everywhere)
    return;
}
# text($chunk) -- text handler registered with HTML::Parser.
#   $chunk : a piece of document text (requested with the "dtext" argspec)
# Prints the chunk, encoded as UTF-8 bytes, to the currently selected
# output handle.  Nothing is printed while a <script> or <style> element
# is open according to the %inside counters.
sub text {
    my ($chunk) = @_;
    return if $inside{script} || $inside{style};

    # Encode is loaded with an empty import list, so the function must be
    # called by its fully qualified name.
    print Encode::encode('utf8', $chunk);
    return;
}
# The HTML file to process is the first command-line argument.
my $html_file = shift @ARGV;
die "Usage: $0 <html-file>\n" unless defined $html_file;

# Build a version-3 API parser and run it over the file:
#   start / end events -> tag(), with the tag name and +1 / -1
#   text events        -> text(), with the "dtext" form of the text
# The marked_sections option is switched on.
# A false return from parse_file() is treated as "could not open the
# file", and $! is reported as the reason.
HTML::Parser->new(
    api_version => 3,
    handlers    => [
        start => [\&tag,  "tagname, '+1'"],
        end   => [\&tag,  "tagname, '-1'"],
        text  => [\&text, "dtext"],
    ],
    marked_sections => 1,
)->parse_file($html_file)
    or die "Can't open file: $!\n";