1. More documentation is needed.
2. Some interfaces may change in the future.
----------------------- excerpts from the sources
$self->{Url_Last} = $url;
$self->{Url_Base} = $res->base->as_string;
$self->{Url_Code} = $res->code;
$self->{Url_Title} = $res->headers->title;
$self->{Url_Content_Type} = $res->headers->content_type;
$self->{Url_Last_Modified} = $res->headers->last_modified;
$self->{Url_Last_Modified_Iso} = HTTP::Date::time2iso($res->headers->last_modified);
$str="<Test>\nThis is a Test\r\n;<!-- Abc\r \n-->";
is($bot->get_url_textok($str), "<ln> <Test><ln>This is a Test<ln>;<!-- Abc--><ln>", "function get_url_text_ok");
Before <!---
Test of mark <font>FONTS</font>
---> Inner
<script src="Test">display();</script>
Another Space
is($bot->parse_empty($str), "Before Inner<ln><ln>Another Space<ln>", "function parse_empty");
$str="<ln> a good<ln> dog<ln> must<ln> be<ln> good.<ln>";
is($bot->parse_leadingspace($str), "<ln> a good<ln>dog<ln>must<ln>be<ln>good.<ln>", "function parse_leadingspace");
use WWW::BookBot::Chinese;
$bot=new WWW::BookBot::Chinese;
$str="A b o u t a ÖÐgreat ¹ú ÊÇ Ò» ¸ö and w o n d e r f u l i d e a.<ln>";
is($bot->parse_innerspace($str), "About a great and wonderful idea.<ln>", "function parse_innerspace");
#print Dumper($bot);
# get_url_ok: prepare received contents
# $str contents
#---RETURN contents
sub get_url_ok {
my $self = shift;
return ($self->{Url_Content_Type} eq 'text/html'
or $self->{Url_Content_Type} eq 'text/plain') ?
$self->get_url_textok(@_) : $self->get_url_binok(@_);
# get_url_textok: prepare received text contents by replacing \r\n with <ln>
# $str contents
#---RETURN contents
sub get_url_textok {
my $self = shift;
my $str=$self->de_code($_[0]);
$str=~s/(<[^<>]*)(?: *<ln>)+([^<>]*>)/$1$2/g; #prepare <> in different lines
return "<ln>$str<ln>";
# parse_img: parse images
# $base base url
# $contents html contents
sub parse_img {
my $self = shift;
my $base = shift;
my ($html, $src);
while($_[0]=~/<img([^<>]*>)/ig) {
if( $html=~/src *= *\"([^\">]*)\"/ ) {
} elsif( $html=~/src *= *\'([^\'>]*)\'/ ) {
} elsif( $html=~/src *= *([^ >]*)( |>)/ ) {
$self->parse_img_ok($self->url_rel2abs($src, $base));
# parse_img_ok: found good image url
# $url image url
sub parse_img_ok {
my $self = shift;
my ($url) = @_;
# not finished
$str=$found if eval($p1);
if( $str=~/\n +[^ \n][^\n]*\n[^ \n][^\n]*\n[^ \n][^\n]*\n/sg ) {
# "\n " -> <br>, "\n" -> no use
$str=~s/\n +(?=[^ \n])/<br> /sg;
# parse_innerspace: remove inner space
# $content contents
#---RETURN contents
sub parse_innerspace {
my $self = shift;
return $_[0] if not $self->{TEXTREMOVEINNERSPACE};
my $pattern_sub="[^ <>]($self->{Pattern_Mark})* ";
my $pattern="";
for(my $i=0; $i<7; $i++) {
return $_[0] if not $_[0]=~/$pattern/;
my $str1=$_[0];
$str1=~s/([0-9a-zA-Z]($self->{Pattern_Mark})) [0-9a-zA-Z][0-9a-zA-Z]//g;
my $str="";
foreach (split /(<ln>| {2,})/, $_[0]) {
print "($_) ";
if(/^ /) {
$str.=" ";
$_=~s/ //g if $_=~/^[^ ]($self->{Pattern_Mark})*( [^ ]($self->{Pattern_Mark})*)+$/;
return $str;
WWW::BookBot is a bot for fetching web e-texts organized into catalogs, books and chapters. It can fetch multiple HTML pages and reformat them into a single text file for reading.