sub
extract_nav_link{
my
(
$class
,
$seed
,
$is_utf
) =
@_
;
my
$content
= Fetcher->fetch(
$seed
) ;
if
(
$is_utf
){
$content
=MyUtils->u8_to_gbk(
$content
);
}
my
$tree
= HTML::TreeBuilder->new_from_content(
$content
);
my
$addr
=
q/@0.0.3/
;
my
$str
=
$tree
->address(
$addr
)->attr(
'content'
);
$tree
->
delete
;
$str
=~s/0;\sURL=//;
my
$seed_uri
= URI->new(
$seed
);
my
$index_uri
= URI->new_abs(
$str
,
$seed_uri
);
return
$index_uri
;
}
sub
extract_ak_lks{
my
$class
=
shift
;
my
$index_uri
=
shift
|| Carp->croak(
"ÐèÒªindex_uri¡£¡£¡£¡£¡£"
);
my
$cp_ak_addr
=
shift
|| Carp->croak(
"ÐèÒªµØÖ·¡£¡£¡£¡£¡£"
);
my
$is_utf
=
shift
;
my
$nav_page_content
=Fetcher->fetch(
$index_uri
) ;
if
(
$is_utf
){
$nav_page_content
=MyUtils->u8_to_gbk(
$nav_page_content
);
}
my
$tree
= HTML::TreeBuilder->new_from_content(
$nav_page_content
);
my
$cp_aks_tree
=
$tree
->address(
$cp_ak_addr
);
my
@cp_aks
;
for
(@{
$cp_aks_tree
->extract_links(
"a"
) }) {
my
(
$link
,
$element
,
$attr
,
$tag
) =
@$_
;
my
$abs_link
= URI->new_abs(
$link
,
$index_uri
);
push
@cp_aks
,
$abs_link
;
}
$tree
->
delete
;
return
@cp_aks
;
}
sub
extract_ak{
my
$class
=
shift
;
my
$ak_uri
=
shift
|| Carp->croak(
"ÐèÒªak_uri¡£¡£¡£¡£¡£"
);
my
$is_utf
=
shift
;
my
$content
=Fetcher-> fetch(
$ak_uri
);
if
(
$is_utf
){
$content
=MyUtils->u8_to_gbk(
$content
);
}
my
$tree
= HTML::TreeBuilder->new_from_content(
$content
);
my
$addr_title
=
q/@0.0.1/
;
my
$addr_content
=
q/@0.1.4.0.0.4.1.0.0/
;
my
$title
=
$tree
->address(
$addr_title
)->as_text();
my
$news
=
$tree
->address(
$addr_content
)->as_text();
$tree
->
delete
;
my
$time
=
localtime
(
time
);
my
%ak
= (
"uri"
,
$ak_uri
,
"title"
,
$title
,
"content"
,
$news
,
"ftime"
,
$time
);
return
%ak
;
}
sub
extract_links{
my
$class
=
shift
;
my
$index_uri
=
shift
|| Carp->croak(
"ÐèÒªpageÒ³Á´½Ó¡£¡£¡£¡£¡£"
);
my
$lk_addr
=
shift
;
print
"\nץȡ---------"
.
$index_uri
;
my
$c
=Fetcher->fetch(
$index_uri
);
my
$nav_page_content
=MyUtils->u8_to_gbk(
$c
)
if
$c
;
my
$tree
= HTML::TreeBuilder->new_from_content(
$nav_page_content
);
my
$cp_lks_tree
=
$tree
->address(
$lk_addr
);
my
@cp_lks
;
for
(@{
$cp_lks_tree
->extract_links(
"a"
) }) {
my
(
$link
,
$element
,
$attr
,
$tag
) =
@$_
;
my
$abs_link
= URI->new_abs(
$link
,
$index_uri
);
push
@cp_lks
,
$abs_link
;
}
$tree
->
delete
;
return
@cp_lks
;
}
sub
extract_text{
my
$class
=
shift
;
my
$ak_uri
=
shift
|| Carp->croak(
"ÐèÒªÐÂÎÅÁ´½Ó"
);
my
$content_addr
=
shift
|| Carp->croak(
"ÐèÒªÐÂÎÅÄÚÈÝλÖÃÐÅÏ¢"
);
my
$is_utf
=
shift
;
my
$content
=Fetcher-> fetch(
$ak_uri
);
if
(
$is_utf
) {
$content
=MyUtils->u8_to_gbk(
$content
)
if
$content
;
}
$content
=~s/&.
*quo
;//g;
my
$tree
= HTML::TreeBuilder->new_from_content(
$content
);
my
$stree
=
$tree
->address(
$content_addr
);
if
(
$stree
==
undef
) {
return
''
;
}
my
$news
=
$stree
->as_text() ;
$tree
->
delete
;
return
$news
||
''
;
}
sub
extract_cont_addr{
my
$class
=
shift
;
my
$tmpl_file
=
shift
;
my
@crit
=
@_
;
@crit
= (
"id"
,
"my_content"
)
unless
@_
;
my
$tree
=HTML::TreeBuilder->new_from_file(
$tmpl_file
);
if
(
wantarray
){
my
@strees
=
$tree
->look_down(
"id"
,
"my_content"
);
croak
qq"
Error:No desired content marked in the template\t$tmpl_file!
ÇëÏȱê¼ÇÄ£°åÎļþ¡£
\n
"
unless
@strees
;
my
@cont_addrs
=();
for
my
$stree
(
@strees
){
push
@cont_addrs
,
$stree
->address();
}
$tree
->
delete
;
return
@cont_addrs
;
}
else
{
my
$stree
=
$tree
->look_down(
"id"
,
"my_content"
);
croak
qq"
Error:No desired content marked in the template\t$tmpl_file!!
ÇëÏȱê¼ÇÄ£°åÎļþ¡£
\n
"
unless
$stree
;
my
$addr
=
$stree
->address();
$tree
->
delete
;
return
$addr
;
}
}
sub
extract_addrs{
my
$class
=
shift
;
my
$tmpl_file
=
shift
;
my
@crit
=
@_
;
@crit
= (
"id"
,
"my_content"
)
unless
@_
;
my
$tree
=HTML::TreeBuilder->new_from_file(
$tmpl_file
);
my
@strees
=
$tree
->look_down(
@crit
);
my
@cont_addrs
;
for
my
$stree
(
@strees
){
my
$addr
=
$stree
->address();
push
@cont_addrs
,
$addr
;
print
$stree
->address(),
"\n"
;
}
$tree
->
delete
;
if
(
wantarray
){
return
@cont_addrs
;
}
else
{
return
pop
@cont_addrs
;
}
}
1;