—————————————# $Id: Yahoo.pm,v 2.380 2009/05/02 13:28:41 Martin Exp $
=head1 NAME
WWW::Search::Yahoo - backend for searching www.yahoo.com
=head1 SYNOPSIS
use WWW::Search;
my $oSearch = new WWW::Search('Yahoo');
my $sQuery = WWW::Search::escape_query("sushi restaurant Columbus Ohio");
$oSearch->native_query($sQuery);
while (my $oResult = $oSearch->next_result())
print $oResult->url, "\n";
=head1 DESCRIPTION
This class is a Yahoo specialization of L<WWW::Search>. It handles
making and interpreting Yahoo searches F<http://www.yahoo.com>.
This class exports no public interface; all interaction should
be done through L<WWW::Search> objects.
=head1 NOTES
The default search is: Yahoo's web-based index (not Directory).
=head1 PRIVATE METHODS
If you just want to write Perl code to search Yahoo,
you do NOT need to read any further here.
Instead, just read the L<WWW::Search> documentation.
If you want to write a subclass of this module
(e.g. create a backend for another branch of Yahoo)
then please read about the private methods here:
=cut
package
WWW::Search::Yahoo;
use
strict;
use
warnings;
use
Carp ();
use
HTML::TreeBuilder;
use
WWW::Search;
use
WWW::SearchResult;
use
URI;
use
URI::Escape;
our
$VERSION
=
do
{
my
@r
= (
q$Revision: 2.380 $
=~ /\d+/g);
sprintf
"%d."
.
"%03d"
x
$#r
,
@r
};
our
$MAINTAINER
=
'Martin Thurn <mthurn@cpan.org>'
;
# Thanks to the hard work of Gil Vidals and his team at
# positionresearch.com, we know the following: In early 2004,
# yahoo.com implemented new robot-blocking tactics that look for
# frequent requests from the same client IP. One way around these
# blocks is to slow down and randomize the timing of our requests. We
# therefore insert a random sleep before every request except the
# first one. This variable is equivalent to a "first-time" flag for
# this purpose:
$iMustPause
= 0;
=head2 gui_query
Yes, Virginia, we do try to emulate stupid-human queries.
=cut
sub
gui_query
{
my
(
$self
,
$sQuery
,
$rh
) =
@_
;
$self
->{
'_options'
} = {
'p'
=>
$sQuery
,
# 'hc' => 0,
# 'hs' => 0,
'ei'
=>
'UTF-8'
,
};
# print STDERR " + Yahoo::gui_query() is calling native_query()...\n";
$rh
->{
'search_base_path'
} =
'/bin/query'
;
return
$self
->native_query(
$sQuery
,
$rh
);
}
# gui_query
sub
_native_setup_search
{
my
(
$self
,
$native_query
,
$rhOptsArg
) =
@_
;
# print STDERR " + This is Yahoo::native_setup_search()...\n";
# print STDERR " + _options is ", $self->{'_options'}, "\n";
$self
->{
'_hits_per_page'
} = 100;
# $self->{'_hits_per_page'} = 10; # for debugging
# www.yahoo.com refuses robots.
$self
->user_agent(
'non-robot'
);
# www.yahoo.com completely changes the HTML output depending on the
# browser!
# $self->{'agent_name'} = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)';
# $self->{agent_e_mail} = 'mthurn@cpan.org';
$self
->{_next_to_retrieve} = 1;
$self
->{
'search_base_path'
} ||=
'/search'
;
if
(!
defined
(
$self
->{
'_options'
}))
{
# We do not clobber the existing _options hash, if there is one;
# e.g. if gui_search() was already called on this object
$self
->{
'_options'
} = {
'vo'
=>
$native_query
,
'h'
=>
'w'
,
# web sites
'n'
=>
$self
->{_hits_per_page},
# 'b' => $self->{_next_to_retrieve}-1,
};
}
# if
my
$rhOptions
=
$self
->{
'_options'
};
if
(
defined
(
$rhOptsArg
))
{
# Copy in new options, promoting special ones:
foreach
my
$key
(
keys
%$rhOptsArg
)
{
# print STDERR " + inspecting option $key...";
if
(WWW::Search::generic_option(
$key
))
{
# print STDERR "promote & delete\n";
$self
->{
$key
} =
$rhOptsArg
->{
$key
}
if
defined
(
$rhOptsArg
->{
$key
});
delete
$rhOptsArg
->{
$key
};
}
else
{
# print STDERR "copy\n";
$rhOptions
->{
$key
} =
$rhOptsArg
->{
$key
}
if
defined
(
$rhOptsArg
->{
$key
});
}
}
# foreach
# print STDERR " + resulting rhOptions is ", Dumper($rhOptions);
# print STDERR " + resulting rhOptsArg is ", Dumper($rhOptsArg);
}
# if
# Finally, figure out the url.
$self
->{
'_next_url'
} =
$self
->{
'search_base_url'
} .
$self
->{
'search_base_path'
} .
'?'
.
$self
->hash_to_cgi_string(
$rhOptions
);
$self
->{_debug} =
$self
->{
'search_debug'
} || 0;
$self
->{_debug} = 2
if
(
$self
->{
'search_parse_debug'
});
}
# _native_setup_search
=head2 need_to_delay
This method tells the L<WWW::Search> controller code whether we need to
pause and give the yahoo.com servers a breather.
=cut
sub
need_to_delay
{
# print STDERR " + this is Yahoo::need_to_delay()\n";
return
$iMustPause
;
}
# need_to_delay
=head2 user_agent_delay
This method tells the L<WWW::Search> controller code how many seconds we should pause.
=cut
sub
user_agent_delay
{
my
$self
=
shift
;
my
$iSecs
=
int
(30 +
rand
(30));
STDERR
" + sleeping $iSecs seconds, to make yahoo.com think we're NOT a robot...\n"
if
(0 <
$self
->{_debug});
sleep
(
$iSecs
);
}
# user_agent_delay
=head2 preprocess_results_page
Clean up the Yahoo HTML before we attempt to parse it.
=cut
sub
preprocess_results_page
{
my
$self
=
shift
;
my
$sPage
=
shift
;
if
(
$self
->{_debug} == 77)
{
# For debugging only. Print the page contents and abort.
STDERR
$sPage
;
exit
88;
}
# if
# Delete the <BASE> tag that appears BEFORE the <html> tag (because
# it causes HTML::TreeBuilder to NOT be able to parse it!)
$sPage
=~ s!<BASE\s[^>]+>!!;
return
$sPage
;
}
# preprocess_results_page
=head2 _result_list_tags
Returns a list,
which will be passed as arguments to HTML::Element::look_down()
in order to return a list of HTML::Element which contain the query results.
=cut
sub
_result_list_tags
{
return
(
_tag
=>
'div'
,
class
=>
'res'
,
);
}
# _result_list_tags
=head2 _result_list_items
Given an HTML::TreeBuilder tree,
returns a list of HTML::Element,
which contain the query results.
=cut
sub
_result_list_items
{
my
$self
=
shift
;
my
$oTree
=
shift
||
die
;
my
@ao
=
$oTree
->look_down(
$self
->_result_list_tags);
return
@ao
;
}
# _result_list_items
my
$WS
=
q{[\t\r\n\240\ ]}
;
sub
_parse_tree
{
my
$self
=
shift
;
my
$oTree
=
shift
;
STDERR
" + ::Yahoo got a tree $oTree\n"
if
(2 <=
$self
->{_debug});
# Every time we get a page from yahoo.com, we have to pause before
# fetching another.
$iMustPause
++;
my
$hits_found
= 0;
# Only try to parse the hit count if we haven't done so already:
STDERR
" + start, approx_h_c is =="
,
$self
->approximate_hit_count(),
"==\n"
if
(2 <=
$self
->{_debug});
if
(
$self
->approximate_hit_count() < 1)
{
my
$rh
=
$self
->_where_to_find_count;
my
@aoDIV
=
$oTree
->look_down(
%$rh
);
DIV_TAG:
foreach
my
$oDIV
(
@aoDIV
)
{
next
unless
ref
$oDIV
;
STDERR
" + try DIV =="
,
$oDIV
->as_HTML
if
(2 <=
$self
->{_debug});
my
$s
=
$oDIV
->as_text;
STDERR
" + TEXT ==$s==\n"
if
(2 <=
$self
->{_debug});
my
$iCount
=
$self
->_string_has_count(
$s
);
$iCount
=~
tr
!,\.!!d;
if
(0 <=
$iCount
)
{
$self
->approximate_result_count(
$iCount
);
last
DIV_TAG;
}
# if
}
# foreach DIV_TAG
}
# if
STDERR
" + found approx_h_c is =="
,
$self
->approximate_hit_count(),
"==\n"
if
(2 <=
$self
->{_debug});
my
@aoLI
=
$self
->_result_list_items(
$oTree
);
STDERR
" DDD aoLI has "
,
scalar
(
@aoLI
),
" items...\n"
if
(2 <=
$self
->{_debug});
LI_TAG:
foreach
my
$oLI
(
@aoLI
)
{
# Sanity check:
next
LI_TAG
unless
ref
(
$oLI
);
STDERR
" DDD found oLI is =="
,
$oLI
->as_HTML,
"==\n"
if
(2 <=
$self
->{_debug});
my
$oA
=
$oLI
->look_down(
_tag
=>
'a'
);
next
LI_TAG
unless
ref
(
$oA
);
STDERR
" DDD found oA is =="
,
$oA
->as_HTML,
"==\n"
if
(2 <=
$self
->{_debug});
my
$sTitle
=
$oA
->as_text ||
''
;
my
$sURL
=
$oA
->attr(
'href'
) ||
''
;
next
LI_TAG
if
(
$sURL
eq
''
);
STDERR
" + raw URL is ==$sURL==\n"
if
(2 <=
$self
->{_debug});
# Throw out various unwanted Yahoo links:
next
LI_TAG
if
(
$sURL
=~ m!\.yahoo\.com/(about|jobseeker|preferences|search)/!);
next
LI_TAG
if
(
$sURL
=~ m!//((answers|cgi|cn|de|docs|europe|help|
local
|myweb\d?|search|searchmarketing|video)\.)+yahoo\.com!);
# Strip off the yahoo.com redirect part of the URL:
$sURL
=~ s!\A.*?\*-!!;
$sURL
=~ s!\Ahttp%3A!http:!i;
STDERR
" + cooked URL is ==$sURL==\n"
if
(2 <=
$self
->{_debug});
my
$hit
= new WWW::SearchResult;
$hit
->description(
q{}
);
$self
->parse_details(
$oLI
,
$hit
);
$hit
->add_url(
$sURL
);
$sTitle
=
$self
->strip(
$sTitle
);
$hit
->title(
$sTitle
);
push
(@{
$self
->{cache}},
$hit
);
$hits_found
++;
}
# foreach LI_TAG
# Now try to find the "next page" link:
my
@aoA
=
$oTree
->look_down(
'_tag'
=>
'a'
);
NEXT_A:
foreach
my
$oA
(
reverse
@aoA
)
{
next
NEXT_A
unless
ref
(
$oA
);
my
$sAhtml
=
$oA
->as_HTML;
printf
STDERR (
" + next A ==%s==\n"
,
$sAhtml
)
if
(2 <=
$self
->{_debug});
if
(
$self
->_a_is_next_link(
$oA
))
{
# Here is an example of a raw next URL:
# http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101
# http://rds.yahoo.com/;_ylt=AutpqXFv9tv2eTXen2Mw_c1XNyoA;_ylu=X3oDMTExN2UzODg3BGNvbG8DdwRzZWMDcGFnaW5hdGlvbgR2dGlkA0RGWDJfOQ--/SIG=19e131ad9/EXP=1130207429/**http%3A%2F%2Fsearch.yahoo.com%2Fsearch%3Fn%3D100%26vo%3Dpokemon%26ei%3DUTF-8%26xargs%3D12KPjg1hVSt4GmmvmnCOObHb%255F%252Dvj0Zlpi3g5UzTYR6a9RL8nQJDqADN%255F2aP%255FdLHL9y7XrQ0JOkvqV2HOs3qODiIxkSdWH8UbKsmJS5%255FIp9DLfdaXlzsbIu0%252Djv3NcQZy8nLl2qbeONz73ZI6L5Hk57%26pstart%3D6%26b%3D101
my
$sURL
=
$oA
->attr(
'href'
);
STDERR
" + raw next URL ==$sURL==\n"
if
(2 <=
$self
->{_debug});
# Delete Yahoo-redirect portion of URL:
$sURL
=~ s!\A.+?[-*]+(?=http)!!;
STDERR
" + poached next URL ==$sURL==\n"
if
(2 <=
$self
->{_debug});
$sURL
= WWW::Search::unescape_query(
$sURL
);
$self
->{_next_url} =
$self
->absurl(
$self
->{
'_prev_url'
},
$sURL
);
STDERR
" + cooked next URL ==$self->{_next_url}==\n"
if
(2 <=
$self
->{_debug});
last
NEXT_A;
}
# if
}
# foreach NEXT_A
return
$hits_found
;
}
# _parse_tree
=head2 parse_details
Given a (portion of an) HTML::TreeBuilder tree
and a L<WWW::SearchResult> object,
parses one result out of the tree and populates the SearchResult.
=cut
sub
parse_details
{
my
$self
=
shift
;
# Required arg1 = (part of) an HTML parse tree:
my
$oLI
=
shift
;
# Required arg2 = a WWW::SearchResult object to fill in:
my
$hit
=
shift
;
my
$oDIV
=
$oLI
->look_down(
_tag
=>
'div'
,
class
=>
'abstr'
,
);
if
(
ref
(
$oDIV
))
{
my
$sDesc
=
$oDIV
->as_text;
$hit
->description(
$self
->strip(
$sDesc
));
}
# if
# Delete the useless human-readable restatement of the URL (first
# <EM> tag we come across):
my
$oEM
=
$oLI
->look_down(
_tag
=>
'em'
);
if
(
ref
(
$oEM
))
{
my
$sSize
=
''
;
$sSize
= $1
if
(
$oLI
->as_text =~ m!(\d+[kb]?)!gx);
$hit
->size(
$sSize
);
}
# if
return
;
# Delete any remaining <A> tags:
my
@aoA
=
$oLI
->look_down(
_tag
=>
'a'
);
A_TAG:
foreach
my
$oA
(
@aoA
)
{
$oA
->detach;
$oA
->
delete
;
}
# foreach A_TAG
$oDIV
=
$oLI
->look_down(
_tag
=>
'div'
);
if
(
ref
$oDIV
)
{
$oDIV
->detach;
$oDIV
->
delete
;
}
# if
my
$sDesc
=
$oLI
->as_text;
STDERR
" + raw sDesc is ==$sDesc==\n"
if
(2 <=
$self
->{_debug});
# Grab stuff off the end of the description:
STDERR
" + cooked sDesc is ==$sDesc==\n"
if
(2 <=
$self
->{_debug});
$hit
->description(
$self
->strip(
$sDesc
));
}
# parse_details
=head2 _where_to_find_count
Returns a list,
which will be passed as arguments to HTML::Element::look_down()
in order to return an HTML::Element
which contains the approximate result count.
=cut
sub
_where_to_find_count
{
my
%hash
= (
_tag
=>
'div'
,
# 'class' => 'ygbody',
id
=>
'info'
,
);
return
\
%hash
;
}
# _where_to_find_count
=head2 _string_has_count
Given a string,
returns the approximate result count
if that string contains the approximate result count.
=cut
sub
_string_has_count
{
my
$self
=
shift
;
my
$s
=
shift
;
# print STDERR " DDD Yahoo::string_has_count($s)?\n";
return
$1
if
(
$s
=~ m!\bof\s+(?:about\s+)?([,0-9]+)!i);
return
-1;
}
# _string_has_count
=head2 _a_is_next_link
Given an HTML::Element,
returns true if it seems to contain the clickable "next page" widget.
=cut
sub
_a_is_next_link
{
my
$self
=
shift
;
my
$oA
=
shift
;
return
0
if
(!
ref
$oA
);
my
$sID
=
$oA
->attr(
'id'
) ||
''
;
return
1
if
(
$sID
eq
'pg-next'
);
my
$s
=
$oA
->as_text;
STDERR
" + next A as_text ==$s==\n"
if
(2 <=
$self
->{_debug});
return
(
$s
=~ m!\A
$WS
*Next
$WS
+
>
;
$WS
*\z!i);
}
# _a_is_next_link
=head2 strip
Given a string,
strips leading and trailing whitespace off of it.
=cut
sub
strip
{
my
$self
=
shift
;
my
$s
=
&WWW::Search::strip_tags
(
shift
);
$s
=~ s!\A
$WS
+ !!x;
$s
=~ s!
$WS
+\Z!!x;
return
$s
;
}
# strip
1;
__END__
GUI search:
http://ink.yahoo.com/bin/query?p=sushi+restaurant+Columbus+Ohio&hc=0&hs=0
Advanced search:
http://search.yahoo.com/search?h=w&fr=op&va=&vp=&vo=Martin+Thurn&ve=&bbase=Search&vl=&vc=&vd=all&vt=any&vss=i&vs=&vr=&vk=
actual next link from page:
http://google.yahoo.com/bin/query?p=%22Shelagh+Fraser%22&b=21&hc=0&hs=0&xargs=
_next_url :
http://google.yahoo.com/bin/query?%0Ap=%22Shelagh+Fraser%22&b=21&hc=0&hs=0&xargs=
http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101
=head1 SEE ALSO
To make new back-ends, see L<WWW::Search>.
=head1 BUGS
Please tell the maintainer if you find any!
=head1 AUTHOR
As of 1998-02-02, C<WWW::Search::Yahoo> is maintained by Martin Thurn
(mthurn@cpan.org).
C<WWW::Search::Yahoo> was originally written by Wm. L. Scheding,
based on C<WWW::Search::AltaVista>.
=head1 LEGALESE
THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
=head1 LICENSE
Copyright (C) 1998-2009 Martin 'Kingpin' Thurn
This software is released under the same license as Perl itself.
=cut