Sponsoring The Perl Toolchain Summit 2025: Help make this important event another success Learn more

# $Id: Yahoo.pm,v 2.380 2009/05/02 13:28:41 Martin Exp $
=head1 NAME
WWW::Search::Yahoo - backend for searching www.yahoo.com
=head1 SYNOPSIS
use WWW::Search;
my $oSearch = new WWW::Search('Yahoo');
my $sQuery = WWW::Search::escape_query("sushi restaurant Columbus Ohio");
$oSearch->native_query($sQuery);
while (my $oResult = $oSearch->next_result())
print $oResult->url, "\n";
=head1 DESCRIPTION
This class is a Yahoo specialization of L<WWW::Search>. It handles
making and interpreting Yahoo searches F<http://www.yahoo.com>.
This class exports no public interface; all interaction should
be done through L<WWW::Search> objects.
=head1 NOTES
The default search is: Yahoo's web-based index (not Directory).
=head1 PRIVATE METHODS
If you just want to write Perl code to search Yahoo,
you do NOT need to read any further here.
Instead, just read the L<WWW::Search> documentation.
If you want to write a subclass of this module
(e.g. create a backend for another branch of Yahoo)
then please read about the private methods here:
=cut
use strict;
use Carp ();
use Data::Dumper; # for debugging only
use URI;
use vars qw( $iMustPause );
our
$VERSION = do { my @r = (q$Revision: 2.380 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r };
our $MAINTAINER = 'Martin Thurn <mthurn@cpan.org>';
# Thanks to the hard work of Gil Vidals and his team at
# positionresearch.com, we know the following: In early 2004,
# yahoo.com implemented new robot-blocking tactics that look for
# frequent requests from the same client IP. One way around these
# blocks is to slow down and randomize the timing of our requests. We
# therefore insert a random sleep before every request except the
# first one. This variable is equivalent to a "first-time" flag for
# this purpose:
$iMustPause = 0;
=head2 gui_query
Yes, Virginia, we do try to emulate stupid-human queries.
=cut
sub gui_query
{
my ($self, $sQuery, $rh) = @_;
$self->{'_options'} = {
'p' => $sQuery,
# 'hc' => 0,
# 'hs' => 0,
'ei' => 'UTF-8',
};
# print STDERR " + Yahoo::gui_query() is calling native_query()...\n";
$rh->{'search_base_url'} = 'http://search.yahoo.com';
$rh->{'search_base_path'} = '/bin/query';
return $self->native_query($sQuery, $rh);
} # gui_query
sub _native_setup_search
{
my ($self, $native_query, $rhOptsArg) = @_;
# print STDERR " + This is Yahoo::native_setup_search()...\n";
# print STDERR " + _options is ", $self->{'_options'}, "\n";
$self->{'_hits_per_page'} = 100;
# $self->{'_hits_per_page'} = 10; # for debugging
# www.yahoo.com refuses robots.
$self->user_agent('non-robot');
# www.yahoo.com completely changes the HTML output depending on the
# browser!
# $self->{'agent_name'} = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)';
# $self->{agent_e_mail} = 'mthurn@cpan.org';
$self->{_next_to_retrieve} = 1;
$self->{'search_base_url'} ||= 'http://search.yahoo.com';
$self->{'search_base_path'} ||= '/search';
if (! defined($self->{'_options'}))
{
# We do not clobber the existing _options hash, if there is one;
# e.g. if gui_search() was already called on this object
$self->{'_options'} = {
'vo' => $native_query,
'h' => 'w', # web sites
'n' => $self->{_hits_per_page},
# 'b' => $self->{_next_to_retrieve}-1,
};
} # if
my $rhOptions = $self->{'_options'};
if (defined($rhOptsArg))
{
# Copy in new options, promoting special ones:
foreach my $key (keys %$rhOptsArg)
{
# print STDERR " + inspecting option $key...";
if (WWW::Search::generic_option($key))
{
# print STDERR "promote & delete\n";
$self->{$key} = $rhOptsArg->{$key} if defined($rhOptsArg->{$key});
delete $rhOptsArg->{$key};
}
else
{
# print STDERR "copy\n";
$rhOptions->{$key} = $rhOptsArg->{$key} if defined($rhOptsArg->{$key});
}
} # foreach
# print STDERR " + resulting rhOptions is ", Dumper($rhOptions);
# print STDERR " + resulting rhOptsArg is ", Dumper($rhOptsArg);
} # if
# Finally, figure out the url.
$self->{'_next_url'} = $self->{'search_base_url'} . $self->{'search_base_path'} .'?'. $self->hash_to_cgi_string($rhOptions);
$self->{_debug} = $self->{'search_debug'} || 0;
$self->{_debug} = 2 if ($self->{'search_parse_debug'});
} # _native_setup_search
=head2 need_to_delay
This method tells the L<WWW::Search> controller code whether we need to
pause and give the yahoo.com servers a breather.
=cut
sub need_to_delay
{
# print STDERR " + this is Yahoo::need_to_delay()\n";
return $iMustPause;
} # need_to_delay
=head2 user_agent_delay
This method tells the L<WWW::Search> controller code how many seconds we should pause.
=cut
sub user_agent_delay
{
my $self = shift;
my $iSecs = int(30 + rand(30));
print STDERR " + sleeping $iSecs seconds, to make yahoo.com think we're NOT a robot...\n" if (0 < $self->{_debug});
sleep($iSecs);
} # user_agent_delay
=head2 preprocess_results_page
Clean up the Yahoo HTML before we attempt to parse it.
=cut
sub preprocess_results_page
{
my $self = shift;
my $sPage = shift;
if ($self->{_debug} == 77)
{
# For debugging only. Print the page contents and abort.
print STDERR $sPage;
exit 88;
} # if
# Delete the <BASE> tag that appears BEFORE the <html> tag (because
# it causes HTML::TreeBuilder to NOT be able to parse it!)
$sPage =~ s!<BASE\s[^>]+>!!;
return $sPage;
} # preprocess_results_page
=head2 _result_list_tags
Returns a list,
which will be passed as arguments to HTML::Element::look_down()
in order to return a list of HTML::Element which contain the query results.
=cut
sub _result_list_tags
{
return (_tag => 'div',
class => 'res',
);
} # _result_list_tags
=head2 _result_list_items
Given an HTML::TreeBuilder tree,
returns a list of HTML::Element,
which contain the query results.
=cut
sub _result_list_items
{
my $self = shift;
my $oTree = shift || die;
my @ao = $oTree->look_down($self->_result_list_tags);
return @ao;
} # _result_list_items
my $WS = q{[\t\r\n\240\ ]};
sub _parse_tree
{
my $self = shift;
my $oTree = shift;
print STDERR " + ::Yahoo got a tree $oTree\n" if (2 <= $self->{_debug});
# Every time we get a page from yahoo.com, we have to pause before
# fetching another.
$iMustPause++;
my $hits_found = 0;
# Only try to parse the hit count if we haven't done so already:
print STDERR " + start, approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug});
if ($self->approximate_hit_count() < 1)
{
my $rh = $self->_where_to_find_count;
my @aoDIV = $oTree->look_down(%$rh);
DIV_TAG:
foreach my $oDIV (@aoDIV)
{
next unless ref $oDIV;
print STDERR " + try DIV ==", $oDIV->as_HTML if (2 <= $self->{_debug});
my $s = $oDIV->as_text;
print STDERR " + TEXT ==$s==\n" if (2 <= $self->{_debug});
my $iCount = $self->_string_has_count($s);
$iCount =~ tr!,\.!!d;
if (0 <= $iCount)
{
$self->approximate_result_count($iCount);
last DIV_TAG;
} # if
} # foreach DIV_TAG
} # if
print STDERR " + found approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug});
my @aoLI = $self->_result_list_items($oTree);
print STDERR " DDD aoLI has ", scalar(@aoLI), " items...\n" if (2 <= $self->{_debug});
LI_TAG:
foreach my $oLI (@aoLI)
{
# Sanity check:
next LI_TAG unless ref($oLI);
print STDERR " DDD found oLI is ==", $oLI->as_HTML, "==\n" if (2 <= $self->{_debug});
my $oA = $oLI->look_down(_tag => 'a');
next LI_TAG unless ref($oA);
print STDERR " DDD found oA is ==", $oA->as_HTML, "==\n" if (2 <= $self->{_debug});
my $sTitle = $oA->as_text || '';
my $sURL = $oA->attr('href') || '';
next LI_TAG if ($sURL eq '');
print STDERR " + raw URL is ==$sURL==\n" if (2 <= $self->{_debug});
# Throw out various unwanted Yahoo links:
next LI_TAG if ($sURL =~ m!\.yahoo\.com/(about|jobseeker|preferences|search)/!);
next LI_TAG if ($sURL =~ m!//((answers|cgi|cn|de|docs|europe|help|local|myweb\d?|search|searchmarketing|video)\.)+yahoo\.com!);
# Strip off the yahoo.com redirect part of the URL:
$sURL =~ s!\A.*?\*-!!;
$sURL =~ s!\Ahttp%3A!http:!i;
print STDERR " + cooked URL is ==$sURL==\n" if (2 <= $self->{_debug});
my $hit = new WWW::SearchResult;
$hit->description(q{});
$self->parse_details($oLI, $hit);
$hit->add_url($sURL);
$sTitle = $self->strip($sTitle);
$hit->title($sTitle);
push(@{$self->{cache}}, $hit);
$hits_found++;
} # foreach LI_TAG
# Now try to find the "next page" link:
my @aoA = $oTree->look_down('_tag' => 'a');
NEXT_A:
foreach my $oA (reverse @aoA)
{
next NEXT_A unless ref($oA);
my $sAhtml = $oA->as_HTML;
printf STDERR (" + next A ==%s==\n", $sAhtml) if (2 <= $self->{_debug});
if ($self->_a_is_next_link($oA))
{
# Here is an example of a raw next URL:
# http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101
# http://rds.yahoo.com/;_ylt=AutpqXFv9tv2eTXen2Mw_c1XNyoA;_ylu=X3oDMTExN2UzODg3BGNvbG8DdwRzZWMDcGFnaW5hdGlvbgR2dGlkA0RGWDJfOQ--/SIG=19e131ad9/EXP=1130207429/**http%3A%2F%2Fsearch.yahoo.com%2Fsearch%3Fn%3D100%26vo%3Dpokemon%26ei%3DUTF-8%26xargs%3D12KPjg1hVSt4GmmvmnCOObHb%255F%252Dvj0Zlpi3g5UzTYR6a9RL8nQJDqADN%255F2aP%255FdLHL9y7XrQ0JOkvqV2HOs3qODiIxkSdWH8UbKsmJS5%255FIp9DLfdaXlzsbIu0%252Djv3NcQZy8nLl2qbeONz73ZI6L5Hk57%26pstart%3D6%26b%3D101
my $sURL = $oA->attr('href');
print STDERR " + raw next URL ==$sURL==\n" if (2 <= $self->{_debug});
# Delete Yahoo-redirect portion of URL:
$sURL =~ s!\A.+?[-*]+(?=http)!!;
print STDERR " + poached next URL ==$sURL==\n" if (2 <= $self->{_debug});
$sURL = WWW::Search::unescape_query($sURL);
$self->{_next_url} = $self->absurl($self->{'_prev_url'}, $sURL);
print STDERR " + cooked next URL ==$self->{_next_url}==\n" if (2 <= $self->{_debug});
last NEXT_A;
} # if
} # foreach NEXT_A
return $hits_found;
} # _parse_tree
=head2 parse_details
Given a (portion of an) HTML::TreeBuilder tree
and a L<WWW::SearchResult> object,
parses one result out of the tree and populates the SearchResult.
=cut
sub parse_details
{
my $self = shift;
# Required arg1 = (part of) an HTML parse tree:
my $oLI = shift;
# Required arg2 = a WWW::SearchResult object to fill in:
my $hit = shift;
my $oDIV = $oLI->look_down(_tag => 'div',
class => 'abstr',
);
if (ref($oDIV))
{
my $sDesc = $oDIV->as_text;
$hit->description($self->strip($sDesc));
} # if
# Delete the useless human-readable restatement of the URL (first
# <EM> tag we come across):
my $oEM = $oLI->look_down(_tag => 'em');
if (ref($oEM))
{
my $sSize = '';
$sSize = $1 if ($oLI->as_text =~ m!(\d+[kb]?)!gx);
$hit->size($sSize);
} # if
return;
# Delete any remaining <A> tags:
my @aoA = $oLI->look_down(_tag => 'a');
A_TAG:
foreach my $oA (@aoA)
{
$oA->detach;
$oA->delete;
} # foreach A_TAG
$oDIV = $oLI->look_down(_tag => 'div');
if (ref $oDIV)
{
$oDIV->detach;
$oDIV->delete;
} # if
my $sDesc = $oLI->as_text;
print STDERR " + raw sDesc is ==$sDesc==\n" if (2 <= $self->{_debug});
# Grab stuff off the end of the description:
print STDERR " + cooked sDesc is ==$sDesc==\n" if (2 <= $self->{_debug});
$hit->description($self->strip($sDesc));
} # parse_details
=head2 _where_to_find_count
Returns a list,
which will be passed as arguments to HTML::Element::look_down()
in order to return an HTML::Element
which contains the approximate result count.
=cut
sub _where_to_find_count
{
my %hash = (
_tag => 'div',
# 'class' => 'ygbody',
id => 'info',
);
return \%hash;
} # _where_to_find_count
=head2 _string_has_count
Given a string,
returns the approximate result count
if that string contains the approximate result count.
=cut
sub _string_has_count
{
my $self = shift;
my $s = shift;
# print STDERR " DDD Yahoo::string_has_count($s)?\n";
return $1 if ($s =~ m!\bof\s+(?:about\s+)?([,0-9]+)!i);
return -1;
} # _string_has_count
=head2 _a_is_next_link
Given an HTML::Element,
returns true if it seems to contain the clickable "next page" widget.
=cut
sub _a_is_next_link
{
my $self = shift;
my $oA = shift;
return 0 if (! ref $oA);
my $sID = $oA->attr('id') || '';
return 1 if ($sID eq 'pg-next');
my $s = $oA->as_text;
print STDERR " + next A as_text ==$s==\n" if (2 <= $self->{_debug});
return ($s =~ m!\A$WS*Next$WS+&gt;$WS*\z!i);
} # _a_is_next_link
=head2 strip
Given a string,
strips leading and trailing whitespace off of it.
=cut
sub strip
{
my $self = shift;
my $s = &WWW::Search::strip_tags(shift);
$s =~ s!\A$WS+ !!x;
$s =~ s! $WS+\Z!!x;
return $s;
} # strip
1;
__END__
GUI search:
http://ink.yahoo.com/bin/query?p=sushi+restaurant+Columbus+Ohio&hc=0&hs=0
Advanced search:
http://search.yahoo.com/search?h=w&fr=op&va=&vp=&vo=Martin+Thurn&ve=&bbase=Search&vl=&vc=&vd=all&vt=any&vss=i&vs=&vr=&vk=
actual next link from page:
http://google.yahoo.com/bin/query?p=%22Shelagh+Fraser%22&b=21&hc=0&hs=0&xargs=
_next_url :
http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101
=head1 SEE ALSO
To make new back-ends, see L<WWW::Search>.
=head1 BUGS
Please tell the maintainer if you find any!
=head1 AUTHOR
As of 1998-02-02, C<WWW::Search::Yahoo> is maintained by Martin Thurn
(mthurn@cpan.org).
C<WWW::Search::Yahoo> was originally written by Wm. L. Scheding,
based on C<WWW::Search::AltaVista>.
=head1 LEGALESE
THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
=head1 LICENSE
Copyright (C) 1998-2009 Martin 'Kingpin' Thurn
This software is released under the same license as Perl itself.
=cut