package HTML::ListScraper::Book; use warnings; use strict; use Class::Generate qw(class); class 'HTML::ListScraper::Tag' => { name => { type => '$', required => 1, readonly => 1 }, index => { type => '$', required => 1, readonly => 1 }, link => { type => '$', readonly => 1 }, text => '$', '&append_text' => q{ $text .= $_[0]; } }; sub new { my $class = shift; my $self = { shapeless => 0, index => 0, dseq => [ ], next => 0, tseq => [ ], p2t => { } }; # the list is from HTML 4.01 Transitional DTD; head and body is # included not because we seriously expect them to be unpaired, # but just to simplify documentation - they aren't going to get # into repeated sequences anyway... foreach (qw(area base basefont body br col colgroup dd dt frame head hr img input isindex li link meta option p param tbody td tfoot th thead tr)) { $self->{unclosed_tags}->{$_} = 1; } bless $self, $class; return $self; } sub shapeless { my $self = shift; if (@_) { $self->{shapeless} = !!$_[0]; } return $self->{shapeless}; } sub is_unclosed_tag { my ($self, $name) = @_; return exists($self->{unclosed_tags}->{$name}); } sub push_item { my ($self, $name) = @_; my $index = ($self->{index})++; $self->_push(HTML::ListScraper::Tag->new(name => $name, index => $index)); } sub push_link { my ($self, $name, $link) = @_; my $index = ($self->{index})++; $self->_push(HTML::ListScraper::Tag->new( name => $name, index => $index, link => $link)); } sub get_internal_name { my ($self, $name) = @_; return exists($self->{p2t}->{$name}) ? $self->{p2t}->{$name} : undef; } sub intern_name { my ($self, $name) = @_; if (!exists($self->{p2t}->{$name})) { use bytes; my $c = ($self->{next})++; if ($self->{next} > 255) { # 18Apr2007: HTML::ListScraper::get_known_sequence # depends on 1-byte internal names die "can't handle so many tags"; # could probably switch to 2-byte numbers, but is that # useful? } $self->{p2t}->{$name} = bytes::chr($c); } return $self->{p2t}->{$name}; } sub _push { my ($self, $td) = @_; my $name = $td->name; my $iname = $self->intern_name($name); push @{$self->{dseq}}, $td; push @{$self->{tseq}}, $iname; } sub append_text { my ($self, $text) = @_; my $count = scalar(@{$self->{dseq}}); # ignore text before the first tag if (!$count) { return; # if we had a verbose mode, we would warn here } my $td = $self->{dseq}->[$count - 1]; $td->append_text($text); } sub get_internal_sequence { my $self = shift; return wantarray ? @{$self->{tseq}} : $self->{tseq}; } sub is_presentable { my ($self, $start, $len) = @_; if ($self->{shapeless}) { return 1; } my $i = 0; my @stack; while ($i < $len) { my $name = $self->{dseq}->[$start + $i]->name; my $tag = $name; $tag =~ s~^\/~~; if ($name eq $tag) { push @stack, $tag; } else { while (scalar(@stack) && ($stack[scalar(@stack) - 1] ne $tag)) { if ($self->is_unclosed_tag($stack[scalar(@stack) - 1])) { pop @stack; } else { return 0; } } if (!scalar(@stack)) { return 0; } pop @stack; } ++$i; } while (scalar(@stack)) { my $top = pop @stack; if (!$self->is_unclosed_tag($top)) { return 0; } } return 1; } sub get_all_tags { my $self = shift; return wantarray ? @{$self->{dseq}} : $self->{dseq}; } sub get_tags { my ($self, $start, $len) = @_; my $last = $start + $len - 1; return @{$self->{dseq}}[$start .. $last]; } sub get_tag { my ($self, $pos) = @_; return $self->{dseq}->[$pos]; } 1;