use strict;
$Alvis::Document::Type::VERSION = '0.1';
#############################################################################
#
# Tries to predict the type of a document. Currently pretty crude.
#
# Kimmo Valtonen
#
#############################################################################
#############################################################################
#
# Error message stuff
#
#############################################################################
my $ErrStr;
my ($ERR_OK,
$ERR_DOC,
$ERR_FILE_TYPE
)=(0..2);
my %ErrMsgs=($ERR_OK=>"",
$ERR_DOC=>"No document.",
$ERR_FILE_TYPE=>"Unable to instantiate File::Type"
);
sub _set_err_state
{
my $self=shift;
my $errcode=shift;
my $errmsg=shift;
if (!defined($errcode))
{
confess("set_err_state() called with an undefined argument.");
}
if (exists($ErrMsgs{$errcode}))
{
if ($errcode==$ERR_OK)
{
$self->{errstr}="";
}
else
{
$self->{errstr}.=" " . $ErrMsgs{$errcode};
if (defined($errmsg))
{
$self->{errstr}.=" " . $errmsg;
}
}
}
else
{
confess("Internal error: set_err_state() called with an " .
"unrecognized argument ($errcode).")
}
}
sub errmsg
{
my $self=shift;
return $self->{errstr};
}
#############################################################################
#
# Methods
#
##############################################################################
sub new
{
my $proto=shift;
my $class=ref($proto)||$proto;
my $parent=ref($proto)&&$proto;
my $self={};
bless($self,$class);
$self->_init(@_);
$self->_set_err_state($ERR_OK);
$self->{fileType}=File::Type->new();
if (!defined($self->{fileType}))
{
$self->_set_err_state($ERR_FILE_TYPE);
return undef;
}
return $self;
}
sub _init
{
my $self=shift;
$self->{defaultType}='text';
$self->{defaultSubType}='plain';
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#
# Returns similarly to MIME ($type,$sub_type), but is broader
#
sub guess
{
my $self=shift;
my $text=shift;
$self->_set_err_state($ERR_OK); # clean the slate
if (!defined($text))
{
$self->_set_err_state($ERR_DOC);
return undef;
}
my ($type,$sub_type);
# Try File::Type first
my $mime_type=$self->{fileType}->mime_type($text);
if (!defined($mime_type))
{
$type=$self->{defaultType};
$sub_type=$self->{defaultSubType};
}
else
{
($type,$sub_type)=split(/\//,$mime_type,-1);
}
# If the result is a generic one, check for our types of interest
# by other means
# BTW, File::Type should make it clear and checkable what its
# "I dunno" reply is
if ($type eq 'application' && $sub_type eq 'octet-stream')
{
if ($self->_looks_like_HTML($text))
{
($type,$sub_type)=('text','html');
}
elsif ($self->_looks_like_RSS($text))
{
# not a MIME type
($type,$sub_type)=('text','rss')
}
}
return ($type,$sub_type);
}
sub _looks_like_HTML
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type=~/(?:html|wml)/igo)
{
return 1;
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($text=~/<(?:(?i)html|body)\W/sgo)
{
return 1;
}
return 0;
}
sub _looks_like_RSS
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type=~/(?:rss)/igo)
{
return 1;
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($text=~/<(?:(?i)rss|channel)\W/sgo)
{
return 1;
}
return 0;
}
1;
1;
__END__
=head1 NAME
Alvis::Document::Type - Perl extension for guessing and checking the type
of a document (an extension of MIME types).
=head1 SYNOPSIS
use Alvis::Document::Type;
# Create a new instance
my $t=Alvis::Document::Type->new(defaultType=>'text',
defaultSubType=>'html');
if (!defined($t))
{
die('Ugh!');
}
my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
if (!(defined($doc_type) && defined($doc_sub_type)))
{
die("Guess what? " . $t->errmsg());
}
=head1 DESCRIPTION
Tries to guess the type of a document similarly to MIME types
(type and a subtype).
Adds subtypes 'rss' and 'html' to MIME type 'text'.
=head1 METHODS
=head2 new()
Options:
defaultType The default type (text).
defaultSubType The default subtype (plain).
=head2 guess($text)
Tries to guess the type of $text.
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
=head1 AUTHOR
Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut