#!/usr/bin/env perl

=head1 NAME

OWL::Simple::Parser

=head1 SYNOPSIS

	use OWL::Simple::Parser;
	
	# load Experimental Factor Ontology
	my $parser = OWL::Simple::Parser->new(  owlfile => 'efo.owl',
			synonym_tag => 'alternative_term' );
	
	# parse file
	$parser->parse();
	
	# iterate through all the classes
	for my $id (keys %{ $parser->class }){
		my $OWLClass = $parser->class->{$id};
		print $id . ' ' . $OWLClass->label . "\n";
		
		# list synonyms
		for my $syn (@{ $OWLClass->synonyms }){
			print "\tsynonym - $syn\n";
		}
		
		# list definitions
		for my $def (@{ $OWLClass->definitions }){
			print "\tdef - $def\n";
		}
		
		# list parents
		for my $parent (@{ $OWLClass->subClassOf }){
			print "\tsubClassOf - $parent\n";
		}
	}

=head1 DESCRIPTION

A simple OWL parser loading accessions, labels and synonyms and exposes them
as a collection of OWL::Simple::Class objects.

In the constructor specify the owlfile to be loaded and an optional synonym_tag
used in the ontology (example FULL_SYN for NCI Thesaurus).

=head2 METHODS

=over

=item class_count()

Number of classes loaded by the parser.

=item synonyms_count()

Number of synonyms loaded by the parser.

=item class

Hash collection of all the OWL::Simple::Class objects

=back

=head1 AUTHOR

Tomasz Adamusiak <tomasz@cpan.org>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2010 European Bioinformatics Institute. All Rights Reserved.

This module is free software; you can redistribute it and/or modify it 
under GPLv3.

This software is provided "as is" without warranty of any kind.

=cut

package OWL::Simple::Parser;

use Moose 0.89;
use OWL::Simple::Class;
use XML::Parser 2.34;
use Data::Dumper;
use Log::Log4perl qw(:easy);
Log::Log4perl->easy_init( { level => $INFO, layout => '%-5p - %m%n' } );

our $VERSION = 0.07;

has 'owlfile'     => ( is => 'rw', isa => 'Str',     required => 1 );
has 'class'       => ( is => 'ro', isa => 'HashRef', default  => sub { {} } );
has 'class_count' => ( is => 'rw', isa => 'Int',     default  => 0 );
has 'synonyms_count' => ( is => 'rw', isa => 'Int', default => 0 );
has 'synonym_tag' =>
  ( is => 'rw', isa => 'Str', default => 'efo:alternative_term' );

my $parser;
my $path = '';
my $class = OWL::Simple::Class->new();
my %restriction;

# Default constructor. Initializes the XML::Parser and sets appropriate handlers.

sub BUILD() {
	my $self = shift;
	$parser = new XML::Parser;
	$parser->setHandlers(
		Start => sub { $self->startElement(@_) },
		End   => sub { $self->endElement(@_) },
		Char  => sub { $self->characterData(@_) },
	);
}

# Increments internal counter of classes and synonyms parser respectively.

sub incr_classes() {
	my $self = shift;
	$self->class_count( $self->class_count + 1 );
}

sub incr_synonyms() {
	my $self = shift;
	$self->synonyms_count( $self->synonyms_count + 1 );
}

# Main function. Parser the owlfile using XML::Parser

sub parse() {
	my $self = shift;
	$parser->parsefile( $self->owlfile );
	INFO "LOADED "
	  . $self->class_count
	  . ' CLASSES AND '
	  . $self->synonyms_count
	  . ' SYNONYMS from '
	  . $self->owlfile;

	1;
}

# Handler executed by XML::Parser. Adds current element to $path.
# $path is used characterData() to determine whtether node text should be
# added to class.
#
# Initializes a new OWLClass object and stores it in $class. This is later
# populated by other handlers.

sub startElement() {
	my ( $self, $parseinst, $element, %attr ) = @_;
	DEBUG "->startElement  $self, $parseinst, $element";
	$path = $path . '/' . $element;    # add element to path
	if ( $path eq '/rdf:RDF/owl:Class' ) {
		$self->incr_classes();
		INFO(
			"Loaded " . $self->class_count . " classes from " . $self->owlfile )
		  if $self->class_count % 1000 == 0;
		$class = OWL::Simple::Class->new();
		$class->id( $attr{'rdf:about'} ) if defined $attr{'rdf:about'};
		$class->id( $attr{'rdf:ID'} )    if defined $attr{'rdf:ID'};
		WARN 'DUPLICATE RDF:ID & RDF:ABOUT IN ' . $attr{'rdf:about'}
		  if ( defined $attr{'rdf:id'} && defined $attr{'rdf:about'} );
	}

	# Two ways to match parents, either as rdf:resource attribute
	# on rdfs:subClassOf or rdf:about on nested rdfs:subClassOf/owl:Class
	elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf' ) {
		push @{ $class->subClassOf }, $attr{'rdf:resource'}
		  if defined $attr{'rdf:resource'};
	}
	elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf/owl:Class' ) {
		push @{ $class->subClassOf }, $attr{'rdf:about'}
		  if defined $attr{'rdf:about'};
	}

	# Here we try to match relations, e.g. part_of, derives_from, etc.
	elsif ( $element eq 'owl:Restriction' ) {
		$restriction{type}  = undef;
		$restriction{class} = [];
	}
	elsif ( $element eq 'owl:someValuesFrom' ) {
		push @{ $restriction{class} }, $attr{'rdf:resource'}
		  if defined $attr{'rdf:resource'};
		push @{ $restriction{class} }, $attr{'rdf:about'}
		  if defined $attr{'rdf:about'};
	}

	# Regex as properties can be transitive, etc.
	elsif ( $element =~ /owl:\w+Property$/ ) {
		$restriction{type} = $attr{'rdf:about'} if defined $attr{'rdf:about'};
		$restriction{type} = $attr{'rdf:resource'}
		  if defined $attr{'rdf:resource'};
	}
}

# Handler executed by XML::Parser when node text is processed.
#
# For rdfs:label stores the value into $class->label.
#
# For synonym_tag (defaulting to efo:alternative_term), runs some post
# processing removing CDATA statements form NCIt synonyms and [accessedResource],
# [accessDate] tags from EFO synonyms. The synonyms are stored in $class->synonyms.

sub characterData {
	my ( $self, $parseinst, $data ) = @_;
	DEBUG "->characterData  $self, $parseinst, $data";

	# Get rdfs:label
	if ( $path eq '/rdf:RDF/owl:Class/rdfs:label' ) {
		$class->label(
			( defined $class->label() ? $class->label() : '' ) . $data );
	}

	# Get definition_citation or defintion
	elsif ($path eq '/rdf:RDF/owl:Class/efo:definition_citation'
		|| $path eq '/rdf:RDF/owl:Class/efo:definition' )
	{
		$class->annotation(
			( defined $class->annotation() ? $class->annotation() : '' )
			. $data );
	}

	# FIXME obsolete code
	# Remove source tags from efo classes
	# Special case for the OBO converter ONLY
	# FIXME consider a more general approach
	# FIXME provenance information is now stored in a separate annotation
	#if ( $data =~ m/^(.*)\[accessedResource: (.*)\]\[.*\]$/ ) {
	#	$data = '"' . $1 . '"' . " [$2]";
	#}
	#else {
	#	$data = '"' . $data . '" []';
	#}

	# Get synonyms, either matching to anything with synonym or
	# alternative_term inside or custom tag from parameters
	elsif (
		   $path =~ m{^/rdf:RDF/owl:Class/\w*:?\w*(synonym|alternative_term)\w*}
		|| $path eq '/rdf:RDF/owl:Class/' . $self->synonym_tag )
	{
		$self->incr_synonyms();

		# detecting closing tag inside, NCIt fix
		if ( $data =~ m!</! ) {
			($data) = $data =~ m!>(.*?)</!;    # match to first entry
		}

		# remove source tags from efo classes
		if ( $data =~ m/^(.*)\[accessedResource.*\]$/ ) {
			$data = $1;
		}
		WARN( "Unparsable synonym detected for " . $class->id )
		  unless defined $data;
		push @{ $class->synonyms }, $data if defined $data;
	}
}

# Handler executed by XML::Parser when the closing tag
# is encountered. For owl:Class it pushes it into the class hash as it was
# processed by characterData() already and the parser is ready to
# process a new owl:Class.
#
# Also strips the closing tag from $path.

sub endElement() {
	my ( $self, $parseinst, $element ) = @_;
	DEBUG "->endElement  $self, $parseinst, $element";

	# Reached end of class, add the class to hash
	if (   $path eq '/rdf:RDF/owl:Class'
		&& $class->id ne "http://www.w3.org/2002/07/owl#Thing" )
	{
		WARN 'Class ' . $class->id . ' possibly duplicated'
		  if defined $self->class->{ $class->id };
		my $classhash = $self->class;
		$classhash->{ $class->id } = $class;
	}

	# Reached end of the relationship tag, add to appropriate array
	# Currently supports only part_of, and even that poorly.
	# FIXME circular references
	elsif ( $element eq 'owl:Restriction' ) {
		WARN "UNDEFINED RESTRICTION " . $class->id
		  if not defined $restriction{type};
		if ( $restriction{type} =~ m!/part_of$! ) {
			for my $cls ( @{ $restriction{class} } ) {
				push @{ $class->part_of }, $cls;
			}
		}
	}

	# character data can be called multiple times
	# for a single element, so it's concatanated there
	# and saved here
	elsif ( $element eq 'efo:definition_citation' ) {
		push @{ $class->xrefs }, $class->annotation if $class->annotation() ne '';
	}
	elsif ( $element eq 'efo:definition' ) {
		push @{ $class->definitions }, $class->annotation if $class->annotation() ne '';
	}
	print Dumper($class) unless defined $class->annotation;
	# clear temp annotation
	$class->annotation('');

	#remove end element from path
	$path =~ s!/$element$!!;
}

1;