#!/usr/bin/perl =head1 NAME MeSH::Parser::ASCII - parser for the MeSH ASCII format =head1 SYNOPSIS use MeSH::Parser::ASCII; # instantiate the parser my $parser = MeSH::Parser::ASCII->new( meshfile => 'd2010.bin' ); # parse the file $parser->parse(); # loop through all the headings while (my ($id, $heading) = each %{$parser->heading} ){ print $id . ' - ' . $heading->{label} . "\n"; for my $synonym (@{$heading->{synonyms}}){ print "\t$synonym\n"; } } =head1 DESCRIPTION Parser for the MeSH ASCII format. =over =item meshfile MeSH file in ASCII format =back =head2 METHODS =over =item parse() Parses the MeSH file and loads it into a hash ref. =item heading Returns a hash ref collection of all the parsed headings. Each consists of a label, and id and synonyms if any were available. Label is extracted from I<Mesh Heading> field in Descriptor Data Elements, or I<Name of substance> in Supplementary Concept Records, or I<Subheading> in Qualifier Data Elements. Synonyms are only parsed for Descriptor Data Elements (I<PRINT ENTRY> and I<ENTRY> entries) =back =head1 AUTHOR Tomasz Adamusiak <tomasz@cpan.org> =head1 COPYRIGHT AND LICENSE Copyright (c) 2010 European Bioinformatics Institute. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under GPLv3. This software is provided "as is" without warranty of any kind. =cut package MeSH::Parser::ASCII; use Moose 0.89; use Log::Log4perl qw(:easy); Log::Log4perl->easy_init( { level => $INFO, layout => '%-5p - %m%n' } ); our $VERSION = 0.01; has 'meshfile' => ( is => 'rw', isa => 'Str', required => 1 ); has 'heading' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); sub parse() { my $self = shift; INFO 'Parsing file ' . $self->meshfile . ' ...'; # open file open my $fh, '<', $self->meshfile; my ( $label, $id, $synonyms, $count ); $count->{syns} = 0; while (<$fh>) { # multiplatform chomp s/\015?\012?$//; # initialise if (/^\*NEWRECORD/) { $synonyms = undef; $label = undef; $id = undef; } DEBUG '<' . $_ . '>'; # save on new line if (/^$/) { LOGDIE 'Could not parse heading\'s label.' unless defined $label; $count->{headings}++; WARN "Duplicate heading found for $id" if defined $self->heading->{$id}; $self->heading->{$id}->{label} = $label; $self->heading->{$id}->{synonyms} = $synonyms if defined $synonyms; DEBUG $label . ' ' . $id . "\n"; for my $syn (@$synonyms) { DEBUG "\t" . $syn; $count->{syns}++; } } # Mesh Heading in Descriptor Data Elements $label = ( split(/ = /) )[1] if /^MH = /; # Name of substance in Supplementary Concept Records $label = ( split(/ = /) )[1] if /^NM = /; # Subheading in Qualifier Data Elements $label = ( split(/ = /) )[1] if /^SH = /; $id = ( split(/ = /) )[1] if /^UI = /; # PRINT ENTRY and ENTRY are synonyms in Descriptor Data Elements # splits on ENTRY = , and then disregards anything after pipe | push @$synonyms, ( split( /\|/, ( split(/ = /) )[1] ) )[0] if /^ENTRY = /; push @$synonyms, ( split( /\|/, ( split(/ = /) )[1] ) )[0] if /^PRINT ENTRY = /; } close $fh; LOGDIE 'Could not parse any headings.' unless defined $count->{headings}; INFO "Loaded " . $count->{headings} . " headings and " . $count->{syns} . " synonyms"; 1; } 1;