#!/usr/bin/perl -w

=head1 NAME

mm-xml2aw-xml.pl - This program converts MetaMap xml (mm-xml) formatted
text into the all words xml (aw-xml) format.

=head1 SYNOPSIS

This program converts MetaMap xml (mm-xml) formatted text into the
all words xml (aw-xml) format.

=head1 USAGE

perl mm-xml2aw-xml.pl [OPTIONS] DESTINATION SOURCE

=head2 DESTINATION

File the all words xml is written to. If it already exists the program
asks on standard input whether it may be overwritten.

=head2 SOURCE

File containing the MetaMap xml. It may hold several xml documents one
after another, each starting with its own C<< <?xml version ... ?> >>
declaration; every document becomes one C<< <text> >> element.

=head2 Optional Arguments:

=head3 --log DIRECTORY

Directory to contain temporary and log files. The value is currently
only echoed back on standard error.

DEFAULT: log.<timestamp>

=head3 --help

Displays the quick summary of program options.

=head3 --version

Displays the version information.

=head1 OUTPUT

All words xml format similar to the SemEval all words disambiguation
task. In this format, each term assigned one or more concepts in the
metamap xml file is output as follows (one token per line):

  <?xml version="1.0"?>
  <!DOCTYPE corpus SYSTEM "all-words.dtd">
  <corpus lang="en">
  <text id="001">
  <head id="d001.s001.t001" candidates="C1280500,C2348382">effect</head>
  of
  the
  <head id="d001.s001.t004" candidates="C0449238">duration</head>
  </text>
  </corpus>

There exists an addition to the regular SemEval format. The candidate
tags contain each possible sense of the term assigned by metamap. These
will be used as the possible senses in the umls-allwords-senserelate.pl
program when using the --candidate option. Otherwise, the senses come
from doing a dictionary lookup in the MRCONSO table of the UMLS.

=head1 PROGRAM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=item * XML::Twig

=back

=head1 AUTHOR

 Bridget T. McInnes, University of Minnesota, Twin Cities

=head1 COPYRIGHT

Copyright (c) 2007-2008,

 Bridget T. McInnes, University of Minnesota, Twin Cities
 bthomson at cs.umn.edu

 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

###############################################################################
#                           THE CODE STARTS HERE
###############################################################################

use strict;
use warnings;

use Getopt::Long;
use XML::Twig;
use File::Spec;

#                         ================================
#                          COMMAND LINE OPTIONS AND USAGE
#                         ================================

my ($opt_help, $opt_version, $opt_log);

GetOptions(
    'version' => \$opt_version,
    'help'    => \$opt_help,
    'log=s'   => \$opt_log,
) or die("Please check the above mentioned option(s).\n");

#  if help is defined, print out help
if (defined $opt_help) {
    showHelp();
    exit;
}

#  if version is requested, show version
if (defined $opt_version) {
    showVersion();
    exit;
}

#  $set collects the options given by the user, $default the ones that
#  fell back to their default; both are only reported on STDERR
my $default = "";
my $set     = "";

#  set the time stamp
my $timestamp = time_stamp();

#  set the log file (the name is only echoed, nothing is written to it)
my $log = "log.$timestamp";
if (defined $opt_log) {
    $log = $opt_log;
    $set .= " --log $log\n";
}
else {
    $default .= " --log $log\n";
}

if ($set ne "") {
    print STDERR "User Options: \n";
    print STDERR "$set\n";
}
if ($default ne "") {
    print STDERR "Default Options:\n";
    print STDERR "$default\n";
}

#  At least 2 terms should be given on the command line.
if (scalar(@ARGV) < 2) {
    print STDERR "The input and output files must be given on the command line.\n";
    minimalUsageNotes();
    exit;
}

#  the DESTINATION comes first, the SOURCE second
my $outfile = shift @ARGV;
my $infile  = shift @ARGV;

#  check that output file has been supplied
if (!($outfile)) {
    print STDERR "No output file (DESTINATION) was supplied.\n";
    askHelp();
    exit;
}

#  check if the output file already exists
if (-e $outfile) {
    print "DESTINATION ($outfile) already exists! Overwrite (Y/N)?";
    my $reply = <STDIN>;
    #  a closed STDIN yields undef; treat it as "no"
    $reply = "" unless defined $reply;
    chomp $reply;
    $reply = uc($reply);
    exit 0 if ($reply ne "Y");
}

#  open the input and output files
open(my $in_fh,  '<', $infile)  or die "Could not open $infile: $!\n";
open(my $out_fh, '>', $outfile) or die "Could not open $outfile: $!\n";

#  split the input into one chunk per xml document; a new document
#  starts at every xml declaration
my @abstracts = ();
my $abstract  = "";
while (my $line = <$in_fh>) {
    if ($line =~ /<\?xml version/) {
        if ($abstract ne "") { push @abstracts, $abstract; }
        $abstract = "";
    }
    $abstract .= $line;
}
close $in_fh;

#  the last document is not followed by another declaration, so it has
#  to be stored explicitly
if ($abstract ne "") { push @abstracts, $abstract; }

#  print header information
print {$out_fh} "<?xml version=\"1.0\"?>\n";
print {$out_fh} "<!DOCTYPE corpus SYSTEM \"all-words.dtd\">\n";
print {$out_fh} "<corpus lang=\"en\">\n";

my $tmpfile    = "$infile.processing";
my $abstractid = 0;

foreach my $document (@abstracts) {

    #  skip chunks without any element (blank lines, a lone declaration)
    if ($document !~ /<[A-Za-z_]/) { next; }

    #  increment id
    $abstractid++;

    #  print document id
    my $aid = sprintf("%03d", $abstractid);
    print {$out_fh} "<text id=\"$aid\">\n";

    #  write this document to its own file so that it can be parsed
    if (-e $tmpfile) { unlink $tmpfile; }
    open(my $tmp_fh, '>', $tmpfile) or die "Could not open $tmpfile: $!\n";
    print {$tmp_fh} $document;
    close($tmp_fh) or die "Could not write $tmpfile: $!\n";

    #  load the metamap xml output
    my $twig = XML::Twig->new();
    $twig->parsefile($tmpfile);
    my $root = $twig->root;

    #  data collected for the phrase currently being read
    my @cuis       = ();
    my @matches    = ();
    my @tokens     = ();
    my $in_mapping = 0;

    my $sentenceid = 0;
    my $tokenid    = 0;

    #  writes the collected phrase and gets ready for the next one
    my $flush_phrase = sub {
        $tokenid = write_phrase($out_fh, $abstractid, $sentenceid, $tokenid,
                                \@tokens, \@cuis, \@matches);
        @tokens     = ();
        @cuis       = ();
        @matches    = ();
        $in_mapping = 0;
    };

    #  loop through the elements in document order; next_elt reports an
    #  element when it opens, so a phrase is complete once the next
    #  Phrase or UttText opens or the document ends
    my $elt = $root;
    while ($elt = $elt->next_elt($root)) {
        my $name = $elt->local_name;

        if ($name eq "UttText") {
            #  finish the previous sentence before numbering restarts
            $flush_phrase->();
            $sentenceid++;
            $tokenid = 0;
        }
        elsif ($name eq "Phrase") {
            $flush_phrase->();
        }
        elsif ($name eq "InputMatch") {
            push @tokens, $elt->text;
        }
        #  check if in mapping
        elsif ($name eq "Mapping") {
            $in_mapping = 1;
        }
        #  if in mapping, get the cui
        elsif ($name eq "CandidateCUI") {
            push @cuis, $elt->text if $in_mapping;
        }
        #  if in mapping, get the matched string
        elsif ($name eq "CandidateMatched") {
            if ($in_mapping) {
                my $match = $elt->text;
                $match =~ s/[\*\?\+\(\)\[\]\/ ]//g;
                push @matches, lc($match);
            }
        }
    }

    #  the last phrase of the document
    $flush_phrase->();

    print {$out_fh} "<\/text>\n";

    #  remove the processing file
    unlink $tmpfile;
}

print {$out_fh} "<\/corpus>\n";
close($out_fh) or die "Could not write $outfile: $!\n";

##############################################################################
#  SUB FUNCTIONS
##############################################################################

#  function to write the tokens of one phrase
#  input : $fh         <- handle the output is printed to
#          $abstractid <- number of the current document
#          $sentenceid <- number of the current sentence
#          $tokenid    <- number of the last token written so far
#          $tokens     <- reference to the tokens of the phrase
#          $cuis       <- reference to the cuis of the phrase mappings
#          $matches    <- reference to the matched strings; entry i
#                         belongs to cui i
#  output: $tokenid    <- number of the last token written
sub write_phrase {
    my ($fh, $abstractid, $sentenceid, $tokenid, $tokens, $cuis, $matches) = @_;

    #  assign each cui to the tokens containing its matched string; if
    #  no token contains it the string is shortened from the right
    #  until one does or nothing is left
    my %mappings = ();
    foreach my $i (0 .. $#{$matches}) {
        next unless defined $cuis->[$i];

        my $term = $matches->[$i];
        while ($term !~ /^\s*$/) {
            my $found = 0;
            foreach my $token (@{$tokens}) {
                #  work on a copy: the loop variable is an alias of the
                #  array element and the token text must stay untouched
                my $tok = lc($token);
                #  plain substring test, the term is not a pattern
                if (index($tok, $term) >= 0) {
                    $mappings{$tok}{"$cuis->[$i]/$matches->[$i]"}++;
                    $found = 1;
                }
            }
            last if $found;
            chop $term;
        }
    }

    foreach my $token (@{$tokens}) {
        my $tok = lc($token);

        $tokenid++;
        my $id = sprintf("d%03d.s%03d.t%03d",
                         $abstractid, $sentenceid, $tokenid);

        my @senses = ();
        foreach my $m (sort keys %{ $mappings{$tok} || {} }) {
            if ($m =~ /(C[0-9]+)\//) { push @senses, $1; }
        }
        my $senses = join(",", @senses);

        if ($senses =~ /^\s*$/) {
            print {$fh} "$token\n";
        }
        else {
            print {$fh} "<head id=\"$id\" candidates=\"$senses\">$token<\/head>\n";
        }
    }

    return $tokenid;
}

#  function to create a timestamp
#  output: $stamp <- local time as YYYYMMDDhhmmss
sub time_stamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);

    $year += 1900;
    $mon++;

    my $date  = sprintf("%4d%2.2d%2.2d", $year, $mon, $mday);
    my $clock = sprintf("%2.2d%2.2d%2.2d", $hour, $min, $sec);

    my $stamp = $date . $clock;

    return ($stamp);
}

#  function to output minimal usage notes
sub minimalUsageNotes {
    print STDERR "Usage: mm-xml2aw-xml.pl [OPTIONS] DESTINATION SOURCE\n";
    askHelp();
}

#  function to output help messages for this program
sub showHelp {
    print "Usage: mm-xml2aw-xml.pl [OPTIONS] DESTINATION SOURCE\n\n";

    print "Takes as input a MetaMap xml (mm-xml) file and converts it\n";
    print "to all-words xml format for umls-allwords-senserelate.pl.\n\n";

    print "OPTIONS:\n\n";

    print "--log                    Directory to contain temporary and log\n";
    print "                         files. DEFAULT: log.<timestamp>\n\n";

    print "--version                Prints the version number\n\n";

    print "--help                   Prints this help message.\n\n";
}

#  function to output the version number
sub showVersion {
    print '$Id: mm-xml2aw-xml.pl,v 1.7 2011/05/16 14:12:26 btmcinnes Exp $';
    print "\nCopyright (c) 2007, Ted Pedersen & Bridget McInnes\n";
}

#  function to output "ask for help" message when user's goofed
sub askHelp {
    print STDERR "Type mm-xml2aw-xml.pl --help for help.\n";
}