#!/usr/bin/perl

=head1 NAME

umls-similarity.pl - this program returns a semantic similarity score between two concepts

=head1 SYNOPSIS

This is a utility that takes as input either two terms (DEFAULT)
or two CUIs and returns the similarity between the two.

=head1 USAGE

Usage: umls-similarity.pl [OPTIONS] [CUI1|TERM1] [CUI2|TERM2]

=head1 INPUT

=head3 [CUI1|TERM1] [CUI2|TERM2]

The input is two terms or two CUIs associated with concepts in the UMLS.

=head2 Optional Arguments:

=head3 --infile FILE

A file containing pairs of concepts or terms, one pair per line,
in the following format:

    term1<>term2
    or
    cui1<>cui2
    or
    cui1<>term2
    or
    term1<>cui2

=head3 --username STRING

Username is required to access the umls database on MySql

=head3 --password STRING

Password is required to access the umls database on MySql

=head3 --hostname STRING

Hostname where mysql is located. DEFAULT: localhost

=head3 --database STRING

Database containing the UMLS. DEFAULT: umls

=head3 --measure MEASURE

Use the MEASURE module to calculate the semantic similarity. The
available measures are:

    1. Leacock and Chodorow (1998) referred to as lch
    2. Wu and Palmer (1994) referred to as wup
    3. The basic path measure referred to as path

DEFAULT: path

=head3 --precision N

Displays values up to N decimal places.

=head4 --help

Displays the quick summary of program options.

=head4 --version

Displays the version information.

=head1 OUTPUT

umls-similarity.pl prints one line to standard output for each pair of
concepts that is compared, in the following format:

    score<>term1(cui1)<>term2(cui2)

If no similarity score can be obtained for a pair, a score of -1
(at the requested precision) is printed for it instead.

=head1 SYSTEM REQUIREMENTS =over =item * Perl (version 5.8.5 or better) - http://www.perl.org =item * UMLS::Interface - http://search.cpan.org/dist/UMLS-Interface =item * UMLS::Similarity - http://search.cpan.org/dist/UMLS-Similarity =back =head1 CONTACT US If you have any trouble installing and using UMLS-Similarity, please contact us via the users mailing list : umls-similarity@yahoogroups.com You can join this group by going to: http://tech.groups.yahoo.com/group/umls-similarity/ You may also contact us directly if you prefer : Bridget T. McInnes: bthomson at cs.umn.edu Ted Pedersen : tpederse at d.umn.edu =head1 AUTHOR Bridget T. McInnes, University of Minnesota =head1 COPYRIGHT Copyright (c) 2007-2009, Bridget T. McInnes, University of Minnesota bthomson at cs.umn.edu Ted Pedersen, University of Minnesota Duluth tpederse at d.umn.edu Siddharth Patwardhan, University of Utah, Salt Lake City sidd@cs.utah.edu Serguei Pakhomov, University of Minnesota Twin Cities pakh0002@umn.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to: The Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
=cut

###############################################################################
#                          THE CODE STARTS HERE
###############################################################################

#                      ================================
#                       COMMAND LINE OPTIONS AND USAGE
#                      ================================

use strict;
use warnings;

use lib "/export/scratch/programs/lib/site_perl/5.8.7/";

use UMLS::Interface;
use UMLS::Similarity::lch;
use UMLS::Similarity::path;
use UMLS::Similarity::wup;

use Getopt::Long;

# GetOptions is called without destinations, so Getopt::Long stores each
# option in the package variable $opt_<name>.  They are declared here so
# that the script compiles under "use strict".
our ( $opt_version,  $opt_help,     $opt_username, $opt_password,
      $opt_hostname, $opt_database, $opt_socket,   $opt_measure,
      $opt_config,   $opt_infile,   $opt_precision );

# GetOptions returns false (after printing its own diagnostics) when the
# command line contains an unknown or malformed option; stop instead of
# silently continuing with a half-parsed command line.
GetOptions( "version", "help", "username=s", "password=s", "hostname=s",
            "database=s", "socket=s", "measure=s", "config=s", "infile=s",
            "precision=s" )
    or do { minimalUsageNotes(); exit; };

my $debug = 0;

#  if help is defined, print out help
if ( defined $opt_help ) {
    $opt_help = 1;
    showHelp();
    exit;
}

#  if version is requested, show version
if ( defined $opt_version ) {
    $opt_version = 1;
    showVersion();
    exit;
}

#  At least 2 terms should be given on the command line.
if ( !defined $opt_infile and scalar(@ARGV) < 2 ) {
    print STDERR "At least 2 terms or CUIs should be given on the \n";
    print STDERR "command line or use the --infile option\n";
    minimalUsageNotes();
    exit;
}

#  file-scoped state shared by the subroutines below; these must be
#  declared before the subroutine definitions that refer to them
my $precision   = "";    # number of decimal places in the printed score
my $floatformat = "";    # sprintf format derived from $precision
my $database    = "";    # mysql database name
my $hostname    = "";    # mysql host
my $socket      = "";    # mysql socket path
my $measure     = "";    # name of the selected measure: lch, wup or path
my $umls;                # UMLS::Interface object (set by loadUMLS)
my $similarity;          # UMLS::Similarity::<measure> object (set by loadMeasures)
my $noscore     = "";    # formatted -1 printed when no score is available
my $infile      = "";    # path given with --infile
my %input_hash  = ();    # $input_hash{first}{second} for every input pair

setOptions();
loadUMLS();
loadMeasures();
loadInput();
calculateSimilarity();

#  Prints one line to STDOUT for every pair of concepts derived from the
#  input pairs in %input_hash:
#      score<>term1(cui1)<>term2(cui2)
#  Pairs for which nothing could be scored are printed once as
#      noscore<>input1<>input2
sub calculateSimilarity {

    print STDERR "In calculateSimilarity\n" if $debug;

    foreach my $input1 ( sort keys %input_hash ) {
        foreach my $input2 ( sort keys %{ $input_hash{$input1} } ) {

            print STDERR "INPUT=> $input1 : $input2\n" if $debug;

            my ( $concepts1, $is_cui1 ) = _resolveInput($input1);
            my ( $concepts2, $is_cui2 ) = _resolveInput($input2);

            if ($debug) {
                print STDERR "$input1 (@{$concepts1})\n";
                print STDERR "$input2 (@{$concepts2})\n";
            }

            #  set as soon as any line has been printed for this input pair
            my $printed = 0;

            #  get the similarity between the concepts
            foreach my $cc1 ( @{$concepts1} ) {
                foreach my $cc2 ( @{$concepts2} ) {

                    #  Check that both concepts exist before asking the
                    #  UMLS for anything else about them.
                    if (   !$umls->checkConceptExists($cc1)
                        or !$umls->checkConceptExists($cc2) )
                    {
                        print "$noscore<>$input1<>$input2\n";
                        $printed = 1;
                        next;
                    }

                    #  When the input was a CUI, show one of its terms
                    #  next to it; otherwise show the term as typed.
                    my $t1 = $is_cui1 ? _termForConcept( $cc1, $input1 ) : $input1;
                    my $t2 = $is_cui2 ? _termForConcept( $cc2, $input2 ) : $input2;

                    print STDERR "Obtaining similarity for $cc1 and $cc2\n"
                        if $debug;

                    my $value = $similarity->getRelatedness( $cc1, $cc2 );
                    errorCheck($similarity);

                    #  an undefined value formats as zero, without a warning
                    my $score = sprintf $floatformat,
                        ( defined $value ? $value : 0 );

                    print "$score<>$t1($cc1)<>$t2($cc2)\n";
                    $printed = 1;
                }
            }

            #  no concepts were found for at least one of the inputs
            print "$noscore<>$input1<>$input2\n" if !$printed;
        }
    }

    return;
}

#  Turns one input string into the concepts it stands for.
#  Returns ( \@cuis, $is_cui ): an input that consists entirely of "C"
#  followed by digits is taken to be a CUI and returned as is; anything
#  else is looked up as a term with getConceptList.
sub _resolveInput {
    my ($input) = @_;

    #  anchored so that a term which merely contains something that
    #  looks like a CUI (e.g. "vitamin C12") is still treated as a term
    if ( $input =~ /\AC[0-9]+\z/ ) {
        return ( [$input], 1 );
    }

    my @concepts = $umls->getConceptList($input);
    errorCheck($umls);

    return ( \@concepts, 0 );
}

#  Returns the first term that getTermList reports for $cui, or $fallback
#  when no term is returned.
sub _termForConcept {
    my ( $cui, $fallback ) = @_;

    my @terms = $umls->getTermList($cui);
    errorCheck($umls);

    return ( @terms and defined $terms[0] ) ? $terms[0] : $fallback;
}

#  Fills %input_hash with the pairs to compare, read either from the
#  --infile file (one "first<>second" pair per line, blank lines ignored)
#  or from the first two command line arguments.
sub loadInput {

    print STDERR "In loadInput\n" if $debug;

    #  if file is defined get the terms or cuis from the input file
    if ( defined $opt_infile ) {

        print STDERR "FILE ($opt_infile) DEFINED\n" if $debug;

        open( my $fh, '<', $infile )
            or die "Could not open file: $infile ($!)\n";

        my $linecounter = 0;
        while ( my $line = <$fh> ) {
            $linecounter++;
            chomp $line;

            next if $line =~ /^\s*$/;

            my ( $i1, $i2 ) = split /<>/, $line;

            #  both sides of the <> separator have to be present
            if (   $line !~ /<>/
                or !defined $i1 or $i1 eq ""
                or !defined $i2 or $i2 eq "" )
            {
                print STDERR "There is an error in the input file ($infile)\n";
                print STDERR "on line $linecounter. The input is not in the\n";
                print STDERR "correct format. Here is the input line:\n";
                print STDERR "$line\n\n";
                exit;
            }

            $input_hash{$i1}{$i2}++;
        }

        close $fh;
    }
    #  otherwise get them from the command line
    else {

        print STDERR "Command Line terms/cuis defined\n" if $debug;

        my $i1 = shift @ARGV;
        my $i2 = shift @ARGV;

        print STDERR "INPUT: $i1 $i2\n" if $debug;

        $input_hash{$i1}{$i2}++;
    }

    return;
}

#  load the appropriate measure
#
#  $measure has been validated by setOptions to be exactly one of
#    lch  - Leacock and Chodorow (1998)
#    wup  - Wu and Palmer (1994)
#    path - the simple edge counting measure
#  and the matching UMLS::Similarity::<measure> module is loaded at the
#  top of the file.  Dies if the measure object cannot be created.
sub loadMeasures {

    my $class = "UMLS::Similarity::$measure";

    $similarity = $class->new($umls);
    die "Unable to create measure object.\n" if !$similarity;

    my ( $errCode, $errString ) = $similarity->getError();
    die "$errString\n" if $errCode;

    $similarity->{'trace'} = 1;

    return;
}

#  load the UMLS
#
#  The mysql connection parameters are passed only when --username was
#  given, the "config" parameter only when --config was given, and the
#  constructor is called without arguments when neither was given.
#  Dies if the UMLS::Interface object cannot be created.
sub loadUMLS {

    my %params = ();

    if ( defined $opt_username ) {
        %params = (
            "driver"   => "mysql",
            "database" => "$database",
            "username" => "$opt_username",
            "password" => ( defined $opt_password ? "$opt_password" : "" ),
            "hostname" => "$hostname",
            "socket"   => "$socket",
        );
    }

    if ( defined $opt_config ) {
        $params{"config"} = "$opt_config";
    }

    $umls = %params
        ? UMLS::Interface->new( \%params )
        : UMLS::Interface->new();
    die "Unable to create UMLS::Interface object.\n" if !$umls;

    my ( $errCode, $errString ) = $umls->getError();
    die "$errString\n" if $errCode;

    errorCheck($umls);

    return;
}

#  set user input and default options
#
#  Copies the command line options into the file-scoped settings, fills
#  in the defaults for everything that was not given, and reports both
#  groups of settings on STDERR.  Exits with the usage notes when the
#  requested measure is not one of lch, wup or path.
sub setOptions {

    print STDERR "In setOptions\n" if $debug;

    my $default = "";
    my $set     = "";

    #  set file
    if ( defined $opt_infile ) {
        $infile = $opt_infile;
        $set .= " --infile $opt_infile\n";
    }

    if ( defined $opt_config ) {
        $set .= " --config $opt_config\n";
    }

    #  set precision
    if ( defined $opt_precision and $opt_precision =~ /\A\d+\z/ ) {
        $precision = $opt_precision;
        $set .= " --precision $precision\n";
    }
    else {
        if ( defined $opt_precision ) {
            print STDERR "Value for switch --precision should be integer >= 0. Using 4.\n";
        }
        $precision = 4;
        $default .= " --precision $precision\n";
    }

    #  create the floating point conversion format as required by sprintf!
    $floatformat = join '', '%', '.', $precision, 'f';

    #  set the "no score" value (-1) with the appropriate precision
    $noscore = sprintf $floatformat, -1;

    #  set database options
    if ( defined $opt_username ) {

        $set .= " --username $opt_username\n";

        #  never echo the password itself
        if ( defined $opt_password ) {
            $set .= " --password XXXXXXX\n";
        }

        if ( defined $opt_database ) {
            $database = $opt_database;
            $set .= " --database $database\n";
        }
        else {
            $database = "umls";
            $default .= " --database $database\n";
        }

        if ( defined $opt_hostname ) {
            $hostname = $opt_hostname;
            $set .= " --hostname $hostname\n";
        }
        else {
            $hostname = "localhost";
            $default .= " --hostname $hostname\n";
        }

        if ( defined $opt_socket ) {
            $socket = $opt_socket;
            $set .= " --socket $socket\n";
        }
        else {
            #  no newline inside the path: it is handed to the database
            #  driver as is
            $socket = "/tmp/mysql.sock";
            $default .= " --socket $socket\n";
        }
    }

    #  set the semantic similarity measure to be used
    if ( defined $opt_measure ) {
        $measure = $opt_measure;
        $set .= " --measure $measure\n";
    }
    else {
        $measure = "path";
        $default .= " --measure $measure\n";
    }

    #  the name has to match exactly: it selects the module that
    #  loadMeasures instantiates
    if ( $measure !~ /\A(?:path|wup|lch)\z/ ) {
        print STDERR "The measure ($measure) is not defined for\n";
        print STDERR "the UMLS-Similarity package at this time.\n\n";
        minimalUsageNotes();
        exit;
    }

    #  check settings
    if ( $default eq "" ) {
        $default = " No default settings\n";
    }
    if ( $set eq "" ) {
        $set = " No user defined settings\n";
    }

    #  print options
    print STDERR "Default Settings:\n";
    print STDERR "$default\n";
    print STDERR "User Settings:\n";
    print STDERR "$set\n";

    return;
}

#  Reports the error state of $obj (any object that provides getError,
#  returning an error code and an error string): the string is printed
#  on STDERR for any true error code, and the program exits when the
#  code is greater than 1.
sub errorCheck {
    my $obj = shift;

    my ( $errCode, $errString ) = $obj->getError();
    return if !$errCode;

    $errString = "" if !defined $errString;
    print STDERR "$errString\n";

    exit if $errCode > 1;

    return;
}

##############################################################################
#  function to output minimal usage notes
##############################################################################
sub minimalUsageNotes {

    print "Usage: umls-similarity.pl [OPTIONS] [TERM1 TERM2] [CUI1 CUI2]\n";
    askHelp();
    exit;
}

##############################################################################
#  function to output help messages for this program
##############################################################################
sub showHelp {

    print "This is a utility that takes as input either two terms \n";
    print "or two CUIs from the command line or a file and returns \n";
    print "the similarity between the two using either Leacock and \n";
    print "Chodorow, 1998 (lch), Wu and Palmer, 1994 (wup) or the \n";
    print "basic path measure (path)\n\n";

    print "Usage: umls-similarity.pl [OPTIONS] TERM1 TERM2\n\n";

    print "Options:\n\n";

    print "--username STRING Username required to access mysql\n\n";

    print "--password STRING Password required to access mysql\n\n";

    print "--hostname STRING Hostname for mysql (DEFAULT: localhost)\n\n";

    print "--database STRING Database contain UMLS (DEFAULT: umls)\n\n";

    print "--infile FILE File containing TERM or CUI pairs\n\n";

    print "--measure MEASURE The measure to use to calculate the\n";
    print " semantic similarity. (DEFAULT: path)\n\n";

    print "--precision N Displays values upto N places of decimal.\n\n";

    print "--version Prints the version number\n\n";

    print "--help Prints this help message.\n\n";

    return;
}

##############################################################################
#  function to output the version number
##############################################################################
sub showVersion {

    print '$Id: umls-similarity.pl,v 1.20 2009/02/09 18:36:54 btmcinnes Exp $';
    print "\nCopyright (c) 2008, Ted Pedersen & Bridget McInnes\n";

    return;
}

##############################################################################
#  function to output "ask for help" message when user's goofed
##############################################################################
sub askHelp {

    print STDERR "Type umls-similarity.pl --help for help.\n";

    return;
}