# UMLS::Similarity::vector.pm # # Module implementing the vector semantic relatedness measure # based on the measure proposed by Patwardhan (2003) # # Copyright (c) 2009-2010, # # Bridget T McInnes, University of Minnesota, Twin Cities # bthomson at umn.edu # # Siddharth Patwardhan, University of Utah, Salt Lake City # sidd at cs.utah.edu # # Serguei Pakhomov, University of Minnesota, Twin Cities # pakh002 at umn.edu # # Ted Pedersen, University of Minnesota, Duluth # tpederse at d.umn.edu # # Ying Liu, University of Minnesota # liux0935 at umn.edu # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to # # The Free Software Foundation, Inc., # 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. package UMLS::Similarity::vector; use warnings; use UMLS::Similarity; use vars qw($VERSION); $VERSION = '0.01'; my $debug = 0; my $defraw_option= 0; my $vectormatrix = ""; my $vectorindex = ""; my $debugfile = ""; my $dictfile = ""; my %index = (); my %reverse_index = (); my %position = (); my %length = (); my %dictionary = (); local(*DEBUG); sub new { my $className = shift; return undef if(ref $className); if($debug) { print STDERR "In UMLS::Similarity::vector->new()\n"; } my $interface = shift; my $params = shift; $params = {} if(!defined $params); $vectorindex = $params->{'vectorindex'}; $vectormatrix = $params->{'vectormatrix'}; $config = $params->{'config'}; $dictfile = $params->{'dictfile'}; $debugfile = $params->{'debugfile'}; my $defraw = $params->{'defraw'}; if(defined $defraw) { $defraw_option = 1; } if (defined $dictfile) { open(DICT, "<$dictfile") or die("Error: cannot open file '$dictfile' for output index.\n"); while(<DICT>) { chomp; if($_=~/^\s*$/) { next; } my @defs = split/ /; my $concept = shift @defs; my $definition = join (" ", @defs); $dictionary{$concept} = $definition; } close DICT; } open(INDX, "<$vectorindex") or die("Error: cannot open file '$vectorindex' for output index.\n"); while (my $line = <INDX>) { chomp($line); my @terms = split(' ', $line); $index{$terms[0]} = $terms[1]; $reverse_index{$terms[1]} = $terms[0]; $position{$terms[1]} = $terms[2]; $length{$terms[1]} = $terms[3]; } close INDX; if(defined $debugfile) { if(-e $debugfile) { print "Debug file $debugfile already exists! Overwrite (Y/N)? "; my $reply = <STDIN>; chomp $reply; $reply = uc $reply; exit 0 if ($reply ne "Y"); } open(DEBUG, ">$debugfile") || die "Could not open debug file: $debugfile\n"; } my $self = {}; # Initialize the error string and the error level. $self->{'errorString'} = ""; $self->{'error'} = 0; # Bless the object. bless($self, $className); # The backend interface object. $self->{'interface'} = $interface; if(!$interface) { $self->{'errorString'} .= "\nError (UMLS::Similarity::vector->new()) - "; $self->{'errorString'} .= "An interface object is required."; $self->{'error'} = 2; } # The backend interface object. $self->{'interface'} = $interface; return $self; } sub getRelatedness { my $self = shift; return undef if(!defined $self || !ref $self); my $concept1 = shift; my $concept2 = shift; my $interface = $self->{'interface'}; my $d1 = ""; my $d2 = ""; if (defined $dictfile) { $d1 = $dictionary{$concept1}; $d2 = $dictionary{$concept2}; if(defined $debugfile) { print DEBUG "DEFINITIONS FOR CUI 1: \n"; print DEBUG "1. $d1\n"; print DEBUG "DEFINITIONS FOR CUI 2: \n"; print DEBUG "1. $d2\n"; } } else { my $defs1 = $interface->getExtendedDefinition($concept1); my $defs2 = $interface->getExtendedDefinition($concept2); $d1 = ""; if(defined $debugfile) { print DEBUG "DEFINITIONS FOR CUI 1: \n"; } my $i = 1; foreach my $def (@{$defs1}) { if(defined $debugfile) { print DEBUG "$i. $def\n"; $i++; } $def=~/(C[0-9]+) ([A-Za-z]+) (C[0-9]+) ([A-Z]+) \s*\:\s*(.*?)$/; $d1 .= $5 . " "; } $d2 = ""; if(defined $debugfile) { print DEBUG "DEFINITIONS FOR CUI 2: \n"; } my $j = 1; foreach my $def (@{$defs2}) { if(defined $debugfile) { print DEBUG "$j. $def\n"; $j++; } $def=~/(C[0-9]+) ([A-Za-z]+) (C[0-9]+) ([A-Z]+) \s*\:\s*(.*?)$/; $d2 .= $5 . " "; } } # if the --defraw option is not set clean up the defintions if($defraw_option == 0) { $d1 = lc($d1); $d2 = lc($d2); $d1=~s/[\.\,\?\/\'\"\;\:\[\]\{\}\!\@\#\$\%\^\&\*\(\)\-\_\+\-\=]//g; $d2=~s/[\.\,\?\/\'\"\;\:\[\]\{\}\!\@\#\$\%\^\&\*\(\)\-\_\+\-\=]//g; } open(MATX, "<$vectormatrix") or die("Error: cannot open file '$vectormatrix' for output index.\n"); my %vector1 = (); my %vector2 = (); my @defs1 = split(" ", $d1); my @defs2 = split(" ", $d2); my $def1_length = 0 ; foreach my $def_term1 (@defs1) { if (defined $index{$def_term1}) { my $index_term = $index{$def_term1}; my $p = $position{$index_term}; my $l = $length{$index_term}; if (($p==0) and (!defined $l)) { next; } else { $def1_length++; my ($data, $n); seek MATX, $p, 0; if (($n = read MATX, $data, $l) != 0) { if (defined $debugfile) { print DEBUG "$def_term1: "; } chomp($data); my @word_vector = split (' ', $data); my $index = shift @word_vector; $index =~ m/^(\d+)\:$/; if ($index_term == $1) { for (my $z=0; $z<@word_vector; ) { $vector1{$word_vector[$z]} += $word_vector[$z+1]; $z += 2; if (defined $debugfile) { if(defined $word_vector[$z]) { print DEBUG "$reverse_index{$word_vector[$z]} "; } } } if (defined $debugfile) { print DEBUG "\n"; } } else { print STDERR "$def_term1 is not a correct word!\n"; exit; } } } } } if (defined $debugfile) { print DEBUG "def1 length: $def1_length\n"; } if (defined $debugfile) { print DEBUG "def2: $d2\n"; } my $def2_length = 0 ; foreach my $def_term2 (@defs2) { if (defined $index{$def_term2}) { my $index_term = $index{$def_term2}; my $p = $position{$index_term}; my $l = $length{$index_term}; if (($p==0) and (!defined $l)) { next; } else { $def2_length++; my ($data, $n); seek MATX, $p, 0; if (($n = read MATX, $data, $l) != 0) { if (defined $debugfile) { print DEBUG "$def_term2: "; } chomp($data); my @word_vector = split (' ', $data); my $index = shift @word_vector; $index =~ m/^(\d+)\:$/; if ($index_term == $1) { for (my $z=0; $z<@word_vector; ) { $vector2{$word_vector[$z]} += $word_vector[$z+1]; $z += 2; if (defined $debugfile) { if(defined $wordvector[$z]) { print DEBUG "$reverse_index{$word_vector[$z]} "; } } } if (defined $debugfile) { print DEBUG "\n"; } } else { print STDERR "$def_term2 is not a correct word!\n"; exit; } } } } } if (defined $debugfile) { print DEBUG "def2_length: $def2_length\n"; } # normalize my $vec1 = &norm(\%vector1); my $vec2 = &norm(\%vector2); # cosine my $score = &_inner($vec1, $vec2); return $score; } # Subroutine to normalize a vector. sub norm { my $vec = shift; my $out = {}; my $lent = 0; my $ind = 0; return {} if(!defined $vec); foreach $ind (keys %{$vec}) { $lent += (($vec->{$ind}) * ($vec->{$ind})); } $lent = sqrt($lent); if($lent) { foreach $ind (keys %{$vec}) { $out->{$ind} = $vec->{$ind}/$lent; } } return $out; } # Subroutine to find the dot-product of two vectors. sub _inner { my $a = shift; my $b = shift; my $ind; my $dotProduct = 0; return 0 if(!defined $a || !defined $b); foreach $ind (keys %{$a}) { $dotProduct += $a->{$ind} * $b->{$ind} if(defined $a->{$ind} && defined $b->{$ind}); } return $dotProduct; } # Method to return recent error/warning condition sub getError { my $self = shift; return (2, "") if(!defined $self || !ref $self); if($debug) { print STDERR "In UMLS::Similarity::vector->getError()\n"; } my $dontClear = shift; my $error = $self->{'error'}; my $errorString = $self->{'errorString'}; if(!(defined $dontClear && $dontClear)) { $self->{'error'} = 0; $self->{'errorString'} = ""; } $errorString =~ s/^\n//; return ($error, $errorString); } 1; __END__ =head1 NAME UMLS::Similarity::vector - Perl module for computing semantic relatedness of concepts in the Unified Medical Language System (UMLS) using the method described by Resnik 1995. =head1 SYNOPSIS use UMLS::Interface; use UMLS::Similarity::vector; my $vectormatrix = "samples/vectormatrix"; my $vectorindex = "samples/vectorindex"; my $umls = UMLS::Interface->new(); die "Unable to create UMLS::Interface object.\n" if(!$umls); ($errCode, $errString) = $umls->getError(); die "$errString\n" if($errCode); $vectoroptions{"vectormatrix"} = $vectormatrix; $vectoroptions{"vectorindex"} = $vectorindex; my $vector = UMLS::Similarity::vector->new($umls, \%vectoroptions); die "Unable to create measure object.\n" if(!$vector); my $cui1 = "C0005767"; my $cui2 = "C0007634"; @ts1 = $umls->getTermList($cui1); my $term1 = pop @ts1; @ts2 = $umls->getTermList($cui2); my $term2 = pop @ts2; my $value = $vector->getRelatedness($cui1, $cui2); print "The similarity between $cui1 ($term1) and $cui2 ($term2) is $value\n"; =head1 DESCRIPTION This module computes the semantic relatedness of two concepts in the UMLS according to a method described by Resnik (1995). The relatedness measure proposed by Resnik is the information content of the least common subsumer of the two concepts. =head1 USAGE The semantic relatedness modules in this distribution are built as classes that expose the following methods: new() getRelatedness() getError() =head1 TYPICAL USAGE EXAMPLES To create an object of the vector measure, we would have the following lines of code in the perl program. use UMLS::Similarity::vector; $measure = UMLS::Similarity::vector->new($interface); The reference of the initialized object is stored in the scalar variable '$measure'. '$interface' contains an interface object that should have been created earlier in the program (UMLS-Interface). If the 'new' method is unable to create the object, '$measure' would be undefined. This, as well as any other error/warning may be tested. die "Unable to create object.\n" if(!defined $measure); ($err, $errString) = $measure->getError(); die $errString."\n" if($err); To find the semantic relatedness of the concept 'blood' (C0005767) and the concept 'cell' (C0007634) using the measure, we would write the following piece of code: $relatedness = $measure->getRelatedness('C0005767', 'C0007634'); =head1 SEE ALSO perl(1), UMLS::Interface perl(1), UMLS::Similarity(3) =head1 CONTACT US If you have any trouble installing and using UMLS-Similarity, please contact us via the users mailing list : umls-similarity@yahoogroups.com You can join this group by going to: http://tech.groups.yahoo.com/group/umls-similarity/ You may also contact us directly if you prefer : Bridget T. McInnes: bthomson at cs.umn.edu Ted Pedersen : tpederse at d.umn.edu =head1 AUTHORS Bridget T McInnes <bthomson at cs.umn.edu> Siddharth Patwardhan <sidd at cs.utah.edu> Serguei Pakhomov <pakh0002 at umn.edu> Ted Pedersen <tpederse at d.umn.edu> Ying Liu <liux0935 at umn.edu> =head1 COPYRIGHT AND LICENSE Copyright 2004-2010 by Bridget T McInnes, Siddharth Patwardhan, Serguei Pakhomov, Ying Liu and Ted Pedersen This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut