#!/usr/bin/perl

=head1 NAME

umls-association-runDataSet.pl - This program calculates the association for a
dataset of term pairs

=head1 SYNOPSIS

This utility takes a file of line separated term pairs as input. The file is
of the form: "cui1<>cui2\n" with each line containing a new cui pair. It
outputs a line separated list of association scores and term pairs of the
form: "score<>cui1<>cui2". Each line contains a different cui pair and its
score.

=head1 USAGE

Usage: umls-association-runDataSet.pl [OPTIONS] CUI_LIST_FILE OUTPUT_FILE --measure Assoc_Measure --matrix Matrix_FileName

=head1 INPUT

=head2 CUI_LIST_FILE

the input file containing line separated cui pairs of the form: "cui1<>cui2"

=head2 OUTPUT_FILE

the output file, where each score and cui pair are output of the form: 
score<>cui1<>cui2

[Matrix_File]                                                                                                       

File name containing co-occurrence data in sparse matrix format
 
[Assoc_Measure]

A string specifying the association measure to use
The measure used to calculate the association. Recommended = x2

The package uses the Text::NSP package to do the calculation.
The measures included within this package are:
    1.  Dice Coefficient 
    2.  Fishers exact test - left sided
    3.  Fishers exact test - right sided
    4.  Fishers exact test - two tailed
    5.  Jaccard Coefficient
    6.  Log-likelihood ratio
    7.  Mutual Information
    8.  Odds Ratio
    9.  Pointwise Mutual Information
    10. Phi Coefficient
    11. Pearson's Chi Squared Test
    12. Poisson Stirling Measure
    13. T-score  

=head1 OPTIONS

Optional command line arguments. These options are identical to
umls-association.pl. Please see umls-association.pl for descriptions.
 
=head1 OUTPUT

The association score of each concept pair in the input file, with each pair
written to a new line of the output file.

=head1 SYSTEM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=item * Text::NSP - http://search.cpan.org/dist/Text-NSP

=back

=head1 CONTACT US
   
  If you have any trouble installing and using UMLS-Association,
  please contact us via the users mailing list :
    
      umls-association@yahoogroups.com
     
  You can join this group by going to:
    
      http://tech.groups.yahoo.com/group/umls-assocation/
     
  You may also contact us directly if you prefer :
    
      Sam Henry: henryst at vcu.edu 

=head1 AUTHOR

 Sam Henry, Virginia Commonwealth University
 Bridget T. McInnes, Virginia Commonwealth University 
 Alexander D. McQuilkin, Virginia Commonwealth University

=head1 COPYRIGHT

Copyright (c) 2015

 Bridget T. McInnes, Virginia Commonwealth University 
 btmcinnes at vcu.edu

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to:

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

use UMLS::Association;
use Getopt::Long;

my $DEFAULT_MEASURE = "tscore";

#############################################
#  Get Options and params
#############################################
# Getopt::Long is used in its legacy mode: no destination is supplied, so
# each recognized option is stored in the package variable $opt_<name>,
# which the rest of this script reads.
# GetOptions reports bad options itself and returns false, so its result is
# tested directly; there is no need to string-eval the return value.
# NOTE(review): later code forwards $opt_debug and $opt_verbose, but "debug"
# and "verbose" are not registered here, so those variables can never be set
# from the command line -- confirm whether they should be accepted.
GetOptions( "version", "help", "measure=s", "noorder", "lta", "mwa", "sbc", "lsa", "wsa", "matrix=s", "precision=s", "nonorm" )
    or die ("Please check the above mentioned option(s).\n");

# the two required positional arguments (whatever GetOptions left in @ARGV)
my $cuisFileName = shift;
my $outputFileName = shift;

#############################################
#  Check help, version, minimal usage notes
#############################################
#  --help and --version take precedence over the required-argument checks
if( defined $opt_help ) {
    showHelp();
    exit;
}

if( defined $opt_version ) {
    showVersion();
    exit;
}

# both an input file and an output file must be passed in
if( !defined $cuisFileName ) {
    print STDERR "No CUI Pair Input File provided\n";
    minimalUsageNotes();
    exit;
}
if( !defined $outputFileName ) {
    print STDERR "No Output File provided\n";
    minimalUsageNotes();
    exit;
}

#############################################
#  Set Up UMLS::Association
#############################################
#  Build the option hash handed to UMLS::Association->new.
my %assoc_option_hash = ();

#  Fall back to $DEFAULT_MEASURE when --measure was not given. This same
#  hash entry is passed later as the measure for the score calculation, so
#  without the fallback an undefined measure would be passed along.
#  NOTE(review): assumes the default is a measure name the library accepts.
$assoc_option_hash{"measure"} = defined $opt_measure ? $opt_measure : $DEFAULT_MEASURE;

#  Every other option is forwarded only when it was set on the command line,
#  under the same key as its option name.
my @forwardedOptions = (
    [ "debug",     $opt_debug     ],
    [ "verbose",   $opt_verbose   ],
    [ "precision", $opt_precision ],
    [ "lta",       $opt_lta       ],
    [ "mwa",       $opt_mwa       ],
    [ "sbc",       $opt_sbc       ],
    [ "lsa",       $opt_lsa       ],
    [ "wsa",       $opt_wsa       ],
    [ "noorder",   $opt_noorder   ],
    [ "matrix",    $opt_matrix    ],
    [ "nonorm",    $opt_nonorm    ],
);
foreach my $forwarded (@forwardedOptions) {
    my ($optionName, $optionValue) = @{$forwarded};
    $assoc_option_hash{$optionName} = $optionValue if defined $optionValue;
}


#  instantiate instance of UMLS-Association
my $association = UMLS::Association->new(\%assoc_option_hash); 
die "Unable to create UMLS::Association object.\n" if(!$association);

#############################################
#  Calculate Association
#############################################

# Read in all the first and second cui sets. Each input line holds two
# comma separated sets separated by <> (E.G. c1,c2<>c3,c4,c5).
# Three-argument open with a lexical handle is used so that characters such
# as ">" or "|" in the file name cannot be interpreted as an open mode.
open(my $cuiListHandle, '<', $cuisFileName)
    or die ("Error: unable to open cui list file: $cuisFileName: $!");
my @sets1 = ();
my @sets2 = ();
# read one line at a time instead of loading the whole file into a list
while (defined(my $line = <$cuiListHandle>)) {
    chomp $line;
    my ($cuiSet1String, $cuiSet2String) = split('<>', $line);

    # A side that is missing (blank line, or no "<>" on the line) becomes an
    # empty set, so entry i of each list always corresponds to input line i.
    my @cuiSet1 = defined $cuiSet1String ? split(/,/, $cuiSet1String) : ();
    my @cuiSet2 = defined $cuiSet2String ? split(/,/, $cuiSet2String) : ();

    #add to the cui sets
    push @sets1, \@cuiSet1;
    push @sets2, \@cuiSet2;
}
close $cuiListHandle;

# Calculate association scores for each set pair. The result is used below
# as an array reference whose entry i is written next to set pair i.
my $scoresRef = $association->calculateAssociation_setPairList(\@sets1, \@sets2, $assoc_option_hash{"measure"});

# Output one "score<>set1<>set2" line per returned score.
open(my $outputHandle, '>', $outputFileName)
    or die ("Error: Unable to open output file: $outputFileName: $!");
for my $i (0 .. $#{$scoresRef}) {
    print {$outputHandle} join('<>',
        ${$scoresRef}[$i],
        join(',', @{$sets1[$i]}),
        join(',', @{$sets2[$i]})), "\n";
}
# buffered write errors only surface when the handle is closed
close($outputHandle)
    or die ("Error: Unable to finish writing output file: $outputFileName: $!");



###########################
# Help Functions
###########################
# Writes the two-line usage summary to standard output and then ends the
# script; control never returns to the caller.
sub minimalUsageNotes {
    print "Usage: umls-association-runDataSet.pl [OPTIONS] CUI_LIST_FILE OUTPUT_FILE\n",
          "Type umls-association-runDataSet.pl --help for help.\n";
    exit;
}

# Prints the help text to standard output: a description of the input and
# output formats, the usage line, and a pointer to umls-association.pl for
# the optional arguments. Takes no arguments and does not exit.
sub showHelp {
    # Single-quoted strings are used so that the \n inside the input-format
    # example is printed literally (backslash, n) rather than as a line
    # break in the middle of the sentence. Each entry is one output line;
    # empty entries produce the blank lines.
    my @helpLines = (
        'This utility takes a file of line seperated term pairs as input.',
        ' The file is of the form: "cui1<>cui2\n" with each line containing',
        ' a new cui pair. It outputs a line seperated list of association',
        'score and term pair of the form: "score<>cui1<>cui2". Each line ',
        'contains a different cui pair and their score',
        '',
        'Usage: umls-association-runDataSet.pl [OPTIONS] CUI_LIST_FILE OUTPUT_FILE',
        '  --measure Assoc_Measure --matrix Matrix_File',
        '',
        'Please note, the optional parameters are identical to umls-association.pl.',
        'to avoid inconsitencies when adding new features or updating, please see:',
        'umls-association --help',
        'for a complete list of optional arguments',
        '',
    );
    foreach my $helpLine (@helpLines) {
        print $helpLine, "\n";
    }
    return;
}

# Prints the version of the loaded UMLS::Association module to standard
# output and then ends the script.
sub showVersion {
    # The module this script loads is UMLS::Association; a bare
    # "Association" package is never loaded here, so calling a method on it
    # would fail. VERSION is the universal class method that reports a
    # package's $VERSION.
    print "current version is ".(UMLS::Association->VERSION())."\n";
    exit;
}