#!/bin/csh

# This script sets up data in the "lexical sample" format that is
# commonly used in Senseval exercises. That means that there is a 
# directory created (called LexSample) that contains subdirectories,
# one for each word we wish to sense discriminate. 

# Originally written by Amruta Purandare, 2002-2004 
# Modified by Ted Pedersen, July 2006

# Stop right away when something named LexSample is already present in
# the current directory; the user has to remove it before a new run.
if ( -e LexSample ) then
	echo "LexSample already exists."
	echo "Remove directory LexSample before running makedata.sh"
	exit 1
endif

# The input data must be in Senseval2 format, and if there are multiple
# lexelts in the input file, then those will be split up so that there
# is one directory per lexelt in LexSample

# the training and test files must be in the Data directory

# Input corpus: the data directory, the names of the training and test
# files found inside it, and the path of the answer key.
set DATADIR = Data
set TRAINING = eng-lex-sample.training.xml
set TEST = eng-lex-sample.evaluation.xml
set KEY = ${DATADIR}/eng-lex-sample.key

# Regex files handed to setup.pl through --token and --nontoken.
set REGEXDIR = Regexs
set TOKEN = ${REGEXDIR}/token.regex
set NONTOKEN = ${REGEXDIR}/nontoken.regex

# we move the train/test data from its home in DATADIR up one level  
# because of setup.pl requirements and structure

# Enter the data directory and decompress whatever is still gzipped.
cd $DATADIR

# csh reports "No match" and abandons the script when a glob matches
# nothing, so gunzip is only called if some .gz file is really there.
# This lets the script be rerun after an earlier run stopped before
# the data was compressed again.
set ngz = `ls | grep -c '\.gz$'`
if ($ngz > 0) then
	gunzip *.gz
endif

# Copy the training and test files up one level, next to this script's
# working directory, then go back there.
cp $TRAINING ..
cp $TEST ..

cd ..

# preprocessing - this will split the training and test data up into 
# separate files based on the lexelt, and put them in their own directory

setup.pl --verbose --showargs --training $TRAINING --key $KEY --token $TOKEN --nontoken $NONTOKEN $TEST

# remember how setup.pl finished before any other command resets $status
set setup_status = $status

	# the copies of the training and test files made earlier are no
	# longer needed, whether or not setup.pl succeeded
	rm -fr $TRAINING $TEST

	# do not go on to prune and filter LexSample if setup.pl failed
	if ($setup_status != 0) then
		echo "setup.pl failed with exit status $setup_status"
		exit 1
	endif

	cd LexSample

	# remove the regex files from LexSample so that only the per-word
	# entries remain (presumably setup.pl places them here - confirm)
	rm -fr token.regex nontoken.regex

	# The demo is run only on the words that were experimented with
	# for Amruta's thesis experiments; every other word listed here
	# is removed from LexSample.

	rm -fr \
		bum.n call.v carry.v chair.n colourless.a \
		detention.n develop.v draw.v dress.v drift.v \
		drive.v dyke.n face.v faithful.a fatigue.n \
		feeling.n ferret.v find.v fit.a graceful.a \
		green.a hearth.n holiday.n keep.v lady.n \
		leave.v local.a match.v nation.n nature.n \
		oblique.a play.v pull.v replace.v restraint.n \
		see.v sense.n serve.v solemn.a spade.n \
		stress.n strike.v treat.v turn.v use.v \
		vital.a wander.v wash.v work.v yew.n

	# Every entry that is still listed in LexSample is visited in turn
	# and treated as the directory of one word (lexelt).
	foreach word (`ls`)

		cd $word

		# Only the test XML and training.count are needed from
		# here on, so the training XML and the test counts are
		# deleted.
		rm -fr ${word}-training.xml ${word}-test.count

		# Senseval-2 data has a large number of senses, and some
		# instances carry multiple correct answers. SenseClusters
		# assumes exactly one correct answer per instance, so the
		# test data is filtered until that holds.

		# Step 1: build the table giving the number of instances
		# that occur with each sense; filter.pl needs it.
		frequency.pl ${word}-test.xml > frequency

		# Step 2: filter the test data.
		#   --nomulti    keep only the most frequent sense for
		#                instances that have multiple answers
		#   --percent 5  drop all instances of senses that occur
		#                5 percent of the time or less
		# filter.pl could also keep the top N senses of each word
		# (--rank N), but that is not done here.
		filter.pl --nomulti --percent 5 ${word}-test.xml frequency > ${word}-test.xml.fil

		# Step 3: the filtered file takes the place of the original
		# test file, and the frequency table is thrown away.
		mv ${word}-test.xml.fil ${word}-test.xml
		rm -fr frequency

		cd ..
	end

# leave LexSample and return to the starting directory
cd ..

# Compress the data files again now that processing is finished. The
# directory comes from $DATADIR, the same variable used to locate the
# data earlier, instead of repeating the directory name here.
gzip $DATADIR/eng*