#!/usr/bin/perl -w

use strict;
use POSIX;
use HTML::Entities;
use Alvis::URLs;
use Getopt::Long;
use Pod::Usage;


#  encoding pragmas must come after the module includes above
use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";

#  ensure sort(1) uses UTF-8 collation order
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;

my $MINCOUNT = 1;
my $linktext = 0;
my $titletext = 0;
my $stopfile = "";
my $fixdocs = 0;    # if set, only update .docs (and maxdoc in .srcpar)
my %stops = ();


#  check options

GetOptions(
      'man'        => sub {pod2usage(-exitstatus => 0, -verbose => 2)},
      'stopfile=s' => \$stopfile,
      'mincount=i' => \$MINCOUNT,
      'docs'       => \$fixdocs,
      'linktext'   => \$linktext,
      'titletext'  => \$titletext,
      'noclean'    => \$Alvis::URLs::noclean,
      'nocase'     => \$Alvis::URLs::nocase,
      'h|help'     => sub {pod2usage(1)}
);

pod2usage(-message => "ERROR: need input file and stem")
      if ( $#ARGV != 1 );

my $file = shift();
my $stem = shift();

my $doccount = 0;
my $featcount = 0;
#  maps a cleaned URL's hash to its docID(s), space separated
my %docmap = ();
#  maps a docID to a sequence number
my %docid = ();
#  maps a token's hash to its "type text" string, and its count
my %token = ();
my %tokencnt = ();

if ( $stopfile ) {
  #  stopfile has one word per line; matching is case-insensitive
  open(S,"<$stopfile") or die "Cannot open stopfile $stopfile: $!";
  while ( ($_=<S>) ) {
    chomp();
    $stops{lc($_)} = 1;
  }
  close(S);
}

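#  enter the individual words of a free-text field into the "text"
#  table: e.g., tabletext("Big, Bad Wolf!") enters "big", "bad" and
#  "wolf", minus any stopwords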
sub tabletext {
  my $tw = $_[0];
  #  strip punctuation
  $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  break at spaces
  $tw =~ s/\s+/ /g; 
  $tw =~ s/^\s//; 
  $tw =~ s/\s$//; 
  foreach my $k ( split(/ /,$tw) ) {
    #  lower case by default
    $k = lc($k);
    if ( ! defined($stops{$k}) ) {
      &table("text",$k);
    }
  }
}

#  make "link" entries dominate:  they must never be dropped in
#  favor of non-link entries when hash codes clash
sub table {
  my $tp = $_[0];
  my $text = $_[1];
  my $code = "$tp $text";
  # print STDERR "Table $code\n";
  my $h = &Alvis::URLs::easyhash64char($code);
  if ( defined($token{$h}) ) {
    if ( $token{$h} ne $code ) {
      if ( defined($docmap{$h}) ) {
	#  documents always override
	if ( $tp eq "link" ) {
	  print STDERR "Dropping token '$token{$h}' with hash $h due to clash\n";
	  $token{$h} = $code;
	} else {
	  print STDERR "Dropping token '$code' with hash $h due to clash\n";
	}
      } else {
	print STDERR "Dropping token '$code' with hash $h due to clash\n";
      }
    } else {
      $tokencnt{$h}++;
    }
  } else {
    if ( $tp eq "link" || ! defined($docmap{$h}) ) {
      $token{$h} = $code;
      $tokencnt{$h}++;
    }
  }
}

if ( $fixdocs ) {
  #  we just update the docs file
  #
  my $line = 0;
  if ( -f "$stem.docs" ) {
    #  read last line to get last document number
    open(ND,"tail -1 $stem.docs |") or die "Cannot read $stem.docs: $!\n";
    $_ = <ND>;
    close(ND);
    if ( /^([0-9]+) / ) {
      $line = int($1) + 1;
    } else {
      print STDERR "Cannot read document index from $stem.docs\n";
      exit(1);
    }
    #  now start from here, update .docs
    open(DOCS,">>$stem.docs");
    open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
    while (($_=<I>) ) {
      chomp();
      if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
	my $inu = &Alvis::URLs::StandardURL($1);
	my $id = uc($2);
	my $titles = $3;
	my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
	print DOCS "$line $inu $id $hash $titles\n";
	$line ++;
	for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
	      $_=<I>,chomp() ) {
	  #  skip to end of record
	}
      }
    }
    close(I);
    close(DOCS);
    #  update .srcpar
    open(SRCPAR,"<$stem.srcpar");
    my $sp = "";
    while ( ($_=<SRCPAR>) ) {
      $sp .= $_;
    }
    close(SRCPAR);
    $sp =~ s/\nmaxdoc=.*/\nmaxdoc=$line/;
    open(SRCPAR,">$stem.srcpar");
    print SRCPAR $sp;
    close(SRCPAR);
    exit(0);
  } else {
    print STDERR "Cannot open $stem.docs\n";
    exit(1);
  }
}

#  one pass fills tables
open(DOCS,">$stem.docs");
open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
my $line = 0;
while (($_=<I>) ) {
  chomp();
  if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
    my $inu = &Alvis::URLs::StandardURL($1);
    my $id = uc($2);
    my $titles = $3;
    my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
    # print STDERR "DOCS > $line $hash $inu $id $titles\n";
    print DOCS "$line $inu $id $hash $titles\n";
    #   notice we overwrite any previous docID
    $docid{$id} = $line;
    if ( defined($docmap{$hash}) ) {
      $docmap{$hash} .= " $id";
    } else {
      $docmap{$hash} = $id;
    }
    $line ++;	  
    if ( $titletext ) {
      &tabletext($titles);
    }
    #   now process links
    for ( $_=<I>,chomp(); $_ && $_ ne "EOD" && $_ ne "EOL";
	  $_=<I>,chomp() ) {
      my $link = $_;
      $link =~ s/ .*//;
      # print STDERR "LINK: $link $_\n";
      $link = &Alvis::URLs::StandardURL($link);
      &table("link",$link);
      if ( $linktext && /^([^ ]+) (.*)$/ ) {
	&tabletext($2);
      }
    }
    if ( $_ eq "EOL" ) {
      #   now process tokens
      for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
	    $_=<I>,chomp() ) {
	if ( /^([^ ]+) (.*)$/ ) {
	  if ( $1 eq "text" ) {
	    &tabletext($2);
	  } else {
	    &table($1,$2);
	  }
	}
      }
    }
  } elsif ( /^D / ) {
    print STDERR "Unmatched document entry: (($_))\n";
  }
}
close(I);
close(DOCS);
print STDERR "Processed $line documents\n";
$doccount = $line;

# we have ensured that any hash that belongs to a document
# is reserved for links
open(TMP,">$stem.tokens.tmp");
foreach my $t ( keys(%token) ) {
  if ( $docmap{$t} ) {
    $token{$t} =~ /^([^ ]+) (.*)/;
    if ( $1 ne "link" ) {
      print STDERR "Dropped token $t '$token{$t}', should be 'link'\n";
    } else {
      print TMP "doc $t $tokencnt{$t} $2\n";
    }
  } elsif ( $tokencnt{$t}>= $MINCOUNT ) {
    $token{$t} =~ /^([^ ]+) (.*)/;
    print TMP "$1 $t $tokencnt{$t} $2\n";
  }
}
close(TMP);

#  discard unused tables
%token = ();
%tokencnt = ();

# now sort by type, docs first, and add line number
# also print doc mappings, i.e., feature to original document

my @typecnt = ();
my @typename = ();
my $types = 0;

open(TMP,"$SORTCODE ( grep '^doc ' $stem.tokens.tmp | sort ) |");
open(TOKENS,">$stem.tokens");
open(TOKENMAP,">$stem.words");
open(DOCMAP,">$stem.docfeats");
$line = 0;
while ( ($_=<TMP>) ) {
  chomp();
  my $tok = $_;
  $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
  print TOKENS "$tok\n";
  print TOKENMAP "$line $_\n";
  $_ =~ /^doc ([^ ]+) /;
  my $h = $1;
  foreach my $id ( split(/ /,$docmap{$h}) ) {
    if ( !defined($docid{$id}) ) {
      print STDERR "Lost doc sequence number for docID $id\n";
    }
    print DOCMAP "$line $docid{$id}\n";
  }
  $line++;
}
if ( $line>0 ) {
  #  keep track of type details
  $typename[0] = "doc";
  $typecnt[0] = $line;
  $types++;
}
close(TMP);
open(TMP,"$SORTCODE ( grep -v '^doc ' $stem.tokens.tmp | sort ) |");
my $type = "";
my $type_start = $line;
while ( ($_=<TMP>) ) {
  chomp();
  my $tok = $_;
  $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
  print TOKENS "$tok\n";
  print TOKENMAP "$line $_\n";
  my $ntype = $_;
  $ntype =~ s/ .*//;
  if ( $ntype ne $type ) {
    if ( $type ) {
      $typename[$types] = $type;
      $typecnt[$types] = $line - $type_start;
      $type_start = $line;
      $types ++;
    }
    $type = $ntype;
  }
  $line++;
}
if ( $type ) {
  #  record the final type; guard against an empty token file
  $typename[$types] = $type;
  $typecnt[$types] = $line - $type_start;
  $types ++;
}
close(TMP);
close(TOKENS);
close(TOKENMAP);
close(DOCMAP);
unlink("$stem.tokens.tmp");
$featcount = $line;

#  now create some dimensions in .srcpar
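#  e.g., a run over 1000 documents with feature types doc,link,text
#  produces a .srcpar resembling (all values illustrative):
#     datastem="mystem"
#     linkstem="mystem"
#     maxdoc=1000
#     maxfeat=5000
#     maxcomp=1
#     dims.n_dims=3
#     dims.tot=5000
#     dims.names=doc,link,text
#     dims.dims=1000,3000,1000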
open(SRCPAR,">$stem.srcpar");
print SRCPAR "datastem=\"$stem\"\n";
print SRCPAR "linkstem=\"$stem\"\n";
print SRCPAR "maxdoc=$doccount\n";
print SRCPAR "maxfeat=$featcount\n";
print SRCPAR "maxcomp=1\n";
print SRCPAR "dims.n_dims=$types\n";
print SRCPAR "dims.tot=$featcount\n";
print SRCPAR "dims.names=" . join(",",@typename) . "\n";
print SRCPAR "dims.dims=" . join(",",@typecnt) . "\n";
close(SRCPAR);

exit 0;

__END__

=head1 NAME
    
linkTables - build token and document tables from an input file of
links and tokens for a document set

=head1 SYNOPSIS
    
linkTables [--docs|--linktext|--nocase|--noclean|--titletext] 
           [--mincount N] [--stopfile FILE] LINK-FILE STEM

Options:

    LINK-FILE           Filename for input link file, usually created by XSL
    STEM                Stem for output files; several extensions read and made
    --docs              only update the .docs file; all else remains fixed
    --linktext          add link text, delimited by spaces, to the text type
    --mincount N        only add tokens occurring at least N times
    --nocase            ignore case of URLs
    --noclean           don't use built-in URL cleaning
    --stopfile F        do not enter these words in the text tables
    --titletext         add title text, delimited by spaces, to the text type
    -h, --help          display help message and exit
    --man               print man page and exit

=head1 DESCRIPTION

Reads an input file of links, link text and redirects in the data
format described below.  Use the file name '-' to read from stdin.
Builds the tables used in bag processing:

   STEM.tokens        N-th line is the token for items with index (N-1).
   STEM.words         A map for the token file: each line gives the
                      index, type, hash code, count and token.
   STEM.docs          N-th line gives the details for the N-th document.
   STEM.docfeats      Mapping of token index to document index.

The token-to-document mapping in .docfeats is implied by standardising
the OUTGOING-URLs of each document and matching them against the
(also standardised) document URLs themselves.
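
For example, a typical run over a link file F<crawl.links> writing
tables with stem F<mystem> (both names illustrative) might be:

   linkTables --linktext --titletext --mincount 2 crawl.links mystem

This creates F<mystem.tokens>, F<mystem.words>, F<mystem.docs>,
F<mystem.docfeats> and F<mystem.srcpar>, keeping only tokens seen at
least twice.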

=head1 DATA FORMAT

Input lines can have the R form for redirects:
     R <URL> <URL-REDIRECTED-TO>

These entries are ignored by this script and should first be
eliminated with I<linkRedir>(1).  The main input is the D form for
documents with their links and link text:
        D <URL> <HASHID> <TITLE>
        <OUTGOING-URL> <LINK-TEXT>
        ...
        EOL
        <TYPE> <TOKEN>
        ...
        EOD

The text "EOD" acts as a
document terminator and can be missing if no tokens exist.
The text "EOL" is a link terminator.  The <URL>s and <HASHID>s must not have
spaces or the processing will get confused since R and D records are
split on spaces.   Note text at the end of the line is an exception.
<HASHID> is any externally defined record identifier.  ALVIS default
is a 32 character hexadecimal from an MD5 hash of the text.

<TYPE> is intended to be a short piece of alphabetic text describing
the type, such as 'person' or 'company'.  Reserved <TYPE>s are 'doc',
a link to a document within the collection; 'link', a link out of the
collection; and 'text', which is any text.
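
For illustration, a complete record for one document with two outgoing
links and two tokens might look like this (all URLs, identifiers and
tokens are made up):

    R http://example.org/old http://example.org/a
    D http://example.org/a 0123456789abcdef0123456789abcdef Example Page
    http://example.org/b anchor text for page b
    http://elsewhere.example.com/ an external anchor
    EOL
    person Smith
    text example
    EOD

The R line would already have been removed by I<linkRedir>(1) in
normal use.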

=head1 SEE ALSO

I<Alvis::URLs>(3), 
I<linkBags>(1), 
I<linkMpca>(1), 
I<linkRedir>(1), 
I<mpdata>(1).

MPCA website is 
F<http://www.componentanalysis.org>

=head1 AUTHOR

Wray Buntine

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005-2006 Wray Buntine

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

=cut