# /=====================================================================\ #
# |  LaTeXML::Util::Pathname                                            | #
# | Pathname Utilities for LaTeXML                                      | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #
#======================================================================
# Sanely combine features of File::Spec and File::Basename
# Somehow, both modules tend to bite me at random times.
# eg. sometimes Basename's fileparse doesn't extract extension.
# sometimes File::Spec seems to do too many filesystem checks (gets slow!)
# File::Spec->splitpath "may or may not ... trailing '/'" ... Huh?
#======================================================================
# My first instinct is that this should bless the pathnames,
# but strings as pathnames come so naturally in perl;
# But I may still do it...
#======================================================================
# Some portability changes for Windows, thanks to Ioan Sucan.
#======================================================================
# Packages in the LaTeXML::Util package set have no dependence on LaTeXML
# objects or context.
#======================================================================
package LaTeXML::Util::Pathname;
use strict;
use warnings;
use File::Spec;
use File::Copy;
use File::Which;
use Cwd;
use base qw(Exporter);
our @EXPORT = qw( &pathname_find &pathname_findall &pathname_kpsewhich
  &pathname_make &pathname_canonical
  &pathname_split &pathname_directory &pathname_name &pathname_type
  &pathname_timestamp
  &pathname_concat
  &pathname_relative &pathname_absolute
  &pathname_is_absolute &pathname_is_contained
  &pathname_is_url &pathname_is_literaldata
  &pathname_protocol
  &pathname_cwd &pathname_chdir &pathname_mkdir &pathname_copy
  &pathname_installation);

# NOTE: For absolute pathnames, the directory component starts with
# whatever File::Spec considers to be the volume, or "/".
#======================================================================
# Ioan Sucan suggests switching this to '\\' for windows, but notes
# that it works as it is, so we'll leave it (for now).
### my $SEP         = '/';                          # [CONSTANT]
# Some indicators that this is not sufficient? (calls to libraries/externals???)
# PRELIMINARY test, probably need to be even more careful
my $ISWINDOWS   = $^O =~ /^(MSWin|NetWare|cygwin)/i;
my $SEP         = ($ISWINDOWS ? '\\' : '/');           # [CONSTANT]
my $KPATHSEP    = ($ISWINDOWS ? ';' : ':');            # [CONSTANT]
my $LITERAL_RE  = '(?:literal)(?=:)';                  # [CONSTANT]
my $PROTOCOL_RE = '(?:https|http|ftp)(?=:)';           # [CONSTANT]

#======================================================================
# pathname_make(dir=>dir, name=>name, type=>type);
# Returns a pathname.  This will be an absolute path if
# dir (or the first, if dir is an array), is absolute.
sub pathname_make {
  my (%pieces) = @_;
  my $pathname = '';
  if (my $dir = $pieces{dir}) {
    my @dirs = (ref $dir eq 'ARRAY' ? @$dir : ($dir));
    $pathname = shift(@dirs);
    foreach my $d (@dirs) {
      $pathname =~ s|\Q$SEP\E$||; $dir =~ s|^\Q$SEP\E||;
      $pathname .= $SEP . $dir; } }
  $pathname .= $SEP if $pathname && $pieces{name} && $pathname !~ m|\Q$SEP\E$|;
  $pathname .= $pieces{name} if $pieces{name};
  $pathname .= '.' . $pieces{type} if $pieces{type};
  return pathname_canonical($pathname); }

# Split the pathname into components (dir,name,type).
# If pathname is absolute, dir starts with volume or '/'
sub pathname_split {
  my ($pathname) = @_;
  $pathname = pathname_canonical($pathname);
  my ($vol, $dir, $name) = File::Spec->splitpath($pathname);
  # Hmm, for /, we get $dir = / but we want $vol='/'  ?????
  if ($vol) { $dir = $vol . $dir; }
  elsif (File::Spec->file_name_is_absolute($pathname) && !File::Spec->file_name_is_absolute($dir)) { $dir = $SEP . $dir; }
  # $dir shouldn't end with separator, unless it is root.
  $dir =~ s/\Q$SEP\E$// unless $dir eq $SEP;
  my $type = '';
  if ($name =~ s/\.([^\.]+)$//) { $type = $1; }
  return ($dir, $name, $type); }

use Carp;

# This likely needs portability work!!! (particularly regarding urls, separators, ...)
# AND, care about symbolic links and collapsing ../ !!!
sub pathname_canonical {
  my ($pathname) = @_;
  if ($pathname =~ /^($LITERAL_RE)/) {
    return $pathname; }
  # Don't call pathname_is_absolute, etc, here, cause THEY call US!
  confess "Undefined pathname!" unless defined $pathname;
  #  File::Spec->canonpath($pathname); }
  $pathname =~ s|^~|$ENV{HOME}|;
  # We CAN canonicalize urls, but we need to be careful about the // before host!
  # OHHH, but we DON'T want \ for separator!
  my $urlprefix = undef;
  if ($pathname =~ s|^($PROTOCOL_RE//[^/]*)/|/|) {
    $urlprefix = $1; }

  if ($pathname =~ m|//+/|) {
    Carp::cluck "Recursive pathname? : $pathname\n"; }
##  $pathname =~ s|//+|/|g;
  $pathname =~ s|/\./|/|g;
  # Collapse any foo/.. patterns, but not ../..
  while ($pathname =~ s|/(?!\.\./)[^/]+/\.\.(/\|$)|$1|) { }
  $pathname =~ s|^\./||;
  return (defined $urlprefix ? $urlprefix . $pathname : $pathname); }

# Convenient extractors;
sub pathname_directory {
  my ($pathname) = @_;
  my ($dir, $name, $type) = pathname_split($pathname);
  return $dir; }

sub pathname_name {
  my ($pathname) = @_;
  my ($dir, $name, $type) = pathname_split($pathname);
  return $name; }

sub pathname_type {
  my ($pathname) = @_;
  my ($dir, $name, $type) = pathname_split($pathname);
  return $type; }

# Note that this returns ONLY recognized protocols!
sub pathname_protocol {
  my ($pathname) = @_;
  return ($pathname =~ /^($PROTOCOL_RE|$LITERAL_RE)/ ? $1 : 'file'); }

#======================================================================
sub pathname_concat {
  my ($dir, $file) = @_;
  return $file unless $dir;
  return $dir if !defined $file || ($file eq '.');
  return pathname_canonical(File::Spec->catpath('', $dir || '', $file)); }

#======================================================================
# Is $pathname an absolute pathname ?
# pathname_is_absolute($pathname) => (0|1)
sub pathname_is_absolute {
  my ($pathname) = @_;
  return $pathname && File::Spec->file_name_is_absolute(pathname_canonical($pathname)); }

sub pathname_is_url {
  my ($pathname) = @_;
  return $pathname && $pathname =~ /^($PROTOCOL_RE)/ && $1; }    # Other protocols?

sub pathname_is_literaldata {
  my ($pathname) = @_;
  if ($pathname =~ /^($LITERAL_RE)/) { return $1; } else { return; } }

# Check whether $pathname is contained in (ie. underneath) $base
# Returns the relative pathname if it is underneath; undef otherwise.
sub pathname_is_contained {
  my ($pathname, $base) = @_;
  # after assuring that both paths are absolute,
  # get $pathname relative to $base
  my $rel = pathname_canonical(pathname_relative(pathname_absolute($pathname),
      pathname_absolute($base)));
  # If the relative pathname starts with "../" that it apparently is NOT underneath base!
  return ($rel =~ m|^\.\.(?:/\|\Q$SEP\E)| ? undef : $rel); }

# pathname_relative($pathname,$base) => $relativepathname
# If $pathname is an absolute, non-URL pathname,
# return the pathname relative to $base,
# else just return its canonical form.
# Actually, if it's a url and $base is also url, to SAME host! & protocol...
# we _could_ make relative...
sub pathname_relative {
  my ($pathname, $base) = @_;
  $pathname = pathname_canonical($pathname);
  return ($base && pathname_is_absolute($pathname) && !pathname_is_url($pathname)
    ? File::Spec->abs2rel($pathname, pathname_canonical($base))
    : $pathname); }

sub pathname_absolute {
  my ($pathname, $base) = @_;
  $pathname = pathname_canonical($pathname);
  return (!pathname_is_absolute($pathname) && !pathname_is_url($pathname)
    ? File::Spec->rel2abs($pathname, ($base ? pathname_canonical($base) : pathname_cwd()))
    : $pathname); }

#======================================================================
# Actual file system operations.
sub pathname_timestamp {
  my ($pathname) = @_;
  return -f $pathname ? (stat($pathname))[9] : 0; }

our $CWD = undef;
# DO NOT use pathname_cwd, unless you also use pathname_chdir to change dirs!!!
sub pathname_cwd {
  if (!defined $CWD) {
    if (my $cwd = cwd()) {
      $CWD = pathname_canonical($cwd); }
    else {
      # Fatal not imported
      die "INTERNAL: Could not determine current working directory (cwd)"
        . "Perhaps a problem with Perl's locale settings?"; } }
  return $CWD; }

sub pathname_chdir {
  my ($directory) = @_;
  chdir($directory);
  pathname_cwd();    # RE-cache $CWD!
  return; }

sub pathname_mkdir {
  my ($directory) = @_;
  return unless $directory;
  $directory = pathname_canonical($directory);
  my ($volume, $dirs, $last) = File::Spec->splitpath($directory);
  my (@dirs) = (File::Spec->splitdir($dirs), $last);
  for (my $i = 0 ; $i <= $#dirs ; $i++) {
    my $dir = File::Spec->catpath($volume, File::Spec->catdir(@dirs[0 .. $i]), '');
    if (!-d $dir) {
      mkdir($dir) or return; } }
  return $directory; }

# copy a file, preserving attributes, if possible.
# Why doesn't File::Copy preserve attributes on Unix !?!?!?
sub pathname_copy {
  my ($source, $destination) = @_;
  # If it _needs_ to be copied:
  $source      = pathname_canonical($source);
  $destination = pathname_canonical($destination);
  if ((!-f $destination) || (pathname_timestamp($source) > pathname_timestamp($destination))) {
    if (my $destdir = pathname_directory($destination)) {
      pathname_mkdir($destdir) or return; }
###    if($^O =~ /^(MSWin32|NetWare)$/){ # Windows
###      # According to Ioan, this should work:
###      system("xcopy /P $source $destination")==0 or return; }
###    else {               # Unix
###      system("cp --preserve=timestamps $source $destination")==0 or return; }
    # Hopefully this portably copies, preserving timestamp.
    copy($source, $destination) or return;
    my ($atime, $mtime) = (stat($source))[8, 9];
    utime $atime, $mtime, $destination;    # And set the modification time
  }
  return $destination; }

#======================================================================
# pathname_find($pathname, paths=>[...], types=>[...])  => $absolute_pathname;
# Find a file corresponding to $pathname returning the absolute,
# completed pathname if found, else undef
#  * If $pathname is a not an absolute pathname
#    (although it may still have directory components)
#    then if search $paths are given, search for it relative to
#    each of the directories in $paths,
#    else search for it relative to the current working directory.
#  * If types is given, then search (in each searched directory)
#    for the first file with the given extension.
#    The extension "" (empty string) means to search for the exact name.
#  * If types is not given, search for the exact named file
#    without additional extension.
#  * If installation_subdir is given, look in that subdirectory of where LaTeXML
#    was installed, by appending it to the paths.

# This is presumably daemon safe...
my @INSTALLDIRS = grep { (-f "$_.pm") && (-d $_) }
  map { pathname_canonical($_ . $SEP . 'LaTeXML') } @INC;    # [CONSTANT]

sub pathname_installation {
  return $INSTALLDIRS[0]; }

sub pathname_find {
  my ($pathname, %options) = @_;
  return unless $pathname;
  my @paths = candidate_pathnames($pathname, %options);
  foreach my $path (@paths) {
    return $path if -f $path; }
  return; }

sub pathname_findall {
  my ($pathname, %options) = @_;
  return unless $pathname;
  my @paths = candidate_pathnames($pathname, %options);
  return grep { -f $_ } @paths; }

# It's presumably cheep to concatinate all the pathnames,
# relative to the cost of testing for files,
# and this simplifies overall.
sub candidate_pathnames {
  my ($pathname, %options) = @_;
  my @dirs = ();
  $pathname = pathname_canonical($pathname) unless $pathname eq '*';
  my ($pathdir, $name, $type) = ($pathname eq '*' ? (undef, '*', undef) : pathname_split($pathname));
  $name .= '.' . $type if (defined $type) && ($type ne '');
  # generate the set of search paths we'll use.
  if (pathname_is_absolute($pathname)) {
    push(@dirs, $pathdir); }
  else {
    my $cwd = pathname_cwd();
    if ($options{paths}) {
      foreach my $p (@{ $options{paths} }) {
        # Complete the search paths by prepending current dir to relative paths,
        my $pp = pathname_concat((pathname_is_absolute($p) ? pathname_canonical($p) : pathname_concat($cwd, $p)),
          $pathdir);
        push(@dirs, $pp) unless grep { $pp eq $_ } @dirs; } }    # but only include each dir ONCE
    push(@dirs, pathname_concat($cwd, $pathdir)) unless @dirs;    # At least have the current directory!
           # And, if installation dir specified, append it.
    if (my $subdir = $options{installation_subdir}) {
      push(@dirs, map { pathname_concat($_, $subdir) } @INSTALLDIRS); } }

  # extract the desired extensions.
  my @exts = ();
  if ($options{type}) {
    push(@exts, '.' . $options{type}); }
  if ($options{types}) {
    foreach my $ext (@{ $options{types} }) {
      if ($ext eq '') { push(@exts, ''); }
      elsif ($ext eq '*') {
        push(@exts, '.*', ''); }
      elsif ($pathname =~ /\.\Q$ext\E$/i) {
        push(@exts, ''); }
      else {
        push(@exts, '.' . $ext); } } }
  push(@exts, '') unless @exts;

  my @paths = ();
  # Now, combine; precedence to leading directories.
  foreach my $dir (@dirs) {
    foreach my $ext (@exts) {
      if ($name eq '*') {    # Unfortunately, we've got to test the file system NOW...
        if ($ext eq '.*') {    # everything
          opendir(DIR, $dir) or next;
          push(@paths, map { pathname_concat($dir, $_) } grep { !/^\./ } readdir(DIR));
          closedir(DIR); }
        else {
          opendir(DIR, $dir) or next;    # ???
          push(@paths, map { pathname_concat($dir, $_) } grep { /\Q$ext\E$/ } readdir(DIR));
          closedir(DIR); } }
      elsif ($ext eq '.*') {             # Unfortunately, we've got to test the file system NOW...
        opendir(DIR, $dir) or next;      # ???
        push(@paths, map { pathname_concat($dir, $_) } grep { /^\Q$name\E\.\w+$/ } readdir(DIR));
        closedir(DIR); }
      else {
        push(@paths, pathname_concat($dir, $name . $ext)); } } }
  return @paths; }

#======================================================================
our $kpsewhich = which($ENV{LATEXML_KPSEWHICH} || 'kpsewhich');
our $kpse_cache = undef;

sub pathname_kpsewhich {
  my (@candidates) = @_;
  return unless $kpsewhich;
  build_kpse_cache() unless $kpse_cache;
  foreach my $file (@candidates) {
    if (my $result = $$kpse_cache{$file}) {
      return $result; } }
  # If we've failed to read the cache, try directly calling kpsewhich
  # For multiple calls, this is slower in general. But MiKTeX, eg., doesn't use texmf ls-R files!
  my $files = join(' ', @candidates);
  if ($kpsewhich && (my $result = `"$kpsewhich" $files`)) {
    if ($result =~ /^\s*(.+?)\s*\n/s) {
      return $1; } }
  return; }

sub build_kpse_cache {
  $kpse_cache = {};    # At least we've tried.
  return unless $kpsewhich;
       # This finds ALL the directories looked for for any purposes, including docs, fonts, etc
  my $texmf = `"$kpsewhich" --expand-var \'\\\$TEXMF\'`; chomp($texmf);
  # These are directories which contain the tex related files we're interested in.
  # (but they're typically below where the ls-R indexes are!)
  my $texpaths = `"$kpsewhich" --show-path tex`; chomp($texpaths);
  my @filters = ();
  foreach my $path (split(/$KPATHSEP/, $texpaths)) {
    $path =~ s/^!!//; $path =~ s|//+$|/|;
    push(@filters, $path) if -d $path; }
  $texmf =~ s/^["']//; $texmf =~ s/["']$//;
  $texmf =~ s/^\s*\\\{(.+?)}\s*$/$1/s;
  $texmf =~ s/\{\}//g;
  my @dirs = split(/,/, $texmf);
  foreach my $dir (@dirs) {
    $dir =~ s/^!!//;
    # Presumably if no ls-R, we can ignore the directory?
    if (-f "$dir/ls-R") {
      my $LSR;
      my $subdir;
      my $skip = 0;    # whether to skip entries in the current subdirectory.
      open($LSR, '<', "$dir/ls-R") or die "Cannot read $dir/ls-R: $!";
      while (<$LSR>) {
        chop;
        next unless $_;
        if (/^%/) { }
        elsif (/^(.*?):$/) {    # Move to a new subdirectory
          $subdir = $1;
          $subdir =~ s|^\./||;    # remove prefix
          my $d = $dir . '/' . $subdir;    # Hopefully OS safe, for comparison?
          $skip = !grep { $d =~ /^\Q$_\E/ } @filters; }    # check if one of the TeX paths
        elsif (!$skip) {
          # Is it safe to use '/' here?
          my $sep = '/';
          $$kpse_cache{$_} = join($sep, $dir, $subdir, $_); } }
      close($LSR); } }
  return; }

#======================================================================
1;

__END__

=pod 

=head1 NAME

C<LaTeXML::Util::Pathname>  - portable pathname and file-system utilities

=head1 DESCRIPTION

This module combines the functionality L<File::Spec> and L<File::Basename> to
give a consistent set of filename utilities for LaTeXML.
A pathname is represented by a simple string.

=head2 Pathname Manipulations

=over 4

=item C<< $path = pathname_make(%peices); >>

Constructs a pathname from the keywords in pieces
  dir   : directory
  name  : the filename (possibly with extension)
  type  : the filename extension

=item C<< ($dir,$name,$type) = pathname_split($path); >>

Splits the pathname C<$path> into the components: directory, name and type.

=item C<< $path = pathname_canonical($path); >>

Canonicallizes the pathname C<$path> by simplifying repeated slashes,
dots representing the current or parent directory, etc.

=item C<< $dir = pathname_directory($path); >>

Returns the directory component of the pathname C<$path>.

=item C<< $name = pathname_name($path); >>

Returns the name component of the pathname C<$path>.

=item C<< $type = pathname_type($path); >>

Returns the type component of the pathname C<$path>.

=item C<< $path = pathname_concat($dir,$file); >>

Returns the pathname resulting from concatenating
the directory C<$dir> and filename C<$file>.

=item C<< $boole = pathname_is_absolute($path); >>

Returns whether the pathname C<$path> appears to be an absolute pathname.

=item C<< $boole = pathname_is_url($path); >>

Returns whether the pathname C<$path> appears to be a url, rather than local file.

=item C<< $rel = pathname_is_contained($path,$base); >>

Checks whether C<$path> is underneath the directory C<$base>; if so
it returns the pathname C<$path> relative to C<$base>; otherwise returns undef.

=item C<< $path = pathname_relative($path,$base); >>

If C<$path> is an absolute, non-URL pathname,
returns the pathname relative to the directory C<$base>,
otherwise simply returns the canonical form of C<$path>.

=item C<< $path = pathname_absolute($path,$base); >>

Returns the absolute pathname resulting from interpretting
C<$path> relative to the directory C<$base>.  If C<$path>
is already absolute, it is returned unchanged.

=back

=head2 File System Operations

=over 4

=item C<< $modtime = pathname_timestamp($path); >>

Returns the modification time of the file named by C<$path>,
or undef if the file does not exist.

=item C<< $path = pathname_cwd(); >>

Returns the current working directory.

=item C<< $dir = pathname_mkdir($dir); >>

Creates the directory C<$dir> and all missing ancestors.
It returns C<$dir> if successful, else undef.

=item C<< $dest = pathname_copy($source,$dest); >>

Copies the file C<$source> to C<$dest> if needed;
ie. if C<$dest> is missing or older than C<$source>.
It preserves the timestamp of C<$source>.

=item C<< $path = pathname_find($name,%options); >>

Finds the first file named C<$name> that exists 
and that matches the specification
in the keywords C<%options>.  
An absolute pathname is returned.

If C<$name> is not already an absolute pathname, then
the option C<paths> determines directories to recursively search.
It should be a list of pathnames, any relative paths
are interpreted relative to the current directory.
If C<paths> is omitted, then the current directory is searched.

If the option C<installation_subdir> is given, it
indicates, in addition to the above, a directory relative
to the LaTeXML installation directory to search.
This allows files included with the distribution to be found.

The C<types> option specifies a list of filetypes to search for.
If not supplied, then the filename must match exactly.
The type C<*> matches any extension.

=item C<< @paths = pathname_findall($name,%options); >>

Like C<pathname_find>,
but returns I<all> matching (absolute) paths that exist.

=back

=head1 AUTHOR

Bruce Miller <bruce.miller@nist.gov>

=head1 COPYRIGHT

Public domain software, produced as part of work done by the
United States Government & not subject to copyright in the US.

=cut