##-*- Mode: CPerl -*-
##======================================================================
## Header Administrivia
##======================================================================
our $VERSION = '0.10';
pp_setversion($VERSION);
##------------------------------------------------------
## pm additions
pp_addpm({At=>'Top'},<<'EOPM');
use strict;
use version;
## $PDL_ATLEAST_2_014 : avoid in-place reshape() in _edit_pdl() for PDL >= 2.014
## + prior to PDL-2.014, PDL::reshape() returned a new PDL, but modifies
## the calling object in-place for v2.014
our $PDL_ATLEAST_2_014 = version->parse($PDL::VERSION) >= version->parse("2.014");
=pod
=head1 NAME
PDL::EditDistance - Wagner-Fischer edit distance and alignment for PDLs.
=head1 SYNOPSIS
use PDL;
use PDL::EditDistance;
##-- input PDLs
$a = pdl([map { ord($_) } qw(G U M B O)]);
$b = pdl([map { ord($_) } qw(G A M B O L)]);
$a1 = pdl([0, map { ord($_) } qw(G U M B O)]);
$b1 = pdl([0, map { ord($_) } qw(G A M B O L)]);
##-------------------------------------------------------------
## Levenshtein distance
$dist = edit_distance_static($a,$b, 0,1,1,1);
($dist,$align) = edit_align_static($a,$b, 0,1,1,1);
##-------------------------------------------------------------
## Wagner-Fischer distance
@costs = ($costMatch=0,$costInsert=1,$costDelete=1,$costSubstitute=2);
$dist = edit_distance_static($a,$b, @costs);
($dist,$align) = edit_align_static($a,$b, @costs);
##-------------------------------------------------------------
## General edit distance
$costsMatch = random($a->nelem+1, $b->nelem+1);
$costsIns = random($a->nelem+1, $b->nelem+1);
$costsDel = random($a->nelem+1, $b->nelem+1);
$costsSubst = random($a->nelem+1, $b->nelem+1);
@costs = ($costsMatch,$costsIns,$costDel,$costsSubst);
$dist = edit_distance_full($a,$b,@costs);
($dist,$align) = edit_align_full($a,$b,@costs);
##-------------------------------------------------------------
## Alignment
$op_match = align_op_match(); ##-- constant
$op_del = align_op_insert1(); ##-- constant
$op_ins = align_op_insert2(); ##-- constant
$op_subst = align_op_substitute(); ##-- constant
($apath,$bpath,$pathlen) = edit_bestpath($align);
($ai,$bi,$ops,$pathlen) = edit_pathtrace($align);
##-------------------------------------------------------------
## Longest Common Subsequence
$lcs = edit_lcs($a,$b);
($ai,$bi,$lcslen) = lcs_backtrace($a,$b,$lcs);
=cut
EOPM
## /pm additions
##------------------------------------------------------
##------------------------------------------------------
## Exports: None
pp_export_nothing();
##------------------------------------------------------
## Includes / defines
#pp_addhdr(<<'EOH');
#EOH
##======================================================================
## C Utilities
##======================================================================
pp_addhdr(<<'EOH');
#define ALIGN_OP_MATCH 0
#define ALIGN_OP_INSERT1 1
#define ALIGN_OP_INSERT2 2
#define ALIGN_OP_SUBSTITUTE 3
EOH
##======================================================================
## PDL::PP Wrappers
##======================================================================
##======================================================================
## Basic Utilities
#pp_addpm(<<'EOPM');
#=pod
#
#=head1 Basic Utilities
#
#=cut
#EOPM
##======================================================================
## Convenience Methods
##------------------------------------------------------
## _edit_pdl($a()) : ensures pdl-ness
#pp_add_exported('','_edit_pdl');
pp_addpm(<<'EOPM');
=pod
=head2 _edit_pdl
=for sig
Signature: (a(N); [o]apdl(N+1))
Convenience method.
Returns a pdl $apdl() suitable for representing $a(),
which can be specified as a UTF-8 or byte-string, as an arrays of numbers, or as a PDL.
$apdl(0) is always set to zero.
=cut
sub _edit_pdl {
if (UNIVERSAL::isa($_[0],'PDL')) {
return ($PDL_ATLEAST_2_014 ? $_[0]->pdl : $_[0])->flat->reshape($_[0]->nelem+1)->rotate(1);
}
#return pdl(byte,[0, map { ord($_) } split(//,$_[0])]) if (!ref($_[0]) && !utf8::is_utf8($_[0])); ##-- byte-string (old)
elsif (!ref($_[0])) {
return pdl(long,[0, unpack('C0C*',$_[0])]) if (utf8::is_utf8($_[0])); ##-- utf8-string
return pdl(byte,[0, unpack('U0C*',$_[0])]); ##-- byte-string
}
return pdl([0,@{$_[0]}]);
}
EOPM
##------------------------------------------------------
## edit_cost_matrices() : generate cost-matrices (convenience)
pp_add_exported('','edit_costs');
pp_addpm(<<'EOPM');
=pod
=head2 edit_costs
=for sig
Signature: (PDL::Type type; int N; int M;
[o]costsMatch(N+1,M+1); [o]costsIns(N+1,M+1); [o]costsDel(N+1,M+1); [o]costsSubst(N+1,M+1))
Convenience method.
Ensures existence and proper dimensionality of cost matrices for inputs
of length N and M.
=cut
sub edit_costs {
return _edit_costs($_[0],$_[1]+1,$_[2]+1,@_[3..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_costs() : generate cost-matrices (low-level, convenience)
pp_add_exported('','_edit_costs');
pp_addpm(<<'EOPM');
=pod
=head2 _edit_costs
=for sig
Signature: (PDL::Type type; int N1; int M1;
[o]costsMatch(N1,M1); [o]costsIns(N1,M1); [o]costsDel(N1,M1); [o]costsSubst(N1,M1))
Low-level method.
Ensures existence and proper dimensionality of cost matrices for inputs
of length N1-1 and M1-1.
=cut
sub _edit_costs {
#my ($type,$n1,$m1,$costsMatch,$costsIns,$costsDel,$costsSubst) = @_;
return (_edit_matrix(@_[0..2],$_[3]),
_edit_matrix(@_[0..2],$_[4]),
_edit_matrix(@_[0..2],$_[5]),
_edit_matrix(@_[0..2],$_[6]));
}
##-- $matrix = _edit_matrix($type,$dim0,$dim1,$mat)
sub _edit_matrix {
return zeroes(@_[0..2]) if (!defined($_[3]));
$_[3]->reshape(@_[1,2]) if ($_[3]->ndims != 2 || $_[3]->dim(0) != $_[1] || $_[3]->dim(1) != $_[2]);
return $_[3]->type == $_[0] ? $_[3] : $_[3]->convert($_[0]);
}
EOPM
##------------------------------------------------------
## edit_cost_static() : generate static cost-matrices (convenience)
pp_add_exported('','edit_costs_static');
pp_addpm(<<'EOPM');
=pod
=head2 edit_costs_static
=for sig
Signature: (PDL::Type type; int N; int M;
staticCostMatch(); staticCostIns(); staticCostSubst();
[o]costsMatch(N+1,M+1); [o]costsIns(N+1,M+1); [o]costsDel(N+1,M+1); [o]costsSubst(N+1,M+1))
Convenience method.
=cut
sub edit_costs_static {
#my ($type,$n,$m, $cMatch,$cIns,$cDel,$cSubst, $costsMatch,$costsIns,$costsDel,$costsSubst) = @_;
my @costs = edit_costs(@_[0..2],@_[7..$#_]);
$costs[$_] .= $_[$_+3] foreach (0..3);
return @costs;
}
EOPM
##======================================================================
## Distance: Full
##------------------------------------------------------
## edit_distance_matrix_full() : full distance matrix
pp_add_exported('','edit_distance_full');
pp_addpm(<<'EOPM');
=pod
=head2 edit_distance_full
=for sig
Signature: (a(N); b(M);
costsMatch(N+1,M+1); costsIns(N+1,M+1); costsDel(N+1,M+1); costsSubst(N+1,M+1);
[o]dist(N+1,M+1); [o]align(N+1,M+1))
Convenience method.
Compute the edit distance matrix for inputs $a() and $b(), and
cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst().
$a() and $b() may be specified as PDLs, arrays of numbers, or as strings.
=cut
sub edit_distance_full {
return _edit_distance_full(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_distance_full : get distance matrix from full cost matrices
pp_add_exported('','_edit_distance_full');
pp_def('_edit_distance_full',
Pars => ('a1(N1); b1(M1);'
.' costsMatch(N1,M1); costsIns(N1,M1); costsDel(N1,M1); costsSubst(N1,M1);'
.' [o]dist(N1,M1);'),
Code =>
('
int i,j;
//
//-- initialize distance matrix: insertion costs
$dist (N1=>0,M1=>0) = $costsMatch(N1=>0,M1=>0); //-- BOS always matches
for (i=1; i < $SIZE(N1); i++) {
$dist(N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costsDel(N1=>i,M1=>0); //-- delete (insert1)
}
for (j=1; j < $SIZE(M1); j++) {
$dist(N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costsIns(N1=>0,M1=>j); //-- insert (insert2)
}
//
//-- compute distance
for (i=1; i < $SIZE(N1); i++) {
for (j=1; j < $SIZE(M1); j++) {
$GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costsDel(N1=>i,M1=>j);
$GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costsIns(N1=>i,M1=>j);
$GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1);
if ($a1(N1=>i)==$b1(M1=>j)) {
cost_subst += $costsMatch(N1=>i,M1=>j);
} else {
cost_subst += $costsSubst(N1=>i,M1=>j);
}
//
if (cost_insert_1 < cost_insert_2) {
if (cost_insert_1 < cost_subst) {
$dist( N1=>i,M1=>j) = cost_insert_1;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
}
} else if (cost_insert_2 < cost_subst) {
$dist( N1=>i,M1=>j) = cost_insert_2;
} else {
$dist( N1=>i,M1=>j) = cost_subst;
}
}
}
'),
Doc =>
q(
Low-level method.
Compute the edit distance matrix for input PDLs $a1() and $b1() and
cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst().
The first elements of $a1() and $b1() are ignored.
),
);
##======================================================================
## Distance + Alignment: Full
##------------------------------------------------------
## edit_align_full() : full distance & alignment matrix
pp_add_exported('','edit_align_full');
pp_addpm(<<'EOPM');
=pod
=head2 edit_align_full
=for sig
Signature: (a(N); b(M);
costsMatch(N+1,M+1); costsIns(N+1,M+1); costsDel(N+1,N+1); costsSubst(N+1,M+1);
[o]dist(N+1,M+1); [o]align(N+1,M+1))
Convenience method.
Compute the edit distance and alignment matrices for inputs $a() and $b(), and
cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst().
$a() and $b() may be specified as PDLs, arrays of numbers, or as strings.
=cut
sub edit_align_full {
return _edit_align_full(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_align_full : get distance & alignment matrices given full cost matrices
pp_add_exported('','_edit_align_full');
pp_def('_edit_align_full',
Pars => ('a1(N1); b1(M1);'
.' costsMatch(N1,M1); costsIns(N1,M1); costsDel(N1,M1); costsSubst(N1,M1);'
.' [o]dist(N1,M1); byte [o]align(N1,M1);'),
Code =>
('
int i,j;
//
//-- initialize distance matrix: insertion costs
$dist (N1=>0,M1=>0) = $costsMatch(N1=>0,M1=>0); //-- BOS always matches
$align(N1=>0,M1=>0) = ALIGN_OP_MATCH; //-- ... marked as "substitute"
for (i=1; i < $SIZE(N1); i++) {
$dist (N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costsDel(N1=>i,M1=>0);
$align(N1=>i,M1=>0) = ALIGN_OP_INSERT1;
}
for (j=1; j < $SIZE(M1); j++) {
$dist (N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costsIns(N1=>0,M1=>j);
$align(N1=>0,M1=>j) = ALIGN_OP_INSERT2;
}
//
//-- compute distance
for (i=1; i < $SIZE(N1); i++) {
for (j=1; j < $SIZE(M1); j++) {
$GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costsDel(N1=>i,M1=>j);
$GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costsIns(N1=>i,M1=>j);
$GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1);
char subst_op;
//
if ($a1(N1=>i)==$b1(M1=>j)) {
cost_subst += $costsMatch(N1=>i,M1=>j);
subst_op = ALIGN_OP_MATCH;
} else {
cost_subst += $costsSubst(N1=>i,M1=>j);
subst_op = ALIGN_OP_SUBSTITUTE;
}
//
if (cost_insert_1 < cost_insert_2) {
if (cost_insert_1 < cost_subst) {
$dist( N1=>i,M1=>j) = cost_insert_1;
$align(N1=>i,M1=>j) = ALIGN_OP_INSERT1;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
$align(N1=>i,M1=>j) = subst_op;
}
} else if (cost_insert_2 < cost_subst) {
$dist( N1=>i,M1=>j) = cost_insert_2;
$align(N1=>i,M1=>j) = ALIGN_OP_INSERT2;
} else {
$dist( N1=>i,M1=>j) = cost_subst;
$align(N1=>i,M1=>j) = subst_op;
}
}
}
'),
Doc =>
q(
Low-level method.
Compute the edit distance and alignment matrix for input PDLs $a1() and $b1() and
cost matrices $costsMatch(), $costsIns(), $costsDel(), and $costsSubst().
The first elements of $a1() and $b1() are ignored.
),
);
##======================================================================
## Distance: Static
##------------------------------------------------------
## edit_distance_static() : distance matrix using static cost schema
pp_add_exported('','edit_distance_static');
pp_addpm(<<'EOPM');
=pod
=head2 edit_distance_static
=for sig
Signature: (a(N); b(M);
staticCostMatch(); staticCostIns(); staticCostDel(); staticCostSubst();
[o]dist(N+1,M+1))
Convenience method.
Compute the edit distance matrix for inputs $a() and $b() given
a static cost schema @costs = ($staticCostMatch(), $staticCostIns(), $staticCostDel(), and $staticCostSubst()).
$a() and $b() may be specified as PDLs, arrays of numbers, or as strings.
Functionally equivalent to edit_distance_full($matches,@costs,$dist),
but slightly faster.
=cut
sub edit_distance_static {
return _edit_distance_static(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_distance_static : get distance matrix from static cost schema
pp_add_exported('','_edit_distance_static');
pp_def('_edit_distance_static',
Pars => ('a1(N1); b1(M1); costMatch(); costIns(); costDel(); costSubst();'
.' [o]dist(N1,M1);'),
Code =>
('
int i,j;
//
//-- initialize distance matrix: insertion costs
$dist(N1=>0,M1=>0) = $costMatch(); //-- BOS always matches
for (i=1; i < $SIZE(N1); i++) {
$dist(N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costDel(); //-- delete (insert1)
}
for (j=1; j < $SIZE(M1); j++) {
$dist(N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costIns(); //-- insert (insert2)
}
//
//-- compute distance
for (i=1; i < $SIZE(N1); i++) {
for (j=1; j < $SIZE(M1); j++) {
$GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costDel();
$GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costIns();
$GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1) + ($a1(N1=>i)==$b1(M1=>j)
? $costMatch()
: $costSubst());
if (cost_insert_1 < cost_insert_2) {
if (cost_insert_1 < cost_subst) {
$dist(N1=>i,M1=>j) = cost_insert_1;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
}
} else if (cost_insert_2 < cost_subst) {
$dist(N1=>i,M1=>j) = cost_insert_2;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
}
}
}
'),
Doc =>
q(
Low-level method.
Compute the edit distance matrix for input PDLs $a1() and $b1() given a
static cost schema @costs = ($costMatch(), $costIns(), $costDel(), $costSubst()).
Functionally identitical to _edit_distance_matrix_full($matches,@costs,$dist),
but slightly faster.
The first elements of $a1() and $b1() are ignored.
),
);
##======================================================================
## Distance + Alignment: Static
##------------------------------------------------------
## edit_align_static() : distance + alignment matrices using static cost schema
pp_add_exported('','edit_align_static');
pp_addpm(<<'EOPM');
=pod
=head2 edit_align_static
=for sig
Signature: (a(N); b(M);
staticCostMatch(); staticCostIns(); staticCostDel(); staticCostSubst();
[o]dist(N+1,M+1); [o]align(N+1,M+1))
Convenience method.
Compute the edit distance and alignment matrices for inputs $a() and $b() given
a static cost schema @costs = ($staticCostMatch(), $staticCostIns(), $staticCostDel(), and $staticCostSubst()).
$a() and $b() may be specified as PDLs, arrays of numbers, or as strings.
Functionally equivalent to edit_align_full($matches,@costs,$dist),
but slightly faster.
=cut
sub edit_align_static {
return _edit_align_static(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_align_static : get distance & alignment matrices from static cost schema
pp_add_exported('','_edit_align_static');
pp_def('_edit_align_static',
Pars => ('a1(N1); b1(M1); costMatch(); costIns(); costDel(); costSubst();'
.' [o]dist(N1,M1); byte [o]align(N1,M1)'),
Code =>
('
int i,j;
//
//-- initialize distance matrix: insertion costs
$dist( N1=>0,M1=>0) = $costMatch(); //-- BOS always matches
$align(N1=>0,M1=>0) = ALIGN_OP_MATCH; //-- ... and is marked as "match"
for (i=1; i < $SIZE(N1); i++) {
$dist (N1=>i,M1=>0) = $dist(N1=>i-1,M1=>0) + $costDel(); //-- delete (insert1)
$align(N1=>i,M1=>0) = ALIGN_OP_INSERT1;
}
for (j=1; j < $SIZE(M1); j++) {
$dist (N1=>0,M1=>j) = $dist(N1=>0,M1=>j-1) + $costIns(); //-- insert (insert2)
$align(N1=>0,M1=>j) = ALIGN_OP_INSERT2;
}
//
//-- compute distance
for (i=1; i < $SIZE(N1); i++) {
for (j=1; j < $SIZE(M1); j++) {
$GENERIC(dist) cost_insert_1 = $dist(N1=>i-1,M1=>j ) + $costDel();
$GENERIC(dist) cost_insert_2 = $dist(N1=>i, M1=>j-1) + $costIns();
$GENERIC(dist) cost_subst = $dist(N1=>i-1,M1=>j-1);
char subst_op;
//
if ($a1(N1=>i)==$b1(M1=>j)) {
cost_subst += $costMatch();
subst_op = ALIGN_OP_MATCH;
} else {
cost_subst += $costSubst();
subst_op = ALIGN_OP_SUBSTITUTE;
}
//
if (cost_insert_1 < cost_insert_2) {
if (cost_insert_1 < cost_subst) {
$dist(N1=>i,M1=>j) = cost_insert_1;
$align(N1=>i,M1=>j) = ALIGN_OP_INSERT1;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
$align(N1=>i,M1=>j) = subst_op;
}
} else if (cost_insert_2 < cost_subst) {
$dist(N1=>i,M1=>j) = cost_insert_2;
$align(N1=>i,M1=>j) = ALIGN_OP_INSERT2;
} else {
$dist(N1=>i,M1=>j) = cost_subst;
$align(N1=>i,M1=>j) = subst_op;
}
}
}
'),
Doc =>
q(
Low-level method.
Compute the edit distance and alignment matrices for input PDLs $a1() and $b1() given a
static cost schema @costs = ($costMatch(), $costIns(), $costDel(), $costSubst()).
Functionally identitical to _edit_distance_matrix_full($matches,@costs,$dist),
but slightly faster.
The first elements of $a1() and $b1() are ignored.
),
);
##==============================================================
## Alignment
##------------------------------------------------------
## Alignment: Constants
pp_add_exported('','align_op_insert1');
pp_def('align_op_insert1',
Pars=>'[o]a()',
Code=>'$a() = ALIGN_OP_INSERT1;',
Doc => 'Alignment matrix value constant for insertion operations on $a() string.',
);
pp_add_exported('','align_op_insert2');
pp_def('align_op_insert2',
Pars=>'[o]a()',
Code=>'$a() = ALIGN_OP_INSERT2;',
Doc => 'Alignment matrix value constant for insertion operations on $a() string.',
);
pp_add_exported('','align_op_match');
pp_def('align_op_match',
Pars=>'[o]a()',
Code=>'$a() = ALIGN_OP_MATCH;',
Doc => 'Alignment matrix value constant for matches.',
);
pp_add_exported('','align_op_substitute');
pp_def('align_op_substitute',
Pars=>'[o]a()',
Code=>'$a() = ALIGN_OP_SUBSTITUTE;',
Doc => 'Alignment matrix value constant for substitution operations.',
);
pp_add_exported('','align_op_insert');
pp_add_exported('','align_op_delete');
pp_addpm(<<'EOPM');
=pod
=head2 align_op_delete
Alias for align_op_insert1()
=head2 align_op_insert
Alias for align_op_insert2()
=cut
*align_op_delete = \&align_op_insert1;
*align_op_insert = \&align_op_insert2;
EOPM
pp_add_exported('','align_ops');
pp_addpm(<<'EOPM');
=pod
=head2 align_ops
=for sig
Signature: ([o]ops(4))
Alignment matrix value constants 4-element pdl (match,insert1,insert2,substitute).a
=cut
sub align_ops { return PDL->sequence(PDL::byte(),4); }
EOPM
##------------------------------------------------------
## Alignment: best path
##------------------------------------------------------
## edit_bestpath() : distance + alignment matrices using static cost schema
pp_add_exported('','edit_bestpath');
pp_addpm(<<'EOPM');
=pod
=head2 edit_bestpath
=for sig
Signature: (align(N+1,M+1); [o]apath(N+M+2); [o]bpath(N+M+2); [o]pathlen())
Convenience method.
Compute best path through alignment matrix $align().
Stores paths for original input strings $a() and $b() in $apath() and $bpath()
respectively.
Negative values in $apath() and $bpath() indicate insertion/deletion operations.
On completion, $pathlen() holds the actual length of the paths.
=cut
sub edit_bestpath {
my ($align,$apath,$bpath,$len) = @_;
$len=pdl(long,$align->dim(0)+$align->dim(1)) if (!defined($len));
if (!defined($apath)) { $apath=zeroes(long,$len); }
else { $apath->reshape($len) if ($apath->nelem < $len); }
if (!defined($bpath)) { $bpath = zeroes(long,$len); }
else { $bpath->reshape($len) if ($bpath->nelem < $len); }
_edit_bestpath($align, $apath, $bpath, $len, $align->dim(0)-1, $align->dim(1)-1);
return ($apath,$bpath,$len);
}
EOPM
##------------------------------------------------------
## _edit_bestpath : get best path
pp_add_exported('','_edit_bestpath');
pp_def('_edit_bestpath',
Pars => ('align(N1,M1); int [o]apath(L); int [o]bpath(L); int [o]len();'),
OtherPars => ('int ifinal; int jfinal'),
Code =>
('
int i,j,p,endp,tmp;
char op;
//-- get reversed path & real path length
$len()=0;
for (p=0,i=$COMP(ifinal),j=$COMP(jfinal); p < $SIZE(L) && (i>0 || j>0); p++) {
$apath(L=>p) = i-1;
$bpath(L=>p) = j-1;
//
op = $align(N1=>i,M1=>j);
if (op==ALIGN_OP_INSERT1) {
--i;
} else if (op==ALIGN_OP_INSERT2) {
--j;
} else { /* if (op==ALIGN_OP_MATCH || op==ALIGN_OP_SUBSTITUTE) */
--i;
--j;
}
}
$len() = p;
//-- now reverse the paths
for (p=0; p < ($len()+1)/2; p++) {
endp = $len()-p-1;
//
tmp = $apath(L=>p);
$apath(L=>p) = $apath(L=>endp);
$apath(L=>endp) = tmp;
//
tmp = $bpath(L=>p);
$bpath(L=>p) = $bpath(L=>endp);
$bpath(L=>endp) = tmp;
}
//-- now, sanitize the paths
for (p=$len(); p > 0; p--) {
if ($apath(L=>p) == $apath(L=>p-1)) $apath(L=>p) = -1;
if ($bpath(L=>p) == $bpath(L=>p-1)) $bpath(L=>p) = -1;
}
'),
Doc =>
q(
Low-level method.
Compute best path through alignment matrix $align() from final index ($ifinal,$jfinal).
Stores paths for (original) input strings $a() and $b() in $apath() and $bpath()
respectively.
Negative values in $apath() and $bpath() indicate insertion/deletion operations.
On completion, $pathlen() holds the actual length of the paths.
),
);
##------------------------------------------------------
## Alignment: operation backtrace
##------------------------------------------------------
## edit_backtrace() : alignment operation backtrace
pp_add_exported('','edit_pathtrace');
pp_addpm(<<'EOPM');
=pod
=head2 edit_pathtrace
=for sig
Signature: ( align(N+1,M+1); [o]ai(L); [o]bi(L); [o]ops(L); [o]$pathlen() )
Convenience method.
Compute alignment path backtrace through alignment matrix $align() from final index ($ifinal,$jfinal).
Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi()
respectively.
Unlike edit_bestpath(), null-moves for $ai() and $bi() are not stored here as negative values.
Returned pdls ($ai,$bi,$ops) are trimmed to the appropriate path length.
=cut
sub edit_pathtrace {
my ($align,$ai,$bi,$ops,$len) = @_;
$len=pdl(long,$align->dim(0)+$align->dim(1)) if (!defined($len));
if (!defined($ai)) { $ai=zeroes(long,$len); }
else { $ai->reshape($len) if ($ai->nelem < $len); }
if (!defined($bi)) { $bi = zeroes(long,$len); }
else { $bi->reshape($len) if ($bi->nelem < $len); }
if (!defined($ops)) { $ops = zeroes(long,$len); }
else { $ops->reshape($len) if ($ops->nelem < $len); }
_edit_pathtrace($align, $ai,$bi,$ops,$len, $align->dim(0)-1,$align->dim(1)-1);
my $lens = ($len->sclr-1);
return ((map { $_->slice("0:$lens") } ($ai,$bi,$ops)), $len);
}
EOPM
##------------------------------------------------------
## _edit_pathlog : trace path
pp_add_exported('','_edit_pathtrace');
pp_def('_edit_pathtrace',
Pars => ('align(N1,M1); int [o]ai(L); int [o]bi(L); int [o]ops(L); int [o]len();'),
OtherPars => ('int ifinal; int jfinal'),
Code =>
('
int i,j,p, tmp;
//-- get reversed path & real path length
$len() = 0;
for (p=0, i=$COMP(ifinal),j=$COMP(jfinal); p < $SIZE(L) && (i>0 || j>0); p++) {
int op = $align(N1=>i,M1=>j);
$ai(L=>p) = i;
$bi(L=>p) = j;
$ops(L=>p) = op;
switch (op) {
case ALIGN_OP_INSERT1: --i; break;
case ALIGN_OP_INSERT2: --j; break;
case ALIGN_OP_MATCH:
case ALIGN_OP_SUBSTITUTE:
default: --i; --j; break;
}
}
$len() = p;
//-- now reverse the paths
for (p=0; p < ($len()+1)/2; p++) {
int endp = $len()-p-1;
//
tmp = $ai(L=>p);
$ai(L=>p) = $ai(L=>endp);
$ai(L=>endp) = tmp;
//
tmp = $bi(L=>p);
$bi(L=>p) = $bi(L=>endp);
$bi(L=>endp) = tmp;
//
tmp = $ops(L=>p);
$ops(L=>p) = $ops(L=>endp);
$ops(L=>endp) = tmp;
}
'),
Doc =>
q(
Low-level method.
Compute alignment path backtrace through alignment matrix $align() from final index ($ifinal,$jfinal).
Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi()
respectively.
Unlike edit_bestpath(), null-moves for $ai() and $bi() are not stored here as negative values.
Returned pdls ($ai,$bi,$ops) are trimmed to the appropriate path length.
),
);
##======================================================================
## Longest Common Subsequence (LCS)
##------------------------------------------------------
## edit_lcs() : lcs matrix (convenience)
pp_add_exported('','edit_lcs');
pp_addpm(<<'EOPM');
=pod
=head2 edit_lcs
=for sig
Signature: (a(N); b(M); int [o]lcs(N+1,M+1);)
Convenience method.
Compute the longest common subsequence (LCS) matrix for input PDLs $a1() and $b1().
The output matrix $lcs() contains at cell ($i+1,$j+1) the length of the LCS
between $a1(0..$i) and $b1(0..$j); thus $lcs($N,$M) contains the
length of the LCS between $a() and $b().
=cut
sub edit_lcs {
return _edit_lcs(_edit_pdl($_[0]), _edit_pdl($_[1]), @_[2..$#_]);
}
EOPM
##------------------------------------------------------
## _edit_lcs : get longest common subsequence matrix
pp_add_exported('','_edit_lcs');
pp_def('_edit_lcs',
Pars => ('a1(N1); b1(M1); int [o]lcs(N1,M1);'),
Code =>
('
int i,j, iprev,jprev;
//
//-- initialize lcs matrix: lcs=0 at borders
for (i=0; i < $SIZE(N1); i++) { $lcs(N1=>i,M1=>0) = 0; }
for (j=1; j < $SIZE(M1); j++) { $lcs(N1=>0,M1=>j) = 0; }
//
//-- compute lcs
for (i=1,iprev=0; i < $SIZE(N1); i++,iprev++) {
for (j=1,jprev=0; j < $SIZE(M1); j++,jprev++) {
//
if ($a1(N1=>i)==$b1(M1=>j)) {
$lcs(N1=>i,M1=>j) = $lcs(N1=>iprev,M1=>jprev)+1;
} else {
$GENERIC(lcs) lcs_i_jp = $lcs(N1=>i,M1=>jprev);
$GENERIC(lcs) lcs_ip_j = $lcs(N1=>iprev,M1=>j);
$lcs(N1=>i,M1=>j) = lcs_i_jp > lcs_ip_j ? lcs_i_jp : lcs_ip_j;
}
}
}
'),
Doc =>
q(
Low-level method.
Compute the longest common subsequence (LCS) matrix for input PDLs $a1() and $b1().
The initial (zeroth) elements of $a1() and $b1() are ignored.
The output matrix $lcs() contains at cell ($i,$j) the length of the LCS
between $a1(1..$i) and $b1(1..$j); thus $lcs($N1-1,$M1-1) contains the
length of the LCS between $a1() and $b1().
),
);
##------------------------------------------------------
## lcs_backtrace : get LCS backtrace (convenience)
pp_add_exported('','lcs_backtrace');
pp_addpm(<<'EOPM');
=pod
=head2 lcs_backtrace
=for sig
Signature: (a(N); b(M); int lcs(N+1,M+1); int ifinal(); int jfinal(); int [o]ai(L); int [o]bi(L); int [o]len())
Convenience method.
Compute longest-common-subsequence backtrace through LCS matrix $lcs()
for original input strings ($a(),$b()) from final index ($ifinal,$jfinal).
Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi()
respectively.
=cut
sub lcs_backtrace {
my ($a,$b,$lcs,$ifinal,$jfinal,$ai,$bi,$len) = @_;
$len=pdl(long, pdl(long,$lcs->dims)->min) if (!defined($len));
if (!defined($ai)) { $ai=zeroes(long,$len); }
else { $ai->reshape($len) if ($ai->nelem < $len); }
if (!defined($bi)) { $bi = zeroes(long,$len); }
else { $bi->reshape($len) if ($bi->nelem < $len); }
if (!defined($ifinal)) { $ifinal = $lcs->dim(0)-1; }
if (!defined($jfinal)) { $jfinal = $lcs->dim(1)-1; }
_lcs_backtrace(_edit_pdl($a),_edit_pdl($b), $lcs,$ifinal,$jfinal, $ai,$bi,$len);
my $lens = ($len->sclr-1);
return ($ai->slice("0:$lens"),$bi->slice("0:$lens"), $len);
}
EOPM
##------------------------------------------------------
## _lcs_backtrace : get LCS backtrace
pp_add_exported('','_lcs_backtrace');
pp_def('_lcs_backtrace',
Pars => ('a1(N1); b1(M1); int lcs(N1,M1); int ifinal(); int jfinal(); [o]ai(L); [o]bi(L); int [o]len()'),
Code =>
('
int i,j,p, iprev,jprev, tmp;
//-- get reversed path & real path length
$len() = 0;
for (p=0, i=$ifinal(), j=$jfinal(); p < $SIZE(L) && i>0 && j>0; ) {
$GENERIC(a1) a1i = $a1(N1=>i);
$GENERIC(b1) b1j = $b1(M1=>j);
iprev=i-1;
jprev=j-1;
if (a1i==b1j) {
$ai(L=>p) = iprev;
$bi(L=>p) = jprev;
--i;
--j;
++p;
} else if ($lcs(N1=>i,M1=>jprev) > $lcs(N1=>iprev,M1=>j)) {
--j;
} else {
--i;
}
}
$len() = p;
//-- now reverse the paths
for (p=0; p < ($len()+1)/2; p++) {
int endp = $len()-p-1;
//
tmp = $ai(L=>p);
$ai(L=>p) = $ai(L=>endp);
$ai(L=>endp) = tmp;
//
tmp = $bi(L=>p);
$bi(L=>p) = $bi(L=>endp);
$bi(L=>endp) = tmp;
}
'),
Doc =>
q(
Low-level method.
Compute longest-common-subsequence backtrace through LCS matrix $lcs()
for initial-padded strings ($a1(),$b1()) from final index ($ifinal,$jfinal).
Stores raw paths for (original) input strings $a() and $b() in $ai() and $bi()
respectively.
),
);
##======================================================================
## Footer Administrivia
##======================================================================
##------------------------------------------------------
## pm additions
pp_addpm(<<'EOPM');
##---------------------------------------------------------------------
=pod
=head1 ACKNOWLEDGEMENTS
Perl by Larry Wall.
PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others.
=cut
##----------------------------------------------------------------------
=pod
=head1 KNOWN BUGS
Probably many.
=cut
##---------------------------------------------------------------------
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head2 Copyright Policy
Copyright (C) 2006-2015, Bryan Jurish. All rights reserved.
This package is free software, and entirely without warranty.
You may redistribute it and/or modify it under the same terms
as Perl itself, either Perl 5.20.2, or at your option any later
version of Perl 5.
=head1 SEE ALSO
perl(1), PDL(3perl).
=cut
EOPM
# Always make sure that you finish your PP declarations with
# pp_done
pp_done();
##----------------------------------------------------------------------