our
$VERSION
=
'0.15'
;
my
$count
= 0;
sub
new {
my
(
$class
,
%params
) =
@_
;
*add_document
= \
&add
;
*add_documents
= \
&bulk_add
;
*optimize
= \
&reweight_graph
;
*is_indexed
= \
&has_doc
;
my
@allowed
=
qw/debug auto_reweight use_global_weights max_depth START_ENERGY ACTIVATE_THRESHOLD COLLECT_THRESHOLD use_file xs/
;
my
%check
;
$check
{
$_
}++
foreach
@allowed
;
my
@forbidden
;
foreach
my
$k
(
keys
%params
) {
push
@forbidden
,
$k
unless
exists
$check
{
$k
};
}
if
(
@forbidden
) {
croak
"The following unrecognized parameters were detected: "
,
join
", "
,
@forbidden
;
}
my
$obj
=
bless
{
debug
=> 0,
auto_reweight
=> 1,
use_global_weights
=> 1,
max_depth
=> 100000000,
START_ENERGY
=> 100,
ACTIVATE_THRESHOLD
=> 1,
COLLECT_THRESHOLD
=> .2,
%params
,
depth
=> 0,
neighbors
=> {},
},
$class
;
if
(
$obj
->{use_file} ) {
my
%neighbors
;
use
MLDBM
qw/DB_File Storable/
;
warn
"Using MLDBM: $obj->{use_file}"
;
$obj
->{neighbors} =
tie
%neighbors
,
'MLDBM'
,
$obj
->{use_file} or
die
$!;
}
return
$obj
;
}
{
my
$parse_sub
;
sub
load_from_dir {
my
(
$class
,
$dir
,
$code
) =
@_
;
croak
"$dir is not a directory"
unless
-d
$dir
;
unless
(
defined
$code
and
ref
$code
and
ref
$code
eq
'CODE'
) {
$code
=
sub
{
my
$text
=
shift
;
$text
=~ s/[^\w]/ /gs;
my
@toks
=
split
/\s+/m,
$text
;
return
grep
{
length
(
$_
) > 1 }
@toks
;
};
}
$parse_sub
=
$code
;
my
%docs
;
my
$reader
=
sub
{
my
(
$parse
) =
@_
;
return
if
/^\./;
return
unless
-f
$_
;
open
my
$fh
,
$_
or
croak
"Could not open file $File::Find::name: $!"
;
local
$/;
my
$contents
= <
$fh
>;
close
$fh
or croak
"failed to close filehandle"
;
my
@words
=
$parse_sub
->(
$contents
);
$docs
{
$File::Find::name
} = \
@words
;
};
find(
$reader
,
$dir
);
my
$self
= __PACKAGE__->new();
$self
->bulk_add(
%docs
);
return
$self
;
}
}
sub
load_from_tdm {
my
(
$self
,
$file
) =
@_
;
croak
"TDM file $file does not exist"
unless
-f
$file
;
return
if
$self
->{
'loaded'
};
$self
->_read_tdm(
$file
);
$self
->{
'loaded'
} = 1;
$self
->reweight_graph();
}
sub
rename
{
my
(
$self
,
$old
,
$new
) =
@_
;
croak
"rename method needs two arguments"
unless
defined
$old
and
defined
$new
;
croak
"document $old not found"
unless
exists
$self
->{neighbors}{ _nodeify(
'D'
,
$old
) };
my
$bad
= _nodeify(
'D'
,
$old
);
my
$good
= _nodeify(
'D'
,
$new
);
return
if
exists
$self
->{neighbors}{
$good
};
my
$s
=
$self
->{neighbors};
foreach
my
$n
(
keys
%{
$s
->{
$bad
} } ) {
$s
->{
$good
}{
$n
} =
$s
->{
$n
}{
$good
} =
$s
->{
$bad
}{
$n
};
delete
$s
->{
$bad
}{
$n
};
delete
$s
->{
$n
}{
$bad
};
}
delete
$s
->{
$bad
};
return
1;
}
sub
retrieve {
my
(
$self
,
$file
) =
@_
;
croak
"Must provide a filename to retrieve graph"
unless
$file
;
croak
"'$file' is not a file"
unless
-f
$file
;
Storable::retrieve(
$file
);
}
sub
get_activate_threshold {
$_
[0]->{
'ACTIVATE_THRESHOLD'
} }
sub
set_activate_threshold {
my
(
$self
,
$threshold
) =
@_
;
croak
"Can't set activate threshold to zero"
unless
$threshold
;
croak
"Can't set activate threshold to negative value"
unless
$threshold
> 0;
$self
->{
'ACTIVATE_THRESHOLD'
} =
$_
[1];
}
sub
get_auto_reweight{
$_
[0]->{auto_reweight} }
sub
set_auto_reweight{
$_
[0]->{auto_reweight} =
$_
[0]->[1]; }
sub
get_collect_threshold {
return
(
$_
[0]->{
'xs'
} ?
$_
[0]->{Graph}->collectionThreshold :
$_
[0]->{
'COLLECT_THRESHOLD'
})
}
sub
set_collect_threshold {
my
(
$self
,
$newval
) =
@_
;
$newval
||=0;
$self
->{Graph}->collectionThreshold(
$newval
)
if
$self
->{
'xs'
};
$self
->{
'COLLECT_THRESHOLD'
} =
$newval
|| 0;
return
1;
}
sub
get_debug_mode {
$_
[0]->{debug} }
sub
set_debug_mode {
my
(
$self
,
$mode
) =
@_
;
$self
->{
'debug'
} =
$mode
;
}
sub
get_initial_energy {
$_
[0]->{
'START_ENERGY'
} }
sub
set_initial_energy {
my
(
$self
,
$start_energy
) =
@_
;
croak
"Can't set initial energy to zero"
unless
$start_energy
;
croak
"Can't set initial energy to negative value"
unless
$start_energy
> 0;
$self
->{
'START_ENERGY'
} =
$start_energy
;
}
sub
get_max_depth {
$_
[0]->{max_depth} }
sub
set_max_depth { croak
"Tried to set maximum depth to an undefined value"
unless
defined
$_
[1];
$_
[0]->{max_depth} =
$_
[1] || 100000000
}
sub
add {
my
(
$self
,
$title
,
$words
) =
@_
;
croak
"Please provide a word list"
unless
defined
$words
;
croak
"Word list is not a reference to an array or hash"
unless
ref
$words
and
ref
$words
eq
"HASH"
or
ref
$words
eq
"ARRAY"
;
croak
"Please provide a document identifier"
unless
defined
$title
;
my
$dnode
= _nodeify(
'D'
,
$title
);
croak
"Tried to add document with duplicate identifier: '$title'\n"
if
exists
$self
->{neighbors}{
$dnode
};
my
@list
;
if
(
ref
$words
eq
'ARRAY'
) {
@list
= @{
$words
};
}
else
{
@list
=
keys
%{
$words
};
}
croak
"Tried to add a document with no content"
unless
scalar
@list
;
my
@edges
;
foreach
my
$term
(
@list
) {
my
$tnode
= _nodeify(
'T'
,
lc
(
$term
) );
my
$lcount
= (
ref
$words
eq
'HASH'
?
$words
->{
$term
} : 1 );
my
$gcount
= ++
$self
->{term_count}{
lc
(
$term
)};
my
$final_weight
= 1;
push
@edges
, [
$dnode
,
$tnode
,
$final_weight
,
$lcount
];
}
$self
->{reweight_flag} = 1;
__normalize( \
@edges
);
foreach
my
$e
(
@edges
) {
$self
->{neighbors}{
$e
->[0]}{
$e
->[1]} =
join
','
,
$e
->[2],
$e
->[3];
$self
->{neighbors}{
$e
->[1]}{
$e
->[0]} =
join
','
,
$e
->[2],
$e
->[3];
}
$self
->reweight_graph()
if
$self
->{auto_reweight};
return
1;
}
sub
add_file {
my
(
$self
,
$path
,
%params
) =
@_
;
croak
"Invalid file '$path' provided to add_file method."
unless
defined
$path
and -f
$path
;
my
$title
= (
exists
$params
{name} ?
$params
{name} :
$path
);
local
$/;
open
my
$fh
,
$path
or croak
"Unable to open $path: $!"
;
my
$content
= <
$fh
>;
my
$ref
;
if
(
exists
$params
{parse} ) {
croak
"code provided is not a reference"
unless
ref
$params
{parse};
croak
"code provided is not a subroutine"
unless
ref
$params
{parse} eq
'CODE'
;
$ref
=
$params
{parse}->(
$content
);
croak
"did not get an appropriate reference back after parsing"
unless
ref
$ref
and
ref
$ref
=~ /(HASH|ARRAY)/;
}
else
{
my
$code
=
sub
{
my
$txt
=
shift
;
$txt
=~ s/\W/ /g;
my
@toks
=
split
m/\s+/,
$txt
;
\
@toks
;
};
$ref
=
$code
->(
$content
);
}
return
unless
$ref
;
$self
->add(
$title
,
$ref
);
}
sub
bulk_add {
my
(
$self
,
%incoming_docs
) =
@_
;
{
local
$self
->{auto_reweight} = 0;
foreach
my
$doc
(
keys
%incoming_docs
) {
$self
->add(
$doc
,
$incoming_docs
{
$doc
});
}
}
$self
->reweight_graph()
if
$self
->{auto_reweight};
}
sub
degree {
scalar
keys
%{
$_
[0]->{neighbors}{
$_
[1]}} }
sub
delete
{
my
(
$self
,
$type
,
$name
) =
@_
;
croak
"Must provide a node type to delete() method"
unless
defined
$type
;
croak
"Invalid type $type passed to delete method. Must be one of [TD]"
unless
$type
=~ /^[TD]$/io;
croak
"Please provide a node name"
unless
defined
$name
;
return
unless
defined
$name
;
my
$node
= _nodeify(
$type
,
$name
);
my
$n
=
$self
->{neighbors};
croak
"Found a neighborless node $node"
unless
exists
$n
->{
$node
};
my
@terms
=
keys
%{
$n
->{
$node
} };
warn
"found "
,
scalar
@terms
,
" neighbors attached to $node\n"
if
$self
->{debug};
foreach
my
$t
(
@terms
) {
delete
$n
->{
$node
}{
$t
};
delete
$n
->{
$t
}{
$node
};
if
(
scalar
keys
%{
$n
->{
$t
} } == 0 ) {
warn
"\tdeleting orphaned node $t"
if
$self
->{debug};
my
(
$subtype
,
$name
) =
$t
=~ /^(.):(.*)$/;
delete
$n
->{
$t
};
}
}
delete
$n
->{
$node
};
$self
->check_consistency();
$self
->{reweight_flag} = 1;
$self
->reweight_graph
if
$self
->{auto_reweight};
1;
}
sub
has_doc {
my
(
$self
,
$doc
) =
@_
;
carp
"Received undefined value for has_doc"
unless
defined
$doc
;
my
$node
= _nodeify(
'D'
,
$doc
);
return
exists
$self
->{neighbors}{
$node
} ||
undef
;
}
sub
has_term {
my
(
$self
,
$term
) =
@_
;
carp
"Received undefined value for has_term"
unless
defined
$term
;
my
$node
= _nodeify(
'T'
,
$term
);
return
exists
$self
->{neighbors}{
$node
} ||
undef
;
}
sub
distance {
my
(
$self
,
$n1
,
$n2
,
$type
) =
@_
;
croak
unless
$type
;
$type
=
lc
(
$type
);
croak
unless
$type
=~ /^[dt]$/;
my
$key
= (
$type
eq
't'
?
'terms'
:
'documents'
);
my
@shared
=
$self
->intersection(
$key
=> [
$n1
,
$n2
] );
return
0
unless
@shared
;
my
$node1
= _nodeify(
$type
,
$n1
);
my
$node2
= _nodeify(
$type
,
$n2
);
my
$sum1
= 0;
my
$sum2
= 0;
foreach
my
$next
(
@shared
) {
my
(
undef
,
$lcount1
) =
split
m/,/,
$self
->{neighbors}{
$node1
}{
$next
};
my
(
undef
,
$lcount2
) =
split
m/,/,
$self
->{neighbors}{
$node2
}{
$next
};
my
$degree
=
$self
->degree(
$next
);
my
$elem1
=
$lcount1
/
$degree
;
$sum1
+=
$elem1
;
my
$elem2
=
$lcount2
/
$degree
;
$sum2
+=
$elem2
;
}
my
$final
= (
$sum1
/
$self
->degree(
$node1
)) + (
$sum2
/
$self
->degree(
$node2
));
return
$final
;
}
sub
distance_matrix {
my
(
$self
,
$type
,
$limit
) =
@_
;
croak
"Must provide type argument to distance_matrix()"
unless
defined
$type
;
croak
"must provide limit"
unless
$limit
;
my
@nodes
;
if
(
lc
(
$type
) eq
'd'
) {
@nodes
=
$self
->doc_list();
}
elsif
(
lc
(
$type
) eq
't'
) {
@nodes
=
$self
->term_list();
}
else
{
croak
"Unsupported type $type"
;
}
my
@ret
;
my
$count
= 0;
foreach
my
$from
(
@nodes
) {
warn
$from
,
" - $count\n"
;
$count
++;
my
$index
= -1;
my
@found
;
foreach
my
$to
(
@nodes
) {
$index
++;
next
if
$from
eq
$to
;
my
$dist
=
$self
->distance(
$from
,
$to
,
$type
);
push
@found
, [
$index
,
$dist
]
if
$dist
;
}
my
@sorted
=
sort
{
$b
->[1] <=>
$a
->[1] }
@found
;
my
@final
=
splice
(
@sorted
, 0,
$limit
);
push
@ret
,
join
" "
, (
map
{
join
' '
,
$_
->[0],
substr
(
$_
->[1], 0, 7) }
sort
{
$a
->[0] <=>
$b
->[0] }
@final
),
"\n"
;
}
return
join
"\n"
,
@ret
;
}
sub
intersection {
my
(
$self
,
%nodes
) =
@_
;
my
@nodes
;
if
(
exists
$nodes
{documents} ) {
push
@nodes
,
map
{ _nodeify(
'D'
,
$_
) } @{
$nodes
{documents}};
}
if
(
exists
$nodes
{terms} ) {
push
@nodes
,
map
{ _nodeify(
'T'
,
$_
) } @{
$nodes
{terms}};
}
my
%seen
;
foreach
my
$n
(
@nodes
) {
my
@neighbors
=
$self
->_neighbors(
$n
);
$seen
{
$_
}++
foreach
@neighbors
;
}
return
map
{ s/^[DT]://;
$_
}
grep
{
$seen
{
$_
} ==
scalar
@nodes
}
keys
%seen
;
}
sub
raw_search {
my
(
$self
,
@query
) =
@_
;
$self
->_clear();
foreach
(
@query
) {
$self
->_energize(
$_
,
$self
->{
'START_ENERGY'
});
}
my
$results_ref
=
$self
->_collect();
return
$results_ref
;
}
sub
reweight_graph {
my
(
$self
) =
@_
;
my
$n
=
$self
->{neighbors};
my
$doc_count
=
$self
->doc_count();
foreach
my
$node
(
keys
%{
$n
} ) {
next
unless
$node
=~ /^D:/o;
warn
"reweighting at node $node\n"
if
$self
->{debug} > 1;
my
@terms
=
keys
%{
$n
->{
$node
} };
my
@edges
;
foreach
my
$t
(
@terms
) {
my
$pair
=
$n
->{
$node
}{
$t
};
my
(
undef
,
$lcount
) =
split
/,/,
$pair
;
(
my
$term
=
$t
) =~ s/^T://;
croak
"did not receive a local count"
unless
$lcount
;
my
$weight
;
if
(
$self
->{use_global_weights} ) {
my
$gweight
=
log
(
$doc_count
/
$self
->doc_count(
$term
) ) + 1;
my
$lweight
=
log
(
$lcount
) + 1;
$weight
= (
$gweight
*
$lweight
);
}
else
{
$weight
=
log
(
$lcount
) + 1;
}
push
@edges
, [
$node
,
$t
,
$weight
,
$lcount
];
}
__normalize( \
@edges
);
foreach
my
$e
(
@edges
) {
my
$pair
=
join
','
,
$e
->[2],
$e
->[3];
$n
->{
$node
}{
$e
->[1]} =
$n
->{
$e
->[1]}{
$node
} =
$pair
;
}
}
$self
->{reweight_flag} = 0;
return
1;
}
sub
update {
my
(
$self
,
$id
,
$words
) =
@_
;
croak
"update not implemented in XS"
if
$self
->{xs};
croak
"Must provide a document identifier to update_document"
unless
defined
$id
;
my
$dnode
= _nodeify(
'D'
,
$id
);
return
unless
exists
$self
->{neighbors}{
$dnode
};
croak
"must provide a word list "
unless
defined
$words
and
ref
$words
and
(
ref
$words
eq
'HASH'
or
ref
$words
eq
'ARRAY'
);
my
$n
=
$self
->{neighbors}{
$dnode
};
my
@terms
=
keys
%{
$n
};
if
(
ref
$words
eq
'ARRAY'
) {
my
%words
;
$words
{
$_
}++
foreach
@$words
;
$words
= \
%words
;
}
local
$self
->{auto_reweight} = 0;
my
$must_reweight
= 0;
my
%seen
;
foreach
my
$term
(
keys
%{
$words
} ) {
my
$t
= _nodeify(
'T'
,
$term
);
if
(
exists
$n
->{
$t
} ){
my
$curr_val
=
$n
->{
$t
};
my
(
undef
,
$loc
) =
split
m/,/,
$curr_val
;
unless
(
$loc
==
$words
->{
$term
} ) {
$n
->{
$t
} =
join
','
, 1,
$words
->{
$term
};
$must_reweight
++;
}
}
else
{
$n
->{
$t
} =
$self
->{neighbors}{
$t
}{
$dnode
} =
join
','
, 1,
$words
->{
$term
};
$must_reweight
++;
}
$seen
{
$t
}++;
}
foreach
my
$t
(
@terms
) {
$must_reweight
++
unless
exists
$seen
{
$t
};
}
$self
->reweight_graph()
if
$must_reweight
;
return
$must_reweight
;
}
sub
doc_count {
my
(
$self
,
$term
) =
@_
;
if
(
defined
$term
) {
$term
= _nodeify(
'T'
,
$term
)
unless
$term
=~ /^T:/;
my
$node
=
$self
->{neighbors}{
$term
};
return
0
unless
defined
$node
;
return
scalar
keys
%{
$node
};
}
else
{
return
scalar
grep
/D:/,
keys
%{
$self
->{
'neighbors'
} };
}
}
sub
doc_list {
my
(
$self
,
$term
) =
@_
;
my
$t
;
if
(
defined
$term
and
$term
!~ /T:/) {
$t
= _nodeify(
'T'
,
$term
);
}
my
$hash
= (
defined
$term
?
$self
->{neighbors}{
$t
} :
$self
->{neighbors} );
grep
/^D:/,
keys
%{
$hash
};
}
sub
dump
{
my
(
$self
) =
@_
;
my
@docs
=
$self
->doc_list();
foreach
my
$d
(
@docs
) {
print
$self
->dump_node(
$d
);
}
}
sub
dump_node {
my
(
$self
,
$node
) =
@_
;
my
@lines
;
push
@lines
,
join
"\t"
,
"COUNT"
,
"WEIGHT"
,
"NEIGHBOR"
;
foreach
my
$n
(
keys
%{
$self
->{neighbors}{
$node
} } ) {
my
$v
=
$self
->{neighbors}{
$node
}{
$n
};
my
(
$weight
,
$count
) =
split
/,/,
$v
;
push
@lines
,
join
"\t"
,
$count
,
substr
(
$weight
, 0, 8 ),
$n
;
}
return
@lines
;
}
sub
dump_tdm {
my
(
$self
,
$file
) =
@_
;
my
$counter
= 0;
my
%lookup
;
$lookup
{
$_
} =
$counter
++
foreach
$self
->term_list;
my
@docs
=
$self
->doc_list;
my
$fh
;
if
(
defined
$file
) {
open
$fh
,
"> $file"
or croak
"Could not open TDM output file: $!"
;
}
else
{
*fh
=
*STDOUT
;
}
foreach
my
$doc
(
@docs
) {
my
$n
=
$self
->{neighbors}{
$doc
};
my
$row_count
=
scalar
keys
%{
$n
};
print
$fh
$row_count
;
foreach
my
$t
(
sort
keys
%{
$doc
} ) {
my
$index
=
$lookup
{
$t
};
my
(
$weight
,
undef
) =
split
m/,/,
$n
->{
$t
};
print
$fh
' '
,
$index
,
' '
,
$weight
;
}
print
$fh
"\n"
;
}
}
sub
near_neighbors {
my
(
$self
,
$name
,
$type
) =
@_
;
my
$node
= _nodeify(
$type
,
$name
);
my
$n
=
$self
->{neighbors}{
$node
};
my
%found
;
foreach
my
$next
(
keys
%{
$n
} ) {
foreach
my
$mynext
(
keys
%{
$self
->{neighbors}{
$next
} }){
$found
{
$mynext
}++;
}
}
delete
$found
{
$node
};
return
keys
%found
;
}
sub
term_count {
my
(
$self
,
$doc
) =
@_
;
if
(
defined
$doc
) {
my
$node
=
$self
->{neighbors}{ _nodeify(
'D'
,
$doc
) };
return
0
unless
defined
$node
;
return
scalar
keys
%{
$node
};
}
else
{
return
scalar
grep
/T:/,
keys
%{
$self
->{neighbors} };
}
}
sub
term_list {
my
(
$self
,
$doc
) =
@_
;
my
$node
= (
defined
$doc
?
$self
->{neighbors}{ _nodeify(
'D'
,
$doc
) } :
$self
->{neighbors}
);
grep
/^T:/,
keys
%{
$node
};
}
sub
word_count {
my
(
$self
,
$term
) =
@_
;
my
$n
=
$self
->{neighbors};
my
$count
= 0;
my
@terms
;
if
(
defined
$term
) {
push
@terms
,
$term
;
}
else
{
@terms
=
$self
->term_list();
}
foreach
my
$term
(
@terms
) {
$term
= _nodeify(
'T'
,
$term
)
unless
$term
=~/^T:/o;
foreach
my
$doc
(
keys
%{
$n
->{
$term
} } ) {
(
undef
,
my
$lcount
) =
split
/,/,
$n
->{
$term
}{
$doc
};
$count
+=
$lcount
;
}
}
return
$count
;
}
sub
search {
my
(
$self
,
@query
) =
@_
;
my
@nodes
= _nodeify(
'T'
,
@query
);
my
$results
=
$self
->raw_search(
@nodes
);
my
(
$docs
,
$words
) = _partition(
$results
);
return
(
$docs
,
$words
);
}
sub
simple_search {
my
(
$self
,
$query
) =
@_
;
my
@words
=
map
{ s/\W+//g;
lc
(
$_
) }
split
m/\s+/,
$query
;
my
@nodes
= _nodeify(
'T'
,
@words
);
my
$results
=
$self
->raw_search(
@nodes
);
my
(
$docs
,
$words
) = _partition(
$results
);
my
@sorted_docs
=
sort
{
$docs
->{
$b
} <=>
$docs
->{
$a
} }
keys
%{
$docs
};
return
@sorted_docs
;
}
sub
find_by_title {
my
(
$self
,
@titles
) =
@_
;
my
@found
;
my
@docs
=
$self
->doc_list();
my
$pattern
=
join
'|'
,
@titles
;
my
$match_me
=
qr/$pattern/
i;
foreach
my
$d
(
@docs
) {
push
@found
,
$d
if
$d
=~
$match_me
;
}
return
@found
;
}
sub
find_similar {
my
(
$self
,
@docs
) =
@_
;
my
@nodes
= _nodeify(
'D'
,
@docs
);
my
$results
=
$self
->raw_search(
@nodes
);
my
(
$docs
,
$words
) = _partition(
$results
);
return
(
$docs
,
$words
);
}
sub
merge {
my
(
$self
,
$type
,
$good
,
@bad
) =
@_
;
croak
"must provide a type argument to merge"
unless
defined
$type
;
croak
"Invalid type argument $type to merge [must be one of (D,T)]"
unless
$type
=~ /^[DT]/io;
my
$target
= _nodeify(
$type
,
$good
);
my
@sources
= _nodeify(
$type
,
@bad
);
my
$tnode
=
$self
->{neighbors}{
$target
};
foreach
my
$bad_node
(
@sources
) {
next
if
$bad_node
eq
$target
;
my
%neighbors
= %{
$self
->{neighbors}{
$bad_node
}};
foreach
my
$n
(
keys
%neighbors
) {
if
(
exists
$self
->{neighbors}{
$target
}{
$n
} ) {
my
$curr_val
=
$tnode
->{
$n
};
my
$aug_val
=
$self
->{neighbors}{
$bad_node
}{
$n
};
my
(
$w1
,
$c1
) =
split
m/,/,
$curr_val
;
my
(
$w2
,
$c2
) =
split
m/,/,
$aug_val
;
my
$new_count
=
$c1
+
$c2
;
$curr_val
=~ s/,\d+$/,
$new_count
/;
$tnode
->{
$n
} =
$curr_val
;
}
else
{
die
"sanity check failed for existence test"
if
exists
$self
->{neighbors}{
$target
}{
$n
};
my
$val
=
$self
->{neighbors}{
$bad_node
}{
$n
};
$self
->{neighbors}{
$n
}{
$target
} =
$val
;
$self
->{neighbors}{
$target
}{
$n
} =
$val
;
}
delete
$self
->{neighbors}{
$bad_node
}{
$n
};
delete
$self
->{neighbors}{
$n
}{
$bad_node
};
}
delete
$self
->{neighbors}{
$bad_node
};
}
}
sub
mixed_search {
my
(
$self
,
$incoming
) =
@_
;
croak
"must provide hash ref to mixed_search method"
unless
defined
$incoming
&&
ref
(
$incoming
) &&
ref
(
$incoming
) eq
'HASH'
;
my
$tref
=
$incoming
->{
'terms'
} || [];
my
$dref
=
$incoming
->{
'docs'
} || [];
my
@dnodes
= _nodeify(
'D'
, @{
$dref
} );
my
@tnodes
= _nodeify(
'T'
, @{
$tref
} );
my
$results
=
$self
->raw_search(
@dnodes
,
@tnodes
);
my
(
$docs
,
$words
) = _partition(
$results
);
return
(
$docs
,
$words
);
}
sub
store {
my
(
$self
,
@args
) =
@_
;
if
(
$self
->{
'xs'
} ) {
croak
"Cannot store object when running in XS mode."
;
}
else
{
$self
->SUPER::nstore(
@args
);
}
}
sub
_partition {
my
(
$e
) =
@_
;
my
(
$docs
,
$words
);
foreach
my
$k
(
sort
{
$e
->{
$b
} <=>
$e
->{
$a
} }
keys
%{
$e
} ) {
(
my
$name
=
$k
) =~ s/^[DT]://o;
$k
=~ /^D:/ ?
$docs
->{
$name
} =
$e
->{
$k
} :
$words
->{
$name
} =
$e
->{
$k
} ;
}
return
(
$docs
,
$words
);
}
sub
_neighbors {
my
(
$self
,
$node
) =
@_
;
return
unless
exists
$self
->{neighbors}{
$node
};
return
keys
%{
$self
->{neighbors}{
$node
} };
}
sub
_nodeify {
my
(
$prefix
,
@list
) =
@_
;
my
@nodes
;
foreach
my
$item
(
@list
) {
push
@nodes
,
uc
(
$prefix
).
':'
.
$item
;
}
(
wantarray
?
@nodes
:
$nodes
[0] );
}
sub
_read_tdm {
my
(
$self
,
$file
) =
@_
;
print
"Loading TDM...\n"
if
$self
->{
'debug'
} > 1;
croak
"File does not exist"
unless
-f
$file
;
open
my
$fh
,
$file
or croak
"Could not open $file: $!"
;
for
( 1..4 ){
my
$skip
= <
$fh
>;
}
my
%neighbors
;
my
$doc
= 0;
if
(
$self
->{
'xs'
} ) {
my
$map
=
$self
->{
'node_map'
};
while
(<
$fh
>) {
chomp
;
my
$dindex
=
$self
->_add_node(
"D:$doc"
, 2 );
my
(
$count
,
%vals
) =
split
;
while
(
my
(
$term
,
$edge
) =
each
%vals
) {
$self
->{
'term_count'
}{
$term
}++;
my
$tnode
=
"T:$term"
;
my
$tindex
= (
defined
$map
->{
$tnode
} ?
$map
->{
$tnode
} :
$self
->_add_node(
$tnode
, 1 )
);
$self
->{Graph}->set_edge(
$dindex
,
$tindex
,
$edge
);
}
$doc
++;
}
}
else
{
while
(<
$fh
>) {
chomp
;
my
$dnode
=
"D:$doc"
;
my
(
$count
,
%vals
) =
split
;
while
(
my
(
$term
,
$edge
) =
each
%vals
) {
$self
->{
'term_count'
}{
$term
}++;
my
$tnode
=
"T:$term"
;
$neighbors
{
$dnode
}{
$tnode
} =
$edge
.
',1'
;
$neighbors
{
$tnode
}{
$dnode
} =
$edge
.
',1'
;
}
$doc
++;
}
$self
->{
'neighbors'
} = \
%neighbors
;
}
print
"Loaded.\n"
if
$self
->{
'debug'
} > 1;
$self
->{
'from_TDM'
} = 1;
$self
->{
'doc_count'
} =
$doc
;
}
sub
_add_node {
my
(
$self
,
$node_name
,
$type
) =
@_
;
croak
"Must provide a type"
unless
$type
;
croak
"Must provide a node name"
unless
$node_name
;
croak
"This node already exists"
if
$self
->{
'node_map'
}{
$node_name
};
my
$new_id
=
$self
->{
'next_free_id'
}++;
$self
->{
'node_map'
}{
$node_name
} =
$new_id
;
$self
->{
'id_map'
}[
$new_id
] =
$node_name
;
$self
->{
'Graph'
}->add_node(
$new_id
,
$type
);
return
$new_id
;
}
sub
check_consistency {
my
(
$self
) =
@_
;
my
%inbound
;
my
%outbound
;
foreach
my
$node
(
keys
%{
$self
->{neighbors}} ) {
next
unless
$node
=~ /^[DT]:/;
$outbound
{
$node
} =
scalar
keys
%{
$self
->{neighbors}{
$node
}};
foreach
my
$neighbor
(
keys
%{
$self
->{neighbors}{
$node
} } ) {
$inbound
{
$neighbor
}++;
}
}
my
$in
=
scalar
keys
%inbound
;
my
$out
=
scalar
keys
%outbound
;
carp
"number of nodes with inbound links ($in) does not match number of nodes with outbound links ( $out )"
unless
scalar
keys
%inbound
==
scalar
keys
%outbound
;
foreach
my
$node
(
keys
%inbound
) {
$outbound
{
$node
} ||= 0;
carp
"$node has $inbound{$node} inbound links, $outbound{$node} outbound links\n"
unless
$inbound
{
$node
} ==
$outbound
{
$node
};
}
}
sub
have_edge {
my
(
$self
,
$node1
,
$node2
) =
@_
;
return
exists
$self
->{neighbors}{
$node1
}{
$node2
};
}
{
my
%visited
;
my
%component
;
my
$depth
;
sub
connected_components {
my
(
$self
) =
@_
;
%visited
= ();
%component
= ();
my
$n
=
$self
->{neighbors};
my
@node_list
=
keys
%{
$n
};
my
@components
;
while
(
@node_list
) {
my
$start
=
shift
@node_list
;
next
if
exists
$visited
{
$start
};
last
unless
$start
;
warn
"Visiting neighbors for $start\n"
;
visit_neighbors(
$n
,
$start
);
push
@components
, [
keys
%component
];
%component
= ();
}
warn
"Found "
,
scalar
@components
,
" connected components\n"
;
return
@components
;
}
sub
visit_neighbors {
my
(
$g
,
$l
) =
@_
;
return
if
$visited
{
$l
};
$depth
++;
$visited
{
$l
}++;
$component
{
$l
}++;
warn
$depth
,
" $l\n"
;
my
@neigh
=
keys
%{
$g
->{
$l
} };
foreach
my
$n
(
@neigh
) {
visit_neighbors(
$g
,
$n
);
}
$depth
--;
}
}
sub
_clear {
my
(
$self
) =
@_
;
$self
->{
'energy'
} =
undef
;
}
sub
_collect {
my
(
$self
) =
@_
;
my
$e
=
$self
->{
'energy'
};
my
$result
= {};
foreach
my
$k
(
keys
%{
$self
->{
'energy'
}} ) {
next
unless
$e
->{
$k
} >
$self
->{
'COLLECT_THRESHOLD'
};
$result
->{
$k
} =
$e
->{
$k
};
}
return
$result
;
}
sub
_energize {
my
(
$self
,
$node
,
$energy
) =
@_
;
return
unless
defined
$self
->{neighbors}{
$node
};
my
$orig
=
$self
->{energy}{
$node
} || 0;
$self
->{energy}->{
$node
} +=
$energy
;
return
if
(
$self
->{depth} ==
$self
->{max_depth} );
$self
->{depth}++;
if
(
$self
->{
'debug'
} > 1 ) {
print
' '
x
$self
->{
'depth'
};
print
"$node: energizing $orig + $energy\n"
;
}
my
$n
=
$self
->{neighbors};
my
$degree
=
scalar
keys
%{
$n
->{
$node
} };
if
(
$degree
== 0 ) {
carp
"WARNING: reached a node without neighbors: $node at search depth $self->{depth}\n"
;
$self
->{depth}--;
return
;
}
my
$subenergy
=
$energy
/ (
log
(
$degree
)+1);
if
(
$degree
== 1 and
$energy
<
$self
->{
'START_ENERGY'
} ) {
}
elsif
(
$subenergy
>
$self
->{ACTIVATE_THRESHOLD} ) {
print
' '
x
$self
->{
'depth'
},
"$node: propagating subenergy $subenergy to $degree neighbors\n"
if
$self
->{
'debug'
} > 1;
foreach
my
$neighbor
(
keys
%{
$n
->{
$node
} } ) {
my
$pair
=
$n
->{
$node
}{
$neighbor
};
my
(
$edge
,
undef
) =
split
/,/,
$pair
;
my
$weighted_energy
=
$subenergy
*
$edge
;
print
' '
x
$self
->{
'depth'
},
" edge $edge ($node, $neighbor)\n"
if
$self
->{
'debug'
} > 1;
$self
->_energize(
$neighbor
,
$weighted_energy
);
}
}
$self
->{
'depth'
}--;
return
1;
}
sub
__normalize {
my
(
$arr
) =
@_
;
croak
"Must provide array ref to __normalize"
unless
defined
$arr
and
ref
$arr
and
ref
$arr
eq
'ARRAY'
;
my
$sum
;
$sum
+=
$_
->[2]
foreach
@{
$arr
};
$_
->[2]/=
$sum
foreach
@{
$arr
};
return
1;
}
sub
DESTROY {
undef
$_
[0]->{Graph}
}
1;