#!/usr/bin/perl -w
use CGI qw(:standard :cgi-lib);
use lib qw(.);
use GermaNet::Flat;
use GraphViz;
use File::Basename qw(basename dirname);
use Encode qw(encode decode encode_utf8 decode_utf8);
use HTTP::Status;
use File::Temp;
use JSON;
use utf8;
use strict;
use open qw(:std :utf8);
##==============================================================================
## constants
our $prog = basename($0);
our $label = "GermaNet"; ##-- top-level label
our $charset = 'utf-8'; ##-- this is all we support for now
our $max_depth = 2; ##-- maximum 'depth' parameter (0:none)
our $vars = {};
our %defaults =
(
'q'=>'GNROOT',
'f'=>'html',
'case' => 1,
'db' => 'gn',
'depth' => 1, ##-- TODO
);
##-- local overrides
if (-r "$0.rc") {
do "$0.rc";
die("$0: error reading rc file $0.rc: $@") if ($@);
}
##==============================================================================
## utils
BEGIN {
*htmlesc = \&escapeHTML;
}
my ($gn);
sub syn_id {
return ref($_[0]) ? $_[0]{synset} : $_[0];
}
sub syn_label {
my $syn = shift;
return join("\\n", @{ref($syn) ? $syn->{orth} : $gn->lex2orth($gn->syn2lex($syn))});
}
my (%nodes,%edges,$gv);
sub ensure_node {
my ($syn,%opts) = @_;
my $synid = syn_id($syn);
$gv->add_node(($nodes{$synid}=$synid),
label=>syn_label($syn),
URL=>"?s=$synid",
%opts,
) if (!exists $nodes{$synid});
}
sub ensure_edge {
my ($from,$to,%opts) = @_;
my $fromid = syn_id($from);
my $toid = syn_id($to);
if (exists $edges{"$fromid $toid"}) {
#print STDERR "edge exists: $fromid $toid\n";
return;
}
$edges{"$fromid $toid"} = "$fromid $toid";
$gv->add_edge($fromid,$toid,%opts);
return;
}
sub ensure_tree {
my ($syn,$subdepth,$supdepth, $opts,$subopts,$supopts) = @_;
ensure_node($syn, %{$opts//{}});
if (($subdepth//0) > 0) {
foreach my $sub (@{$syn->{hyponyms}//[]}) {
ensure_tree($sub, $subdepth-1,0, $subopts,$subopts,undef);
ensure_edge($syn, $sub);
}
}
if (($supdepth//0) > 0) {
foreach my $sup (@{$syn->{hyperonyms}//[]}) {
ensure_tree($sup, 0,$supdepth-1, $supopts,undef,$supopts);
ensure_edge($sup, $syn);
}
}
}
## \%info = synset_info($synsetId, $subdepth=0, $supdepth=0)
## + returned hash is of the form {synset=>$synsetId, orth=>\@orths, ...}
## + if $subdepth is greater than zero, hash also has hyponyms=>\@subs
## + if $supdepth is greater than zero, hash also has hyperonyms=>\@supers
sub synset_info {
my ($syn,$subdepth,$supdepth) = @_;
my $info = {synset=>$syn, orth=>[map {s/_/ /g; $_} @{$gn->lex2orth($gn->syn2lex($syn))}]};
if (($subdepth//0) > 0) {
$info->{hyponyms}=[];
foreach my $sub (@{$gn->hyponyms($syn)}) {
push(@{$info->{hyponyms}}, synset_info($sub,$subdepth-1,0));
}
}
if (($supdepth//0) > 0) {
$info->{hyperonyms}=[];
foreach my $sup (@{$gn->hyperonyms($syn)}) {
push(@{$info->{hyperonyms}}, synset_info($sup,0,$supdepth-1));
}
}
return $info;
}
## $tmpdata = gvdump($gv,$fmt)
## + workaround for broken UTF-8 support in GraphViz::as_* methods
sub gvdump {
my ($gv,$fmt) = @_;
my ($fh,$filename) = File::Temp::tempfile('gnvXXXXX',DIR=>'/tmp',SUFFIX=>".$fmt",UNLINK=>1);
$fh->close();
my $dot = $gv->as_debug;
open(DOT,'|-','dot',"-T$fmt","-o$filename")
or die("$prog: could not open pipe to dot: $!");
binmode(DOT,':utf8');
print DOT $dot
or die("$prog: failed to write to DOT pipe: $!");
close DOT
or die("$prog: failed to close DOT pipe: $!");
local $/=undef;
open(BUF,"<:raw", $filename)
or die("$prog: open failed for temp file '$filename': $!");
my $buf = <BUF>;
close BUF;
return $buf;
}
## $bool = is_robot()
## + check for common robots via user agent
## + found in logs:
## "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
## "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
sub is_robot {
my $ua = user_agent() // '';
return $ua =~ /Googlebot|YandexBot/ ? 1 : 0;
}
##======================================================================
## cgi parameters
##-- DEBUG
sub showq {
return;
my ($lab,$q) = @_;
$q //= '';
printf STDERR
("$0: $lab: q=$q \[utf8:%d,valid:%d,check:%d]\n",
(utf8::is_utf8($q) ? 1 : 0),
(utf8::valid($q) ? 1 : 0),
(Encode::is_utf8($q,1) ? 1 : 0),
);
}
##-- get params
if (param()) {
$vars = { Vars() }; ##-- copy tied Vars()-hash, otherwise utf8 flag gets handled wrong!
}
##-- rename vars
$vars->{q} //= (grep {$_} @$vars{qw(lemma l term t word w)})[0];
$vars->{s} //= (grep {$_} @$vars{qw(synset syn s)})[0];
$vars->{f} //= (grep {$_} @$vars{qw(format fmt f mode m)})[0];
$vars->{db} //= (grep {$_} @$vars{qw(database base db)})[0];
$vars->{case} //= (grep {$_} @$vars{qw(case_sensitive sensitive sens case cs)})[0];
$vars->{depth} //= (grep {$_} @$vars{qw(depth d)})[0];
showq('init', $vars->{q}//'');
charset($charset); ##-- initialize charset AFTER calling Vars(), otherwise fallback utf8::upgrade() won't work
##-- instantiate defaults
#use Data::Dumper; print STDERR Data::Dumper->Dump([\%defaults,$vars],['defaults','vars']);
$vars->{$_} = $defaults{$_} foreach (grep {!defined($vars->{$_})} keys %defaults);
$vars->{depth} = $max_depth if (($max_depth//0) > 0 && ($vars->{depth}//0) > $max_depth);
showq('default', $vars->{q});
##-- sanitize vars
foreach (keys %$vars) {
next if (!defined($vars->{$_}));
my $tmp = $vars->{$_};
$tmp =~ s/\x{0}//g;
eval {
##-- try to decode utf8 params e.g. "%C3%B6de" for "öde"
$tmp = decode_utf8($tmp,1) if (!utf8::is_utf8($tmp) && utf8::valid($tmp));
};
if ($@) {
##-- decoding failed; treat as bytes (e.g. "%F6de" for "öde")
utf8::upgrade($tmp);
undef $@;
}
$vars->{$_} = $tmp;
}
showq('sanitized', $vars->{q});
our $depth = $vars->{depth};
##==============================================================================
## MAIN
my %fmtxlate = ('text'=>'dot',
'jpg'=>'jpeg',
);
my %fmt2type = ('png'=>'image/png',
'gif'=>'image/gif',
'jpeg'=>'image/jpeg',
'dot'=>'text/plain',
'canon'=>'text/plain',
'debug'=>'text/plain',
'cmapx'=>'text/plain',
'imap'=>'text/html',
'svg'=>'image/svg+xml',
'eps'=>'application/postscript',
'ps'=>'application/postscript',
'json'=>'application/json',
);
eval {
die "$prog: you must specify either a query term (q=TERM) or a synset (s=SYNSET)!"
if (!$vars->{q} && !$vars->{s});
my $dir0 = dirname($0);
my $infile = (grep {-r $_} map {($_,"$_.db","$dir0/$_","$dir0/$_.db")} map {($_,"${label}/$_")} ($vars->{db}))[0];
die("$0: couldn't find input file for db=$vars->{db}") if (!$infile);
#print STDERR "$0: using database '$infile'\n";
$gn = GermaNet::Flat->load($infile)
or die("$prog: failed to load '$infile': $!");
##-- output format
my $fmt = $vars->{f};
$fmt = $fmtxlate{$fmt} if (exists($fmtxlate{$fmt}));
##-- basic properties
my ($syns,$qtitle);
if ($vars->{s}) {
##-- basic properties: synset query
$syns = [grep {exists($gn->{rel}{"syn2lex:$_"})} split(' ',$vars->{s})];
$qtitle = '{'.join(', ', @{$gn->auniq($gn->synset_terms($syns))}).'}';
} else {
##-- basic properties: lemma or synset query
my @terms = split(' ',$vars->{q});
@terms = $gn->luniq(map {($_,lc($_),ucfirst(lc($_)))} @terms) if (!$vars->{case});
$syns = $gn->get_synsets(\@terms) // [];
push(@$syns, grep {exists($gn->{rel}{"syn2lex:$_"})} @terms); ##-- allow synset names as 'lemma' queries
$qtitle = $vars->{q};
}
#print STDERR "syns = {", join(' ',@{$syns||[]}), "}\n";
#die("$prog: no synset(s) found for query \`$qtitle'") if (!$syns || !@$syns);
$syns //= [];
##-- header keys
my %versionHeader = ("-X-germanet-version"=>($gn->dbversion()||'unknown'));
my $info = [map {synset_info($_,$depth,$depth)} @$syns];
if ($fmt eq 'json') {
##-- json format: just dump info
binmode *STDOUT, ':raw';
print
(header(-type=>$fmt2type{json},%versionHeader),
to_json($info, {utf8=>1, pretty=>1, canonical=>1}),
);
exit 0;
}
##-- graphviz object
$gv = GraphViz->new(
directed=>1,
rankdir=>'LR',
#concentrate=>1,
name=>'gn',
node=>{shape=>'rectangle',fontname=>'arial',fontsize=>12,style=>'filled',fillcolor=>'white'},
edge=>{dir=>'back'},
);
foreach my $syn (@$info) {
ensure_node($syn, fillcolor=>'yellow',fontname=>'arial bold',shape=>'circle');
}
foreach my $syn (@$info) {
ensure_tree($syn,$depth,$depth, {},{fillcolor=>'cyan'},{fillcolor=>'magenta'});
}
##-- dump
#print $gv->as_debug; exit 0;
#print $gv->as_canon; exit 0;
##-- get content
my ($fmtsub);
if ($fmt eq 'html') {
##-- content: html
my ($imgfmt);
#$imgfmt = 'svg';
$imgfmt = 'png';
my $cmapx = gvdump($gv,'cmapx');
my $deptharg = ($depth > 1) ? "&d=$depth" : '';
if (1) {
##-- trim/rename titles
$cmapx =~ s/\s(?:title|alt)=\"[^\"]*\"//sg;
$cmapx =~ s/href=\"\?s=(\w+)\"/href="?s=$1$deptharg" title="$1"/g;
}
print
(header(-type=>'text/html',-charset=>$charset,%versionHeader),
start_html(
-title=>"$label Graph: $qtitle",
-meta=>{ROBOTS=>'NOINDEX, NOFOLLOW'},
),
h1("$label Graph: $qtitle"),
($syns && @$syns
? ("<img src=\"${prog}?fmt=${imgfmt}&s=".join('+',@{$syns||[]})."$deptharg\" usemap=\"#gn\" />\n",
$cmapx,
)
: ("no synset(s) found!")
),
##-- ugly hack
q{<hr/>
<span style="display:block; text-align:center; color:#666666;">
<a style="color:#666666;" href="/dstar/imprint">Imprint</a>
·
<a style="color:#666666;" href="/dstar/privacy">Privacy</a>
</span>
},
end_html,
);
}
elsif ($fmt eq 'debug') {
print header(-type=>$fmt2type{$fmt},-charset=>'utf-8'), eval "\$gv->as_${fmt}()";
}
elsif (exists($fmt2type{$fmt})) {
binmode *STDOUT, ':raw';
print
(header(-type=>($fmt2type{$fmt}//"application/octet-stream")),
gvdump($gv,$fmt),
);
}
else {
die "$prog: unknown format '$fmt'";
}
exit 0;
};
##----------------------------------------------------------------------
## catch errors
if ($@) {
print
(header(-status=>RC_INTERNAL_SERVER_ERROR),
start_html('Error'),
h1('Error'),
pre(escapeHTML($@)),
end_html);
exit 1;
}