NAME
Ufal::MorphoDiTa - bindings to the MorphoDiTa library http://ufal.mff.cuni.cz/morphodita.
DESCRIPTION
Ufal::MorphoDiTa
is a Perl binding to the MorphoDiTa library http://ufal.mff.cuni.cz/morphodita.
All classes can be imported into the current namespace using the :all
export tag.
The bindings are a straightforward conversion of the C++
bindings API. Vectors do not have a native Perl interface; see the Ufal::MorphoDiTa::Forms source for reference. Static methods and enumerations are available only through the module, not through an object instance.
Wrapped C++ API
The C++ API being wrapped follows. For an API reference of the original C++ API, see <http://ufal.mff.cuni.cz/morphodita/api-reference>.
Helper Structures
-----------------
// A vector of strings.
typedef vector<string> Forms;
// A form string together with a tag string.
struct TaggedForm {
string form;
string tag;
};
// A vector of TaggedForm entries.
typedef vector<TaggedForm> TaggedForms;
// A lemma string together with a tag string.
struct TaggedLemma {
string lemma;
string tag;
};
// A vector of TaggedLemma entries.
typedef vector<TaggedLemma> TaggedLemmas;
// A lemma string together with the TaggedForms stored for it.
struct TaggedLemmaForms {
string lemma;
TaggedForms forms;
};
// A vector of TaggedLemmaForms entries.
typedef vector<TaggedLemmaForms> TaggedLemmasForms;
// Start offset and length of a token. The run_tagger example passes both
// values directly to Perl's substr on the text given to Tokenizer::setText.
struct TokenRange {
size_t start;
size_t length;
};
// A vector of TokenRange entries.
typedef vector<TokenRange> TokenRanges;
Main Classes
------------
// Version number split into major, minor and patch components.
class Version {
public:
unsigned major;
unsigned minor;
unsigned patch;
// Static method returning a Version; presumably the version of the
// library in use -- confirm against the API reference.
static Version current();
};
// Tokenizer producing, sentence by sentence, the forms and token ranges of
// a given text.
class Tokenizer {
public:
// Supplies the text to work on; the run_tagger example calls this once per
// input block before iterating with nextSentence.
virtual void setText(const char* text);
// Fills forms and tokens for one sentence. The run_tagger example loops
// while this returns true, so false presumably means that no further
// sentence is available -- confirm against the API reference.
virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
// Static factory methods, one per tokenizer kind.
static Tokenizer* newVerticalTokenizer();
static Tokenizer* newCzechTokenizer();
static Tokenizer* newEnglishTokenizer();
static Tokenizer* newGenericTokenizer();
};
// Morphological dictionary offering analysis and generation.
class Morpho {
public:
// Loads a dictionary from the file fname. The run_morpho_cli example
// treats a false return value as a failure to load.
static Morpho* load(const char* fname);
// Values for the guesser argument of analyze and generate. The
// run_morpho_cli example also compares the return value of both methods
// against GUESSER.
enum { NO_GUESSER = 0, GUESSER = 1 };
// Analyzes form and stores the resulting tagged lemmas in lemmas.
virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
// Generates forms of lemma and stores them in forms. tag_wildcard
// presumably restricts which tags are generated -- confirm against the
// API reference.
virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
// Both methods below map a lemma string to a string.
// NOTE(review): the exact difference between the raw lemma and the lemma
// id is not shown here -- see the API reference.
virtual string rawLemma(const char* lemma) const;
virtual string lemmaId(const char* lemma) const;
// Returns a Tokenizer pointer; presumably a tokenizer suitable for this
// dictionary -- confirm against the API reference.
virtual Tokenizer* newTokenizer() const;
};
// Tagger; used by the run_tagger example for PoS tagging of tokenized text.
class Tagger {
public:
// Loads a tagger from the file fname. The run_tagger example treats a
// false return value as a failure to load.
static Tagger* load(const char* fname);
// Returns a const Morpho pointer; presumably the dictionary used by this
// tagger -- confirm against the API reference.
virtual const Morpho* getMorpho() const;
// Tags the sentence given in forms and stores the result in tags. The
// run_tagger example reads tags index-by-index in parallel with the token
// ranges of the same sentence.
virtual void tag(Forms& forms, TaggedLemmas& tags) const;
// Returns a Tokenizer pointer. The run_tagger example treats a false
// return value as "no tokenizer is defined for the supplied model".
Tokenizer* newTokenizer() const;
};
// Converter applied to tagged lemmas and tagged lemma forms.
// NOTE(review): judging by the factory names it translates between tagsets
// (identity, PDT to CoNLL-2009) -- confirm against the API reference.
class TagsetConverter {
public:
// Static factory methods, one per converter kind.
static TagsetConverter* newIdentityConverter();
static TagsetConverter* newPdtToConll2009Converter();
// Converts a single TaggedLemma. The argument is a non-const reference and
// nothing is returned, so the result is presumably written back into it.
virtual void convert(TaggedLemma& lemma) const;
// Same for a whole TaggedLemmas vector (the type filled by Morpho::analyze).
virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
// Same for a TaggedLemmasForms vector (the type filled by Morpho::generate).
virtual void convertGenerated(TaggedLemmasForms& forms) const;
};
Examples
run_morpho_cli
Simple example performing morphological analysis and generation.
use strict;
use open qw(:std :utf8);
use Ufal::MorphoDiTa qw(:all);
# The dictionary file must be given as the first command-line argument.
die "Usage: $0 dict_file\n" unless @ARGV >= 1;

print STDERR "Loading dictionary: ";
my $morpho = Morpho::load($ARGV[0])
    or die "Cannot load dictionary from file '$ARGV[0]'\n";
print STDERR "done\n";

# Drop the dictionary name so that <> reads the remaining files (or STDIN).
shift @ARGV;

# Result containers, reused for every input line.
my $lemmas       = TaggedLemmas->new();
my $lemmas_forms = TaggedLemmasForms->new();

# One request per line: a single field is analyzed, two fields
# (lemma and tag wildcard) trigger generation; anything else is ignored.
while (my $line = <>) {
    chomp $line;
    my @fields = split /\s+/, $line, -1;

    if (@fields == 1) {
        # Analyze
        my $status = $morpho->analyze($fields[0], $Morpho::GUESSER, $lemmas);
        my $prefix = $status == $Morpho::GUESSER ? "Guesser " : "";

        for my $idx (0 .. $lemmas->size() - 1) {
            my $entry = $lemmas->get($idx);
            printf "%sLemma: %s %s\n", $prefix, $entry->{lemma}, $entry->{tag};
        }
    }
    elsif (@fields == 2) {
        # Generate
        my $status = $morpho->generate($fields[0], $fields[1], $Morpho::GUESSER, $lemmas_forms);
        my $prefix = $status == $Morpho::GUESSER ? "Guesser " : "";

        for my $lemma_idx (0 .. $lemmas_forms->size() - 1) {
            my $entry = $lemmas_forms->get($lemma_idx);
            printf "%sLemma: %s\n", $prefix, $entry->{lemma};

            my $generated = $entry->{forms};
            for my $form_idx (0 .. $generated->size() - 1) {
                my $form = $generated->get($form_idx);
                printf " %s %s\n", $form->{form}, $form->{tag};
            }
        }
    }
}
run_tagger
Simple example performing tokenization and PoS tagging.
use strict;
use open qw(:std :utf8);
use Ufal::MorphoDiTa qw(:all);
# Escapes the characters &, <, > and " in the given text as the entities
# &amp;, &lt;, &gt; and &quot;, so the text can be embedded in the
# XML-like output (both as element content and inside double-quoted
# attribute values).
#
# Takes the text to escape; returns the escaped copy and leaves the
# argument untouched.
sub encode_entities($) {
    my ($text) = @_;

    # Lookup table instead of a ternary chain on $&: easier to read and
    # avoids the match-variable penalty $& carries on older perls.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
}
# The tagger model file must be given as the first command-line argument.
die "Usage: $0 tagger_file\n" unless @ARGV >= 1;

print STDERR "Loading tagger: ";
my $tagger = Tagger::load($ARGV[0])
    or die "Cannot load tagger from file '$ARGV[0]'\n";
print STDERR "done\n";

# Drop the model name so that <> reads the remaining files (or STDIN).
shift @ARGV;

# Containers reused for every sentence.
my $forms  = Forms->new();
my $lemmas = TaggedLemmas->new();
my $tokens = TokenRanges->new();

my $tokenizer = $tagger->newTokenizer()
    or die "No tokenizer is defined for the supplied model!";

# Process the input block by block; a block ends with an empty line or at
# end of input.
my $more_input = 1;
while ($more_input) {
    my $block = '';

    # Read block
    READ_BLOCK: while (1) {
        my $line = <>;
        $more_input = defined $line;
        last READ_BLOCK if !$more_input;

        $block .= $line;
        chomp $line;
        last READ_BLOCK if $line eq '';
    }

    # Tag
    $tokenizer->setText($block);

    # Offset in $block up to which the text has already been printed.
    my $printed_upto = 0;

    while ($tokenizer->nextSentence($forms, $tokens)) {
        $tagger->tag($forms, $lemmas);

        my $count = $lemmas->size();
        for my $idx (0 .. $count - 1) {
            my $lemma = $lemmas->get($idx);
            my $range = $tokens->get($idx);
            my $start = $range->{start};
            my $len   = $range->{length};

            # Text between the previous token and this one is copied
            # through (escaped) so that the original spacing is kept.
            my $gap            = encode_entities(substr $block, $printed_upto, $start - $printed_upto);
            my $sentence_open  = $idx == 0          ? "<sentence>"  : "";
            my $sentence_close = $idx + 1 == $count ? "</sentence>" : "";

            printf "%s%s<token lemma=\"%s\" tag=\"%s\">%s</token>%s",
                $gap,
                $sentence_open,
                encode_entities($lemma->{lemma}),
                encode_entities($lemma->{tag}),
                encode_entities(substr $block, $start, $len),
                $sentence_close;

            $printed_upto = $start + $len;
        }
    }

    # Whatever follows the last token of the block.
    print encode_entities(substr $block, $printed_upto);
}
AUTHORS
Milan Straka <straka@ufal.mff.cuni.cz>
Jana Straková <strakova@ufal.mff.cuni.cz>
COPYRIGHT AND LICENCE
Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.
MorphoDiTa is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
MorphoDiTa is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with MorphoDiTa. If not, see <http://www.gnu.org/licenses/>.