NAME

Ufal::MorphoDiTa - bindings to the MorphoDiTa library <http://ufal.mff.cuni.cz/morphodita>.

DESCRIPTION

Ufal::MorphoDiTa is a Perl binding to the MorphoDiTa library <http://ufal.mff.cuni.cz/morphodita>.

All classes can be imported into the current namespace using the ":all" export tag, i.e. use Ufal::MorphoDiTa qw(:all);

The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::MorphoDiTa::Forms source for reference. Static methods and enumerations are available only through the module, not through object instances.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see <http://ufal.mff.cuni.cz/morphodita/api-reference>.

Helper Structures
-----------------

  typedef vector<string> Forms;
  
  struct TaggedForm {
    string form;
    string tag;
  };
  typedef vector<TaggedForm> TaggedForms;
  
  struct TaggedLemma {
    string lemma;
    string tag;
  };
  typedef vector<TaggedLemma> TaggedLemmas;
  
  struct TaggedLemmaForms {
    string lemma;
    TaggedForms forms;
  };
  typedef vector<TaggedLemmaForms> TaggedLemmasForms;
  
  struct TokenRange {
    size_t start;
    size_t length;
  };
  typedef vector<TokenRange> TokenRanges;


Main Classes
------------

  class Version {
   public:
    unsigned major;
    unsigned minor;
    unsigned patch;
  
    static Version current();
  };
  
  class Tokenizer {
   public:
    virtual void setText(const char* text);
    virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
  
    static Tokenizer* newVerticalTokenizer();
    static Tokenizer* newCzechTokenizer();
    static Tokenizer* newEnglishTokenizer();
    static Tokenizer* newGenericTokenizer();
  };
  
  class Morpho {
   public:
    static Morpho* load(const char* fname);
  
    enum { NO_GUESSER = 0, GUESSER = 1 };
  
    virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
    virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
    virtual string rawLemma(const char* lemma) const;
    virtual string lemmaId(const char* lemma) const;
  
    virtual Tokenizer* newTokenizer() const;
  };
  
  class Tagger {
   public:
    static Tagger* load(const char* fname);
  
    virtual const Morpho* getMorpho() const;
  
    virtual void tag(Forms& forms, TaggedLemmas& tags) const;
  
    Tokenizer* newTokenizer() const;
  };
  
  class TagsetConverter {
   public:
    static TagsetConverter* newIdentityConverter();
    static TagsetConverter* newPdtToConll2009Converter();
  
    virtual void convert(TaggedLemma& lemma) const;
    virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
    virtual void convertGenerated(TaggedLemmasForms& forms) const;
  };

Examples

run_morpho_cli

Simple example performing morphological analysis and generation.

use strict;
use warnings;
use open qw(:std :utf8);

use Ufal::MorphoDiTa qw(:all);

# Usage: run_morpho_cli dict_file [input_file ...]
#
# Loads the dictionary given as the first argument and then reads lines from
# the remaining arguments (or standard input when there are none).
# A line that splits on whitespace into one field is analyzed; a line that
# splits into two fields (lemma, tag wildcard) is used for generation.
# Lines with any other number of fields are silently skipped.

@ARGV >= 1 or die "Usage: $0 dict_file\n";

print STDERR "Loading dictionary: ";
my $morpho = Morpho::load($ARGV[0]);
$morpho or die "Cannot load dictionary from file '$ARGV[0]'\n";
print STDERR "done\n";
shift @ARGV;    # drop the dictionary so that <> reads only the input files

# Output containers are created once and passed to every call.
my $lemmas = TaggedLemmas->new();
my $lemmas_forms = TaggedLemmasForms->new();
while (my $line = <>) {
  chomp $line;
  # The -1 limit keeps trailing empty fields, so trailing whitespace changes
  # the field count exactly as it does for leading whitespace.
  my @tokens = split /\s+/, $line, -1;
  if (@tokens == 1) { # Analyze
    my $result = $morpho->analyze($tokens[0], $Morpho::GUESSER, $lemmas);
    my $guesser = $result == $Morpho::GUESSER ? "Guesser " : "";

    for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
      my $lemma = $lemmas->get($i);
      printf "%sLemma: %s %s\n", $guesser, $lemma->{lemma}, $lemma->{tag};
    }
  } elsif (@tokens == 2) { # Generate
    my $result = $morpho->generate($tokens[0], $tokens[1], $Morpho::GUESSER, $lemmas_forms);
    my $guesser = $result == $Morpho::GUESSER ? "Guesser " : "";

    for (my ($i, $size) = (0, $lemmas_forms->size()); $i < $size; $i++) {
      my $lemma_forms = $lemmas_forms->get($i);
      printf "%sLemma: %s\n", $guesser, $lemma_forms->{lemma};

      # A separate index ($j) is used for the forms of this lemma so that the
      # outer lemma index is not shadowed.
      my $forms = $lemma_forms->{forms};
      for (my ($j, $forms_size) = (0, $forms->size()); $j < $forms_size; $j++) {
        my $form = $forms->get($j);
        printf "  %s %s\n", $form->{form}, $form->{tag};
      }
    }
  }
}

run_tagger

Simple example performing tokenization and PoS tagging.

use strict;
use warnings;
use open qw(:std :utf8);

use Ufal::MorphoDiTa qw(:all);

# Usage: run_tagger tagger_file [input_file ...]
#
# Loads the tagger given as the first argument, then reads the remaining
# arguments (or standard input when there are none) block by block, tokenizes
# and tags each block, and prints the text with <sentence> and <token> markup.

# Replacement for every character that has to be escaped in the output.
my %entity_for = (
  '&' => '&amp;',
  '<' => '&lt;',
  '>' => '&gt;',
  '"' => '&quot;',
);

# Return a copy of the given text with &, <, > and " replaced by entities.
# No prototype is used: a prototype changes how calls are parsed and does not
# validate arguments. A capture group is used instead of $&.
sub encode_entities {
  my ($text) = @_;
  $text =~ s/([&<>"])/$entity_for{$1}/g;
  return $text;
}

@ARGV >= 1 or die "Usage: $0 tagger_file\n";

print STDERR "Loading tagger: ";
my $tagger = Tagger::load($ARGV[0]);
$tagger or die "Cannot load tagger from file '$ARGV[0]'\n";
print STDERR "done\n";
shift @ARGV;    # drop the model so that <> reads only the input files

# Output containers are created once and passed to every call.
my $forms = Forms->new();
my $lemmas = TaggedLemmas->new();
my $tokens = TokenRanges->new();
my $tokenizer = $tagger->newTokenizer();
$tokenizer or die "No tokenizer is defined for the supplied model!";

for (my $not_eof = 1; $not_eof; ) {
  my $text = '';

  # Read block: collect lines up to and including the first empty line,
  # or up to the end of input (which also ends the outer loop).
  while (1) {
    my $line = <>;
    last unless ($not_eof = defined $line);
    $text .= $line;
    chomp($line);
    last unless length $line;
  }

  # Tag
  $tokenizer->setText($text);
  my $t = 0;    # offset in $text of the first character not printed yet
  while ($tokenizer->nextSentence($forms, $tokens)) {
    $tagger->tag($forms, $lemmas);

    for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
      my $lemma = $lemmas->get($i);
      my $token = $tokens->get($i);
      my ($token_start, $token_length) = ($token->{start}, $token->{length});

      # Print the text between the previous token and this one, then the
      # token itself wrapped in markup; the sentence element is opened before
      # the first token and closed after the last one.
      printf "%s%s<token lemma=\"%s\" tag=\"%s\">%s</token>%s",
        encode_entities(substr $text, $t, $token_start - $t),
        $i == 0 ? "<sentence>" : "",
        encode_entities($lemma->{lemma}),
        encode_entities($lemma->{tag}),
        encode_entities(substr $text, $token_start, $token_length),
        $i + 1 == $size ? "</sentence>" : "";
      $t = $token_start + $token_length;
    }
  }
  # Print whatever follows the last token of the block.
  print encode_entities(substr $text, $t);
}

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

Jana Straková <strakova@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

MorphoDiTa is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

MorphoDiTa is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with MorphoDiTa. If not, see <http://www.gnu.org/licenses/>.