The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.

NAME

Ufal::UDPipe - bindings to UDPipe library http://ufal.mff.cuni.cz/udpipe.

SYNOPSIS

  use Ufal::UDPipe;

  my $model_file = '...';
  my $model = Ufal::UDPipe::Model::load($model_file) or die "Cannot load model from file '$model_file'\n";

  # Static methods (load, newOutputFormat) are called through the module,
  # not through an object instance.
  my $tokenizer = $model->newTokenizer($Ufal::UDPipe::Model::DEFAULT);
  my $conllu_output = Ufal::UDPipe::OutputFormat::newOutputFormat("conllu");
  # nextSentence(), tag(), parse() and writeSentence() all operate on a Sentence.
  my $sentence = Ufal::UDPipe::Sentence->new();

  # Read the whole input, then tokenize, tag and parse it sentence by sentence.
  $tokenizer->setText(join('', <>));
  while ($tokenizer->nextSentence($sentence)) {
    $model->tag($sentence, $Ufal::UDPipe::Model::DEFAULT);
    $model->parse($sentence, $Ufal::UDPipe::Model::DEFAULT);

    my $output = $conllu_output->writeSentence($sentence);
    print $output;
  }

REQUIREMENTS

To compile the module, a C++11 compiler is needed: g++ 4.7 or newer, clang 3.2 or newer, or Visual C++ 2015.

DESCRIPTION

Ufal::UDPipe is a Perl binding to UDPipe library http://ufal.mff.cuni.cz/udpipe.

The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::UDPipe::Words source for reference. Static methods and enumerations are available only through the module, not through an object instance.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see <http://ufal.mff.cuni.cz/udpipe/api-reference>.

  Helper Structures
  -----------------
  
    // Type of Word::children; presumably holds word ids -- TODO confirm.
    typedef vector<int> Children;
  
    // Type of Sentence::comments: a list of comment strings.
    typedef vector<string> Comments;
  
    // Error report object. The processing methods in this listing accept it
    // as an optional ProcessingError* argument.
    class ProcessingError {
    public:
      // The run_udpipe example treats a true result as failure and dies.
      bool occurred();
      // Error text; the run_udpipe example appends it to its die message.
      string message;
    };
  
    // A single word of a sentence; Sentence::words is a vector of these.
    class Word {
     public:
      int id;         // 0 is root, >0 is sentence word, <0 is undefined
      string form;    // form
      string lemma;   // lemma
      string upostag; // universal part-of-speech tag
      string xpostag; // language-specific part-of-speech tag
      string feats;   // list of morphological features
      int head;       // head, 0 is root, <0 is undefined
      string deprel;  // dependency relation to the head
      string deps;    // secondary dependencies
      string misc;    // miscellaneous information
  
      // Presumably the ids of the words whose head is this word -- TODO confirm.
      Children children;
  
      // Defaults: id = -1 (undefined, per the id comment above) and an empty form.
      Word(int id = -1, const string& form = string());
    };
    // Vector of Word; the type of Sentence::words.
    typedef vector<Word> Words;
  
    // A multiword token; Sentence::multiwordTokens is a vector of these.
    class MultiwordToken {
     public:
      // Presumably the ids of the first and last word covered -- TODO confirm.
      int idFirst, idLast;
      string form;
      string misc;
  
      // Defaults: both ids = -1, empty form and empty misc.
      MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string());
    };
    // Vector of MultiwordToken; the type of Sentence::multiwordTokens.
    typedef vector<MultiwordToken> MultiwordTokens;
  
    // A sentence: its words, multiword tokens and comments. This is the
    // object filled by InputFormat::nextSentence, modified by Model::tag and
    // Model::parse, and serialized by OutputFormat::writeSentence.
    class Sentence {
     public:
      Sentence();
  
      Words words;
      MultiwordTokens multiwordTokens;
      Comments comments;
      // Presumably the form used for the root word (id 0) -- TODO confirm.
      static const string rootForm;
  
      bool empty();
      void clear();
      // Returns a reference to the added word.
      virtual Word& addWord(const char* form);
      void setHead(int id, int head, const string& deprel);
      void unlinkAllWords();
    };
    // Vector of Sentence; the type of the train/heldout arguments of Trainer::train.
    typedef vector<Sentence> Sentences;
  
  
  Main Classes
  ------------
  
    // Reader that turns text into Sentence objects. Model::newTokenizer also
    // returns an InputFormat*.
    class InputFormat {
     public:
      // Supplies the text to read; the SYNOPSIS calls this once before iterating.
      virtual void setText(const char* text);
      // Reads the next sentence into s; the SYNOPSIS loops while this returns true.
      virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr);
  
      // Factory by format name (the accepted names are not listed here).
      static InputFormat* newInputFormat(const string& name);
      // Factories for specific formats, named after the format they create.
      static InputFormat* newConlluInputFormat();
      static InputFormat* newHorizontalInputFormat();
      static InputFormat* newVerticalInputFormat();
    };
  
    // Writer that serializes Sentence objects to strings.
    class OutputFormat {
     public:
      // Returns the serialized sentence; the SYNOPSIS prints the result directly.
      virtual string writeSentence(const Sentence& s) const;
  
      // Factory by format name; the SYNOPSIS passes "conllu".
      static OutputFormat* newOutputFormat(const string& name);
      // Factories for specific formats, named after the format they create.
      static OutputFormat* newConlluOutputFormat();
      static OutputFormat* newHorizontalOutputFormat();
      static OutputFormat* newVerticalOutputFormat();
    };
  
    class Model {
     public:
      static Model* load(const char* fname);
  
      virtual InputFormat* newTokenizer(const string& options) const;
      virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
      virtual bool parse(Sentence& s, const string& options, ProcessingError* error) const;
  
      static const string DEFAULT;
    };
  
    // Combines a model with input, tagger, parser and output settings so
    // that text can be processed by a single call.
    class Pipeline {
     public:
      // The run_udpipe example passes its input_format and output_format
      // command-line arguments as input/output, and DEFAULT for tagger and parser.
      Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output);
  
      // Setters matching the constructor arguments.
      void setModel(const Model* m);
      void setInput(const string& input);
      void setTagger(const string& tagger);
      void setParser(const string& parser);
      void setOutput(const string& output);
  
      // Returns the processed data as a string; the run_udpipe example checks
      // error->occurred() after each call.
      string process(const string& data, ProcessingError* error = nullptr) const;
  
      static const string DEFAULT;
      // Presumably disables the corresponding step -- TODO confirm.
      static const string NONE;
    };
  
    // Static-only interface for training.
    class Trainer {
     public:
  
      // Takes training and heldout sentences plus tokenizer, tagger and
      // parser option strings. Returns a string (presumably the serialized
      // trained model -- TODO confirm).
      static string train(const string& method, const Sentences& train, const Sentences& heldout,
                          const string& tokenizer, const string& tagger, const string& parser,
                          ProcessingError* error = nullptr);
  
      static const string DEFAULT;
      // Presumably skips the corresponding component -- TODO confirm.
      static const string NONE;
    };
  
    // Evaluates a model on given data using tokenizer, tagger and parser settings.
    class Evaluator {
     public:
      Evaluator(const Model* m, const string& tokenizer, const string& tagger, const string& parser);
  
      // Setters matching the constructor arguments.
      void setModel(const Model* m);
      void setTokenizer(const string& tokenizer);
      void setTagger(const string& tagger);
      void setParser(const string& parser);
  
      // Returns a string (presumably an evaluation report -- TODO confirm).
      string evaluate(const string& data, ProcessingError* error = nullptr) const;
  
      static const string DEFAULT;
      // Presumably disables the corresponding step -- TODO confirm.
      static const string NONE;
    };
  
    // Version information: numeric major/minor/patch components plus a
    // prerelease string.
    class Version {
     public:
      unsigned major;
      unsigned minor;
      unsigned patch;
      string prerelease;
  
      // Returns current version.
      static Version current();
    };

Examples

run_udpipe

Simple pipeline loading data (tokenizing on request), tagging, parsing and writing to specified output format.

  use strict;
  use open qw(:std :utf8);
  
  use Ufal::UDPipe;
  
  # Three positional arguments are required; anything left in @ARGV is read by <>.
  die "Usage: $0 input_format output_format model_file\n" if @ARGV < 3;
  my ($input, $output, $model_file) = splice(@ARGV, 0, 3);
  
  print STDERR "Loading model: ";
  my $model = Ufal::UDPipe::Model::load($model_file)
    or die "Cannot load model from file '$model_file'\n";
  print STDERR "done\n";
  
  # Use the default tagger and parser settings of the pipeline.
  my $default = $Ufal::UDPipe::Pipeline::DEFAULT;
  my $pipeline = Ufal::UDPipe::Pipeline->new($model, $input, $default, $default, $output);
  my $error = Ufal::UDPipe::ProcessingError->new();
  
  my $more_input = 1;
  while ($more_input) {
    # Read block: collect lines up to and including the next empty line, or
    # until end of input.
    my @lines;
    for (;;) {
      my $line = <>;
      $more_input = defined $line;
      last if !$more_input;
      push @lines, $line;
      chomp($line);
      last if $line eq '';
    }
    my $text = join('', @lines);
  
    # Process data
    my $processed = $pipeline->process($text, $error);
    if ($error->occurred()) {
      die "An error occurred in run_udpipe: " . $error->{message};
    }
    print $processed;
  }

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.