lib/Text/KyTea.pm - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
—
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
              package Text::KyTea;
use 5.008_001;
use strict;
use warnings;
use Carp;
use Data::Recursive::Encode;
use Lingua::JA::Regular::Unicode qw/alnum_h2z space_h2z katakana_h2z/;
our $VERSION = '0.30';
require XSLoader;
XSLoader::load(__PACKAGE__, $VERSION);
# Returns a newly built hash reference that maps every option name accepted
# by new() to its default value.  A fresh hash (and a fresh 'notag' array)
# is created on each call, so callers may modify the result freely.
sub _options
{
    my %default_for = (
        # analysis options
        model   => '/usr/local/share/kytea/model.bin',
        h2z     => 1,
        nows    => 0,
        notags  => 0,
        notag   => [],
        nounk   => 0,
        unkbeam => 50,

        # I/O options
        tagmax  => 3,
        deftag  => 'UNK',
        unktag  => '',

        # advanced I/O options
        wordbound => ' ',
        tagbound  => '/',
        elembound => '&',
        unkbound  => ' ',
        skipbound => '?',
        nobound   => '-',
        hasbound  => '|',
    );

    return \%default_for;
}
# Constructor.  Options may be given either as a flat key/value list or as a
# single hash reference.  They are laid over the defaults from _options() and
# the merged settings are handed to _init_text_kytea(), whose return value is
# returned unchanged.
#
# Croaks when an option name is not one that _options() defines, and when
# the configured model path does not exist.
sub new
{
    my $class = shift;

    my %args;
    if (ref($_[0]) eq 'HASH') { %args = %{$_[0]}; }
    else                      { %args = @_;       }

    my $options = $class->_options;

    # Reject the first option name that has no default.
    my ($unknown) = grep { !exists $options->{$_} } keys %args;
    croak "Unknown option '$unknown'" if defined $unknown;

    # keys() and values() walk an unmodified hash in the same order.
    @{$options}{keys %args} = values %args;

    -e $options->{model} or croak 'model file is not found';

    return _init_text_kytea($class, $options);
}
# Converts the given string from hankaku to zenkaku by applying alnum_h2z,
# space_h2z and katakana_h2z, in that order, and returns the result.
sub _h2z
{
    my $converted = alnum_h2z($_[0]);
    $converted    = space_h2z($converted);
    return katakana_h2z($converted);
}
# parse($text)
#
# Parses $text via _parse() and returns the analysis as an array reference,
# decoded with Data::Recursive::Encode->decode_utf8.
#
# When h2z is enabled the text is converted with _h2z() before parsing, and
# the {surface} of every result is then replaced by the matching piece of
# the ORIGINAL text, so callers get back what they passed in.  The mapping
# assumes the returned surfaces, taken in order, cover the converted text
# character by character.
sub parse
{
    my ($self, $text) = @_;

    if (!$self->_is_h2z_enable)
    {
        return Data::Recursive::Encode->decode_utf8( $self->_parse($text) );
    }

    my ($converted, $sources) = _h2z_aligned($text);
    my $results = Data::Recursive::Encode->decode_utf8( $self->_parse($converted) );

    # changed char -> original char
    my $offset = 0;
    for my $result (@{$results})
    {
        # Measure the converted surface before overwriting it: the original
        # piece may have a different length.
        my $width = length $result->{surface};
        my $last  = $offset + $width - 1;

        # Never slice past the end of the map (would join undef values).
        $last = $#{$sources} if $last > $#{$sources};

        $result->{surface} = join('', @{$sources}[$offset .. $last]);
        $offset += $width;
    }

    return $results;
}

# Converts $text with _h2z() and returns ($converted, \@sources), where
# $sources->[$n] is the piece of the original text that character $n of
# $converted stands for.
#
# A plain per-index mapping is only valid while the conversion keeps the
# length.  It does not when a hankaku katakana and a following (semi-)voiced
# sound mark (U+FF9E / U+FF9F) are merged into a single zenkaku character;
# in that case the text is converted unit by unit so the mapping stays exact.
sub _h2z_aligned
{
    my ($text) = @_;

    my $converted = _h2z($text);
    my @chars     = split(//, $text);

    # Common case: one converted character per original character.
    if (!defined $converted || length($converted) == @chars)
    {
        return ($converted, \@chars);
    }

    # Length changed: rebuild the converted text from units of one character
    # plus an optional trailing sound mark, recording where each converted
    # character came from.
    $converted = '';
    my @sources;

    for my $unit ($text =~ /(.[\x{FF9E}\x{FF9F}]?)/gs)
    {
        my $piece      = _h2z($unit);
        my @unit_chars = split(//, $unit);

        $converted .= $piece;

        if (length($piece) == @unit_chars)
        {
            push @sources, @unit_chars;
        }
        elsif (length($piece) > 0)
        {
            # Merged unit: the first converted character carries the whole
            # original unit, any further ones carry nothing.
            push @sources, $unit, ('') x (length($piece) - 1);
        }
    }

    return ($converted, \@sources);
}
1;
__END__
=encoding utf8
=head1 NAME
Text::KyTea - Perl wrapper for KyTea
=for test_synopsis
my ($text, %config, $path);
=head1 SYNOPSIS
  use Text::KyTea;
  use utf8;
  my $kytea   = Text::KyTea->new(%config);
  my $results = $kytea->parse($text);
  for my $result (@{$results})
  {
      print $result->{surface};
      for my $tags (@{$result->{tags}})
      {
          print "\t";
          for my $tag (@{$tags})
          {
              print " ", $tag->{feature}, "/", $tag->{score};
          }
      }
      print "\n";
  }
=head1 DESCRIPTION
KyTea is a general toolkit developed for analyzing text,
with a focus on Japanese, Chinese and other languages
requiring word or morpheme segmentation.
This module works with KyTea version 0.3.2 and later.
It might not work with older versions of KyTea.
If you have changed the default install directory of KyTea,
please install Text::KyTea in interactive mode
(e.g., cpanm --interactive or cpanm -v).
For more information about KyTea, please see the "SEE ALSO" section.
=head1 METHODS
=over 4
=item new(%config)
Creates a new Text::KyTea instance.
  my $kytea = Text::KyTea->new(
      model   => 'model.bin', # default is '/usr/local/share/kytea/model.bin'
      h2z     => 0,           # default is 1 (enable)
      notag   => [1,2],       # default is []
      nounk   => 0,           # default is 0 (estimates the pronunciation of unknown words)
      unkbeam => 50,          # default is 50
      tagmax  => 3,           # default is 3
      deftag  => 'UNK',       # default is 'UNK'
      unktag  => '',          # default is ''
  );
=item new(h2z => 1)
Converts $text from hankaku to zenkaku before parsing it.
This option improves parsing accuracy with most model files.
=item read_model($path)
Reads the given model file.
The model file should be read by the new(model => $path) method.
Model files are available at http://www.phontron.com/kytea/model.html
=item parse($text)
Parses the given text via KyTea, and returns results of analysis.
The results are returned as an array reference.
=back
=head1 AUTHOR
pawa E<lt>pawapawa@cpan.orgE<gt>
=head1 SEE ALSO
http://www.phontron.com/kytea/
=head1 LICENSE
Copyright (C) 2012 pawa All rights reserved.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)