use strict;
use warnings;
use Try::Tiny;                              # provides the try/catch blocks used below
use Catmandu::Util qw(data_at);
use Datahub::Factory;
use Datahub::Factory::Fixer::Condition;
use Data::Dumper qw(Dumper);
sub abstract { "Transport data from a data source to a Datahub instance" }
sub description { "Fetch data from a (local) source, transform it to LIDO using a fix, and push the result to a Datahub instance" }
sub opt_spec {
return (
[ "pipeline|p=s", "Location of the pipeline configuration file"]
);
}
sub validate_args {
my ($self, $opt, $args) = @_;
if (! $opt->{'pipeline'}) {
$self->usage_error('The --pipeline flag is required.');
}
my $pcfg = Datahub::Factory->pipeline($opt);
try {
$pcfg->check_object();
} catch {
$self->usage_error($_);
};
# No positional arguments are allowed, only options.
$self->usage_error("No args allowed") if @$args;
}
sub execute {
my ($self, $arguments, $args) = @_;
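    # Overall flow: load the pipeline configuration, instantiate the importer
    # and exporter plugins, then stream every record through the (possibly
    # conditional) fixer and hand it to the exporter.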
my $logger = Datahub::Factory->log;
my ($pcfg, $opt);
try {
$pcfg = Datahub::Factory->pipeline($arguments);
$opt = $pcfg->opt;
} catch {
$logger->fatal($_);
exit 1;
};
# Load modules
my ($import_module, $fix_module, $export_module);
try {
$import_module = Datahub::Factory->importer($opt->{importer})->new($opt->{oimport});
} catch {
$logger->fatal(sprintf('%s at [plugin_importer_%s]', $_, $opt->{'importer'}));
exit 1;
};
try {
$export_module = Datahub::Factory->exporter($opt->{exporter})->new($opt->{oexport});
} catch {
$logger->fatal(sprintf('%s at [plugin_exporter_%s]', $_, $opt->{'exporter'}));
exit 1;
};
# Perform import/fix/export
# Catmandu::Fix treats all warnings as fatal errors (this is good)
# so we can catch them with try-catch
# Note that errors here are _not_ fatal => we continue running
# until all records have been processed
my $counter = 0;
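    # Iterate over every record the importer produces; fix and export each
    # one, logging per-record failures without aborting the run.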
$import_module->importer->each(sub {
my $item = shift;
$counter++;
my $f = try {
try {
my $cond = Datahub::Factory::Fixer::Condition->new(
'options' => $opt,
'item' => $item
);
# Load the correct fixer here, we have the data here
$fix_module = $cond->fix_module;
} catch {
$logger->fatal(sprintf('%s at [plugin_fixer_%s]', $_, $opt->{'fixer'}));
exit 1;
};
# Execute the fix
$fix_module->fixer->fix($item);
} catch {
my $error_msg;
if ($_->can('message')) {
$error_msg = sprintf('Item %d (counted): could not execute fix: %s', $counter, $_->message);
} else {
$error_msg = sprintf('Item %d (counted): could not execute fix: %s', $counter, $_);
}
$logger->error($error_msg);
return 1;
};
if (defined($f) && $f == 1) {
# End the processing of this record, go to the next one.
return;
}
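        # Resolve the record identifier via the configured id_path (a
        # Fix-syntax path); it is used for reporting and logging below.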
my $item_id = data_at($opt->{'id_path'}, $item);
my $e = try {
$export_module->add($item);
} catch {
my $error_msg;
# $item_id can be undefined if it isn't set in the source, but this
# is only discovered when exporting (and not during fixing)
my $id_type = 'id';
if (!defined($item_id)) {
$item_id = $counter;
$id_type = 'counted';
}
if ($_->can('message')) {
$error_msg = sprintf('Item %s (%s): could not export item: %s', $item_id, $id_type, $_->message);
} else {
$error_msg = sprintf('Item %s (%s): could not export item: %s', $item_id, $id_type, $_);
}
$logger->error($error_msg);
return 1;
};
if (defined($e) && $e == 1) {
# End the processing of this record, go to the next one.
return;
}
$logger->info(sprintf('Item %s (id): exported.', $item_id));
});
}
1;
__END__
=head1 NAME
Datahub::Factory::Command::transport - Implements the 'transport' command.
=head1 DESCRIPTION
This command allows data managers to (a) fetch data from a (local) source, (b)
transform the data to LIDO using a fix and (c) upload the LIDO-transformed data
to a Datahub instance.
=head1 COMMAND LINE INTERFACE
=over
=item C<--pipeline>
Location of the pipeline configuration file.
=back
=head2 Pipeline configuration file
The I<pipeline configuration file> is in the L<INI format|http://search.cpan.org/~sherzodr/Config-Simple-4.59/Simple.pm#INI-FILE> and its location is
provided to the application using the C<--pipeline> switch.
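For example, assuming the command line client of this distribution is
installed as C<dhconveyor> (the exact script name may differ for your
installation):

  dhconveyor transport --pipeline /home/datahub/pipeline.ini

The short form C<-p> can be used instead of C<--pipeline>.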
The file is broadly divided in two parts: the first (shortest) part configures
the pipeline itself and sets the plugins to use for the I<import>, I<fix> and
I<export> actions. The second part sets options specific for the used plugins.
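As a minimal sketch (the plugin names are placeholders, to be replaced by the
plugins described below), the overall shape of such a file is:

  ; Part 1: the pipeline itself
  [Importer]
  plugin = ...
  [Fixer]
  plugin = ...
  [Exporter]
  plugin = ...
  ; Part 2: options specific to the selected plugins
  [plugin_importer_...]
  [plugin_fixer_...]
  [plugin_exporter_...]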
=head3 Pipeline configuration
This part has three sections: C<[Importer]>, C<[Fixer]> and C<[Exporter]>.
Every section has just one option: C<plugin>. Set this to the plugin you
want to use for every action.
All currently supported plugins are in the C<Importer> and C<Exporter> folders.
For the C<[Fixer]>, only the I<Fix> plugin is supported.
Supported I<Importer> plugins:
=over
=item L<TMS|Datahub::Factory::Importer::TMS>
=item L<Adlib|Datahub::Factory::Importer::Adlib>
=item L<OAI|Datahub::Factory::Importer::OAI>
=back
Supported I<Exporter> plugins:
=over
=item L<Datahub|Datahub::Factory::Exporter::Datahub>
=item L<LIDO|Datahub::Factory::Exporter::LIDO>
=item L<YAML|Datahub::Factory::Exporter::YAML>
=back
=head3 Plugin configuration
[Importer]
plugin = OAI
[plugin_importer_OAI]
[Fixer]
plugin = Fix
[plugin_fixer_Fix]
file_name = '/home/datahub/my.fix'
id_path = 'lidoRecID.0._'
[Exporter]
plugin = LIDO
[plugin_exporter_LIDO]
All plugins have their own configuration options in sections called
C<[plugin_type_name]> where C<type> can be I<importer>, I<exporter>
or I<fixer> and C<name> is the name of the plugin.
Each plugin defines its own options as parameters to the respective
plugin. Every parameter the plugin accepts is a valid item in its
configuration section.
If a plugin requires no options, you still need to create the (empty)
configuration section (e.g. C<[plugin_exporter_LIDO]> in the above
example).
=head4 Fixer plugin
[plugin_fixer_Fix]
condition = record.institution_name
fixers = MSK, GRO
id_path = record.id
[plugin_fixer_Fix]
file_name = /home/datahub/my.fix
id_path = record.id
The C<[plugin_fixer_Fix]> can directly load a fix file (via the option
C<file_name>) or can be configured to conditionally load a different
fix file to support multiple fix files for the same data stream (e.g.
when two institutions with different data models use the same API
endpoint). This is done by setting the C<condition> and C<fixers>
options.
The C<id_path> option contains the path (in Fix syntax) of the identifier of
each record in your data after the fix has been applied, but before it is
submitted to the I<Exporter>. It is used for reporting and logging.
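Internally, the command resolves this path with the C<data_at> function from
L<Catmandu::Util>. A minimal sketch (the record structure and identifier are
made up for illustration):

  use Catmandu::Util qw(data_at);

  my $item = {
      lidoRecID => [ { '_' => 'oai:datahub.be:12345' } ],
  };

  # id_path = 'lidoRecID.0._' walks the hash, the array index and the key
  my $item_id = data_at('lidoRecID.0._', $item);
  # $item_id is now 'oai:datahub.be:12345'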
=head4 Conditional fixers
[plugin_fixer_Fix]
condition = record.institution_name
fixers = MSK, GRO
id_path = 'lidoRecID.0._'
[plugin_fixer_GRO]
condition = 'Groeningemuseum'
file_name = '/home/datahub/gro.fix'
id_path = 'lidoRecID.0._'
[plugin_fixer_MSK]
condition = 'Museum voor Schone Kunsten Gent'
file_name = '/home/datahub/msk.fix'
id_path = 'lidoRecID.0._'
If you want to separate the data stream into multiple (smaller) streams with
a different fix file for each stream, you can do this by setting the appropriate
options in the C<[plugin_fixer_Fix]> block. Note that C<id_path> is still mandatory.
Set C<condition> to the Fix-compatible path in the original stream that holds
the condition you want to use to split the stream.
Provide a comma-separated list of fixer plugins in C<fixers>.
For every fixer plugin in C<fixers>, create a configuration block called
C<[plugin_fixer_name]> and provide the following options:
=over
=item C<condition>
The value that the C<condition> from C<[plugin_fixer_Fix]> must have for
the record to belong to this block.
=item C<file_name>
The location of the fix file that must be executed for every record in this
block.
=item C<id_path>
The path (in Fix syntax) of the identifier of every record in this block,
as described above. It is used for reporting and logging.
=back
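Conceptually, the dispatch works as in the sketch below. This is an
illustration of the logic only, not the actual implementation of
L<Datahub::Factory::Fixer::Condition>; the helper name and data layout are
hypothetical, while the option values are taken from the example above.

  use Catmandu::Util qw(data_at);

  # Hypothetical helper: pick the fix file to apply to a single record.
  #   $condition_path = 'record.institution_name'
  #   %fixer_blocks   = (
  #     GRO => { condition => 'Groeningemuseum',
  #              file_name => '/home/datahub/gro.fix' },
  #     MSK => { condition => 'Museum voor Schone Kunsten Gent',
  #              file_name => '/home/datahub/msk.fix' },
  #   );
  sub select_fix_file {
      my ($condition_path, $item, %fixer_blocks) = @_;
      # Value of the condition path in the (unfixed) record
      my $value = data_at($condition_path, $item);
      for my $block (values %fixer_blocks) {
          return $block->{file_name}
              if defined $value && $block->{condition} eq $value;
      }
      return;   # no block matched this record
  }

A record whose C<record.institution_name> equals C<Groeningemuseum> would thus
be fixed with F</home/datahub/gro.fix>.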
=head4 Example configuration file
[Importer]
plugin = Adlib
[Fixer]
plugin = Fix
[Exporter]
plugin = Datahub
[plugin_importer_Adlib]
file_name = '/tmp/adlib.xml'
data_path = 'recordList.record.*'
[plugin_fixer_Fix]
file_name = '/tmp/msk.fix'
id_path = 'record.id'
[plugin_exporter_Datahub]
datahub_format = LIDO
oauth_client_id = datahub
oauth_client_secret = datahub
oauth_username = datahub
oauth_password = datahub
=head1 AUTHORS
Pieter De Praetere <pieter@packed.be>
Matthias Vandermaesen <matthias.vandermaesen@vlaamsekunstcollectie.be>
=head1 COPYRIGHT
Copyright 2016 - PACKED vzw, Vlaamse Kunstcollectie vzw
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the terms of the GPLv3.
=cut