{
$PICA::PlainParser::VERSION
=
'0.585'
;
}
sub
new {
my
(
$class
,
%params
) =
@_
;
$class
=
ref
$class
||
$class
;
my
$self
=
bless
{
field_handler
=>
defined
$params
{Field} ?
$params
{Field} :
undef
,
record_handler
=>
defined
$params
{Record} ?
$params
{Record} :
undef
,
broken_field_handler
=>
defined
$params
{FieldError} ?
$params
{FieldError} :
undef
,
broken_record_handler
=>
defined
$params
{RecordError} ?
$params
{RecordError} :
undef
,
proceed
=>
$params
{Proceed} ?
$params
{Proceed} : 0,
limit
=> (
$params
{Limit} || 0) * 1,
offset
=> (
$params
{Offset} || 0) * 1,
record
=>
undef
,
broken
=>
undef
,
read_records
=> [],
'strict'
=>
$params
{strict} || 0,
filename
=>
""
,
fields
=> [],
read_counter
=> 0,
active
=> 0,
},
$class
;
return
$self
;
}
sub
parsefile {
my
(
$self
,
$file
) =
@_
;
if
(
ref
(
$file
) eq
'GLOB'
) {
$self
->{filehandle} =
$file
;
$self
->{filename} =
""
;
}
elsif
( UNIVERSAL::isa(
$file
,
'IO::Handle'
) ) {
$self
->{filehandle} =
$file
;
$self
->{filename} =
""
;
}
else
{
$self
->{filename} =
$file
;
my
$fh
=
$file
;
$fh
=
"zcat $fh |"
if
$fh
=~ /\.gz$/;
$fh
=
"unzip -p $fh |"
if
$fh
=~ /\.zip$/;
$self
->{filehandle} = IO::File->new(
$file
,
'<:utf8'
)
or croak(
"failed to open file $file"
);
}
PICA::Parser::enable_binmode_encoding(
$self
->{filehandle} );
if
( not
$self
->{proceed} ) {
$self
->{read_counter} = 0;
$self
->{read_records} = [];
}
$self
->{active} = 0;
$self
->{record} =
undef
;
my
$dumpformat
= 0;
my
$line
=
readline
(
$self
->{filehandle} );
if
(
$line
=~ /\x1E/) {
my
$EOL
= $/;
$/ =
chr
(0x1E);
my
$id
=
""
;
my
@linebuf
=
split
( /\x1E/,
$line
);
do
{
last
if
(
$self
->finished());
if
(
@linebuf
) {
$line
=
shift
@linebuf
;
if
(
defined
$line
and not
@linebuf
) {
$line
.=
readline
(
$self
->{filehandle} );
}
}
else
{
$line
=
readline
(
$self
->{filehandle} );
}
if
(
defined
$line
) {
$line
=~ /^\x1D?([^\s]*)/;
if
(PICA::Field::parse_pp_tag($1)) {
$self
->_parseline(
$line
);
}
else
{
if
(
"$id"
ne
"$1"
) {
$self
->_parseline(
""
);
}
$id
= $1;
}
}
}
while
(
defined
$line
);
$/ =
$EOL
;
}
else
{
while
(
defined
$line
and not
$self
->finished ) {
$self
->_parseline(
$line
);
$line
=
readline
(
$self
->{filehandle} );
};
}
$self
->handle_record()
unless
$self
->finished();
$self
;
}
sub
parsedata {
my
(
$self
,
$data
,
$additional
) =
@_
;
$self
->{active} = 0;
$self
->{record} =
undef
;
if
( !
$self
->{proceed} ) {
$self
->{read_counter} = 0;
$self
->{read_records} = [];
}
if
(
ref
(
$data
) eq
'CODE'
) {
my
$chunk
=
&$data
();
while
(
defined
$chunk
) {
$self
->_parsedata(
$chunk
);
$chunk
=
&$data
();
}
}
elsif
( UNIVERSAL::isa(
$data
,
"PICA::Record"
) ) {
foreach
(
$data
->fields ) {
$self
->_parseline(
$_
->string );
}
}
else
{
$self
->_parsedata(
$data
);
}
$self
->handle_record();
$self
;
}
sub
records {
my
$self
=
shift
;
return
@{
$self
->{read_records} };
}
sub
counter {
my
$self
=
shift
;
return
$self
->{read_counter};
}
sub
finished {
my
$self
=
shift
;
return
$self
->{limit} &&
$self
->counter() >=
$self
->{limit};
}
sub
_parsedata {
my
(
$self
,
$data
) =
@_
;
my
@lines
;
if
(
ref
(\
$data
) eq
'SCALAR'
) {
@lines
=
$data
eq
"\n"
? (
''
) :
split
"\n"
,
$data
;
}
elsif
(
ref
(
$data
) eq
'ARRAY'
) {
@lines
= @{
$data
};
}
else
{
croak(
"Got "
.
ref
(\
$data
) .
" when parsing PICA+ while expecting SCALAR or ARRAY"
);
}
foreach
my
$line
(
@lines
) {
$self
->_parseline(
$line
);
}
}
sub
_parseline {
my
(
$self
,
$line
) =
@_
;
chomp
$line
;
if
(
$line
eq
"\x1D"
or (not
$self
->{strict} and
$line
=~ /^\s*$|^
$self
->handle_record()
if
$self
->{active} and @{
$self
->{fields}};
}
else
{
$line
=~ s/^\x1D//;
my
$field
=
eval
{ PICA::Field->parse(
$line
); };
if
($@) {
$@ =~ s/ at .*\n//;
$field
=
$self
->broken_field( $@,
$line
);
}
elsif
(
$self
->{field_handler}) {
$field
=
$self
->{field_handler}(
$field
);
}
if
( UNIVERSAL::isa(
$field
,
'PICA::Field'
) ) {
push
(@{
$self
->{fields}},
$field
);
}
elsif
(
defined
$field
) {
$self
->{broken} =
$field
unless
defined
$self
->{broken};
}
}
$self
->{active} = 1;
}
sub
broken_field {
my
(
$self
,
$msg
,
$line
) =
@_
;
if
(
$self
->{broken_field_handler}) {
return
$self
->{broken_field_handler}(
$msg
,
$line
);
}
$msg
=
"$msg in line \"$line\""
if
defined
$line
;
print
STDERR
"$msg\n"
;
return
;
}
sub
broken_record {
my
(
$self
,
$msg
,
$record
) =
@_
;
if
(
$self
->{broken_record_handler}) {
return
$self
->{broken_record_handler}(
$msg
,
$record
);
}
return
if
UNIVERSAL::isa(
$record
,
'PICA::Record'
) &&
$record
->empty;
print
STDERR
"$msg\n"
if
defined
$msg
;
return
;
}
sub
handle_record {
my
$self
=
shift
;
$self
->{read_counter}++;
my
(
$record
,
$broken
);
if
(
$self
->{broken} ) {
$broken
=
$self
->{broken};
}
else
{
$record
= PICA::Record->new( @{
$self
->{fields}} );
}
$self
->{fields} = [];
$self
->{broken} =
undef
;
if
(not
defined
$broken
) {
if
(
$self
->{record_handler}) {
if
(UNIVERSAL::isa(
$self
->{record_handler},
'PICA::Writer'
) ) {
$self
->{record_handler}->
write
(
$record
);
}
else
{
$record
=
$self
->{record_handler}(
$record
);
$record
=
undef
if
$record
=~ /^-?\d+$/;
}
}
if
(
defined
$record
) {
if
( UNIVERSAL::isa(
$record
,
'PICA::Record'
) ) {
$broken
=
"empty record"
if
$record
->empty;
}
else
{
$broken
=
$record
;
}
}
}
if
(
defined
$broken
) {
$self
->broken_record(
$broken
,
$record
);
}
elsif
(
defined
$record
) {
push
@{
$self
->{read_records} },
$record
;
}
}
1;