lib/EBook/MOBI/MobiPerl/Palm/Doc.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
—
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
—
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
—
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
—
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
—
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
              # Palm::Doc.pm
#
# Palm::PDB helper for handling Palm Doc databases
#
# Copyright (C) 2004 Christophe Beauregard
#
# $Id: Doc.pm,v 1.19 2005/05/12 01:36:49 cpb Exp $
use strict;
package EBook::MOBI::MobiPerl::Palm::Doc;
use EBook::MOBI::MobiPerl::Palm::PDB;
use EBook::MOBI::MobiPerl::Palm::Raw();
use vars qw( $VERSION @ISA );
$VERSION = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
@ISA = qw( EBook::MOBI::MobiPerl::Palm::Raw );
use constant DOC_UNCOMPRESSED => scalar 1;
use constant DOC_COMPRESSED => scalar 2;
use constant DOC_RECSIZE => scalar 4096;
=head1 NAME
Palm::Doc - Handler for Palm Doc books
=head1 SYNOPSIS
use Palm::Doc;
=head1 DESCRIPTION
Helper for reading and writing Palm Doc books. The interface is based on
L<Palm::ZText> since it just makes sense. However, because of the nature
of these databases, record-level processing is just a Bad Idea. Use
the C<text> and C<textfile> calls rather than do direct access of the
C<@records> array.
=head1 EXAMPLES
Convert a text file to a .pdb:
        use Palm::Doc;
        my $doc = new Palm::Doc;
        $doc->textfile( $ARGV[0] );
        $doc->Write( $ARGV[0] . ".pdb" );
Convert an HTML file to a .prc:
        use HTML::TreeBuilder;
        use HTML::FormatText;
        use Palm::Doc;
        my $tree = HTML::TreeBuilder->new_from_file( $ARGV[0] );
        my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
        my $doc = new Palm::Doc;
        $doc->{attributes}{resource} = 1;
        $doc->text( $formatter->format( $tree ) );
        $doc->Write( $ARGV[0] . ".prc" );
=cut
#'
sub import
{
        &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
        &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
        &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
        &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
}
=head2 new
        $doc = new Palm::Doc;
Create a new Doc object. By default, it's not a resource database. Setting
C<$self->{attributes}{resource}> to C<1> before any manipulations will
cause it to become a resource database.
=cut
sub new
{
        my $class = shift;
        my $self = $class->SUPER::new(@_);
        $self->{'creator'} = 'REAd';
        $self->{'type'} = 'TEXt';
        $self->{attributes}{resource} = 0;
        $self->{appinfo} = undef;
        $self->{sort} = undef;
        $self->{records} = [];
        return $self;
}
# determine if the given (raw) record is a Doc header record and fill in the
# record with appropriate fields if it is.
sub _parse_headerrec($) {
        my $record = shift;
        return undef unless exists $record->{'data'};
        # Doc header is minimum of 16 bytes
        return undef if length $record->{'data'} < 16;
        my ($version,$spare,$ulen, $records, $recsize, $position)
                = unpack( 'n n N n n N', $record->{'data'} );
    # BEGIN: BORIS DAEPPEN JULY 2012
    # ***********************
    # This code just prints stuff... which is what I don't like
    # One should not print to stderr just for fun!
    # so this is why I put this out of work...
        #my $h = sprintf ("%x", $version);
        #print STDERR "Version: $version - $h - ";
        #if ($version == DOC_COMPRESSED) {
            #print STDERR " DOC_COMPRESSED\n";
        #}
        #if ($version == DOC_UNCOMPRESSED) {
            #print STDERR " DOC_UNCOMPRESSED\n";
        #}
        #if ($version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED) {
            #print STDERR " probably HUFFDIC_COMPRESSED - CANNOT BE DECOMPRESSED!!!\n";
        #}
    # ***********************
    # END: BORIS DAEPPEN JULY 2012
        # the header is followed by a list of record sizes. We don't use
        # this since we can guess the sizes pretty easily by looking at
        # the actual records.
        # According to the spec, $version is either 1 (uncompressed)
        # or 2 (compress), while spare is always zero. AportisDoc supposedly sets
        # spare to something else, so screw AportisDoc.
        return undef if $version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED;
        return undef if $spare != 0;
        $record->{'version'} = $version;
        $record->{'length'} = $ulen;
        $record->{'records'} = $records;
        $record->{'recsize'} = $recsize;
        $record->{'position'} = $position;
        return $record;
}
sub _compress_record($$) {
        my ($version,$in) = @_;
        return $in if $version == DOC_UNCOMPRESSED;
        my $out = '';
        my $lin = length $in;
        my $i = 0;
        while( $i < $lin ) {
                # See http://patb.dyndns.org/Programming/PilotDoc.htm for the code type
                # taxonomy.
                # Try type B compression first.
                # If the next 3 to 10 bytes are already in the compressed buffer, we can
                # encode them into a 2 byte sequence. Don't bother too close to the ends,
                # however... Makes the boundary conditions simpler.
                if( $i > 10 and $lin - $i > 10 ) {
                        my $chunk = '';
                        my $match = -1;
                        # the preamble is what'll be in the decoders output buffer.
                        my $preamble = substr( $in, 0, $i );
                        for( my $j = 10; $j >= 3; $j -- ) {
                                $chunk = substr( $in, $i, $j ); # grab next $j characters
                                $match = rindex( $preamble, $chunk );   # in the output?
                                # type B code has a 2047 byte sliding window, so matches have to be
                                # within that range to be useful
                                last if $match >= 0 and ($i - $match) <= 2047;
                                $match = -1;
                        }
                        my $n = length $chunk;
                        if( $match >= 0 and $n <= 10 and $n >= 3 ) {
                                my $m = $i - $match;
                                # first 2 bits are 10, next 11 are offset, next 3 are length-3
                                $out .= pack( "n", 0x8000 + (($m<<3)&0x3ff8) + ($n-3) );
                                $i += $n;
                                next;
                        }
                }
                 
                my $ch = substr( $in, $i ++, 1 );
                my $och = ord($ch);
                # Try type C compression.
                if( $i+1 < $lin and $ch eq ' ' ) {
                        my $nch = substr( $in, $i, 1 );
                        my $onch = ord($nch);
                        if( $onch >= 0x40 and $onch < 0x80 ) {
                                # space plus ASCII character compression
                                $out .= chr($onch ^ 0x80);
                                $i ++;
                                next;
                        }
                } 
                if( $och == 0 or ($och >= 9 and $och < 0x80) ) {
                        # pass through
                        $out .= $ch;
                } else {
                        # type A code. This is essentially an 'escape' like '\\' in strings.
                        # For efficiency, it's best to encode as long a sequence as
                        # possible with one copy. This might seem like it would cause us to miss
                        # out on a type B sequence, but in actuality keeping long binary strings
                        # together improves the likelyhood of a later type B sequence than 
                        # interspersing them with x01's.
                        my $next = substr($in,$i - 1);
                        if( $next =~ /([\x01-\x08\x80-\xff]{1,8})/o ) {
                                my $binseq = $1;
                                $out .= chr(length $binseq);
                                $out .= $binseq;
                                $i += length( $binseq ) - 1;    # first char, $ch, is already counted
                        }
                }
        }
        return $out;
}
# algorithm taken from makedoc7.cpp with reference to
# http://patb.dyndns.org/Programming/PilotDoc.htm and
# http://www.pyrite.org/doc_format.html
sub _decompress_record($$) {
        my ($version,$in) = @_;
        return $in if $version == DOC_UNCOMPRESSED;
        my $out = '';
        my $lin = length $in;
        my $i = 0;
        while( $i < $lin ) {
                my $ch = substr( $in, $i ++, 1 );
                my $och = ord($ch);
                if( $och >= 1 and $och <= 8 ) {
                        # copy this many bytes... basically a way to 'escape' data
                        $out .= substr( $in, $i, $och );
                        $i += $och;
                } elsif( $och < 0x80 ) {
                        # pass through 0, 9-0x7f
                        $out .= $ch;
                } elsif( $och >= 0xc0 ) {
                        # 0xc0-0xff are 'space' plus ASCII char
                        $out .= ' ';
                        $out .= chr($och ^ 0x80);
                } else {
                        # 0x80-0xbf is sequence from already decompressed buffer
                        my $nch = substr( $in, $i ++, 1 );
                        $och = ($och << 8) + ord($nch);
                        my $m = ($och & 0x3fff) >> 3;
                        my $n = ($och & 0x7) + 3;
                        # This isn't very perl-like, but a simple
                        # substr($out,$lo-$m,$n) doesn't work.
                        my $lo = length $out;
                        for( my $j = 0; $j < $n; $j ++, $lo ++ ) {
                                die "bad Doc compression" unless ($lo-$m) >= 0;
                                $out .= substr( $out, $lo-$m, 1 );
                        }
                }
        }
        return $out;
}
sub Write {
        my $self = shift;
        my $prc = $self->{attributes}{resource};
        my $recs = $prc ? $self->{'resources'} : $self->{'records'};
        my $header = $recs->[0];
        unless( defined _parse_headerrec($header) ) {
                die "@_: Doesn't appear to be a correct book...";
        }
        $self->SUPER::Write(@_);
}
=head2 text
        $text = $doc->text;
Return the contents of the Doc database.
        $text = $doc->text( @text );
Set the contents of the Doc book to the specified arguments. All the list arguments
will simply be concatenated together.
=cut
sub text {
        my $self = shift;
        my $body = '';
        my $prc = $self->{attributes}{resource};
        if( @_ > 0 ) {
                $body = join( '', @_ );
                my $version = DOC_COMPRESSED;
                $self->{'records'} = [];
                $self->{'resources'} = [];
                # first record is the header
                my $header = $prc ? $self->append_Resource() : $self->append_Record();
                $header->{'version'} = $version;
                $header->{'length'} = 0;
                $header->{'records'} = 0;
                $header->{'recsize'} = DOC_RECSIZE;
                # break the document into record-sized chunks
                for( my $i = 0; $i < length($body); $i += DOC_RECSIZE ) {
                        my $record = $prc ? $self->append_Resource : $self->append_Record;
                        my $chunk = substr($body,$i,DOC_RECSIZE);
                        $record->{'data'} = _compress_record( $version, $chunk );
                        $header->{'records'} ++;
                        $header->{'length'} += length $body;
                }
                $header->{'recsize'} = $header->{'length'}
                        if $header->{'length'} < DOC_RECSIZE;
                # pack up the header
                $header->{'data'} = pack( 'n xx N n n N',
                        $header->{'version'}, $header->{'length'},
                        $header->{'records'}, $header->{'recsize'}, 0 );
        } elsif( defined wantarray ) {
            my $recs = $prc ? $self->{'resources'} : $self->{'records'};
            my $header = $recs->[0];
            if( defined _parse_headerrec($header) ) {
                # a proper Doc file should be fine, but if it's not Doc
                # compression like some Mobi docs seem to be we want to
                # bail early. Otherwise we end up with a huge stream of
                # substr() errors and we _still_ don't get any content.
                eval {
                    sub min { return ($_[0]<$_[1]) ? $_[0] : $_[1] }
                    my $maxi = min($#$recs, $header->{'records'});
                    for( my $i = 1; $i <= $maxi; $i ++ ) {
                        my $data = $recs->[$i]->{'data'};
                        my $len = length($data);
                        my $overlap = "";
                        if ($self->{multibyteoverlap}) {
                            my $c = chop $data;
                            print STDERR "I:$i - $len - ", int($c), "\n";
                            my $n = $c & 7;
                            foreach (0..$n-1) {
                                $overlap = (chop $data) . $overlap;
                            }
                        }
                         
                        $body .= _decompress_record( $header->{'version'},
                                                     $data );
                        $body .= $overlap;
                    }
                };
                return undef if $@;
            }
        }
        return $body;
}
=head2 textfile
        $doc->textfile( "README.txt" );
Set the contents of the Doc to the contents of the file and sets the name of the PDB to
the specified filename.
=cut
sub textfile($$) {
        my ($self, $filename) = @_;
        open IN, "< $filename" or return undef;
        binmode IN;
        $self->text( '', <IN> );
        close IN;
        $self->{'name'} = $filename;
}
1;
__END__
=head1 BUGS
Bookmarks are unsupported. I've never had any use for them.
Output databases are always compressed and there's no option to
disable compression.  I consider this a feature, to be honest.
=head2 Note On Character Sets
L<Palm::Doc> doesn't do anything with character sets. This might be a bug,
depending on how you feel about this kind of thing, but the reality is that
we're generally converting between text and Doc files, neither of which are
real great at telling us what encoding we're supposed to use.
My understanding of PalmOS character sets is that Doc books should be
encoded in either Windows Code Page 1252 or, for Japanese, 932. Actually,
the PalmOS encoding is a small variation on those. In practice, ISO 8859-1
works okay for western languages which is real nice because L<Encode>
doesn't know about the PalmOS stuff. 
This gist of all this is that when you're creating a L<Palm::Doc>, you may
need to do something along the lines of:
        use Encode 'from_to';
        my $text = read_my_text();
        from_to( $text, $charset, 'iso-8859-1' ) unless $charset =~ /8859-1$/;
        my $doc = new Palm::Doc();
        $doc->text( $text );
And when you're reading a L<Palm::Doc> and you care about the character
set, you're pretty much going to have to guess the encoding and act
appropriately:
        use Encode 'decode';
        my $doc = new Palm::PDB;
        $doc->Load( $pdbname );
        my $text = decode("iso-8859-1", $doc->text());
=head1 AUTHOR
Christophe Beauregard E<lt>cpb@cpan.orgE<gt>
=head1 SEE ALSO
L<Palm::PDB>
L<Palm::ZText>
makedoc
L<http://www.pyrite.org/doc_format.html>
L<http://patb.dyndns.org/Programming/PilotDoc.htm>
L<Palm::PalmDoc> is another CPAN module for handling Doc databases,
but doesn't use L<Palm::PDB> and doesn't handle reading Docs.
L<http://www.df.lth.se/~triad/krad/recode/palm.html> for details on PalmOS
text encoding
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)