bin/pdf2ocr - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
—
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
              #!/usr/bin/perl -w
use strict;
use warnings;
use lib './lib';
use base 'LEOCHARRE::CLI';
use PDF::OCR::Thorough::Cached;
our $VERSION = sprintf "%d.%02d", q$Revision: 1.5 $ =~ /(\d+)/g;
# Main script body.
#
# Flow:
#   1. Load optional system-wide settings from /etc/pdf2ocr.conf.
#   2. Let command line flags override those settings.
#   3. Push the settings into PDF::OCR::Thorough::Cached package variables.
#   4. For every pdf path given as argument, either report on its cache file
#      (-n / -C) or print the text of the document to STDOUT.
#
# config(), gopts(), argv_aspaths(), man() and debug() are not defined in this
# file; presumably they are provided by LEOCHARRE::CLI - confirm against that
# module.

my $abs_conf = '/etc/pdf2ocr.conf';
my $conf     = {};
if ( -f $abs_conf ) {
   # Fall back to an empty hash so the lookups below are safe even if the
   # config reader hands back a false value.
   $conf = config($abs_conf) || {};
}

# Flags: -s, -n and -C are booleans, -a takes a value (the cache directory).
my $o = gopts('sna:C');

# Command line flags take precedence over the config file.
$conf->{CACHE_BY_SUM} = 1       if $o->{s};
$conf->{abs_cache}    = $o->{a} if $o->{a};

# Without any pdf argument there is nothing to do; show the manual instead.
# NOTE(review): man() is assumed to exit - confirm.
my $pdfs = argv_aspaths() || [];
@$pdfs or man();

# The cache settings are the same for every file, so apply them once, and do
# it BEFORE any object is constructed so that the first pdf is handled with
# the same settings as all the following ones.
$PDF::OCR::Thorough::Cached::ABS_CACHE_DIR = $conf->{abs_cache}
   if $conf->{abs_cache};
$PDF::OCR::Thorough::Cached::CACHE_BY_SUM = $conf->{CACHE_BY_SUM}
   if $conf->{CACHE_BY_SUM};

for my $abs_pdf (@$pdfs) {

   # Skip (do not abort) when an object cannot be created for this path.
   my $p = PDF::OCR::Thorough::Cached->new($abs_pdf) or next;

   debug("cache by sum? ".$PDF::OCR::Thorough::Cached::CACHE_BY_SUM);
   debug("abs cache dir? ".$PDF::OCR::Thorough::Cached::ABS_CACHE_DIR);

   my $abs_cache_file = $p->abs_cached;
   debug("cache file: $abs_cache_file");

   # -n: only report where the cache file would be; do no further work.
   if ( $o->{n} ) {
      print STDERR "$abs_cache_file\n";
      next;
   }

   # -C: only report whether a cache file exists: its path if so, else 0.
   if ( $o->{C} ) {
      print STDERR ( -f $abs_cache_file ? "$abs_cache_file\n" : "0\n");
      next;
   }

   # Default action: print the text of the document to STDOUT.
   my $text = $p->get_text;
   print $text if defined $text;
}
__END__
=pod
=head1 NAME
pdf2ocr - get the text content of the images within a pdf document
=head1 DESCRIPTION
Argument is a pdf file.
This script assumes that each page in the pdf is one 8.5x11 page made up of
ONE image; that is what the calculations are set up for.
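=head1 USAGE EXAMPLES
   pdf2ocr ./file.pdf > ./file.txt
   pdf2ocr -s -a /tmp/cache ./file.pdf
   pdf2ocr -n ./file.pdf
   pdf2ocr -C ./file.pdf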
=head1 OPTION FLAGS
    
        -h help
        -d debug
        -v version
   -s cache by sum on
   -n don't do anything, just show where cache file would be
   -C don't do anything, only show where the cache file is if it exists,
      otherwise print 0; basically checks whether the pdf is cached or not.
=head1 PARAMETERS
   -a abs cache dir
    
=head1 /etc/pdf2ocr.conf
   ---
   abs_cache: /tmp/cache
   CACHE_BY_SUM: 1
=head1 SEE ALSO
PDF::OCR - parent package
PDF::OCR::Thorough::Cached
LEOCHARRE::CLI
=head1 AUTHOR
Leo Charre leocharre at cpan dot org
=cut

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)