#!/usr/local/bin/perl #!/usr/bin/env perl #!/bin/sh ###################################################################### # unichars - list characters for one or more properties # # Tom Christiansen <tchrist@perl.com> # v1.0: Fri Oct 22 23:05:16 MDT 2010 # v1.2: Tue Oct 26 08:28:25 MDT 2010 # better 5.10 support and simpler evals # ################################################################ # # This is an sh wrapper to run the script under # whichever perl occurs first in your path. See # CHOICEs 1 and 2 below for alternate strategies. # The -x will throw off your line numbers otherwise. # ###################################################################### # # The next line is legal in both shell and perl, # but perl sees the if 0 so doesn't execute it. # eval 'exec perl -x -S $0 ${1+"$@"}' if 0; ### CHOICE 1: ###################################################################### ### MAKE FOLLOWING #! line THE TOP LINE, REPLACING /usr/local/bin ### ### with wherever you have a late enough version of Perl is ### ### installed. Will run under 5.10, but prefers 5.12 or better. ### ###################################################################### #!/usr/local/bin/perl # ^^^^^^^^^^^^^^ <=== CHANGE ME ### ###################################################################### ### CHOICE 2: ###################################################################### ### ALTERNATELY, the following #! line does the same thing as ### ### the tricksy sh eval exec line: it finds whichever Perl is ### ### first in your path. However, it works only on BSD systems ### ### (including MacOS), but breaks under Solaris and Linux. ### ###################################################################### #!/usr/bin/env perl -CLA ###################################################################### use strict; use warnings; # qw[ FATAL all ]; use charnames qw[ :full :short latin greek ]; use 5.10.1; use File::Basename qw[ basename ]; use Getopt::Long qw[ GetOptions ]; use File::Spec; use Carp; use Pod::Usage qw[ pod2usage ]; use Encode qw[ decode ]; use Unicode::UCD qw(charinfo casefold); use if $^V >= v5.11.3, qw[ feature unicode_strings ]; # don't need to import this sub utf::is_utf8($); ################################################################ sub ARGCOUNT; sub CF(); sub IT(); sub NAME(); sub NOT_REACHED; sub NUM(); sub am_running_perldb; sub check_options(); sub compile_filter(); sub deQ($); sub deQQ($); sub debug($); sub dequeue($$); sub display; sub fork_pager; sub genfuncs; sub is_runnable; sub locate_program; sub main(); sub panic; sub run_filter(); sub start_pager; sub stupid_evil_and_wrong; sub titlecase; sub underscore; ################################################################ our $VERSION = "1.4 (2011-04-11)"; $| = 1; # command buffering quick-feeds piped stdout $0 = basename($0); # shorten up warnings/errors our %Opt; our $CF; our $CI; our $Shown_Count = 0; main(); exit; ################################################################ sub IT() { $_ } sub NAME() { charnames::viacode(ord $_) || "" } sub genfuncs { for my $nf ( qw< NFD NFC NFKD NFKC FCD FCC > ) { no strict "refs"; *$nf = sub(_) { require Unicode::Normalize; "Unicode::Normalize::$nf"->($_); }; } for my $check ( qw< checkNFD checkNFC checkNFKD checkNFKC checkFCD checkFCC > ) { no strict "refs"; *$check = sub(_) { require Unicode::Normalize; my $stat = "Unicode::Normalize::$check"->($_); if (defined $stat) { return $stat || "0 but true"; } else { # trick to quiet zero-conversion under -w return 0 == 1; } } } for my $nf ( qw< Singleton Exclusion NonStDecomp Comp_Ex NFD_NO NFC_NO NFC_MAYBE NFKD_NO NFKC_NO NFKC_MAYBE > ) { no strict "refs"; *$nf = sub() { require Unicode::Normalize; "Unicode::Normalize::is$nf"->(ord); }; } for my $nl ( 1 .. 4 ) { no strict "refs"; *{ "UCA$nl" } = sub(_) { require Unicode::Collate; my $class = Unicode::Collate:: ; my @args = (level => $nl, variable => "Non-Ignorable"); if ($Opt{locale}) { require Unicode::Collate::Locale; $class = Unicode::Collate::Locale:: ; push @args, locale => $Opt{locale}; } state $coll = $class->new(@args); return $coll->getSortKey($_[0]); }; } no warnings "once"; *UCA = \&UCA1; } sub CF() { $CF = casefold(ord); return ($CF && $CF->{status}) || ""; } sub NUM() { require Unicode::UCD; Unicode::UCD->VERSION(0.32); my $n = Unicode::UCD::num($_); if (defined $n) { return $n || "0 but true"; } else { # trick to quiet zero-conversion under -w return 0 == 1; } } ################################################################ sub main() { for my $fh ( qw[STDOUT STDERR] ) { binmode($fh, ":utf8") || die "can't binmode($fh) to :utf8 encoding: $!"; } check_options(); genfuncs(); compile_filter(); $SIG{PIPE} = sub {exit 0}; run_filter(); if ($Opt{verbose}) { print STDERR "$0: $Shown_Count code points matched.\n"; } close(STDOUT) || warn "$0: close stdout failed: $!\n"; if ($Shown_Count) { exit 0; } else { exit 1; } } ################################################################ sub debug($) { return unless $Opt{debug}; my $msg = shift(); print STDERR "$msg\n"; } sub check_options() { Getopt::Long::Configure qw[ bundling auto_version ]; if (@ARGV == 0) { @ARGV = qw{ --all --category --script }; } GetOptions(\%Opt, qw[ help|h|? man|m debug|d unnamed|u bmp smp astral|all|a casefold|f decimal|d category|general|c|g combining|C script|s block|b bidi|B numeric|n locale|l=s nopager verbose ]) || pod2usage(2); pod2usage(0) if $Opt{help}; pod2usage(-exitstatus => 0, -verbose => 2) if $Opt{man}; @ARGV = (1) unless @ARGV; #$Opt{smp}++; #$Opt{bmp}++; pod2usage("$0: missing arguments") if @ARGV == 0; if (grep /\P{ASCII}/ => @ARGV) { @ARGV = map { decode("UTF-8", $_) } @ARGV; } } sub compile_filter() { my @criteria; for my $i ( 0 .. $#ARGV ) { my $snippet = $ARGV[$i]; $snippet =~ s/^\s+//; # args starting with a backslash or which are a bracketed # espression are interpreted as pattern matches if ($snippet =~ m{ ^ \\ | ^ \[ .* \] $ }x) { $snippet = "/$snippet/"; } my $test_compile = deQ <<'START_TEST'; |Q| use warnings qw[FATAL all]; |Q| my $ignore = START_TEST $test_compile .= deQQ(<<"END_TEST"); |QQ| sub { $snippet }; |QQ| |QQ| # so eval returns true |QQ| 1; |QQ| END_TEST # debug("test compile:\n$test_compile"); eval($test_compile) || die "$0: invalid criterion in '$snippet': $@\n"; $criteria[$i] = "do { $snippet }"; } my $real_code = deQ(<<'START_CODE') . "\t"; |Q| use warnings; |Q| #use warnings qw[FATAL all]; |Q| #no warnings qw[deprecated]; |Q| |Q| sub filter { |Q| |Q| debug(sprintf("testing code point %X", ord())); |Q| |Q| my $result = |Q| START_CODE $real_code .= join("\n &&\n\t" => @criteria) . deQ(<<'END_CODE'); |Q| |Q| ; |Q| |Q| debug("result of " . join(" && ",@criteria) . " is $result"); |Q| return $result; |Q| } |Q| |Q| # so eval returns true |Q| 1; END_CODE debug("CRITERIA are\n$real_code"); eval($real_code) || die; } sub run_filter() { my $first_codepoint = 0x00_0000; my $last_codepoint = 0x10_FFFF; unless ($Opt{astral} || $Opt{smp}) { $last_codepoint = 0x00_FFFF; } if ($Opt{bmp}) { $first_codepoint = 0x00_0000; $last_codepoint = 0x00_FFFF; } if ($Opt{smp}) { $first_codepoint = 0x01_0000 unless $Opt{bmp}; $last_codepoint = 0x01_FFFF; } if ($Opt{astral}) { $last_codepoint = 0x10_FFFF; } my $hex_width = length(sprintf("%x", $last_codepoint)); my $dec_width = length(sprintf("%d", $last_codepoint)); --$hex_width if $last_codepoint == 0x10_FFFF; debug(sprintf("checking codepoints %0${hex_width}X .. %0${hex_width}X", $first_codepoint, $last_codepoint)); CODEPOINT: for my $codepoint ( $first_codepoint .. $last_codepoint ) { # gaggy UTF-16 surrogates are invalid UTF-8 code points next if $codepoint >= 0xD800 && $codepoint <= 0xDFFF; # from utf8.c in perl src; must avoid fatals in 5.10 next if $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF; next if 0xFFFE == ($codepoint & 0xFFFE); # both FFFE and FFFF # debug("testing codepoint $codepoint"); # see "Unicode non-character %s is illegal for interchange" in perldiag(1) $_ = do { no warnings "utf8"; chr($codepoint) }; # fixes "the Unicode bug" unless (utf8::is_utf8($_)) { $_ = decode("iso-8859-1", $_); } unless ($Opt{unnamed}) { # won't find string names for any of these, so don't bother printing next if m{ \p{Unassigned} }x; next if m{ \p{PrivateUse} }x; next if m{ \p{Han} }x; next if m{ \p{InHangulSyllables} }x; } next unless &filter; $Shown_Count++; $CI = charinfo(ord); if (/[\pC\pZ]/) { display " ---- "; } else { display "\N{LEFT-TO-RIGHT OVERRIDE}" ;# if /[\p{BC=R}\p{BC=AL}\p{BC=AN}\p{BC=ON}]/; # display " " if /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/; display " "; display "\N{DOTTED CIRCLE}" if /\p{BC=NSM}/; # display " \N{LEFT-TO-RIGHT MARK}$_\N{LEFT-TO-RIGHT MARK} "; display "$_ "; # display " " unless /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/; display " " unless /[\p{EA=F}\p{EA=W}]/; } display sprintf "%${dec_width}d %0${hex_width}X ", ($codepoint) x 2 if $Opt{decimal}; display sprintf "U+%0${hex_width}X ", $codepoint; if ($Opt{category}) { display sprintf("GC=%2s ", $CI->{category}); } if ($Opt{casefold}) { display sprintf("CF=%1s ", CF()); } if ($Opt{bidi}) { display sprintf("BC=%-3s ", $CI->{bidi}); } if ($Opt{numeric}) { display sprintf("%6s=NV ", $CI->{numeric}); } if ($Opt{block}) { display sprintf("BLK=%-22s ", underscore($CI->{block})); } if ($Opt{script}) { display sprintf("SC=%-12s ", titlecase($CI->{script})); } if ($Opt{combining}) { display sprintf("CC=%-3s ", $CI->{combining}); } display sprintf "%s\n", charnames::viacode($codepoint) || "<unnamed codepoint>"; } } sub underscore { local $_ = shift(); y/ /_/; return $_; } sub titlecase { local $_ = shift(); s/[-_]\K(\p{Ll})/\u$1/g; return $_; } sub display { ARGCOUNT() unless @_ == 1; my $string = $_[0]; state $begun_pager; start_pager() unless $begun_pager++; print $string; } sub am_running_perldb { no warnings "once"; return keys(%DB::sub) > 0; } sub locate_program { ARGCOUNT() unless @_ == 1; my $program = $_[0]; return unless defined $program && length $program; if (File::Spec->file_name_is_absolute($program)) { return is_runnable($program); } my @path_dirs = File::Spec->path(); for my $dir (@path_dirs) { my $pathname = File::Spec->catfile($dir, $program); my $runpath; return $runpath if $runpath = is_runnable($pathname); } return; } sub is_runnable { ARGCOUNT() unless @_ == 1; my $fullpath = $_[0]; if (-x $fullpath && ! -d _) { return $fullpath; } elsif (stupid_evil_and_wrong() && $fullpath !~ /\.exe\z/i) { return is_runnable("$fullpath.exe") } else { return (); } NOT_REACHED(); } sub stupid_evil_and_wrong { state $stupid_evil_and_wrong = { map { $_ => 1 } qw<dos os2 netware symbian mswin32> }; return exists $stupid_evil_and_wrong->{ lc $^O }; } sub panic { confess "$0: INTERNAL ERROR: @_"; } sub NOT_REACHED { panic("NOT REACHED"); } sub ARGCOUNT { panic("wrong arguments to function"); } sub dequeue($$) { my($leader, $body) = @_; $body =~ s/^\s*\Q$leader\E ?//gm; return $body; } sub deQ($) { my $text = $_[0]; return dequeue q<|Q|>, $text; } sub deQQ($) { my $text = $_[0]; return dequeue qq<|QQ|>, $text; } sub start_pager { ARGCOUNT() unless @_ == 0; return if am_running_perldb(); return if $Opt{nopager}; return unless -t STDOUT; my $his_pager = locate_program($ENV{PAGER}) || locate_program("less") || locate_program("more") || locate_program("type") ; return unless $his_pager; my $am_less = ($his_pager =~ /\bless\b/i); local $ENV{LESSCHARSET} = "utf-8" if $am_less; my @pager_args = (); push (@pager_args, "-r") if $am_less; open(STDOUT, "|- :utf8", $his_pager, @pager_args); } sub fork_pager { if (-t STDOUT) { } } ################################################################ ################################################################ ################################################################ __END__ =encoding utf8 =head1 NAME unichars - list characters for one or more properties =head1 SYNOPSIS B<unichars> [I<options>] I<criterion> ... Each criterion is either a square-bracketed character class, a regex starting with a backslash, or an arbitrary Perl expression. See the EXAMPLES section below. OPTIONS: Selection Options: --bmp include the Basic Multilingual Plane (plane 0) [DEFAULT] --smp include the Supplementary Multilingual Plane (plane 1) --astral -a include planes above the BMP (planes 1-15) --unnamed -u include various unnamed characters (see DESCRIPTION) --locale -l specify the locale used for UCA functions Display Options: --category -c include the general category (GC=) --script -s include the script name (SC=) --block -b include the block name (BLK=) --bidi -B include the bidi class (BC=) --combining -C include the canonical combining class (CCC=) --numeric -n include the numeric value (NV=) --casefold -f include the casefold status --decimal -d include the decimal representation of the code point Miscellaneous Options: --version -v print version information and exit --help -h this message --man -m full manpage --debug -d show debugging of criteria and examined code point span Special Functions: $_ is the current code point ord is the current code point's ordinal NAME is charname::viacode(ord) NUM is Unicode::UCD::num(ord), not code point number CF is casefold->{status} NFD, NFC, NFKD, NFKC, FCD, FCC (normalization) UCA, UCA1, UCA2, UCA3, UCA4 (binary sort keys) Singleton, Exclusion, NonStDecomp, Comp_Ex checkNFD, checkNFC, checkNFKD, checkNFKC, checkFCD, checkFCC NFD_NO, NFC_NO, NFC_MAYBE, NFKD_NO, NFKC_NO, NFKC_MAYBE =head1 DESCRIPTION The I<unichars> program reports which characters match all selection criteria I<and>ed together. A criterion beginning with a square bracket or a backslash is assumed to be a regular expression. Anything else is a Perl expression such as you might pass to the Perl C<grep> function. The C<$_> variable is set to each successive Unicode character, and if all criteria match, that character is displayed. The numeric code point is therefore accessible as C<ord>. The special token C<NAME> is set to the full name of the current code point. Also, the tokens C<NFD>, C<NFKD>, C<NFC>, and C<NFKC> are set to the corresponding normalization form. By default only plane 0, the Basic Multilingual Plane, is examined. For plane 1, the Supplementary Multilingual Plane, use B<--smp>. To examine either, specify both B<--bmp> and B<--smp> options, or B<-bs>. To include I<all> valid code points, use the B<-a> or B<--astral> option. Unless the B<--unnamed> option is given, characters with any of the properties Unassigned, PrivateUse, Han, or InHangulSyllables will be excluded. =head1 EXAMPLES Could all non-ASCII digits: $ unichars -a '\d' '\P{ASCII}' | wc -l 401 Find all line terminators: $ unichars '\R' -- 10 0000A LINE FEED (LF) -- 11 0000B LINE TABULATION -- 12 0000C FORM FEED (FF) -- 13 0000D CARRIAGE RETURN (CR) -- 133 00085 NEXT LINE (NEL) -- 8232 02028 LINE SEPARATOR -- 8233 02029 PARAGRAPH SEPARATOR Find what is not C<\s> but is C<[\h\v]>: $ unichars '\S' '[\h\v]' -- 11 0000B LINE TABULATION Count how many code points in the Basic Multilingual Plane are I<not> marks but I<are> diacritics: $ unichars '\PM' '\p{Diacritic}' | wc -l 209 Count how many code points in the Basic Multilingual Plane I<are> marks but are I<not> diacritics: $ unichars '\pM' '\P{Diacritic}' | wc -l 750 Find all code points that are Letters, are in the Greek script, have differing canonical and compatibility decompositions, and whose name contains "SYMBOL": $ unichars -a '\pL' '\p{Greek}' 'NFD ne NFKD' 'NAME =~ /SYMBOL/' Ï 976 003D0 GREEK BETA SYMBOL Ï‘ 977 003D1 GREEK THETA SYMBOL Ï’ 978 003D2 GREEK UPSILON WITH HOOK SYMBOL Ï“ 979 003D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL Ï” 980 003D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL Ï• 981 003D5 GREEK PHI SYMBOL Ï– 982 003D6 GREEK PI SYMBOL Ï° 1008 003F0 GREEK KAPPA SYMBOL ϱ 1009 003F1 GREEK RHO SYMBOL ϲ 1010 003F2 GREEK LUNATE SIGMA SYMBOL Ï´ 1012 003F4 GREEK CAPITAL THETA SYMBOL ϵ 1013 003F5 GREEK LUNATE EPSILON SYMBOL Ϲ 1017 003F9 GREEK CAPITAL LUNATE SIGMA SYMBOL Find all numeric nondigits in the Latin script (within the BMP): $ unichars '\pN' '\D' '\p{Latin}' â… 8544 02160 ROMAN NUMERAL ONE â…¡ 8545 02161 ROMAN NUMERAL TWO â…¢ 8546 02162 ROMAN NUMERAL THREE â…£ 8547 02163 ROMAN NUMERAL FOUR â…¤ 8548 02164 ROMAN NUMERAL FIVE â…¥ 8549 02165 ROMAN NUMERAL SIX â…¦ 8550 02166 ROMAN NUMERAL SEVEN â…§ 8551 02167 ROMAN NUMERAL EIGHT (etc) Find the first three alphanumunderish code points with no assigned name: $ unichars -au '\w' '!length NAME' | head -3 〠13312 003400 <unnamed codepoint> ã 13313 003401 <unnamed codepoint> ã‚ 13314 003402 <unnamed codepoint> Count the combining characters in the Suuplemental Multilingual Plane: $ unichars -s '\pM' | wc -l 61 =head1 ENVIRONMENT If your environment smells like it's in a Unicode encoding, program arguments will be in UTF-8. =head1 BUGS The B<--man> option does not correctly process the page for UTF-8, because it does not pass the necessary B<--utf8> option to L<pod2man>. =head1 SEE ALSO L<uniprops>, L<uninames>, L<perluniprops>, L<perlunicode>, L<perlrecharclass>, L<perlre> =head1 AUTHOR Tom Christiansen <I<tchrist@perl.com>> =head1 COPYRIGHT AND LICENCE Copyright 2010 Tom Christiansen. This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself.