############################################################################## # $URL: http://perlcritic.tigris.org/svn/perlcritic/trunk/Perl-Critic/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm $ # $Date: 2007-12-20 10:00:02 -0600 (Thu, 20 Dec 2007) $ # $Author: clonezone $ # $Revision: 2062 $ ############################################################################## package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes; use strict; use warnings; use Readonly; use English qw(-no_match_vars); use Carp; use Perl::Critic::Utils qw{ :booleans :severities }; use Perl::Critic::Utils::PPIRegexp qw{ parse_regexp get_match_string get_modifiers }; use base 'Perl::Critic::Policy'; our $VERSION = '1.081_004'; #----------------------------------------------------------------------------- Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks}; Readonly::Scalar my $EXPL => [261]; Readonly::Scalar my $DEFAULT_MAX_COMPLEXITY => 60; #----------------------------------------------------------------------------- sub supported_parameters { return qw(max_characters) } sub default_severity { return $SEVERITY_MEDIUM } sub default_themes { return qw( core pbp maintenance ) } sub applies_to { return qw(PPI::Token::Regexp::Match PPI::Token::Regexp::Substitute PPI::Token::QuoteLike::Regexp) } #----------------------------------------------------------------------------- sub initialize_if_enabled { my ($self, $config) = @_; #Set configuration if defined $self->{_max_characters} = defined $config->{max_characters} && $config->{max_characters} =~ m/(\d+)/xms && $1 > 0 ? $1 : $DEFAULT_MAX_COMPLEXITY; return $TRUE; } #----------------------------------------------------------------------------- sub violates { my ( $self, $elem, undef ) = @_; # Optimization: if its short enough now, parsing won't make it longer return if $self->{_max_characters} >= length get_match_string($elem); # If it has an "x" flag, it might be shorter after comment and whitespace removal my %modifiers = get_modifiers($elem); if ($modifiers{x}) { my $re = parse_regexp($elem); return if !$re; # syntax error, abort my $qr = $re->visual; # HACK: Remove any (?xism:...) wrapper we may have added in the parse process... $qr =~ s/\A [(][?][xism]+(?:-[xism]+)?: (.*) [)] \z/$1/xms; # Hack: don't count long \p{...} expressions against us so badly $qr =~ s/\\[pP][{]\w+[}]/\\p{...}/gmx; return if $self->{_max_characters} >= length $qr; } return $self->violation( $DESC, $EXPL, $elem ); } 1; __END__ #----------------------------------------------------------------------------- =pod =for stopwords BNF Tatsuhiko Miyagawa =head1 NAME Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes =head1 DESCRIPTION Big regexps are hard to read, perhaps even the hardest part of Perl. A good practice to write digestible chunks of regexp and put them together. This policy flags any regexp that is longer than C<N> characters, where C<N> is a configurable value that defaults to 60. If the regexp uses the C<x> flag, then the length is computed after parsing out any comments or whitespace. =head1 CASE STUDY As an example, look at the regexp used to match email addresses in L<Email::Valid::Loose> (tweaked lightly to wrap for POD) (?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\] \000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015 "]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[ \]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n \015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@, ;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\] )(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000 -\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*) which is constructed from the following code: my $esc = '\\\\'; my $period = '\.'; my $space = '\040'; my $open_br = '\['; my $close_br = '\]'; my $nonASCII = '\x80-\xff'; my $ctrl = '\000-\037'; my $cr_list = '\n\015'; my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # " my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/; my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>; my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# " my $atom = qq<$atom_char+(?!$atom_char)>; my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # " my $word = qq<(?:$atom|$quoted_str)>; my $domain_ref = $atom; my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>; my $sub_domain = qq<(?:$domain_ref|$domain_lit)>; my $domain = qq<$sub_domain(?:$period$sub_domain)*>; my $local_part = qq<$word(?:$word|$period)*>; # This part is modified $Addr_spec_re = qr<$local_part\@$domain>; If you read the code from bottom to top, it is quite readable. And, you can even see the one violation of RFC822 that Tatsuhiko Miyagawa deliberately put into Email::Valid::Loose to allow periods. Look for the C<|\.> in the upper regexp to see that same deviation. One could certainly argue that the top regexp could be re-written more legibly with C<m//x> and comments. But the bottom version is self-documenting and, for example, doesn't repeat C<\x80-\xff> 18 times. Furthermore, it's much easier to compare the second version against the source BNF grammar in RFC 822 to judge whether the implementation is sound even before running tests. =head1 CONFIGURATION This policy allows regexps up to C<N> characters long, where C<N> defaults to 60. You can override this to set it to a different number with the C<max_characters> setting. To do this, put entries in a F<.perlcriticrc> file like this: [RegularExpressions::ProhibitComplexRegexes] max_characters = 40 =head1 CREDITS Initial development of this policy was supported by a grant from the Perl Foundation. =head1 AUTHOR Chris Dolan <cdolan@cpan.org> =head1 COPYRIGHT Copyright (c) 2007 Chris Dolan. Many rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The full text of this license can be found in the LICENSE file included with this module =cut # Local Variables: # mode: cperl # cperl-indent-level: 4 # fill-column: 78 # indent-tabs-mode: nil # c-indentation-style: bsd # End: # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab :