lib/SPVM/Regex.spvm - metacpan.org

package SPVM::Regex : precompile {
  use SPVM::Unicode (uchar);
  use SPVM::List;
  use SPVM::IntList;
  use SPVM::Hash;
  use SPVM::StringBuffer;
  use SPVM::Regex::Replacer;
  use SPVM::Regex::Pattern;
  use SPVM::Util (is_perl_word, isdigit, strtoi, substr, sliceb, copy_str, INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX, FLT_MIN, FLT_MAX, DBL_MIN, DBL_MAX);

  has patterns : SPVM::List;
  has string_start : int;
  has string_end : int;
  
  has caps : ro string[];
  
  has match_start : ro int;
  has match_length : ro int;
  
  has replace_count : ro int;
  
  private enum {
    STATUS_START,
    STATUS_CONTINUE,
    STATUS_QUANTIFIER_MIN,
    STATUS_QUANTIFIER_MAX,
    STATUS_CHAR_CLASS,
  }
  
  sub new : SPVM::Regex ($re_str : string) {
    my $self = new SPVM::Regex;
    
    $self->{patterns} = SPVM::List->new_len(0);
    
    $self->compile($re_str);
    
    $self->{caps} = new string[256];
    
    return $self;
  }

  private sub compile : void ($self : self, $re_str : string) {
    
    unless ($re_str) {
      die "Regex string must be defined";
    }
    
    my $re_str_length = length $re_str;
    
    if ($re_str_length == 0) {
      die "Regex string must have length";
    }
    
    my $before_is_charset = 0;
    my $pos = 0;
    my $patterns = $self->{patterns};
    
    my $quantifier_min_str = "";
    my $quantifier_max_str = "";
    
    my $status = STATUS_START();
    
    my $char_class_uchar_min_maxs_list : SPVM::IntList;
    my $char_class_negate = 0;
    
    my $before_is_char_class_range = 0;
    
    my $is_capture = 0;
    my $is_capture_start = 0;
    
    while ((my $uchar = uchar($re_str, \$pos)) >= 0) {
      if ($self->{string_end}) {
        die "\$ must be last of regex";
      }
      
      my $cur_is_char = 0;
      my $cur_uchar_min_maxs : int[];
      my $cur_negate = 0;
      
      switch ($uchar) {
        case '\\': {
          my $next_uchar = uchar($re_str, \$pos);
          if ($uchar < 0) {
            die "Invalid regex. Last is \\ ";
          }
          
          switch ($next_uchar) {
            case 'd': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)'0', '9'];
              break;
            }
            case 'D': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)'0', '9'];
              $cur_negate = 1;
              break;
            }
            case 'w': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)'0', '9', 'a', 'z', 'A', 'Z', '_', '_'];
              break;
            }
            case 'W': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)'0', '9', 'a', 'z', 'A', 'Z', '_', '_'];
              $cur_negate = 1;
              break;
            }
            case 's': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)' ', ' ', '\t', '\t', '\f', '\f', '\r', '\r', '\n', '\n'];
              break;
            }
            case 'S': {
              $cur_is_char = 1;
              $cur_uchar_min_maxs = [(int)' ', ' ', '\t', '\t', '\f', '\f', '\r', '\r', '\n', '\n'];
              $cur_negate = 1;
              break;
            }
            default: {
              if (is_perl_word($next_uchar)) {
                die "Invalid regex escape character";
              }
              # \: \- etc
              else {
                my $uchar_min_maxs = [(int)$next_uchar, $next_uchar];
                if ($status == STATUS_CHAR_CLASS()) {
                  $char_class_uchar_min_maxs_list->push($uchar_min_maxs->[0]);
                  $char_class_uchar_min_maxs_list->push($uchar_min_maxs->[1]);
                }
                else {
                  $cur_is_char = 1;
                  $cur_uchar_min_maxs = $uchar_min_maxs;
                }
              }
            }
          }
          
          break;
        }
        default: {
          if ($status == STATUS_CHAR_CLASS()) {
            if ($uchar == ']') {
              $status = STATUS_CONTINUE();
              $cur_uchar_min_maxs = $char_class_uchar_min_maxs_list->to_array;
              if (@$cur_uchar_min_maxs == 0) {
                die "Emtpy character class";
              }
              for (my $i = 0; $i < @$cur_uchar_min_maxs; $i += 2) {
                my $min = $cur_uchar_min_maxs->[$i];
                my $max = $cur_uchar_min_maxs->[$i + 1];
                if ($max < $min) {
                  die "Invalid range character class";
                }
              }
              
              $char_class_uchar_min_maxs_list = SPVM::IntList->new_len(0);
              $cur_is_char = 1;
              $cur_negate = $char_class_negate;
            }
            elsif ($uchar == '-') {
              if ($before_is_char_class_range) {
                die "- must not be repeat";
              }
              else {
                $before_is_char_class_range = 1;
              }
            }
            else {
              if ($uchar == '^' && $char_class_uchar_min_maxs_list->length == 0) {
                $char_class_negate = 1;
              }
              else {
                if ($before_is_char_class_range) {
                  if ($char_class_uchar_min_maxs_list->length == 0) {
                    die "- start not found";
                  }
                  else {
                    $char_class_uchar_min_maxs_list->set($char_class_uchar_min_maxs_list->length - 1, $uchar);
                    $before_is_char_class_range = 0;
                  }
                }
                else {
                  $char_class_uchar_min_maxs_list->push($uchar);
                  $char_class_uchar_min_maxs_list->push($uchar);
                }
              }
            }
          }
          else {
            switch ($uchar) {
              case '^': {
                if ($status == STATUS_CHAR_CLASS()) {
                  die "Unexpected";
                }
                elsif ($status != STATUS_START()) {
                  die "Regex compile error";
                }
                
                $self->{string_start} = 1;
                
                break;
              }
              case '.': {
                die ". is not supported";
                break;
              }
              case '$': {
                $self->{string_end} = 1;
                break;
              }
              case '|': {
                die "| is not supported";
                break;
              }
              case '(': {
                if ($is_capture) {
                  die "Invalid capture start";
                }
                else {
                  $is_capture = 1;
                  $is_capture_start = 1;
                }
                break;
              }
              case ')': {
                if ($is_capture) {
                  $is_capture = 0;
                  my $before_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                  $before_pattern->{capture_end} = 1;
                }
                else {
                  die "Invalid capture end";
                }
                break;
              }
              case '[': {
                $status = STATUS_CHAR_CLASS();
                $char_class_uchar_min_maxs_list = SPVM::IntList->new_len(0);
                $before_is_char_class_range = 0;
                break;
              }
              case ']': {
                die "] must be end of character class";
                break;
              }
              case '*': {
                unless ($before_is_charset) {
                  die "Regex compile error";
                }
                my $before_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                $before_pattern->{min_repeat} = 0;
                $before_pattern->{max_repeat} = INT32_MAX();
                $before_is_charset = 0;
                break;
              }
              case '+': {
                unless ($before_is_charset) {
                  die "Regex compile error";
                }
                my $before_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                $before_pattern->{min_repeat} = 1;
                $before_pattern->{max_repeat} = INT32_MAX();
                $before_is_charset = 0;
                break;
              }
              case '?': {
                unless ($before_is_charset) {
                  die "Regex compile error";
                }
                my $before_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                $before_pattern->{min_repeat} = 0;
                $before_pattern->{max_repeat} = 1;
                $before_is_charset = 0;
                break;
              }
              case '{': {
                unless ($before_is_charset) {
                  die "Regex compile error";
                }
                $status = STATUS_QUANTIFIER_MIN();
                $before_is_charset = 0;
                break;
              }
              case '}': {
                unless ($status == STATUS_QUANTIFIER_MAX()) {
                  die "Regex compile error";
                }
                my $quantifier_max = strtoi($quantifier_max_str, 10);
                my $last_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                $last_pattern->{max_repeat} = $quantifier_max;
                
                $status = STATUS_CONTINUE();
                $before_is_charset = 0;
                $quantifier_max_str = "";
                break;
              }
              default: {
                if ($status == STATUS_QUANTIFIER_MIN()) {
                  if (isdigit($uchar)) {
                    $quantifier_min_str .= $uchar - '0';
                  }
                  elsif ($uchar == ',') {
                    my $quantifier_min = strtoi($quantifier_min_str, 10);
                    my $last_pattern = (SPVM::Regex::Pattern)$patterns->get($patterns->length - 1);
                    $last_pattern->{min_repeat} = $quantifier_min;
                    
                    $status = STATUS_QUANTIFIER_MAX();
                    $quantifier_min_str .= "";
                  }
                  else {
                    die "Quantifier min must be non-fractional number";
                  }
                }
                elsif ($status == STATUS_QUANTIFIER_MAX()) {
                  if (isdigit($uchar)) {
                    $quantifier_max_str .= $uchar - '0';
                  }
                  else {
                    die "Quantifier max must be non-fractional number";
                  }
                }
                else {
                  $cur_is_char = 1;
                  $cur_uchar_min_maxs = [$uchar, $uchar];
                }
              }
            }
          }
        }
      }

      if ($cur_is_char) {
        my $pattern = new SPVM::Regex::Pattern;
        $pattern->{type} = SPVM::Regex::Pattern->PATTERN;
        $pattern->{negate} = (byte)$cur_negate;
        $pattern->{uchar_min_maxs} = $cur_uchar_min_maxs;
        $pattern->{max_repeat} = 1;
        $pattern->{min_repeat} = 1;

        if ($is_capture_start) {
          $pattern->{capture_start} = 1;
          $is_capture_start = 0;
        }
        
        $patterns->push($pattern);
        $before_is_charset = 1;
      }
    }
    
    # Check patterns
    for (my $pattern_index = 0; $pattern_index < $patterns->length; $pattern_index++) {
      
      if ($pattern_index == 0) {
        next;
      }
      
      my $before_is_variable_quantifier = 0;
      my $before_pattern =(SPVM::Regex::Pattern)$patterns->get($pattern_index - 1);
      my $cur_pattern = (SPVM::Regex::Pattern)$patterns->get($pattern_index);
      
      # Check duplicatation charset
      if ($before_pattern->{min_repeat} != $before_pattern->{max_repeat}) {
        $before_is_variable_quantifier = 1;
      }
      if ($before_is_variable_quantifier) {
        my $duplicate_charset = $self->duplicate($before_pattern, $cur_pattern);
        if ($duplicate_charset) {
          die "Invalid regex. Charset after variable quantifier is duplicate";
        }
      }
      
      # 0 width regular expression is forbidden
      my $no_zero_quantifier_pattern : SPVM::Regex::Pattern;
      my $find_zere_quantifier = 0;
      for (my $before_pattern_index = $pattern_index - 1; $before_pattern_index >= 0; $before_pattern_index--) {
        my $before_pattern = (SPVM::Regex::Pattern)$patterns->get($before_pattern_index);
        if ($before_pattern->{min_repeat} == 0) {
          $find_zere_quantifier = 1;
        }
        else {
          $no_zero_quantifier_pattern = $before_pattern;
          last;
        }
      }
      if ($find_zere_quantifier && $no_zero_quantifier_pattern) {
        my $duplicate_charset = $self->duplicate($no_zero_quantifier_pattern, $cur_pattern);
        if ($duplicate_charset) {
          die "Invalid regex. Zero width Charset after variable quantifier is duplicate";
        }
      }
    }
  }
  
  sub match : int ($self : self, $target : string, $target_base_index : int) {
    
    unless ($target) {
      die "Target string must be defined";
    }
    
    my $patterns = $self->{patterns};
    my $patterns_length = $patterns->length;
    
    my $match_all = 0;
    my $target_index = 0;
    my $target_base_index_before : int;
    while (1) {
      $target_base_index_before = $target_base_index;
      my $target_base_uchar = uchar($target, \$target_base_index);
      unless ($target_base_uchar >= 0) {
        last;
      }
      
      $target_index =  $target_base_index_before;
      
      my $pattern_index = 0;
      
      my $capture_start_index = -1;
      my $capture_count = 0;
      while (1) {
        my $pattern = (SPVM::Regex::Pattern)$patterns->get($pattern_index);
        my $max_repeat = $pattern->{max_repeat};
        my $min_repeat = $pattern->{min_repeat};
        my $uchar_min_maxs = $pattern->{uchar_min_maxs};
        my $negate = $pattern->{negate};
        
        if ($pattern->{capture_start}) {
          $capture_start_index = $target_index;
        }
        
        my $target_before_index = $target_index;
        my $target_uchar = uchar($target, \$target_index);
        unless ($target_uchar >= 0) {
          last;
        }
        
        
        my $type = $pattern->{type};
        
        my $match_uchar_repeat = 0;
        my $repeat_count = 0;
        my $repeat_before_target_index = $target_before_index;
        
        if ($type == SPVM::Regex::Pattern->PATTERN) {
          # Match uchar repeat
          for (my $repeat_index = 0; $repeat_index < $max_repeat; $repeat_index++) {
            # Match uchar
            my $match_uchar : int;
            
            # Match in not range
            if ($negate) {
              my $match_uchar_count = 0;
              for (my $min_max_index = 0; $min_max_index < @$uchar_min_maxs; $min_max_index += 2) {
                my $uchar_min = $uchar_min_maxs->[$min_max_index];
                my $uchar_max = $uchar_min_maxs->[$min_max_index + 1];
                if ($target_uchar >= $uchar_min && $target_uchar <= $uchar_max) {
                  $match_uchar_count++;
                }
              }
              if ($match_uchar_count == 0) {
                $match_uchar = 1;
              }
            }
            # Match in range
            else {
              $match_uchar = 0;
              for (my $min_max_index = 0; $min_max_index < @$uchar_min_maxs; $min_max_index += 2) {
                my $uchar_min = $uchar_min_maxs->[$min_max_index];
                my $uchar_max = $uchar_min_maxs->[$min_max_index + 1];
                
                if ($target_uchar >= $uchar_min && $target_uchar <= $uchar_max) {
                  $match_uchar = 1;
                  last;
                }
              }
            }
            
            # Increment repeat count
            if ($match_uchar) {
              $repeat_count++;
              $repeat_before_target_index = $target_index;

              if ($target_index == length $target) {
                last;
              }
              
              $target_uchar = uchar($target, \$target_index);
              
              $target_before_index = $target_index;
            }
            else {
              last;
            }
          }
          if ($repeat_count >= $min_repeat) {
            $match_uchar_repeat = 1;
          }
        }
        else {
          die "Not implemented";
        }
        
        if ($match_uchar_repeat) {
          if ($pattern->{capture_end}) {
            $self->{caps}[$capture_count] = substr($target, $capture_start_index, $repeat_before_target_index - $capture_start_index);
            $capture_count++;
            if ($capture_count > 255) {
              die "Too many captures";
            }
          }
          $pattern_index++;
          $target_index = $repeat_before_target_index;
          if ($pattern_index == $patterns_length) {
            last;
          }
        }
        else {
          last;
        }
      }
      
      if ($pattern_index == $patterns_length) {
        $match_all = 1;
        last;
      }
      else {
        if ($self->{string_start}) {
          last;
        }
      }
    }
    
    my $really_match = 0;
    if ($match_all) {
      if ($self->{string_end}) {
        if ($target_index == length $target) {
          $really_match = 1;
        }
      }
      else {
        $really_match = 1;
      }
    }
    
    if ($really_match) {
      $self->{match_start} = $target_base_index_before;
      $self->{match_length} = $target_index - $self->{match_start};
    }
    
    return $really_match;
  }
  
  sub replace  : string ($self : self, $target : string, $target_offset : int, $replace : string) {
    my $opt = SPVM::Hash->new;
    $opt->set_int(replace_all => 0);
    return $self->replace_opt($target, $target_offset, $replace, $opt);
  }

  sub replace_cb  : string ($self : self, $target : string, $target_offset : int, $replace_cb : SPVM::Regex::Replacer) {
    my $opt = SPVM::Hash->new;
    $opt->set_int(replace_all => 0);
    return $self->replace_opt($target, $target_offset, $replace_cb, $opt);
  }

  sub replace_all  : string ($self : self, $target : string, $target_offset : int, $replace : string) {
    my $opt = SPVM::Hash->new;
    $opt->set_int(replace_all => 1);
    return $self->replace_opt($target, $target_offset, $replace, $opt);
  }

  sub replace_all_cb  : string ($self : self, $target : string, $target_offset : int, $replace_cb : SPVM::Regex::Replacer) {
    my $opt = SPVM::Hash->new;
    $opt->set_int(replace_all => 1);
    return $self->replace_opt($target, $target_offset, $replace_cb, $opt);
  }
  
  private sub replace_opt  : string ($self : self, $target : string, $target_base_index : int, $replace_object : object, $opt : SPVM::Hash) {
    
    my $original_target_base_index = $target_base_index;
    
    my $replace_all = 0;
    if (my $replace_all_obj = $opt->get("replace_all")) {
      $replace_all = (int)$replace_all_obj;
    }
    $self->{replace_count} = 0;
    
    unless ($target) {
      return $target;
    }
    
    my $result : byte[];
    my $replace_count = 0;
    my $result_buffer = SPVM::StringBuffer->new;
    my $tarlength = length $target;
    
    if ($target_base_index > 0) {
      my $first_string = substr($target, 0, $target_base_index);
      $result_buffer->push($first_string);
    }
    
    while (my $match = $self->match($target, $target_base_index)) {
      
      my $match_start = $self->match_start;
      my $match_end = $match_start + $self->match_length;
      
      my $replace : string;
      if ($replace_object isa string) {
        $replace = (string)$replace_object;
      }
      elsif ($replace_object isa object[]) {
        my $replace_objects = (object[])$replace_object;
        
        my $replace_buffer = SPVM::StringBuffer->new;
        for (my $i = 0; $i < @$replace_objects; $i++) {
          my $replace_elem_object = $replace_objects->[$i];
          
          if ($replace_elem_object isa string) {
            $replace_buffer->push((string)$replace_elem_object);
          }
          elsif ($replace_elem_object isa SPVM::Int) {
            my $capture = $self->caps->[(int)$replace_elem_object];
            $replace_buffer->push($capture);
          }
          else {
            die "Invalid replace argument element";
          }
        }
        
        $replace = $replace_buffer->to_string;
      }
      elsif ($replace_object isa SPVM::Regex::Replacer) {
        my $replace_buffer = SPVM::StringBuffer->new;
        my $replacer = (SPVM::Regex::Replacer)$replace_object;
        my $cb_result = $replacer->($self);
        $replace_buffer->push($cb_result);
        $replace = $replace_buffer->to_string;
      }
      else {
        die "Invalid replace argument";
      }
      
      my $match_length = $self->match_length;
      my $replace_length = length $replace;
      
      my $result_length = $tarlength + $replace_length - $match_length;
      
      $result = new byte[$result_length];
      
      my $rest_length = $tarlength - $match_end;
      
      my $before_result = (string)sliceb((byte[])$target, $target_base_index, $match_start - $target_base_index);
      $result_buffer->push($before_result);
      $result_buffer->push($replace);
      
      $replace_count++;
      
      unless ($replace_all) {
        last;
      }
      $target_base_index = $match_end;
    }
    
    
    if ($replace_count == 0) {
      $result = (byte[])copy_str($target);
    }
    else {
      my $match_end = $self->match_start + $self->match_length;
      my $rest_length = $tarlength - $match_end;
      my $rest_string = substr($target, $match_end, $rest_length);
      $result_buffer->push($rest_string);
      
      $result = (byte[])$result_buffer->to_string;
    }
    
    $self->{replace_count} = $replace_count;
    
    return (string)$result;
  }

  private sub duplicate : int ($self : self, $pattern1 : SPVM::Regex::Pattern, $pattern2 : SPVM::Regex::Pattern) {
    my $uchar_min_maxs1 = $pattern1->{uchar_min_maxs};
    my $uchar_min_maxs2 = $pattern2->{uchar_min_maxs};
    
    my $duplicate : int;
    my $negate1 = (int)$pattern1->{negate};
    my $negate2 = (int)$pattern2->{negate};
    if ($negate1 && $negate2) {
      $duplicate = 1;
    }
    elsif (!$negate1 && !$negate2) {
      $duplicate = 0;
      for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
        my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
        my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
        for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
          my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
          my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
          if ($uchar_min2 >= $uchar_min1 && $uchar_min2 <= $uchar_max1) {
            $duplicate = 1;
            last;
          }
          elsif ($uchar_max2 >= $uchar_min1 && $uchar_max2 <= $uchar_max1) {
            $duplicate = 1;
            last;
          }
        }
      }
    }
    elsif ($negate1 && !$negate2) {
      $duplicate = 0;
      for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
        my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
        my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
        my $one_no_duplicate = 0;
        for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
          my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
          my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
          if ($uchar_min2 >= $uchar_min1 && $uchar_max2 <= $uchar_max1) {
            $one_no_duplicate = 1;
            last;
          }
        }
        unless ($one_no_duplicate) {
          $duplicate = 1;
          last;
        }
      }
    }
    elsif (!$negate1 && $negate2) {
      $duplicate = 0;
      for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
        my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
        my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
        my $one_no_duplicate = 0;
        for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
          my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
          my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
          if ($uchar_min1 >= $uchar_min2 && $uchar_max1 <= $uchar_max2) {
            $one_no_duplicate = 1;
            last;
          }
        }
        unless ($one_no_duplicate) {
          $duplicate = 1;
          last;
        }
      }
    }
    
    return $duplicate;
  }
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)
Keyboard Shortcuts