class Regex : precompile {
use Unicode;
use List;
use IntList;
use Hash;
use StringBuffer;
use Regex::Replacer;
use Regex::Pattern;
use Fn;
use Hash;
has patterns : List;
has string_start : byte;
has string_end : byte;
has single_line_mode : byte;
has ascii_mode : byte;
has captures : ro string[];
has match_start : ro int;
has match_length : ro int;
has replace_count : ro int;
private enum {
STATUS_START,
STATUS_CONTINUE,
STATUS_QUANTIFIER_MIN,
STATUS_QUANTIFIER_MAX,
STATUS_CHAR_CLASS,
}
static method new : Regex ($re_str : string) {
my $self = Regex->_new_without_compile();
$self->compile($re_str);
return $self;
}
static method new_with_options : Regex ($re_str : string, $option_chars : string) {
my $self = Regex->_new_without_compile();
if ($option_chars) {
my $options_h = Hash->new({});
my $option_chars_length = length $option_chars;
for (my $i = 0; $i < $option_chars_length; $i++) {
my $option_char = $option_chars->[$i];
if ($option_char == 's') {
my $found = $options_h->get("s");
if ($found) {
die "s option must be used once";
}
$self->{single_line_mode} = 1;
$options_h->set_int("s" => 1);
}
elsif ($option_char == 'a') {
my $found = $options_h->get("a");
if ($found) {
die "a option must be used once";
}
$self->{ascii_mode} = 1;
$options_h->set_int("a" => 1);
}
}
}
$self->compile($re_str);
return $self;
}
private static method _new_without_compile : Regex () {
my $self = new Regex;
$self->{patterns} = List->new_len(new Regex::Pattern[0], 0);
$self->{captures} = new string[256];
return $self;
}
private method compile : void ($re_str : string) {
unless ($re_str) {
die "Regex string must be defined";
}
my $re_str_length = length $re_str;
if ($re_str_length == 0) {
die "Regex string must have length";
}
my $before_is_charset = 0;
my $pos = 0;
my $patterns = $self->{patterns};
my $quantifier_min_str = "";
my $quantifier_max_str = "";
my $status = Regex->STATUS_START();
my $char_class_uchar_min_maxs_list : IntList;
my $char_class_negate = 0;
my $before_is_char_class_range = 0;
my $is_capture = 0;
my $is_capture_start = 0;
while ((my $uchar = Unicode->uchar($re_str, \$pos)) >= 0) {
if ($self->{string_end}) {
die "\$ must be last of regex";
}
my $cur_is_char = 0;
my $cur_uchar_min_maxs : int[];
my $cur_negate = 0;
switch ($uchar) {
case '\\': {
my $next_uchar = Unicode->uchar($re_str, \$pos);
if ($uchar < 0) {
die "Invalid regex. Last is \\ ";
}
switch ($next_uchar) {
case 'd': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)'0', '9'];
}
else {
die "\d must be used with ascii option";
}
break;
}
case 'D': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)'0', '9'];
$cur_negate = 1;
}
else {
die "\D must be used with ascii option";
}
break;
}
case 'w': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)'0', '9', 'a', 'z', 'A', 'Z', '_', '_'];
}
else {
die "\w must be used with ascii option";
}
break;
}
case 'W': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)'0', '9', 'a', 'z', 'A', 'Z', '_', '_'];
$cur_negate = 1;
}
else {
die "\W must be used with ascii option";
}
break;
}
case 's': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)' ', ' ', '\t', '\t', '\f', '\f', '\r', '\r', '\n', '\n'];
}
else {
die "\s must be used with ascii option";
}
break;
}
case 'S': {
if ($self->{ascii_mode}) {
$cur_is_char = 1;
$cur_uchar_min_maxs = [(int)' ', ' ', '\t', '\t', '\f', '\f', '\r', '\r', '\n', '\n'];
$cur_negate = 1;
}
else {
die "\S must be used with ascii option";
}
break;
}
default: {
if (Fn->is_perl_word($next_uchar)) {
die "Invalid regex escape character";
}
# \: \- etc
else {
my $uchar_min_maxs = [(int)$next_uchar, $next_uchar];
if ($status == Regex->STATUS_CHAR_CLASS()) {
$char_class_uchar_min_maxs_list->push($uchar_min_maxs->[0]);
$char_class_uchar_min_maxs_list->push($uchar_min_maxs->[1]);
}
else {
$cur_is_char = 1;
$cur_uchar_min_maxs = $uchar_min_maxs;
}
}
}
}
break;
}
default: {
if ($status == Regex->STATUS_CHAR_CLASS()) {
if ($uchar == ']') {
$status = Regex->STATUS_CONTINUE();
$cur_uchar_min_maxs = $char_class_uchar_min_maxs_list->to_array;
if (@$cur_uchar_min_maxs == 0) {
die "Emtpy character class";
}
for (my $i = 0; $i < @$cur_uchar_min_maxs; $i += 2) {
my $min = $cur_uchar_min_maxs->[$i];
my $max = $cur_uchar_min_maxs->[$i + 1];
if ($max < $min) {
die "Invalid range character class";
}
}
$char_class_uchar_min_maxs_list = IntList->new_len(0);
$cur_is_char = 1;
$cur_negate = $char_class_negate;
}
elsif ($uchar == '-') {
if ($before_is_char_class_range) {
die "- must not be repeat";
}
else {
$before_is_char_class_range = 1;
}
}
else {
if ($uchar == '^' && $char_class_uchar_min_maxs_list->length == 0) {
$char_class_negate = 1;
}
else {
if ($before_is_char_class_range) {
if ($char_class_uchar_min_maxs_list->length == 0) {
die "- start not found";
}
else {
$char_class_uchar_min_maxs_list->set($char_class_uchar_min_maxs_list->length - 1, $uchar);
$before_is_char_class_range = 0;
}
}
else {
$char_class_uchar_min_maxs_list->push($uchar);
$char_class_uchar_min_maxs_list->push($uchar);
}
}
}
}
else {
switch ($uchar) {
case '^': {
if ($status == Regex->STATUS_CHAR_CLASS()) {
die "Unexpected";
}
elsif ($status != Regex->STATUS_START()) {
die "Regex compile error";
}
$self->{string_start} = 1;
break;
}
case '.': {
$cur_is_char = 1;
if ($self->{single_line_mode}) {
$cur_uchar_min_maxs = new int[0];
}
else {
$cur_uchar_min_maxs = [(int)'\n', '\n'];
}
$cur_negate = 1;
break;
}
case '$': {
$self->{string_end} = 1;
break;
}
case '|': {
die "| is not supported";
break;
}
case '(': {
if ($is_capture) {
die "Invalid capture start";
}
else {
$is_capture = 1;
$is_capture_start = 1;
}
break;
}
case ')': {
if ($is_capture) {
$is_capture = 0;
my $before_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$before_pattern->{capture_end} = 1;
}
else {
die "Invalid capture end";
}
break;
}
case '[': {
$status = Regex->STATUS_CHAR_CLASS();
$char_class_uchar_min_maxs_list = IntList->new_len(0);
$before_is_char_class_range = 0;
break;
}
case ']': {
die "] must be end of character class";
break;
}
case '*': {
unless ($before_is_charset) {
die "Regex compile error";
}
my $before_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$before_pattern->{min_repeat} = 0;
$before_pattern->{max_repeat} = Fn->INT32_MAX();
$before_is_charset = 0;
break;
}
case '+': {
unless ($before_is_charset) {
die "Regex compile error";
}
my $before_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$before_pattern->{min_repeat} = 1;
$before_pattern->{max_repeat} = Fn->INT32_MAX();
$before_is_charset = 0;
break;
}
case '?': {
unless ($before_is_charset) {
die "Regex compile error";
}
my $before_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$before_pattern->{min_repeat} = 0;
$before_pattern->{max_repeat} = 1;
$before_is_charset = 0;
break;
}
case '{': {
unless ($before_is_charset) {
die "Regex compile error";
}
$status = Regex->STATUS_QUANTIFIER_MIN();
$before_is_charset = 0;
break;
}
case '}': {
unless ($status == Regex->STATUS_QUANTIFIER_MAX()) {
die "Regex compile error";
}
my $quantifier_max = Fn->to_int_with_base($quantifier_max_str, 10);
my $last_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$last_pattern->{max_repeat} = $quantifier_max;
$status = Regex->STATUS_CONTINUE();
$before_is_charset = 0;
$quantifier_max_str = "";
break;
}
default: {
if ($status == Regex->STATUS_QUANTIFIER_MIN()) {
if (Fn->is_digit($uchar)) {
$quantifier_min_str .= $uchar - '0';
}
elsif ($uchar == ',') {
my $quantifier_min = Fn->to_int_with_base($quantifier_min_str, 10);
my $last_pattern = (Regex::Pattern)$patterns->get($patterns->length - 1);
$last_pattern->{min_repeat} = $quantifier_min;
$status = Regex->STATUS_QUANTIFIER_MAX();
$quantifier_min_str .= "";
}
else {
die "Quantifier min must be non-fractional number";
}
}
elsif ($status == Regex->STATUS_QUANTIFIER_MAX()) {
if (Fn->is_digit($uchar)) {
$quantifier_max_str .= $uchar - '0';
}
else {
die "Quantifier max must be non-fractional number";
}
}
else {
$cur_is_char = 1;
$cur_uchar_min_maxs = [$uchar, $uchar];
}
}
}
}
}
}
if ($cur_is_char) {
my $pattern = new Regex::Pattern;
$pattern->{type} = Regex::Pattern->PATTERN;
$pattern->{negate} = (byte)$cur_negate;
$pattern->{uchar_min_maxs} = $cur_uchar_min_maxs;
$pattern->{max_repeat} = 1;
$pattern->{min_repeat} = 1;
if ($is_capture_start) {
$pattern->{capture_start} = 1;
$is_capture_start = 0;
}
$patterns->push($pattern);
$before_is_charset = 1;
}
}
# Check patterns
for (my $pattern_index = 0; $pattern_index < $patterns->length; $pattern_index++) {
if ($pattern_index == 0) {
next;
}
my $before_is_variable_quantifier = 0;
my $before_pattern =(Regex::Pattern)$patterns->get($pattern_index - 1);
my $cur_pattern = (Regex::Pattern)$patterns->get($pattern_index);
# Check duplicatation charset
if ($before_pattern->{min_repeat} != $before_pattern->{max_repeat}) {
$before_is_variable_quantifier = 1;
}
if ($before_is_variable_quantifier) {
my $duplicate_charset = $self->duplicate($before_pattern, $cur_pattern);
if ($duplicate_charset) {
die "Invalid regex. Charset after variable quantifier is duplicate";
}
}
# 0 width regular expression is forbidden
my $no_zero_quantifier_pattern : Regex::Pattern;
my $find_zere_quantifier = 0;
for (my $before_pattern_index = $pattern_index - 1; $before_pattern_index >= 0; $before_pattern_index--) {
my $before_pattern = (Regex::Pattern)$patterns->get($before_pattern_index);
if ($before_pattern->{min_repeat} == 0) {
$find_zere_quantifier = 1;
}
else {
$no_zero_quantifier_pattern = $before_pattern;
last;
}
}
if ($find_zere_quantifier && $no_zero_quantifier_pattern) {
my $duplicate_charset = $self->duplicate($no_zero_quantifier_pattern, $cur_pattern);
if ($duplicate_charset) {
die "Invalid regex. Zero width Charset after variable quantifier is duplicate";
}
}
}
}
method match : int ($target : string, $target_base_index : int) {
unless ($target) {
die "Target string must be defined";
}
my $patterns = $self->{patterns};
my $patterns_length = $patterns->length;
my $match_all = 0;
my $target_index = 0;
my $target_base_index_before : int;
while (1) {
$target_base_index_before = $target_base_index;
my $target_base_uchar = Unicode->uchar($target, \$target_base_index);
unless ($target_base_uchar >= 0) {
last;
}
$target_index = $target_base_index_before;
my $pattern_index = 0;
my $capture_start_index = -1;
my $capture_count = 0;
while (1) {
my $pattern = (Regex::Pattern)$patterns->get($pattern_index);
my $max_repeat = $pattern->{max_repeat};
my $min_repeat = $pattern->{min_repeat};
my $uchar_min_maxs = $pattern->{uchar_min_maxs};
my $negate = $pattern->{negate};
if ($pattern->{capture_start}) {
$capture_start_index = $target_index;
}
my $target_before_index = $target_index;
my $target_uchar = Unicode->uchar($target, \$target_index);
unless ($target_uchar >= 0) {
last;
}
my $type = $pattern->{type};
my $match_uchar_repeat = 0;
my $repeat_count = 0;
my $repeat_before_target_index = $target_before_index;
if ($type == Regex::Pattern->PATTERN) {
# Match uchar repeat
for (my $repeat_index = 0; $repeat_index < $max_repeat; $repeat_index++) {
# Match uchar
my $match_uchar : int;
# Match in not range
if ($negate) {
my $match_uchar_count = 0;
for (my $min_max_index = 0; $min_max_index < @$uchar_min_maxs; $min_max_index += 2) {
my $uchar_min = $uchar_min_maxs->[$min_max_index];
my $uchar_max = $uchar_min_maxs->[$min_max_index + 1];
if ($target_uchar >= $uchar_min && $target_uchar <= $uchar_max) {
$match_uchar_count++;
}
}
if ($match_uchar_count == 0) {
$match_uchar = 1;
}
}
# Match in range
else {
$match_uchar = 0;
for (my $min_max_index = 0; $min_max_index < @$uchar_min_maxs; $min_max_index += 2) {
my $uchar_min = $uchar_min_maxs->[$min_max_index];
my $uchar_max = $uchar_min_maxs->[$min_max_index + 1];
if ($target_uchar >= $uchar_min && $target_uchar <= $uchar_max) {
$match_uchar = 1;
last;
}
}
}
# Increment repeat count
if ($match_uchar) {
$repeat_count++;
$repeat_before_target_index = $target_index;
if ($target_index == length $target) {
last;
}
$target_uchar = Unicode->uchar($target, \$target_index);
$target_before_index = $target_index;
}
else {
last;
}
}
if ($repeat_count >= $min_repeat) {
$match_uchar_repeat = 1;
}
}
else {
die "Not implemented";
}
if ($match_uchar_repeat) {
if ($pattern->{capture_end}) {
$self->{captures}[$capture_count] = Fn->substr($target, $capture_start_index, $repeat_before_target_index - $capture_start_index);
$capture_count++;
if ($capture_count > 255) {
die "Too many captures";
}
}
$pattern_index++;
$target_index = $repeat_before_target_index;
if ($pattern_index == $patterns_length) {
last;
}
}
else {
last;
}
}
if ($pattern_index == $patterns_length) {
$match_all = 1;
last;
}
else {
if ($self->{string_start}) {
last;
}
}
}
my $really_match = 0;
if ($match_all) {
if ($self->{string_end}) {
if ($target_index == length $target) {
$really_match = 1;
}
}
else {
$really_match = 1;
}
}
if ($really_match) {
$self->{match_start} = $target_base_index_before;
$self->{match_length} = $target_index - $self->{match_start};
}
return $really_match;
}
method replace : string ($target : string, $target_offset : int, $replace : string) {
my $opt = Hash->new({});
$opt->set_int(replace_all => 0);
return $self->replace_opt($target, $target_offset, $replace, $opt);
}
method replace_cb : string ($target : string, $target_offset : int, $replace_cb : Regex::Replacer) {
my $opt = Hash->new({});
$opt->set_int(replace_all => 0);
return $self->replace_opt($target, $target_offset, $replace_cb, $opt);
}
method replace_all : string ($target : string, $target_offset : int, $replace : string) {
my $opt = Hash->new({});
$opt->set_int(replace_all => 1);
return $self->replace_opt($target, $target_offset, $replace, $opt);
}
method replace_all_cb : string ($target : string, $target_offset : int, $replace_cb : Regex::Replacer) {
my $opt = Hash->new({});
$opt->set_int(replace_all => 1);
return $self->replace_opt($target, $target_offset, $replace_cb, $opt);
}
private method replace_opt : string ($target : string, $target_base_index : int, $replace_object : object, $opt : Hash) {
my $original_target_base_index = $target_base_index;
my $replace_all = 0;
if (my $replace_all_obj = $opt->get("replace_all")) {
$replace_all = (int)$replace_all_obj;
}
$self->{replace_count} = 0;
unless ($target) {
return $target;
}
my $result : string;
my $replace_count = 0;
my $result_buffer = StringBuffer->new;
my $tarlength = length $target;
if ($target_base_index > 0) {
my $first_string = Fn->substr($target, 0, $target_base_index);
$result_buffer->push($first_string);
}
while (my $match = $self->match($target, $target_base_index)) {
my $match_start = $self->match_start;
my $match_end = $match_start + $self->match_length;
my $replace : string;
if ($replace_object isa string) {
$replace = (string)$replace_object;
}
elsif ($replace_object isa object[]) {
my $replace_objects = (object[])$replace_object;
my $replace_buffer = StringBuffer->new;
for (my $i = 0; $i < @$replace_objects; $i++) {
my $replace_elem_object = $replace_objects->[$i];
if ($replace_elem_object isa string) {
$replace_buffer->push((string)$replace_elem_object);
}
elsif ($replace_elem_object isa Int) {
my $capture = $self->captures->[(int)$replace_elem_object];
$replace_buffer->push($capture);
}
else {
die "Invalid replace argument element";
}
}
$replace = $replace_buffer->to_string;
}
elsif ($replace_object isa Regex::Replacer) {
my $replace_buffer = StringBuffer->new;
my $replacer = (Regex::Replacer)$replace_object;
my $cb_result = $replacer->($self);
$replace_buffer->push($cb_result);
$replace = $replace_buffer->to_string;
}
else {
die "Invalid replace argument";
}
my $match_length = $self->match_length;
my $replace_length = length $replace;
my $result_length = $tarlength + $replace_length - $match_length;
$result = (mutable string)new_string_len($result_length);
my $rest_length = $tarlength - $match_end;
my $before_result = (mutable string)Fn->substr($target, $target_base_index, $match_start - $target_base_index);
$result_buffer->push($before_result);
$result_buffer->push($replace);
$replace_count++;
unless ($replace_all) {
last;
}
$target_base_index = $match_end;
}
if ($replace_count == 0) {
$result = Fn->copy_string($target);
}
else {
my $match_end = $self->match_start + $self->match_length;
my $rest_length = $tarlength - $match_end;
if ($rest_length > 0) {
my $rest_string = Fn->substr($target, $match_end, $rest_length);
$result_buffer->push($rest_string);
}
$result = $result_buffer->to_string;
}
$self->{replace_count} = $replace_count;
return $result;
}
private method duplicate : int ($pattern1 : Regex::Pattern, $pattern2 : Regex::Pattern) {
my $uchar_min_maxs1 = $pattern1->{uchar_min_maxs};
my $uchar_min_maxs2 = $pattern2->{uchar_min_maxs};
my $duplicate : int;
my $negate1 = (int)$pattern1->{negate};
my $negate2 = (int)$pattern2->{negate};
if ($negate1 && $negate2) {
$duplicate = 1;
}
elsif (!$negate1 && !$negate2) {
$duplicate = 0;
for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
if ($uchar_min2 >= $uchar_min1 && $uchar_min2 <= $uchar_max1) {
$duplicate = 1;
last;
}
elsif ($uchar_max2 >= $uchar_min1 && $uchar_max2 <= $uchar_max1) {
$duplicate = 1;
last;
}
}
}
}
elsif ($negate1 && !$negate2) {
$duplicate = 0;
for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
my $one_no_duplicate = 0;
for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
if ($uchar_min2 >= $uchar_min1 && $uchar_max2 <= $uchar_max1) {
$one_no_duplicate = 1;
last;
}
}
unless ($one_no_duplicate) {
$duplicate = 1;
last;
}
}
}
elsif (!$negate1 && $negate2) {
$duplicate = 0;
for (my $min_max_index1 = 0; $min_max_index1 < @$uchar_min_maxs1; $min_max_index1 += 2) {
my $uchar_min1 = $uchar_min_maxs1->[$min_max_index1];
my $uchar_max1 = $uchar_min_maxs1->[$min_max_index1 + 1];
my $one_no_duplicate = 0;
for (my $min_max_index2 = 0; $min_max_index2 < @$uchar_min_maxs2; $min_max_index2 += 2) {
my $uchar_min2 = $uchar_min_maxs2->[$min_max_index2];
my $uchar_max2 = $uchar_min_maxs2->[$min_max_index2 + 1];
if ($uchar_min1 >= $uchar_min2 && $uchar_max1 <= $uchar_max2) {
$one_no_duplicate = 1;
last;
}
}
unless ($one_no_duplicate) {
$duplicate = 1;
last;
}
}
}
return $duplicate;
}
method cap1 : string () { return $self->captures->[0]; }
method cap2 : string () { return $self->captures->[1]; }
method cap3 : string () { return $self->captures->[2]; }
method cap4 : string () { return $self->captures->[3]; }
method cap5 : string () { return $self->captures->[4]; }
method cap6 : string () { return $self->captures->[5]; }
method cap7 : string () { return $self->captures->[6]; }
method cap8 : string () { return $self->captures->[7]; }
method cap9 : string () { return $self->captures->[8]; }
method cap10 : string () { return $self->captures->[9]; }
}