ruby-changes:74416
From: TSUYUSATO <ko1@a...>
Date: Wed, 9 Nov 2022 23:22:00 +0900 (JST)
Subject: [ruby-changes:74416] f25bb291b4 (master): Support OP_REPEAT and OP_REPEAT_INC
https://git.ruby-lang.org/ruby.git/commit/?id=f25bb291b4 From f25bb291b42a45d23cfc8658720c62e1f3a7390f Mon Sep 17 00:00:00 2001 From: TSUYUSATO Kitsune <make.just.on@g...> Date: Thu, 20 Oct 2022 16:52:23 +0900 Subject: Support OP_REPEAT and OP_REPEAT_INC --- include/ruby/onigmo.h | 2 + regexec.c | 218 +++++++++++++++++++++++++++++++++++++++++--------- regint.h | 19 +++-- 3 files changed, 196 insertions(+), 43 deletions(-) diff --git a/include/ruby/onigmo.h b/include/ruby/onigmo.h index d71dfb80fb..703f38f590 100644 --- a/include/ruby/onigmo.h +++ b/include/ruby/onigmo.h @@ -744,6 +744,8 @@ typedef struct { https://github.com/ruby/ruby/blob/trunk/include/ruby/onigmo.h#L744 typedef struct { int lower; int upper; + int base_num; + int inner_num; } OnigRepeatRange; typedef void (*OnigWarnFunc)(const char* s); diff --git a/regexec.c b/regexec.c index 402e116bbd..4baf8d32b3 100644 --- a/regexec.c +++ b/regexec.c @@ -235,12 +235,15 @@ onig_get_capture_tree(OnigRegion* region) https://github.com/ruby/ruby/blob/trunk/regexec.c#L235 /* count number of jump-like opcodes for allocation of cache memory. */ /* return -1 if we cannot optimize the regex matching by using cache. */ -static int count_num_cache_opcode(regex_t* reg) +static int count_num_cache_opcode(regex_t* reg, int* table_size) { int num = 0; UChar* p = reg->p; UChar* pend = p + reg->used; LengthType len; + MemNumType mem; + MemNumType current_mem = -1; + int current_mem_num = 0; OnigEncoding enc = reg->enc; while (p < pend) { @@ -295,10 +298,10 @@ static int count_num_cache_opcode(regex_t* reg) https://github.com/ruby/ruby/blob/trunk/regexec.c#L298 break; case OP_ANYCHAR_STAR: case OP_ANYCHAR_ML_STAR: - num++; break; + num++; *table_size += 1; break; case OP_ANYCHAR_STAR_PEEK_NEXT: case OP_ANYCHAR_ML_STAR_PEEK_NEXT: - p++; num++; break; + p++; num++; *table_size += 1; break; case OP_WORD: case OP_NOT_WORD: @@ -352,19 +355,54 @@ static int count_num_cache_opcode(regex_t* reg) https://github.com/ruby/ruby/blob/trunk/regexec.c#L355 case OP_PUSH: p += SIZE_RELADDR; num++; + *table_size += 1; break; case OP_POP: break; case OP_PUSH_OR_JUMP_EXACT1: case OP_PUSH_IF_PEEK_NEXT: - p += SIZE_RELADDR + 1; num++; break; + p += SIZE_RELADDR + 1; num++; *table_size += 1; break; case OP_REPEAT: case OP_REPEAT_NG: + if (current_mem != -1) { + // A nested OP_REPEAT is not yet supported. + return NUM_CACHE_OPCODE_FAIL; + } + GET_MEMNUM_INC(mem, p); + p += SIZE_RELADDR; + if (reg->repeat_range[mem].lower == 0) { + num++; + *table_size += 1; + } + reg->repeat_range[mem].base_num = num; + current_mem = mem; + current_mem_num = num; + break; case OP_REPEAT_INC: case OP_REPEAT_INC_NG: + GET_MEMNUM_INC(mem, p); + //fprintf(stderr, "OP_REPEAT %d\n", mem); + if (mem != current_mem) { + // A lone or invalid OP_REPEAT_INC is found. + return NUM_CACHE_OPCODE_FAIL; + } + { + int inner_num = num - current_mem_num; + OnigRepeatRange *repeat_range = ®->repeat_range[mem]; + repeat_range->inner_num = inner_num; + num -= inner_num; + num += inner_num * repeat_range->lower + (inner_num + 1) * (repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower); + //fprintf(stderr, "lower %d < upper %d\n", repeat_range->lower, repeat_range->upper); + if (repeat_range->lower < repeat_range->upper) { + *table_size += 1; + } + current_mem = -1; + current_mem_num = 0; + } + break; case OP_REPEAT_INC_SG: case OP_REPEAT_INC_NG_SG: - // TODO: support OP_REPEAT opcodes. + // TODO: Support nested OP_REPEAT. return NUM_CACHE_OPCODE_FAIL; case OP_NULL_CHECK_START: case OP_NULL_CHECK_END: @@ -410,12 +448,16 @@ static int count_num_cache_opcode(regex_t* reg) https://github.com/ruby/ruby/blob/trunk/regexec.c#L448 return num; } -static void init_cache_index_table(regex_t* reg, UChar **table) +static void init_cache_index_table(regex_t* reg, OnigCacheIndex *table) { UChar* pbegin; UChar* p = reg->p; UChar* pend = p + reg->used; LengthType len; + MemNumType mem; + MemNumType current_mem = -1; + int num = 0; + int current_mem_num = 0; OnigEncoding enc = reg->enc; while (p < pend) { @@ -470,11 +512,20 @@ static void init_cache_index_table(regex_t* reg, UChar **table) https://github.com/ruby/ruby/blob/trunk/regexec.c#L512 break; case OP_ANYCHAR_STAR: case OP_ANYCHAR_ML_STAR: - *table++ = pbegin; break; + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = current_mem; + num++; + table++; + break; case OP_ANYCHAR_STAR_PEEK_NEXT: case OP_ANYCHAR_ML_STAR_PEEK_NEXT: p++; - *table++ = pbegin; + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = current_mem; + num++; + table++; break; case OP_WORD: @@ -528,17 +579,55 @@ static void init_cache_index_table(regex_t* reg, UChar **table) https://github.com/ruby/ruby/blob/trunk/regexec.c#L579 break; case OP_PUSH: p += SIZE_RELADDR; - *table++ = pbegin; + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = current_mem; + num++; + table++; break; case OP_POP: break; case OP_PUSH_OR_JUMP_EXACT1: case OP_PUSH_IF_PEEK_NEXT: - p += SIZE_RELADDR + 1; *table++ = pbegin; break; + p += SIZE_RELADDR + 1; + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = current_mem; + num++; + table++; + break; case OP_REPEAT: case OP_REPEAT_NG: + GET_MEMNUM_INC(mem, p); + p += SIZE_RELADDR; + if (reg->repeat_range[mem].lower == 0) { + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = mem; + num++; + table++; + } + current_mem = mem; + current_mem_num = num; + break; case OP_REPEAT_INC: case OP_REPEAT_INC_NG: + GET_MEMNUM_INC(mem, p); + { + int inner_num = num - current_mem_num; + OnigRepeatRange *repeat_range = ®->repeat_range[mem]; + if (repeat_range->lower < repeat_range->upper) { + table->addr = pbegin; + table->num = num - current_mem_num; + table->outer_repeat = mem; + table++; + } + num -= inner_num; + num += inner_num * repeat_range->lower + (inner_num + 1) * (repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower); + current_mem = -1; + current_mem_num = 0; + } + break; case OP_REPEAT_INC_SG: case OP_REPEAT_INC_NG_SG: // TODO: support OP_REPEAT opcodes. @@ -584,20 +673,6 @@ static void init_cache_index_table(regex_t* reg, UChar **table) https://github.com/ruby/ruby/blob/trunk/regexec.c#L673 } } } - -static int find_cache_index_table(UChar** table, int num_cache_table, UChar* p) -{ - int l = 0, r = num_cache_table - 1, m; - - while (l <= r) { - m = (l + r) / 2; - if (table[m] == p) return m; - if (table[m] < p) l = m + 1; - else r = m - 1; - } - - return -1; -} #endif /* USE_MATCH_CACHE */ extern void @@ -787,7 +862,7 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from) https://github.com/ruby/ruby/blob/trunk/regexec.c#L862 (msa).enable_cache_match_opt = 0;\ (msa).num_fail = 0;\ (msa).num_cache_opcode = NUM_CACHE_OPCODE_UNINIT;\ - (msa).cache_index_table = (UChar **)0;\ + (msa).cache_index_table = (OnigCacheIndex *)0;\ (msa).match_cache = (uint8_t *)0;\ } while(0) #define MATCH_ARG_FREE_CACHE_MATCH_OPT(msa) do {\ @@ -1083,23 +1158,70 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, https://github.com/ruby/ruby/blob/trunk/regexec.c#L1158 #ifdef USE_CACHE_MATCH_OPT -#define DO_CACHE_MATCH_OPT(enable,p,num_cache_table,table,pos,match_cache) do {\ +#define DO_CACHE_MATCH_OPT(reg,stk,repeat_stk,enable,p,num_cache_table,num_cache_size,table,pos,match_cache) do {\ if (enable) {\ - int cache_index = find_cache_index_table((table), (num_cache_table), (p));\ + int cache_index = find_cache_index_table((reg), (stk), (repeat_stk), (table), (num_cache_table), (p));\ if (cache_index >= 0) {\ - int key = (num_cache_table) * (int)(pos) + cache_index;\ + int key = (num_cache_size) * (int)(pos) + cache_index;\ int index = key >> 3;\ int mask = 1 << (key & 7);\ if ((match_cache)[index] & mask) {\ + /*fprintf(stderr, "Use cache (p = %p, cache_index = %d, pos = %d, key = %d)\n", p, cache_index, pos, key);*/\ goto fail;\ }\ + /*fprintf(stderr, "Add cache (p = %p, cache_index = %d, pos = %d, key = %d)\n", p, cache_index, pos, key);*/\ (match_cache)[index] |= mask;\ }\ }\ } while (0) + +static int find_cache_index_table(regex_t* reg, OnigStackType *stk, OnigStackIndex *repeat_stk, OnigCacheIndex* table, int num_cache_table, UChar* p) +{ + int l = 0, r = num_cache_table - 1, m; + OnigCacheIndex* item; + OnigRepeatRange* range; + OnigStackType *stkp; + int count = 0; + int is_inc = *p == OP_REPEAT_INC || *p == OP_REPEAT_INC_NG; + + while (l <= r) { + m = (l + r) / 2; + if (table[m].addr == p) break; + if (table[m].addr < p) l = m + 1; + else r = m - 1; + } + + if (!(0 <= m && m < num_cache_table && table[m].addr == p)) { + return -1; + } + + item = &table[m]; + //fprintf(stderr, "m = %d, outer_repeat = %d, num = %d\n", item->outer_repeat, item->num); + if (item->outer_repeat == -1) { + return item->num; + } + + range = ®->repeat_range[item->outer_repeat]; + //fprintf(stderr, "inner_num = %d, lower = %d, upper = %d\n", range->inner_num, range->lower, range->upper); + + stkp = &stk[repeat_stk[item->outer_repeat]]; + count = is_inc ? stkp->u.repeat.count - 1 : stkp->u.repeat.count; + //fprintf(stderr, "count = %d\n", count); + + if (count < range->lower) { + return range->base_num + range->inner_num * count + item->num; + } + + if (range->upper == 0x7fffffff) { + return range->base (... truncated) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/