ruby-changes:74411
From: TSUYUSATO <ko1@a...>
Date: Wed, 9 Nov 2022 23:21:58 +0900 (JST)
Subject: [ruby-changes:74411] 881bf9a0b8 (master): Implement cache optimization for regexp matching
https://git.ruby-lang.org/ruby.git/commit/?id=881bf9a0b8 From 881bf9a0b8b52c05a5917b95d988ae4b9a391a47 Mon Sep 17 00:00:00 2001 From: TSUYUSATO Kitsune <make.just.on@g...> Date: Mon, 3 Oct 2022 22:21:56 +0900 Subject: Implement cache optimization for regexp matching --- regexec.c | 469 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- regint.h | 8 ++ 2 files changed, 476 insertions(+), 1 deletion(-) diff --git a/regexec.c b/regexec.c index c77d48b1d9..db3881e18a 100644 --- a/regexec.c +++ b/regexec.c @@ -231,6 +231,404 @@ onig_get_capture_tree(OnigRegion* region) https://github.com/ruby/ruby/blob/trunk/regexec.c#L231 } #endif /* USE_CAPTURE_HISTORY */ +#ifdef USE_CACHE_MATCH_OPT + +/* count number of jump-like opcodes for allocation of cache memory. */ +/* return -1 if we cannot optimize the regex matching by using cache. */ +int count_num_cache_opcode(regex_t* reg) +{ + int num = 0; + UChar* pbegin; + UChar* p = reg->p; + UChar* pend = p + reg->used; + LengthType len; + RelAddrType addr; + OnigEncoding enc = reg->enc; + + while (p < pend) { + pbegin = p; + switch (*p++) { + case OP_FINISH: + case OP_END: + break; + + case OP_EXACT1: p++; break; + case OP_EXACT2: p += 2; break; + case OP_EXACT3: p += 3; break; + case OP_EXACT4: p += 4; break; + case OP_EXACT5: p += 5; break; + case OP_EXACTN: + GET_LENGTH_INC(len, p); p += len; break; + case OP_EXACTMB2N1: p += 2; break; + case OP_EXACTMB2N2: p += 4; break; + case OP_EXACTMB2N3: p += 6; break; + case OP_EXACTMB2N: + GET_LENGTH_INC(len, p); p += len * 2; break; + case OP_EXACTMB3N: + GET_LENGTH_INC(len, p); p += len * 3; break; + case OP_EXACTMBN: + { + int mb_len; + GET_LENGTH_INC(mb_len, p); + GET_LENGTH_INC(len, p); + p += mb_len * len; + } + break; + + case OP_EXACT1_IC: + len = enclen(enc, p, pend); p += len; break; + case OP_EXACTN_IC: + GET_LENGTH_INC(len, p); p += len; break; + + case OP_CCLASS: + case OP_CCLASS_NOT: + p += SIZE_BITSET; break; + case OP_CCLASS_MB: + case OP_CCLASS_MB_NOT: + case OP_CCLASS_MIX: + case OP_CCLASS_MIX_NOT: + GET_LENGTH_INC(len, p); p += len; break; + + case OP_ANYCHAR: + case OP_ANYCHAR_ML: + break; + case OP_ANYCHAR_STAR: + case OP_ANYCHAR_ML_STAR: + num++; break; + case OP_ANYCHAR_STAR_PEEK_NEXT: + case OP_ANYCHAR_ML_STAR_PEEK_NEXT: + p++; num++; break; + + case OP_WORD: + case OP_NOT_WORD: + case OP_WORD_BOUND: + case OP_NOT_WORD_BOUND: + case OP_WORD_BEGIN: + case OP_WORD_END: + break; + + case OP_ASCII_WORD: + case OP_NOT_ASCII_WORD: + case OP_ASCII_WORD_BOUND: + case OP_NOT_ASCII_WORD_BOUND: + case OP_ASCII_WORD_BEGIN: + case OP_ASCII_WORD_END: + break; + + case OP_BEGIN_BUF: + case OP_END_BUF: + case OP_BEGIN_LINE: + case OP_END_LINE: + case OP_SEMI_END_BUF: + case OP_BEGIN_POSITION: + break; + + case OP_BACKREF1: + case OP_BACKREF2: + case OP_BACKREFN: + case OP_BACKREFN_IC: + case OP_BACKREF_MULTI: + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_WITH_LEVEL: + return NUM_CACHE_OPCODE_FAIL; + + case OP_MEMORY_START: + case OP_MEMORY_START_PUSH: + case OP_MEMORY_END_PUSH: + case OP_MEMORY_END_PUSH_REC: + case OP_MEMORY_END: + case OP_MEMORY_END_REC: + p += SIZE_MEMNUM; break; + + case OP_KEEP: + break; + + case OP_FAIL: + break; + case OP_JUMP: + GET_RELADDR_INC(addr, p); + break; + case OP_PUSH: + GET_RELADDR_INC(addr, p); + num++; + break; + case OP_POP: + break; + case OP_PUSH_OR_JUMP_EXACT1: + case OP_PUSH_IF_PEEK_NEXT: + p += SIZE_RELADDR + 1; num++; break; + case OP_REPEAT: + case OP_REPEAT_NG: + case OP_REPEAT_INC: + case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: + // TODO: support OP_REPEAT opcodes. + return NUM_CACHE_OPCODE_FAIL; + case OP_NULL_CHECK_START: + case OP_NULL_CHECK_END: + case OP_NULL_CHECK_END_MEMST: + case OP_NULL_CHECK_END_MEMST_PUSH: + p += SIZE_MEMNUM; break; + + case OP_PUSH_POS: + case OP_POP_POS: + break; + case OP_PUSH_POS_NOT: + p += SIZE_RELADDR; break; + case OP_FAIL_POS: + break; + case OP_PUSH_STOP_BT: + case OP_POP_STOP_BT: + return NUM_CACHE_OPCODE_FAIL; + case OP_LOOK_BEHIND: + /* GET_LENGTH_INC(len, p); break; */ + return NUM_CACHE_OPCODE_FAIL; + case OP_PUSH_LOOK_BEHIND_NOT: + // Since optimization assumes a string offset does not back, + // we cannot optimize look-behind opcodes. + /* + GET_RELADDR_INC(addr, p); + GET_LENGTH_INC(len, p); + break; + */ + return NUM_CACHE_OPCODE_FAIL; + case OP_FAIL_LOOK_BEHIND_NOT: + return NUM_CACHE_OPCODE_FAIL; + case OP_PUSH_ABSENT_POS: + case OP_ABSENT_END: + break; + case OP_ABSENT: + p += SIZE_RELADDR; break; + + case OP_CALL: + case OP_RETURN: + return NUM_CACHE_OPCODE_FAIL; + + case OP_CONDITION: + return NUM_CACHE_OPCODE_FAIL; + + case OP_STATE_CHECK_PUSH: + case OP_STATE_CHECK_PUSH_OR_JUMP: + case OP_STATE_CHECK: + case OP_STATE_CHECK_ANYCHAR_STAR: + case OP_STATE_CHECK_ANYCHAR_ML_STAR: + return NUM_CACHE_OPCODE_FAIL; + + case OP_SET_OPTION_PUSH: + case OP_SET_OPTION: + p += SIZE_OPTION; + break; + } + } + + return num; +} + +void init_cache_index_table(regex_t* reg, UChar **table) +{ + UChar* pbegin; + UChar* p = reg->p; + UChar* pend = p + reg->used; + LengthType len; + RelAddrType addr; + OnigEncoding enc = reg->enc; + + while (p < pend) { + pbegin = p; + switch (*p++) { + case OP_FINISH: + case OP_END: + break; + + case OP_EXACT1: p++; break; + case OP_EXACT2: p += 2; break; + case OP_EXACT3: p += 3; break; + case OP_EXACT4: p += 4; break; + case OP_EXACT5: p += 5; break; + case OP_EXACTN: + GET_LENGTH_INC(len, p); p += len; break; + case OP_EXACTMB2N1: p += 2; break; + case OP_EXACTMB2N2: p += 4; break; + case OP_EXACTMB2N3: p += 6; break; + case OP_EXACTMB2N: + GET_LENGTH_INC(len, p); p += len * 2; break; + case OP_EXACTMB3N: + GET_LENGTH_INC(len, p); p += len * 3; break; + case OP_EXACTMBN: + { + int mb_len; + GET_LENGTH_INC(mb_len, p); + GET_LENGTH_INC(len, p); + p += mb_len * len; + } + break; + + case OP_EXACT1_IC: + len = enclen(enc, p, pend); p += len; break; + case OP_EXACTN_IC: + GET_LENGTH_INC(len, p); p += len; break; + + case OP_CCLASS: + case OP_CCLASS_NOT: + p += SIZE_BITSET; break; + case OP_CCLASS_MB: + case OP_CCLASS_MB_NOT: + case OP_CCLASS_MIX: + case OP_CCLASS_MIX_NOT: + GET_LENGTH_INC(len, p); p += len; break; + + case OP_ANYCHAR: + case OP_ANYCHAR_ML: + break; + case OP_ANYCHAR_STAR: + case OP_ANYCHAR_ML_STAR: + *table++ = pbegin; break; + case OP_ANYCHAR_STAR_PEEK_NEXT: + case OP_ANYCHAR_ML_STAR_PEEK_NEXT: + p++; + *table++ = pbegin; + break; + + case OP_WORD: + case OP_NOT_WORD: + case OP_WORD_BOUND: + case OP_NOT_WORD_BOUND: + case OP_WORD_BEGIN: + case OP_WORD_END: + break; + + case OP_ASCII_WORD: + case OP_NOT_ASCII_WORD: + case OP_ASCII_WORD_BOUND: + case OP_NOT_ASCII_WORD_BOUND: + case OP_ASCII_WORD_BEGIN: + case OP_ASCII_WORD_END: + break; + + case OP_BEGIN_BUF: + case OP_END_BUF: + case OP_BEGIN_LINE: + case OP_END_LINE: + case OP_SEMI_END_BUF: + case OP_BEGIN_POSITION: + break; + + case OP_BACKREF1: + case OP_BACKREF2: + case OP_BACKREFN: + case OP_BACKREFN_IC: + case OP_BACKREF_MULTI: + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_WITH_LEVEL: + return; + + case OP_MEMORY_START: + case OP_MEMORY_START_PUSH: + case OP_MEMORY_END_PUSH: + case OP_MEMORY_END_PUSH_REC: + case OP_MEMORY_END: + case OP_MEMORY_END_REC: + p += SIZE_MEMNUM; break; + + case OP_KEEP: + break; + + case OP_FAIL: + break; + case OP_JUMP: + GET_RELADDR_INC(addr, p); + break; + case OP_PUSH: + GET_RELADDR_INC(addr, p); + *table++ = pbegin; + break; + case OP_POP: + break; + case OP_PUSH_OR_JUMP_EXACT1: + case OP_PUSH_IF_PEEK_NEXT: + p += SIZE_RELADDR + 1; *table++ = pbegin; break; + case OP_REPEAT: + case OP_REPEAT_NG: + case OP_REPEAT_INC: + case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: + // TODO: support OP_REPEAT opcodes. + return; + case OP_NULL_CHECK_START: + case OP_NULL_CHECK_END: + case OP_NULL_CHECK_END_MEMST: + case OP_NULL_CHECK_END_MEMST_PUSH: + p += SIZE_MEMNUM; break; + + case OP_PUSH_POS: + case OP_POP_POS: + break; + case OP_PUSH_POS_NOT: + p += SIZE_RELADDR; break; + case OP_FAIL_POS: + break; + case OP_PUSH_STOP_BT: + case OP_POP_STOP_BT: + return; + case OP_LOOK_BEHIND: + /* GET_LENGTH_INC(len, p); break; */ + return; + case OP_PUSH_LOOK_BEHIND_NOT: + // Since optimization assumes a string offset does not back, + // we cannot optimize look-behind opcodes. + /* + GET_RELADDR_INC(addr, p); + GET_LENGTH_INC(len, p); + break; + */ + return; + case OP_FAIL_LOOK_BEHIND_NOT: + return; + case OP_PUSH_ABSENT_POS: + case OP_ABSENT_END: + break; + case OP_ABSENT: + p += SIZE_RELADDR; break; + + case OP_CALL: + case OP_RETURN: + return; + + case OP_CONDITION: + return; + + case OP_STATE_CHECK_PUSH: + case OP_STATE_CHECK_PUSH_OR_JUMP: + case OP_STATE_CHECK: + case OP_STATE_CHECK_ANYCHAR_STAR: + case OP_STATE_CHECK_ANYCHAR_ML_STAR: + return; + + case OP_SET_OPTION_PUSH: + case OP_SET_OPTION: + p += SIZE_OPTION; + break; + } + } +} + +int find_cache_index_table(UChar** table, int num_cache_table, UChar* p) +{ + int l = 0, r = num_ca (... truncated) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/