[前][次][番号順一覧][スレッド一覧]

ruby-changes:74411

From: TSUYUSATO <ko1@a...>
Date: Wed, 9 Nov 2022 23:21:58 +0900 (JST)
Subject: [ruby-changes:74411] 881bf9a0b8 (master): Implement cache optimization for regexp matching

https://git.ruby-lang.org/ruby.git/commit/?id=881bf9a0b8

From 881bf9a0b8b52c05a5917b95d988ae4b9a391a47 Mon Sep 17 00:00:00 2001
From: TSUYUSATO Kitsune <make.just.on@g...>
Date: Mon, 3 Oct 2022 22:21:56 +0900
Subject: Implement cache optimization for regexp matching

---
 regexec.c | 469 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 regint.h  |   8 ++
 2 files changed, 476 insertions(+), 1 deletion(-)

diff --git a/regexec.c b/regexec.c
index c77d48b1d9..db3881e18a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -231,6 +231,404 @@ onig_get_capture_tree(OnigRegion* region) https://github.com/ruby/ruby/blob/trunk/regexec.c#L231
 }
 #endif /* USE_CAPTURE_HISTORY */
 
+#ifdef USE_CACHE_MATCH_OPT
+
+/* count number of jump-like opcodes for allocation of cache memory. */
+/* return -1 if we cannot optimize the regex matching by using cache. */
+int count_num_cache_opcode(regex_t* reg)
+{
+  int num = 0;
+  UChar* pbegin;
+  UChar* p = reg->p;
+  UChar* pend = p + reg->used;
+  LengthType len;
+  RelAddrType addr;
+  OnigEncoding enc = reg->enc;
+
+  while (p < pend) {
+    pbegin = p;
+    switch (*p++) {
+      case OP_FINISH:
+      case OP_END:
+	break;
+
+      case OP_EXACT1: p++; break;
+      case OP_EXACT2: p += 2; break;
+      case OP_EXACT3: p += 3; break;
+      case OP_EXACT4: p += 4; break;
+      case OP_EXACT5: p += 5; break;
+      case OP_EXACTN:
+        GET_LENGTH_INC(len, p); p += len; break;
+      case OP_EXACTMB2N1: p += 2; break;
+      case OP_EXACTMB2N2: p += 4; break;
+      case OP_EXACTMB2N3: p += 6; break;
+      case OP_EXACTMB2N:
+	GET_LENGTH_INC(len, p); p += len * 2; break;
+      case OP_EXACTMB3N:
+	GET_LENGTH_INC(len, p); p += len * 3; break;
+      case OP_EXACTMBN:
+	{
+	  int mb_len;
+	  GET_LENGTH_INC(mb_len, p);
+	  GET_LENGTH_INC(len, p);
+	  p += mb_len * len;
+	}
+        break;
+
+      case OP_EXACT1_IC:
+	len = enclen(enc, p, pend); p += len; break;
+      case OP_EXACTN_IC:
+	GET_LENGTH_INC(len, p); p += len; break;
+
+      case OP_CCLASS:
+      case OP_CCLASS_NOT:
+        p += SIZE_BITSET; break;
+      case OP_CCLASS_MB:
+      case OP_CCLASS_MB_NOT:
+      case OP_CCLASS_MIX:
+      case OP_CCLASS_MIX_NOT:
+	GET_LENGTH_INC(len, p); p += len; break;
+
+      case OP_ANYCHAR:
+      case OP_ANYCHAR_ML:
+	break;
+      case OP_ANYCHAR_STAR:
+      case OP_ANYCHAR_ML_STAR:
+	num++; break;
+      case OP_ANYCHAR_STAR_PEEK_NEXT:
+      case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
+	p++; num++; break;
+
+      case OP_WORD:
+      case OP_NOT_WORD:
+      case OP_WORD_BOUND:
+      case OP_NOT_WORD_BOUND:
+      case OP_WORD_BEGIN:
+      case OP_WORD_END:
+	break;
+
+      case OP_ASCII_WORD:
+      case OP_NOT_ASCII_WORD:
+      case OP_ASCII_WORD_BOUND:
+      case OP_NOT_ASCII_WORD_BOUND:
+      case OP_ASCII_WORD_BEGIN:
+      case OP_ASCII_WORD_END:
+	break;
+
+      case OP_BEGIN_BUF:
+      case OP_END_BUF:
+      case OP_BEGIN_LINE:
+      case OP_END_LINE:
+      case OP_SEMI_END_BUF:
+      case OP_BEGIN_POSITION:
+	break;
+
+      case OP_BACKREF1:
+      case OP_BACKREF2:
+      case OP_BACKREFN:
+      case OP_BACKREFN_IC:
+      case OP_BACKREF_MULTI:
+      case OP_BACKREF_MULTI_IC:
+      case OP_BACKREF_WITH_LEVEL:
+	return NUM_CACHE_OPCODE_FAIL;
+
+      case OP_MEMORY_START:
+      case OP_MEMORY_START_PUSH:
+      case OP_MEMORY_END_PUSH:
+      case OP_MEMORY_END_PUSH_REC:
+      case OP_MEMORY_END:
+      case OP_MEMORY_END_REC:
+	p += SIZE_MEMNUM; break;
+
+      case OP_KEEP:
+	break;
+
+      case OP_FAIL:
+	break;
+      case OP_JUMP:
+	GET_RELADDR_INC(addr, p);
+	break;
+      case OP_PUSH:
+	GET_RELADDR_INC(addr, p);
+	num++;
+	break;
+      case OP_POP:
+	break;
+      case OP_PUSH_OR_JUMP_EXACT1:
+      case OP_PUSH_IF_PEEK_NEXT:
+	p += SIZE_RELADDR + 1; num++; break;
+      case OP_REPEAT:
+      case OP_REPEAT_NG:
+      case OP_REPEAT_INC:
+      case OP_REPEAT_INC_NG:
+      case OP_REPEAT_INC_SG:
+      case OP_REPEAT_INC_NG_SG:
+	// TODO: support OP_REPEAT opcodes.
+	return NUM_CACHE_OPCODE_FAIL;
+      case OP_NULL_CHECK_START:
+      case OP_NULL_CHECK_END:
+      case OP_NULL_CHECK_END_MEMST:
+      case OP_NULL_CHECK_END_MEMST_PUSH:
+	p += SIZE_MEMNUM; break;
+
+      case OP_PUSH_POS:
+      case OP_POP_POS:
+	break;
+      case OP_PUSH_POS_NOT:
+	p += SIZE_RELADDR; break;
+      case OP_FAIL_POS:
+	break;
+      case OP_PUSH_STOP_BT:
+      case OP_POP_STOP_BT:
+	return NUM_CACHE_OPCODE_FAIL;
+      case OP_LOOK_BEHIND:
+	/* GET_LENGTH_INC(len, p); break; */
+	return NUM_CACHE_OPCODE_FAIL;
+      case OP_PUSH_LOOK_BEHIND_NOT:
+	// Since optimization assumes a string offset does not back,
+	// we cannot optimize look-behind opcodes.
+	/*
+	  GET_RELADDR_INC(addr, p);
+	  GET_LENGTH_INC(len, p);
+	  break;
+	*/
+	return NUM_CACHE_OPCODE_FAIL;
+      case OP_FAIL_LOOK_BEHIND_NOT:
+	return NUM_CACHE_OPCODE_FAIL;
+      case OP_PUSH_ABSENT_POS:
+      case OP_ABSENT_END:
+	break;
+      case OP_ABSENT:
+	p += SIZE_RELADDR; break;
+
+      case OP_CALL:
+      case OP_RETURN:
+	return NUM_CACHE_OPCODE_FAIL;
+
+      case OP_CONDITION:
+	return NUM_CACHE_OPCODE_FAIL;
+
+      case OP_STATE_CHECK_PUSH:
+      case OP_STATE_CHECK_PUSH_OR_JUMP:
+      case OP_STATE_CHECK:
+      case OP_STATE_CHECK_ANYCHAR_STAR:
+      case OP_STATE_CHECK_ANYCHAR_ML_STAR:
+	return NUM_CACHE_OPCODE_FAIL;
+
+      case OP_SET_OPTION_PUSH:
+      case OP_SET_OPTION:
+	p += SIZE_OPTION;
+	break;
+    }
+  }
+
+  return num;
+}
+
+void init_cache_index_table(regex_t* reg, UChar **table)
+{
+  UChar* pbegin;
+  UChar* p = reg->p;
+  UChar* pend = p + reg->used;
+  LengthType len;
+  RelAddrType addr;
+  OnigEncoding enc = reg->enc;
+
+  while (p < pend) {
+    pbegin = p;
+    switch (*p++) {
+      case OP_FINISH:
+      case OP_END:
+	break;
+
+      case OP_EXACT1: p++; break;
+      case OP_EXACT2: p += 2; break;
+      case OP_EXACT3: p += 3; break;
+      case OP_EXACT4: p += 4; break;
+      case OP_EXACT5: p += 5; break;
+      case OP_EXACTN:
+        GET_LENGTH_INC(len, p); p += len; break;
+      case OP_EXACTMB2N1: p += 2; break;
+      case OP_EXACTMB2N2: p += 4; break;
+      case OP_EXACTMB2N3: p += 6; break;
+      case OP_EXACTMB2N:
+	GET_LENGTH_INC(len, p); p += len * 2; break;
+      case OP_EXACTMB3N:
+	GET_LENGTH_INC(len, p); p += len * 3; break;
+      case OP_EXACTMBN:
+	{
+	  int mb_len;
+	  GET_LENGTH_INC(mb_len, p);
+	  GET_LENGTH_INC(len, p);
+	  p += mb_len * len;
+	}
+        break;
+
+      case OP_EXACT1_IC:
+	len = enclen(enc, p, pend); p += len; break;
+      case OP_EXACTN_IC:
+	GET_LENGTH_INC(len, p); p += len; break;
+
+      case OP_CCLASS:
+      case OP_CCLASS_NOT:
+        p += SIZE_BITSET; break;
+      case OP_CCLASS_MB:
+      case OP_CCLASS_MB_NOT:
+      case OP_CCLASS_MIX:
+      case OP_CCLASS_MIX_NOT:
+	GET_LENGTH_INC(len, p); p += len; break;
+
+      case OP_ANYCHAR:
+      case OP_ANYCHAR_ML:
+	break;
+      case OP_ANYCHAR_STAR:
+      case OP_ANYCHAR_ML_STAR:
+	*table++ = pbegin; break;
+      case OP_ANYCHAR_STAR_PEEK_NEXT:
+      case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
+	p++;
+	*table++ = pbegin;
+	break;
+
+      case OP_WORD:
+      case OP_NOT_WORD:
+      case OP_WORD_BOUND:
+      case OP_NOT_WORD_BOUND:
+      case OP_WORD_BEGIN:
+      case OP_WORD_END:
+	break;
+
+      case OP_ASCII_WORD:
+      case OP_NOT_ASCII_WORD:
+      case OP_ASCII_WORD_BOUND:
+      case OP_NOT_ASCII_WORD_BOUND:
+      case OP_ASCII_WORD_BEGIN:
+      case OP_ASCII_WORD_END:
+	break;
+
+      case OP_BEGIN_BUF:
+      case OP_END_BUF:
+      case OP_BEGIN_LINE:
+      case OP_END_LINE:
+      case OP_SEMI_END_BUF:
+      case OP_BEGIN_POSITION:
+	break;
+
+      case OP_BACKREF1:
+      case OP_BACKREF2:
+      case OP_BACKREFN:
+      case OP_BACKREFN_IC:
+      case OP_BACKREF_MULTI:
+      case OP_BACKREF_MULTI_IC:
+      case OP_BACKREF_WITH_LEVEL:
+	return;
+
+      case OP_MEMORY_START:
+      case OP_MEMORY_START_PUSH:
+      case OP_MEMORY_END_PUSH:
+      case OP_MEMORY_END_PUSH_REC:
+      case OP_MEMORY_END:
+      case OP_MEMORY_END_REC:
+	p += SIZE_MEMNUM; break;
+
+      case OP_KEEP:
+	break;
+
+      case OP_FAIL:
+	break;
+      case OP_JUMP:
+	GET_RELADDR_INC(addr, p);
+	break;
+      case OP_PUSH:
+	GET_RELADDR_INC(addr, p);
+	*table++ = pbegin;
+	break;
+      case OP_POP:
+	break;
+      case OP_PUSH_OR_JUMP_EXACT1:
+      case OP_PUSH_IF_PEEK_NEXT:
+	p += SIZE_RELADDR + 1; *table++ = pbegin; break;
+      case OP_REPEAT:
+      case OP_REPEAT_NG:
+      case OP_REPEAT_INC:
+      case OP_REPEAT_INC_NG:
+      case OP_REPEAT_INC_SG:
+      case OP_REPEAT_INC_NG_SG:
+	// TODO: support OP_REPEAT opcodes.
+	return;
+      case OP_NULL_CHECK_START:
+      case OP_NULL_CHECK_END:
+      case OP_NULL_CHECK_END_MEMST:
+      case OP_NULL_CHECK_END_MEMST_PUSH:
+	p += SIZE_MEMNUM; break;
+
+      case OP_PUSH_POS:
+      case OP_POP_POS:
+	break;
+      case OP_PUSH_POS_NOT:
+	p += SIZE_RELADDR; break;
+      case OP_FAIL_POS:
+	break;
+      case OP_PUSH_STOP_BT:
+      case OP_POP_STOP_BT:
+	return;
+      case OP_LOOK_BEHIND:
+	/* GET_LENGTH_INC(len, p); break; */
+	return;
+      case OP_PUSH_LOOK_BEHIND_NOT:
+	// Since optimization assumes a string offset does not back,
+	// we cannot optimize look-behind opcodes.
+	/*
+	  GET_RELADDR_INC(addr, p);
+	  GET_LENGTH_INC(len, p);
+	  break;
+	*/
+	return;
+      case OP_FAIL_LOOK_BEHIND_NOT:
+	return;
+      case OP_PUSH_ABSENT_POS:
+      case OP_ABSENT_END:
+	break;
+      case OP_ABSENT:
+	p += SIZE_RELADDR; break;
+
+      case OP_CALL:
+      case OP_RETURN:
+	return;
+
+      case OP_CONDITION:
+	return;
+
+      case OP_STATE_CHECK_PUSH:
+      case OP_STATE_CHECK_PUSH_OR_JUMP:
+      case OP_STATE_CHECK:
+      case OP_STATE_CHECK_ANYCHAR_STAR:
+      case OP_STATE_CHECK_ANYCHAR_ML_STAR:
+	return;
+
+      case OP_SET_OPTION_PUSH:
+      case OP_SET_OPTION:
+	p += SIZE_OPTION;
+	break;
+    }
+  }
+}
+
+int find_cache_index_table(UChar** table, int num_cache_table, UChar* p)
+{
+  int l = 0, r = num_ca (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]