[前][次][番号順一覧][スレッド一覧]

ruby-changes:54048

From: duerst <ko1@a...>
Date: Fri, 7 Dec 2018 16:04:06 +0900 (JST)
Subject: [ruby-changes:54048] duerst:r66267 (trunk): remove code duplication and put everything into forward order

duerst	2018-12-07 16:04:00 +0900 (Fri, 07 Dec 2018)

  New Revision: 66267

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=66267

  Log:
    remove code duplication and put everything into forward order
    
    In file regparse.c, in function node_extended_grapheme_cluster(),
    eliminate the duplicated code for CRLF and '.' (any character). This
    relies on the fact that, for both Unicode and non-Unicode encodings,
    the first alternative is CRLF and the last alternative is '.' (any
    character). This also puts all of the pieces into forward order
    (the order of the code follows the order of the syntax
    definition).

  Modified files:
    trunk/regparse.c
Index: regparse.c
===================================================================
--- regparse.c	(revision 66266)
+++ regparse.c	(revision 66267)
@@ -5807,17 +5807,17 @@ create_node_from_array(int kind, Node ** https://github.com/ruby/ruby/blob/trunk/regparse.c#L5807
  *
  * Target       Array name          Index
  *
- *              node_array          0 1 2 3 4 5 6 7 8 9 A B C D E
- * top_alts     alts[4]             0 1 2 3*
- * alts+1       list[4]                 0 1 2 3*
- * list+1       core_alts[7]                0 1 2 3 4 5 6*
- * core_alts+0  H_list[4]                     0 1 2 3*
- * H_list+1     H_alt2[4]                         0 1 2 3*
- * h_alt2+1     H_list2[3]                            0 1 2*
- * core_alts+4  XP_list[4]                            0 1 2 3*
- * XP_list+1    Ex_list[4]                                0 1 2 3*
+ *              node_array          0 1 2 3 4 5 6 7 8 9 A B C D E F
+ * top_alts     alts[5]             0 1 2 3 4*
+ * alts+1       list[4]                   0 1 2 3*
+ * list+1       core_alts[7]                  0 1 2 3 4 5 6*
+ * core_alts+0  H_list[4]                       0 1 2 3*
+ * H_list+1     H_alt2[4]                           0 1 2 3*
+ * h_alt2+1     H_list2[3]                              0 1 2*
+ * core_alts+4  XP_list[4]                              0 1 2 3*
+ * XP_list+1    Ex_list[4]                                  0 1 2 3*
  */
-#define NODE_COMMON_SIZE 15
+#define NODE_COMMON_SIZE 16
 
 static int
 node_extended_grapheme_cluster(Node** np, ScanEnv* env)
@@ -5828,208 +5828,193 @@ node_extended_grapheme_cluster(Node** np https://github.com/ruby/ruby/blob/trunk/regparse.c#L5828
   int r = 0;
   int num1;
   int i;
+  int any_target_position;
   UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
   OnigOptionType option;
-  /* node_array is function-global so that we can free all nodes
+  /* node_common is function-global so that we can free all nodes
    * in case of error. Unused slots are set to NULL_NODE at all times. */
   Node *node_common[NODE_COMMON_SIZE];
+  Node **alts = node_common+0; /* size: 5 */
+
+  for (i=0; i<NODE_COMMON_SIZE; i++)
+    node_common[i] = NULL_NODE;
+
+  /* CRLF, common for both Unicode and non-Unicode */
+  /* \x0D\x0A */
+  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
+  if (r < 0) goto err;
+  num1 = r;
+  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
+  if (r < 0) goto err;
+  alts[0] = node_new_str_raw(buf, buf + num1 + r);
+  if (IS_NULL(alts[0])) goto err;
 
 #ifdef USE_UNICODE_PROPERTIES
   if (ONIGENC_IS_UNICODE(env->enc)) {  /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
     CClassNode* cc;
 
-    for (i=0; i<NODE_COMMON_SIZE; i++)
-      node_common[i] = NULL_NODE;
-
     if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err;
     /* Unicode 11.0.0
-     * CRLF     (this is added last because it is common with non-Unicode encodings)
+     *   CRLF     (already done)
      * | [Control CR LF]
      * | precore* core postcore*
      * | .      (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */
+
+    /* [Control CR LF]    (CR and LF are not in the spec, but this is a conformed fix) */
+    alts[1] = node_new_cclass();
+    if (IS_NULL(alts[1])) goto err;
+    cc = NCCLASS(alts[1]);
+    R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
+    if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
+      R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
+      R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
+    }
+    else {
+      BITSET_SET_BIT(cc->bs, 0x0a);
+      BITSET_SET_BIT(cc->bs, 0x0d);
+    }
+
+    /* precore* core postcore* */
     {
-      Node **alts = node_common+0; /* size: 4 */
+      Node **list = alts + 3; /* size: 4 */
 
-      /* [Control CR LF]    (CR and LF are not in the spec, but this is a conformed fix) */
-      alts[0] = node_new_cclass();
-      if (IS_NULL(alts[0])) goto err;
-      cc = NCCLASS(alts[0]);
-      R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
-      if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
-        R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
-        R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
-      }
-      else {
-        BITSET_SET_BIT(cc->bs, 0x0a);
-        BITSET_SET_BIT(cc->bs, 0x0d);
-      }
+      /* precore*; precore := Prepend */
+      R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
 
-      /* precore* core postcore* */
+      /* core := hangul-syllable
+       *       | ri-sequence
+       *       | xpicto-sequence
+       *       | [^Control CR LF] */
       {
-        Node **list = alts + 2; /* size: 4 */
+        Node **core_alts = list + 2; /* size: 7 */
 
-        /* precore*; precore := Prepend */
-        R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
+        /* hangul-syllable :=
+         *     L* (V+ | LV V* | LVT) T*
+         *   | L+
+         *   | T+ */
+        /* hangul-syllable is an alternative (would be called H_alt)
+         * inside an alternative, but we flatten it into core_alts */
 
-        /* core := hangul-syllable
-         *       | ri-sequence
-         *       | xpicto-sequence
-         *       | [^Control CR LF] */
+        /* L* (V+ | LV V* | LVT) T* */
         {
-          Node **core_alts = list + 2; /* size: 7 */
-
-          /* hangul-syllable :=
-           *     L* (V+ | LV V* | LVT) T*
-           *   | L+
-           *   | T+ */
-          /* hangul-syllable is an alternative (would be called H_alt)
-           * inside an alternative, but we flatten it into core_alts */
+          Node **H_list = core_alts + 1; /* size: 4 */
+          R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*'));
 
-          /* L* (V+ | LV V* | LVT) T* */
+          /* V+ | LV V* | LVT */
           {
-            Node **H_list = core_alts + 1; /* size: 4 */
-            R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*'));
+            Node **H_alt2 = H_list + 2; /* size: 4 */
+            R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
 
-            /* V+ | LV V* | LVT */
+            /* LV V* */
             {
-              Node **H_alt2 = H_list + 2; /* size: 4 */
-              R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
+              Node **H_list2 = H_alt2 + 2; /* size: 3 */
 
-              /* LV V* */
-              {
-                Node **H_list2 = H_alt2 + 2; /* size: 3 */
-
-                R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV"));
-                R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*'));
-                R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2));
-              }
-
-              R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
-              R_ERR(create_node_from_array(ALT, H_list+1, H_alt2));
+              R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV"));
+              R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*'));
+              R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2));
             }
 
-            R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*'));
-            R_ERR(create_node_from_array(LIST, core_alts+0, H_list));
+            R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
+            R_ERR(create_node_from_array(ALT, H_list+1, H_alt2));
           }
 
-          R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
-          R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
-          /* end of hangul-syllable */
-
-          /* ri-sequence := RI RI */
-          R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
+          R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*'));
+          R_ERR(create_node_from_array(LIST, core_alts+0, H_list));
+        }
 
-          /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
-          {
-            Node **XP_list = core_alts + 5; /* size: 3 */
-            R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic"));
+        R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
+        R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
+        /* end of hangul-syllable */
 
-            /* (Extend* ZWJ \p{Extended_Pictographic})* */
-            {
-              Node **Ex_list = XP_list + 2; /* size: 4 */
-              /* assert(Ex_list+4 <= node_common+NODE_COMMON_SIZE) */
-              R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*'));
-
-              /* ZWJ (ZERO WIDTH JOINER) */
-              r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
-              if (r < 0) goto err;
-              Ex_list[1] = node_new_str_raw(buf, buf + r);
-              if (IS_NULL(Ex_list[1])) goto err;
+        /* ri-sequence := RI RI */
+        R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
 
-              R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic"));
-              R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list));
-            }
-            R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
+        /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
+        {
+          Node **XP_list = core_alts + 5; /* size: 3 */
+          R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic"));
 
-            R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
-          }
+          /* (Extend* ZWJ \p{Extended_Pictographic})* */
+          {
+            Node **Ex_list = XP_list + 2; /* size: 4 */
+            if (!(Ex_list+4 == node_common+NODE_COMMON_SIZE)) exit(1);
+            R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*'));
+
+            /* ZWJ (ZERO WIDTH JOINER) */
+            r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
+            if (r < 0) goto err;
+            Ex_list[1] = node_new_str_raw(buf, buf + r);
+            if (IS_NULL(Ex_list[1])) goto err;
 
-          /* [^Control CR LF] */
-          core_alts[5] = node_new_cclass();
-          if (IS_NULL(core_alts[5])) goto err;
-          cc = NCCLASS(core_alts[5]);
-          if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
-            BBuf *inverted_buf = NULL;
-
-            /* Start with a positive buffer and invert at the end.
-             * Otherwise, adding single-character ranges work the wrong way. */
-            R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
-            R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
-            R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
-            R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
-            cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
-          }
-          else {
-            R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
-            BITSET_CLEAR_BIT(cc->bs, 0x0a);
-            BITSET_CLEAR_BIT(cc->bs, 0x0d);
+            R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic"));
+            R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list));
           }
+          R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
 
-          R_ERR(create_node_from_array(ALT, list+1, core_alts));
+          R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
         }
 
-        /* postcore*; postcore = [Extend ZWJ SpacingMark] */
-        R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend"));
-        cc = NCCLASS(list[2]);
-        R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
-        R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
-        R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE));
+        /* [^Control CR LF] */
+        core_alts[5] = node_new_cclass();
+        if (IS_NULL(core_alts[5])) goto err;
+        cc = NCCLASS(core_alts[5]);
+        if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
+          BBuf *inverted_buf = NULL;
+
+          /* Start with a positive buffer and invert at the end.
+           * Otherwise, adding single-character ranges work the wrong way. */
+          R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
+          R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
+          R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
+          R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
+          cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
+        }
+        else {
+          R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
+          BITSET_CLEAR_BIT(cc->bs, 0x0a);
+          BITSET_CLEAR_BIT(cc->bs, 0x0d);
+        }
 
-        R_ERR(create_node_from_array(LIST, alts+1, list));
+        R_ERR(create_node_from_array(ALT, list+1, core_alts));
       }
 
-      /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
-      /* Not in Unicode spec (UAX #29), but added to catch invalid stuff,
-       * because this is Ruby spec for String#grapheme_clusters. */
-      np1 = node_new_anychar();
-      if (IS_NULL(np1)) goto err;
-
-      option = env->option;
-      ONOFF(option, ONIG_OPTION_MULTILINE, 0);
-      tmp = node_new_option(option);
-      if (IS_NULL(tmp)) goto err;
-      NENCLOSE(tmp)->target = np1;
-      alts[2] = tmp;
+      /* postcore*; postcore = [Extend ZWJ SpacingMark] */
+      R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend"));
+      cc = NCCLASS(list[2]);
+      R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
+      R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
+      R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE));
 
-      R_ERR(create_node_from_array(ALT, &top_alt, alts));
+      R_ERR(create_node_from_array(LIST, alts+2, list));
     }
+
+    any_target_position = 3;
   }
   else
 #endif /* USE_UNICODE_PROPERTIES */
   {
-    /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
-    np1 = node_new_anychar();
-    if (IS_NULL(np1)) goto err;
-
-    option = env->option;
-    ONOFF(option, ONIG_OPTION_MULTILINE, 0);
-    tmp = node_new_option(option);
-    if (IS_NULL(tmp)) goto err;
-    NENCLOSE(tmp)->target = np1;
-    np1 = tmp;
-
-    top_alt = onig_node_new_alt(np1, NULL_NODE);
-    if (IS_NULL(top_alt)) goto err;
-    np1 = NULL;
+    any_target_position = 1;
   }
 
-  /* add in CRLF to complete (CRLF | Control | precore* core postcore* | .) */
-  /* \x0D\x0A */
-  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
-  if (r < 0) goto err;
-  num1 = r;
-  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
-  if (r < 0) goto err;
-  np1 = node_new_str_raw(buf, buf + num1 + r);
+  /* PerlSyntax: (?s:.), RubySyntax: (?m:.), common for both Unicode and non-Unicode */
+  /* Not in Unicode spec (UAX #29), but added to catch invalid stuff,
+   * because this is Ruby spec for String#grapheme_clusters. */
+  np1 = node_new_anychar();
   if (IS_NULL(np1)) goto err;
 
-  tmp = onig_node_new_alt(np1, top_alt);
+  option = env->option;
+  ONOFF(option, ONIG_OPTION_MULTILINE, 0);
+  tmp = node_new_option(option);
   if (IS_NULL(tmp)) goto err;
-  top_alt = tmp;
+  NENCLOSE(tmp)->target = np1;
+  alts[any_target_position] = tmp;
   np1 = NULL;
 
-  /* (?>): For efficiency, because there is nothing that isn't in a grapheme cluster,
-           and there is only one way to split a string into grapheme clusters. */
+  R_ERR(create_node_from_array(ALT, &top_alt, alts));
+
+  /* (?>): For efficiency, because there is no text piece
+   *       that is not in a grapheme cluster, and there is only one way
+   *       to split a string into grapheme clusters. */
   tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
   if (IS_NULL(tmp)) goto err;
   NENCLOSE(tmp)->target = top_alt;

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]