[前][次][番号順一覧][スレッド一覧]

ruby-changes:34190

From: nobu <ko1@a...>
Date: Sat, 31 May 2014 08:58:23 +0900 (JST)
Subject: [ruby-changes:34190] nobu:r46271 (trunk): case-folding.rb: perfect hash for case unfolding2

nobu	2014-05-31 08:58:14 +0900 (Sat, 31 May 2014)

  New Revision: 46271

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?revision=46271&view=revision

  Log:
    case-folding.rb: perfect hash for case unfolding2
    
    * enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
      lookup case unfolding table 2.

  Modified files:
    trunk/ChangeLog
    trunk/enc/unicode/case-folding.rb
    trunk/enc/unicode/casefold.h
    trunk/enc/unicode.c
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 46270)
+++ ChangeLog	(revision 46271)
@@ -1,4 +1,7 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
-Sat May 31 08:57:58 2014  Nobuyoshi Nakada  <nobu@r...>
+Sat May 31 08:58:12 2014  Nobuyoshi Nakada  <nobu@r...>
+
+	* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
+	  lookup case unfolding table 2.
 
 	* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
 	  lookup case unfolding table 1.
Index: enc/unicode/casefold.h
===================================================================
--- enc/unicode/casefold.h	(revision 46270)
+++ enc/unicode/casefold.h	(revision 46271)
@@ -5022,6 +5022,142 @@ static const CaseUnfold_12_Type CaseUnfo https://github.com/ruby/ruby/blob/trunk/enc/unicode/casefold.h#L5022
   {{0x0069, 0x0307}, {1, {0x0130}}},
 };
 
+/* C code produced by gperf version 3.0.4 */
+/* Command-line: gperf -7 -k1,2,3,4,5,6 -F,-1 -c -j1 -i1 -t -T -E -C -H onigenc_unicode_CaseUnfold_12_hash -N onigenc_unicode_CaseUnfold_12_lookup  */
+
+/* maximum key range = 71, duplicates = 0 */
+
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__cplusplus) || defined(__GNUC_STDC_INLINE__)
+inline
+#elif defined(__GNUC__)
+__inline
+#endif
+/*ARGSUSED*/
+static unsigned int
+onigenc_unicode_CaseUnfold_12_hash(const OnigCodePoint *codes)
+{
+  static const unsigned char asso_values[] =
+    {
+       3, 58, 54, 57, 56, 16,  8,  2, 43, 82,
+       3,  1, 23, 82, 82, 82, 82, 82, 82,  4,
+      82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
+      82, 82, 52, 51, 50, 49, 48, 47, 46, 45,
+      82, 82, 82, 82, 43, 82, 42, 82, 82, 13,
+      82, 82, 82, 82, 82, 11, 82,  1, 82, 82,
+      14, 82,  1, 82, 82, 31,  3, 82, 82, 30,
+      82, 82, 82, 10, 82, 82, 82, 82, 37, 82,
+      82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
+      82, 82, 82, 82, 82, 82, 37, 15, 36, 35,
+      34, 17,  1, 33, 12,  4, 23, 23, 26, 21,
+      13, 82, 27, 82, 82,  2,  5, 82, 11, 16,
+      82, 15, 82, 82, 23, 82,  8, 82
+    };
+  return asso_values[bits_at(codes, 5)] + asso_values[bits_at(codes, 4)] + asso_values[bits_at(codes, 3)] + asso_values[bits_at(codes, 2)] + asso_values[bits_at(codes, 1)] + asso_values[bits_at(codes, 0)];
+}
+
+#ifdef __GNUC__
+__inline
+#if defined __GNUC_STDC_INLINE__ || defined __GNUC_GNU_INLINE__
+__attribute__ ((__gnu_inline__))
+#endif
+#endif
+static const CodePointList2 *
+onigenc_unicode_CaseUnfold_12_lookup(const OnigCodePoint *codes)
+{
+  enum
+    {
+      MIN_CODE_VALUE = 0x61,
+      MAX_CODE_VALUE = 0x1f7c,
+      TOTAL_KEYWORDS = 59,
+      MIN_WORD_LENGTH = 6,
+      MAX_WORD_LENGTH = 6,
+      MIN_HASH_VALUE = 11,
+      MAX_HASH_VALUE = 81
+    };
+
+  static const short wordlist[] =
+    {
+      -1, -1, -1, -1, -1, -1,
+      -1, -1, -1, -1, -1,
+      /*0x1f66,0x03b9*/ 53,
+      /*0x1f07,0x03b9*/ 38,
+      /*0x1f00,0x03b9*/ 31,
+      /*0x0066,0x0066*/ 1,
+      /*0x1f74,0x03b9*/ 56,
+      /*0x0073,0x0073*/ 6,
+      /*0x0066,0x0069*/ 2,
+      /*0x1f06,0x03b9*/ 37,
+      /*0x0073,0x0074*/ 7,
+      /*0x03b9,0x0342*/ 18,
+      /*0x03c9,0x03b9*/ 23,
+      /*0x03b7,0x03b9*/ 17,
+      /*0x0069,0x0307*/ 58,
+      /*0x03b1,0x03b9*/ 15,
+      /*0x1f61,0x03b9*/ 48,
+      /*0x1f05,0x03b9*/ 36,
+      /*0x1f65,0x03b9*/ 52,
+      /*0x0574,0x0576*/ 29,
+      /*0x03c9,0x0342*/ 22,
+      /*0x03b7,0x0342*/ 16,
+      /*0x057e,0x0576*/ 30,
+      /*0x03b1,0x0342*/ 14,
+      /*0x1f7c,0x03b9*/ 57,
+      /*0x0574,0x0565*/ 26,
+      /*0x0079,0x030a*/ 10,
+      /*0x0077,0x030a*/ 9,
+      /*0x1f70,0x03b9*/ 55,
+      /*0x0574,0x056d*/ 28,
+      /*0x0066,0x006c*/ 3,
+      /*0x0574,0x056b*/ 27,
+      /*0x0061,0x02be*/ 0,
+      /*0x0068,0x0331*/ 4,
+      /*0x1f67,0x03b9*/ 54,
+      /*0x1f64,0x03b9*/ 51,
+      /*0x1f63,0x03b9*/ 50,
+      /*0x1f62,0x03b9*/ 49,
+      /*0x1f60,0x03b9*/ 47,
+      /*0x03ce,0x03b9*/ 24,
+      /*0x03c5,0x0342*/ 21,
+      /*0x03c5,0x0313*/ 20,
+      /*0x03c1,0x0313*/ 19,
+      /*0x02bc,0x006e*/ 11,
+      /*0x03ae,0x03b9*/ 13,
+      /*0x03ac,0x03b9*/ 12,
+      /*0x1f27,0x03b9*/ 46,
+      /*0x1f26,0x03b9*/ 45,
+      /*0x1f25,0x03b9*/ 44,
+      /*0x1f24,0x03b9*/ 43,
+      /*0x1f23,0x03b9*/ 42,
+      /*0x1f22,0x03b9*/ 41,
+      /*0x1f21,0x03b9*/ 40,
+      /*0x1f20,0x03b9*/ 39,
+      /*0x006a,0x030c*/ 5,
+      /*0x1f02,0x03b9*/ 33,
+      /*0x0074,0x0308*/ 8,
+      /*0x1f04,0x03b9*/ 35,
+      /*0x1f03,0x03b9*/ 34,
+      /*0x1f01,0x03b9*/ 32,
+      -1, -1, -1, -1, -1, -1,
+      -1, -1, -1, -1, -1, -1,
+      /*0x0565,0x0582*/ 25
+    };
+
+  if (codes[0] <= MAX_CODE_VALUE && codes[0] >= MIN_CODE_VALUE &&
+      codes[1] <= MAX_CODE_VALUE && codes[1] >= MIN_CODE_VALUE)
+    {
+      register int key = onigenc_unicode_CaseUnfold_12_hash(codes);
+
+      if (key <= MAX_HASH_VALUE && key >= 0)
+        {
+          register short s = wordlist[key];
+
+          if (s >= 0 && code2_equal(codes, CaseUnfold_12_Table[s].from))
+            return &CaseUnfold_12_Table[s].to;
+        }
+    }
+  return 0;
+}
+
 static const CaseUnfold_13_Type CaseUnfold_13_Table[] = {
 #define CaseUnfold_13 (*(CaseUnfold_13_Type (*)[14])(CaseUnfold_13_Table+0))
   {{0x0066, 0x0066, 0x0069}, {1, {0xfb03}}},
@@ -5040,5 +5176,4 @@ static const CaseUnfold_13_Type CaseUnfo https://github.com/ruby/ruby/blob/trunk/enc/unicode/casefold.h#L5176
   {{0x03c9, 0x0342, 0x03b9}, {1, {0x1ff7}}},
 };
 
-#define UNFOLD2_TABLE_SIZE	88
 #define UNFOLD3_TABLE_SIZE	23
Index: enc/unicode/case-folding.rb
===================================================================
--- enc/unicode/case-folding.rb	(revision 46270)
+++ enc/unicode/case-folding.rb	(revision 46271)
@@ -78,12 +78,17 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L78
     self
   end
 
+  def range_check(code)
+    "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE"
+  end
+
   def lookup_hash(key, type, data)
     hash = "onigenc_unicode_#{key}_hash"
     lookup = "onigenc_unicode_#{key}_lookup"
-    gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
-    argname = "code"
-    argdecl = "const OnigCodePoint #{argname}"
+    arity = Array(data[0][0]).size
+    gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(",")} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
+    argname = arity > 1 ? "codes" : "code"
+    argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}"
     n = 7
     m = (1 << n) - 1
     min, max = data.map {|c, *|c}.flatten.minmax
@@ -101,17 +106,21 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L106
     src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
       name = $1
       body = $2
-      body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
+      body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)")
       "#{name}(#{argdecl})\n{\n#{body}}"
     }
     src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
       name = $1
       body = $2
       body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
-      body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
+      body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})")
       body.gsub!(/\{"",-1}/, "-1")
       body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
-      body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
+      body.sub!(/(\s+if\s)\(len\b.*\)/) do
+        "#$1(" <<
+          (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n      ") : range_check(argname)) <<
+          ")"
+      end
       v = nil
       body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
         pre = $1
@@ -119,7 +128,7 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L128
         s = $3
         s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
         v = $1
-        s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
+        s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))")
         "#{pre}{#{s}\n#{indent}}"
       }
       body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
@@ -149,15 +158,14 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L158
 
     # CaseUnfold_12 + CaseUnfold_12_Locale
     name = "CaseUnfold_12"
-    print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
+    data = print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
+    dest.print lookup_hash(name, "CodePointList2", data)
 
     # CaseUnfold_13
     name = "CaseUnfold_13"
     print_table(dest, name, name=>unfold[2])
 
     # table sizes
-    unfold2_table_size = unfold[1].size + unfold_locale[1].size
-    dest.printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
     unfold3_table_size = unfold[2].size
     dest.printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
   end
Index: enc/unicode.c
===================================================================
--- enc/unicode.c	(revision 46270)
+++ enc/unicode.c	(revision 46271)
@@ -107,6 +107,12 @@ bits_of(const OnigCodePoint c, const int https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L107
   return (c >> (2 - n) * 7) & 127;
 }
 
+static inline int
+bits_at(const OnigCodePoint *c, const int n)
+{
+  return bits_of(c[n / 3], n % 3);
+}
+
 static int
 code1_equal(const OnigCodePoint x, const OnigCodePoint y)
 {
@@ -114,6 +120,14 @@ code1_equal(const OnigCodePoint x, const https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L120
   return 1;
 }
 
+static int
+code2_equal(const OnigCodePoint *x, const OnigCodePoint *y)
+{
+  if (x[0] != y[0]) return 0;
+  if (x[1] != y[1]) return 0;
+  return 1;
+}
+
 #include "enc/unicode/casefold.h"
 
 #include "enc/unicode/name2ctype.h"
@@ -198,27 +212,6 @@ onigenc_unicode_property_name_to_ctype(O https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L212
 
 
 static int
-code2_cmp(st_data_t x0, st_data_t y0)
-{
-  const OnigCodePoint *x = (const OnigCodePoint *)x0;
-  const OnigCodePoint *y = (const OnigCodePoint *)y0;
-  if (x[0] == y[0] && x[1] == y[1]) return 0;
-  return 1;
-}
-
-static st_index_t
-code2_hash(st_data_t x0)
-{
-  const OnigCodePoint *x = (const OnigCodePoint *)x0;
-  return (st_index_t )(x[0] + x[1]);
-}
-
-static const struct st_hash_type type_code2_hash = {
-  code2_cmp,
-  code2_hash,
-};
-
-static int
 code3_cmp(st_data_t x0, st_data_t y0)
 {
   const OnigCodePoint *x = (const OnigCodePoint *)x0;
@@ -240,7 +233,6 @@ static const struct st_hash_type type_co https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L233
 };
 
 
-static st_table* Unfold2Table;
 static st_table* Unfold3Table;
 static int CaseFoldInited = 0;
 
@@ -250,14 +242,6 @@ static int init_case_fold_table(void) https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L242
 
   THREAD_ATOMIC_START;
 
-  Unfold2Table = st_init_table_with_size(&type_code2_hash, UNFOLD2_TABLE_SIZE);
-  if (ONIG_IS_NULL(Unfold2Table)) return ONIGERR_MEMORY;
-
-  for (i = 0; i < numberof(CaseUnfold_12_Table); i++) {
-    const CaseUnfold_12_Type *p2 = &CaseUnfold_12_Table[i];
-    st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to));
-  }
-
   Unfold3Table = st_init_table_with_size(&type_code3_hash, UNFOLD3_TABLE_SIZE);
   if (ONIG_IS_NULL(Unfold3Table)) return ONIGERR_MEMORY;
 
@@ -273,16 +257,8 @@ static int init_case_fold_table(void) https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L257
 
 #define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
 #define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup
+#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup
 
-static inline const CodePointList2 *
-onigenc_unicode_unfold2_lookup(const OnigCodePoint *code)
-{
-  st_data_t to;
-  if (onig_st_lookup(Unfold2Table, (st_data_t )code, &to) != 0) {
-    return (const CodePointList2 *)to;
-  }
-  return 0;
-}
 
 static inline const CodePointList2 *
 onigenc_unicode_unfold3_lookup(const OnigCodePoint *code)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]