ruby-changes:34190
From: nobu <ko1@a...>
Date: Sat, 31 May 2014 08:58:23 +0900 (JST)
Subject: [ruby-changes:34190] nobu:r46271 (trunk): case-folding.rb: perfect hash for case unfolding2
nobu 2014-05-31 08:58:14 +0900 (Sat, 31 May 2014) New Revision: 46271 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?revision=46271&view=revision Log: case-folding.rb: perfect hash for case unfolding2 * enc/unicode/case-folding.rb (lookup_hash): make perfect hash to lookup case unfolding table 2. Modified files: trunk/ChangeLog trunk/enc/unicode/case-folding.rb trunk/enc/unicode/casefold.h trunk/enc/unicode.c Index: ChangeLog =================================================================== --- ChangeLog (revision 46270) +++ ChangeLog (revision 46271) @@ -1,4 +1,7 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1 -Sat May 31 08:57:58 2014 Nobuyoshi Nakada <nobu@r...> +Sat May 31 08:58:12 2014 Nobuyoshi Nakada <nobu@r...> + + * enc/unicode/case-folding.rb (lookup_hash): make perfect hash to + lookup case unfolding table 2. * enc/unicode/case-folding.rb (lookup_hash): make perfect hash to lookup case unfolding table 1. Index: enc/unicode/casefold.h =================================================================== --- enc/unicode/casefold.h (revision 46270) +++ enc/unicode/casefold.h (revision 46271) @@ -5022,6 +5022,142 @@ static const CaseUnfold_12_Type CaseUnfo https://github.com/ruby/ruby/blob/trunk/enc/unicode/casefold.h#L5022 {{0x0069, 0x0307}, {1, {0x0130}}}, }; +/* C code produced by gperf version 3.0.4 */ +/* Command-line: gperf -7 -k1,2,3,4,5,6 -F,-1 -c -j1 -i1 -t -T -E -C -H onigenc_unicode_CaseUnfold_12_hash -N onigenc_unicode_CaseUnfold_12_lookup */ + +/* maximum key range = 71, duplicates = 0 */ + +#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__cplusplus) || defined(__GNUC_STDC_INLINE__) +inline +#elif defined(__GNUC__) +__inline +#endif +/*ARGSUSED*/ +static unsigned int +onigenc_unicode_CaseUnfold_12_hash(const OnigCodePoint *codes) +{ + static const unsigned char asso_values[] = + { + 3, 58, 54, 57, 56, 16, 8, 2, 43, 82, + 3, 1, 23, 82, 82, 82, 82, 82, 82, 4, + 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, + 82, 82, 52, 51, 50, 49, 48, 47, 46, 45, + 82, 82, 82, 82, 43, 82, 42, 82, 82, 13, + 82, 82, 82, 82, 82, 11, 82, 1, 82, 82, + 14, 82, 1, 82, 82, 31, 3, 82, 82, 30, + 82, 82, 82, 10, 82, 82, 82, 82, 37, 82, + 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, + 82, 82, 82, 82, 82, 82, 37, 15, 36, 35, + 34, 17, 1, 33, 12, 4, 23, 23, 26, 21, + 13, 82, 27, 82, 82, 2, 5, 82, 11, 16, + 82, 15, 82, 82, 23, 82, 8, 82 + }; + return asso_values[bits_at(codes, 5)] + asso_values[bits_at(codes, 4)] + asso_values[bits_at(codes, 3)] + asso_values[bits_at(codes, 2)] + asso_values[bits_at(codes, 1)] + asso_values[bits_at(codes, 0)]; +} + +#ifdef __GNUC__ +__inline +#if defined __GNUC_STDC_INLINE__ || defined __GNUC_GNU_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +static const CodePointList2 * +onigenc_unicode_CaseUnfold_12_lookup(const OnigCodePoint *codes) +{ + enum + { + MIN_CODE_VALUE = 0x61, + MAX_CODE_VALUE = 0x1f7c, + TOTAL_KEYWORDS = 59, + MIN_WORD_LENGTH = 6, + MAX_WORD_LENGTH = 6, + MIN_HASH_VALUE = 11, + MAX_HASH_VALUE = 81 + }; + + static const short wordlist[] = + { + -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, + /*0x1f66,0x03b9*/ 53, + /*0x1f07,0x03b9*/ 38, + /*0x1f00,0x03b9*/ 31, + /*0x0066,0x0066*/ 1, + /*0x1f74,0x03b9*/ 56, + /*0x0073,0x0073*/ 6, + /*0x0066,0x0069*/ 2, + /*0x1f06,0x03b9*/ 37, + /*0x0073,0x0074*/ 7, + /*0x03b9,0x0342*/ 18, + /*0x03c9,0x03b9*/ 23, + /*0x03b7,0x03b9*/ 17, + /*0x0069,0x0307*/ 58, + /*0x03b1,0x03b9*/ 15, + /*0x1f61,0x03b9*/ 48, + /*0x1f05,0x03b9*/ 36, + /*0x1f65,0x03b9*/ 52, + /*0x0574,0x0576*/ 29, + /*0x03c9,0x0342*/ 22, + /*0x03b7,0x0342*/ 16, + /*0x057e,0x0576*/ 30, + /*0x03b1,0x0342*/ 14, + /*0x1f7c,0x03b9*/ 57, + /*0x0574,0x0565*/ 26, + /*0x0079,0x030a*/ 10, + /*0x0077,0x030a*/ 9, + /*0x1f70,0x03b9*/ 55, + /*0x0574,0x056d*/ 28, + /*0x0066,0x006c*/ 3, + /*0x0574,0x056b*/ 27, + /*0x0061,0x02be*/ 0, + /*0x0068,0x0331*/ 4, + /*0x1f67,0x03b9*/ 54, + /*0x1f64,0x03b9*/ 51, + /*0x1f63,0x03b9*/ 50, + /*0x1f62,0x03b9*/ 49, + /*0x1f60,0x03b9*/ 47, + /*0x03ce,0x03b9*/ 24, + /*0x03c5,0x0342*/ 21, + /*0x03c5,0x0313*/ 20, + /*0x03c1,0x0313*/ 19, + /*0x02bc,0x006e*/ 11, + /*0x03ae,0x03b9*/ 13, + /*0x03ac,0x03b9*/ 12, + /*0x1f27,0x03b9*/ 46, + /*0x1f26,0x03b9*/ 45, + /*0x1f25,0x03b9*/ 44, + /*0x1f24,0x03b9*/ 43, + /*0x1f23,0x03b9*/ 42, + /*0x1f22,0x03b9*/ 41, + /*0x1f21,0x03b9*/ 40, + /*0x1f20,0x03b9*/ 39, + /*0x006a,0x030c*/ 5, + /*0x1f02,0x03b9*/ 33, + /*0x0074,0x0308*/ 8, + /*0x1f04,0x03b9*/ 35, + /*0x1f03,0x03b9*/ 34, + /*0x1f01,0x03b9*/ 32, + -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, + /*0x0565,0x0582*/ 25 + }; + + if (codes[0] <= MAX_CODE_VALUE && codes[0] >= MIN_CODE_VALUE && + codes[1] <= MAX_CODE_VALUE && codes[1] >= MIN_CODE_VALUE) + { + register int key = onigenc_unicode_CaseUnfold_12_hash(codes); + + if (key <= MAX_HASH_VALUE && key >= 0) + { + register short s = wordlist[key]; + + if (s >= 0 && code2_equal(codes, CaseUnfold_12_Table[s].from)) + return &CaseUnfold_12_Table[s].to; + } + } + return 0; +} + static const CaseUnfold_13_Type CaseUnfold_13_Table[] = { #define CaseUnfold_13 (*(CaseUnfold_13_Type (*)[14])(CaseUnfold_13_Table+0)) {{0x0066, 0x0066, 0x0069}, {1, {0xfb03}}}, @@ -5040,5 +5176,4 @@ static const CaseUnfold_13_Type CaseUnfo https://github.com/ruby/ruby/blob/trunk/enc/unicode/casefold.h#L5176 {{0x03c9, 0x0342, 0x03b9}, {1, {0x1ff7}}}, }; -#define UNFOLD2_TABLE_SIZE 88 #define UNFOLD3_TABLE_SIZE 23 Index: enc/unicode/case-folding.rb =================================================================== --- enc/unicode/case-folding.rb (revision 46270) +++ enc/unicode/case-folding.rb (revision 46271) @@ -78,12 +78,17 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L78 self end + def range_check(code) + "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE" + end + def lookup_hash(key, type, data) hash = "onigenc_unicode_#{key}_hash" lookup = "onigenc_unicode_#{key}_lookup" - gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}" - argname = "code" - argdecl = "const OnigCodePoint #{argname}" + arity = Array(data[0][0]).size + gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(",")} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}" + argname = arity > 1 ? "codes" : "code" + argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}" n = 7 m = (1 << n) - 1 min, max = data.map {|c, *|c}.flatten.minmax @@ -101,17 +106,21 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L106 src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { name = $1 body = $2 - body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)") + body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)") "#{name}(#{argdecl})\n{\n#{body}}" } src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { name = $1 body = $2 body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") - body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)') + body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})") body.gsub!(/\{"",-1}/, "-1") body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') - body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"} + body.sub!(/(\s+if\s)\(len\b.*\)/) do + "#$1(" << + (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) << + ")" + end v = nil body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { pre = $1 @@ -119,7 +128,7 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L128 s = $3 s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') v = $1 - s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))") + s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))") "#{pre}{#{s}\n#{indent}}" } body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") @@ -149,15 +158,14 @@ class CaseFolding https://github.com/ruby/ruby/blob/trunk/enc/unicode/case-folding.rb#L158 # CaseUnfold_12 + CaseUnfold_12_Locale name = "CaseUnfold_12" - print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) + data = print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) + dest.print lookup_hash(name, "CodePointList2", data) # CaseUnfold_13 name = "CaseUnfold_13" print_table(dest, name, name=>unfold[2]) # table sizes - unfold2_table_size = unfold[1].size + unfold_locale[1].size - dest.printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5)) unfold3_table_size = unfold[2].size dest.printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7)) end Index: enc/unicode.c =================================================================== --- enc/unicode.c (revision 46270) +++ enc/unicode.c (revision 46271) @@ -107,6 +107,12 @@ bits_of(const OnigCodePoint c, const int https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L107 return (c >> (2 - n) * 7) & 127; } +static inline int +bits_at(const OnigCodePoint *c, const int n) +{ + return bits_of(c[n / 3], n % 3); +} + static int code1_equal(const OnigCodePoint x, const OnigCodePoint y) { @@ -114,6 +120,14 @@ code1_equal(const OnigCodePoint x, const https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L120 return 1; } +static int +code2_equal(const OnigCodePoint *x, const OnigCodePoint *y) +{ + if (x[0] != y[0]) return 0; + if (x[1] != y[1]) return 0; + return 1; +} + #include "enc/unicode/casefold.h" #include "enc/unicode/name2ctype.h" @@ -198,27 +212,6 @@ onigenc_unicode_property_name_to_ctype(O https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L212 static int -code2_cmp(st_data_t x0, st_data_t y0) -{ - const OnigCodePoint *x = (const OnigCodePoint *)x0; - const OnigCodePoint *y = (const OnigCodePoint *)y0; - if (x[0] == y[0] && x[1] == y[1]) return 0; - return 1; -} - -static st_index_t -code2_hash(st_data_t x0) -{ - const OnigCodePoint *x = (const OnigCodePoint *)x0; - return (st_index_t )(x[0] + x[1]); -} - -static const struct st_hash_type type_code2_hash = { - code2_cmp, - code2_hash, -}; - -static int code3_cmp(st_data_t x0, st_data_t y0) { const OnigCodePoint *x = (const OnigCodePoint *)x0; @@ -240,7 +233,6 @@ static const struct st_hash_type type_co https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L233 }; -static st_table* Unfold2Table; static st_table* Unfold3Table; static int CaseFoldInited = 0; @@ -250,14 +242,6 @@ static int init_case_fold_table(void) https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L242 THREAD_ATOMIC_START; - Unfold2Table = st_init_table_with_size(&type_code2_hash, UNFOLD2_TABLE_SIZE); - if (ONIG_IS_NULL(Unfold2Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_12_Table); i++) { - const CaseUnfold_12_Type *p2 = &CaseUnfold_12_Table[i]; - st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to)); - } - Unfold3Table = st_init_table_with_size(&type_code3_hash, UNFOLD3_TABLE_SIZE); if (ONIG_IS_NULL(Unfold3Table)) return ONIGERR_MEMORY; @@ -273,16 +257,8 @@ static int init_case_fold_table(void) https://github.com/ruby/ruby/blob/trunk/enc/unicode.c#L257 #define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup #define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup +#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup -static inline const CodePointList2 * -onigenc_unicode_unfold2_lookup(const OnigCodePoint *code) -{ - st_data_t to; - if (onig_st_lookup(Unfold2Table, (st_data_t )code, &to) != 0) { - return (const CodePointList2 *)to; - } - return 0; -} static inline const CodePointList2 * onigenc_unicode_unfold3_lookup(const OnigCodePoint *code) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/