[前][次][番号順一覧][スレッド一覧]

ruby-changes:47582

From: naruse <ko1@a...>
Date: Thu, 31 Aug 2017 15:35:33 +0900 (JST)
Subject: [ruby-changes:47582] naruse:r59698 (trunk): String#each_grapheme_cluster and String#grapheme_clusters

naruse	2017-08-31 15:35:28 +0900 (Thu, 31 Aug 2017)

  New Revision: 59698

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=59698

  Log:
    String#each_grapheme_cluster and String#grapheme_clusters
    
    added to enumerate grapheme clusters [Feature #13780]

  Modified files:
    trunk/NEWS
    trunk/string.c
    trunk/test/ruby/test_string.rb
Index: test/ruby/test_string.rb
===================================================================
--- test/ruby/test_string.rb	(revision 59697)
+++ test/ruby/test_string.rb	(revision 59698)
@@ -885,6 +885,46 @@ CODE https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L885
     end
   end
 
+  def test_each_grapheme_cluster
+    [
+      "\u{20 200d}",
+      "\u{600 600}",
+      "\u{600 20}",
+      "\u{261d 1F3FB}",
+      "\u{1f600}",
+      "\u{20 308}",
+      "\u{1F477 1F3FF 200D 2640 FE0F}",
+      "\u{1F468 200D 1F393}",
+      "\u{1F46F 200D 2642 FE0F}",
+      "\u{1f469 200d 2764 fe0f 200d 1f469}",
+    ].each do |g|
+      assert_equal [g], g.each_grapheme_cluster.to_a
+    end
+
+    assert_equal ["\u000A", "\u0308"], "\u{a 308}".each_grapheme_cluster.to_a
+    assert_equal ["\u000D", "\u0308"], "\u{d 308}".each_grapheme_cluster.to_a
+  end
+
+  def test_grapheme_clusters
+    [
+      "\u{20 200d}",
+      "\u{600 600}",
+      "\u{600 20}",
+      "\u{261d 1F3FB}",
+      "\u{1f600}",
+      "\u{20 308}",
+      "\u{1F477 1F3FF 200D 2640 FE0F}",
+      "\u{1F468 200D 1F393}",
+      "\u{1F46F 200D 2642 FE0F}",
+      "\u{1f469 200d 2764 fe0f 200d 1f469}",
+    ].each do |g|
+      assert_equal [g], g.grapheme_clusters
+    end
+
+    assert_equal ["\u000A", "\u0308"], "\u{a 308}".grapheme_clusters
+    assert_equal ["\u000D", "\u0308"], "\u{d 308}".grapheme_clusters
+  end
+
   def test_each_line
     save = $/
     $/ = "\n"
Index: NEWS
===================================================================
--- NEWS	(revision 59697)
+++ NEWS	(revision 59698)
@@ -94,6 +94,8 @@ with all sufficient information, see the https://github.com/ruby/ruby/blob/trunk/NEWS#L94
   * String#delete_prefix! is added to remove prefix destructively [Feature #12694]
   * String#delete_suffix is added to remove suffix [Feature #13665]
   * String#delete_suffix! is added to remove suffix destructively [Feature #13665]
+  * String#each_grapheme_cluster and String#grapheme_clusters are added to
+    enumerate grapheme clusters [Feature #13780]
 
 * Thread
 
Index: string.c
===================================================================
--- string.c	(revision 59697)
+++ string.c	(revision 59698)
@@ -8081,6 +8081,117 @@ rb_str_codepoints(VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8081
     return rb_str_enumerate_codepoints(str, 1);
 }
 
+static VALUE
+rb_str_enumerate_grapheme_clusters(VALUE str, int wantarray)
+{
+    regex_t *reg_grapheme_cluster = NULL;
+    static regex_t *reg_grapheme_cluster_utf8 = NULL;
+    int encidx = ENCODING_GET(str);
+    rb_encoding *enc = rb_enc_from_index(encidx);
+    int unicode_p = rb_enc_unicode_p(enc);
+    const char *ptr, *end;
+    VALUE ary;
+
+    if (!unicode_p) {
+   return rb_str_enumerate_codepoints(str, wantarray);
+    }
+
+    /* synchronize */
+    if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
+   reg_grapheme_cluster = reg_grapheme_cluster_utf8;
+    }
+    if (!reg_grapheme_cluster) {
+   const OnigUChar source[] = "\\X";
+   int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1,
+       ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL);
+   if (r) {
+       rb_bug("cannot compile grapheme cluster regexp");
+   }
+   if (encidx == rb_utf8_encindex()) {
+       reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
+   }
+    }
+
+    ptr = RSTRING_PTR(str);
+    end = RSTRING_END(str);
+
+    if (rb_block_given_p()) {
+   if (wantarray) {
+#if STRING_ENUMERATORS_WANTARRAY
+       rb_warn("given block not used");
+       ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
+#else
+       rb_warning("passing a block to String#grapheme_clusters is deprecated");
+       wantarray = 0;
+#endif
+   }
+    }
+    else {
+   if (wantarray)
+       ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
+   else
+       return SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
+    }
+
+    while (ptr < end) {
+   VALUE grapheme_cluster;
+   OnigPosition len = onig_match(reg_grapheme_cluster,
+       (const OnigUChar *)ptr, (const OnigUChar *)end,
+       (const OnigUChar *)ptr, NULL, 0);
+   if (len == 0) break;
+   if (len < 0) {
+       break;
+   }
+   grapheme_cluster = rb_enc_str_new(ptr, len, enc);
+   if (wantarray)
+       rb_ary_push(ary, grapheme_cluster);
+   else
+       rb_yield(grapheme_cluster);
+   ptr += len;
+    }
+    if (wantarray)
+   return ary;
+    else
+   return str;
+}
+
+/*
+ *  call-seq:
+ *     str.each_grapheme_cluster {|cstr| block }    -> str
+ *     str.each_grapheme_cluster                    -> an_enumerator
+ *
+ *  Passes each grapheme cluster in <i>str</i> to the given block, or returns
+ *  an enumerator if no block is given.
+ *  Unlike String#each_char, this enumerates by grapheme clusters defined by
+ *  Unicode Standard Annex #29 http://unicode.org/reports/tr29/
+ *
+ *     "a\u0300".each_char.to_a.size #=> 2
+ *     "a\u0300".each_grapheme_cluster.to_a.size #=> 1
+ *
+ */
+
+static VALUE
+rb_str_each_grapheme_cluster(VALUE str)
+{
+    return rb_str_enumerate_grapheme_clusters(str, 0);
+}
+
+/*
+ *  call-seq:
+ *     str.grapheme_clusters   -> an_array
+ *
+ *  Returns an array of grapheme clusters in <i>str</i>.  This is a shorthand
+ *  for <code>str.each_grapheme_cluster.to_a</code>.
+ *
+ *  If a block is given, which is a deprecated form, it works the same as
+ *  <code>each_grapheme_cluster</code>.
+ */
+
+static VALUE
+rb_str_grapheme_clusters(VALUE str)
+{
+    return rb_str_enumerate_grapheme_clusters(str, 1);
+}
 
 static long
 chopped_length(VALUE str)
@@ -10492,6 +10603,7 @@ Init_String(void) https://github.com/ruby/ruby/blob/trunk/string.c#L10603
     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
+    rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
@@ -10547,6 +10659,7 @@ Init_String(void) https://github.com/ruby/ruby/blob/trunk/string.c#L10659
     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
+    rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
 
     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
 

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]