ruby-changes:13495

naruse	2009-10-09 03:07:08 +0900 (Fri, 09 Oct 2009)

  New Revision: 25271

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=25271

  Log:
    * tool/enc-unicode.rb: optimized.
    
    * enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
      enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
      U+100000-U+10FFFD is assigned, not Cn.

  Modified files:
    trunk/ChangeLog
    trunk/enc/unicode/name2ctype.h.blt
    trunk/enc/unicode/name2ctype.kwd
    trunk/enc/unicode/name2ctype.src
    trunk/tool/enc-unicode.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 25270)
+++ ChangeLog	(revision 25271)
@@ -1,3 +1,11 @@
+Fri Oct  9 02:58:18 2009  NARUSE, Yui  <naruse@r...>
+
+	* tool/enc-unicode.rb: optimized.
+
+	* enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
+	  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
+	  U+100000-U+10FFFD is assigned, not Cn.
+
 Fri Oct  9 02:12:02 2009  Marc-Andre Lafortune  <ruby-core@m...>
 
 	* ext/curses/curses.c: Many functions of module Curses could cause a
Index: enc/unicode/name2ctype.kwd
===================================================================
--- enc/unicode/name2ctype.kwd	(revision 25270)
+++ enc/unicode/name2ctype.kwd	(revision 25271)
@@ -3923,7 +3923,7 @@
 
 /* 'Assigned': - */
 static const OnigCodePoint CR_Assigned[] = {
-	484,
+	485,
 	0x0000, 0x0377,
 	0x037a, 0x037e,
 	0x0384, 0x038a,
@@ -4408,6 +4408,7 @@
 	0xe0020, 0xe007f,
 	0xe0100, 0xe01ef,
 	0xf0000, 0xffffd,
+	0x100000, 0x10fffd,
 }; /* CR_Assigned */
 
 /* 'C': Major Category */
@@ -4464,7 +4465,7 @@
 
 /* 'Cn': General Category */
 static const OnigCodePoint CR_Cn[] = {
-	484,
+	485,
 	0x0378, 0x0379,
 	0x037f, 0x0383,
 	0x038b, 0x038b,
@@ -4948,7 +4949,8 @@
 	0xe0002, 0xe001f,
 	0xe0080, 0xe00ff,
 	0xe01f0, 0xeffff,
-	0xffffe, 0x10ffff,
+	0xffffe, 0xfffff,
+	0x10fffe, 0x10ffff,
 }; /* CR_Cn */
 
 /* 'Co': General Category */
Index: enc/unicode/name2ctype.h.blt
===================================================================
--- enc/unicode/name2ctype.h.blt	(revision 25270)
+++ enc/unicode/name2ctype.h.blt	(revision 25271)
@@ -3959,7 +3959,7 @@
 
 /* 'Assigned': - */
 static const OnigCodePoint CR_Assigned[] = {
-	484,
+	485,
 	0x0000, 0x0377,
 	0x037a, 0x037e,
 	0x0384, 0x038a,
@@ -4444,6 +4444,7 @@
 	0xe0020, 0xe007f,
 	0xe0100, 0xe01ef,
 	0xf0000, 0xffffd,
+	0x100000, 0x10fffd,
 }; /* CR_Assigned */
 
 /* 'C': Major Category */
@@ -4500,7 +4501,7 @@
 
 /* 'Cn': General Category */
 static const OnigCodePoint CR_Cn[] = {
-	484,
+	485,
 	0x0378, 0x0379,
 	0x037f, 0x0383,
 	0x038b, 0x038b,
@@ -4984,7 +4985,8 @@
 	0xe0002, 0xe001f,
 	0xe0080, 0xe00ff,
 	0xe01f0, 0xeffff,
-	0xffffe, 0x10ffff,
+	0xffffe, 0xfffff,
+	0x10fffe, 0x10ffff,
 }; /* CR_Cn */
 
 /* 'Co': General Category */
Index: enc/unicode/name2ctype.src
===================================================================
--- enc/unicode/name2ctype.src	(revision 25270)
+++ enc/unicode/name2ctype.src	(revision 25271)
@@ -3923,7 +3923,7 @@
 
 /* 'Assigned': - */
 static const OnigCodePoint CR_Assigned[] = {
-	484,
+	485,
 	0x0000, 0x0377,
 	0x037a, 0x037e,
 	0x0384, 0x038a,
@@ -4408,6 +4408,7 @@
 	0xe0020, 0xe007f,
 	0xe0100, 0xe01ef,
 	0xf0000, 0xffffd,
+	0x100000, 0x10fffd,
 }; /* CR_Assigned */
 
 /* 'C': Major Category */
@@ -4464,7 +4465,7 @@
 
 /* 'Cn': General Category */
 static const OnigCodePoint CR_Cn[] = {
-	484,
+	485,
 	0x0378, 0x0379,
 	0x037f, 0x0383,
 	0x038b, 0x038b,
@@ -4948,7 +4949,8 @@
 	0xe0002, 0xe001f,
 	0xe0080, 0xe00ff,
 	0xe01f0, 0xeffff,
-	0xffffe, 0x10ffff,
+	0xffffe, 0xfffff,
+	0x10fffe, 0x10ffff,
 }; /* CR_Cn */
 
 /* 'Co': General Category */
Index: tool/enc-unicode.rb
===================================================================
--- tool/enc-unicode.rb	(revision 25270)
+++ tool/enc-unicode.rb	(revision 25271)
@@ -2,6 +2,13 @@
 
 # Creates the data structures needed by Onigurma to map Unicode codepoints to
 # property names and POSIX character classes
+#
+# To use this, get UnicodeData.txt and Scripts.txt from unicode.org
+# (http://unicode.org/Public/UNIDATA/),
+# then run the following command:
+#   ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
+# This produces the source file for gperf.
+# After this, simply make ruby.
 
 unless ARGV.size == 2
   $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
@@ -17,10 +24,11 @@
   # codepoints with property _property_. Note: It is intended that some ranges
   # will begin with the value with  which they end, e.g. 0x0020 -> 0x0020
 
-  codepoints = codepoints.uniq.sort
+  codepoints.sort!
   last_cp = codepoints.first
   pairs = [[last_cp, nil]]
   codepoints[1..-1].each do |codepoint|
+    next if last_cp == codepoint
 
     # If the current codepoint does not follow directly on from the last
     # codepoint, the last codepoint represents the end of the current range,
@@ -39,7 +47,7 @@
 
 def parse_unicode_data(file)
   last_cp = 0
-  data = {'Cn' => []}
+  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
   beg_cp = nil
   IO.foreach(file) do |line|
     fields = line.split(';')
@@ -64,6 +72,10 @@
     # Cn category.
     data['Cn'].concat((last_cp.next...beg_cp).to_a)
 
+    # Assigned - Defined in unicode.c; interpreted as every character in the
+    # Unicode range minus the unassigned characters
+    data['Assigned'].concat(cps)
+
     # The third field denotes the 'General' category, e.g. Lu
     (data[fields[2]] ||= []).concat(cps)
 
@@ -73,16 +85,15 @@
     last_cp = cp
   end
 
-  # General Category property
-  gcps = %w[Any Assigned]
-  gcps.concat data.keys.sort
-
   # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
   # codepoints to Cn and C
-  cn_remainder = (data['Cn'].last.next..0x10ffff).to_a
+  cn_remainder = (last_cp.next..0x10ffff).to_a
   data['Cn'] += cn_remainder
   data['C'] += cn_remainder
 
+  # Define General Category properties
+  gcps = data.keys.sort
+
   # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
   #
 
@@ -145,10 +156,6 @@
   # Any - Defined in unicode.c
   data['Any'] = (0x0000..0x10ffff).to_a
 
-  # Assigned - Defined in unicode.c; interpreted as every character in the
-  # Unicode range minus the unassigned characters
-  data['Assigned'] = data['Any'] - data['Cn']
-
   # Returns General Category Property names and the data
   [gcps, data]
 end

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/