commit-classpath
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

FYI: java.text.RuleBasedCollator and java.text.CollationElementIterator


From: Guilhem Lavaux
Subject: FYI: java.text.RuleBasedCollator and java.text.CollationElementIterator
Date: Mon, 29 Dec 2003 19:08:14 +0100
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031007

Hi,

I am merging the new code for RuleBasedCollator and CollationElementIterator. I've tested it with mauve and it gives only 4 failures (for 242 tests), so it's nearly perfect compared to the JDK. There's only one feature which is still not completely implemented: reverse accent ordering. I've tried to indent it correctly, please tell if you see something wrong.

Regards,
Guilhem.
Index: ChangeLog
===================================================================
RCS file: /cvsroot/classpath/classpath/ChangeLog,v
retrieving revision 1.1736
diff -u -r1.1736 ChangeLog
--- ChangeLog   29 Dec 2003 12:04:50 -0000      1.1736
+++ ChangeLog   29 Dec 2003 18:05:33 -0000
@@ -1,5 +1,11 @@
 2003-12-29 Guilhem Lavaux <address@hidden>
 
+       * java/text/RuleBasedCollator.java,
+       java/text/CollationElementIterator.java:
+       Parser rewritten. All but one feature implemented.
+
+2003-12-29 Guilhem Lavaux <address@hidden>
+
        * java/net/URLStreamHandler.java
        (parseURL): Change a relative path into an
        absolute if the original URL does not have any path.
Index: java/text/RuleBasedCollator.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/text/RuleBasedCollator.java,v
retrieving revision 1.17
diff -u -r1.17 RuleBasedCollator.java
--- java/text/RuleBasedCollator.java    21 Oct 2003 14:56:12 -0000      1.17
+++ java/text/RuleBasedCollator.java    29 Dec 2003 18:05:34 -0000
@@ -39,6 +39,9 @@
 package java.text;
 
 import java.util.Vector;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Comparator;
 
 /* Written using "Java Class Libraries", 2nd edition, plus online
  * API docs for JDK 1.2 from http://www.javasoft.com.
@@ -60,7 +63,10 @@
  * <li> Reset: '&amp;' : <text>
  * </ul>
  * The modifier character indicates that accents sort backward as is the
- * case with French.  The relational operators specify how the text 
+ * case with French.  The modifier applies to all rules <b>after</b>
+ * the modifier but before the next primary sequence. If placed at the end
+ * of the sequence if applies to all unknown accented character.
+ * The relational operators specify how the text 
  * argument relates to the previous term.  The relation characters have
  * the following meanings:
  * <ul>
@@ -111,6 +117,9 @@
  * anywhere in the previous rule string segment so the rule following the
  * reset rule cannot be inserted.
  * <p>
+ * "&lt; a &amp; A @ &lt e &amp; E &lt f&amp; F" - This sequence is equivalent 
to the following
+ * "&lt; a &amp; A &lt E &amp; e &lt f &amp; F".
+ * <p>
  * For a description of the various comparison strength types, see the
  * documentation for the <code>Collator</code> class.
  * <p>
@@ -132,270 +141,591 @@
  *
  * @author Aaron M. Renn <address@hidden>
  * @author Tom Tromey <address@hidden>
- * @date March 25, 1999
+ * @author Guilhem Lavaux <address@hidden>
  */
 public class RuleBasedCollator extends Collator
 {
+  /**
+   * This class describes what rank has a character (or a sequence of 
characters) 
+   * in the lexicographic order. Each element in a rule has a collation 
element.
+   */
   final class CollationElement
   {
     String char_seq;
     int primary;
     short secondary;
     short tertiary;
+    short equality;
+    boolean ignore;
+    String expansion;
 
-    CollationElement (String char_seq, int primary, short secondary, short 
tertiary)
+    CollationElement(String char_seq, int primary, short secondary, short 
tertiary,
+                    short equality, String expansion)
     {
       this.char_seq = char_seq;
       this.primary = primary;
       this.secondary = secondary;
       this.tertiary = tertiary;
+      this.equality = equality;
+      this.ignore = false;
+      this.expansion = expansion;
+    }
+    
+    CollationElement(String char_seq)
+    {
+      this.char_seq = char_seq;
+      this.ignore = true;
+    }
+
+    final int getValue()
+    {
+      return (primary << 16) + (secondary << 8) + tertiary;
     }
 
   } // inner class CollationElement
 
   /**
-   * This the the original rule string.
-   */
-  private String rules;
+   * Basic collation instruction (internal format) to build the series of
+   * collation elements. It contains an instruction which specifies the new
+   * state of the generator. The sequence of instruction should not contain
+   * RESET (it is used by
+   * address@hidden 
#mergeRules(int,java.lang.String,java.util.Vector,java.util.Vector)})
+   * as a temporary state while merging two sets of instructions.
+   */
+  final class CollationSorter
+  {
+    static final int GREATERP = 0;
+    static final int GREATERS = 1;
+    static final int GREATERT = 2;
+    static final int EQUAL = 3;
+    static final int RESET = 4;
+    static final int IGNORE = 5;
+    static final int INVERSE_SECONDARY = 6;
+    
+    int comparisonType;
+    String textElement;
+    int hashText;
+    int offset;
+
+    String expansionOrdering;
+  }
 
   /**
-   * This is the table of collation element values
-   */
-  private Object[] ce_table;
-  
-  /**
-   * This method initializes a new instance of <code>RuleBasedCollator</code>
-   * with the specified collation rules.  Note that an application normally
-   * obtains an instance of <code>RuleBasedCollator</code> by calling the
-   * <code>getInstance</code> method of <code>Collator</code>.  That method
-   * automatically loads the proper set of rules for the desired locale.
-   *
-   * @param rules The collation rule string.
+   * This method returns the number of common characters at the beginning
+   * of the string of the two parameters.
    *
-   * @exception ParseException If the rule string contains syntax errors.
+   * @param prefix A string considered as a prefix to test against
+   * the other string.
+   * @param s A string to test the prefix against.
+   * @return The number of common characters.
    */
-  public RuleBasedCollator (String rules) throws ParseException
+  static int findPrefixLength(String prefix, String s)
   {
-    if (rules.equals (""))
-      throw new ParseException ("empty rule set", 0);
+    int i;
+
+    for (i = 0; i < prefix.length() && i < s.length(); i++)
+      {
+       if (prefix.charAt(i) != s.charAt(i))
+         return i;
+      }
+    return i;
+  }
+
+  /**
+   * Here we are merging two sets of sorting instructions: 'patch' into 
'main'. This methods
+   * checks whether it is possible to find an anchor point for the rules to be 
merged and
+   * then insert them at that precise point.
+   *
+   * @param offset Offset in the string containing rules of the beginning of 
the rules
+   * being merged in.
+   * @param starter Text of the rules being merged.
+   * @param main Repository of all already parsed rules.
+   * @param patch Rules to be merged into the repository.
+   * @throws ParseException if it is impossible to find an anchor point for 
the new rules.
+   */
+  private void mergeRules(int offset, String starter, Vector main, Vector 
patch)
+    throws ParseException 
+  {
+    Enumeration elements = main.elements();
+    int insertion_point = -1;
+    int max_length = 0;
     
-    this.rules = rules;
-    Vector vec = new Vector();
-    boolean ignore_chars = true;
-    int primary_seq = 0;
-    short secondary_seq = 0;
-    short tertiary_seq = 0;
-    StringBuffer argument = new StringBuffer();
-    int len = rules.length();
+    /* We must check that no rules conflict with another already present. If it
+     * is the case delete the old rule. 
+     */
     
-    for (int index = 0; index < len; ++index)
+    /* For the moment good old O(N^2) algorithm.
+     */
+    for (int i = 0; i < patch.size(); i++)
       {
-       char c = rules.charAt (index);
+       int j = 0;
+       
+       while (j < main.size())
+         {
+           CollationSorter rule1 = (CollationSorter) patch.elementAt(i);
+           CollationSorter rule2 = (CollationSorter) main.elementAt(j);
+           
+           if (rule1.textElement.equals(rule2.textElement))
+             main.removeElementAt(j);
+           else
+             j++;
+         }
+      }
 
-       // Just skip whitespace.
-       if (Character.isWhitespace (c))
-         continue;
+    // Find the insertion point... O(N)
+    for (int i = 0; i < main.size(); i++)
+      {
+       CollationSorter sorter = (CollationSorter) main.elementAt(i);
+       int length = findPrefixLength(starter, sorter.textElement);
+               
+       if (length > max_length)
+         {
+           max_length = length;
+           insertion_point = i+1;
+         }
+      }
 
-        // Primary difference
-        if (c == '<')
-          {
-            ignore_chars = false;
-            CollationElement e = new CollationElement (argument.toString(), 
primary_seq,
-                                                       secondary_seq,
-                                                       tertiary_seq);
-            secondary_seq = 0;
-            tertiary_seq = 0;
-            ++primary_seq;
-
-            vec.add (e);
-            argument.setLength (0);
-            continue;
-          }
+    if (insertion_point < 0)
+      throw new ParseException("no insertion point found for " + starter, 
offset);
 
-        // Secondary difference
-        if (c == ';')
-          {
-            if (primary_seq == 0)
-              throw new ParseException (rules, index);
+    if (max_length < starter.length())
+      {
+       /*
+        * We need to expand the first entry. It must be sorted
+        * like if it was the reference key itself (like the spec
+        * said. So the first entry is special: the element is
+        * replaced by the specified text element for the sorting.
+        * This text replace the old one for comparisons. However
+        * to preserve the behaviour we replace the first key (corresponding
+        * to the found prefix) by a new code rightly ordered in the
+        * sequence. The rest of the subsequence must be appended
+        * to the end of the sequence.
+        */
+       CollationSorter sorter = (CollationSorter) patch.elementAt(0);
+       CollationSorter expansionPrefix =
+         (CollationSorter) main.elementAt(insertion_point-1);
+       
+       sorter.expansionOrdering = starter.substring(max_length); // Skip the 
first good prefix element
+       
+       main.insertElementAt(sorter, insertion_point);
+       
+       /*
+        * This is a new set of rules. Append to the list.
+        */
+       patch.removeElementAt(0);
+       insertion_point = main.size();
+      }
 
-            CollationElement e = new CollationElement (argument.toString(), 
primary_seq,
-                                                       secondary_seq,
-                                                       tertiary_seq);
-            ++secondary_seq;
-            tertiary_seq = 0;
-
-            vec.add (e);
-            argument.setLength (0);
-            continue;
-          }
+    // Now insert all elements of patch at the insertion point.
+    for (int i = 0; i < patch.size(); i++)
+      main.insertElementAt(patch.elementAt(i), i+insertion_point);
+  }
 
-        // Tertiary difference
-        if (c == ',')
-          {
-            if (primary_seq == 0)
-              throw new ParseException (rules, index);
+  /**
+   * This method parses a string and build a set of sorting instructions. The 
parsing
+   * may only be partial on the case the rules are to be merged sometime later.
+   * 
+   * @param stop_on_reset If this parameter is true then the parser stops when 
it
+   * encounters a reset instruction. In the other case, it tries to parse the 
subrules
+   * and merged it in the same repository.
+   * @param v Output vector for the set of instructions.
+   * @param base_offset Offset in the string to begin parsing.
+   * @param rules Rules to be parsed.
+   * @return -1 if the parser reached the end of the string, an integer 
representing the
+   * offset in the string at which it stopped parsing. 
+   * @throws ParseException if something turned wrong during the parsing. To 
get details
+   * decode the message.
+   */
+  private int subParseString(boolean stop_on_reset, Vector v,
+                            int base_offset, String rules)
+    throws ParseException
+  {
+    boolean ignoreChars = (base_offset == 0);
+    int operator = -1;
+    StringBuffer sb = new StringBuffer("");
+    boolean doubleQuote = false;
+    boolean eatingChars = false;
+    boolean nextIsModifier = false;
+    boolean isModifier = false;
+    int i;
+    
+main_parse_loop:
+    for (i = 0; i < rules.length(); i++)
+      {
+       char c = rules.charAt(i);
+       int type = -1;
+       
+       if (!eatingChars &&
+           ((c >= 0x09 && c <= 0x0D) || (c == 0x20)))
+             continue;
 
-            CollationElement e = new CollationElement (argument.toString(), 
primary_seq,
-                                                       secondary_seq,
-                                                       tertiary_seq);
-            ++tertiary_seq;
-
-            vec.add (e);
-            argument.setLength (0);
-            continue;
-          }
+       isModifier = nextIsModifier;
+       nextIsModifier = false;
 
-        // Is equal to
-        if (c == '=')
-          {
-            if (primary_seq == 0)
-              throw new ParseException (rules, index);
+       if (eatingChars && c != '\'')
+         {
+           doubleQuote = false;
+           sb.append(c);
+           continue;
+         }
+       if (doubleQuote && eatingChars)
+         {
+           sb.append(c);
+           doubleQuote = false;
+           continue;
+         }
 
-            CollationElement e = new CollationElement (argument.toString(), 
primary_seq,
-                                                       secondary_seq,
-                                                       tertiary_seq);
-            vec.add (e);
-            argument.setLength (0);
-            continue;
-          }
+       switch (c) {
+       case '!':
+         throw new ParseException
+           ("Modifier '!' is not yet supported by Classpath", i+base_offset);
+       case '<':
+         ignoreChars = false;
+         type = CollationSorter.GREATERP;
+         break;
+       case ';':
+         if (!ignoreChars)
+           type = CollationSorter.GREATERS;
+         else
+           type = CollationSorter.IGNORE;
+         break;
+       case ',':
+         if (!ignoreChars)
+           type = CollationSorter.GREATERT;
+         else
+           type = CollationSorter.IGNORE;
+         break;
+       case '=':
+         if (!ignoreChars)
+           type = CollationSorter.EQUAL;
+         else
+           type = CollationSorter.IGNORE;
+         break;
+       case '\'':
+         eatingChars = !eatingChars;
+         doubleQuote = true;
+         break;
+       case '@':
+         if (ignoreChars)
+           throw new ParseException
+             ("comparison list has not yet been started. You may only use"
+              + "(<,;=&)", i+base_offset);
+         // Inverse the order of secondaries from now on.
+         nextIsModifier = true;
+         type = CollationSorter.INVERSE_SECONDARY;
+         break;
+       case '&':
+         type = CollationSorter.RESET;
+         if (stop_on_reset)
+           break main_parse_loop;
+         break;
+       default:
+         if (operator < 0)
+           throw new ParseException
+             ("operator missing at " + (i+base_offset), i+base_offset);
+         if (!eatingChars &&
+             ((c >= 0x21 && c <= 0x2F) 
+              || (c >= 0x3A && c <= 0x40)
+              || (c >= 0x5B && c <= 0x60)
+              || (c >= 0x7B && c <= 0x7E)))
+           throw new ParseException
+             ("unquoted punctuation character '"+c+"'", i+base_offset);
+
+         //type = ignoreChars ? CollationSorter.IGNORE : -1;
+         sb.append(c);
+         break;
+       }
 
-        // Sort accents backwards
-        if (c == '@')
-          {
-            throw new ParseException("French style accents not implemented 
yet", 0);
-          }
+       if (type  < 0)
+         continue;
 
-        // Reset command
-        if (c == '&')
-          {
-            throw new ParseException("Reset not implemented yet", 0);
-          }
+       if (operator < 0)
+         {
+           operator = type;
+           continue;
+         }
 
-        // See if we are still reading characters to skip
-        if (ignore_chars == true)
-          {
-            CollationElement e = new CollationElement (c + "", 0, (short)0, 
-                                                       (short)0);
-            vec.add(e);
-            continue;
-          }
+       if (sb.length() == 0 && !isModifier)
+         throw new ParseException
+           ("text element empty at " + (i+base_offset), i+base_offset);
+
+       if (operator == CollationSorter.RESET)
+         {
+           /* Reposition in the sorting list at the position
+            * indicated by the text element.
+            */
+           String subrules = rules.substring(i);
+           Vector sorted_rules = new Vector();
+           int idx;
+
+           // Parse the subrules but do not iterate through all
+           // sublist. This is the priviledge of the first call.
+           idx = subParseString(true, sorted_rules, base_offset+i, subrules);
+    
+           // Merge new parsed rules into the list.
+           mergeRules(base_offset+i, sb.toString(), v, sorted_rules);
+           sb.setLength(0);
+           
+           // Reset state to none.
+           operator = -1;
+           type = -1;
+           // We have found a new subrule at 'idx' but it has not been parsed.
+           if (idx >= 0)
+             {
+               i += idx-1;
+               continue main_parse_loop;
+             }
+           else
+               // No more rules.
+               break main_parse_loop;
+         }
+
+       CollationSorter sorter = new CollationSorter();
+       
+       sorter.comparisonType = operator;
+       sorter.textElement = sb.toString();
+       sorter.hashText = sorter.textElement.hashCode();
+       sorter.offset = base_offset+rules.length();
+       sb.setLength(0);
 
-        argument.append (c);
+       v.add(sorter);
+       operator = type;
       }
 
-    if (argument.length() > 0)
+    if (operator >= 0)
       {
-       CollationElement e = new CollationElement (argument.toString(), 
primary_seq,
-                                                  secondary_seq, tertiary_seq);
-       vec.add (e);
+       CollationSorter sorter = new CollationSorter();
+       int pos = rules.length() + base_offset;
+
+       if ((sb.length() != 0 && nextIsModifier)
+           || (sb.length() == 0 && !nextIsModifier && !eatingChars))
+         throw new ParseException("text element empty at " + pos, pos);
+
+       sorter.comparisonType = operator;
+       sorter.textElement = sb.toString();
+       sorter.hashText = sorter.textElement.hashCode();
+       sorter.offset = base_offset+pos;
+       v.add(sorter);
       }
 
-    ce_table = vec.toArray();
+    if (i == rules.length())
+      return -1;
+    else
+      return i;
   }
 
   /**
-   * This method creates a copy of this object.
+   * This method completely parses a string 'rules' containing sorting rules.
    *
-   * @return A copy of this object.
+   * @param rules String containing the rules to be parsed. 
+   * @return A set of sorting instructions stored in a Vector.
+   * @throws ParseException if something turned wrong during the parsing. To 
get details
+   * decode the message.
    */
-  public Object clone()
+  private Vector parseString(String rules) 
+    throws ParseException
   {
-    RuleBasedCollator c = (RuleBasedCollator) super.clone ();
-    return c;
+    Vector v = new Vector();
+
+    // result of the first subParseString is not absolute (may be -1 or a
+    // positive integer). But we do not care.
+    subParseString(false, v, 0, rules);
+    
+    return v;
   }
 
   /**
-   * This method returns an integer which indicates whether the first
-   * specified <code>String</code> is less than, greater than, or equal to
-   * the second.  The value depends not only on the collation rules in
-   * effect, but also the strength and decomposition settings of this object.
-   *
-   * @param source The first <code>String</code> to compare.
-   * @param target A second <code>String</code> to compare to the first.
+   * This the the original rule string.
+   */
+  private String rules;
+
+  /**
+   * This is the table of collation element values
+   */
+  private Object[] ce_table;
+
+  /**
+   * Quick-prefix finder.
+   */
+  HashMap prefix_tree;
+
+  /**
+   * This is the value of the last sequence entered into
+   * <code>ce_table</code>. It is used to compute the
+   * ordering value of unspecified character.
+   */
+  private int last_primary_value;
+
+  /**
+   * This variable is true if accents need to be sorted
+   * in the other direction.
+   */
+  private boolean inverseAccentComparison;
+
+  /**
+   * This method uses the sorting instructions built by address@hidden 
#parseString}
+   * to build collation elements which can be directly used to sort strings.
    *
-   * @return A negative integer if source &lt; target, a positive integer
-   * if source &gt; target, or 0 if source == target.
+   * @param parsedElements Parsed instructions stored in a Vector.
+   * @throws ParseException if the order of the instructions are not valid.
    */
-  public int compare (String source, String target)
+  private void buildCollationVector(Vector parsedElements)
+    throws ParseException
   {
-    CollationElementIterator cs = getCollationElementIterator (source);
-    CollationElementIterator ct = getCollationElementIterator (target);
-
-    while (true)
+    int primary_seq = 0;
+    short secondary_seq = 0;
+    short tertiary_seq = 0;
+    short equality_seq = 0;
+    boolean inverseComparisons = false;
+    final boolean DECREASING = false;
+    final boolean INCREASING = true;
+    boolean secondaryType = INCREASING;
+    Vector v = new Vector();
+
+    // elts is completely sorted.
+element_loop:
+    for (int i = 0; i < parsedElements.size(); i++)
       {
-       int os = cs.next();
-       int ot = ct.next();
+       CollationSorter elt = (CollationSorter) parsedElements.elementAt(i);
+       boolean ignoreChar = false;
 
-       // Check for end of string
-       if (os == CollationElementIterator.NULLORDER
-           && ot == CollationElementIterator.NULLORDER)
+       switch (elt.comparisonType)
          {
-           // Source and target string are equal.
-           return 0;
+         case CollationSorter.GREATERP:
+           primary_seq++;
+           if (inverseComparisons)
+             {
+               secondary_seq = Short.MAX_VALUE;
+               secondaryType = DECREASING;
+             }
+           else
+             {
+               secondary_seq = 0;
+               secondaryType = INCREASING;
+             }
+           tertiary_seq = 0;
+           equality_seq = 0;
+           inverseComparisons = false;
+           break;
+         case CollationSorter.GREATERS:
+           if (secondaryType == DECREASING)
+             secondary_seq--;
+           else
+             secondary_seq++;
+           tertiary_seq = 0;
+           equality_seq = 0;
+           break;
+         case CollationSorter.INVERSE_SECONDARY:
+           inverseComparisons = true;
+           continue element_loop;
+         case CollationSorter.GREATERT:
+           tertiary_seq++;
+           equality_seq = 0;
+           break;
+         case CollationSorter.IGNORE:
+           ignoreChar = true;
+         case CollationSorter.EQUAL:
+           equality_seq++;
+           break;
+         case CollationSorter.RESET:
+           throw new ParseException
+             ("Invalid reached state 'RESET'. Internal error", elt.offset);
+         default:
+           throw new ParseException
+             ("Invalid unknown state '" + elt.comparisonType + "'", 
elt.offset);
          }
-       else if (os == CollationElementIterator.NULLORDER)
-         {
-           // Source string is shorter, so return "less than".
-           return -1;
-         }
-       else if (ot == CollationElementIterator.NULLORDER)
+
+       CollationElement e;
+
+       if (!ignoreChar)
          {
-           // Target string is shorter, so return "greater than".
-           return 1;
+           e = new CollationElement(elt.textElement, primary_seq,
+                                    secondary_seq, tertiary_seq,
+                                    equality_seq, elt.expansionOrdering);
          }
+       else
+         e = new CollationElement(elt.textElement);
 
-        // We know chars are totally equal, so skip
-        if (os == ot)
-          continue;
+       v.add(e);
+      }
 
-        // Check for primary strength differences
-        int prim1 = cs.primaryOrder (os); 
-        int prim2 = ct.primaryOrder (ot); 
+    this.inverseAccentComparison = inverseComparisons; 
 
-        if (prim1 < prim2)
-          return -1;
-        else if (prim1 > prim2)
-          return 1;
-        else if (getStrength() == PRIMARY)
-          continue;
+    ce_table = v.toArray();
 
-        // Check for secondary strength differences
-        int sec1 = cs.secondaryOrder (os);
-        int sec2 = ct.secondaryOrder (ot);
+    last_primary_value = primary_seq+1;
+  }
 
-        if (sec1 < sec2)
-          return -1;
-        else if (sec1 > sec2)
-          return 1;
-        else if (getStrength() == SECONDARY)
-          continue;
+  /**
+   * Build a tree where all keys are the texts of collation elements and data 
is
+   * the collation element itself. The tree is used when extracting all prefix
+   * for a given text.
+   */
+  private void buildPrefixAccess()
+  {
+    prefix_tree = new HashMap();
 
-        // Check for tertiary differences
-        int tert1 = cs.tertiaryOrder (os);
-        int tert2 = ct.tertiaryOrder (ot);
+    for (int i = 0; i < ce_table.length; i++)
+      {
+       CollationElement e = (CollationElement) ce_table[i];
 
-        if (tert1 < tert2)
-          return -1;
-        else if (tert1 > tert2)
-          return 1;
+       prefix_tree.put(e.char_seq, e);
       }
   }
 
   /**
-   * This method tests this object for equality against the specified 
-   * object.  This will be true if and only if the specified object is
-   * another reference to this object.
+   * This method initializes a new instance of <code>RuleBasedCollator</code>
+   * with the specified collation rules.  Note that an application normally
+   * obtains an instance of <code>RuleBasedCollator</code> by calling the
+   * <code>getInstance</code> method of <code>Collator</code>.  That method
+   * automatically loads the proper set of rules for the desired locale.
    *
-   * @param obj The <code>Object</code> to compare against this object.
+   * @param rules The collation rule string.
    *
-   * @return <code>true</code> if the specified object is equal to this 
object, <code>false</code> otherwise.
+   * @exception ParseException If the rule string contains syntax errors.
    */
-  public boolean equals (Object obj)
+  public RuleBasedCollator(String rules) throws ParseException
   {
-    if (obj == this)
-      return true;
+    this.rules = rules;
 
-    return false;
+    if (rules.equals(""))
+      throw new ParseException("Empty rule set", 0);
+
+    buildCollationVector(parseString(rules));
+    buildPrefixAccess();
+  }
+
+  /**
+   * This method returns a <code>String</code> containing the collation rules
+   * for this object.
+   *
+   * @return The collation rules for this object.
+   */
+  public String getRules()
+  {
+    return rules;
+  }
+
+  /**
+   * This method builds a default collation element without invoking
+   * the database created from the rules passed to the constructor.
+   *
+   * @param c Character which needs a collation element.
+   * @return A valid brand new CollationElement instance.
+   */
+  CollationElement getDefaultElement(char c)
+  {
+    int v;
+
+    // Preliminary support for generic accent sorting inversion (I don't know 
if all
+    // characters in the range should be sorted backward). This is the place
+    // to fix this if needed.
+    if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
+      v = 0x0361 - ((int)c - 0x02B9);
+    else
+      v = (short)c;
+    return new CollationElement(""+c, last_primary_value + v,
+                               (short)0, (short)0, (short) 0, null);
   }
 
   /**
@@ -403,35 +733,122 @@
    * for the specified <code>String</code> under the collation rules for this
    * object.
    *
-   * @param source The <code>String</code> to return the 
<code>CollationElementIterator</code> instance for.
+   * @param str The <code>String</code> to return the 
<code>CollationElementIterator</code> instance for.
    *
    * @return A <code>CollationElementIterator</code> for the specified 
<code>String</code>.
    */
-  public CollationElementIterator getCollationElementIterator (String source)
+  public CollationElementIterator getCollationElementIterator(String str)
   {
-    return new CollationElementIterator (source, this);
-  }
+    return new CollationElementIterator(this, str);
+  }  
 
   /**
    * This method returns an instance of <code>CollationElementIterator</code>
    * for the <code>String</code> represented by the specified
    * <code>CharacterIterator</code>.
    *
-   * @param source The <code>CharacterIterator</code> with the desired 
<code>String</code>.
+   * @param ci The <code>CharacterIterator</code> with the desired 
<code>String</code>.
    *
    * @return A <code>CollationElementIterator</code> for the specified 
<code>String</code>.
    */
-  public CollationElementIterator getCollationElementIterator 
(CharacterIterator source)
+  public CollationElementIterator 
getCollationElementIterator(CharacterIterator ci)
   {
-    StringBuffer sb = new StringBuffer();
+    StringBuffer sb = new StringBuffer("");
 
     // Right now we assume that we will read from the beginning of the string.
-    for (char c = source.first(); c != CharacterIterator.DONE; c = 
source.next()) 
+    char c = ci.first();
+    while (c != CharacterIterator.DONE) 
       {
-        sb.append (c);
+        sb.append(c);
+        c = ci.next();
       }
 
-    return getCollationElementIterator (sb.toString());
+    return getCollationElementIterator(sb.toString());
+  }
+
+  /**
+   * This method returns an integer which indicates whether the first
+   * specified <code>String</code> is less than, greater than, or equal to
+   * the second.  The value depends not only on the collation rules in
+   * effect, but also the strength and decomposition settings of this object.
+   *
+   * @param s1 The first <code>String</code> to compare.
+   * @param s2 A second <code>String</code> to compare to the first.
+   *
+   * @return A negative integer if s1 &lt; s2, a positive integer
+   * if s1 &gt; s2, or 0 if s1 == s2.
+   */
+  public int compare(String s1, String s2)
+  {
+    CollationElementIterator cei1 = getCollationElementIterator(s1);
+    CollationElementIterator cei2 = getCollationElementIterator(s2);
+
+    for(;;)
+      {
+        CollationElement ord1block = cei1.nextBlock(); 
+        CollationElement ord2block = cei2.nextBlock(); 
+       int ord1;
+       int ord2;
+
+       if (ord1block != null)
+         ord1 = ord1block.getValue();
+       else
+         {
+           if (ord2block == null)
+             return 0;
+           return -1;
+         }
+
+       if (ord2block == null)
+         return 1;
+       
+       ord2 = ord2block.getValue();
+       
+       // We know chars are totally equal, so skip
+        if (ord1 == ord2)
+         {
+           if (getStrength() == IDENTICAL)
+             if (!ord1block.char_seq.equals(ord2block.char_seq))
+               return ord1block.char_seq.compareTo(ord2block.char_seq);
+           continue;
+         }
+
+        // Check for primary strength differences
+        int prim1 = cei1.primaryOrder(ord1); 
+        int prim2 = cei2.primaryOrder(ord2); 
+
+        if (prim1 < prim2)
+          return -1;
+        else if (prim1 > prim2)
+          return 1;
+        else if (getStrength() == PRIMARY)
+          continue;
+
+        // Check for secondary strength differences
+        int sec1 = cei1.secondaryOrder(ord1);
+        int sec2 = cei2.secondaryOrder(ord2);
+
+        if (sec1 < sec2)
+          return -1;
+        else if (sec1 > sec2)
+          return 1;
+        else if (getStrength() == SECONDARY)
+          continue;
+
+        // Check for tertiary differences
+        int tert1 = cei1.tertiaryOrder(ord1);
+        int tert2 = cei2.tertiaryOrder(ord2);
+
+        if (tert1 < tert2)
+          return -1;
+        else if (tert1 > tert2)
+          return 1;
+       else if (getStrength() == TERTIARY)
+         continue;
+
+       // Apparently JDK does this (at least for my test case).
+       return ord1block.char_seq.compareTo(ord2block.char_seq);    
+      }
   }
 
   /**
@@ -441,14 +858,14 @@
    * provide speed benefits if multiple comparisons are performed, such
    * as during a sort.
    *
-   * @param source The <code>String</code> to create a 
<code>CollationKey</code> for.
+   * @param str The <code>String</code> to create a <code>CollationKey</code> 
for.
    *
    * @return A <code>CollationKey</code> for the specified <code>String</code>.
    */
-  public CollationKey getCollationKey (String source)
+  public CollationKey getCollationKey(String str)
   {
-    CollationElementIterator cei = getCollationElementIterator (source);
-    Vector vec = new Vector (25);
+    CollationElementIterator cei = getCollationElementIterator(str);
+    Vector vect = new Vector(25);
 
     int ord = cei.next();
     cei.reset(); //set to start of string
@@ -458,44 +875,50 @@
         switch (getStrength())
           {
             case PRIMARY:
-               ord = cei.primaryOrder (ord);
+               ord = cei.primaryOrder(ord);
                break;
 
             case SECONDARY:
-               ord = cei.secondaryOrder (ord);
+               ord = cei.secondaryOrder(ord);
 
             default:
                break;
           }
 
-        vec.add (new Integer (ord)); 
+        vect.add(new Integer(ord)); 
        ord = cei.next(); //increment to next key
       }
 
-    Object[] objarr = vec.toArray();
-    byte[] key = new byte [objarr.length * 4];
+    Object[] objarr = vect.toArray();
+    byte[] key = new byte[objarr.length * 4];
 
     for (int i = 0; i < objarr.length; i++)
       {
-        int j = ((Integer) objarr [i]).intValue();
-        key [i * 4] = (byte) ((j & 0xFF000000) >> 24);
-        key [i * 4 + 1] = (byte) ((j & 0x00FF0000) >> 16);
-        key [i * 4 + 2] = (byte) ((j & 0x0000FF00) >> 8);
-        key [i * 4 + 3] = (byte) (j & 0x000000FF);
+        int j = ((Integer)objarr[i]).intValue();
+        key [i * 4] = (byte)((j & 0xFF000000) >> 24);
+        key [i * 4 + 1] = (byte)((j & 0x00FF0000) >> 16);
+        key [i * 4 + 2] = (byte)((j & 0x0000FF00) >> 8);
+        key [i * 4 + 3] = (byte)(j & 0x000000FF);
       }
 
-    return new CollationKey (this, source, key);
+    return new CollationKey(this, str, key);
   }
 
   /**
-   * This method returns a <code>String</code> containing the collation rules
-   * for this object.
+   * This method tests this object for equality against the specified 
+   * object.  This will be true if and only if the specified object is
+   * another reference to this object.
    *
-   * @return The collation rules for this object.
+   * @param obj The <code>Object</code> to compare against this object.
+   *
+   * @return <code>true</code> if the specified object is equal to this 
object, <code>false</code> otherwise.
    */
-  public String getRules()
+  public boolean equals(Object obj)
   {
-    return rules;
+    if (obj == this)
+      return true;
+    else
+      return false;
   }
 
   /**
@@ -505,31 +928,16 @@
    */
   public int hashCode()
   {
-    return System.identityHashCode (this);
+    return System.identityHashCode(this);
   }
 
   /**
-   * This method calculates the collation element value for the specified
-   * character(s).
+   * This method creates a copy of this object.
+   *
+   * @return A copy of this object.
    */
-  int getCollationElementValue (String source)
+  public Object clone()
   {
-    CollationElement e = null;
-
-    // The table is sorted.  Change to a binary search later.
-    for (int i = 0; i < ce_table.length; i++) 
-      if (((CollationElement) ce_table [i]).char_seq.equals (source))
-        {
-          e = (CollationElement) ce_table [i];
-          break;
-        }
-
-    if (e == null)
-      e = new CollationElement (source, 0xFFFF, (short)0xFF, (short)0xFF);
-
-    int retval = (e.primary << 16) + (e.secondary << 8) + e.tertiary;
-
-    return retval;
+    return super.clone();
   }
-
-} // class RuleBaseCollator
+}
Index: java/text/CollationElementIterator.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/text/CollationElementIterator.java,v
retrieving revision 1.10
diff -u -r1.10 CollationElementIterator.java
--- java/text/CollationElementIterator.java     15 Oct 2003 16:05:10 -0000      
1.10
+++ java/text/CollationElementIterator.java     29 Dec 2003 18:05:34 -0000
@@ -38,6 +38,11 @@
 
 package java.text;
 
+import java.util.Vector;
+import java.util.NoSuchElementException;
+import java.util.Map;
+import java.util.SortedMap;
+
 /* Written using "Java Class Libraries", 2nd edition, plus online
  * API docs for JDK 1.2 from http://www.javasoft.com.
  * Status: Believed complete and correct to JDK 1.1.
@@ -53,6 +58,7 @@
  *
  * @author Aaron M. Renn <address@hidden>
  * @author Tom Tromey <address@hidden>
+ * @author Guilhem Lavaux <address@hidden>
  */
 public final class CollationElementIterator
 {
@@ -73,11 +79,22 @@
   private String text;
 
   /**
-   * This is the index into the String where we are currently scanning.
+   * This is the index into the collation decomposition where we are currently 
scanning.
    */
   private int index;
 
   /**
+   * This is the index into the String where we are currently scanning.
+   */
+  private int textIndex;
+
+  /**
+   * Array containing the collation decomposition of the
+   * text given to the constructor.
+   */
+  private Object[] text_decomposition;
+
+  /**
    * This method initializes a new instance of 
<code>CollationElementIterator</code>
    * to iterate over the specified <code>String</code> using the rules in the
    * specified <code>RuleBasedCollator</code>.
@@ -85,27 +102,73 @@
    * @param collator The <code>RuleBasedCollation</code> used for calculating 
collation values
    * @param text The <code>String</code> to iterate over.
    */
-  CollationElementIterator (String text, RuleBasedCollator collator)
+  CollationElementIterator(RuleBasedCollator collator, String text)
   {
-    setText (text);
     this.collator = collator;
+    
+    setText (text);    
+  }
+
+  RuleBasedCollator.CollationElement nextBlock()
+  {
+    if (index >= text_decomposition.length)
+      return null;
+    
+    RuleBasedCollator.CollationElement e =
+      (RuleBasedCollator.CollationElement) text_decomposition[index++];
+    
+    textIndex += e.char_seq.length();
+
+    return e;
+  }
+
+  RuleBasedCollator.CollationElement previousBlock()
+  {
+    if (index == 0)
+      return null;
+    
+    index--;
+    RuleBasedCollator.CollationElement e =
+      (RuleBasedCollator.CollationElement) text_decomposition[index];
+
+    textIndex -= e.char_seq.length();
+    
+    return e;
   }
 
   /**
-   * This method returns the collation ordering value of the next character
-   * in the string.  This method will return <code>NULLORDER</code> if the
+   * This method returns the collation ordering value of the next character 
sequence
+   * in the string (it may be an extended character following collation rules).
+   * This method will return <code>NULLORDER</code> if the
    * end of the string was reached.
    *
    * @return The collation ordering value.
    */
   public int next()
   {
-    if (index == text.length())
+    RuleBasedCollator.CollationElement e = nextBlock();
+
+    if (e == null)
       return NULLORDER;
+    
+    return e.getValue();
+  }
+
+  /**
+   * This method returns the collation ordering value of the previous character
+   * in the string.  This method will return <code>NULLORDER</code> if the
+   * beginning of the string was reached.
+   *
+   * @return The collation ordering value.
+   */
+  public int previous()
+  {
+    RuleBasedCollator.CollationElement e = previousBlock();
 
-    String s = text.charAt (index) + "";
-    index++;
-    return collator.getCollationElementValue (s);
+    if (e == null)
+      return NULLORDER;
+    
+    return e.getValue();
   }
 
   /**
@@ -116,7 +179,7 @@
    *
    * @return The primary order value of the specified collation value.  This 
is the high 16 bits.
    */
-  public static final int primaryOrder (int order)
+  public static final int primaryOrder(int order)
   {
     // From the JDK 1.2 spec.
     return order >>> 16;
@@ -129,6 +192,7 @@
   public void reset()
   {
     index = 0;
+    textIndex = 0;
   }
 
   /**
@@ -139,7 +203,7 @@
    *
    * @return The secondary order value of the specified collation value.  This 
is the bits 8-15.
    */
-  public static final short secondaryOrder (int order)
+  public static final short secondaryOrder(int order)
   {
     // From the JDK 1.2 spec.
     return (short) ((order >>> 8) & 255);
@@ -153,7 +217,7 @@
    *
    * @return The tertiary order value of the specified collation value.  This 
is the low eight bits.
    */
-  public static final short tertiaryOrder (int order)
+  public static final short tertiaryOrder(int order)
   {
     // From the JDK 1.2 spec.
     return (short) (order & 255);
@@ -163,14 +227,70 @@
    * This method sets the <code>String</code> that it is iterating over
    * to the specified <code>String</code>.
    *
-   * @param text The new <code>String</code> to iterate over.
-   *
-   * @since 1.2
+   * @param The new <code>String</code> to iterate over.
    */
-  public void setText (String text)
+  public void setText(String text)
   {
+    int idx = 0;
+
     this.text = text;
     index = 0;
+
+    String work_text = text.intern();
+
+    Vector v = new Vector();
+    // Build element collection ordered as they come in "text".
+    while (idx < work_text.length())
+      {
+       String key, key_old;
+
+       Object object = null;
+       int p = 1;
+       
+       // IMPROVE: use a TreeMap with a prefix-ordering rule.
+       key_old = key = null;
+       do
+         {
+           if (object != null)
+             key_old = key;
+           key = work_text.substring (idx, idx+p);
+           object = collator.prefix_tree.get (key);
+           p++;
+         }
+       while (idx+p <= work_text.length());
+       
+       if (object == null)
+         key = key_old;
+       
+       RuleBasedCollator.CollationElement prefix =
+         (RuleBasedCollator.CollationElement) collator.prefix_tree.get (key);
+       
+       if (prefix == null)
+         {
+           RuleBasedCollator.CollationElement e =
+             collator.getDefaultElement(work_text.charAt (idx));
+           
+           v.add (e);
+           idx++;
+           continue;
+         }
+
+       if (prefix.expansion != null)
+         {
+           work_text = prefix.expansion
+             + work_text.substring (idx+prefix.char_seq.length());
+           idx = 0;
+           v.add (prefix);
+         }
+       else
+         {
+           if (!prefix.ignore)
+             v.add (prefix);
+           idx += prefix.char_seq.length();
+         }
+      }
+    
+    text_decomposition = v.toArray();
   }
 
   /**
@@ -180,19 +300,19 @@
    *
    * @param ci The <code>CharacterIterator</code> containing the new 
<code>String</code> to iterate over.
    */
-  public void setText (CharacterIterator ci)
+  public void setText(CharacterIterator ci)
   {
+    StringBuffer sb = new StringBuffer("");
+
     // For now assume we read from the beginning of the string.
     char c = ci.first();
-    StringBuffer sb = new StringBuffer ("");
-
     while (c != CharacterIterator.DONE)
       {
-        sb.append (c);
+        sb.append(c);
         c = ci.next();
       }
 
-    setText (sb.toString());
+    setText(sb.toString());
   }
 
   /**
@@ -200,12 +320,10 @@
    * that is being iterated over.
    *
    * @return The iteration index position.
-   *
-   * @since 1.2
    */
   public int getOffset()
   {
-    return index;
+    return textIndex;
   }
 
   /**
@@ -218,18 +336,28 @@
    *
    * @exception IllegalArgumentException If the new offset is not valid.
    */
-  public void setOffset (int offset)
+  public void setOffset(int offset)
   {
     if (offset < 0)
-      throw new IllegalArgumentException ("Negative offset: " + offset);
+      throw new IllegalArgumentException("Negative offset: " + offset);
 
-    if ((text.length () > 0) && (offset > 0))
-      throw new IllegalArgumentException ("Offset too large: " + offset);
-    else if (offset > (text.length () - 1))
-      throw new IllegalArgumentException ("Offset too large: " + offset);
+    if ((text.length() > 0) && (offset > 0))
+      throw new IllegalArgumentException("Offset too large: " + offset);
+    else if (offset > (text.length() - 1))
+      throw new IllegalArgumentException("Offset too large: " + offset);
 
-    index = offset;
-  }    
+    textIndex = 0;
+    for (int i=0;i<text_decomposition.length;i++)
+      {
+       RuleBasedCollator.CollationElement e =
+         (RuleBasedCollator.CollationElement) text_decomposition[i];
+       int idx = textIndex + e.char_seq.length();
+       
+       if (idx > offset)
+         break;
+       textIndex = idx;
+      }
+  }
 
   /**
    * This method returns the maximum length of any expansion sequence that
@@ -239,28 +367,8 @@
    *
    * @param The maximum length of an expansion sequence.
    */
-  public int getMaxExpansion (int value)
+  public int getMaxExpansion(int value)
   {
-    //************ Implement me!!!!!!!!!
-    return 5;
+    return 1;
   }
-
-  /**
-   * This method returns the collation ordering value of the previous character
-   * in the string.  This method will return <code>NULLORDER</code> if the
-   * beginning of the string was reached.
-   *
-   * @return The collation ordering value.
-   */
-  public int previous()
-  {
-    --index;
-    if (index < 0)
-      return NULLORDER;
-
-    String s = text.charAt (index) + "";
-    return collator.getCollationElementValue (s);
-  }
-
-} // class CollationElementIterator
-
+}

reply via email to

[Prev in Thread] Current Thread [Next in Thread]