Index: ArabicTokenizer.java
===================================================================
RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/lucene/ArabicTokenizer.java,v
retrieving revision 1.1
diff -u -r1.1 ArabicTokenizer.java
--- ArabicTokenizer.java 15 Oct 2003 17:12:21 -0000 1.1
+++ ArabicTokenizer.java 13 Jun 2005 08:44:13 -0000
@@ -22,8 +22,13 @@
package gpl.pierrick.brihaye.aramorph.lucene;
+
import java.io.IOException;
import java.io.Reader;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.Vector;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
@@ -44,6 +49,7 @@
private boolean debug = false;
+    /** Characters accepted as part of an Arabic token; built once by getArabicSet() and only read afterwards. */
+    private static final Set arabicLetters = getArabicSet();
/** Constructs a tokenizer that will return tokens in the arabic alphabet.
* @param input The reader
*/
@@ -51,6 +57,80 @@
this(input, false);
}
+    /**
+     * Builds the lookup set of characters that the tokenizer treats as belonging to an Arabic word:
+     * the basic Arabic letters, a number of extended letters, the tatweel and
+     * the short-vowel/diacritic marks U+064B to U+0652.
+     * Arabic punctuation (comma, semicolon, question mark) is deliberately left out.
+     * @return a RangeSet holding every accepted character
+     */
+    private static Set getArabicSet() {
+        Set ara = new RangeSet();
+        ara.add(new Character('\u067E')); //U+067E : ARABIC LETTER PEH
+        ara.add(new Character('\u0679')); //U+0679 : ARABIC LETTER TTEH
+        ara.add(new Character('\u0686')); //U+0686 : ARABIC LETTER TCHEH
+        ara.add(new Character('\u0698')); //U+0698 : ARABIC LETTER JEH
+        ara.add(new Character('\u0688')); //U+0688 : ARABIC LETTER DDAL
+        ara.add(new Character('\u06AF')); //U+06AF : ARABIC LETTER GAF
+        ara.add(new Character('\u06A9')); //U+06A9 : ARABIC LETTER KEHEH
+        ara.add(new Character('\u0691')); //U+0691 : ARABIC LETTER RREH
+        ara.add(new Character('\u06BA')); //U+06BA : ARABIC LETTER NOON GHUNNA
+        //ara.add(new Character('\u060C')); //U+060C : ARABIC COMMA
+        ara.add(new Character('\u06BE')); //U+06BE : ARABIC LETTER HEH DOACHASHMEE
+        //ara.add(new Character('\u061B')); //U+061B : ARABIC SEMICOLON
+        //ara.add(new Character('\u061F')); //U+061F : ARABIC QUESTION MARK
+        ara.add(new Character('\u06C1')); //U+06C1 : ARABIC LETTER HEH GOAL
+        ara.add(new Character('\u0621')); //U+0621 : ARABIC LETTER HAMZA
+        ara.add(new Character('\u0622')); //U+0622 : ARABIC LETTER ALEF WITH MADDA ABOVE
+        ara.add(new Character('\u0623')); //U+0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE
+        ara.add(new Character('\u0624')); //U+0624 : ARABIC LETTER WAW WITH HAMZA ABOVE
+        ara.add(new Character('\u0625')); //U+0625 : ARABIC LETTER ALEF WITH HAMZA BELOW
+        ara.add(new Character('\u0626')); //U+0626 : ARABIC LETTER YEH WITH HAMZA ABOVE
+        ara.add(new Character('\u0627')); //U+0627 : ARABIC LETTER ALEF
+        ara.add(new Character('\u0628')); //U+0628 : ARABIC LETTER BEH
+        ara.add(new Character('\u0629')); //U+0629 : ARABIC LETTER TEH MARBUTA
+        ara.add(new Character('\u062A')); //U+062A : ARABIC LETTER TEH
+        ara.add(new Character('\u062B')); //U+062B : ARABIC LETTER THEH
+        ara.add(new Character('\u062C')); //U+062C : ARABIC LETTER JEEM
+        ara.add(new Character('\u062D')); //U+062D : ARABIC LETTER HAH
+        ara.add(new Character('\u062E')); //U+062E : ARABIC LETTER KHAH
+        ara.add(new Character('\u062F')); //U+062F : ARABIC LETTER DAL
+        ara.add(new Character('\u0630')); //U+0630 : ARABIC LETTER THAL
+        ara.add(new Character('\u0631')); //U+0631 : ARABIC LETTER REH
+        ara.add(new Character('\u0632')); //U+0632 : ARABIC LETTER ZAIN
+        ara.add(new Character('\u0633')); //U+0633 : ARABIC LETTER SEEN
+        ara.add(new Character('\u0634')); //U+0634 : ARABIC LETTER SHEEN
+        ara.add(new Character('\u0635')); //U+0635 : ARABIC LETTER SAD
+        ara.add(new Character('\u0636')); //U+0636 : ARABIC LETTER DAD
+        ara.add(new Character('\u0637')); //U+0637 : ARABIC LETTER TAH
+        ara.add(new Character('\u0638')); //U+0638 : ARABIC LETTER ZAH
+        ara.add(new Character('\u0639')); //U+0639 : ARABIC LETTER AIN
+        ara.add(new Character('\u063A')); //U+063A : ARABIC LETTER GHAIN
+        ara.add(new Character('\u0640')); //U+0640 : ARABIC TATWEEL
+        ara.add(new Character('\u0641')); //U+0641 : ARABIC LETTER FEH
+        ara.add(new Character('\u0642')); //U+0642 : ARABIC LETTER QAF
+        ara.add(new Character('\u0643')); //U+0643 : ARABIC LETTER KAF
+        ara.add(new Character('\u0644')); //U+0644 : ARABIC LETTER LAM
+        ara.add(new Character('\u0645')); //U+0645 : ARABIC LETTER MEEM
+        ara.add(new Character('\u0646')); //U+0646 : ARABIC LETTER NOON
+        ara.add(new Character('\u0647')); //U+0647 : ARABIC LETTER HEH
+        ara.add(new Character('\u0648')); //U+0648 : ARABIC LETTER WAW
+        ara.add(new Character('\u0649')); //U+0649 : ARABIC LETTER ALEF MAKSURA
+        ara.add(new Character('\u064A')); //U+064A : ARABIC LETTER YEH
+        ara.add(new Character('\u064B')); //U+064B : ARABIC FATHATAN
+        ara.add(new Character('\u064C')); //U+064C : ARABIC DAMMATAN
+        ara.add(new Character('\u064D')); //U+064D : ARABIC KASRATAN
+        ara.add(new Character('\u064E')); //U+064E : ARABIC FATHA
+        ara.add(new Character('\u064F')); //U+064F : ARABIC DAMMA
+        ara.add(new Character('\u0650')); //U+0650 : ARABIC KASRA
+        ara.add(new Character('\u0651')); //U+0651 : ARABIC SHADDA
+        ara.add(new Character('\u0652')); //U+0652 : ARABIC SUKUN
+        ara.add(new Character('\u06D2')); //U+06D2 : ARABIC LETTER YEH BARREE
+        return ara;
+    }
+
/** Constructs a tokenizer that will return tokens in the arabic alphabet.
* @param input The reader
* @param debug Whether or not the tokenizer should display convenience messages on System.out
@@ -72,7 +152,7 @@
Arabic digits as well as hindic digits are in use in the Maghreb (from Morocco to Lybia)
We should have an option to set the digit processing
*/
- if (c == '\u067E') return true; //U+067E : ARABIC LETTER PEH
+ /*if (c == '\u067E') return true; //U+067E : ARABIC LETTER PEH
if (c == '\u0679') return true; //U+0679 : ARABIC LETTER TTEH
if (c == '\u0686') return true; //U+0686 : ARABIC LETTER TCHEH
if (c == '\u0698') return true; //U+0698 : ARABIC LETTER JEH
@@ -141,7 +221,7 @@
if (c == '\u0650') return true; //U+0650 : ARABIC KASRA
if (c == '\u0651') return true; //U+0651 : ARABIC SHADDA
if (c == '\u0652') return true; //U+0652 : ARABIC SUKUN
- return false;
+*/ return arabicLetters.contains(new Character(c));
}
/** Returns the next token in the stream, or null
at EOS.
@@ -188,6 +268,236 @@
return new Token(txt, start, start+length, "ARABIC");
}
+    /**
+     * Ad-hoc smoke test for RangeSet: inserts six consecutive integers in shuffled order,
+     * then prints the resulting set size and whether 14 is a member.
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        int[] samples = {10, 14, 13, 11, 15, 12};
+        RangeSet rset = new RangeSet();
+        for (int idx = 0; idx < samples.length; idx++) {
+            rset.add(new Integer(samples[idx]));
+        }
+        int count = rset.size();
+        boolean hasFourteen = rset.contains(new Integer(14));
+        System.out.println(count);
+        System.out.println(hasFourteen);
+    }
+
}
+/**
+ * A set of integers stored as a sorted list of disjoint inclusive ranges.
+ * It is compact and quick to query when the stored numbers are mostly contiguous.
+ * Elements are passed in as Integer or Character objects; on insertion, ranges that
+ * become adjacent are merged so that no two neighbouring ranges ever touch.
+ * <p>
+ * Iteration and toArray expose the elements as Integer objects in ascending order.
+ * remove, removeAll and retainAll are not implemented yet: they leave the set
+ * unchanged and return false.
+ * @author Ahmed Saad, 2005
+ */
+class RangeSet implements Set {
+
+    /** An inclusive range of integers; both bounds are members of the set. */
+    private static final class Range {
+        int from;
+        int to;
+
+        Range(int from, int to) {
+            this.from = from;
+            this.to = to;
+        }
+    }
+
+    /** Ranges in ascending order; they never overlap and are never adjacent. */
+    private final Vector ranges = new Vector();
+
+    public RangeSet() {}
+
+    /**
+     * Returns number of integers stored in this set.
+     * @return number of integers stored, capped at Integer.MAX_VALUE
+     */
+    public int size() {
+        // Accumulate in a long: a single range may already span more than Integer.MAX_VALUE values.
+        long total = 0;
+        for (int i = 0; i < ranges.size(); i++) {
+            Range r = (Range) ranges.elementAt(i);
+            total += (long) r.to - r.from + 1;
+        }
+        return total > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) total;
+    }
+
+    /**
+     * Checks if the set is empty.
+     * @return true if the set is empty
+     */
+    public boolean isEmpty() {
+        return ranges.isEmpty();
+    }
+
+    /**
+     * Checks whether an Integer or Character exists in the set.
+     * A number is in the set if it lies within one of its ranges; the ranges are
+     * located with a binary search.
+     * @param x the Integer or Character to look for
+     * @return true if the number is found
+     * @throws IllegalArgumentException if the passed parameter is neither Integer nor Character
+     */
+    public boolean contains(Object x) throws IllegalArgumentException {
+        return locate(toInt(x, "contains")) >= 0;
+    }
+
+    /**
+     * Returns an iterator over all numbers in this set, as Integer objects in ascending order.
+     * The iterator does not support remove() and does not detect modification of the
+     * set while iterating.
+     * @return an iterator over the elements
+     */
+    public Iterator iterator() {
+        return new Iterator() {
+            private int rangeIndex = 0;
+            private int nextValue = ranges.isEmpty() ? 0 : ((Range) ranges.elementAt(0)).from;
+
+            public boolean hasNext() {
+                return rangeIndex < ranges.size();
+            }
+
+            public Object next() {
+                if (!hasNext()) {
+                    throw new java.util.NoSuchElementException();
+                }
+                Range current = (Range) ranges.elementAt(rangeIndex);
+                int value = nextValue;
+                if (value < current.to) {
+                    nextValue = value + 1;
+                } else {
+                    // current range exhausted: step to the start of the following one, if any
+                    rangeIndex++;
+                    if (rangeIndex < ranges.size()) {
+                        nextValue = ((Range) ranges.elementAt(rangeIndex)).from;
+                    }
+                }
+                return new Integer(value);
+            }
+
+            public void remove() {
+                throw new UnsupportedOperationException("RangeSet iterator does not support remove");
+            }
+        };
+    }
+
+    /**
+     * Retrieves an array of Integer objects for all numbers in the set, in ascending order.
+     * @return a newly allocated array holding every element
+     */
+    public Object[] toArray() {
+        Object[] result = new Object[size()];
+        int k = 0;
+        for (Iterator it = iterator(); it.hasNext();) {
+            result[k++] = it.next();
+        }
+        return result;
+    }
+
+    /**
+     * Stores all numbers of the set, as Integer objects in ascending order, in the given array.
+     * If the array is too small a new array of the same runtime type is allocated; if it is
+     * larger than needed, the slot following the last element is set to null.
+     * @param ar the array to fill when it is large enough
+     * @return the array holding the elements
+     * @throws ArrayStoreException if the array's component type cannot hold Integer objects
+     */
+    public Object[] toArray(Object[] ar) {
+        int count = size();
+        Object[] result = ar;
+        if (ar.length < count) {
+            result = (Object[]) java.lang.reflect.Array.newInstance(ar.getClass().getComponentType(), count);
+        }
+        int k = 0;
+        for (Iterator it = iterator(); it.hasNext();) {
+            result[k++] = it.next();
+        }
+        if (result.length > count) {
+            result[count] = null;
+        }
+        return result;
+    }
+
+    /**
+     * Inserts the given Integer or Character in the set.
+     * @param x the Integer or Character to be added to the set
+     * @return true if this number was added, false if it was already in the set
+     * @throws IllegalArgumentException if the passed parameter is neither Integer nor Character
+     */
+    public boolean add(Object x) {
+        int num = toInt(x, "add");
+        int found = locate(num);
+        if (found >= 0) {
+            return false;
+        }
+        int at = -(found + 1);
+
+        // Neighbours of the insertion point; prev.to < num < next.from, so the +/-1 below cannot overflow.
+        Range prev = at > 0 ? (Range) ranges.elementAt(at - 1) : null;
+        Range next = at < ranges.size() ? (Range) ranges.elementAt(at) : null;
+        boolean touchesPrev = prev != null && prev.to + 1 == num;
+        boolean touchesNext = next != null && next.from - 1 == num;
+
+        if (touchesPrev && touchesNext) {
+            // num closes the gap between two ranges: fuse them into one
+            prev.to = next.to;
+            ranges.removeElementAt(at);
+        } else if (touchesPrev) {
+            prev.to = num;
+        } else if (touchesNext) {
+            next.from = num;
+        } else {
+            ranges.insertElementAt(new Range(num, num), at);
+        }
+        return true;
+    }
+
+    /**
+     * TODO remove an element from the set and split if necessary.
+     * Not implemented yet: the set is left unchanged.
+     * @param arg0 ignored
+     * @return always false
+     */
+    public boolean remove(Object arg0) {
+        return false;
+    }
+
+    /**
+     * Checks if all the Integers or Characters in the Collection are in the set or not.
+     * @param col the values to look for
+     * @return true if all values in the Collection are in the set
+     * @throws IllegalArgumentException if one of the values in the collection is not Integer or Character
+     */
+    public boolean containsAll(Collection col) throws IllegalArgumentException {
+        for (Iterator it = col.iterator(); it.hasNext();) {
+            if (!contains(it.next())) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Adds all elements in a Collection to the set.
+     * @param col the collection to be added
+     * @return true if the set has changed due to this call
+     * @throws IllegalArgumentException if an element in the Collection is not Integer or Character.
+     * Note that the addition of elements will stop after throwing this exception.
+     */
+    public boolean addAll(Collection col) throws IllegalArgumentException {
+        boolean changed = false;
+        for (Iterator it = col.iterator(); it.hasNext();) {
+            // add() must run for every element: do not fold it into a short-circuit expression
+            if (add(it.next())) {
+                changed = true;
+            }
+        }
+        return changed;
+    }
+
+    /**
+     * TODO remove all elements except those in the Collection.
+     * Not implemented yet: the set is left unchanged.
+     * @param arg0 ignored
+     * @return always false
+     */
+    public boolean retainAll(Collection arg0) {
+        return false;
+    }
+
+    /**
+     * TODO remove all elements in the Collection.
+     * Not implemented yet: the set is left unchanged.
+     * @param arg0 ignored
+     * @return always false
+     */
+    public boolean removeAll(Collection arg0) {
+        return false;
+    }
+
+    /**
+     * Removes all elements in the set.
+     */
+    public void clear() {
+        ranges.clear();
+    }
+
+    /**
+     * Extracts the int value from an Integer or Character argument.
+     * @param x the object received by a public method
+     * @param method name of the calling method, used in the error message
+     * @return the numeric value of x
+     * @throws IllegalArgumentException if x is null or of any other type
+     */
+    private static int toInt(Object x, String method) {
+        if (x instanceof Integer) {
+            return ((Integer) x).intValue();
+        }
+        if (x instanceof Character) {
+            return ((Character) x).charValue();
+        }
+        throw new IllegalArgumentException("RangeSet." + method + " must receive an Integer or Character");
+    }
+
+    /**
+     * Binary search for the range containing num.
+     * @param num the number to look for
+     * @return the index of the containing range, or -(insertionIndex + 1) when no range contains num
+     */
+    private int locate(int num) {
+        int low = 0;
+        int high = ranges.size() - 1;
+        while (low <= high) {
+            int mid = (low + high) >>> 1;
+            Range r = (Range) ranges.elementAt(mid);
+            if (num < r.from) {
+                high = mid - 1;
+            } else if (num > r.to) {
+                low = mid + 1;
+            } else {
+                return mid;
+            }
+        }
+        return -(low + 1);
+    }
+
+}