Index: ArabicTokenizer.java =================================================================== RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/lucene/ArabicTokenizer.java,v retrieving revision 1.1 diff -u -r1.1 ArabicTokenizer.java --- ArabicTokenizer.java 15 Oct 2003 17:12:21 -0000 1.1 +++ ArabicTokenizer.java 13 Jun 2005 08:44:13 -0000 @@ -22,8 +22,13 @@ package gpl.pierrick.brihaye.aramorph.lucene; + import java.io.IOException; import java.io.Reader; +import java.util.Collection; +import java.util.Iterator; +import java.util.Set; +import java.util.Vector; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; @@ -44,6 +49,7 @@ private boolean debug = false; + private static Set arabicLetters = getArabicSet(); /** Constructs a tokenizer that will return tokens in the arabic alphabet. * @param input The reader */ @@ -51,6 +57,80 @@ this(input, false); } + private static Set getArabicSet() { + Set ara = new RangeSet(); + ara.add(new Character('\u067E')); //U+067E : ARABIC LETTER PEH + ara.add(new Character('\u0679')); //U+0679 : ARABIC LETTER TTEH + ara.add(new Character('\u0686')); //U+0686 : ARABIC LETTER TCHEH + ara.add(new Character('\u0698')); //U+0698 : ARABIC LETTER JEH + ara.add(new Character('\u0688')); //U+0688 : ARABIC LETTER DDAL + ara.add(new Character('\u06AF')); //U+06AF : ARABIC LETTER GAF + ara.add(new Character('\u06A9')); //U+06A9 : ARABIC LETTER KEHEH + ara.add(new Character('\u0691')); //U+0691 : ARABIC LETTER RREH + ara.add(new Character('\u06BA')); //U+06BA : ARABIC LETTER NOON GHUNNA + //ara.add(new Character('\u060C')); //U+060C : ARABIC COMMA + ara.add(new Character('\u06BE')); //U+06BE : ARABIC LETTER HEH DOACHASHMEE + //ara.add(new Character('\u061B')); //U+061B : ARABIC SEMICOLON + //ara.add(new Character('\u061F')); //U+061F : ARABIC QUESTION MARK + ara.add(new Character('\u06C1')); //U+06C1 : ARABIC LETTER HEH GOAL + ara.add(new Character('\u0621')); //U+0621 : ARABIC LETTER 
HAMZA + ara.add(new Character('\u0622')); //U+0622 : ARABIC LETTER ALEF WITH MADDA ABOVE + ara.add(new Character('\u0623')); //U+0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE + ara.add(new Character('\u0624')); //U+0624 : ARABIC LETTER WAW WITH HAMZA ABOVE + ara.add(new Character('\u0625')); //U+0625 : ARABIC LETTER ALEF WITH HAMZA BELOW + ara.add(new Character('\u0626')); //U+0626 : ARABIC LETTER YEH WITH HAMZA ABOVE + ara.add(new Character('\u0627')); //U+0627 : ARABIC LETTER ALEF + ara.add(new Character('\u0628')); //U+0628 : ARABIC LETTER BEH + ara.add(new Character('\u0629')); //U+0629 : ARABIC LETTER TEH MARBUTA + ara.add(new Character('\u062A')); //U+062A : ARABIC LETTER TEH + ara.add(new Character('\u062B')); //U+062B : ARABIC LETTER THEH + ara.add(new Character('\u062C')); //U+062C : ARABIC LETTER JEEM + ara.add(new Character('\u062D')); //U+062D : ARABIC LETTER HAH + ara.add(new Character('\u062E')); //U+062E : ARABIC LETTER KHAH + ara.add(new Character('\u062F')); //U+062F : ARABIC LETTER DAL + ara.add(new Character('\u0630')); //U+0630 : ARABIC LETTER THAL + ara.add(new Character('\u0631')); //U+0631 : ARABIC LETTER REH + ara.add(new Character('\u0632')); //U+0632 : ARABIC LETTER ZAIN + ara.add(new Character('\u0633')); //U+0633 : ARABIC LETTER SEEN + ara.add(new Character('\u0634')); //U+0634 : ARABIC LETTER SHEEN + ara.add(new Character('\u0635')); //U+0635 : ARABIC LETTER SAD + ara.add(new Character('\u0636')); //U+0636 : ARABIC LETTER DAD + ara.add(new Character('\u0637')); //U+0637 : ARABIC LETTER TAH + ara.add(new Character('\u0638')); //U+0638 : ARABIC LETTER ZAH + ara.add(new Character('\u0639')); //U+0639 : ARABIC LETTER AIN + ara.add(new Character('\u063A')); //U+063A : ARABIC LETTER GHAIN + ara.add(new Character('\u0640')); //U+0640 : ARABIC TATWEEL + ara.add(new Character('\u0641')); //U+0641 : ARABIC LETTER FEH + ara.add(new Character('\u0642')); //U+0642 : ARABIC LETTER QAF + ara.add(new Character('\u0643')); //U+0643 : ARABIC LETTER KAF + 
ara.add(new Character('\u0644')); //U+0644 : ARABIC LETTER LAM + ara.add(new Character('\u0645')); //U+0645 : ARABIC LETTER MEEM + ara.add(new Character('\u0646')); //U+0646 : ARABIC LETTER NOON + ara.add(new Character('\u0647')); //U+0647 : ARABIC LETTER HEH + ara.add(new Character('\u0648')); //U+0648 : ARABIC LETTER WAW + ara.add(new Character('\u0649')); //U+0649 : ARABIC LETTER ALEF MAKSURA + ara.add(new Character('\u064A')); //U+064A : ARABIC LETTER YEH + ara.add(new Character('\u064B')); //U+064B : ARABIC FATHATAN + ara.add(new Character('\u064C')); //U+064C : ARABIC DAMMATAN + ara.add(new Character('\u064D')); //U+064D : ARABIC KASRATAN + ara.add(new Character('\u064E')); //U+064E : ARABIC FATHA + ara.add(new Character('\u064F')); //U+064F : ARABIC DAMMA + ara.add(new Character('\u0650')); //U+0650 : ARABIC KASRA + ara.add(new Character('\u0651')); //U+0651 : ARABIC SHADDA + ara.add(new Character('\u0652')); //U+0652 : ARABIC SUKUN + ara.add(new Character('\u06D2')); //U+06D2 : ARABIC LETTER YEH BARREE + ara.add(new Character('\u0640')); //U+0640 : ARABIC TATWEEL + ara.add(new Character('\u064B')); //U+064B : ARABIC FATHATAN + ara.add(new Character('\u064C')); //U+064C : ARABIC DAMMATAN + ara.add(new Character('\u064D')); //U+064D : ARABIC KASRATAN + ara.add(new Character('\u064E')); //U+064E : ARABIC FATHA + ara.add(new Character('\u064F')); //U+064F : ARABIC DAMMA + ara.add(new Character('\u0650')); //U+0650 : ARABIC KASRA + ara.add(new Character('\u0651')); //U+0651 : ARABIC SHADDA + ara.add(new Character('\u0652')); //U+0652 : ARABIC SUKUN + return ara; + } + /** Constructs a tokenizer that will return tokens in the arabic alphabet. 
/**
 * A set of integer values stored as a sorted list of disjoint closed ranges.
 *
 * <p>The representation is compact when the stored numbers are mostly
 * contiguous. After every modification the list is kept canonical: ranges are
 * sorted ascending, never overlap and are never adjacent (two ranges that
 * touch are merged into one).
 *
 * <p>Elements may be passed in as {@link Integer} or {@link Character}; a
 * character is stored as its numeric value, so {@code 'A'} and {@code 65}
 * denote the same element. Methods that hand elements back (the iterator and
 * the {@code toArray} methods) always produce {@code Integer} objects.
 *
 * <p>No synchronisation beyond what {@link Vector} itself does is attempted,
 * and iterators are not fail-fast: do not modify the set while iterating.
 *
 * @author Ahmed Saad, 2005
 */
class RangeSet implements Set {

    /** A closed interval {@code [from, to]} of consecutive integers. */
    static class Range {
        public int from, to;

        public Range(int from, int to) {
            this.from = from;
            this.to = to;
        }
    }

    /** Ranges in ascending order; no two of them overlap or touch. */
    private Vector ranges = new Vector();

    public RangeSet() {}

    /**
     * Extracts the numeric value of an element argument.
     *
     * @param x the element, expected to be an Integer or a Character
     * @param method name of the calling method, used in the error message
     * @return the int value of {@code x}
     * @throws IllegalArgumentException if {@code x} is neither Integer nor
     *         Character (this includes {@code null})
     */
    private static int toInt(Object x, String method) {
        if (x instanceof Integer) {
            return ((Integer) x).intValue();
        }
        if (x instanceof Character) {
            return ((Character) x).charValue();
        }
        throw new IllegalArgumentException(
                "RangeSet." + method + " must receive an Integer or Character");
    }

    /**
     * Binary search over the range list.
     *
     * @param num the value to locate
     * @return the index of the range containing {@code num}, or
     *         {@code -(insertionPoint + 1)} when no range contains it, where
     *         {@code insertionPoint} is the index of the first range lying
     *         entirely above {@code num}
     */
    private int search(int num) {
        int low = 0;
        int high = ranges.size() - 1;
        while (low <= high) {
            int mid = (low + high) >>> 1; // unsigned shift: no overflow on the sum
            Range r = (Range) ranges.elementAt(mid);
            if (num < r.from) {
                high = mid - 1;
            } else if (num > r.to) {
                low = mid + 1;
            } else {
                return mid;
            }
        }
        return -(low + 1);
    }

    /** Returns the exact number of stored values as a long (cannot overflow). */
    private long count() {
        long total = 0;
        for (int i = 0; i < ranges.size(); i++) {
            Range r = (Range) ranges.elementAt(i);
            total += (long) r.to - (long) r.from + 1;
        }
        return total;
    }

    /**
     * Returns the number of integers stored in this set.
     *
     * @return number of integers stored, or {@code Integer.MAX_VALUE} if the
     *         real count does not fit in an int
     */
    public int size() {
        long total = count();
        return total > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) total;
    }

    /**
     * Checks whether the set holds no values.
     *
     * @return <code>true</code> if the set is empty
     */
    public boolean isEmpty() {
        return ranges.isEmpty();
    }

    /**
     * Checks whether an Integer or Character is in the set, i.e. whether its
     * value lies inside one of the stored ranges (binary search).
     *
     * @param x the value to look for
     * @return true if the value is in the set
     * @throws IllegalArgumentException if the passed parameter is neither
     *         Integer nor Character
     */
    public boolean contains(Object x) throws IllegalArgumentException {
        return search(toInt(x, "contains")) >= 0;
    }

    /**
     * Returns an iterator over all values in the set, in ascending order, each
     * delivered as an {@code Integer}.
     *
     * <p>The iterator does not support {@code remove()} and its behaviour is
     * undefined if the set is modified while iterating.
     *
     * @return an ascending iterator of Integer values
     */
    public Iterator iterator() {
        return new RangeIterator();
    }

    /** Walks the range list range by range, value by value. */
    private class RangeIterator implements Iterator {
        /** Index of the range currently being walked. */
        private int index = 0;
        /** Next value to hand out; only meaningful while index is valid. */
        private int cursor;

        RangeIterator() {
            if (!ranges.isEmpty()) {
                cursor = ((Range) ranges.elementAt(0)).from;
            }
        }

        public boolean hasNext() {
            return index < ranges.size();
        }

        public Object next() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            Range current = (Range) ranges.elementAt(index);
            int value = cursor;
            if (cursor >= current.to) {
                // Range exhausted: jump to the start of the following one.
                index++;
                if (index < ranges.size()) {
                    cursor = ((Range) ranges.elementAt(index)).from;
                }
            } else {
                cursor++;
            }
            return new Integer(value);
        }

        public void remove() {
            throw new UnsupportedOperationException(
                    "RangeSet iterators do not support remove");
        }
    }

    /**
     * Copies up to {@code n} values, in ascending order, into {@code target}
     * starting at index 0.
     */
    private void fill(Object[] target, int n) {
        Iterator it = iterator();
        for (int k = 0; k < n && it.hasNext(); k++) {
            target[k] = it.next();
        }
    }

    /**
     * Returns all values in the set as an array of Integer objects in
     * ascending order.
     *
     * @return a newly allocated array holding the set's values
     */
    public Object[] toArray() {
        int n = size();
        Object[] out = new Object[n];
        fill(out, n);
        return out;
    }

    /**
     * Stores all values of the set, as Integer objects in ascending order, in
     * the given array if it is large enough, otherwise in a new array of the
     * same runtime type. If the given array has room to spare, the slot right
     * after the last value is set to {@code null}.
     *
     * @param ar the array to fill, or whose runtime type to reuse
     * @return the array holding the set's values
     * @throws ArrayStoreException if the array's component type cannot hold
     *         Integer objects
     */
    public Object[] toArray(Object[] ar) {
        int n = size();
        Object[] out = ar;
        if (ar.length < n) {
            out = (Object[]) java.lang.reflect.Array.newInstance(
                    ar.getClass().getComponentType(), n);
        }
        fill(out, n);
        if (out.length > n) {
            out[n] = null;
        }
        return out;
    }

    /**
     * Inserts the given Integer or Character in the set, merging it with the
     * neighbouring ranges when it is adjacent to them.
     *
     * @param x the Integer or Character to be added to the set
     * @return true if this number was added, false if it was already in the set
     * @throws IllegalArgumentException if the passed parameter is neither
     *         Integer nor Character
     */
    public boolean add(Object x) {
        int num = toInt(x, "add");
        int found = search(num);
        if (found >= 0) {
            return false;
        }

        int at = -(found + 1);
        Range before = at > 0 ? (Range) ranges.elementAt(at - 1) : null;
        Range after = at < ranges.size() ? (Range) ranges.elementAt(at) : null;
        // 'before' is null when num is the smallest possible int and 'after'
        // is null when it is the largest, so num - 1 / num + 1 cannot wrap
        // in a comparison that matters.
        boolean joinsBefore = before != null && before.to == num - 1;
        boolean joinsAfter = after != null && after.from == num + 1;

        if (joinsBefore && joinsAfter) {
            // num fills the one-value gap between two ranges: fuse them.
            before.to = after.to;
            ranges.removeElementAt(at);
        } else if (joinsBefore) {
            before.to = num;
        } else if (joinsAfter) {
            after.from = num;
        } else {
            ranges.insertElementAt(new Range(num, num), at);
        }
        return true;
    }

    /**
     * Removes the given Integer or Character from the set, shrinking or
     * splitting the range that contains it.
     *
     * @param arg0 the Integer or Character to remove
     * @return true if the value was in the set and has been removed
     * @throws IllegalArgumentException if the passed parameter is neither
     *         Integer nor Character
     */
    public boolean remove(Object arg0) {
        int num = toInt(arg0, "remove");
        int idx = search(num);
        if (idx < 0) {
            return false;
        }

        Range hit = (Range) ranges.elementAt(idx);
        if (hit.from == hit.to) {
            ranges.removeElementAt(idx);
        } else if (num == hit.from) {
            hit.from++;
        } else if (num == hit.to) {
            hit.to--;
        } else {
            // Strictly inside: cut the range in two around num.
            ranges.insertElementAt(new Range(num + 1, hit.to), idx + 1);
            hit.to = num - 1;
        }
        return true;
    }

    /**
     * Checks if all the Integers or Characters in the Collection are in the set.
     *
     * @param col the values to look for
     * @return true if all values in the Collection are in the set
     * @throws IllegalArgumentException if one of the values examined is not
     *         Integer or Character
     */
    public boolean containsAll(Collection col) throws IllegalArgumentException {
        Iterator i = col.iterator();
        while (i.hasNext()) {
            if (!contains(i.next())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds all elements in a Collection to the set.
     *
     * @param col the collection to be added
     * @return true if the set has changed due to this call
     * @throws IllegalArgumentException if an element in the Collection is not
     *         Integer or Character. Elements visited before the offending one
     *         stay in the set; the remaining ones are not added.
     */
    public boolean addAll(Collection col) throws IllegalArgumentException {
        boolean changed = false;
        Iterator i = col.iterator();
        while (i.hasNext()) {
            // add() must run for every element. Writing this as
            // "changed || add(i.next())" would stop calling i.next() after the
            // first successful insertion and loop forever.
            if (add(i.next())) {
                changed = true;
            }
        }
        return changed;
    }

    /**
     * Keeps only the values that also occur in the given Collection.
     *
     * @param arg0 the values to keep
     * @return true if the set has changed due to this call
     * @throws IllegalArgumentException if an element in the Collection is not
     *         Integer or Character; the set is left unchanged in that case
     */
    public boolean retainAll(Collection arg0) {
        // Build the intersection separately so a failure leaves this set intact.
        RangeSet kept = new RangeSet();
        Iterator i = arg0.iterator();
        while (i.hasNext()) {
            Object candidate = i.next();
            if (contains(candidate)) {
                kept.add(candidate);
            }
        }
        // kept is a subset of this set, so equal counts mean nothing was dropped.
        if (kept.count() == count()) {
            return false;
        }
        ranges.clear();
        ranges.addAll(kept.ranges);
        return true;
    }

    /**
     * Removes every value that occurs in the given Collection.
     *
     * @param arg0 the values to remove
     * @return true if the set has changed due to this call
     * @throws IllegalArgumentException if an element in the Collection is not
     *         Integer or Character. Elements visited before the offending one
     *         stay removed.
     */
    public boolean removeAll(Collection arg0) {
        if (arg0 == this) {
            // Iterating over ourselves while removing is not supported.
            boolean hadElements = !isEmpty();
            clear();
            return hadElements;
        }
        boolean changed = false;
        Iterator i = arg0.iterator();
        while (i.hasNext()) {
            if (remove(i.next())) {
                changed = true;
            }
        }
        return changed;
    }

    /**
     * Removes all elements in the set.
     */
    public void clear() {
        ranges.clear();
    }

}