Index: src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java =================================================================== RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java,v retrieving revision 1.4 diff -u -r1.4 DictionaryEntry.java --- src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java 29 Feb 2004 10:58:53 -0000 1.4 +++ src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java 12 Jun 2005 20:15:31 -0000 @@ -26,20 +26,22 @@ package gpl.pierrick.brihaye.aramorph; +import java.io.Serializable; import java.util.Arrays; /** An abstraction of a dictionary entry for a word. @author Pierrick Brihaye, 2003 */ -class DictionaryEntry { +class DictionaryEntry implements Serializable{ private String entry; private String lemmaID; private String vocalization; private String morphology; private String gloss; - private String[] glosses; - private String[] POS; + private String oPOS; + transient private String[] glosses; + transient private String[] POS; protected DictionaryEntry(String entry, String lemmaID, String vocalization, String morphology, String gloss, String POS) { this.entry = entry.trim(); @@ -47,32 +49,7 @@ this.vocalization = vocalization.trim(); this.morphology = morphology.trim(); this.gloss = gloss; - int i, offset; - String[] array = null; - //split("[/()]"); - array = gloss.split("\\+"); - for (i = 0 ; i < array.length ; i++) { - array[i] = array[i].trim(); - } - //For suffixes - if ("".equals(array[0])) offset = 1; - else offset = 0; - this.glosses = new String[array.length - offset]; - for (i = offset ; i < array.length ; i++) { - this.glosses[i - offset] = array[i]; - } - //replaceFirst("^.*/",""); - array = POS.split("\\+"); - for (i = 0 ; i < array.length ; i++) { - array[i] = array[i].trim(); - } - //For suffixes - if ("".equals(array[0])) offset = 1; - else offset = 0; - this.POS = new String[array.length - offset]; - for (i = offset ; i < array.length ; i++) { - this.POS[i - offset] 
= array[i]; - } + this.oPOS = POS; } protected String getEntry() { return this.entry; } @@ -83,11 +60,71 @@ protected String getMorphology() { return this.morphology; } - protected String[] getPOS() { return this.POS; } + protected String[] getPOS() { + if (this.POS == null) { + int i, offset; + String[] array = null; + //split("[/()]"); + array = gloss.split("\\+"); + for (i = 0 ; i < array.length ; i++) { + array[i] = array[i].trim(); + } + //For suffixes + if ("".equals(array[0])) offset = 1; + else offset = 0; + this.glosses = new String[array.length - offset]; + for (i = offset ; i < array.length ; i++) { + this.glosses[i - offset] = array[i]; + } + //replaceFirst("^.*/",""); + array = this.oPOS.split("\\+"); + for (i = 0 ; i < array.length ; i++) { + array[i] = array[i].trim(); + } + //For suffixes + if ("".equals(array[0])) offset = 1; + else offset = 0; + this.POS = new String[array.length - offset]; + for (i = offset ; i < array.length ; i++) { + this.POS[i - offset] = array[i]; + } + } + return this.POS; + } protected String getGloss() { return this.gloss; } - protected String[] getGlosses() { return this.glosses; } + protected String[] getGlosses() { + if (this.glosses == null) { + int i, offset; + String[] array = null; + //split("[/()]"); + array = gloss.split("\\+"); + for (i = 0 ; i < array.length ; i++) { + array[i] = array[i].trim(); + } + //For suffixes + if ("".equals(array[0])) offset = 1; + else offset = 0; + this.glosses = new String[array.length - offset]; + for (i = offset ; i < array.length ; i++) { + this.glosses[i - offset] = array[i]; + } + //replaceFirst("^.*/",""); + array = this.oPOS.split("\\+"); + for (i = 0 ; i < array.length ; i++) { + array[i] = array[i].trim(); + } + //For suffixes + if ("".equals(array[0])) offset = 1; + else offset = 0; + this.POS = new String[array.length - offset]; + for (i = offset ; i < array.length ; i++) { + this.POS[i - offset] = array[i]; + } + } + return this.glosses; + } } Index: 
src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java =================================================================== RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java,v retrieving revision 1.3 diff -u -r1.3 InMemoryDictionaryHandler.java --- src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java 26 Feb 2004 13:58:44 -0000 1.3 +++ src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java 12 Jun 2005 20:15:32 -0000 @@ -26,10 +26,12 @@ package gpl.pierrick.brihaye.aramorph; +import java.io.*; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; +import java.sql.Savepoint; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -43,6 +45,10 @@ */ class InMemoryDictionaryHandler { + /** A pattern which is used in parsing the dictionary file*/ + private static Pattern p = Pattern.compile(".*" + "(.+?)" + ".*"); + /**This map is used for translation of dictionaries*/ + private static Map translationMap = getTranslationSet(); /** The unique instance of this handler. 
*/ private static InMemoryDictionaryHandler handler = null; /** Dictionary of prefixes */ @@ -68,17 +74,177 @@ private InMemoryDictionaryHandler() { System.out.println("Initializing in-memory dictionary handler..."); // load 3 lexicons - loadDictionary(prefixes, "dictPrefixes", this.getClass().getResourceAsStream("dictionaries/dictPrefixes")); + /*loadDictionary(prefixes, "dictPrefixes", this.getClass().getResourceAsStream("dictionaries/dictPrefixes")); loadDictionary(stems, "dictStems", this.getClass().getResourceAsStream("dictionaries/dictStems")); loadDictionary(suffixes, "dictSuffixes", this.getClass().getResourceAsStream("dictionaries/dictSuffixes")); //load 3 compatibility tables loadCompatibilityTable(hash_AB, "tableAB", this.getClass().getResourceAsStream("dictionaries/tableAB")); loadCompatibilityTable(hash_AC, "tableAC", this.getClass().getResourceAsStream("dictionaries/tableAC")); - loadCompatibilityTable(hash_BC, "tableBC", this.getClass().getResourceAsStream("dictionaries/tableBC")); + loadCompatibilityTable(hash_BC, "tableBC", this.getClass().getResourceAsStream("dictionaries/tableBC"));*/ + readFromFile(); + //writeToFile(); handler = this; System.out.println("... done."); }; + /** + * This function will write the dictionaries and compatibility tables in the Java + * serializable formats. 
For this to work, we had to implement the interface + * Serializable for the class DictionaryEntry + */ + public void writeToFile() { + ObjectOutputStream out; + try { + out = new ObjectOutputStream(new FileOutputStream("dictPrefixes2")); + out.writeObject(prefixes); + out.close(); + + out = new ObjectOutputStream(new FileOutputStream("dictStems2")); + out.writeObject(stems); + out.close(); + + out = new ObjectOutputStream(new FileOutputStream("dictSuffixes2")); + out.writeObject(suffixes); + out.close(); + + out = new ObjectOutputStream(new FileOutputStream("tableAB2")); + out.writeObject(hash_AB); + out.close(); + + out = new ObjectOutputStream(new FileOutputStream("tableAC2")); + out.writeObject(hash_AC); + out.close(); + + out = new ObjectOutputStream(new FileOutputStream("tableBC2")); + out.writeObject(hash_BC); + out.close(); + /*out = new ObjectOutputStream(new FileOutputStream("tables")); + out.writeObject(prefixes); + out.writeObject(stems); + out.writeObject(suffixes); + + out.writeObject(hash_AB); + out.writeObject(hash_AC); + out.writeObject(hash_BC); + out.close();*/ + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * This function reads dictionaries and compatibility tables back from + * the files stored in Java serializable format. 
+ */ + public void readFromFile() { + ObjectInputStream in; + try { + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/dictPrefixes2")); + prefixes = (MultiHashMap)in.readObject(); + in.close(); + + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/dictStems2")); + stems = (MultiHashMap)in.readObject(); + in.close(); + + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/dictSuffixes2")); + suffixes = (MultiHashMap)in.readObject(); + in.close(); + + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/tableAB2")); + hash_AB = (HashSet) in.readObject(); + in.close(); + + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/tableAC2")); + hash_AC = (HashSet) in.readObject(); + in.close(); + + in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/tableBC2")); + hash_BC = (HashSet) in.readObject(); + in.close(); + /*in = new ObjectInputStream(this.getClass().getResourceAsStream("dictionaries/tables")); + prefixes = (MultiHashMap)in.readObject(); + stems = (MultiHashMap)in.readObject(); + suffixes = (MultiHashMap)in.readObject(); + + hash_AB = (HashSet) in.readObject(); + hash_AC = (HashSet) in.readObject(); + hash_BC = (HashSet) in.readObject(); + + in.close();*/ + } catch (IOException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private static Map getTranslationSet() { + Map s = new HashMap(61); + + s.put(new Character(';'),new Character('/')); //TODO : is it necessary ? 
+ s.put(new Character('À'),new Character('A')); + s.put(new Character('Á'),new Character('A')); + s.put(new Character('Â'),new Character('A')); + s.put(new Character('Ã'),new Character('A')); + s.put(new Character('Ä'),new Character('A')); + s.put(new Character('Å'),new Character('A')); + s.put(new Character('Ç'),new Character('C')); + s.put(new Character('È'),new Character('E')); + s.put(new Character('É'),new Character('E')); + s.put(new Character('Ê'),new Character('E')); + s.put(new Character('Ë'),new Character('E')); + s.put(new Character('Ì'),new Character('I')); + s.put(new Character('Í'),new Character('I')); + s.put(new Character('Î'),new Character('I')); + s.put(new Character('Ï'),new Character('I')); + s.put(new Character('Ñ'),new Character('N')); + s.put(new Character('Ò'),new Character('O')); + s.put(new Character('Ó'),new Character('O')); + s.put(new Character('Ô'),new Character('O')); + s.put(new Character('Õ'),new Character('O')); + s.put(new Character('Ö'),new Character('O')); + s.put(new Character('Ù'),new Character('U')); + s.put(new Character('Ú'),new Character('U')); + s.put(new Character('Û'),new Character('U')); + s.put(new Character('Ü'),new Character('U')); + s.put(new Character('à'),new Character('a')); + s.put(new Character('á'),new Character('a')); + s.put(new Character('â'),new Character('a')); + s.put(new Character('ã'),new Character('a')); + s.put(new Character('ä'),new Character('a')); + s.put(new Character('å'),new Character('a')); + s.put(new Character('ç'),new Character('c')); + s.put(new Character('è'),new Character('e')); + s.put(new Character('é'),new Character('e')); + s.put(new Character('ê'),new Character('e')); + s.put(new Character('ë'),new Character('e')); + s.put(new Character('ì'),new Character('i')); + s.put(new Character('í'),new Character('i')); + s.put(new Character('î'),new Character('i')); + s.put(new Character('ï'),new Character('i')); + s.put(new Character('ñ'),new Character('n')); + s.put(new Character('ò'),new 
Character('o')); + s.put(new Character('ó'),new Character('o')); + s.put(new Character('ô'),new Character('o')); + s.put(new Character('õ'),new Character('o')); + s.put(new Character('ö'),new Character('o')); + s.put(new Character('ù'),new Character('u')); + s.put(new Character('ú'),new Character('u')); + s.put(new Character('û'),new Character('u')); + s.put(new Character('ü'),new Character('u')); + s.put(new Character('Æ'),new String("AE")); + s.put(new Character('Š'),new String("Sh")); + s.put(new Character('Ž'),new String("Zh")); + s.put(new Character('ß'),new String("ss")); + s.put(new Character('æ'),new String("ae")); + s.put(new Character('š'),new String("sh")); + s.put(new Character('ž'),new String("zh")); + return s; + } /** Returns a unique instance of the handler. * @return The instance */ @@ -204,12 +370,12 @@ String gloss; String POS; - Pattern p; + //Pattern p; Matcher m; // two ways to get the POS info: // (1) explicitly, by extracting it from the gloss field: - p = Pattern.compile(".*" + "(.+?)" + ".*"); + //p = Pattern.compile(".*" + "(.+?)" + ".*"); m = p.matcher(glossPOS); if (m.matches()) { POS = m.group(1); //extract POS from glossPOS @@ -255,7 +421,7 @@ // clean up the gloss: remove POS info and extra space, and convert upper-ASCII to lower (it doesn't convert well to UTF-8) gloss = gloss.replaceFirst(".+?",""); gloss = gloss.trim(); - //TODO : we definitely need a translate() method in the java packages ! + /*//TODO : we definitely need a translate() method in the java packages ! gloss = gloss.replaceAll(";","/"); //TODO : is it necessary ? 
gloss = gloss.replaceAll("À","A"); gloss = gloss.replaceAll("Á","A"); @@ -313,7 +479,9 @@ gloss = gloss.replaceAll("ß","ss"); gloss = gloss.replaceAll("æ","ae"); gloss = gloss.replaceAll("š","sh"); - gloss = gloss.replaceAll("ž","zh"); + gloss = gloss.replaceAll("ž","zh");*/ + gloss = translate(gloss); + // note that although we read 4 fields from the dict we now save 5 fields in the hash table // because the info in last field, glossPOS, was split into two: gloss and POS DictionaryEntry de = new DictionaryEntry(entry, lemmaID, vocalization, morphology, gloss, POS); @@ -333,7 +501,36 @@ throw new RuntimeException("Can not open : " + name); } } - + + /** + * This function translates the given gloss according + * to the translation map. The characters of the gloss + * will be traversed one by one and replaced according + * to the translation map. + * @param gloss the gloss to be translated. + */ + private String translate(String gloss) { + char[] translatedGloss = new char[gloss.length() * 2]; + int i=0; + char[] arGloss = gloss.toCharArray(); + + for (int x = 0;x < arGloss.length;x++) { + Object result = translationMap.get(new Character(arGloss[x])); + if (result == null) + translatedGloss[i++] = arGloss[x]; + else if (result instanceof Character) { + Character temp1 = (Character)result; + translatedGloss[i++] = temp1.charValue(); + } else /*instance of String*/{ + String temp1 = (String)result; + char temp2[] = temp1.toCharArray(); + for (int j = 0; j < temp2.length;j++) + translatedGloss[i++]=temp2[j]; + } + } + return new String(translatedGloss,0,i); + } + /** Loads a compatibility table into a Set. * @param set The set * @param name A human-readable name