Index: src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java
===================================================================
RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java,v
retrieving revision 1.4
diff -u -r1.4 DictionaryEntry.java
--- src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java 29 Feb 2004 10:58:53 -0000 1.4
+++ src/java/gpl/pierrick/brihaye/aramorph/DictionaryEntry.java 12 Jun 2005 20:15:31 -0000
@@ -26,20 +26,22 @@
package gpl.pierrick.brihaye.aramorph;
+import java.io.Serializable;
import java.util.Arrays;
/** An abstraction of a dictionary entry for a word.
@author Pierrick Brihaye, 2003
*/
-class DictionaryEntry {
+class DictionaryEntry implements Serializable{
private String entry;
private String lemmaID;
private String vocalization;
private String morphology;
private String gloss;
- private String[] glosses;
- private String[] POS;
+ /** The raw POS string exactly as passed to the constructor; getPOS() splits it on '+'. */
+ private String oPOS;
+ /** Individual glosses. Transient and not filled by the constructor: built on demand by the getters when still null. */
+ transient private String[] glosses;
+ /** Individual POS tags. Transient and not filled by the constructor: built on demand by the getters when still null. */
+ transient private String[] POS;
protected DictionaryEntry(String entry, String lemmaID, String vocalization, String morphology, String gloss, String POS) {
this.entry = entry.trim();
@@ -47,32 +49,7 @@
this.vocalization = vocalization.trim();
this.morphology = morphology.trim();
this.gloss = gloss;
- int i, offset;
- String[] array = null;
- //split("[/()]");
- array = gloss.split("\\+");
- for (i = 0 ; i < array.length ; i++) {
- array[i] = array[i].trim();
- }
- //For suffixes
- if ("".equals(array[0])) offset = 1;
- else offset = 0;
- this.glosses = new String[array.length - offset];
- for (i = offset ; i < array.length ; i++) {
- this.glosses[i - offset] = array[i];
- }
- //replaceFirst("^.*/","");
- array = POS.split("\\+");
- for (i = 0 ; i < array.length ; i++) {
- array[i] = array[i].trim();
- }
- //For suffixes
- if ("".equals(array[0])) offset = 1;
- else offset = 0;
- this.POS = new String[array.length - offset];
- for (i = offset ; i < array.length ; i++) {
- this.POS[i - offset] = array[i];
- }
+ this.oPOS = POS;
}
protected String getEntry() { return this.entry; }
@@ -83,11 +60,71 @@
protected String getMorphology() { return this.morphology; }
- protected String[] getPOS() { return this.POS; }
+ /** Returns the individual POS tags of this entry.
+  * The array is built on first use: the raw POS string is split on '+',
+  * every piece is trimmed, and an empty first piece is dropped (the case
+  * the original code labels "For suffixes"). The result is cached in a
+  * transient field, so it is built again after deserialization.
+  * Only the POS string is parsed here; the gloss is not touched.
+  * @return The POS tags
+  */
+ protected String[] getPOS() {
+     if (this.POS == null) {
+         String[] pieces = this.oPOS.split("\\+");
+         for (int i = 0 ; i < pieces.length ; i++) {
+             pieces[i] = pieces[i].trim();
+         }
+         //For suffixes
+         // The length check covers input such as "+", for which split() returns no piece at all.
+         int offset = (pieces.length > 0 && "".equals(pieces[0])) ? 1 : 0;
+         String[] tags = new String[pieces.length - offset];
+         System.arraycopy(pieces, offset, tags, 0, tags.length);
+         // Assign the field only once the array is complete.
+         this.POS = tags;
+     }
+     return this.POS;
+ }
protected String getGloss() { return this.gloss; }
- protected String[] getGlosses() { return this.glosses; }
+ /** Returns the individual glosses of this entry.
+  * The array is built on first use: the gloss is split on '+', every piece
+  * is trimmed, and an empty first piece is dropped (the case the original
+  * code labels "For suffixes"). The result is cached in a transient field,
+  * so it is built again after deserialization.
+  * Only the gloss is parsed here; the POS string is not touched.
+  * @return The glosses
+  */
+ protected String[] getGlosses() {
+     if (this.glosses == null) {
+         String[] pieces = this.gloss.split("\\+");
+         for (int i = 0 ; i < pieces.length ; i++) {
+             pieces[i] = pieces[i].trim();
+         }
+         //For suffixes
+         // The length check covers input such as "+", for which split() returns no piece at all.
+         int offset = (pieces.length > 0 && "".equals(pieces[0])) ? 1 : 0;
+         String[] parts = new String[pieces.length - offset];
+         System.arraycopy(pieces, offset, parts, 0, parts.length);
+         // Assign the field only once the array is complete.
+         this.glosses = parts;
+     }
+     return this.glosses;
+ }
}
Index: src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java
===================================================================
RCS file: /cvsroot/aramorph/aramorph/src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java,v
retrieving revision 1.3
diff -u -r1.3 InMemoryDictionaryHandler.java
--- src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java 26 Feb 2004 13:58:44 -0000 1.3
+++ src/java/gpl/pierrick/brihaye/aramorph/InMemoryDictionaryHandler.java 12 Jun 2005 20:15:32 -0000
@@ -26,10 +26,12 @@
package gpl.pierrick.brihaye.aramorph;
+import java.io.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.sql.Savepoint;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -43,6 +45,10 @@
*/
class InMemoryDictionaryHandler {
+ /** Pattern used while parsing a dictionary line; compiled once instead of once per line.
+  *  Group 1 is the explicit POS information enclosed in a pos element of the gloss field.
+  *  NOTE(review): the element delimiters were missing from the reviewed text (without them
+  *  group 1 is always the last character of the line and the match never fails); they are
+  *  restored here from the Buckwalter dictionary format - confirm against the dictionary files. */
+ private static final Pattern p = Pattern.compile(".*" + "<pos>(.+?)</pos>" + ".*");
+ /** Table used by translate() to replace upper-ASCII characters of a gloss;
+  *  keys are Character objects, values are a Character or a two-letter String. */
+ private static final Map translationMap = getTranslationSet();
/** The unique instance of this handler. */
private static InMemoryDictionaryHandler handler = null;
/** Dictionary of prefixes */
@@ -68,17 +74,177 @@
private InMemoryDictionaryHandler() {
System.out.println("Initializing in-memory dictionary handler...");
// load 3 lexicons
- loadDictionary(prefixes, "dictPrefixes", this.getClass().getResourceAsStream("dictionaries/dictPrefixes"));
+ /*loadDictionary(prefixes, "dictPrefixes", this.getClass().getResourceAsStream("dictionaries/dictPrefixes"));
loadDictionary(stems, "dictStems", this.getClass().getResourceAsStream("dictionaries/dictStems"));
loadDictionary(suffixes, "dictSuffixes", this.getClass().getResourceAsStream("dictionaries/dictSuffixes"));
//load 3 compatibility tables
loadCompatibilityTable(hash_AB, "tableAB", this.getClass().getResourceAsStream("dictionaries/tableAB"));
loadCompatibilityTable(hash_AC, "tableAC", this.getClass().getResourceAsStream("dictionaries/tableAC"));
- loadCompatibilityTable(hash_BC, "tableBC", this.getClass().getResourceAsStream("dictionaries/tableBC"));
+ loadCompatibilityTable(hash_BC, "tableBC", this.getClass().getResourceAsStream("dictionaries/tableBC"));*/
+ // The six text loaders above are commented out: the dictionaries and
+ // compatibility tables are read from their serialized copies instead.
+ readFromFile();
+ // NOTE(review): presumably writeToFile() is enabled together with the text
+ // loaders above to regenerate the serialized files - confirm.
+ //writeToFile();
handler = this;
System.out.println("... done.");
};
+ /**
+  * Writes the three dictionaries and the three compatibility tables in Java
+  * serialization format, one file per structure: "dictPrefixes2",
+  * "dictStems2", "dictSuffixes2", "tableAB2", "tableAC2" and "tableBC2".
+  * The file names are relative, so the files are created in the current
+  * working directory. For this to work, the class DictionaryEntry has to
+  * implement the interface Serializable.
+  * An I/O failure is printed to the standard error stream and stops the
+  * remaining writes; it is not propagated to the caller.
+  */
+ public void writeToFile() {
+     try {
+         writeSerialized("dictPrefixes2", prefixes);
+         writeSerialized("dictStems2", stems);
+         writeSerialized("dictSuffixes2", suffixes);
+
+         writeSerialized("tableAB2", hash_AB);
+         writeSerialized("tableAC2", hash_AC);
+         writeSerialized("tableBC2", hash_BC);
+     } catch (IOException e) {
+         // FileNotFoundException is an IOException, so one handler covers both.
+         e.printStackTrace();
+     }
+ }
+
+ /**
+  * Serializes one object into the named file and closes the file whether
+  * or not the write succeeds.
+  * @param fileName The file to create or overwrite
+  * @param data The object to serialize
+  * @throws IOException If the file can not be opened or written
+  */
+ private static void writeSerialized(String fileName, Object data) throws IOException {
+     FileOutputStream file = new FileOutputStream(fileName);
+     try {
+         ObjectOutputStream out = new ObjectOutputStream(file);
+         out.writeObject(data);
+         // Push the buffered object data to the file before it is closed.
+         out.flush();
+     } finally {
+         file.close();
+     }
+ }
+
+ /**
+  * Reads the three dictionaries and the three compatibility tables back
+  * from the class-path resources "dictionaries/dictPrefixes2",
+  * "dictionaries/dictStems2", "dictionaries/dictSuffixes2",
+  * "dictionaries/tableAB2", "dictionaries/tableAC2" and
+  * "dictionaries/tableBC2", which hold them in Java serialization format.
+  * A read or class-resolution failure is printed to the standard error
+  * stream and stops the remaining reads; it is not propagated.
+  * @throws RuntimeException If one of the resources can not be found
+  */
+ public void readFromFile() {
+     try {
+         prefixes = (MultiHashMap) readSerialized("dictionaries/dictPrefixes2");
+         stems = (MultiHashMap) readSerialized("dictionaries/dictStems2");
+         suffixes = (MultiHashMap) readSerialized("dictionaries/dictSuffixes2");
+
+         hash_AB = (HashSet) readSerialized("dictionaries/tableAB2");
+         hash_AC = (HashSet) readSerialized("dictionaries/tableAC2");
+         hash_BC = (HashSet) readSerialized("dictionaries/tableBC2");
+     } catch (IOException e) {
+         e.printStackTrace();
+     } catch (ClassNotFoundException e) {
+         e.printStackTrace();
+     }
+ }
+
+ /**
+  * Deserializes the single object stored in a class-path resource and
+  * closes the resource whether or not the read succeeds.
+  * @param resource The resource name, resolved relative to this class
+  * @return The deserialized object
+  * @throws IOException If the resource can not be read
+  * @throws ClassNotFoundException If a serialized class can not be resolved
+  * @throws RuntimeException If the resource does not exist
+  */
+ private Object readSerialized(String resource) throws IOException, ClassNotFoundException {
+     InputStream raw = this.getClass().getResourceAsStream(resource);
+     if (raw == null) {
+         // getResourceAsStream() signals a missing resource with null; fail with
+         // the same message the text loader uses instead of a NullPointerException.
+         throw new RuntimeException("Can not open : " + resource);
+     }
+     try {
+         ObjectInputStream in = new ObjectInputStream(raw);
+         return in.readObject();
+     } finally {
+         raw.close();
+     }
+ }
+
+ /**
+  * Builds the table used by translate() to replace upper-ASCII characters
+  * of a gloss with plain ASCII.
+  * Every key is a Character. A value is a Character when the replacement
+  * is a single character and a String when it is two letters ("AE", "ss",
+  * "Sh", ...).
+  * @return The translation table
+  */
+ private static Map getTranslationSet() {
+     Map s = new HashMap();
+
+     mapEach(s, ";", '/'); //TODO : is it necessary ?
+     mapEach(s, "ÀÁÂÃÄÅ", 'A');
+     mapEach(s, "Ç", 'C');
+     mapEach(s, "ÈÉÊË", 'E');
+     mapEach(s, "ÌÍÎÏ", 'I');
+     mapEach(s, "Ñ", 'N');
+     mapEach(s, "ÒÓÔÕÖ", 'O');
+     mapEach(s, "ÙÚÛÜ", 'U');
+     mapEach(s, "àáâãäå", 'a');
+     mapEach(s, "ç", 'c');
+     mapEach(s, "èéêë", 'e');
+     mapEach(s, "ìíîï", 'i');
+     mapEach(s, "ñ", 'n');
+     mapEach(s, "òóôõö", 'o');
+     mapEach(s, "ùúûü", 'u');
+
+     // Two-letter replacements are stored as Strings.
+     s.put(new Character('Æ'), "AE");
+     s.put(new Character('ß'), "ss");
+     s.put(new Character('æ'), "ae");
+
+     // NOTE(review): the four keys below were unreadable in the reviewed text.
+     // They are reconstructed from their replacements as S and Z with caron,
+     // upper and lower case - confirm against the dictionary files.
+     // The bytes 0x8A, 0x8E, 0x9A and 0x9E decode to U+0160, U+017D, U+0161
+     // and U+017E under windows-1252 but to the control characters U+008A,
+     // U+008E, U+009A and U+009E under ISO-8859-1, so both forms are mapped.
+     s.put(new Character('\u0160'), "Sh");
+     s.put(new Character('\u008A'), "Sh");
+     s.put(new Character('\u017D'), "Zh");
+     s.put(new Character('\u008E'), "Zh");
+     s.put(new Character('\u0161'), "sh");
+     s.put(new Character('\u009A'), "sh");
+     s.put(new Character('\u017E'), "zh");
+     s.put(new Character('\u009E'), "zh");
+     return s;
+ }
+
+ /**
+  * Maps every character of a string to the same single-character replacement.
+  * @param table The table to fill
+  * @param sources The characters to be replaced
+  * @param replacement The character replacing each of them
+  */
+ private static void mapEach(Map table, String sources, char replacement) {
+     for (int i = 0; i < sources.length(); i++) {
+         table.put(new Character(sources.charAt(i)), new Character(replacement));
+     }
+ }
/** Returns a unique instance of the handler.
* @return The instance
*/
@@ -204,12 +370,12 @@
String gloss;
String POS;
- Pattern p;
+ //Pattern p;
Matcher m;
// two ways to get the POS info:
// (1) explicitly, by extracting it from the gloss field:
- p = Pattern.compile(".*" + "(.+?)" + ".*");
+ //p = Pattern.compile(".*" + "(.+?)" + ".*");
m = p.matcher(glossPOS);
if (m.matches()) {
POS = m.group(1); //extract POS from glossPOS
@@ -255,7 +421,7 @@
// clean up the gloss: remove POS info and extra space, and convert upper-ASCII to lower (it doesn't convert well to UTF-8)
gloss = gloss.replaceFirst(".+?","");
gloss = gloss.trim();
- //TODO : we definitely need a translate() method in the java packages !
+ /*//TODO : we definitely need a translate() method in the java packages !
gloss = gloss.replaceAll(";","/"); //TODO : is it necessary ?
gloss = gloss.replaceAll("À","A");
gloss = gloss.replaceAll("Á","A");
@@ -313,7 +479,9 @@
gloss = gloss.replaceAll("ß","ss");
gloss = gloss.replaceAll("æ","ae");
gloss = gloss.replaceAll("","sh");
- gloss = gloss.replaceAll("","zh");
+ gloss = gloss.replaceAll("","zh");*/
+ gloss = translate(gloss);
+
// note that although we read 4 fields from the dict we now save 5 fields in the hash table
// because the info in last field, glossPOS, was split into two: gloss and POS
DictionaryEntry de = new DictionaryEntry(entry, lemmaID, vocalization, morphology, gloss, POS);
@@ -333,7 +501,36 @@
throw new RuntimeException("Can not open : " + name);
}
}
-
+
+ /**
+  * Rewrites a gloss by looking up each of its characters, in order, in the
+  * translation map. A character that has no mapping is copied unchanged;
+  * a mapped character is replaced by the value the map holds for it.
+  * @param gloss the gloss to be translated.
+  * @return the translated gloss
+  */
+ private String translate(String gloss) {
+     StringBuffer translated = new StringBuffer(gloss.length() * 2);
+
+     for (int pos = 0; pos < gloss.length(); pos++) {
+         char current = gloss.charAt(pos);
+         Object replacement = translationMap.get(new Character(current));
+         if (replacement != null) {
+             // The map holds Character or String values; either one appends as its text.
+             translated.append(replacement.toString());
+         } else {
+             translated.append(current);
+         }
+     }
+     return translated.toString();
+ }
+
/** Loads a compatibility table into a Set
.
* @param set The set
* @param name A human-readable name