CogComp · cowchipkid · Sep 12, 2017 · Aug 25, 2017 · Aug 25, 2017 · Aug 25, 2017
diff --git a/.gitignore b/.gitignore
@@ -63,3 +63,4 @@ lbjava-examples/src/main/java/edu/illinois/cs/cogcomp/lbjava/examples/spam/SpamC
 lbjava-examples/src/main/java/edu/illinois/cs/cogcomp/lbjava/examples/regression/MyFeatures.java
 lbjava-examples/src/main/java/edu/illinois/cs/cogcomp/lbjava/examples/regression/MyLabel.java
 lbjava-examples/src/main/java/edu/illinois/cs/cogcomp/lbjava/examples/regression/SGDClassifier.java
+/.metadata/
diff --git a/lbjava-examples/pom.xml b/lbjava-examples/pom.xml
@@ -3,7 +3,7 @@
     <parent>
         <artifactId>lbjava-project</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>1.2.26</version>
+        <version>1.3.0</version>
     </parent>
 
     <modelVersion>4.0.0</modelVersion>
@@ -27,12 +27,12 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>LBJava</artifactId>
-            <version>1.2.26</version>
+            <version>1.3.0</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>lbjava-maven-plugin</artifactId>
-            <version>1.2.26</version>
+            <version>1.3.0</version>
         </dependency>
     </dependencies>
 
@@ -63,7 +63,7 @@
             <plugin>
                 <groupId>edu.illinois.cs.cogcomp</groupId>
                 <artifactId>lbjava-maven-plugin</artifactId>
-                <version>1.2.26</version>
+                <version>1.3.0</version>
                 <configuration>
                     <gspFlag>${project.basedir}/src/main/java</gspFlag>
                     <dFlag>${project.basedir}/target/classes</dFlag>

diff --git a/lbjava-mvn-plugin/pom.xml b/lbjava-mvn-plugin/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>lbjava-project</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>1.2.26</version>
+        <version>1.3.0</version>
     </parent>
 
     <artifactId>lbjava-maven-plugin</artifactId>
@@ -76,7 +76,7 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>LBJava</artifactId>
-            <version>1.2.26</version>
+            <version>1.3.0</version>
             <type>jar</type>
             <scope>compile</scope>
         </dependency>

diff --git a/lbjava/pom.xml b/lbjava/pom.xml
@@ -3,7 +3,7 @@
 	<parent>
         <artifactId>lbjava-project</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>1.2.26</version>
+        <version>1.3.0</version>
     </parent>
 
 	<modelVersion>4.0.0</modelVersion>

diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/Train.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/Train.java
@@ -880,53 +880,57 @@ public void run() {
             if (!lce.onlyCodeGeneration) {
                 // If there's a "from" clause, train.
                 try {
-                    if (lce.parser != null) {
-                        System.out.println("Training " + getName());
-                        if (preExtract) {
-                            preExtractAndPrune();
-                            System.gc();
-                        } else
-                            learner.saveLexicon();
-                        int trainingRounds = 1;
-
-                        if (tuningParameters) {
-                            String parametersPath = getName();
-                            if (Main.classDirectory != null)
-                                parametersPath =
-                                        Main.classDirectory + File.separator + parametersPath;
-                            parametersPath += ".p";
-
-                            Learner.Parameters bestParameters = tune();
-                            trainingRounds = bestParameters.rounds;
-                            Learner.writeParameters(bestParameters, parametersPath);
-                            System.out.println("  " + getName()
-                                    + ": Training on entire training set");
-                        } else {
-                            if (lce.rounds != null)
-                                trainingRounds = Integer.parseInt(((Constant) lce.rounds).value);
-
-                            if (lce.K != null) {
-                                int[] rounds = {trainingRounds};
-                                int k = Integer.parseInt(lce.K.value);
-                                double alpha = Double.parseDouble(lce.alpha.value);
-                                trainer.crossValidation(rounds, k, lce.splitPolicy, alpha,
-                                        testingMetric, true);
+                    learner.beginTraining();
+                    try {
+                        if (lce.parser != null) {
+                            System.out.println("Training " + getName());
+                            if (preExtract) {
+                                preExtractAndPrune();
+                                System.gc();
+                            } else
+                                learner.saveLexicon();
+                            int trainingRounds = 1;
+
+                            if (tuningParameters) {
+                                String parametersPath = getName();
+                                if (Main.classDirectory != null)
+                                    parametersPath =
+                                            Main.classDirectory + File.separator + parametersPath;
+                                parametersPath += ".p";
+
+                                Learner.Parameters bestParameters = tune();
+                                trainingRounds = bestParameters.rounds;
+                                Learner.writeParameters(bestParameters, parametersPath);
                                 System.out.println("  " + getName()
                                         + ": Training on entire training set");
+                            } else {
+                                if (lce.rounds != null)
+                                    trainingRounds = Integer.parseInt(((Constant) lce.rounds).value);
+
+                                if (lce.K != null) {
+                                    int[] rounds = {trainingRounds};
+                                    int k = Integer.parseInt(lce.K.value);
+                                    double alpha = Double.parseDouble(lce.alpha.value);
+                                    trainer.crossValidation(rounds, k, lce.splitPolicy, alpha,
+                                            testingMetric, true);
+                                    System.out.println("  " + getName()
+                                            + ": Training on entire training set");
+                                }
                             }
-                        }
-
-                        trainer.train(lce.startingRound, trainingRounds);
-
-                        if (testParser != null) {
-                            System.out.println("Testing " + getName());
-                            new Accuracy(true).test(learner, learner.getLabeler(), testParser);
-                        }
-
-                        System.out.println("Writing " + getName());
-                    } else
-                        learner.saveLexicon(); // Writes .lex even if lexicon is empty.
-
+                            trainer.train(lce.startingRound, trainingRounds);
+                        } else
+                            learner.saveLexicon(); // Writes .lex even if lexicon is empty.
+                    } finally {
+                        learner.doneTraining();
+                    }
+
+                    if (lce.parser != null && testParser != null) {
+                        System.out.println("Testing " + getName());
+                        new Accuracy(true).test(learner, learner.getLabeler(), testParser);
+                    }
+
+                    // save the final model.
+                    System.out.println("Writing " + getName());
                     learner.save(); // Doesn't write .lex if lexicon is empty.
                 } catch (Exception e) {
                     System.err.println("LBJava ERROR: Exception while training " + getName() + ":");

diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Learner.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Learner.java
@@ -66,6 +66,9 @@ public abstract class Learner extends Classifier {
 
     /** The number of candidate examples when a global object is passed here. */
     protected int candidates = 1;
+
+    /** this is set while training. */
+    protected boolean intraining = false;
 
     /**
      * This constructor is used by the LBJava compiler; it should never be called by a programmer.
@@ -259,7 +262,6 @@ public URL getModelLocation() {
         return lcFilePath;
     }
 
-
     /**
      * Sets the location of the lexicon as a regular file on this file system.
      *
@@ -289,7 +291,6 @@ public URL getLexiconLocation() {
         return lexFilePath;
     }
 
-
     /**
      * Establishes a new feature counting policy for this learner's lexicon.
      *
@@ -304,7 +305,6 @@ public void countFeatures(Lexicon.CountPolicy policy) {
         lexicon.countFeatures(policy);
     }
 
-
     /**
      * Returns this learner's feature lexicon after discarding any feature counts it may have been
      * storing. This method is likely only useful when the lexicon and its counts are currently
@@ -320,7 +320,6 @@ public Lexicon getLexiconDiscardCounts() {
         return lexicon;
     }
 
-
     /**
      * Returns a new, emtpy learner into which all of the parameters that control the behavior of
      * the algorithm have been copied. Here, "emtpy" means no learning has taken place.
@@ -331,7 +330,6 @@ public Learner emptyClone() {
         return clone;
     }
 
-
     /**
      * Trains the learning algorithm given an object as an example. By default, this simply converts
      * the example object into arrays and passes it to {@link #learn(int[],double[],int[],double[])}
@@ -345,7 +343,6 @@ public void learn(Object example) {
                 (double[]) exampleArray[3]);
     }
 
-
     /**
      * Trains the learning algorithm given a feature vector as an example. This simply converts the
      * example object into arrays and passes it to {@link #learn(int[],double[],int[],double[])}.
@@ -633,6 +630,15 @@ public double realValue(int[] f, double[] v) {
                         + getClass().getName() + "'.");
     }
 
+    /**
+     * Marks the start of a training session. A single session may cover the training of many
+     * models, for example during cross validation or parameter tuning.
+     **/
+    public void beginTraining() {
+        this.intraining = true;
+    }
+
+
 
     /**
      * Overridden by subclasses to perform any required post-processing computations after all
@@ -642,6 +648,21 @@ public double realValue(int[] f, double[] v) {
     public void doneLearning() {}
 
 
+    /**
+     * Ends the training session opened by {@link #beginTraining()}. Subclasses override this to run
+     * post-training optimizations (in particular, feature subset reduction) and should call super.
+     * @throws IllegalStateException if {@link #beginTraining()} was not called first.
+     */
+    public void doneTraining() {
+        if (!intraining) {
+            throw new IllegalStateException("calling doneTraining without previously calling beginTraining"
+                + " violates the lifecycle contract. Or perhaps the subclass does not call the superclass "
+                + "method. Contact the developer.");
+        }
+        intraining = false;
+    }
+
+
     /**
      * This method is sometimes called before training begins, although it is not guaranteed to be
      * called at all. It allows the number of examples and number of features to be passed to the

diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java
@@ -9,12 +9,15 @@
 
 import java.io.Serializable;
 import java.net.URL;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
 import edu.illinois.cs.cogcomp.core.datastructures.vectors.*;
+import edu.illinois.cs.cogcomp.lbjava.classify.DiscreteConjunctiveFeature;
 import edu.illinois.cs.cogcomp.lbjava.classify.Feature;
+import edu.illinois.cs.cogcomp.lbjava.classify.RealConjunctiveFeature;
 import edu.illinois.cs.cogcomp.lbjava.util.ByteString;
 import edu.illinois.cs.cogcomp.lbjava.util.ClassUtils;
 import edu.illinois.cs.cogcomp.lbjava.util.FVector;
@@ -305,7 +308,7 @@ public boolean contains(Feature f) {
      *
      * @param f The feature to look up.
      * @return The integer key that the feature maps to.
-     **/
+     **/ 
     public int lookup(Feature f) {
         return lookup(f, false, -1);
     }
@@ -661,6 +664,36 @@ public void discardPrunedFeatures() {
         pruneCutoff = -1;
     }
 
+    /**
+     * Discard features at the provided indices. This operation is performed
+     * last to first so we can do it in place. Note that this method sorts the
+     * caller's input array in place as a side effect.
+     * @param dumpthese the indexes of the features to dump.
+     */
+    public void discardPrunedFeatures(int[] dumpthese) {
+        Arrays.sort(dumpthese);
+        lexiconInv.remove(dumpthese);
+
+        // this compresses the FVector
+        lexiconInv = new FVector(lexiconInv);
+        if (lexicon != null) {
+
+            // reconstitute the lexicon so every surviving feature maps to its new index.
+            lexicon.clear();
+            for (int i = 0; i < lexiconInv.size(); i++) {
+                lexicon.put(lexiconInv.get(i), Integer.valueOf(i));
+            }
+
+            // sanity check, make sure the indices in the lexicon map matches the index in the feature vector
+            for (int i = 0; i < lexiconInv.size(); i++) {
+                if (i != ((Integer) lexicon.get(lexiconInv.get(i))).intValue()) {
+                    throw new RuntimeException("After optimization pruning, the index in the lexicon did "
+                        + "not match the inverted index.");
+                }
+            }
+        }
+    }
+
 
     /**
      * <!-- clone() --> Returns a deep clone of this lexicon implemented as a <code>HashMap</code>.
@@ -742,10 +775,9 @@ public int compare(int i1, int i2) {
         ByteString previousBSIdentifier = null;
         out.writeInt(indexes.length);
         out.writeInt(pruneCutoff);
-
         for (int i = 0; i < indexes.length; ++i) {
             Feature f = inverse.get(indexes[i]);
-            previousClassName =
+             previousClassName =
                     f.lexWrite(out, this, previousClassName, previousPackage, previousClassifier,
                             previousSIdentifier, previousBSIdentifier);
             previousPackage = f.getPackage();
@@ -757,7 +789,6 @@ else if (f.hasByteStringIdentifier())
 
             out.writeInt(indexes[i]);
         }
-
         if (featureCounts == null)
             out.writeInt(0);
         else
@@ -801,14 +832,12 @@ public void read(ExceptionlessInputStream in, boolean readCounts) {
         pruneCutoff = in.readInt();
         lexicon = null;
         lexiconInv = new FVector(N);
-
         for (int i = 0; i < N; ++i) {
             Feature f =
                     Feature.lexReadFeature(in, this, previousClass, previousPackage,
                             previousClassifier, previousSIdentifier, previousBSIdentifier);
             int index = in.readInt();
             lexiconInv.set(index, f);
-
             previousClass = f.getClass();
             previousPackage = f.getPackage();
             previousClassifier = f.getGeneratingClassifier();
@@ -817,7 +846,7 @@ public void read(ExceptionlessInputStream in, boolean readCounts) {
             else if (f.hasByteStringIdentifier())
                 previousBSIdentifier = f.getByteStringIdentifier();
         }
-
+        
         if (readCounts) {
             featureCounts = new IVector();
             featureCounts.read(in);