diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java
index 4f35d159a..c2165ee1a 100644
--- a/src/cc/mallet/classify/tui/Csv2Vectors.java
+++ b/src/cc/mallet/classify/tui/Csv2Vectors.java
@@ -12,7 +12,6 @@
 import java.util.regex.*;
 import java.io.*;
 import java.nio.charset.Charset;
-
 import cc.mallet.classify.*;
 import cc.mallet.pipe.*;
 import cc.mallet.pipe.iterator.*;
@@ -295,36 +294,53 @@ else if (keepSequence.value) {
 		// Create the instance list and open the input file
 		//

-		InstanceList instances = new InstanceList (instancePipe);
-		Reader fileReader;
+		BufferedReader fileReader;

 		if (inputFile.value.toString().equals ("-")) {
-			fileReader = new InputStreamReader (System.in);
+			fileReader = new BufferedReader(new InputStreamReader (System.in));
 		}
 		else {
-			fileReader = new InputStreamReader(new FileInputStream(inputFile.value), encoding.value);
+			// Keep decoding the input with the configured encoding option.
+			fileReader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile.value), encoding.value));
 		}

 		//
 		// Read instances from the file
 		//

-		instances.addThruPipe (new CsvIterator (fileReader, Pattern.compile(lineRegex.value),
-											   dataOption.value, labelOption.value, nameOption.value));
-
-		//
-		// Save instances to output file
-		//

 		ObjectOutputStream oos;
 		if (outputFile.value.toString().equals ("-")) {
 			oos = new ObjectOutputStream(System.out);
 		}
 		else {
 			oos = new ObjectOutputStream(new FileOutputStream(outputFile.value));
 		}
-		oos.writeObject(instances);
+		// NOTE(review): the output now holds one serialized InstanceList per batch
+		// instead of a single object -- confirm that readers consume every batch.
+		CsvIterator csvIterator = new CsvIterator (fileReader, Pattern.compile(lineRegex.value),
+				dataOption.value, labelOption.value, nameOption.value);
+		InstanceList instances = new InstanceList (instancePipe);
+		int totalInstances = 0;
+		while (csvIterator.hasNext()) {
+			instances.addThruPipe(csvIterator.next());
+			if (instances.size() == 10000) {
+				oos.writeUnshared(instances);
+				totalInstances += instances.size();
+				logger.info(String.format("wrote %d instances to output file", totalInstances));
+				instances = new InstanceList(instancePipe);
+				// Forget objects already written before starting the next batch.
+				oos.reset();
+			}
+		}
+
+		if (instances.size() > 0) {
+			oos.writeUnshared(instances);
+			totalInstances += instances.size();
+		}
+		logger.info(String.format("wrote %d instances to output file", totalInstances));
 		oos.close();
+		fileReader.close();


 		// If we are reusing a pipe from an instance list
@@ -332,7 +348,6 @@ else if (keepSequence.value) {
 		// or feature alphabets. To maintain compatibility,
 		// we now save that original instance list back to disk
 		// with the new alphabet.
-
 		if (usePipeFromVectorsFile.wasInvoked()) {

 			System.out.println(" Rewriting extended pipe from " + usePipeFromVectorsFile.value);