From efb855564023859d09700c9f9ad7a5cef6ab9aa9 Mon Sep 17 00:00:00 2001 From: Jonathan Foley Date: Fri, 19 Jul 2019 11:30:01 -0700 Subject: [PATCH 1/5] use buffered reading of input file during data load --- src/cc/mallet/classify/tui/Csv2Vectors.java | 40 +++++++++++++-------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java index 4f35d159a..db34d2320 100644 --- a/src/cc/mallet/classify/tui/Csv2Vectors.java +++ b/src/cc/mallet/classify/tui/Csv2Vectors.java @@ -12,7 +12,9 @@ import java.util.regex.*; import java.io.*; import java.nio.charset.Charset; - +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.Path; import cc.mallet.classify.*; import cc.mallet.pipe.*; import cc.mallet.pipe.iterator.*; @@ -295,35 +297,46 @@ else if (keepSequence.value) { // Create the instance list and open the input file // - InstanceList instances = new InstanceList (instancePipe); - Reader fileReader; + + BufferedReader fileReader; if (inputFile.value.toString().equals ("-")) { - fileReader = new InputStreamReader (System.in); + fileReader = new BufferedReader(new InputStreamReader (System.in)); } else { - fileReader = new InputStreamReader(new FileInputStream(inputFile.value), encoding.value); + fileReader = Files.newBufferedReader(Paths.get(inputFile.value.toString())); } // // Read instances from the file // - instances.addThruPipe (new CsvIterator (fileReader, Pattern.compile(lineRegex.value), - dataOption.value, labelOption.value, nameOption.value)); - - // - // Save instances to output file - // ObjectOutputStream oos; if (outputFile.value.toString().equals ("-")) { oos = new ObjectOutputStream(System.out); } else { - oos = new ObjectOutputStream(new FileOutputStream(outputFile.value)); + Files.delete(outputFile.value.toPath()); + oos = new ObjectOutputStream(new FileOutputStream(outputFile.value, true)); } - oos.writeObject(instances); + CsvIterator csvIterator = new CsvIterator (fileReader, Pattern.compile(lineRegex.value), + dataOption.value, labelOption.value, nameOption.value); + InstanceList instances = new InstanceList (instancePipe); + while (csvIterator.hasNext()) { + + instances.addThruPipe(csvIterator.next()); + if (instances.size() == 1000000) { + oos.writeObject(instances); + instances = new InstanceList(instancePipe); + } + + } + + + if (instances.size() > 0) + oos.writeObject(instances); + oos.close(); @@ -332,7 +345,6 @@ else if (keepSequence.value) { // or feature alphabets. To maintain compatibility, // we now save that original instance list back to disk // with the new alphabet. - if (usePipeFromVectorsFile.wasInvoked()) { System.out.println(" Rewriting extended pipe from " + usePipeFromVectorsFile.value); From e241fbed99701db35e6719a4b671d8ec76ba4ff1 Mon Sep 17 00:00:00 2001 From: Jonathan Foley Date: Fri, 19 Jul 2019 11:34:33 -0700 Subject: [PATCH 2/5] handle missing file --- src/cc/mallet/classify/tui/Csv2Vectors.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java index db34d2320..4d1daec49 100644 --- a/src/cc/mallet/classify/tui/Csv2Vectors.java +++ b/src/cc/mallet/classify/tui/Csv2Vectors.java @@ -317,7 +317,12 @@ else if (keepSequence.value) { oos = new ObjectOutputStream(System.out); } else { - Files.delete(outputFile.value.toPath()); + try { + Files.delete(outputFile.value.toPath()); + } catch (java.nio.file.NoSuchFileException e){ + // file doesn't exist + + } oos = new ObjectOutputStream(new FileOutputStream(outputFile.value, true)); } CsvIterator csvIterator = new CsvIterator (fileReader, Pattern.compile(lineRegex.value), From 289e09a052389aedcf486a68a3d78628fdc0783c Mon Sep 17 00:00:00 2001 From: Jonathan Foley Date: Mon, 22 Jul 2019 11:26:42 -0700 Subject: [PATCH 3/5] don't keep references to written output objects --- src/cc/mallet/classify/tui/Csv2Vectors.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java index 4d1daec49..0ccc65490 100644 --- a/src/cc/mallet/classify/tui/Csv2Vectors.java +++ b/src/cc/mallet/classify/tui/Csv2Vectors.java @@ -331,8 +331,8 @@ else if (keepSequence.value) { while (csvIterator.hasNext()) { instances.addThruPipe(csvIterator.next()); - if (instances.size() == 1000000) { - oos.writeObject(instances); + if (instances.size() == 10000) { + oos.writeUnshared(instances); instances = new InstanceList(instancePipe); } @@ -340,7 +340,7 @@ else if (keepSequence.value) { if (instances.size() > 0) - oos.writeObject(instances); + oos.writeUnshared(instances); oos.close(); From 4c53ed3ff72a602ea664c7bb25c8feb7cea345fe Mon Sep 17 00:00:00 2001 From: Jonathan Foley Date: Wed, 24 Jul 2019 07:38:57 -0700 Subject: [PATCH 4/5] reset OutputStream write to free resources --- src/cc/mallet/classify/tui/Csv2Vectors.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java index 0ccc65490..3075af7dc 100644 --- a/src/cc/mallet/classify/tui/Csv2Vectors.java +++ b/src/cc/mallet/classify/tui/Csv2Vectors.java @@ -334,6 +334,7 @@ else if (keepSequence.value) { if (instances.size() == 10000) { oos.writeUnshared(instances); instances = new InstanceList(instancePipe); + oos.reset(); } } @@ -343,6 +344,7 @@ else if (keepSequence.value) { oos.writeUnshared(instances); oos.close(); + fileReader.close(); // If we are reusing a pipe from an instance list From 462c7004582e84dc9121563f85ad4ca474b0612e Mon Sep 17 00:00:00 2001 From: Jonathan Foley Date: Wed, 24 Jul 2019 10:50:54 -0700 Subject: [PATCH 5/5] log progress --- src/cc/mallet/classify/tui/Csv2Vectors.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cc/mallet/classify/tui/Csv2Vectors.java b/src/cc/mallet/classify/tui/Csv2Vectors.java index 3075af7dc..c2165ee1a 100644 --- a/src/cc/mallet/classify/tui/Csv2Vectors.java +++ b/src/cc/mallet/classify/tui/Csv2Vectors.java @@ -328,11 +328,14 @@ else if (keepSequence.value) { CsvIterator csvIterator = new CsvIterator (fileReader, Pattern.compile(lineRegex.value), dataOption.value, labelOption.value, nameOption.value); InstanceList instances = new InstanceList (instancePipe); + Integer totalInstances = 0; while (csvIterator.hasNext()) { instances.addThruPipe(csvIterator.next()); if (instances.size() == 10000) { oos.writeUnshared(instances); + totalInstances += instances.size(); + logger.info(String.format("wrote %d instances to output file", totalInstances)); instances = new InstanceList(instancePipe); oos.reset(); } @@ -342,7 +345,8 @@ else if (keepSequence.value) { if (instances.size() > 0) oos.writeUnshared(instances); - + totalInstances += instances.size(); + logger.info(String.format("wrote %d instances to output file", totalInstances)); oos.close(); fileReader.close();