From a284c9b21d0aaa8985efc8f643a495175e76b74b Mon Sep 17 00:00:00 2001 From: Ben McCann <322311+benmccann@users.noreply.github.com> Date: Thu, 2 Jan 2025 14:16:20 -0800 Subject: [PATCH] simplify --- core/src/main/java/tech/tablesaw/api/Row.java | 24 +++++++- .../main/java/tech/tablesaw/api/Table.java | 59 +------------------ .../java/tech/tablesaw/api/TableTest.java | 17 ------ 3 files changed, 26 insertions(+), 74 deletions(-) diff --git a/core/src/main/java/tech/tablesaw/api/Row.java b/core/src/main/java/tech/tablesaw/api/Row.java index 5cabeaa6d..a5348ac06 100644 --- a/core/src/main/java/tech/tablesaw/api/Row.java +++ b/core/src/main/java/tech/tablesaw/api/Row.java @@ -809,8 +809,30 @@ public Column column(int columnIndex) { return tableSlice.column(columnIndex); } + /** Returns true if every value in this row equals the corresponding value in the given row */ + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + + Row other = (Row) obj; + if (columnCount() != other.columnCount()) { + return false; + } + + for (int columnIndex = 0; columnIndex < columnCount(); columnIndex++) { + Column c = column(columnIndex); + if (!c.equals(getRowNumber(), other.getRowNumber())) { + return false; + } + } + return true; + } + /** Returns a hash computed on the values in the backing table at this row */ - public int rowHash() { + @Override + public int hashCode() { int[] values = new int[columnCount()]; for (int i = 0; i < columnCount(); i++) { Column column = tableSlice.column(i); diff --git a/core/src/main/java/tech/tablesaw/api/Table.java b/core/src/main/java/tech/tablesaw/api/Table.java index f712c3f35..7efac146b 100644 --- a/core/src/main/java/tech/tablesaw/api/Table.java +++ b/core/src/main/java/tech/tablesaw/api/Table.java @@ -546,25 +546,6 @@ public static boolean compareRows(int rowNumber, Table table1, Table table2) { return true; } - /** - * Returns true if every value in row1 is equal to 
the same value in row2, where row1 and row2 are - * both rows from this table - */ - private boolean duplicateRows(Row row1, Row row2) { - if (row1.columnCount() != row2.columnCount()) { - return false; - } - boolean result; - for (int columnIndex = 0; columnIndex < row1.columnCount(); columnIndex++) { - Column c = column(columnIndex); - result = c.equals(row1.getRowNumber(), row2.getRowNumber()); - if (!result) { - return false; - } - } - return true; - } - public Table[] sampleSplit(double table1Proportion) { Table[] tables = new Table[2]; int table1Count = (int) Math.round(rowCount() * table1Proportion); @@ -932,51 +913,17 @@ public TableSliceGroup splitOn(CategoricalColumn... columns) { * this table, appears only once in the returned table. */ public Table dropDuplicateRows() { - Table temp = emptyCopy(); - Int2ObjectMap uniqueHashes = new Int2ObjectOpenHashMap<>(); - // ListMultimap uniqueHashes = ArrayListMultimap.create(); + Set uniqueRows = new HashSet<>(); for (Row row : this) { - if (!isDuplicate(row, uniqueHashes)) { + if (!uniqueRows.contains(row)) { + uniqueRows.add(row); temp.append(row); } } return temp; } - /** - * Returns true if all the values in row are identical to those in another row previously seen and - * recorded in the list. - * - * @param row the row to evaluate - * @param uniqueHashes a map of row hashes to the id of an exemplar row that produces that hash. 
- * If two different rows produce the same hash, then the row number for each is placed in the - * list, so that there are exemplars for both - * @return true if the row's values exactly match a row that was previously seen - */ - @VisibleForTesting - protected boolean isDuplicate(Row row, Int2ObjectMap uniqueHashes) { - int hash = row.rowHash(); - if (!uniqueHashes.containsKey(hash)) { - IntArrayList rowNumbers = new IntArrayList(); - rowNumbers.add(row.getRowNumber()); - uniqueHashes.put(hash, rowNumbers); - return false; - } - - // the hashmap contains the hash, make sure the actual row values match - IntArrayList matchingKeys = uniqueHashes.get(hash); - - for (int key : matchingKeys) { - Row oldRow = this.row(key); - if (duplicateRows(row, oldRow)) { - return true; - } - } - uniqueHashes.get(hash).add(row.getRowNumber()); - return false; - } - /** Returns only those records in this table that have no columns with missing values */ public Table dropRowsWithMissingValues() { diff --git a/core/src/test/java/tech/tablesaw/api/TableTest.java b/core/src/test/java/tech/tablesaw/api/TableTest.java index 66aa2c8fb..d659a0ead 100644 --- a/core/src/test/java/tech/tablesaw/api/TableTest.java +++ b/core/src/test/java/tech/tablesaw/api/TableTest.java @@ -22,9 +22,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; -import it.unimi.dsi.fastutil.ints.IntArrayList; import java.io.File; import java.io.StringReader; import java.io.StringWriter; @@ -919,18 +916,4 @@ public void testToStringColumnsWithVaryingSizes() { fail("toString shouldn't throw " + e); } } - - @Test - void testDropDuplicateWithHashCollision() throws Exception { - Table testTable = Table.read().usingOptions(CsvReadOptions - .builder(new File("../data/missing_values.csv")) - .missingValueIndicator("-")); - Row row0 = testTable.row(0); - Int2ObjectMap uniqueHashes = 
new Int2ObjectOpenHashMap<>(); - IntArrayList value = new IntArrayList(new int[] {1, 0}); - uniqueHashes.put(row0.rowHash(), value); - boolean isDuplicate = testTable.isDuplicate(row0, uniqueHashes); - assertTrue(isDuplicate, "Duplicate row not found"); - } - }