Skip to content

Commit

Permalink
simplify
Browse files Browse the repository at this point in the history
  • Loading branch information
benmccann committed Jan 2, 2025
1 parent 9400590 commit 4591325
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 75 deletions.
23 changes: 22 additions & 1 deletion core/src/main/java/tech/tablesaw/api/Row.java
Original file line number Diff line number Diff line change
Expand Up @@ -809,8 +809,29 @@ public Column<?> column(int columnIndex) {
return tableSlice.column(columnIndex);
}

/**
 * Returns true if {@code obj} is a row of exactly the same class, with the same number of
 * columns, whose every value equals the corresponding value in this row.
 *
 * <p>Each comparison is delegated to this row's own column via {@code
 * column(i).equals(getRowNumber(), other.getRowNumber())}, so both row numbers are resolved
 * against this row's columns. NOTE(review): this presumably makes the result meaningful only when
 * both rows are backed by the same table -- confirm against callers.
 *
 * @param obj the object to compare with; may be {@code null}
 * @return {@code true} if the two rows are considered equal, {@code false} otherwise
 */
@Override
public boolean equals(Object obj) {
  // Identity short-circuit: guarantees reflexivity without consulting any column data.
  if (this == obj) {
    return true;
  }
  if (obj == null || obj.getClass() != this.getClass()) {
    return false;
  }

  Row other = (Row) obj;
  // Read once; the column count is not changed by anything in this method.
  final int columns = columnCount();
  if (columns != other.columnCount()) {
    return false;
  }

  for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
    if (!column(columnIndex).equals(getRowNumber(), other.getRowNumber())) {
      return false;
    }
  }
  return true;
}

/** Returns a hash computed on the values in the backing table at this row */
public int rowHash() {
@Override
public int hashCode() {
int[] values = new int[columnCount()];
for (int i = 0; i < columnCount(); i++) {
Column<?> column = tableSlice.column(i);
Expand Down
60 changes: 3 additions & 57 deletions core/src/main/java/tech/tablesaw/api/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import static tech.tablesaw.api.QuerySupport.not;
import static tech.tablesaw.selection.Selection.selectNRowsAtRandom;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.*;
import com.google.common.primitives.Ints;
Expand Down Expand Up @@ -546,25 +545,6 @@ public static boolean compareRows(int rowNumber, Table table1, Table table2) {
return true;
}

/**
 * Reports whether two rows of this table hold identical values.
 *
 * <p>The rows must have the same column count; each of this table's columns is then asked whether
 * its values at the two row numbers are equal. The first mismatch ends the comparison.
 *
 * @param row1 the first row, taken from this table
 * @param row2 the second row, taken from this table
 * @return true if the column counts match and every column reports equal values
 */
private boolean duplicateRows(Row row1, Row row2) {
  final int width = row1.columnCount();
  if (width != row2.columnCount()) {
    return false;
  }
  int columnIndex = 0;
  while (columnIndex < width) {
    if (!column(columnIndex).equals(row1.getRowNumber(), row2.getRowNumber())) {
      return false;
    }
    columnIndex++;
  }
  return true;
}

public Table[] sampleSplit(double table1Proportion) {
Table[] tables = new Table[2];
int table1Count = (int) Math.round(rowCount() * table1Proportion);
Expand Down Expand Up @@ -932,51 +912,17 @@ public TableSliceGroup splitOn(CategoricalColumn<?>... columns) {
* this table, appears only once in the returned table.
*/
public Table dropDuplicateRows() {

// NOTE(review): this span appears to interleave the removed and the added lines of the commit
// (the +/- diff markers were lost), so it is not compilable as written: two `if` statements are
// opened below but only one is closed. The comments mark the apparent split -- confirm against
// the original diff.

// Result table; a row is appended to it the first time it is encountered.
Table temp = emptyCopy();
// Apparently the removed implementation: a map from row hash to row numbers, passed to
// isDuplicate(...).
Int2ObjectMap<IntArrayList> uniqueHashes = new Int2ObjectOpenHashMap<>();
// ListMultimap<Integer, Integer> uniqueHashes = ArrayListMultimap.create();
// Apparently the added implementation: a set of rows already seen, which relies on the row's
// equals/hashCode. NOTE(review): raw Set type; Set<Row> would avoid the unchecked warning.
// NOTE(review): if iterating this table reuses one mutable Row instance, storing it as a set
// element is unsafe -- verify how the iterator produces rows.
Set uniqueRows = new HashSet<>();
for (Row row : this) {
// Apparently removed: duplicate check through the hash-to-row-numbers map.
if (!isDuplicate(row, uniqueHashes)) {
// Apparently added: duplicate check through set membership.
if (!uniqueRows.contains(row)) {
uniqueRows.add(row);
temp.append(row);
}
}
return temp;
}

/**
 * Decides whether {@code row} repeats a row that has already been recorded, and records it if it
 * does not.
 *
 * <p>Rows are bucketed by {@code row.rowHash()}. If the hash has not been seen, a new bucket
 * holding this row's number is stored and the row is reported as not a duplicate. Otherwise the
 * row is compared, value by value, with every row number already in the bucket; only if none of
 * them matches is this row's number added to the bucket, so that rows with colliding hashes each
 * keep an exemplar.
 *
 * @param row the row to evaluate
 * @param uniqueHashes map from row hash to the row numbers of the exemplar rows that produced
 *     that hash; updated by this call when the row is not a duplicate
 * @return true if the row's values exactly match a previously recorded row
 */
@VisibleForTesting
protected boolean isDuplicate(Row row, Int2ObjectMap<IntArrayList> uniqueHashes) {
  final int rowHash = row.rowHash();

  if (uniqueHashes.containsKey(rowHash)) {
    // Hash already seen: a real duplicate must also match one exemplar value for value.
    IntArrayList exemplars = uniqueHashes.get(rowHash);
    for (int exemplar : exemplars) {
      Row candidate = this.row(exemplar);
      if (duplicateRows(row, candidate)) {
        return true;
      }
    }
    // Same hash but different values: keep this row as an additional exemplar.
    exemplars.add(row.getRowNumber());
    return false;
  }

  // First occurrence of this hash: start a new bucket with this row as its exemplar.
  IntArrayList firstSeen = new IntArrayList();
  firstSeen.add(row.getRowNumber());
  uniqueHashes.put(rowHash, firstSeen);
  return false;
}

/** Returns only those records in this table that have no columns with missing values */
public Table dropRowsWithMissingValues() {

Expand Down
17 changes: 0 additions & 17 deletions core/src/test/java/tech/tablesaw/api/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import java.io.File;
import java.io.StringReader;
import java.io.StringWriter;
Expand Down Expand Up @@ -919,18 +916,4 @@ public void testToStringColumnsWithVaryingSizes() {
fail("toString shouldn't throw " + e);
}
}

/**
 * Checks that {@code isDuplicate} reports row 0 as a duplicate when its hash bucket already lists
 * row numbers 1 and 0, i.e. when the matching exemplar is not the first entry in the bucket.
 * Presumably row 1 differs from row 0 in the fixture, simulating a hash collision -- verify
 * against the CSV.
 */
@Test
void testDropDuplicateWithHashCollision() throws Exception {
  File csv = new File("../data/missing_values.csv");
  Table table =
      Table.read().usingOptions(CsvReadOptions.builder(csv).missingValueIndicator("-"));
  Row firstRow = table.row(0);

  // Seed the bucket for row 0's hash with row numbers 1 and 0, in that order.
  Int2ObjectMap<IntArrayList> seenHashes = new Int2ObjectOpenHashMap<>();
  seenHashes.put(firstRow.rowHash(), new IntArrayList(new int[] {1, 0}));

  assertTrue(table.isDuplicate(firstRow, seenHashes), "Duplicate row not found");
}

}

0 comments on commit 4591325

Please sign in to comment.