Skip to content

Commit

Permalink
Normalize data counts in a generic case-insensitive way
Browse files Browse the repository at this point in the history
  • Loading branch information
onursumer committed Oct 1, 2024
1 parent f7d91c0 commit 179186e
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -172,15 +172,52 @@ private StudyViewFilterContext createContext(StudyViewFilter studyViewFilter) {
}

/**
 * Groups the given data counts by attribute id and builds one {@link ClinicalDataCountItem}
 * per attribute, carrying that attribute's id and its list of counts.
 *
 * NOTE(review): this span comes from a rendered commit diff, so some statements appear twice
 * (the version before the change followed by the version after it).
 *
 * @param dataCounts flat list of data counts, possibly spanning several attributes
 * @return one item per distinct attribute id
 */
private List<ClinicalDataCountItem> generateDataCountItemsFromDataCounts(List<ClinicalDataCount> dataCounts) {
// Bucket the flat list by attribute id; each bucket becomes one item below.
return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId))
return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId))
.entrySet().parallelStream().map(e -> {
ClinicalDataCountItem item = new ClinicalDataCountItem();
item.setAttributeId(e.getKey());
// Earlier form: the grouped counts are stored as-is.
item.setCounts(e.getValue());
// Later form: the grouped counts are passed through normalizeDataCounts before being stored.
item.setCounts(normalizeDataCounts(e.getValue()));
return item;
}).toList();
}


/**
 * Merges data counts whose values differ only by letter case into one aggregated count.
 * For example, the attribute values "TRUE", "True", and "true" collapse into a single entry
 * whose count is the sum of the three.
 *
 * <p>The surviving value is the variant that sorts last under {@link String#compareTo}; for
 * ASCII letters this means a lower-case spelling wins ("true" over "True" over "TRUE").
 * A null count contributes zero to the sum.
 *
 * @param dataCounts counts to normalize; every element must have a non-null value
 * @return an unmodifiable list with one merged count per case-insensitive value,
 *         in no guaranteed order
 * @throws NullPointerException if any element has a null value
 */
private List<ClinicalDataCount> normalizeDataCounts(List<ClinicalDataCount> dataCounts) {
    return dataCounts
        .stream()
        .collect(
            Collectors.groupingBy(
                // Locale.ROOT keeps the grouping key independent of the JVM default locale.
                // With a Turkish or Azeri default locale, "I".toLowerCase() yields a dotless
                // i, so values such as "FIRST" and "first" would fail to merge.
                c -> c.getValue().toLowerCase(java.util.Locale.ROOT),
                // The identity is an empty count; mergeCaseVariants never mutates its
                // arguments, so sharing that identity instance across groups is safe.
                Collectors.reducing(new ClinicalDataCount(), this::mergeCaseVariants)
            )
        )
        .values()
        .stream()
        .toList();
}

/**
 * Combines two counts of the same case-insensitive value into a new count object.
 * Neither argument is modified.
 *
 * @param left  first count (may be the empty identity with all-null fields)
 * @param right second count
 * @return a new count holding the first non-null attribute id, the value that sorts last,
 *         and the sum of both counts (null counted as zero)
 */
private ClinicalDataCount mergeCaseVariants(ClinicalDataCount left, ClinicalDataCount right) {
    String attributeId = left.getAttributeId() != null
        ? left.getAttributeId()
        : right.getAttributeId();

    String leftValue = left.getValue();
    String rightValue = right.getValue();
    String value;
    if (leftValue == null) {
        value = rightValue;
    } else if (rightValue == null) {
        value = leftValue;
    } else {
        // Both present: keep the one that sorts last; on a tie the right-hand value is kept.
        value = leftValue.compareTo(rightValue) > 0 ? leftValue : rightValue;
    }

    int leftCount = left.getCount() != null ? left.getCount() : 0;
    int rightCount = right.getCount() != null ? right.getCount() : 0;

    ClinicalDataCount merged = new ClinicalDataCount();
    merged.setAttributeId(attributeId);
    merged.setValue(value);
    merged.setCount(leftCount + rightCount);
    return merged;
}

public static List<ClinicalDataCountItem> calculateMissingNaCountsForClinicalDataCountItems(
List<ClinicalDataCountItem> clinicalDataCountItems,
List<String> filteredAttributes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@
<include refid="normalizeAttributeValue">
<property name="attribute_value" value="attribute_value"/>
</include>
) = '${dataFilterValue.value}'
) ILIKE '${dataFilterValue.value}'
</trim>
</foreach>
</sql>
Expand Down Expand Up @@ -505,7 +505,7 @@
<include refid="normalizeAttributeValue">
<property name="attribute_value" value="value"/>
</include>
) = '${dataFilterValue.value}'
) ILIKE '${dataFilterValue.value}'
</trim>
</foreach>
</sql>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -695,28 +695,12 @@
OR upperUTF8(${attribute_value})='N/A'
</sql>

<!--
  Case-insensitive boolean matchers: each fragment upper-cases the expression supplied
  through the attribute_value property and compares it with the literal 'TRUE' or 'FALSE'.
-->
<sql id="isAttributeValueTrue">
upperUTF8(${attribute_value})='TRUE'
</sql>
<sql id="isAttributeValueFalse">
upperUTF8(${attribute_value})='FALSE'
</sql>

<!--
  Maps the expression supplied through the attribute_value property to a normalized form
  using multiIf: values matched by isAttributeValueNA become 'NA', values matched by
  isAttributeValueTrue or isAttributeValueFalse become 'True' or 'False', and any other
  value is passed through unchanged.
  NOTE(review): this span comes from a rendered commit diff; confirm which branches
  remain in the current version of the file.
-->
<sql id="normalizeAttributeValue">
multiIf(
<include refid="isAttributeValueNA">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'NA',
<include refid="isAttributeValueTrue">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'True',
<include refid="isAttributeValueFalse">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'False',
${attribute_value}
)
</sql>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,13 @@ public void getDeadCounts() {
Collections.emptyList()
);

assertEquals(6, categoricalClinicalDataCounts.size());
assertEquals(3, findClinicaDataCount(categoricalClinicalDataCounts, "True"));
assertEquals(4, findClinicaDataCount(categoricalClinicalDataCounts, "False"));
assertEquals(10, categoricalClinicalDataCounts.size());
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "True"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "TRUE"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "true"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "False"));
assertEquals(2, findClinicaDataCount(categoricalClinicalDataCounts, "FALSE"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "false"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Released"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Collected"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Unknown"));
Expand Down

0 comments on commit 179186e

Please sign in to comment.