diff --git a/src/main/java/org/cbioportal/web/columnar/util/NewClinicalDataBinUtil.java b/src/main/java/org/cbioportal/web/columnar/util/NewClinicalDataBinUtil.java index 2aa8f129d1e..fdc7afcd4b8 100644 --- a/src/main/java/org/cbioportal/web/columnar/util/NewClinicalDataBinUtil.java +++ b/src/main/java/org/cbioportal/web/columnar/util/NewClinicalDataBinUtil.java @@ -126,6 +126,12 @@ public static List calculateDynamicDataBins( * Size of the generated list is equal to 'dataCount.count', * and each ClinicalData in the list contains the same value 'dataCount.value' * + * This method improves the performance of the data binning because it allows us to fetch only + * the clinical data counts, which are a lot more compact and faster to generate than the actual clinical data. + * We only need the attribute id and the value of the clinical data to generate data bins. + * Constructing the clinical data in memory by using clinical data counts significantly improves the performance, + * and it also allows us to use the exact same SQL used by the clinical data counts endpoint. + * * @param dataCount ClinicalDataCount instance containing the count and the value * @return a list of ClinicalData with size 'dataCount.count' and value 'dataCount.value' */ diff --git a/src/main/java/org/cbioportal/web/util/DataBinner.java b/src/main/java/org/cbioportal/web/util/DataBinner.java index ad93810574a..5dac86d99e8 100644 --- a/src/main/java/org/cbioportal/web/util/DataBinner.java +++ b/src/main/java/org/cbioportal/web/util/DataBinner.java @@ -31,6 +31,11 @@ public class DataBinner { @Autowired private LogScaleDataBinner logScaleDataBinner; + /** + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. 
+ * + * @deprecated + */ public List calculateClinicalDataBins( T dataBinFilter, ClinicalDataType clinicalDataType, @@ -53,6 +58,9 @@ public List calculateClinicalDataBins( List unfilteredClinicalData ) { // calculate data bins for unfiltered clinical data + // we need this additional calculation to know the bins generated for the initial state. + // this allows us to keep the number of bins and bin ranges consistent. + // we only want to update the counts for each bin, we don't want to regenerate the bins for the filtered data. List dataBins = calculateDataBins( dataBinFilter, unfilteredClinicalData @@ -66,16 +74,21 @@ public List calculateClinicalDataBins( ); } + /** + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. + * + * @deprecated + */ public List recalcBinCount( List dataBins, ClinicalDataType clinicalDataType, List clinicalData, - List ids + List caseIds ) { return recalcBinCount( dataBins, clinicalData, - countNAs(clinicalData, clinicalDataType, ids) + countNAs(clinicalData, clinicalDataType, caseIds) ); } @@ -125,17 +138,22 @@ public List recalcBinCount( return dataBins; } + /** + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. + * + * @deprecated + */ public List calculateDataBins( T dataBinFilter, ClinicalDataType clinicalDataType, List clinicalData, - List ids + List caseIds ) { return calculateDataBins( dataBinFilter, clinicalDataType, clinicalData, - ids, + caseIds, DEFAULT_DISTINCT_VALUE_THRESHOLD ); } @@ -161,14 +179,19 @@ public List calculateDataBins( return calculateDataBins(dataBinFilter, clinicalData, naDataBin, distinctValueThreshold); } + /** + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. 
+ * + * @deprecated + */ public List calculateDataBins( T dataBinFilter, ClinicalDataType clinicalDataType, List clinicalData, - List ids, + List caseIds, Integer distinctValueThreshold ) { - DataBin naDataBin = calcNaDataBin(clinicalData, clinicalDataType, ids); + DataBin naDataBin = calcNaDataBin(clinicalData, clinicalDataType, caseIds); return calculateDataBins(dataBinFilter, clinicalData, naDataBin, distinctValueThreshold); } @@ -580,18 +603,21 @@ public List adjustCustomBins( * NA count is: Number of clinical data marked actually as "NA" + Number of patients/samples without clinical data. * Assuming that clinical data is for a single attribute. * + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. + * * @param clinicalData clinical data list for a single attribute - * @param ids sample/patient ids + * @param caseIds sample/patient ids * * @return 'NA' clinical data count as a DataBin instance + * @deprecated */ public DataBin calcNaDataBin( List clinicalData, ClinicalDataType clinicalDataType, - List ids + List caseIds ) { DataBin bin = initNaDataBin(); - bin.setCount(countNAs(clinicalData, clinicalDataType, ids).intValue()); + bin.setCount(countNAs(clinicalData, clinicalDataType, caseIds).intValue()); return bin; } @@ -618,7 +644,12 @@ public DataBin initNaDataBin() { return bin; } - public Long countNAs(List clinicalData, ClinicalDataType clinicalDataType, List ids) { + /** + * This method should only be invoked by legacy endpoints because it requires sample/patient ids. + * + * @deprecated + */ + public Long countNAs(List clinicalData, ClinicalDataType clinicalDataType, List caseIds) { // Calculate the number of clinical data marked actually as "NA", "NAN", or "N/A" Long count = countNAs(clinicalData); @@ -637,7 +668,7 @@ public Long countNAs(List clinicalData, ClinicalDataType clinicalDataT uniqueClinicalDataIds = Collections.emptySet(); } - Set uniqueInputIds = new HashSet<>(ids); + Set uniqueInputIds = new 
HashSet<>(caseIds); // remove the ids with existing clinical data, // size of the difference (of two sets) is the count we need