Skip to content

Commit

Permalink
deprecate certain methods, and add more comments
Browse files Browse the repository at this point in the history
  • Loading branch information
onursumer committed Jul 2, 2024
1 parent 09fc4a0 commit 4a983b1
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ public static List<ClinicalDataBin> calculateDynamicDataBins(
* Size of the generated list is equal to 'dataCount.count',
* and each ClinicalData in the list contains the same value 'dataCount.value'
*
* This method improves the performance of the data binning because it allows us to fetch only
* the clinical data count data, which is a lot more compact and faster to generate than the actual clinical data.
* We only need the attribute id and the value of the clinical data to generate data bins.
* Constructing the clinical data in memory by using clinical data counts significantly improves the performance,
* and it also allows us to use the exact same SQL used by the clinical data counts endpoint.
*
* @param dataCount ClinicalDataCount instance containing the count and the value
* @return a list of ClinicalData with size 'dataCount.count' and value 'dataCount.value'
*/
Expand Down
53 changes: 42 additions & 11 deletions src/main/java/org/cbioportal/web/util/DataBinner.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ public class DataBinner {
@Autowired
private LogScaleDataBinner logScaleDataBinner;

/**
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @deprecated
*/
public <T extends DataBinFilter> List<DataBin> calculateClinicalDataBins(
T dataBinFilter,
ClinicalDataType clinicalDataType,
Expand All @@ -53,6 +58,9 @@ public <T extends DataBinFilter> List<DataBin> calculateClinicalDataBins(
List<Binnable> unfilteredClinicalData
) {
// calculate data bins for unfiltered clinical data
// we need this additional calculation to know the bins generated for the initial state.
// this allows us to keep the number of bins and bin ranges consistent.
// we only want to update the counts for each bin; we don't want to regenerate the bins for the filtered data.
List<DataBin> dataBins = calculateDataBins(
dataBinFilter,
unfilteredClinicalData
Expand All @@ -66,16 +74,21 @@ public <T extends DataBinFilter> List<DataBin> calculateClinicalDataBins(
);
}

/**
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @deprecated
*/
public List<DataBin> recalcBinCount(
List<DataBin> dataBins,
ClinicalDataType clinicalDataType,
List<Binnable> clinicalData,
List<String> ids
List<String> caseIds
) {
return recalcBinCount(
dataBins,
clinicalData,
countNAs(clinicalData, clinicalDataType, ids)
countNAs(clinicalData, clinicalDataType, caseIds)
);
}

Expand Down Expand Up @@ -125,17 +138,22 @@ public List<DataBin> recalcBinCount(
return dataBins;
}

/**
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @deprecated
*/
public <T extends DataBinFilter> List<DataBin> calculateDataBins(
T dataBinFilter,
ClinicalDataType clinicalDataType,
List<Binnable> clinicalData,
List<String> ids
List<String> caseIds
) {
return calculateDataBins(
dataBinFilter,
clinicalDataType,
clinicalData,
ids,
caseIds,
DEFAULT_DISTINCT_VALUE_THRESHOLD
);
}
Expand All @@ -161,14 +179,19 @@ public <T extends DataBinFilter> List<DataBin> calculateDataBins(
return calculateDataBins(dataBinFilter, clinicalData, naDataBin, distinctValueThreshold);
}

/**
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @deprecated
*/
public <T extends DataBinFilter> List<DataBin> calculateDataBins(
T dataBinFilter,
ClinicalDataType clinicalDataType,
List<Binnable> clinicalData,
List<String> ids,
List<String> caseIds,
Integer distinctValueThreshold
) {
DataBin naDataBin = calcNaDataBin(clinicalData, clinicalDataType, ids);
DataBin naDataBin = calcNaDataBin(clinicalData, clinicalDataType, caseIds);

return calculateDataBins(dataBinFilter, clinicalData, naDataBin, distinctValueThreshold);
}
Expand Down Expand Up @@ -580,18 +603,21 @@ public List<BigDecimal> adjustCustomBins(
* NA count is: number of clinical data entries actually marked as "NA" + number of patients/samples without clinical data.
* Assuming that clinical data is for a single attribute.
*
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @param clinicalData clinical data list for a single attribute
* @param ids sample/patient ids
* @param caseIds sample/patient ids
*
* @return 'NA' clinical data count as a DataBin instance
* @deprecated
*/
public DataBin calcNaDataBin(
List<Binnable> clinicalData,
ClinicalDataType clinicalDataType,
List<String> ids
List<String> caseIds
) {
DataBin bin = initNaDataBin();
bin.setCount(countNAs(clinicalData, clinicalDataType, ids).intValue());
bin.setCount(countNAs(clinicalData, clinicalDataType, caseIds).intValue());

return bin;
}
Expand All @@ -618,7 +644,12 @@ public DataBin initNaDataBin() {
return bin;
}

public Long countNAs(List<Binnable> clinicalData, ClinicalDataType clinicalDataType, List<String> ids) {
/**
* This method should only be invoked by legacy endpoints because it requires sample/patient ids.
*
* @deprecated
*/
public Long countNAs(List<Binnable> clinicalData, ClinicalDataType clinicalDataType, List<String> caseIds) {
// Calculate the number of clinical data entries actually marked as "NA", "NAN", or "N/A"

Long count = countNAs(clinicalData);
Expand All @@ -637,7 +668,7 @@ public Long countNAs(List<Binnable> clinicalData, ClinicalDataType clinicalDataT
uniqueClinicalDataIds = Collections.emptySet();
}

Set<String> uniqueInputIds = new HashSet<>(ids);
Set<String> uniqueInputIds = new HashSet<>(caseIds);

// remove the ids with existing clinical data,
// size of the difference (of two sets) is the count we need
Expand Down

0 comments on commit 4a983b1

Please sign in to comment.