Skip to content

Commit

Permalink
simplify clinical data binning related SQL
Browse files Browse the repository at this point in the history
  • Loading branch information
onursumer committed Jul 10, 2024
1 parent f602587 commit 516ffe7
Show file tree
Hide file tree
Showing 11 changed files with 350 additions and 159 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.cbioportal.model.CopyNumberCountByGene;
import org.cbioportal.model.GenomicDataCount;
import org.cbioportal.model.Sample;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;

import java.util.List;
Expand All @@ -32,6 +33,8 @@ public interface StudyViewRepository {

List<ClinicalAttribute> getClinicalAttributes();

Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap();

List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter);

Map<String, AlterationCountByGene> getTotalProfiledCounts(StudyViewFilter studyViewFilter, String alterationType);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,6 @@ List<CopyNumberCountByGene> getCnaGenes(StudyViewFilter studyViewFilter, Categor
List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, AlterationFilterHelper alterationFilterHelper);

List<ClinicalDataCount> getPatientClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues);

List<ClinicalDataCount> getSampleClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues );

List<ClinicalDataCount> getClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.cbioportal.persistence.enums.ClinicalAttributeDataSource;
import org.cbioportal.persistence.helper.AlterationFilterHelper;
import org.cbioportal.web.parameter.CategorizedClinicalDataCountFilter;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
Expand Down Expand Up @@ -83,6 +84,25 @@ public List<ClinicalAttribute> getClinicalAttributes() {
return mapper.getClinicalAttributes();
}

@Override
public Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap() {
if (clinicalAttributesMap.isEmpty()) {
buildClinicalAttributeNameMap();
}

Map<String, ClinicalDataType> attributeDatatypeMap = new HashMap<>();

clinicalAttributesMap
.get(ClinicalAttributeDataSource.SAMPLE)
.forEach(attribute -> attributeDatatypeMap.put(attribute.getAttrId(), ClinicalDataType.SAMPLE));

clinicalAttributesMap
.get(ClinicalAttributeDataSource.PATIENT)
.forEach(attribute -> attributeDatatypeMap.put(attribute.getAttrId(), ClinicalDataType.PATIENT));

return attributeDatatypeMap;
}

@Override
public List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter) {
CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter = extractClinicalDataCountFilters(studyViewFilter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import org.cbioportal.model.GenomicDataCount;
import org.cbioportal.model.CopyNumberCountByGene;
import org.cbioportal.model.Sample;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;

import java.util.List;
import java.util.Map;

public interface StudyViewColumnarService {

Expand All @@ -19,6 +21,8 @@ public interface StudyViewColumnarService {
List<CopyNumberCountByGene> getCnaGenes(StudyViewFilter interceptedStudyViewFilter);
List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter studyViewFilter);

Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap();

List<ClinicalDataCountItem> getClinicalDataCounts(StudyViewFilter studyViewFilter, List<String> filteredAttributes);

List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter);
Expand All @@ -28,6 +32,5 @@ public interface StudyViewColumnarService {
List<ClinicalData> getSampleClinicalData(StudyViewFilter studyViewFilter, List<String> attributeIds);

List<GenomicDataCount> getGenomicDataCounts(StudyViewFilter studyViewFilter);



}
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
import org.cbioportal.persistence.StudyViewRepository;
import org.cbioportal.service.AlterationCountService;
import org.cbioportal.service.StudyViewColumnarService;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

@Service
Expand Down Expand Up @@ -58,6 +60,11 @@ public List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter stu
return alterationCountService.getStructuralVariantGenes(studyViewFilter);
}

@Override
public Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap() {
return studyViewRepository.getClinicalAttributeDatatypeMap();
}

@Override
public List<ClinicalDataCountItem> getClinicalDataCounts(StudyViewFilter studyViewFilter, List<String> filteredAttributes) {
return studyViewRepository.getClinicalDataCounts(studyViewFilter, filteredAttributes)
Expand Down Expand Up @@ -85,6 +92,4 @@ public List<ClinicalData> getPatientClinicalData(StudyViewFilter studyViewFilter
public List<ClinicalData> getSampleClinicalData(StudyViewFilter studyViewFilter, List<String> attributeIds) {
return studyViewRepository.getSampleClinicalData(studyViewFilter, attributeIds);
}


}
71 changes: 25 additions & 46 deletions src/main/java/org/cbioportal/web/columnar/ClinicalDataBinner.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

@Component
public class ClinicalDataBinner {
Expand All @@ -27,6 +26,15 @@ public ClinicalDataBinner(
this.dataBinner = dataBinner;
}

private List<ClinicalData> convertCountsToData(List<ClinicalDataCount> clinicalDataCounts)
{
return clinicalDataCounts
.stream()
.map(NewClinicalDataBinUtil::generateClinicalDataFromClinicalDataCount)
.flatMap(Collection::stream)
.toList();
}

@Cacheable(cacheResolver = "generalRepositoryCacheResolver", condition = "@cacheEnabledConfig.getEnabled()")
public List<ClinicalDataBin> fetchClinicalDataBinCounts(
DataBinMethod dataBinMethod,
Expand All @@ -43,53 +51,30 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
List<String> attributeIds = attributes.stream().map(ClinicalDataBinFilter::getAttributeId).collect(Collectors.toList());

// a new StudyView filter to partially filter by study and sample ids only
StudyViewFilter partialFilter = new StudyViewFilter();
partialFilter.setStudyIds(studyViewFilter.getStudyIds());
partialFilter.setSampleIdentifiers(studyViewFilter.getSampleIdentifiers());

// filter only by study id and sample identifiers, ignore rest
// we need this additional partial filter because we always need to know the bins generated for the initial state
// which allows us to keep the number of bins and bin ranges consistent even if there are additional data filters.
// we only want to update the counts for each bin, we don't want to regenerate the bins for the filtered data.
// NOTE: partial filter is only needed when dataBinMethod == DataBinMethod.STATIC but that's always the case
// for the frontend implementation. we can't really use dataBinMethod == DataBinMethod.DYNAMIC because of the
// complication it brings to the frontend visualization and filtering
List<Sample> unfilteredSamples = studyViewColumnarService.getFilteredSamples(partialFilter);
List<Sample> filteredSamples = studyViewColumnarService.getFilteredSamples(studyViewFilter);

// TODO make sure unique sample and patient keys don't need to be distinct
List<String> unfilteredUniqueSampleKeys = unfilteredSamples.stream().map(Sample::getUniqueSampleKey).collect(Collectors.toList());
List<String> filteredUniqueSampleKeys = filteredSamples.stream().map(Sample::getUniqueSampleKey).collect(Collectors.toList());
List<String> unfilteredUniquePatientKeys = unfilteredSamples.stream().map(Sample::getUniquePatientKey).collect(Collectors.toList());
List<String> filteredUniquePatientKeys = filteredSamples.stream().map(Sample::getUniquePatientKey).collect(Collectors.toList());
StudyViewFilter partialFilter = new StudyViewFilter();
partialFilter.setStudyIds(studyViewFilter.getStudyIds());
partialFilter.setSampleIdentifiers(studyViewFilter.getSampleIdentifiers());

// TODO make sure we don't need a distinction between sample vs patient attribute ids here
// ideally we shouldn't because we have patient clinical data separated from sample clinical data in clickhouse

// we need the clinical data for the partial filter in order to generate the bins for initial state
// we use the filtered data to calculate the counts for each bin, we do not regenerate bins for the filtered data
List<ClinicalData> unfilteredClinicalDataForSamples = studyViewColumnarService.getSampleClinicalData(partialFilter, attributeIds);
List<ClinicalData> filteredClinicalDataForSamples = studyViewColumnarService.getSampleClinicalData(studyViewFilter, attributeIds);
List<ClinicalData> unfilteredClinicalDataForPatients = studyViewColumnarService.getPatientClinicalData(partialFilter, attributeIds);
List<ClinicalData> filteredClinicalDataForPatients = studyViewColumnarService.getPatientClinicalData(studyViewFilter, attributeIds);
// we use the filtered data to calculate the counts for each bin, we do not regenerate bins for the filtered data
List<ClinicalDataCountItem> unfilteredClinicalDataCounts = studyViewColumnarService.getClinicalDataCounts(partialFilter, attributeIds);
List<ClinicalDataCountItem> filteredClinicalDataCounts = studyViewColumnarService.getClinicalDataCounts(studyViewFilter, attributeIds);

Map<String, ClinicalDataType> attributeDatatypeMap = NewClinicalDataBinUtil.toAttributeDatatypeMap(
unfilteredClinicalDataForSamples.stream().map(ClinicalData::getAttrId).collect(Collectors.toList()),
unfilteredClinicalDataForPatients.stream().map(ClinicalData::getAttrId).collect(Collectors.toList()),
Collections.emptyList() // TODO ignoring conflictingPatientAttributeIds for now
// TODO ignoring conflictingPatientAttributeIds for now
List<ClinicalData> unfilteredClinicalData = convertCountsToData(
unfilteredClinicalDataCounts.stream().flatMap(c -> c.getCounts().stream()).toList()
);
List<ClinicalData> filteredClinicalData = convertCountsToData(
filteredClinicalDataCounts.stream().flatMap(c -> c.getCounts().stream()).toList()
);

List<Binnable> unfilteredClinicalData = Stream.of(
unfilteredClinicalDataForSamples,
unfilteredClinicalDataForPatients
// unfilteredClinicalDataForConflictingPatientAttributes /// TODO ignoring conflictingPatientAttributeIds for now
).flatMap(Collection::stream).collect(Collectors.toList());

List<Binnable> filteredClinicalData = Stream.of(
filteredClinicalDataForSamples,
filteredClinicalDataForPatients
// filteredClinicalDataForConflictingPatientAttributes // TODO ignoring conflictingPatientAttributeIds for now
).flatMap(Collection::stream).collect(Collectors.toList());
Map<String, ClinicalDataType> attributeDatatypeMap = studyViewColumnarService.getClinicalAttributeDatatypeMap();

Map<String, List<Binnable>> unfilteredClinicalDataByAttributeId =
unfilteredClinicalData.stream().collect(Collectors.groupingBy(Binnable::getAttrId));
Expand All @@ -100,17 +85,13 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
List<ClinicalDataBin> clinicalDataBins = Collections.emptyList();

if (dataBinMethod == DataBinMethod.STATIC) {
if (!unfilteredSamples.isEmpty() && !unfilteredClinicalData.isEmpty()) {
if (!unfilteredClinicalData.isEmpty()) {
clinicalDataBins = NewClinicalDataBinUtil.calculateStaticDataBins(
dataBinner,
attributes,
attributeDatatypeMap,
unfilteredClinicalDataByAttributeId,
filteredClinicalDataByAttributeId,
unfilteredUniqueSampleKeys,
unfilteredUniquePatientKeys,
filteredUniqueSampleKeys,
filteredUniquePatientKeys
filteredClinicalDataByAttributeId
);
}
}
Expand All @@ -123,9 +104,7 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
dataBinner,
attributes,
attributeDatatypeMap,
filteredClinicalDataByAttributeId,
filteredUniqueSampleKeys,
filteredUniquePatientKeys
filteredClinicalDataByAttributeId
);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.cbioportal.web.columnar.util;

import org.cbioportal.model.Binnable;
import org.cbioportal.model.ClinicalData;
import org.cbioportal.model.ClinicalDataBin;
import org.cbioportal.model.ClinicalDataCount;
import org.cbioportal.model.DataBin;
import org.cbioportal.web.parameter.ClinicalDataBinCountFilter;
import org.cbioportal.web.parameter.ClinicalDataBinFilter;
Expand All @@ -15,7 +17,6 @@
import java.util.Map;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

public class NewClinicalDataBinUtil {
public static StudyViewFilter removeSelfFromFilter(ClinicalDataBinCountFilter dataBinCountFilter) {
Expand Down Expand Up @@ -70,33 +71,21 @@ public static List<ClinicalDataBin> calculateStaticDataBins(
List<ClinicalDataBinFilter> attributes,
Map<String, ClinicalDataType> attributeDatatypeMap,
Map<String, List<Binnable>> unfilteredClinicalDataByAttributeId,
Map<String, List<Binnable>> filteredClinicalDataByAttributeId,
List<String> unfilteredUniqueSampleKeys,
List<String> unfilteredUniquePatientKeys,
List<String> filteredUniqueSampleKeys,
List<String> filteredUniquePatientKeys
Map<String, List<Binnable>> filteredClinicalDataByAttributeId
) {
List<ClinicalDataBin> clinicalDataBins = new ArrayList<>();

for (ClinicalDataBinFilter attribute : attributes) {
if (attributeDatatypeMap.containsKey(attribute.getAttributeId())) {
ClinicalDataType clinicalDataType = attributeDatatypeMap.get(attribute.getAttributeId());
List<String> filteredIds = clinicalDataType == ClinicalDataType.PATIENT ? filteredUniquePatientKeys
: filteredUniqueSampleKeys;
List<String> unfilteredIds = clinicalDataType == ClinicalDataType.PATIENT
? unfilteredUniquePatientKeys
: unfilteredUniqueSampleKeys;

List<ClinicalDataBin> dataBins = dataBinner
.calculateClinicalDataBins(attribute, clinicalDataType,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
unfilteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
filteredIds, unfilteredIds)
.calculateClinicalDataBins(
attribute,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList()),
unfilteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList())
)
.stream()
.map(dataBin -> dataBinToClinicalDataBin(attribute, dataBin))
.collect(toList());
.toList();

clinicalDataBins.addAll(dataBins);
}
Expand All @@ -109,33 +98,54 @@ public static List<ClinicalDataBin> calculateDynamicDataBins(
DataBinner dataBinner,
List<ClinicalDataBinFilter> attributes,
Map<String, ClinicalDataType> attributeDatatypeMap,
Map<String, List<Binnable>> filteredClinicalDataByAttributeId,
List<String> filteredUniqueSampleKeys,
List<String> filteredUniquePatientKeys
Map<String, List<Binnable>> filteredClinicalDataByAttributeId
) {
List<ClinicalDataBin> clinicalDataBins = new ArrayList<>();

for (ClinicalDataBinFilter attribute : attributes) {

// if there is clinical data for requested attribute
if (attributeDatatypeMap.containsKey(attribute.getAttributeId())) {
ClinicalDataType clinicalDataType = attributeDatatypeMap.get(attribute.getAttributeId());
List<String> filteredIds = clinicalDataType == ClinicalDataType.PATIENT
? filteredUniquePatientKeys
: filteredUniqueSampleKeys;

List<ClinicalDataBin> dataBins = dataBinner
.calculateDataBins(attribute, clinicalDataType,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
filteredIds)
.calculateDataBins(
attribute,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList())
)
.stream()
.map(dataBin -> dataBinToClinicalDataBin(attribute, dataBin))
.collect(toList());
.toList();
clinicalDataBins.addAll(dataBins);
}
}

return clinicalDataBins;
}

/**
* Generate a list of ClinicalData from the given data count instance.
* Size of the generated list is equal to 'dataCount.count',
* and each ClinicalData in the list contains the same value 'dataCount.value'
*
* This method improves the performance of the data binning because it allows us to fetch only
* the clinical data counts data which is a lot more compact and faster to generated than the actual clinical data.
* We only need the attribute id and the value of the clinical data to generate data bins.
* Constructing the clinical data in memory by using clinical data counts significantly improves the performance,
* and it also allows us to use the exact same SQL used by the clinical data counts endpoint.
*
* @param dataCount ClinicalDataCount instance containing the count and the value
* @return a list of ClinicalData with size 'dataCount.count' and value 'dataCount.value'
*/
public static List<ClinicalData> generateClinicalDataFromClinicalDataCount(ClinicalDataCount dataCount)
{
List<ClinicalData> data = new ArrayList<>(dataCount.getCount());

for (int i=0; i < dataCount.getCount(); i++) {
ClinicalData d = new ClinicalData();
d.setAttrId(dataCount.getAttributeId());
d.setAttrValue(dataCount.getValue());
data.add(d);
}

return data;
}
}
Loading

0 comments on commit 516ffe7

Please sign in to comment.