diff --git a/README.md b/README.md index 6d0f3535..d8d56814 100644 --- a/README.md +++ b/README.md @@ -86,8 +86,6 @@ docker run \ --output_vcf=/output/YOUR_OUTPUT_VCF \ --output_gvcf=/output/YOUR_OUTPUT_GVCF \ --num_shards=$(nproc) \ **This will use all your cores to run make_examples. Feel free to change.** - --vcf_stats_report=true \ **Optional. Creates VCF statistics report in html file. Default is false. - --disable_small_model=true \ **Optional. Disables the small model from make_examples stage. Default is false. --logging_dir=/output/logs \ **Optional. This saves the log output for each stage separately. --haploid_contigs="chrX,chrY" \ **Optional. Heterozygous variants in these contigs will be re-genotyped as the most likely of reference or homozygous alternates. For a sample with karyotype XY, it should be set to "chrX,chrY" for GRCh38 and "X,Y" for GRCh37. For a sample with karyotype XX, this should not be used. --par_regions_bed="/input/GRCh3X_par.bed" \ **Optional. If --haploid_contigs is set, then this can be used to provide PAR regions to be excluded from genotype adjustment. Download links to this files are available in this page. diff --git a/docs/README.md b/docs/README.md index d176134e..a761a13a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,7 +13,8 @@ * [DeepVariant hybrid (PacBio and Illumina) case study](deepvariant-hybrid-case-study.md) * [DeepVariant Complete Genomics T7 case study](deepvariant-complete-t7-case-study.md) * [DeepVariant Complete Genomics G400 case study](deepvariant-complete-g400-case-study.md) -* [RNA-seq Case Study](deepvariant-rnaseq-case-study.md) for Illumina RNA-seq. +* [RNA-seq Case Study](deepvariant-rnaseq-case-study.md) for Illumina + RNA-seq. * [PacBio Iso-Seq/MAS-Seq Case Study](deepvariant-masseq-case-study.md). 
* [Runtime and accuracy metrics for all DeepVariant models](metrics.md) * [Best practices for multi-sample variant calling](trio-merge-case-study.md) @@ -21,8 +22,6 @@ * Pangenome-aware DeepVariant WGS: [Mapped with BWA](pangenome-aware-wgs-bwa-case-study.md), [Mapped with VG](pangenome-aware-wgs-vg-case-study.md) -* Pangenome-aware DeepVariant WES: - [Mapped with BWA](pangenome-aware-wes-bwa-case-study.md) ## Visualization and analysis diff --git a/docs/deepvariant-fast-pipeline-case-study.md b/docs/deepvariant-fast-pipeline-case-study.md index db341d4c..8a519aa0 100644 --- a/docs/deepvariant-fast-pipeline-case-study.md +++ b/docs/deepvariant-fast-pipeline-case-study.md @@ -23,7 +23,7 @@ Here we create Google Cloud compute instance. You may skip this step if you run the case study on a local computer with GPU. ```bash -gcloud compute instances create "deepvariant-fast-pipeline" \ +gcloud compute instances create "deepvariant-casestudy" \ --scopes "compute-rw,storage-full,cloud-platform" \ --maintenance-policy "TERMINATE" \ --accelerator=type=nvidia-tesla-p4,count=1 \ @@ -31,13 +31,8 @@ gcloud compute instances create "deepvariant-fast-pipeline" \ --image-project "ubuntu-os-cloud" \ --machine-type "n1-standard-16" \ --boot-disk-size "100" \ - --zone "us-central1-a" -``` - -You can then ssh into the machine by running: - -```bash -gcloud compute ssh "deepvariant-fast-pipeline" --zone us-central1-a + --zone "us-central1-a" \ + --min-cpu-platform "Intel Skylake" ``` ## Install Nvidia drivers and Nvidia container toolkit (optional) @@ -52,7 +47,6 @@ For this case study we used the that automates the CUDA and container tools kit installation. Please note that the script takes about 30 minutes to run. 
- ```bash wget https://raw.githubusercontent.com/google/deepvariant/refs/heads/r1.8.0/scripts/install_nvidia_docker.sh chmod +x install_nvidia_docker.sh @@ -64,7 +58,7 @@ chmod +x install_nvidia_docker.sh ### Get DeepVariant Docker image ```bash -BIN_VERSION="1.8.0" +BIN_VERSION="1.8.0-rc0" sudo docker pull google/deepvariant:"${BIN_VERSION}-gpu" ``` @@ -86,7 +80,7 @@ gcloud storage cp gs://deepvariant/case-study-testdata/GCA_000001405.15_GRCh38_n ```bash mkdir -p input -gcloud storage cp gs://deepvariant/pacbio-case-study-testdata/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam* input/ +gcloud storage cp gs://deepvariant/pacbio-case-study-testdata/HG003.pfda_challenge.grch38.phased.chr20.bam* input/ ``` ## Run DeepVariant pipeline on chromosome 20 alignments @@ -116,7 +110,7 @@ cat <$FILE --examples=/tmp/examples.tfrecords@14.gz --gvcf=/tmp/examples.gvcf.tfrecord@14.gz --mode=calling ---reads=/input/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam +--reads=/input/HG003.pfda_challenge.grch38.phased.chr20.bam --ref=/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz --alt_aligned_pileup=diff_channels --max_reads_per_partition=600 @@ -162,7 +156,7 @@ cat <$FILE --outfile=/output/variants.chr20.vcf --gvcf_outfile=/output/variants.gvcf.chr20.vcf --small_model_cvo_records=/tmp/examples_call_variant_outputs.tfrecords@14.gz ---cpus=14 +--cpus=0 EOM ``` @@ -184,7 +178,7 @@ time sudo docker run \ --shm_prefix dv \ --num_shards 14 \ --buffer_size 10485760 \ - 2>&1 | tee /tmp/fast_pipeline.docker.log + 2>&1 | tee /tmp/fast_pipeline.Docker_chr20.log ``` * `-v` allows to map local directory inside docker container. @@ -217,9 +211,9 @@ variants.gvcf.chr20.vcf With the same settings the pipeline takes approximately 10 minutes. 
``` -real 8m15.252s -user 0m0.007s -sys 0m0.035s +real 10m35.875s +user 0m0.009s +sys 0m0.034s ``` ## Benchmark output @@ -236,8 +230,6 @@ curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG003_G ``` HAPPY_VERSION=v0.3.12 - -time sudo docker run \ -v ${PWD}/output:/output \ -v ${PWD}/benchmark:/benchmark \ -v ${PWD}/reference:/reference \ @@ -254,10 +246,9 @@ time sudo docker run \ ``` ``` -Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 10628 10543 85 22403 74 11375 40 29 0.992002 0.993290 0.507744 0.992646 NaN NaN 1.748961 2.138647 -INDEL PASS 10628 10543 85 22403 74 11375 40 29 0.992002 0.993290 0.507744 0.992646 NaN NaN 1.748961 2.138647 - SNP ALL 70166 70101 65 105602 71 35342 12 12 0.999074 0.998989 0.334672 0.999032 2.296566 1.713281 1.883951 1.503192 - SNP PASS 70166 70101 65 105602 71 35342 12 12 0.999074 0.998989 0.334672 0.999032 2.296566 1.713281 1.883951 1.503192 +INDEL ALL 10628 10560 68 22592 70 11520 39 30 0.993602 0.993678 0.509915 0.993640 NaN NaN 1.748961 2.324711 +INDEL PASS 10628 10560 68 22592 70 11520 39 30 0.993602 0.993678 0.509915 0.993640 NaN NaN 1.748961 2.324711 + SNP ALL 70166 70142 24 104271 23 34047 7 5 0.999658 0.999672 0.326524 0.999665 2.296566 1.74197 1.883951 1.849802 + SNP PASS 70166 70142 24 104271 23 34047 7 5 0.999658 0.999672 0.326524 0.999665 2.296566 1.74197 1.883951 1.849802 ``` diff --git a/docs/deepvariant-pacbio-model-case-study.md b/docs/deepvariant-pacbio-model-case-study.md index 68d72553..f533622b 100644 --- a/docs/deepvariant-pacbio-model-case-study.md +++ b/docs/deepvariant-pacbio-model-case-study.md @@ -1,145 +1,119 @@ -# DeepVariant with PacBio HiFi data +# Using DeepVariant for small variant calling from PacBio HiFi reads + +#### Author: William Rowell In this 
case study we describe applying DeepVariant to PacBio HiFi reads to call variants. We will call small variants from a publicly available whole genome HiFi dataset from PacBio. -### Updated dataset in release 1.8.0 - -In release 1.8.0, we have updated the PacBio test data from HG003 Sequel-II to -latest Revio with SPRQ chemistry data to showcase performance on the updated -platform and chemistry. The full bam data is available [here](https://downloads.pacbcloud.com/public/revio/2024Q4/WGS/GIAB_trio/HG003/analysis/GRCh38.m84039_241002_000337_s3.hifi_reads.bc2020.bam). - -The dataset used in this case-study has following attributes: - -```bash -Sample: HG003 -Region: Chr20 -Chemistry: REVIO SPRQ -Coverage: 32x -``` +Starting in v1.4.0, PacBio calling uses one-step variant calling. If you're +looking for documentation for the two-step process, please look at v1.3.0. ## Prepare environment -In this case-study, we will use [Docker](https://docs.docker.com/get-docker/) to -run DeepVariant for variant calling and -[hap.py](https://github.com/illumina/hap.py) for benchmarking. +### Tools -If you want to run on GPU machines, or use `Singularity` instead of `Docker`, -please follow [Quick Start](deepvariant-quick-start.md) documentation. +[Singularity](https://sylabs.io/docs/) will be used to run DeepVariant and +[hap.py](https://github.com/illumina/hap.py), and we'll use +[miniconda](https://docs.conda.io/en/latest/miniconda.html) and a conda +environment to handle the other dependencies for the case study and samtools. 
-### Create input and output directory structures and download inputs +- singularity (must be installed by `root` user; outside of the scope of this + case study) +- samtools ```bash -BASE="${HOME}/pacbio-case-study" - -# Set up input and output directory data -INPUT_DIR="${BASE}/input/data" -OUTPUT_DIR="${BASE}/output" - -## Create local directory structure -mkdir -p "${INPUT_DIR}" -mkdir -p "${OUTPUT_DIR}" +# add channels to conda configuration +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge + +# create the environment and install dependencies +conda create -y -n deepvariant_env +conda activate deepvariant_env +conda install -y samtools==1.10 +``` -# Download reference to input directory -FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids -curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | gunzip > ${INPUT_DIR}/GRCh38_no_alt_analysis_set.fasta -curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai > ${INPUT_DIR}/GRCh38_no_alt_analysis_set.fasta.fai +### Download Reference -HTTPDIR=https://storage.googleapis.com/deepvariant/pacbio-case-study-testdata -curl ${HTTPDIR}/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam > ${INPUT_DIR}/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam -curl ${HTTPDIR}/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam.bai > ${INPUT_DIR}/HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam.bai +We will be using GRCh38 for this case study. 
-# Set up input variables -REF="GRCh38_no_alt_analysis_set.fasta" -BAM="HG003.SPRQ.pacbio.GRCh38.nov2024.chr20.bam" -THREADS=$(nproc) -REGION="chr20" +```bash +mkdir -p reference -# Set up output variable -OUTPUT_VCF="HG003_PACBIO_SPRQ_GRCh38.chr20.output.vcf.gz" -OUTPUT_GVCF="HG003_PACBIO_SPRQ_GRCh38.chr20.output.g.vcf.gz" -INTERMEDIATE_DIRECTORY="intermediate_results_dir" +# download and decompress +curl ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | gunzip > reference/GRCh38_no_alt_analysis_set.fasta -mkdir -p "${OUTPUT_DIR}/${INTERMEDIATE_DIRECTORY}" +# index reference +samtools faidx reference/GRCh38_no_alt_analysis_set.fasta ``` -## Run DeepVariant +### Download Genome in a Bottle Benchmarks -We will run DeepVariant from docker using the `run_deepvariant` script. +We will benchmark our variant calls against v4.2.1 of the Genome in a Bottle +small variant benchmarks for HG003. ```bash -BIN_VERSION="1.8.0" - -sudo docker run \ - -v "${INPUT_DIR}":"${INPUT_DIR}" \ - -v "${OUTPUT_DIR}":"${OUTPUT_DIR}" \ - google/deepvariant:"${BIN_VERSION}" \ - /opt/deepvariant/bin/run_deepvariant \ - --model_type PACBIO \ - --ref "${INPUT_DIR}/${REF}" \ - --reads "${INPUT_DIR}/${BAM}" \ - --output_vcf "${OUTPUT_DIR}/${OUTPUT_VCF}" \ - --output_gvcf "${OUTPUT_DIR}/${OUTPUT_GVCF}" \ - --num_shards "${THREADS}" \ - --regions "${REGION}" \ - --intermediate_results_dir "${OUTPUT_DIR}/${INTERMEDIATE_DIRECTORY}" -``` +mkdir -p benchmark -By specifying `--model_type PACBIO`, you'll be using a model that is best -suited for PacBio data. - -NOTE: If you want to run each of the steps separately, add `--dry_run=true` to -the command above to figure out what flags you need in each step. Based on the -different model types, different flags are needed in the `make_examples` step. - -`--intermediate_results_dir` flag is optional. 
By specifying it, the -intermediate outputs of `make_examples` and `call_variants` stages can be found -in the directory. After the command, you can find these files in the directory: +FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG003_NA24149_father/NISTv4.2.1/GRCh38 -``` -call_variants_output.tfrecord.gz -gvcf.tfrecord-?????-of-?????.gz -make_examples.tfrecord-?????-of-?????.gz +curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed +curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi ``` -## Benchmark HG003 chr20 output from DeepVariant +### Download HG003 chr20 HiFi alignments -We will use Genome-in-a-Bottle (GIAB) dataset to evaluate the performance of -DeepVariant. +We'll use HG003 chr20 HiFi reads publicly available from the [PrecisionFDA Truth v2 Challenge](https://precision.fda.gov/challenges/10). -### Download Genome in a Bottle Benchmarks +```bash +mkdir -p input +HTTPDIR=https://downloads.pacbcloud.com/public/dataset/HG003/deepvariant-case-study -We will benchmark our variant calls against v4.2.1 of the Genome in a Bottle -small variant benchmarks for HG003. 
+curl ${HTTPDIR}/HG003.GRCh38.chr20.pFDA_truthv2.bam > input/HG003.GRCh38.chr20.pFDA_truthv2.bam +curl ${HTTPDIR}/HG003.GRCh38.chr20.pFDA_truthv2.bam.bai > input/HG003.GRCh38.chr20.pFDA_truthv2.bam.bai +``` + +## Run DeepVariant on chromosome 20 alignments ```bash -FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG003_NA24149_father/NISTv4.2.1/GRCh38 +ulimit -u 10000 # https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable/54746150#54746150 +BIN_VERSION="1.7.0" +mkdir -p deepvariant_output + +singularity exec --bind /usr/lib/locale/ \ + docker://google/deepvariant:${BIN_VERSION} \ + /opt/deepvariant/bin/run_deepvariant \ + --model_type PACBIO \ + --ref reference/GRCh38_no_alt_analysis_set.fasta \ + --reads input/HG003.GRCh38.chr20.pFDA_truthv2.bam \ + --output_vcf deepvariant_output/output.vcf.gz \ + --num_shards $(nproc) \ + --regions chr20 +``` -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > ${INPUT_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > ${INPUT_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > ${INPUT_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi +NOTE: If you want to run each of the steps separately, add `--dry_run=true` +to the command above to figure out what flags you need in each step. Based on +the different model types, different flags are needed in the `make_examples` +step. 
-TRUTH_VCF="HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" -TRUTH_BED="HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -``` +## Benchmark output ```bash -sudo docker pull jmcdani20/hap.py:v0.3.12 - -sudo docker run \ - -v "${INPUT_DIR}":"${INPUT_DIR}" \ - -v "${OUTPUT_DIR}":"${OUTPUT_DIR}" \ - -v "${PWD}/happy:/happy" \ - jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ - "${INPUT_DIR}/${TRUTH_VCF}" \ - "${OUTPUT_DIR}/${OUTPUT_VCF}" \ - -f "${INPUT_DIR}/${TRUTH_BED}" \ - -r "${INPUT_DIR}/${REF}" \ - -o "${OUTPUT_DIR}/hg003.pacbio.chr20.happy.output" \ - --engine=vcfeval \ - --pass-only \ - -l "${REGION}" +mkdir -p happy + +singularity exec docker://jmcdani20/hap.py:v0.3.12 \ + /opt/hap.py/bin/hap.py \ + --threads $(nproc) \ + -r reference/GRCh38_no_alt_analysis_set.fasta \ + -f benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ + -o happy/giab-comparison.v4.2.first_pass \ + --engine=vcfeval \ + --pass-only \ + -l chr20 \ + benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ + deepvariant_output/output.vcf.gz ``` Output: @@ -147,8 +121,8 @@ Output: ``` Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 10628 10543 85 22403 74 11375 40 29 0.992002 0.993290 0.507744 0.992646 NaN NaN 1.748961 2.138647 -INDEL PASS 10628 10543 85 22403 74 11375 40 29 0.992002 0.993290 0.507744 0.992646 NaN NaN 1.748961 2.138647 - SNP ALL 70166 70101 65 105602 71 35342 12 12 0.999074 0.998989 0.334672 0.999032 2.296566 1.713281 1.883951 1.503192 - SNP PASS 70166 70101 65 105602 71 35342 12 12 0.999074 0.998989 0.334672 0.999032 2.296566 1.713281 1.883951 1.503192 +INDEL ALL 10628 10551 77 22590 69 11527 39 29 0.992755 0.993763 0.510270 0.993259 NaN NaN 1.748961 2.275319 +INDEL PASS 10628 10551 77 22590 69 11527 39 29 0.992755 0.993763 0.510270 
0.993259 NaN NaN 1.748961 2.275319 + SNP ALL 70166 70141 25 98780 23 28559 5 11 0.999644 0.999672 0.289117 0.999658 2.296566 1.823452 1.883951 1.913585 + SNP PASS 70166 70141 25 98780 23 28559 5 11 0.999644 0.999672 0.289117 0.999658 2.296566 1.823452 1.883951 1.913585 ``` diff --git a/docs/deepvariant-quick-start.md b/docs/deepvariant-quick-start.md index c9005452..062a0377 100644 --- a/docs/deepvariant-quick-start.md +++ b/docs/deepvariant-quick-start.md @@ -33,7 +33,7 @@ If you want to compile the DeepVariant binaries for yourself, we also have a ### Get Docker image ```bash -BIN_VERSION="1.8.0" +BIN_VERSION="1.7.0" sudo apt -y update sudo apt-get -y install docker.io @@ -124,7 +124,6 @@ sudo docker run \ google/deepvariant:"${BIN_VERSION}" \ /opt/deepvariant/bin/run_deepvariant \ --model_type=WGS \ - --vcf_stats_report=true \ --ref=/input/ucsc.hg19.chr20.unittest.fasta \ --reads=/input/NA12878_S1.chr20.10_10p1mb.bam \ --regions "chr20:10,000,000-10,010,000" \ @@ -195,7 +194,6 @@ singularity run -B /usr/lib/locale/:/usr/lib/locale/ \ docker://google/deepvariant:"${BIN_VERSION}" \ /opt/deepvariant/bin/run_deepvariant \ --model_type=WGS \ **Replace this string with exactly one of the following [WGS,WES,PACBIO,ONT_R104,HYBRID_PACBIO_ILLUMINA]** - --vcf_stats_report=true \ --ref="${INPUT_DIR}"/ucsc.hg19.chr20.unittest.fasta \ --reads="${INPUT_DIR}"/NA12878_S1.chr20.10_10p1mb.bam \ --regions "chr20:10,000,000-10,010,000" \ @@ -267,16 +265,16 @@ You should see output similar to the following. 
``` Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 4 4 0 13 0 9 0 0 1.000000 1.0 0.692308 1.000000 NaN NaN 0.333333 1.000000 -INDEL PASS 4 4 0 13 0 9 0 0 1.000000 1.0 0.692308 1.000000 NaN NaN 0.333333 1.000000 - SNP ALL 44 43 1 59 0 16 0 0 0.977273 1.0 0.271186 0.988506 1.2 1.36 0.333333 0.340909 - SNP PASS 44 43 1 59 0 16 0 0 0.977273 1.0 0.271186 0.988506 1.2 1.36 0.333333 0.340909 +INDEL ALL 4 4 0 13 0 9 0 0 1.0 1.0 0.692308 1.0 NaN NaN 0.333333 1.000000 +INDEL PASS 4 4 0 13 0 9 0 0 1.0 1.0 0.692308 1.0 NaN NaN 0.333333 1.000000 + SNP ALL 44 44 0 60 0 16 0 0 1.0 1.0 0.266667 1.0 1.2 1.307692 0.333333 0.363636 + SNP PASS 44 44 0 60 0 16 0 0 1.0 1.0 0.266667 1.0 1.2 1.307692 0.333333 0.363636 ``` [BAM]: http://genome.sph.umich.edu/wiki/BAM [BWA]: https://academic.oup.com/bioinformatics/article/25/14/1754/225615/Fast-and-accurate-short-read-alignment-with [docker build]: https://docs.docker.com/engine/reference/commandline/build/ -[Dockerfile]: https://github.com/google/deepvariant/blob/r1.8/Dockerfile +[Dockerfile]: https://github.com/google/deepvariant/blob/r1.7/Dockerfile [FASTA]: https://en.wikipedia.org/wiki/FASTA_format [Quick Start in r0.7]: https://github.com/google/deepvariant/blob/r0.7/docs/deepvariant-quick-start.md [VCF]: https://samtools.github.io/hts-specs/VCFv4.3.pdf diff --git a/docs/deepvariant-vg-case-study.md b/docs/deepvariant-vg-case-study.md index 24f0a551..86a38ef6 100644 --- a/docs/deepvariant-vg-case-study.md +++ b/docs/deepvariant-vg-case-study.md @@ -4,7 +4,7 @@ This is an example to run `vg giraffe`, so we can go from FASTQs --> BAM. For simplicity and consistency, we run the following with a -[Google Cloud instance with 96 cores](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform). 
+[Google Cloud instance with 64 cores](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform). I added more disks because 300G is not enough for the example below. I changed it to `--boot-disk-size "1000"`. @@ -184,11 +184,11 @@ And then, run DeepVariant. [DeepVariant Case Study](deepvariant-case-study.md).) ```bash -BIN_VERSION="1.8.0" +BIN_VERSION="1.7.0" sudo docker pull google/deepvariant:"${BIN_VERSION}" -time sudo docker run \ +time sudo docker run --rm \ -v "${DATA_DIR}":"${DATA_DIR}" \ -v "${PWD}:${PWD}" \ google/deepvariant:"${BIN_VERSION}" \ @@ -204,9 +204,9 @@ time sudo docker run \ Stage | Time (minutes) -------------------------------- | ----------------- -make_examples | 59m19.845s -call_variants | 49m41.643s -postprocess_variants (with gVCF) | 7m46.195s +make_examples | 101m31.676s +call_variants | 215m33.631s +postprocess_variants (with gVCF) | 24m44.242s ### Run hap.py @@ -244,16 +244,21 @@ Output: ``` Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 504501 502210 2291 954974 1522 429900 956 362 0.995459 0.997101 0.450169 0.996279 NaN NaN 1.489759 1.942299 -INDEL PASS 504501 502210 2291 954974 1522 429900 956 362 0.995459 0.997101 0.450169 0.996279 NaN NaN 1.489759 1.942299 - SNP ALL 3327496 3316336 11160 3823082 4229 500683 1696 356 0.996646 0.998727 0.130963 0.997686 2.102576 1.990152 1.535137 1.449299 - SNP PASS 3327496 3316336 11160 3823082 4229 500683 1696 356 0.996646 0.998727 0.130963 0.997686 2.102576 1.990152 1.535137 1.449299 +INDEL ALL 504501 502283 2218 958181 1471 433079 913 351 0.995604 0.997199 0.451980 0.996400 NaN NaN 1.489759 1.954212 +INDEL PASS 504501 502283 2218 958181 1471 433079 913 351 0.995604 0.997199 0.451980 0.996400 NaN NaN 1.489759 1.954212 + SNP ALL 3327496 3316374 
11122 3820052 4177 497662 1686 344 0.996658 0.998743 0.130276 0.997699 2.102576 1.991054 1.535137 1.457635 + SNP PASS 3327496 3316374 11122 3820052 4177 497662 1686 344 0.996658 0.998743 0.130276 0.997699 2.102576 1.991054 1.535137 1.457635 ``` +| Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | +| ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | +| INDEL | 502283 | 2218 | 1471 | 0.995604 | 0.997199 | 0.9964 | +| SNP | 3316374 | 11122 | 4177 | 0.996658 | 0.998743 | 0.997699 | + This can be compared with -https://github.com/google/deepvariant/blob/r1.8/docs/metrics.md#accuracy. +https://github.com/google/deepvariant/blob/r1.7/docs/metrics.md#accuracy. Which shows that `vg giraffe` improves F1: -- Indel F1: 0.995945 --> 0.996279 -- SNP F1: 0.996213 --> 0.997686 +- Indel F1: 0.995998 --> 0.9964 +- SNP F1: 0.996237 --> 0.997699 diff --git a/docs/metrics-deeptrio.md b/docs/metrics-deeptrio.md index a2a5f306..0afcb9d0 100644 --- a/docs/metrics-deeptrio.md +++ b/docs/metrics-deeptrio.md @@ -2,18 +2,6 @@ ## WGS (Illumina) -## Setup - -The runtime and accuracy reported in this page are generated using -`n2-standard-96` GCP instances which has the following configuration: - -```bash -GCP instance type: n2-standard-96 -CPUs: 96-core (vCPU) -Memory: 384GiB -GPUs: 0 -``` - ### Runtime Runtime is on HG002/HG003/HG004 (all chromosomes). @@ -21,15 +9,15 @@ Reported runtime is an average of 5 runs. 
Stage | Wall time (minutes) -------------------------------- | ----------------- -make_examples | 172m53.87s -call_variants: HG002 | 269m26.55s -call_variants: HG003 | 268m2.29s -call_variants: HG004 | 270m22.72s -postprocess_variants (parallel) | 34m12.36s; 35m4.75s; 35m8.14s -vcf_stats_report(optional):HG002 | 6m36.58s -vcf_stats_report(optional):HG003 | 6m39.92s -vcf_stats_report(optional):HG003 | 6m40.64s -total | 1028m3.08s (17h08m3.08s) +make_examples | 381m27.76s +call_variants: HG002 | 376m44.92s +call_variants: HG003 | 379m55.40s +call_variants: HG004 | 380m27.95s +postprocess_variants (parallel) | 45m24.88s; 47m0.02s; 47m46.29s +vcf_stats_report(optional):HG002 | 9m20.03s +vcf_stats_report(optional):HG003 | 9m29.88s +vcf_stats_report(optional):HG004 | 9m29.88s +total | 1576m56.29s (26h16m56.29s) ### Accuracy @@ -59,13 +47,13 @@ truth), which was held out while training. | SNP | 71445 | 214 | 48 | 0.997014 | 0.999329 | 0.99817 | * See VCF stats report (for all chromosomes) - - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WGS/HG002.output.visual_report.html) - - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WGS/HG003.output.visual_report.html) - - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WGS/HG004.output.visual_report.html) + - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WGS/HG002.output.visual_report.html) + - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WGS/HG003.output.visual_report.html) + - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WGS/HG004.output.visual_report.html) ## PacBio (HiFi) -Read haplotagging in DeepTrio PacBio is on by default. You no longer +In v1.7.0, we introduced read haplotagging in DeepTrio PacBio. You no longer need to run DeepVariant->WhatsHap->DeepTrio, and can just run DeepTrio once.
### Runtime @@ -75,20 +63,20 @@ Reported runtime is an average of 5 runs. Stage | Wall time (minutes) -------------------------------- | ------------------- -make_examples | 16m48.88s+288m15.08s -call_variants: HG002 | 279m5.76s -call_variants: HG003 | 274m47.90s -call_variants: HG004 | 283m37.89s -postprocess_variants (parallel) | 44m12.28s; 51m39.02s; 51m52.66s -vcf_stats_report(optional):HG002 | 6m49.94s -vcf_stats_report(optional):HG003 | 6m53.24s -vcf_stats_report(optional):HG003 | 7m19.57s -total | 1206m35.85s (20h6m35.85s) +make_examples | 50m35.96s+621m56.74s +call_variants: HG002 | 364m39.93s +call_variants: HG003 | 368m0.84s +call_variants: HG004 | 372m44.77s +postprocess_variants (parallel) | 58m52.92s; 66m36.57s; 67m35.91s +vcf_stats_report(optional):HG002 | 9m33.72s +vcf_stats_report(optional):HG003 | 9m48.13s +vcf_stats_report(optional):HG004 | 10m1.22s +total | 1858m53.78s (30h58m53.78s) * See VCF stats report (for all chromosomes) - - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/PACBIO/HG002.output.visual_report.html) - - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/PACBIO/HG003.output.visual_report.html) - - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/PACBIO/HG004.output.visual_report.html) + - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/PACBIO/HG002.output.visual_report.html) + - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/PACBIO/HG003.output.visual_report.html) + - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/PACBIO/HG004.output.visual_report.html) ### Accuracy @@ -108,7 +96,6 @@ truth), which was held out while training.
| ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | | INDEL | 10577 | 51 | 77 | 0.995201 | 0.993089 | 0.994144 | | SNP | 70143 | 23 | 35 | 0.999672 | 0.999502 | 0.999587 | - #### HG004: | Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | @@ -125,15 +112,15 @@ Reported runtime is an average of 5 runs. Stage | Wall time (minutes) -------------------------------- | -------------- -make_examples | 7m11.47s -call_variants: HG002 | 3m49.25s -call_variants: HG003 | 3m53.32s -call_variants: HG004 | 3m52.68s -postprocess_variants (parallel) | 0m40.52s; 0m42.09s; 0m42.30s -vcf_stats_report(optional):HG002 | 0m5.65s -vcf_stats_report(optional):HG003 | 0m5.69s -vcf_stats_report(optional):HG003 | 0m7.15s -total | 20m6.26s +make_examples | 15m6.77s +call_variants: HG002 | 5m16.13s +call_variants: HG003 | 5m18.83s +call_variants: HG004 | 5m19.09s +postprocess_variants (parallel) | 0m51.70s; 0m52.27s; 0m53.73s +vcf_stats_report(optional):HG002 | 0m7.84s +vcf_stats_report(optional):HG003 | 0m8.01s +vcf_stats_report(optional):HG004 | 0m10.00s +total | 32m20.47s ### Accuracy @@ -163,14 +150,14 @@ truth), which was held out while training.
| SNP | 676 | 3 | 0 | 0.995582 | 1.0 | 0.997786 | * See VCF stats report (for all chromosomes) - - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WES/HG002.output.visual_report.html) - - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WES/HG003.output.visual_report.html) - - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.8.0/WES/HG004.output.visual_report.html) + - [HG002](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WES/HG002.output.visual_report.html) + - [HG003](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WES/HG003.output.visual_report.html) + - [HG004](https://storage.googleapis.com/deepvariant/visual_reports/DeepTrio/1.7.0/WES/HG004.output.visual_report.html) ## How to reproduce the metrics on this page For simplicity and consistency, we report runtime with a -[CPU instance with 96 CPUs](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform) +[CPU instance with 64 CPUs](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform) For bigger datasets (WGS and PACBIO), we used bigger disk size (900G). This is NOT the fastest or cheapest configuration. @@ -179,7 +166,7 @@ Use `gcloud compute ssh` to log in to the newly created instance. Download and run any of the following case study scripts: ``` -curl -O https://raw.githubusercontent.com/google/deepvariant/r1.8/scripts/inference_deeptrio.sh +curl -O https://raw.githubusercontent.com/google/deepvariant/r1.7/scripts/inference_deeptrio.sh # WGS bash inference_deeptrio.sh --model_preset WGS @@ -197,4 +184,4 @@ DeepTrio. The runtime numbers reported above are the average of 5 runs each. The accuracy metrics come from the hap.py summary.csv output file. The runs are deterministic so all 5 runs produced the same output. 
-[CPU instance with 96 CPUs]: deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform +[CPU instance with 64 CPUs]: deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform diff --git a/docs/metrics.md b/docs/metrics.md index fadb0c43..ad45a5fb 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,17 +1,5 @@ # Runtime and accuracy metrics for all release models -## Setup - -The runtime and accuracy reported in this page are generated using -`n2-standard-96` GCP instances which has the following configuration: - -```bash -GCP instance type: n2-standard-96 -CPUs: 96-core (vCPU) -Memory: 384GiB -GPUs: 0 -``` - ## WGS (Illumina) ### Runtime @@ -21,11 +9,11 @@ Reported runtime is an average of 5 runs. Stage | Time (minutes) -------------------------------- | ------------------ -make_examples | 54m58.62s -call_variants | 38m45.29s -postprocess_variants (with gVCF) | 8m22.88s -vcf_stats_report (optional) | 5m37.52s (optional) -total | 113m11.70s (1h53m11.70s) +make_examples | 114m35.42s +call_variants | 65m6.69s +postprocess_variants (with gVCF) | 12m7.55s +vcf_stats_report (optional) | 9m9.16s +total | 208m46.70s (3h28m46.70s) ### Accuracy @@ -37,7 +25,7 @@ held out while training. | INDEL | 501653 | 2848 | 1289 | 0.994355 | 0.997541 | 0.995945 | | SNP | 3306740 | 20756 | 4386 | 0.993762 | 0.998676 | 0.996213 | -[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.8.0/WGS/deepvariant.output.visual_report.html) +[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.7.0/WGS/deepvariant.output.visual_report.html) ## WES (Illumina) @@ -48,11 +36,11 @@ Reported runtime is an average of 5 runs. 
Stage | Time (minutes) -------------------------------- | ----------------- -make_examples | 3m17.64s -call_variants | 0m56.36s -postprocess_variants (with gVCF) | 0m39.27s -vcf_stats_report (optional) | 0m4.93s (optional) -total | 5m26.00s +make_examples | 6m35.96s +call_variants | 1m31.70s +postprocess_variants (with gVCF) | 0m58.50s +vcf_stats_report (optional) | 0m7.67s +total | 9m38.11s ### Accuracy @@ -61,26 +49,13 @@ held out while training. | Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | | ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | -| INDEL | 1020 | 31 | 7 | 0.970504 | 0.993327 | 0.981783 | -| SNP | 24984 | 295 | 60 | 0.98833 | 0.997604 | 0.992946 | +| INDEL | 1020 | 31 | 12 | 0.970504 | 0.988615 | 0.979476 | +| SNP | 24982 | 297 | 64 | 0.988251 | 0.997445 | 0.992827 | -[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.8.0/WES/deepvariant.output.visual_report.html) +[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.7.0/WES/deepvariant.output.visual_report.html) ## PacBio (HiFi) -### Updated dataset in release 1.8.0 - -In release 1.8.0, we have updated the PacBio test data from HG003 Sequel-II to -latest Revio with SPRQ chemistry data to showcase performance on the updated -platform and chemistry. The numbers reported here are generated using the bam -that can be found in: - -```bash -gs://deepvariant/pacbio-case-study-testdata/HG003.SPRQ.pacbio.GRCh38.nov2024.bam -``` - -Which is also available through [here](https://downloads.pacbcloud.com/public/revio/2024Q4/WGS/GIAB_trio/HG003/analysis/GRCh38.m84039_241002_000337_s3.hifi_reads.bc2020.bam). - ### Runtime Runtime is on HG003 (all chromosomes). @@ -88,11 +63,11 @@ Reported runtime is an average of 5 runs. 
Stage | Time (minutes) -------------------------------- | ------------------- -make_examples | 31m51.00s -call_variants | 34m49.62s -postprocess_variants (with gVCF) | 5m28.59s -vcf_stats_report (optional) | 5m36.49s (optional) -total | 86m50.09s (1h26m50.09s) +make_examples | 77m48.24s +call_variants | 82m33.11s +postprocess_variants (with gVCF) | 10m27.86s +vcf_stats_report (optional) | 10m1.56s +total | 190m21.52s (3h10m21.52s) ### Accuracy @@ -104,11 +79,10 @@ to run DeepVariant once. | Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | | ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | -| INDEL | 500955 | 3546 | 3373 | 0.992971 | 0.993555 | 0.993263 | -| SNP | 3321825 | 5670 | 4263 | 0.998296 | 0.99872 | 0.998508 | - +| INDEL | 501727 | 2774 | 2643 | 0.994501 | 0.994968 | 0.994735 | +| SNP | 3324603 | 2892 | 2056 | 0.999131 | 0.999382 | 0.999257 | -[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.8.0/PACBIO/deepvariant.output.visual_report.html) +[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.7.0/PACBIO/deepvariant.output.visual_report.html) ## ONT_R104 @@ -119,11 +93,11 @@ Reported runtime is an average of 5 runs. Stage | Time (minutes) -------------------------------- | -------------------- -make_examples | 53m25.60s -call_variants | 55m24.86s -postprocess_variants (with gVCF) | 7m17.83s -vcf_stats_report (optional) | 6m30.29s (optional) -total | 127m56.44s (2h7m56.44s) +make_examples | 125m50.85s +call_variants | 107m45.55s +postprocess_variants (with gVCF) | 11m30.19s +vcf_stats_report (optional) | 11m0.03s +total | 258m4.89s (4h18m4.89s) ### Accuracy @@ -132,11 +106,10 @@ truth), which was held out while training. 
| Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | | ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | -| INDEL | 452010 | 52491 | 40289 | 0.895955 | 0.920501 | 0.908062 | -| SNP | 3321452 | 6032 | 3942 | 0.998187 | 0.998815 | 0.998501 | +| INDEL | 443724 | 60777 | 42938 | 0.87953 | 0.914541 | 0.896694 | +| SNP | 3319440 | 8044 | 5005 | 0.997583 | 0.998495 | 0.998039 | - -[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.8.0/ONT_R104/deepvariant.output.visual_report.html) +[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.7.0/ONT_R104/deepvariant.output.visual_report.html) ## Hybrid (Illumina + PacBio HiFi) @@ -146,12 +119,12 @@ Runtime is on HG003 (all chromosomes). Reported runtime is an average of 5 runs. Stage | Time (minutes) --------------------------------- | ------------------ -make_examples | 71m52.43s -call_variants | 51m42.37s -postprocess_variants (with gVCF) | 4m6.13s -vcf_stats_report (optional) | 5m18.39s (optional) -total | 151m34.49s (2h31m34.49s) +-------------------------------- | ------------------- +make_examples | 160m29.82s +call_variants | 90m45.03s +postprocess_variants (with gVCF) | 6m43.66s +vcf_stats_report (optional) | 9m31.77s +total | 278m59.35s (4h38m59.35s) ### Accuracy @@ -163,7 +136,7 @@ out while training the hybrid model. 
| INDEL | 503109 | 1392 | 2636 | 0.997241 | 0.995022 | 0.99613 | | SNP | 3324179 | 3316 | 2049 | 0.999003 | 0.999384 | 0.999194 | -[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.8.0/HYBRID/deepvariant.output.visual_report.html) +[See VCF stats report.](https://storage.googleapis.com/deepvariant/visual_reports/DeepVariant/1.7.0/HYBRID/deepvariant.output.visual_report.html) ## Inspect outputs that produced the metrics above @@ -179,7 +152,7 @@ https://42basepairs.com/browse/gs/deepvariant/case-study-outputs ## How to reproduce the metrics on this page For simplicity and consistency, we report runtime with a -[CPU instance with 96 CPUs](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform) +[CPU instance with 64 CPUs](deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform). This is NOT the fastest or cheapest configuration. Use `gcloud compute ssh` to log in to the newly created instance. @@ -188,7 +161,7 @@ Download and run any of the following case study scripts: ``` # Get the script. -curl -O https://raw.githubusercontent.com/google/deepvariant/r1.8/scripts/inference_deepvariant.sh +curl -O https://raw.githubusercontent.com/google/deepvariant/r1.7/scripts/inference_deepvariant.sh # WGS bash inference_deepvariant.sh --model_preset WGS @@ -211,5 +184,5 @@ DeepVariant. The runtime numbers reported above are the average of 5 runs each. The accuracy metrics come from the hap.py summary.csv output file. The runs are deterministic so all 5 runs produced the same output. 
-[CPU instance with 96 CPUs]: deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform +[CPU instance with 64 CPUs]: deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform diff --git a/docs/pangenome-aware-metrics.md b/docs/pangenome-aware-metrics.md index 9b86e9fb..da80f67d 100644 --- a/docs/pangenome-aware-metrics.md +++ b/docs/pangenome-aware-metrics.md @@ -1,20 +1,16 @@ # Runtime and accuracy metrics for Pangenome-aware DeepVariant -## Setup +## How to reproduce the metrics on this page -The runtime and accuracy reported in this page are generated using -`n2-standard-96` GCP instances which has the following configuration: +We report runtime with a n1-highmem-96 machine, and we ran with +`--num_shards 50` to avoid OOM. -```bash -GCP instance type: n2-standard-96 -CPUs: 96-core (vCPU) -Memory: 384GiB -GPUs: 0 -``` +This is NOT the fastest or cheapest configuration. We plan to improve the high +memory usage in the future. ## WGS (Illumina) -BAM: We used the VG Giraffe mapped BAM file. The file is available here: +BAM: We used the VG Giraffe mapped BAM file. You can find it in: `gs://deepvariant/vg-case-study/HG003.novaseq.pcr-free.35x.vg-1.55.0.bam` ### Runtime @@ -24,21 +20,24 @@ Reported runtime is an average of 5 runs. Stage | Time (minutes) -------------------------------- | ------------------ -make_examples | 85m58.66s -call_variants | 313m49.80s -postprocess_variants (with gVCF) | 7m52.00s -vcf_stats_report (optional) | 5m48.88s -total | 423m6.93s (7h3m6.93s) +make_examples | 170m2.87s +call_variants | 522m44.28s +postprocess_variants (with gVCF) | 10m20.44s +vcf_stats_report (optional) | 9m4.78s +total | 721m14.42s (12h1m14.42s) ### Accuracy hap.py results on HG003 (all chromosomes, using NIST v4.2.1 truth), which was held out while training. +TODO: Update this table after retraining with pangenome=GBZ. Note that +these numbers are not a correct base for future comparison. 
+ | Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | | ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | -| INDEL | 502338 | 2163 | 1511 | 0.995713 | 0.997122 | 0.996417 | -| SNP | 3320044 | 7452 | 4735 | 0.99776 | 0.998577 | 0.998168 | +| INDEL | 501918 | 2583 | 1757 | 0.99488 | 0.996652 | 0.995765 | +| SNP | 3318596 | 8900 | 8158 | 0.997325 | 0.997549 | 0.997437 | ## WES (Illumina) @@ -49,46 +48,16 @@ Reported runtime is an average of 5 runs. Stage | Time (minutes) -------------------------------- | ----------------- -make_examples | 5m4.22s -call_variants | 1m50.67s -postprocess_variants (with gVCF) | 0m38.74s -vcf_stats_report (optional) | 0m4.91s -total | 10m20.44s +make_examples | TODO +call_variants | TODO +postprocess_variants (with gVCF) | TODO +vcf_stats_report (optional) | TODO +total | TODO ### Accuracy hap.py results on HG003 (all chromosomes, using NIST v4.2.1 truth), which was held out while training. -| Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score | -| ----- | -------- | -------- | -------- | ------------- | ---------------- | --------------- | -| INDEL | 1020 | 31 | 15 | 0.970504 | 0.985782 | 0.978083 | -| SNP | 25006 | 273 | 54 | 0.989201 | 0.997845 | 0.993504 | - -## How to reproduce the metrics on this page - -For simplicity and consistency, we report runtime with a -[CPU instance with 96 CPUs](https://github.com/google/deepvariant/blob/r1.8/docs/deepvariant-details.md#command-for-a-cpu-only-machine-on-google-cloud-platform) -This is NOT the fastest or cheapest configuration. - -Use `gcloud compute ssh` to log in to the newly created instance. - -Download and run any of the following case study scripts: - -``` -# Get the script. 
-curl -O https://raw.githubusercontent.com/google/deepvariant/r1.8/scripts/inference_deepvariant.sh - -# WGS-PANGENOME -bash inference_deepvariant.sh --model_preset WGS_PANGENOME - -# WGS-PANGENOME -bash inference_deepvariant.sh --model_preset WES_PANGENOME -``` - -Runtime metrics are taken from the resulting log after each stage of -DeepVariant. - -The accuracy metrics came from the hap.py program. - +TODO diff --git a/docs/pangenome-aware-wes-bwa-case-study.md b/docs/pangenome-aware-wes-bwa-case-study.md deleted file mode 100644 index 4f924dc7..00000000 --- a/docs/pangenome-aware-wes-bwa-case-study.md +++ /dev/null @@ -1,166 +0,0 @@ -# DeepVariant Pangenome-aware WES case study (mapped with BWA) - -To make it faster to run over this case study, we run only on chromosome 20. - -## Prepare environment - -### Tools - -[Docker](https://docs.docker.com/get-docker/) will be used to run DeepVariant -and [hap.py](https://github.com/illumina/hap.py), - -### Download Reference - -We will be using GRCh38 for this case study. - -```bash -mkdir -p reference - -FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids - -curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | gunzip > reference/GRCh38_no_alt_analysis_set.fasta -curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai > reference/GRCh38_no_alt_analysis_set.fasta.fai -``` - -### Download Genome in a Bottle Benchmarks - -We will benchmark our variant calls against v4.2.1 of the Genome in a Bottle -small variant benchmarks for HG003. 
- -```bash -mkdir -p benchmark - -FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG003_NA24149_father/NISTv4.2.1/GRCh38 - -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz -curl ${FTPDIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi -``` - -### Download HG003 chr20 BAM - -We'll use HG003 Illumina WES reads. - -```bash -mkdir -p input -HTTPDIR=https://storage.googleapis.com/deepvariant/exome-case-study-testdata - -curl ${HTTPDIR}/HG003.novaseq.wes_idt.100x.dedup.bam > input/HG003.novaseq.wes_idt.100x.dedup.bam -curl ${HTTPDIR}/HG003.novaseq.wes_idt.100x.dedup.bam.bai > input/HG003.novaseq.wes_idt.100x.dedup.bam.bai -``` - -### Download capture target BED file - -In this case study we'll use `idt_capture_novogene.grch38.bed` as the capture -target BED file. For evaluation, `hap.py` will intersect this BED with the GIAB -confident regions. - -```bash -HTTPDIR=https://storage.googleapis.com/deepvariant/exome-case-study-testdata - -curl ${HTTPDIR}/idt_capture_novogene.grch38.bed > input/idt_capture_novogene.grch38.bed -``` - -### Download GBZ built for GRCh38 - -```bash -mkdir -p input -HTTPDIR=https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.1-mc-grch38 - -curl ${HTTPDIR}/hprc-v1.1-mc-grch38.gbz > input/hprc-v1.1-mc-grch38.gbz -``` - -## Running Pangenome-aware DeepVariant with one command - -DeepVariant pipeline consists of 3 steps: `make_examples`, `call_variants`, and -`postprocess_variants`. You can now run DeepVariant with one command using the -`run_pangenome_aware_deepvariant` script. 
- -### Running on a CPU-only machine - -In this example, we used a -[n2-standard-96](https://cloud.google.com/compute/docs/general-purpose-machines) -machine. - -```bash -mkdir -p output -mkdir -p output/intermediate_results_dir - -BIN_VERSION="pangenome_aware_deepvariant-1.8.0" - -sudo docker pull google/deepvariant:"${BIN_VERSION}" - -sudo docker run \ - -v "${PWD}/input":"/input" \ - -v "${PWD}/output":"/output" \ - -v "${PWD}/reference":"/reference" \ - --shm-size 12gb \ - google/deepvariant:"${BIN_VERSION}" \ - /opt/deepvariant/bin/run_pangenome_aware_deepvariant \ - --model_type WES \ - --ref /reference/GRCh38_no_alt_analysis_set.fasta \ - --reads /input/HG003.novaseq.wes_idt.100x.dedup.bam \ - --pangenome /input/hprc-v1.1-mc-grch38.gbz \ - --output_vcf /output/HG003.output.vcf.gz \ - --output_gvcf /output/HG003.output.g.vcf.gz \ - --num_shards $(nproc) \ - --regions chr20 \ - --intermediate_results_dir /output/intermediate_results_dir -``` - - -By specifying `--model_type WES`, you'll be using a model that is best -suited for short-read WES data. - -NOTE: If you want to run each of the steps separately, add `--dry_run=true` -to the command above to figure out what flags you need in each step. - -`--intermediate_results_dir` flag is optional. By specifying it, the -intermediate outputs can be found in the directory. After the command, you can -find these intermediate files in the directory: - -``` -call_variants_output-?????-of-?????.tfrecord.gz -gvcf.tfrecord-?????-of-?????.gz -make_examples_pangenome_aware_dv.tfrecord-?????-of-?????.gz -``` - -For running on GPU machines, or using Singularity instead of Docker, see -[Quick Start](deepvariant-quick-start.md). 
- -## Benchmark on chr20 - -```bash -mkdir -p happy - -sudo docker pull jmcdani20/hap.py:v0.3.12 - -sudo docker run \ - -v "${PWD}/benchmark":"/benchmark" \ - -v "${PWD}/input":"/input" \ - -v "${PWD}/output":"/output" \ - -v "${PWD}/reference":"/reference" \ - -v "${PWD}/happy:/happy" \ - jmcdani20/hap.py:v0.3.12 \ - /opt/hap.py/bin/hap.py \ - /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ - /output/HG003.output.vcf.gz \ - -f /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ - -T /input/idt_capture_novogene.grch38.bed \ - -r /reference/GRCh38_no_alt_analysis_set.fasta \ - -o /happy/happy.output \ - --engine=vcfeval \ - --pass-only \ - -l chr20 -``` - -Output: - -``` -Benchmarking Summary: -Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 29 29 0 41 0 11 0 0 1.00000 1.0 0.268293 1.000000 NaN NaN 3.000000 2.727273 -INDEL PASS 29 29 0 41 0 11 0 0 1.00000 1.0 0.268293 1.000000 NaN NaN 3.000000 2.727273 - SNP ALL 685 683 2 704 0 21 0 0 0.99708 1.0 0.029830 0.998538 3.28125 3.266667 1.795918 1.838710 - SNP PASS 685 683 2 704 0 21 0 0 0.99708 1.0 0.029830 0.998538 3.28125 3.266667 1.795918 1.838710 -``` diff --git a/docs/pangenome-aware-wgs-bwa-case-study.md b/docs/pangenome-aware-wgs-bwa-case-study.md index e2a4f9f5..e3c8380b 100644 --- a/docs/pangenome-aware-wgs-bwa-case-study.md +++ b/docs/pangenome-aware-wgs-bwa-case-study.md @@ -68,31 +68,32 @@ DeepVariant pipeline consists of 3 steps: `make_examples`, `call_variants`, and ### Running on a CPU-only machine In this example, we used a -[n2-standard-96](https://cloud.google.com/compute/docs/general-purpose-machines) -machine. +[n1-standard-64](https://cloud.google.com/compute/docs/general-purpose-machines#n1-standard) +machine. 
Because reading in GBZ takes a lot of memory, we will run with +`--num_shards 18` below. ```bash mkdir -p output mkdir -p output/intermediate_results_dir -BIN_VERSION="pangenome_aware_deepvariant-1.8.0" +BIN_VERSION="1.7.0" +DOCKER=google/deepvariant:"${BIN_VERSION}" -sudo docker pull google/deepvariant:"${BIN_VERSION}" sudo docker run \ -v "${PWD}/input":"/input" \ -v "${PWD}/output":"/output" \ -v "${PWD}/reference":"/reference" \ --shm-size 12gb \ - google/deepvariant:"${BIN_VERSION}" \ + ${DOCKER} \ /opt/deepvariant/bin/run_pangenome_aware_deepvariant \ --model_type WGS \ --ref /reference/GRCh38_no_alt_analysis_set.fasta \ - --reads /input//HG003.novaseq.pcr-free.35x.dedup.grch38_no_alt.chr20.bam \ + --reads /input/HG003.novaseq.pcr-free.35x.dedup.grch38_no_alt.chr20.bam \ --pangenome /input/hprc-v1.1-mc-grch38.gbz \ --output_vcf /output/HG003.output.vcf.gz \ --output_gvcf /output/HG003.output.g.vcf.gz \ - --num_shards $(nproc) \ + --num_shards 18 \ --regions chr20 \ --intermediate_results_dir /output/intermediate_results_dir ``` @@ -130,8 +131,7 @@ sudo docker run \ -v "${PWD}/output":"/output" \ -v "${PWD}/reference":"/reference" \ -v "${PWD}/happy:/happy" \ - jmcdani20/hap.py:v0.3.12 \ - /opt/hap.py/bin/hap.py \ + jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ /output/HG003.output.vcf.gz \ -f /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ @@ -144,11 +144,13 @@ sudo docker run \ Output: +TODO: Update to new results + ``` Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 10628 10584 44 20850 19 9790 14 5 0.995860 0.998282 0.469544 0.99707 NaN NaN 1.748961 2.291024 -INDEL PASS 10628 10584 44 20850 19 9790 14 5 0.995860 0.998282 0.469544 0.99707 NaN NaN 1.748961 
2.291024 - SNP ALL 70166 69932 234 86798 66 16764 45 3 0.996665 0.999058 0.193138 0.99786 2.296566 2.016604 1.883951 1.739749 - SNP PASS 70166 69932 234 86798 66 16764 45 3 0.996665 0.999058 0.193138 0.99786 2.296566 2.016604 1.883951 1.739749 +INDEL ALL 10628 10581 47 21124 21 10071 16 5 0.995578 0.998100 0.476756 0.996837 NaN NaN 1.748961 2.290880 +INDEL PASS 10628 10581 47 21124 21 10071 16 5 0.995578 0.998100 0.476756 0.996837 NaN NaN 1.748961 2.290880 + SNP ALL 70166 69910 256 92500 95 22457 38 3 0.996352 0.998644 0.242778 0.997496 2.296566 1.989145 1.883951 2.127093 + SNP PASS 70166 69910 256 92500 95 22457 38 3 0.996352 0.998644 0.242778 0.997496 2.296566 1.989145 1.883951 2.127093 ``` diff --git a/docs/pangenome-aware-wgs-vg-case-study.md b/docs/pangenome-aware-wgs-vg-case-study.md index e7d80869..3459efdf 100644 --- a/docs/pangenome-aware-wgs-vg-case-study.md +++ b/docs/pangenome-aware-wgs-vg-case-study.md @@ -1,4 +1,4 @@ -# DeepVariant Pangenome-aware WGS case study (mapped with VG) +# DeepVariant Pangenome-aware WGS case study (mapped with VG Giraffe) To make it faster to run over this case study, we run only on chromosome 20. @@ -73,31 +73,33 @@ DeepVariant pipeline consists of 3 steps: `make_examples`, `call_variants`, and ### Running on a CPU-only machine In this example, we used a -[n2-standard-96](https://cloud.google.com/compute/docs/general-purpose-machines) -machine. +[n1-standard-64](https://cloud.google.com/compute/docs/general-purpose-machines#n1-standard) +machine. Because reading in GBZ takes a lot of memory, we will run with +`--num_shards 18` below. 
```bash mkdir -p output mkdir -p output/intermediate_results_dir -BIN_VERSION="pangenome_aware_deepvariant-1.8.0" +BIN_VERSION="1.7.0" +DOCKER=google/deepvariant:"${BIN_VERSION}" -sudo docker pull google/deepvariant:"${BIN_VERSION}" sudo docker run \ -v "${PWD}/input":"/input" \ -v "${PWD}/output":"/output" \ -v "${PWD}/reference":"/reference" \ --shm-size 12gb \ - google/deepvariant:"${BIN_VERSION}" \ + ${DOCKER} \ /opt/deepvariant/bin/run_pangenome_aware_deepvariant \ --model_type WGS \ --ref /reference/GRCh38_no_alt_analysis_set.fasta \ --reads /input/HG003.novaseq.pcr-free.35x.vg-1.55.0.chr20.bam \ + --make_examples_extra_args="min_mapping_quality=0,keep_legacy_allele_counter_behavior=true,normalize_reads=true" \ --pangenome /input/hprc-v1.1-mc-grch38.gbz \ --output_vcf /output/HG003.output.vcf.gz \ --output_gvcf /output/HG003.output.g.vcf.gz \ - --num_shards $(nproc) \ + --num_shards 18 \ --regions chr20 \ --intermediate_results_dir /output/intermediate_results_dir ``` @@ -135,8 +137,7 @@ sudo docker run \ -v "${PWD}/output":"/output" \ -v "${PWD}/reference":"/reference" \ -v "${PWD}/happy:/happy" \ - jmcdani20/hap.py:v0.3.12 \ - /opt/hap.py/bin/hap.py \ + jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ /output/HG003.output.vcf.gz \ -f /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ @@ -149,11 +150,13 @@ sudo docker run \ Output: +TODO: Update to new results + ``` Benchmarking Summary: Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio -INDEL ALL 10628 10594 34 21276 32 10189 21 8 0.996801 0.997114 0.478896 0.996957 NaN NaN 1.748961 2.231995 -INDEL PASS 10628 10594 34 21276 32 10189 21 8 0.996801 0.997114 0.478896 0.996957 NaN NaN 1.748961 2.231995 - SNP ALL 70166 70090 76 90303 94 20078 21 5 
0.998917 0.998661 0.222340 0.998789 2.296566 1.942569 1.883951 1.599631 - SNP PASS 70166 70090 76 90303 94 20078 21 5 0.998917 0.998661 0.222340 0.998789 2.296566 1.942569 1.883951 1.599631 +INDEL ALL 10628 10585 43 21463 40 10383 26 12 0.995954 0.996390 0.483763 0.996172 NaN NaN 1.748961 2.194253 +INDEL PASS 10628 10585 43 21463 40 10383 26 12 0.995954 0.996390 0.483763 0.996172 NaN NaN 1.748961 2.194253 + SNP ALL 70166 70072 94 92606 97 22398 22 6 0.998660 0.998618 0.241863 0.998639 2.296566 1.982488 1.883951 1.982758 + SNP PASS 70166 70072 94 92606 97 22398 22 6 0.998660 0.998618 0.241863 0.998639 2.296566 1.982488 1.883951 1.982758 ``` diff --git a/scripts/inference_deepvariant.sh b/scripts/inference_deepvariant.sh index d825d505..3c194048 100755 --- a/scripts/inference_deepvariant.sh +++ b/scripts/inference_deepvariant.sh @@ -312,7 +312,7 @@ if [[ "${MODEL_PRESET}" = "PACBIO" ]]; then BASE="${HOME}/pacbio-case-study" REF="${REF:=${GCS_DATA_DIR}/case-study-testdata/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna}" - BAM="${BAM:=${GCS_DATA_DIR}/pacbio-case-study-testdata/HG003.SPRQ.pacbio.GRCh38.nov2024.bam}" + BAM="${BAM:=${GCS_DATA_DIR}/pacbio-case-study-testdata/HG003.SPRQ.pacbio.GRCh38.40x.nov2024.bam}" TRUTH_VCF="${TRUTH_VCF:=${GCS_DATA_DIR}/case-study-testdata/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz}" TRUTH_BED="${TRUTH_BED:=${GCS_DATA_DIR}/case-study-testdata/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed}" elif [[ "${MODEL_PRESET}" = "ONT_R104" ]]; then diff --git a/third_party/nucleus/io/gfile.cc b/third_party/nucleus/io/gfile.cc index a66fdcab..aafd069b 100644 --- a/third_party/nucleus/io/gfile.cc +++ b/third_party/nucleus/io/gfile.cc @@ -41,13 +41,13 @@ namespace nucleus { bool Exists(const std::string& filename) { // FileExists sets s to tensorflow::error::NOT_FOUND if it doesn't exist. 
- tensorflow::Status s = tensorflow::Env::Default()->FileExists(filename); + absl::Status s = tensorflow::Env::Default()->FileExists(filename); return s.ok(); } std::vector Glob(const std::string& pattern) { std::vector results; - ::tensorflow::Status s = + absl::Status s = tensorflow::Env::Default()->GetMatchingPaths(pattern, &results); return results; } @@ -56,7 +56,7 @@ ReadableFile::ReadableFile() {} std::unique_ptr ReadableFile::New(const std::string& filename) { std::unique_ptr file; - tensorflow::Status status = + absl::Status status = tensorflow::Env::Default()->NewRandomAccessFile(filename, &file); if (!status.ok()) { return nullptr; @@ -91,8 +91,7 @@ WritableFile::WritableFile() {} std::unique_ptr WritableFile::New(const std::string& filename) { std::unique_ptr file; - tensorflow::Status s = - tensorflow::Env::Default()->NewWritableFile(filename, &file); + absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file); if (!s.ok()) { return nullptr; @@ -105,7 +104,7 @@ std::unique_ptr WritableFile::New(const std::string& filename) { } bool WritableFile::Write(const std::string& s) { - tensorflow::Status status = file_->Append(s); + absl::Status status = file_->Append(s); return status.ok(); } diff --git a/third_party/nucleus/io/tfrecord_reader.cc b/third_party/nucleus/io/tfrecord_reader.cc index 2826bdbe..3b119e66 100644 --- a/third_party/nucleus/io/tfrecord_reader.cc +++ b/third_party/nucleus/io/tfrecord_reader.cc @@ -45,7 +45,7 @@ TFRecordReader::TFRecordReader() {} std::unique_ptr TFRecordReader::New( const std::string& filename, const std::string& compression_type) { std::unique_ptr file; - tensorflow::Status s = + absl::Status s = tensorflow::Env::Default()->NewRandomAccessFile(filename, &file); if (!s.ok()) { LOG(ERROR) << s; @@ -74,7 +74,7 @@ bool TFRecordReader::GetNext() { return false; } - tensorflow::Status s = reader_->ReadRecord(&offset_, &record_); + absl::Status s = reader_->ReadRecord(&offset_, &record_); return s.ok(); } 
diff --git a/third_party/nucleus/io/tfrecord_writer.cc b/third_party/nucleus/io/tfrecord_writer.cc index c4f79314..c84360d4 100644 --- a/third_party/nucleus/io/tfrecord_writer.cc +++ b/third_party/nucleus/io/tfrecord_writer.cc @@ -45,8 +45,7 @@ TFRecordWriter::TFRecordWriter() {} std::unique_ptr TFRecordWriter::New( const std::string& filename, const std::string& compression_type) { std::unique_ptr file; - tensorflow::Status s = - tensorflow::Env::Default()->NewWritableFile(filename, &file); + absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file); if (!s.ok()) { LOG(ERROR) << s; return nullptr; @@ -69,7 +68,7 @@ bool TFRecordWriter::WriteRecord(const std::string& record) { if (writer_ == nullptr) { return false; } - tensorflow::Status s = writer_->WriteRecord(record); + absl::Status s = writer_->WriteRecord(record); return s.ok(); } @@ -77,13 +76,13 @@ bool TFRecordWriter::Flush() { if (writer_ == nullptr) { return false; } - tensorflow:: Status s = writer_->Flush(); + absl::Status s = writer_->Flush(); return s.ok(); } bool TFRecordWriter::Close() { if (writer_ != nullptr) { - tensorflow::Status s = writer_->Close(); + absl::Status s = writer_->Close(); if (!s.ok()) { return false; } @@ -91,7 +90,7 @@ bool TFRecordWriter::Close() { } if (file_ != nullptr) { - tensorflow:: Status s = file_->Close(); + absl::Status s = file_->Close(); if (!s.ok()) { return false; }