
Commit 560ece4

Merge branch 'multiple_sequences_as_input' of https://github.com/luisas/proteinfold into multiple_sequences_as_input

luisas committed Nov 26, 2024
2 parents ec35c2b + 5e97599
Showing 4 changed files with 14 additions and 11 deletions.
2 changes: 1 addition & 1 deletion conf/test_split_fasta.config
@@ -28,7 +28,7 @@ params {
     colabfold_server = 'local'
     split_fasta = true
     colabfold_db = "${projectDir}/assets/dummy_db_dir"
-    input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv'
+    input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv'
 }

 process {
2 changes: 1 addition & 1 deletion docs/output.md
@@ -23,7 +23,7 @@ The directories listed below will be created in the output directory after the p
 <details markdown="1">
 <summary>Output files</summary>

-- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
+- `alphafold2_standard/` or `alphafold2_split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
 - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
 - `DBs/` that contains symbolic links to the downloaded database and parameter files
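The exact layout of `<SEQUENCE NAME>_plddt_mqc.tsv` is not shown in this diff; a purely illustrative sketch of a per-residue table with one column per ranked model (column names and values assumed, not taken from the pipeline):

```
Position	ranked_0	ranked_1	ranked_2	ranked_3	ranked_4
1	92.4	91.8	90.2	88.7	85.1
2	94.1	93.5	91.9	90.0	86.3
```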
2 changes: 1 addition & 1 deletion docs/usage.md
@@ -35,7 +35,7 @@ The samplesheet can have as many columns as you desire, however, there is a stri

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

-Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the --split_fasta parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet.
+Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet.

 ## Running the pipeline
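For readers unfamiliar with the operator the pipeline relies on for this, here is a minimal self-contained Nextflow sketch of the same idea (`multi.fa` is a hypothetical two-sequence FASTA; this is not pipeline code):

```groovy
// Each record of a multi-sequence FASTA becomes its own channel element,
// so downstream folding steps run once per sequence, in parallel.
Channel
    .fromPath('multi.fa')
    .splitFasta( record: [header: true, sequence: true] )
    .view { record -> "would fold '${record.header}' as a separate entry" }
```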
19 changes: 11 additions & 8 deletions subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf
@@ -67,20 +67,23 @@ workflow PIPELINE_INITIALISATION {
     //
     ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json"))
     if (params.split_fasta) {
-        // here we have to validate that the ids are unique and valid as an extra step
+        // TODO: here we have to validate that the ids are unique and valid as an extra step
         // since it is not done with the samplesheet schema (they are all in the same file)
         ch_samplesheet.map { meta, fasta ->
             validateFasta(fasta)
         }

         // Split the fasta file into individual files for each sequence
-        ch_samplesheet.map{ meta,fasta -> fasta}
-            .splitFasta( record: [header: true, sequence: true] )
-            .collectFile { item ->
-                [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' + item["sequence"] ]
-            }.map{
-                file -> [[id: file.baseName], file]
-            }.set{ch_samplesheet}
+        ch_samplesheet
+            .map { meta, fasta -> fasta }
+            .splitFasta( record: [header: true, sequence: true] )
+            .collectFile { item ->
+                [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' + item["sequence"] ]
+            }
+            .map {
+                file -> [[id: file.baseName], file]
+            }
+            .set { ch_samplesheet }
     }

 emit:
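The diff calls `validateFasta` and `cleanHeader` without showing them. A hedged guess at their shape, for orientation only (these are not the actual implementations):

```groovy
// Illustrative guesses only: the real helpers live elsewhere in main.nf.
def cleanHeader(String header) {
    // Sanitize a FASTA header into a filesystem-safe id (assumed behaviour)
    return header.tokenize()[0].replaceAll(/[^A-Za-z0-9_.-]/, '_')
}

def validateFasta(fasta) {
    // Fail early on duplicate headers within one file (assumed behaviour)
    def headers = fasta.readLines().findAll { it.startsWith('>') }
    assert headers.size() == headers.unique(false).size() : "Duplicate FASTA headers in ${fasta}"
}
```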
