diff --git a/brouillon.nf b/brouillon.nf new file mode 100644 index 0000000000000000000000000000000000000000..4b5e08076edc6e04897d5985625e7527713621f0 --- /dev/null +++ b/brouillon.nf @@ -0,0 +1,105 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +def getAndCheckHeader() { + File file = new File(params.input) + assert file.exists() : "${params.input} file not found" + def line=""; + file.withReader { reader -> + line = reader.readLine() + } + def tab = line.split(/,/) + def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly'] + for (i in tab) { + if (!list.contains(i)) { + exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' + } + } + if (!tab.contains("sample") || !tab.contains("fastq_1")){ + exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line' + } + return tab +} + + +def returnFile(it) { + if (it == null) { + return null + } else { + if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information" + } + return file(it) +} + +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + + +workflow { + if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } + +} + + header = getAndCheckHeader() + Channel.from(file(params.input) + .splitCsv ( header:true, sep:',' ) ) + .map { row -> + def sample = row.sample + def paired = false + if (row.fastq_2 != null) { + paired = true + } + if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) { + exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." 
+ } + ["sample":row.sample, + "flowcell":row.flowcell, + "fastq_1":returnFile(row.fastq_1), + "fastq_2":returnFile(row.fastq_2), + "paired":paired, + "assembly":returnFile(row.assembly) ] + } + .set { ch_inputs } + + ch_inputs.view() + +ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + + ch_inputs + .map { item -> + if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} + else {[item.sample, item.fastq_1, item.fastq_2 ]}} + .set{ch_reads_sampleIDunique} + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} + else {[item.sample, item.paired]}} + .set{ch_meta} + + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } + else {[item.sample, item.assembly ]}} + .set { ch_assembly } + +ch_reads_sampleIDunique.view() +ch_meta.view() +ch_assembly.view() + + + // ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) + // ch_inputs + // .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } + // .set { ch_reads } + // ch_inputs + // .map { item -> [ item.sample, item.paired ] } + // .set { ch_paired } + // ch_inputs + // .map { item -> [ item.sample, item.assembly ] } + // .set { ch_assembly } + //ch_reads.view{ it -> "${it}" } + //ch_paired.view{ it -> "${it}" } \ No newline at end of file diff --git a/main.nf b/main.nf index 2bbe4e238f7eb75a247829c85b52a73fa8d2d0ff..4bfb6bbd827ee083b8e69a9890dfda77f3eeb58a 100644 --- a/main.nf +++ b/main.nf @@ -109,22 +109,19 @@ def getAndCheckHeader() { line = reader.readLine() } def tab = line.split(/,/) - if (! 
((tab[0] == "sample") && (tab[1] == "fastq_1") )) { - exit 1, 'Error 1 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - if (tab.size() == 3 ){ - if (!((tab[2] == "fastq_2") || (tab[2] == "assembly"))) { - exit 1, 'Error 2 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - } - if (tab.size() == 4) { - if ( ! ((tab[2] == "fastq_2") && (tab[3] == "assembly"))) { - exit 1, 'Error 3 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' + def list = ['sample','flowcell','group','fastq_1','fastq_2', 'assembly'] + for (i in tab) { + if (!list.contains(i)) { + exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' } + } + if (!tab.contains("sample") || !tab.contains("fastq_1")){ + exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line' } return tab } + def returnFile(it) { if (it == null) { return null @@ -144,11 +141,6 @@ workflow { // Check mandatory parameters - //////////// - // Start check samplesheet - //////////// - if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - if (params.type.toUpperCase() == 'SR') { if ( assembly_tool == null || assembly_tool == ''){ @@ -179,27 +171,68 @@ workflow { if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.diamond_bank) ) { exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank" } + + + //////////// + // Start check samplesheet + //////////// + if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} + header = getAndCheckHeader() Channel.from(file(params.input) .splitCsv ( header:true, sep:',' ) ) .map { row -> def sample = row.sample - def paired = false - if (row.fastq_2 != null) { - paired = true - } if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." } ["sample":row.sample, + "flowcell":row.flowcell, + "group":row.group, "fastq_1":returnFile(row.fastq_1), "fastq_2":returnFile(row.fastq_2), - "paired":paired, "assembly":returnFile(row.assembly) ] } .set { ch_inputs } - + ch_inputs + .map { item -> + def meta = [:] + meta.id = item.sample + if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} + meta.sample = item.sample + meta.flowcell = item.flowcell + meta.group = item.group + meta.assembly = item.assembly!=null + meta.type = params.type.toUpperCase() + if (meta.type=="SR"){ + return [meta,[item.fastq_1,item.fastq_2]] + } + else if (meta.type=="HIFI"){ + return [meta,[item.fastq_1]] + } + } + .set{ch_reads} + + + ch_inputs + .map { item -> + def meta = [:] + meta.id = item.sample + if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} + meta.sample = item.sample + meta.flowcell = item.flowcell + meta.group = item.group + meta.assembly = item.assembly!=null + meta.type = params.type.toUpperCase() + if (meta.type=="SR"){ + return [meta,item.assembly] + } + } + .set { ch_assembly } has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null) + //////////// // End check samplesheet //////////// @@ -210,6 +243,7 @@ workflow { ch_kaiju_db = Channel.empty() ch_eggnog_db = Channel.empty() ch_taxonomy = Channel.empty() + ch_diamond = Channel.empty() DATABASES () ch_host_fasta = DATABASES.out.host_fasta @@ 
-217,6 +251,7 @@ workflow { ch_kaiju_db = DATABASES.out.kaiju_db ch_eggnog_db = DATABASES.out.eggnog ch_taxonomy = DATABASES.out.taxonomy + ch_diamond = DATABASES.out.diamond ch_multiqc_config = Channel.empty() @@ -245,21 +280,9 @@ workflow { if ( params.type.toUpperCase() == "SR" ) { ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) - ch_inputs - .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } - .set { ch_reads } - ch_inputs - .map { item -> [ item.sample, item.paired ] } - .set { ch_paired } - ch_inputs - .map { item -> [ item.sample, item.assembly ] } - .set { ch_assembly } - //ch_reads.view{ it -> "${it}" } - //ch_paired.view{ it -> "${it}" } - + SR ( ch_reads, - ch_paired, ch_assembly, ch_host_fasta, ch_host_index, @@ -286,11 +309,6 @@ workflow { else if ( params.type.toUpperCase() == "HIFI" ) { ch_multiqc_config = file(params.hifi_multiqc_config, checkIfExists: true) - ch_inputs.map { item -> [ item.sample, item.assembly ] } // [sample, assembly] - .set { ch_assembly } - - ch_inputs.map { item -> [ item.sample, item.fastq_1 ] } // [sample, reads] - .set { ch_reads } HIFI_READS ( ch_reads, @@ -326,6 +344,7 @@ workflow { SH ( ch_reads, ch_assembly, + ch_diamond, ch_eggnog_db, ch_taxonomy ) diff --git a/modules/assembly.nf b/modules/assembly.nf index 0f4ec71095c98f9d295d89678e56ecaf77d966aa..22d962adfebfab9f77489233b291b7a075bf06c5 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -1,23 +1,23 @@ process METASPADES { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_SR' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("metaspades/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("metaspades/${sampleId}.log"), path("metaspades/${sampleId}.params.txt"), emit: report + tuple val(meta), path("metaspades/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), 
path("metaspades/${meta.id}.log"), path("metaspades/${meta.id}.params.txt"), emit: report script: (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0] if ( unit =~ /GB/ ) { """ - metaspades.py -t ${task.cpus} -m $mem -1 ${read1} -2 ${read2} -o metaspades - mv metaspades/scaffolds.fasta metaspades/${sampleId}.contigs.fa - mv metaspades/spades.log metaspades/${sampleId}.log - mv metaspades/params.txt metaspades/${sampleId}.params.txt + metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades + mv metaspades/scaffolds.fasta metaspades/${meta.id}.contigs.fa + mv metaspades/spades.log metaspades/${meta.id}.log + mv metaspades/params.txt metaspades/${meta.id}.params.txt """ } else { error "Memory setting for the ASSEMBLY process is in $unit, it must be in GB (check config files) " @@ -26,72 +26,72 @@ process METASPADES { process MEGAHIT { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_SR' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("megahit/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("megahit/${sampleId}.log"), path("megahit/${sampleId}.params.txt"), emit: report + tuple val(meta), path("megahit/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("megahit/${meta.id}.log"), path("megahit/${meta.id}.params.txt"), emit: report script: """ - megahit -t ${task.cpus} -1 ${read1} -2 ${read2} -o megahit --out-prefix "${sampleId}" - mv megahit/options.json megahit/${sampleId}.params.txt + megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit --out-prefix "${meta.id}" + mv megahit/options.json megahit/${meta.id}.params.txt rm -r megahit/intermediate_contigs """ } process HIFIASM_META { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_HIFI' input: - tuple val(sampleId), path(reads) + tuple val(meta), 
path(reads) output: - tuple val(sampleId), path("hifiasm-meta/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("hifiasm-meta/${sampleId}.log"), path("hifiasm-meta/${sampleId}.params.txt"), emit: report + tuple val(meta), path("hifiasm-meta/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("hifiasm-meta/${meta.id}.log"), path("hifiasm-meta/${meta.id}.params.txt"), emit: report script: """ mkdir hifiasm-meta - hifiasm_meta -t ${task.cpus} -o ${sampleId} $reads 2> hifiasm-meta/${sampleId}.log + hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm-meta/${meta.id}.log # gfa to fasta format - awk '/^S/{print ">"\$2"\\n"\$3}' ${sampleId}.p_ctg.gfa | fold > hifiasm-meta/${sampleId}.contigs.fa + awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm-meta/${meta.id}.contigs.fa - mv ${sampleId}.cmd hifiasm-meta/${sampleId}.params.txt + mv ${meta.id}.cmd hifiasm-meta/${meta.id}.params.txt """ } process METAFLYE { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_HIFI' input: - tuple val(sampleId), path(reads) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("metaflye/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("metaflye/${sampleId}.log"), path("metaflye/${sampleId}.params.json"), emit: report + tuple val(meta), path("metaflye/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("metaflye/${meta.id}.log"), path("metaflye/${meta.id}.params.json"), emit: report script: """ mkdir metaflye - flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${sampleId}.log + flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${meta.id}.log - mv metaflye/assembly.fasta metaflye/${sampleId}.contigs.fa - mv metaflye/params.json metaflye/${sampleId}.params.json + mv metaflye/assembly.fasta metaflye/${meta.id}.contigs.fa + mv metaflye/params.json metaflye/${meta.id}.params.json 
""" } diff --git a/modules/assign_taxonomy.nf b/modules/assign_taxonomy.nf index b4b414d13e5e1baf6687a9d99faa0fd8b2ea2739..b7b840317d02c96de1e7634b77b86db31dce5b62 100644 --- a/modules/assign_taxonomy.nf +++ b/modules/assign_taxonomy.nf @@ -1,25 +1,25 @@ process ASSIGN_TAXONOMY { - tag "${sampleId}" - publishDir "${params.outdir}/07_taxo_affi/${sampleId}", mode: 'copy' + tag "${meta.id}" + publishDir "${params.outdir}/07_taxo_affi/${meta.id}", mode: 'copy' label 'PYTHON' input: tuple path(accession2taxid), path(new_taxdump) - tuple val(sampleId), path(m8), path(sam_coverage), path(prot_len) + tuple val(meta), path(m8), path(sam_coverage), path(prot_len) output: - tuple val(sampleId), path("${sampleId}.percontig.tsv"), emit: t_percontig - tuple val(sampleId), path("${sampleId}.pergene.tsv"), emit: t_pergene - tuple val(sampleId), path("${sampleId}.warn.tsv"), emit: t_warn - tuple val(sampleId), path("graphs"), emit: t_graphs - path "${sampleId}_quantif_percontig.tsv", emit: q_all - path "${sampleId}_quantif_percontig_by_superkingdom.tsv", emit: q_superkingdom - path "${sampleId}_quantif_percontig_by_phylum.tsv", emit: q_phylum - path "${sampleId}_quantif_percontig_by_order.tsv", emit: q_order - path "${sampleId}_quantif_percontig_by_class.tsv", emit: q_class - path "${sampleId}_quantif_percontig_by_family.tsv", emit: q_family - path "${sampleId}_quantif_percontig_by_genus.tsv", emit: q_genus - path "${sampleId}_quantif_percontig_by_species.tsv", emit: q_species + tuple val(meta.id), path("${meta.id}.percontig.tsv"), emit: t_percontig + tuple val(meta.id), path("${meta.id}.pergene.tsv"), emit: t_pergene + tuple val(meta.id), path("${meta.id}.warn.tsv"), emit: t_warn + tuple val(meta.id), path("graphs"), emit: t_graphs + path "${meta.id}_quantif_percontig.tsv", emit: q_all + path "${meta.id}_quantif_percontig_by_superkingdom.tsv", emit: q_superkingdom + path "${meta.id}_quantif_percontig_by_phylum.tsv", emit: q_phylum + path 
"${meta.id}_quantif_percontig_by_order.tsv", emit: q_order + path "${meta.id}_quantif_percontig_by_class.tsv", emit: q_class + path "${meta.id}_quantif_percontig_by_family.tsv", emit: q_family + path "${meta.id}_quantif_percontig_by_genus.tsv", emit: q_genus + path "${meta.id}_quantif_percontig_by_species.tsv", emit: q_species path "top_taxons_per_contig.tsv", emit: top_taxon_file script: @@ -38,9 +38,9 @@ process ASSIGN_TAXONOMY { aln2taxaffi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \ - -o ${sampleId} -b ${m8} --keep_only_best_aln \ + -o ${meta.id} -b ${m8} --keep_only_best_aln \ --query_length_file ${prot_len} -v --write_top_taxons - merge_contig_quantif_perlineage.py -c ${sampleId}.percontig.tsv -s ${sam_coverage} -o ${sampleId}_quantif_percontig + merge_contig_quantif_perlineage.py -c ${meta.id}.percontig.tsv -s ${sam_coverage} -o ${meta.id}_quantif_percontig new_taxdump_original=$new_taxdump if [ "\${new_taxdump_original#*.}" == "tar.gz" ] diff --git a/modules/best_hits.nf b/modules/best_hits.nf index 91660cc9af7a74806d592973873a28371fe7dfff..7620953929ef1f6f6397dfd829e15a4f6ec904cd 100644 --- a/modules/best_hits.nf +++ b/modules/best_hits.nf @@ -2,13 +2,13 @@ process BEST_HITS { publishDir "${params.outdir}/06_func_annot/06_3_functional_annotation", mode: 'copy' input: - tuple val(sampleId), path(m8) + tuple val(meta), path(m8) output: - path "${sampleId}.best_hit", emit: best_hits + path "${meta.id}.best_hit", emit: best_hits script: """ - filter_diamond_hits.py -o ${sampleId}.best_hit ${m8} + filter_diamond_hits.py -o ${meta.id}.best_hit ${m8} """ } \ No newline at end of file diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf index 921fe4e54efed00c19f35ba256943400f4aa2682..af6913c1a1073752f685e8c843e0a0b847efb057 100644 --- a/modules/cd_hit.nf +++ b/modules/cd_hit.nf @@ -1,20 +1,20 @@ process INDIVIDUAL_CD_HIT { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy' label 'CD_HIT' 
input: - tuple val(sampleId), path(ffn) + tuple val(meta), path(ffn) val pct_id output: - path("${sampleId}.cd-hit-est.${pct_id}.fasta"), emit: clstr_fasta - path("${sampleId}.cd-hit-est.${pct_id}.table_cluster_contigs.txt"), emit: clstr_table + path("${meta.id}.cd-hit-est.${pct_id}.fasta"), emit: clstr_fasta + path("${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt"), emit: clstr_table script: """ - cd-hit-est -c ${pct_id} -i ${ffn} -o ${sampleId}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150 - cat ${sampleId}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${sampleId}.cd-hit-est.${pct_id}.table_cluster_contigs.txt + cd-hit-est -c ${pct_id} -i ${ffn} -o ${meta.id}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150 + cat ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt """ } @@ -48,7 +48,7 @@ process GLOBAL_CD_HIT { workflow CD_HIT { take: -ch_assembly // channel: [ val(sampleid), path(assemblyfasta) ] +ch_assembly // channel: [ val(meta.id), path(assemblyfasta) ] ch_percentage_identity // channel: val main: diff --git a/modules/cutadapt.nf b/modules/cutadapt.nf index 82e6e54b810583d6a47a178265ecf704597a384f..6d5bda00a7f280a2f692e71ceef9fd532a9de37c 100644 --- a/modules/cutadapt.nf +++ b/modules/cutadapt.nf @@ -1,26 +1,26 @@ process CUTADAPT { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_cutadapt.log' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) val adapter1 val adapter2 output: - tuple val(sampleId), path("*${sampleId}*_R1.fastq.gz"), path("*${sampleId}*_R2.fastq.gz"), emit: reads - path "${sampleId}_cutadapt.log", emit: report + tuple val(meta), path("*${meta.id}*.fastq.gz"), emit: reads 
+ path "${meta.id}_cutadapt.log", emit: report script: if (!params.use_sickle & params.skip_host_filter) { // output are final cleaned paths - output_paths = "-o cleaned_${sampleId}_R1.fastq.gz -p cleaned_${sampleId}_R2.fastq.gz" + output_paths = "-o cleaned_${meta.id}_R1.fastq.gz -p cleaned_${meta.id}_R2.fastq.gz" } else { // tempory paths not saved in publish dir - output_paths = "-o ${sampleId}_cutadapt_R1.fastq.gz -p ${sampleId}_cutadapt_R2.fastq.gz" + output_paths = "-o ${meta.id}_cutadapt_R1.fastq.gz -p ${meta.id}_cutadapt_R2.fastq.gz" } if (!params.use_sickle){ quality_trim = "-q 20,20" @@ -29,6 +29,6 @@ process CUTADAPT { } """ cutadapt -a $adapter1 -A $adapter2 $output_paths -m 36 --trim-n -q 20,20 --max-n 0 \ - --cores=${task.cpus} ${read1} ${read2} > ${sampleId}_cutadapt.log + --cores=${task.cpus} ${reads[0]} ${reads[1]} > ${meta.id}_cutadapt.log """ -} \ No newline at end of file +} diff --git a/modules/diamond.nf b/modules/diamond.nf index 13b4523fb5ace23944c12464590178537fa7dab0..308165573941f378f72a389634240eec008b1ea5 100644 --- a/modules/diamond.nf +++ b/modules/diamond.nf @@ -1,22 +1,22 @@ process DIAMOND { - publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$sampleId", mode: 'copy' - tag "${sampleId}" + publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$meta.id", mode: 'copy' + tag "${meta.id}" input: - tuple val(sampleId), path(faa) - val diamond_bank + tuple val(meta), path(faa) + path diamond_bank output: - tuple val(sampleId), path("${sampleId}_aln_diamond.m8"), emit: m8 + tuple val(meta), path("${meta.id}_aln_diamond.m8"), emit: m8 script: fmt="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen stitle" fmt_tab=fmt.replaceAll(" ","\t") """ echo "$fmt_tab" > head.m8 - diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${faa} -o ${sampleId}_aln_diamond.nohead.m8 -f 6 $fmt - cat head.m8 ${sampleId}_aln_diamond.nohead.m8 > ${sampleId}_aln_diamond.m8 - rm 
${sampleId}_aln_diamond.nohead.m8 + diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${faa} -o ${meta.id}_aln_diamond.nohead.m8 -f 6 $fmt + cat head.m8 ${meta.id}_aln_diamond.nohead.m8 > ${meta.id}_aln_diamond.m8 + rm ${meta.id}_aln_diamond.nohead.m8 rm head.m8 """ } diff --git a/modules/eggnog_mapper.nf b/modules/eggnog_mapper.nf index cee277ef83c26910d3448eba7b0903cf73aa4d48..c3521ee113877f9494a3bef5fabe6aea35ad6746 100644 --- a/modules/eggnog_mapper.nf +++ b/modules/eggnog_mapper.nf @@ -1,20 +1,20 @@ process EGGNOG_MAPPER { publishDir "${params.outdir}/06_func_annot/06_3_functional_annotation", mode: 'copy' - tag "${sampleId}" + tag "${meta.id}" label 'EGGNOG' input: - tuple val(sampleId), path(faa) + tuple val(meta), path(faa) path db output: - path "${sampleId}_diamond_one2one.emapper.seed_orthologs", emit: seed - path "${sampleId}_diamond_one2one.emapper.annotations", emit: annot + path "${meta.id}_diamond_one2one.emapper.seed_orthologs", emit: seed + path "${meta.id}_diamond_one2one.emapper.annotations", emit: annot path 'v_eggnogmapper.txt', emit: version script: """ - /eggnog-mapper-2.0.4-rf1/emapper.py -i ${faa} --output ${sampleId}_diamond_one2one -m diamond --cpu ${task.cpus} --data_dir ${db} --target_orthologs one2one + /eggnog-mapper-2.0.4-rf1/emapper.py -i ${faa} --output ${meta.id}_diamond_one2one -m diamond --cpu ${task.cpus} --data_dir ${db} --target_orthologs one2one /eggnog-mapper-2.0.4-rf1/emapper.py -v &> v_eggnogmapper.txt """ } \ No newline at end of file diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 2e06c235281f144752671de0e785adbf52d37ffa..f215148a6b12f9b6189612c697814e998f1a9e15 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -1,55 +1,26 @@ -process FASTQC_RAW { - tag "${sampleId}" +process FASTQC { + tag "${meta.id}" label 'FASTQC' - publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_raw", mode: 'copy' + publishDir "${params.outdir}/01_clean_qc/01_2_qc/", mode: 'copy' input: - tuple val(sampleId), 
path(read1), path(read2) + tuple val(meta), path(reads) output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html + path "$outdir/${meta.id}/*.zip", emit: zip + path "$outdir/${meta.id}/*.html", emit: html script: + if (reads[0]==~/cleaned.*/){ outdir="fastqc_cleaned" } else { outdir="fastqc_raw" } + if (meta.type=="SR"){ + fastq = "${reads[0]} ${reads[1]}" + option = "--nogroup" + } else if (meta.type=="HIFI"){ + fastq = "${reads}" + option = "" } """ - mkdir ${sampleId} ; fastqc --nogroup --quiet -o ${sampleId} --threads ${task.cpus} ${read1} ${read2} - """ -} - -process FASTQC_CLEANED { - tag "${sampleId}" - label 'FASTQC' - publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_cleaned", mode: 'copy' - - input: - tuple val(sampleId), path(read1), path(read2) - - output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html - - script: - """ - mkdir ${sampleId}; fastqc --nogroup --quiet -o ${sampleId} --threads ${task.cpus} ${read1} ${read2} - """ -} - -process FASTQC_HIFI { - tag "${sampleId}" - label 'FASTQC' - publishDir "${params.outdir}/${publishdir}", mode: 'copy' - - input: - tuple val(sampleId), path(read), val(publishdir) - - output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html - - script: - """ - echo ${params.outdir}/${publishdir} - mkdir ${sampleId}; fastqc --quiet -o ${sampleId} --threads ${task.cpus} ${read} + mkdir -p $outdir/${meta.id} + fastqc $option --quiet -o $outdir/${meta.id} --threads ${task.cpus} $fastq """ } diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf index 9afb6d8c5191beec580b06fc8440f790e11222fc..96366f7b78987a33f8f1657c436da36aae6c929e 100644 --- a/modules/feature_counts.nf +++ b/modules/feature_counts.nf @@ -1,20 +1,20 @@ // Quantification of reads on each gene in each sample. 
process FEATURE_COUNTS { - tag "${sampleId}" + tag "${meta.id}" label 'QUANTIFICATION' publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy' input: - tuple val(sampleId), file(gff_prokka), file(bam), file(bam_index) + tuple val(meta), file(gff_prokka), file(bam), file(bam_index) output: - path "${sampleId}.featureCounts.tsv", emit: count_table - path "${sampleId}.featureCounts.tsv.summary", emit: summary - path "${sampleId}.featureCounts.stdout" + path "${meta.id}.featureCounts.tsv", emit: count_table + path "${meta.id}.featureCounts.tsv.summary", emit: summary + path "${meta.id}.featureCounts.stdout" script: """ - featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${sampleId}.featureCounts.tsv ${bam} &> ${sampleId}.featureCounts.stdout + featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${meta.id}.featureCounts.tsv ${bam} &> ${meta.id}.featureCounts.stdout """ } @@ -42,8 +42,8 @@ process QUANTIFICATION_TABLE { workflow QUANTIFICATION { take: - ch_gff // channel: [ val(sampleid), path(gff) ] - ch_bam // channel: [ val(sampleid), path(bam), path(bam_index) ] + ch_gff // channel: [ val(meta), path(gff) ] + ch_bam // channel: [ val(meta), path(bam), path(bam_index) ] ch_individual_clstr_table ch_global_clstr_table diff --git a/modules/filtering_cpm.nf b/modules/filtering_cpm.nf new file mode 100644 index 0000000000000000000000000000000000000000..951a2018db7e7aed1f5c0915a8e54e68f7595c6f --- /dev/null +++ b/modules/filtering_cpm.nf @@ -0,0 +1,45 @@ +process CHUNK_ASSEMBLY_FILTER { + label 'ASSEMBLY_FILTER' + + input: + tuple val(meta), path(assembly_file), path(idxstats) + val min_cpm + + output: + tuple val(meta), path("${chunk_name}_select_cpm${min_cpm}.fasta"), emit: chunk_selected + tuple val(meta), path("${chunk_name}_discard_cpm${min_cpm}.fasta"), emit: chunk_discarded + + script: + chunk_name = assembly_file.baseName + """ + Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s 
${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta + """ +} + +process MERGE_ASSEMBLY_FILTER { + label 'ASSEMBLY_FILTER' + + tag "${meta.id}" + publishDir "${params.outdir}/03_filtering/", mode: 'copy' + + input: + tuple val(meta), path(select_fasta) + tuple val(meta), path(discard_fasta) + val min_cpm + + output: + tuple val(meta), path("${meta.id}_select_contigs_cpm${min_cpm}.fasta"), emit: merged_selected + tuple val(meta), path("${meta.id}_discard_contigs_cpm${min_cpm}.fasta"), emit: merged_discarded + + shell: + ''' + echo !{select_fasta} | sed "s/ /\\n/g" | sort > select_list + echo !{discard_fasta} | sed "s/ /\\n/g" | sort > discard_list + + for i in `cat select_list` ; do cat $i >> !{meta.id}_select_contigs_cpm!{min_cpm}.fasta ; done + for j in `cat discard_list` ; do cat $j >> !{meta.id}_discard_contigs_cpm!{min_cpm}.fasta ; done + + rm select_list + rm discard_list + ''' +} \ No newline at end of file diff --git a/modules/host_filter.nf b/modules/host_filter.nf index 3939bc2db0cca7680b9116bc5491d786ca96065b..cb575491ea0ef601e1b7eaf6acf66e1061a133d0 100644 --- a/modules/host_filter.nf +++ b/modules/host_filter.nf @@ -1,5 +1,5 @@ process HOST_FILTER { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: '*.bam' @@ -9,32 +9,32 @@ process HOST_FILTER { else null} input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) path fasta path index output: - tuple val(sampleId), path("cleaned_${sampleId}_R1.fastq.gz"), path("cleaned_${sampleId}_R2.fastq.gz"), emit: reads - path "host_filter_flagstat/${sampleId}.host_filter.flagstat", emit: hf_report - path "${sampleId}.no_filter.flagstat", emit: nf_report + tuple val(meta), path("cleaned_${meta.id}*.fastq.gz"), emit: reads + path 
"host_filter_flagstat/${meta.id}.host_filter.flagstat", emit: hf_report + path "${meta.id}.no_filter.flagstat", emit: nf_report script: """ - bwa-mem2 mem -t ${task.cpus} ${fasta} ${read1} ${read2} > ${sampleId}.bam - samtools view -bhS -f 12 ${sampleId}.bam > ${sampleId}.without_host.bam + bwa-mem2 mem -t ${task.cpus} ${fasta} ${reads[0]} ${reads[1]} > ${meta.id}.bam + samtools view -bhS -f 12 ${meta.id}.bam > ${meta.id}.without_host.bam mkdir host_filter_flagstat - samtools flagstat ${sampleId}.bam > ${sampleId}.no_filter.flagstat - samtools flagstat ${sampleId}.without_host.bam >> host_filter_flagstat/${sampleId}.host_filter.flagstat - samtools sort -n -o ${sampleId}.without_host_sort.bam ${sampleId}.without_host.bam - samtools fastq -N -1 cleaned_${sampleId}_R1.fastq.gz -2 cleaned_${sampleId}_R2.fastq.gz ${sampleId}.without_host_sort.bam - rm ${sampleId}.bam - rm ${sampleId}.without_host.bam - rm ${sampleId}.without_host_sort.bam + samtools flagstat ${meta.id}.bam > ${meta.id}.no_filter.flagstat + samtools flagstat ${meta.id}.without_host.bam >> host_filter_flagstat/${meta.id}.host_filter.flagstat + samtools sort -n -o ${meta.id}.without_host_sort.bam ${meta.id}.without_host.bam + samtools fastq -N -1 cleaned_${meta.id}_R1.fastq.gz -2 cleaned_${meta.id}_R2.fastq.gz ${meta.id}.without_host_sort.bam + rm ${meta.id}.bam + rm ${meta.id}.without_host.bam + rm ${meta.id}.without_host_sort.bam """ } process HOST_FILTER_HIFI { - tag "${sampleId}" + tag "${meta.id}" label "MINIMAP2" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' @@ -45,27 +45,27 @@ process HOST_FILTER_HIFI { else null} input: - tuple val(sampleId), path(reads) + tuple val(meta), path(reads) path fasta output: - tuple val(sampleId), path("cleaned_${sampleId}.fastq.gz"), emit: reads - path "host_filter_flagstat/${sampleId}.host_filter.flagstat", emit: hf_report - path "${sampleId}.no_filter.flagstat", emit: nf_report + tuple val(meta), 
path("cleaned_${meta.id}.fastq.gz"), emit: reads + path "host_filter_flagstat/${meta.id}.host_filter.flagstat", emit: hf_report + path "${meta.id}.no_filter.flagstat", emit: nf_report script: """ minimap2 -ax map-hifi -t ${task.cpus} ${fasta} ${reads} | samtools sort -@ ${task.cpus} -o ${sampleId}.bam - samtools view -@ ${task.cpus} -bh -f 4 ${sampleId}.bam > ${sampleId}.without_host.bam + samtools view -@ ${task.cpus} -bh -f 4 ${meta.id}.bam > ${meta.id}.without_host.bam mkdir host_filter_flagstat - samtools flagstat ${sampleId}.bam -@ ${task.cpus} > ${sampleId}.no_filter.flagstat - samtools flagstat ${sampleId}.without_host.bam -@ ${task.cpus} > host_filter_flagstat/${sampleId}.host_filter.flagstat - samtools fastq -@ ${task.cpus} ${sampleId}.without_host.bam | gzip > cleaned_${sampleId}.fastq.gz + samtools flagstat ${meta.id}.bam -@ ${task.cpus} > ${meta.id}.no_filter.flagstat + samtools flagstat ${meta.id}.without_host.bam -@ ${task.cpus} > host_filter_flagstat/${meta.id}.host_filter.flagstat + samtools fastq -@ ${task.cpus} ${meta.id}.without_host.bam | gzip > cleaned_${meta.id}.fastq.gz - rm ${sampleId}.bam - rm ${sampleId}.without_host.bam + rm ${meta.id}.bam + rm ${meta.id}.without_host.bam """ } diff --git a/modules/kaiju.nf b/modules/kaiju.nf index c5db60d35ab6f307a625621cac35aba742904333..a21dd8128169233f234d5b4d6c65b963488beb8e 100644 --- a/modules/kaiju.nf +++ b/modules/kaiju.nf @@ -1,18 +1,18 @@ taxon_levels = "phylum class order family genus species" process KAIJU { - tag "${sampleId}" + tag "${meta.id}" label "KAIJU" publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju.out.gz' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) tuple path(nodes), path(fmi), path(names) output: - path "${sampleId}_kaiju.out.gz", emit: kaiju_result - path "${sampleId}.krona", emit: krona_tab_file + path "${meta.id}_kaiju.out.gz", emit: kaiju_result + path "${meta.id}.krona", emit: 
krona_tab_file path "*.summary_*", emit: k_all path "*.summary_species", emit: k_species path "*.summary_genus", emit: k_genus @@ -23,34 +23,34 @@ process KAIJU { script: """ - kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${read1} -j ${read2} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v + kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads[0]} -j ${reads[1]} -o ${meta.id}_kaiju_MEM_verbose.out -a mem -v - kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u + kaiju2krona -t ${nodes} -n ${names} -i ${meta.id}_kaiju_MEM_verbose.out -o ${meta.id}.krona -u for i in ${taxon_levels} ; do - kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out + kaiju2table -t ${nodes} -n ${names} -r \$i -o ${meta.id}_kaiju_MEM.out.summary_\$i ${meta.id}_kaiju_MEM_verbose.out done - grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz + grep -v U ${meta.id}_kaiju_MEM_verbose.out | gzip > ${meta.id}_kaiju.out.gz - rm ${sampleId}_kaiju_MEM_verbose.out + rm ${meta.id}_kaiju_MEM_verbose.out """ } process KAIJU_HIFI { - tag "${sampleId}" + tag "${meta.id}" label "KAIJU" publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju.out.gz' input: - tuple val(sampleId), path(reads) + tuple val(meta), path(reads) tuple path(nodes), path(fmi), path(names) output: - path "${sampleId}_kaiju.out.gz", emit: kaiju_result - path "${sampleId}.krona", emit: krona_tab_file + path "${meta.id}_kaiju.out.gz", emit: kaiju_result + path "${meta.id}.krona", emit: krona_tab_file path "*.summary_*", emit: k_all path "*.summary_species", emit: k_species path "*.summary_genus", emit: k_genus @@ -61,17 +61,17 @@ process KAIJU_HIFI { script: """ - kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v + kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads} -o 
${meta.id}_kaiju_MEM_verbose.out -a mem -v - kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u + kaiju2krona -t ${nodes} -n ${names} -i ${meta.id}_kaiju_MEM_verbose.out -o ${meta.id}.krona -u for i in ${taxon_levels} ; do - kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out + kaiju2table -t ${nodes} -n ${names} -r \$i -o ${meta.id}_kaiju_MEM.out.summary_\$i ${meta.id}_kaiju_MEM_verbose.out done - grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz - rm ${sampleId}_kaiju_MEM_verbose.out + grep -v U ${meta.id}_kaiju_MEM_verbose.out | gzip > ${meta.id}_kaiju.out.gz + rm ${meta.id}_kaiju_MEM_verbose.out """ } diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf new file mode 100644 index 0000000000000000000000000000000000000000..888e67e38f694fc399967d36c7d810f3d6ca1aba --- /dev/null +++ b/modules/merge_fastq.nf @@ -0,0 +1,23 @@ +process MERGE_FASTQ { + tag "$meta.id" + publishDir "${params.outdir}/02_assembly/merged_fastq", mode: 'copy' + label 'MERGE_FASTQ' + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + + script: + def readList = reads.collect{ it.toString() } + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? 
read2 : read1 ) << v } + """ + zcat ${read1.join(' ')} > ${meta.sample}_1.merged.fastq + zcat ${read2.join(' ')} > ${meta.sample}_2.merged.fastq + gzip ${meta.sample}_1.merged.fastq + gzip ${meta.sample}_2.merged.fastq + """ +} \ No newline at end of file diff --git a/modules/metaquast.nf b/modules/metaquast.nf index 5fbae9679a81fd3a1068b114c0ad8371b6476490..0fa75f3c124e71448e0c4ace1dc0b20c3a7f106b 100644 --- a/modules/metaquast.nf +++ b/modules/metaquast.nf @@ -1,15 +1,15 @@ process QUAST { - tag "${sampleId}" + tag "${meta.id}" label 'QUAST' publishDir "${params.outdir}", mode: 'copy' input: - tuple val(sampleId), path(assembly) + tuple val(meta), path(assembly) val step output: - path "${outdirModule}/${sampleId}/*", emit: all - path "${{outdirModule}}/${sampleId}/report.tsv", emit: report + path "${outdirModule}/${meta.id}/*", emit: all + path "${outdirModule}/${meta.id}/report.tsv", emit: report script: @@ -22,8 +22,8 @@ process QUAST { } } """ - mkdir -p $outdirModule/${sampleId}/ - touch $outdirModule/${sampleId}/report.tsv - metaquast.py --threads ${task.cpus} --rna-finding --max-ref-number 0 --min-contig 0 ${assembly} -o $outdirModule/${sampleId} --labels ${sampleId} + mkdir -p $outdirModule/${meta.id}/ + touch $outdirModule/${meta.id}/report.tsv + metaquast.py --threads ${task.cpus} --rna-finding --max-ref-number 0 --min-contig 0 ${assembly} -o $outdirModule/${meta.id} --labels ${meta.id} """ } diff --git a/modules/prokka.nf b/modules/prokka.nf index 984e972e11f66d5c97dc4258cbe84a2c34c2e59f..d6d0ca466771c5c08279f8ef208de6b2995c3d5c 100644 --- a/modules/prokka.nf +++ b/modules/prokka.nf @@ -1,45 +1,45 @@ process PROKKA { - tag "${sampleId}" + tag "${meta.id}" input: - tuple val(sampleId), file(assembly_file) + tuple val(meta), file(assembly_file) output: - tuple val(sampleId), path("PROKKA_${sampleId}"), emit: prokka_results - path "PROKKA_${sampleId}/${sampleId}.txt", emit: report + tuple val(meta), path("PROKKA_${meta.id}"), emit: prokka_results + path
"PROKKA_${meta.id}/${meta.id}.txt", emit: report script: """ - prokka --metagenome --noanno --rawproduct --outdir PROKKA_${sampleId} --prefix ${sampleId} ${assembly_file} --centre X --compliant --cpus ${task.cpus} - rm PROKKA_${sampleId}/*.gbk - gt gff3validator PROKKA_${sampleId}/${sampleId}.gff + prokka --metagenome --noanno --rawproduct --outdir PROKKA_${meta.id} --prefix ${meta.id} ${assembly_file} --centre X --compliant --cpus ${task.cpus} + rm PROKKA_${meta.id}/*.gbk + gt gff3validator PROKKA_${meta.id}/${meta.id}.gff """ } process RENAME_CONTIGS_AND_GENES { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/04_structural_annot", mode: 'copy' label 'PYTHON' input: - tuple val(sampleId), path(prokka_results) + tuple val(meta), path(prokka_results) output: - tuple val(sampleId), path("${sampleId}.annotated.fna"), emit: fna - tuple val(sampleId), path("${sampleId}.annotated.ffn"), emit: ffn - tuple val(sampleId), path("${sampleId}.annotated.faa"), emit: faa - tuple val(sampleId), path("${sampleId}.annotated.gff"), emit: gff - tuple val(sampleId), path("${sampleId}_prot.len"), emit: prot_length + tuple val(meta), path("${meta.id}.annotated.fna"), emit: fna + tuple val(meta), path("${meta.id}.annotated.ffn"), emit: ffn + tuple val(meta), path("${meta.id}.annotated.faa"), emit: faa + tuple val(meta), path("${meta.id}.annotated.gff"), emit: gff + tuple val(meta), path("${meta.id}_prot.len"), emit: prot_length script: """ - grep "^gnl" ${prokka_results}/${sampleId}.gff > ${sampleId}_only_gnl.gff + grep "^gnl" ${prokka_results}/${meta.id}.gff > ${meta.id}_only_gnl.gff - Rename_contigs_and_genes.py -f ${sampleId}_only_gnl.gff -faa ${prokka_results}/${sampleId}.faa \ - -ffn ${prokka_results}/${sampleId}.ffn -fna ${prokka_results}/${sampleId}.fna \ - -p ${sampleId} -oGFF ${sampleId}.annotated.gff -oFAA ${sampleId}.annotated.faa \ - -oFFN ${sampleId}.annotated.ffn -oFNA ${sampleId}.annotated.fna + Rename_contigs_and_genes.py -f ${meta.id}_only_gnl.gff 
-faa ${prokka_results}/${meta.id}.faa \ + -ffn ${prokka_results}/${meta.id}.ffn -fna ${prokka_results}/${meta.id}.fna \ + -p ${meta.id} -oGFF ${meta.id}.annotated.gff -oFAA ${meta.id}.annotated.faa \ + -oFFN ${meta.id}.annotated.ffn -oFNA ${meta.id}.annotated.fna - samtools faidx ${sampleId}.annotated.faa; cut -f 1,2 ${sampleId}.annotated.faa.fai > ${sampleId}_prot.len + samtools faidx ${meta.id}.annotated.faa; cut -f 1,2 ${meta.id}.annotated.faa.fai > ${meta.id}_prot.len """ } diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf index deddbc2eb1125d4b1ca86b5b1024375f062d66e6..559b7496651b31ba5f27a5c54c8812fd0370771a 100644 --- a/modules/read_alignment.nf +++ b/modules/read_alignment.nf @@ -1,80 +1,80 @@ process BWA_MEM { - tag "${sampleId}" - publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/${sampleId}", mode: 'copy' + tag "${meta.id}" + publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/${meta.id}", mode: 'copy' input: - tuple val(sampleId), path(fna), path(read1), path(read2) + tuple val(meta), path(fna), path(reads) output: - tuple val(sampleId), path("${sampleId}.sort.bam"), path("${sampleId}.sort.bam.bai"), emit: bam - tuple val(sampleId), path("${sampleId}_coverage.tsv"), emit: sam_coverage - path "${sampleId}*" + tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta), path("${meta.id}_coverage.tsv"), emit: sam_coverage + path "${meta.id}*" script: """ bwa-mem2 index ${fna} -p ${fna} - bwa-mem2 mem -t ${task.cpus} ${fna} ${read1} ${read2} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${sampleId}.sort.bam - samtools index -@ ${task.cpus} ${sampleId}.sort.bam + bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.sort.bam + samtools index -@ ${task.cpus} ${meta.id}.sort.bam - samtools flagstat -@ ${task.cpus} 
${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.sort.bam.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.sort.bam.idxstats """ } process MINIMAP2 { - tag "${sampleId}" + tag "${meta.id}" label 'MINIMAP2' - publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/$sampleId", mode: 'copy' + publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/$meta.id", mode: 'copy' input: - tuple val(sampleId), path(fna_prokka), path(reads) + tuple val(meta), path(fna_prokka), path(reads) output: - tuple val(sampleId), path("${sampleId}.sort.bam"), path("${sampleId}.sort.bam.bai"), emit: bam - tuple val(sampleId), path("${sampleId}_coverage.tsv"), emit: sam_coverage - path "${sampleId}*" + tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta), path("${meta.id}_coverage.tsv"), emit: sam_coverage + path "${meta.id}*" script: """ # align reads to contigs, keep only primary aln and sort resulting bam - minimap2 -t ${task.cpus} -ax map-hifi $fna_prokka $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${sampleId}.sort.bam + minimap2 -t ${task.cpus} -ax map-hifi $fna_prokka $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.sort.bam - samtools index ${sampleId}.sort.bam -@ ${task.cpus} - samtools flagstat -@ ${task.cpus} ${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools index ${meta.id}.sort.bam -@ ${task.cpus} + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.sort.bam.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.sort.bam.idxstats """ } process MINIMAP2_FILTERING { - tag
"${sampleId}" + tag "${meta.id}" label 'MINIMAP2' publishDir "${params.outdir}/02_assembly/logs/", mode: 'copy' input: - tuple val(sampleId), path(assembly), path(reads) + tuple val(meta), path(assembly), path(reads) output: - tuple val(sampleId), path("${sampleId}.idxstats"), emit: sam_idxstat - path "${sampleId}.flagstat", emit: sam_flagstat - path "${sampleId}*" + tuple val(meta), path("${meta.id}.idxstats"), emit: sam_idxstat + path "${meta.id}.flagstat", emit: sam_flagstat + path "${meta.id}*" script: """ # align reads to contigs, keep only primary aln and sort resulting bam minimap2 -t ${task.cpus} -ax map-hifi $assembly $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${sampleId}.sort.bam - samtools index ${sampleId}.sort.bam -@ ${task.cpus} - samtools flagstat -@ ${task.cpus} ${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools index ${meta.id}.sort.bam -@ ${task.cpus} + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.idxstats - rm ${sampleId}.sort.bam* + rm ${meta.id}.sort.bam* """ } diff --git a/modules/reads_deduplication.nf b/modules/reads_deduplication.nf index 0d907ad643f733965bcfb0a27b2cd9728917c792..83d2b56ebf3d39079978ee38863c50cc5cc72f00 100644 --- a/modules/reads_deduplication.nf +++ b/modules/reads_deduplication.nf @@ -1,34 +1,34 @@ process READS_DEDUPLICATION { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy', pattern: '*.fastq.gz' publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.idxstats' publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.flagstat' input: - tuple val(sampleId), path(assembly), path(read1), path(read2) + tuple val(meta), 
path(assembly), path(reads) output: - tuple val(sampleId), path("${sampleId}_R1_dedup.fastq.gz"), path("${sampleId}_R2_dedup.fastq.gz"), emit: dedup - tuple val(sampleId), path("${sampleId}.count_reads_on_contigs.idxstats"), emit: idxstats - path "${sampleId}.count_reads_on_contigs.flagstat", emit: flagstat + tuple val(meta), path("${meta.id}*_dedup.fastq.gz"), emit: dedup + tuple val(meta), path("${meta.id}.count_reads_on_contigs.idxstats"), emit: idxstats + path "${meta.id}.count_reads_on_contigs.flagstat", emit: flagstat script: """ mkdir logs bwa-mem2 index ${assembly} -p ${assembly} - bwa-mem2 mem ${assembly} ${read1} ${read2} | samtools view -bS - | samtools sort -n -o ${sampleId}.sort.bam - - samtools fixmate -m ${sampleId}.sort.bam ${sampleId}.fixmate.bam - samtools sort -o ${sampleId}.fixmate.positionsort.bam ${sampleId}.fixmate.bam - samtools markdup -r -S -s -f ${sampleId}.stats ${sampleId}.fixmate.positionsort.bam ${sampleId}.filtered.bam - samtools index ${sampleId}.filtered.bam - samtools idxstats ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.idxstats - samtools flagstat ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.flagstat - samtools sort -n -o ${sampleId}.filtered.sort.bam ${sampleId}.filtered.bam - samtools fastq -N -1 ${sampleId}_R1_dedup.fastq.gz -2 ${sampleId}_R2_dedup.fastq.gz ${sampleId}.filtered.sort.bam - rm ${sampleId}.sort.bam - rm ${sampleId}.fixmate.bam - rm ${sampleId}.fixmate.positionsort.bam - rm ${sampleId}.filtered.bam - rm ${sampleId}.filtered.sort.bam + bwa-mem2 mem ${assembly} ${reads[0]} ${reads[1]} | samtools view -bS - | samtools sort -n -o ${meta.id}.sort.bam - + samtools fixmate -m ${meta.id}.sort.bam ${meta.id}.fixmate.bam + samtools sort -o ${meta.id}.fixmate.positionsort.bam ${meta.id}.fixmate.bam + samtools markdup -r -S -s -f ${meta.id}.stats ${meta.id}.fixmate.positionsort.bam ${meta.id}.filtered.bam + samtools index ${meta.id}.filtered.bam + samtools idxstats ${meta.id}.filtered.bam > 
${meta.id}.count_reads_on_contigs.idxstats + samtools flagstat ${meta.id}.filtered.bam > ${meta.id}.count_reads_on_contigs.flagstat + samtools sort -n -o ${meta.id}.filtered.sort.bam ${meta.id}.filtered.bam + samtools fastq -N -1 ${meta.id}_R1_dedup.fastq.gz -2 ${meta.id}_R2_dedup.fastq.gz ${meta.id}.filtered.sort.bam + rm ${meta.id}.sort.bam + rm ${meta.id}.fixmate.bam + rm ${meta.id}.fixmate.positionsort.bam + rm ${meta.id}.filtered.bam + rm ${meta.id}.filtered.sort.bam """ } \ No newline at end of file diff --git a/modules/sickle.nf b/modules/sickle.nf index d4d010bbc59ee7b08bef37640ae5a0b2a1067f70..4eaf9eb6a034956c8851138f41a7b00499ebb31a 100644 --- a/modules/sickle.nf +++ b/modules/sickle.nf @@ -1,31 +1,31 @@ process SICKLE { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_sickle.log' input: - tuple val(sampleId), path(read1), path(read2), val(paired) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("*${sampleId}*_R1.fastq.gz"), path("*${sampleId}*_R2.fastq.gz"), emit: reads - path "${sampleId}_single_sickle.fastq.gz", emit: single - path "${sampleId}_sickle.log", emit: report + tuple val(meta), path("*${meta.id}*.fastq.gz"), emit: reads + path "${meta.id}_single_sickle.fastq.gz", emit: single + path "${meta.id}_sickle.log", emit: report script: - mode = paired ? 
'pe' : 'se' + if (params.skip_host_filter) { // output are final cleaned files - options = "-o cleaned_${sampleId}_R1.fastq.gz -p cleaned_${sampleId}_R2.fastq.gz" + options = "-o cleaned_${meta.id}_R1.fastq.gz -p cleaned_${meta.id}_R2.fastq.gz" } else { //tempory files not saved in publish dir - options = "-o ${sampleId}_sickle_R1.fastq.gz -p ${sampleId}_sickle_R2.fastq.gz" + options = "-o ${meta.id}_sickle_R1.fastq.gz -p ${meta.id}_sickle_R2.fastq.gz" } options += " -t " + params.quality_type """ - sickle ${mode} -f ${read1} -r ${read2} $options \ - -s ${sampleId}_single_sickle.fastq.gz -g > ${sampleId}_sickle.log + sickle 'pe' -f ${reads[0]} -r ${reads[1]} $options \ + -s ${meta.id}_single_sickle.fastq.gz -g > ${meta.id}_sickle.log """ } \ No newline at end of file diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index 78e362eb7c14d9eed61b7cc78a585e387a74085a..ca8eadd148f105d2ddf2f222957a345d1d9264d5 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -85,7 +85,7 @@ workflow DATABASES { ch_diamond = Channel.empty() if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) ) { - ch_diamond = Channel.fromPath(file(params.diamond_bank)) + ch_diamond =Channel.value(file(params.diamond_bank)) } GET_DB_VERSIONS( @@ -103,6 +103,7 @@ workflow DATABASES { kaiju_db = ch_kaiju_db eggnog = ch_eggnog taxonomy = ch_taxonomy.first() + diamond = ch_diamond } process INDEX_HOST { diff --git a/subworkflows/01_clean_qc.nf b/subworkflows/01_clean_qc.nf index 764987fb04433a27d8211057a5183fa5dc564b80..044d5325d15ccb1c6292e127c0d0a626141cd50f 100644 --- a/subworkflows/01_clean_qc.nf +++ b/subworkflows/01_clean_qc.nf @@ -1,13 +1,12 @@ include { CUTADAPT } from '../modules/cutadapt' include { SICKLE } from '../modules/sickle' include { HOST_FILTER; HOST_FILTER_HIFI } from '../modules/host_filter' -include { FASTQC_RAW; FASTQC_CLEANED; FASTQC_HIFI; FASTQC_HIFI as 
FASTQC_HIFI_RAW } from '../modules/fastqc' +include { FASTQC as FASTQC_RAW; FASTQC as FASTQC_CLEANED; FASTQC as FASTQC_HIFI; FASTQC as FASTQC_HIFI_RAW } from '../modules/fastqc' include { KAIJU_AND_MERGE; KAIJU_AND_MERGE_FOR_HIFI } from '../modules/kaiju' workflow STEP_01_CLEAN_QC { take: raw_reads - paired host_fasta host_index kaiju_db @@ -25,10 +24,8 @@ workflow STEP_01_CLEAN_QC { ch_cutadapt_report = CUTADAPT.out.report if (params.use_sickle) { - ch_sickle_reads = ch_intermediate_reads.join(paired) - // ch_sickle_reads.view{ it -> "${it}" } SICKLE ( - ch_sickle_reads + ch_intermediate_reads ) ch_intermediate_reads = SICKLE.out.reads ch_sickle_report = SICKLE.out.report @@ -76,7 +73,6 @@ workflow STEP_01_CLEAN_QC { emit: preprocessed_reads = ch_preprocessed_reads - cutadapt_report = ch_cutadapt_report sickle_report = ch_sickle_report before_filter_report = ch_before_filter_report @@ -94,11 +90,8 @@ workflow STEP_01_CLEAN_QC_HIFI { kaiju_db main: - outdir_raw = '01_clean_qc/01_2_qc/fastqc_raw' - raw_reads.map {it -> [it[0], it[1], outdir_raw]} - .set { ch_raw_reads_qc } - - FASTQC_HIFI_RAW(ch_raw_reads_qc) + + FASTQC_HIFI_RAW(raw_reads) ch_fastqc_raw_report = FASTQC_HIFI_RAW.out.zip if (!params.skip_host_filter) { @@ -110,11 +103,7 @@ workflow STEP_01_CLEAN_QC_HIFI { ch_before_filter_report = HOST_FILTER_HIFI.out.nf_report ch_after_filter_report = HOST_FILTER_HIFI.out.hf_report - - outdir_cleaned = '01_clean_qc/01_2_qc/fastqc_cleaned' - ch_preprocessed_reads.map {it -> [it[0], it[1], outdir_cleaned]} - .set { ch_preprocessed_reads_qc } - FASTQC_HIFI(ch_preprocessed_reads_qc) + FASTQC_HIFI(ch_preprocessed_reads) ch_fastqc_clean_report = FASTQC_HIFI.out.zip } diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 8658d580c4e57355a868d0e86c94609f70a04332..355ebc3c99a768bd71d6a9a3654916f07ec343b1 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -16,21 +16,18 @@ workflow STEP_02_ASSEMBLY { if(assembly_tool == 
'metaspades') { METASPADES(preprocessed_reads) ch_assembly = METASPADES.out.assembly - } + } else if(assembly_tool == 'megahit') { - MEGAHIT(preprocessed_reads) - ch_assembly = MEGAHIT.out.assembly - } + MEGAHIT(preprocessed_reads) + ch_assembly = MEGAHIT.out.assembly + } else { exit 1, "Invalid short read assembly parameter: ${assembly_tool}" } } - - // ch_filtered = Channel.value(false) ASSEMBLY_QUAST( ch_assembly, 'ASSEMBLY' ) - ch_assembly_report = ASSEMBLY_QUAST.out.report ch_assembly_and_preprocessed = ch_assembly.join(preprocessed_reads, remainder: true) diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index 5d04839cf449fb6611ead0a583326a48e4cfcd39..5c6d81ed2d8ae1088447d5a09120a39a495f538f 100644 --- a/subworkflows/03_filtering.nf +++ b/subworkflows/03_filtering.nf @@ -1,50 +1,8 @@ -process CHUNK_ASSEMBLY_FILTER { - label 'ASSEMBLY_FILTER' - - input: - tuple val(sampleId), path(assembly_file), path(idxstats) - val min_cpm +include { CHUNK_ASSEMBLY_FILTER} from '../modules/filtering_cpm.nf' +include { MERGE_ASSEMBLY_FILTER} from '../modules/filtering_cpm.nf' +include { QUAST as FILTERED_QUAST } from '../modules/metaquast' - output: - tuple val(sampleId), path("${chunk_name}_select_cpm${min_cpm}.fasta"), emit: chunk_selected - tuple val(sampleId), path("${chunk_name}_discard_cpm${min_cpm}.fasta"), emit: chunk_discarded - - script: - chunk_name = assembly_file.baseName - """ - Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta - """ -} - -process MERGE_ASSEMBLY_FILTER { - label 'ASSEMBLY_FILTER' - - tag "${sampleId}" - publishDir "${params.outdir}/03_filtering/", mode: 'copy' - - input: - tuple val(sampleId), path(select_fasta) - tuple val(sampleId), path(discard_fasta) - val min_cpm - - output: - tuple val(sampleId), path("${sampleId}_select_contigs_cpm${min_cpm}.fasta"), emit: merged_selected - tuple val(sampleId), 
path("${sampleId}_discard_contigs_cpm${min_cpm}.fasta"), emit: merged_discarded - - shell: - ''' - echo !{select_fasta} | sed "s/ /\\n/g" | sort > select_list - echo !{discard_fasta} | sed "s/ /\\n/g" | sort > discard_list - - for i in `cat select_list` ; do cat $i >> !{sampleId}_select_contigs_cpm!{min_cpm}.fasta ; done - for j in `cat discard_list` ; do cat $j >> !{sampleId}_discard_contigs_cpm!{min_cpm}.fasta ; done - - rm select_list - rm discard_list - ''' -} - -workflow ASSEMBLY_FILTER { +workflow STEP_03_ASSEMBLY_FILTER { take: assembly_and_idxstats min_cpm @@ -72,6 +30,11 @@ workflow ASSEMBLY_FILTER { ) ch_merged_selected = MERGE_ASSEMBLY_FILTER.out.merged_selected + FILTERED_QUAST( ch_merged_selected, 'FILTERING' ) + ch_filtered_report = FILTERED_QUAST.out.report + + emit: selected = ch_merged_selected + report = ch_filtered_report } \ No newline at end of file diff --git a/subworkflows/05_alignment.nf b/subworkflows/05_alignment.nf index 4bd23e10797cebc4bf02a94d65b2589d9d262553..05d2763837dd9e186e33c910fb2b4f841f9c307e 100644 --- a/subworkflows/05_alignment.nf +++ b/subworkflows/05_alignment.nf @@ -5,6 +5,7 @@ workflow STEP_05_ALIGNMENT { take: contigs_and_reads prokka_faa + diamond main: if (params.type == 'SR') { @@ -18,10 +19,9 @@ workflow STEP_05_ALIGNMENT { ch_bam = MINIMAP2.out.bam ch_sam_coverage = MINIMAP2.out.sam_coverage } - DIAMOND ( prokka_faa, - params.diamond_bank + diamond ) ch_m8 = DIAMOND.out.m8 diff --git a/subworkflows/06_functionnal_annot.nf b/subworkflows/06_functionnal_annot.nf index 4d205e2abc0f2d2bb25e10ff86740c847a582460..621dcc415a28cc06cf92ac091f6a3cdbeeb5bd65 100644 --- a/subworkflows/06_functionnal_annot.nf +++ b/subworkflows/06_functionnal_annot.nf @@ -10,11 +10,11 @@ include { FUNCTIONAL_ANNOT_TABLE } from '../modules/functional_annot_table' workflow STEP_06_FUNC_ANNOT { take: - ffn // channel: [ val(sampleid), path(ffn) ] - faa // channel: [ val(sampleid), path(faa) ] - gff // channel: [ val(sampleid), path(gff) ] - bam // 
channel: [ val(sampleid), path(bam), path(bam_index) ] - m8 // channel: [ val(sampleId), path(diamond_file) ] + ffn // channel: [ val(meta), path(ffn) ] + faa // channel: [ val(meta), path(faa) ] + gff // channel: [ val(meta), path(gff) ] + bam // channel: [ val(meta), path(bam), path(bam_index) ] + m8 // channel: [ val(meta), path(diamond_file) ] eggnog_db main: diff --git a/subworkflows/07_taxonomic_affi.nf b/subworkflows/07_taxonomic_affi.nf index cbcc5347ab78bbbd34c7b2f7804c449adc9b2f47..9382704e10e0a7d020e75fc2e3d17af00b59acf4 100644 --- a/subworkflows/07_taxonomic_affi.nf +++ b/subworkflows/07_taxonomic_affi.nf @@ -4,9 +4,9 @@ include { QUANTIF_AND_TAXONOMIC_TABLE_CONTIGS } from '../modules/quantif_and_tax workflow STEP_07_TAXO_AFFI { take: taxonomy - diamond_result // channel: [ val(sampleId), path(diamond_file) ] - sam_coverage // channel: [ val(sampleId), path(samtools coverage) ] - prot_length // channel: [ val(sampleId), path(prot_length) ] + diamond_result // channel: [ val(meta), path(diamond_file) ] + sam_coverage // channel: [ val(meta), path(samtools coverage) ] + prot_length // channel: [ val(meta), path(prot_length) ] main: ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true) .join(prot_length, remainder: true) diff --git a/subworkflows/hifi_reads.nf b/subworkflows/hifi_reads.nf index d35441cd5cac8d80e21dcf146887e4cc8296c040..118409b0e3b0197847493fd276298223a27bc445 100644 --- a/subworkflows/hifi_reads.nf +++ b/subworkflows/hifi_reads.nf @@ -1,9 +1,9 @@ -include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' +include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' include { STEP_01_CLEAN_QC_HIFI} from './01_clean_qc' include { STEP_02_ASSEMBLY_HIFI as S02_ASSEMBLY } from './02_assembly' include { MINIMAP2_FILTERING } from '../modules/read_alignment' -include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast' +include { MERGE_FASTQ } from '../modules/merge_fastq.nf' workflow HIFI_READS { 
take: @@ -50,15 +50,50 @@ workflow HIFI_READS { // ASSEMBLY if (!params.stop_at_clean ) { + ////////////////// + // Manage Flowcell + ////////////////// + ch_reads_tmp = ch_preprocessed_reads + .map { + meta, fastq -> + [ meta.sample, meta, fastq ] + } + .groupTuple(by: [0]) + .branch { + id, meta, fastq -> + single : fastq.size() == 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] + multiple: fastq.size() > 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] + } + + + MERGE_FASTQ ( + ch_reads_tmp.multiple + ) + .reads + .mix(ch_reads_tmp.single) + .set{ch_preprocessed_reads} + + ///////////////////// + //End manage Flowcell + ///////////////////// S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool ) ch_assembly = S02_ASSEMBLY.out.assembly ch_assembly_report = S02_ASSEMBLY.out.assembly_report } - // // stat on assemblies - // ASSEMBLY_QUAST( ch_assembly, 'ASSEMBLY' ) - // ch_quast_report = ASSEMBLY_QUAST.out.report // FILTERING if ( !params.skip_filtering && !params.stop_at_clean && !params.stop_at_assembly) { @@ -87,11 +122,7 @@ workflow HIFI_READS { ) ch_assembly = S03_FILTERING.out.selected - - S04_FILTERED_QUAST( ch_assembly,'FILTERING' ) - - ch_assembly_filtered_report = S04_FILTERED_QUAST.out.report - + ch_assembly_filtered_report = S03_FILTERING.out.report } @@ -110,7 +141,5 @@ workflow HIFI_READS { assembly_filtered_report = ch_assembly_filtered_report kaiju_report = ch_kaiju_report - - } diff --git a/subworkflows/shared.nf b/subworkflows/shared.nf index 1678a11d2abd782b7dc92dd5a95e12d0e5e29c30..16a1393c3056ffa63eb2f3572180cf4a05a47728 
100644 --- a/subworkflows/shared.nf +++ b/subworkflows/shared.nf @@ -7,6 +7,7 @@ workflow SHARED { take: reads assembly + diamond eggnog_db taxonomy @@ -37,7 +38,7 @@ workflow SHARED { ch_m8 = Channel.empty() ch_sam_coverage = Channel.empty() if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) { - S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa ) + S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa, diamond) ch_bam = S05_ALIGNMENT.out.bam ch_m8 = S05_ALIGNMENT.out.m8 ch_sam_coverage = S05_ALIGNMENT.out.sam_coverage diff --git a/subworkflows/short_reads.nf b/subworkflows/short_reads.nf index b00117404d26c2df7d86b42aca2186f386b99eee..8d1b128b62f6e01ae143c2ec7bdf19904688631d 100644 --- a/subworkflows/short_reads.nf +++ b/subworkflows/short_reads.nf @@ -1,13 +1,11 @@ include { STEP_01_CLEAN_QC as S01_CLEAN_QC } from './01_clean_qc' include { STEP_02_ASSEMBLY as S02_ASSEMBLY } from './02_assembly' -include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' -include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast' - +include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' +include { MERGE_FASTQ } from '../modules/merge_fastq.nf' workflow SHORT_READS { take: reads - paired assembly host_fasta host_index @@ -34,7 +32,6 @@ workflow SHORT_READS { if ( !params.skip_clean ) { S01_CLEAN_QC ( reads, - paired, host_fasta, host_index, kaiju_db @@ -52,6 +49,46 @@ workflow SHORT_READS { ch_dedup = Channel.empty() if ( !params.stop_at_clean ) { + + ////////////////// + // Manage Flowcell + ////////////////// + ch_reads_tmp = ch_preprocessed_reads + .map { + meta, fastq -> + [ meta.sample, meta, fastq ] + } + .groupTuple(by: [0]) + .branch { + id, meta, fastq -> + single : fastq.size() == 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + 
assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] + multiple: fastq.size() > 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] + } + + + MERGE_FASTQ ( + ch_reads_tmp.multiple + ) + .reads + .mix(ch_reads_tmp.single) + .set{ch_preprocessed_reads} + + ///////////////////// + //End manage Flowcell + ///////////////////// + S02_ASSEMBLY ( ch_preprocessed_reads, assembly, has_assembly, assembly_tool ) ch_assembly = S02_ASSEMBLY.out.assembly ch_dedup = S02_ASSEMBLY.out.dedup @@ -76,9 +113,7 @@ workflow SHORT_READS { ch_min_contigs_cpm ) ch_assembly = S03_FILTERING.out.selected - - S04_FILTERED_QUAST( ch_assembly, 'FILTERING' ) - ch_filtered_report = S04_FILTERED_QUAST.out.report + ch_filtered_report = S03_FILTERING.out.report } emit: