This is an extension of a question I asked yesterday. I have looked all over StackOverflow and have not found an instance of this specific NameError:
Building DAG of jobs...
Updating job done.
InputFunctionException in line 148 of /home/nasiegel/2022-h1n1/Snakefile:
Error:
NameError: free variable 'combinator' referenced before assignment in enclosing scope
Wildcards:
Traceback:
File "/home/nasiegel/2022-h1n1/Snakefile", line 131, in aggregate_decompress_h1n1
I assumed it was an issue having to do with the symbolic file paths in my function:
def aggregate_decompress_h1n1(wildcards):
checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
filenames = expand(
SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
OUTPUTDIR + "{basenames}_quant/quant.sf",
basenames = glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
return filenames
However, hardcoding the paths does not resolve the issue. I've attached the full Snakefile below any advice would be appreciated.
Original file
# Snakemake file - input raw reads to generate quant files for analysis in R
configfile: "config.yaml"
import io
import os
import pandas as pd
import pathlib
from snakemake.exceptions import print_exception, WorkflowError
#----SET VARIABLES----#
PROJ = config["proj_name"]
INPUTDIR = config["raw-data"]
SCRATCH = config["scratch"]
REFERENCE = config["ref"]
OUTPUTDIR = config["outputDIR"]
# Adapters
SE_ADAPTER = config['seq']['SE']
SE_SEQUENCE = config['seq']['trueseq-se']
# Organsim
TRANSCRIPTOME = config['transcriptome']['rhesus']
SPECIES = config['species']['rhesus']
SAMPLE_LIST = glob_wildcards(INPUTDIR + "{basenames}_R1.fastq.gz").basenames
rule all:
input:
"final.txt",
# dowload referemce files
REFERENCE + SE_ADAPTER,
REFERENCE + SPECIES,
# multiqc
SCRATCH + "fastqc/raw_multiqc.html",
SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
SCRATCH + "fastqc/trimmed_multiqc.html",
SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"
rule download_trimmomatic_adapter_file:
output: REFERENCE + SE_ADAPTER
shell: "curl -L -o {output} {SE_SEQUENCE}"
rule download_transcriptome:
output: REFERENCE + SPECIES
shell: "curl -L -o {output} {TRANSCRIPTOME}"
rule download_data:
output: "high_quality_files.tgz"
shell: "curl -L -o {output} https://osf.io/pcxfg/download"
checkpoint decompress_h1n1:
output: directory(INPUTDIR)
input: "high_quality_files.tgz"
shell: "tar xzvf {input}"
rule fastqc:
input: INPUTDIR + "{basenames}_R1.fastq.gz"
output:
raw_html = SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
raw_zip = SCRATCH + "fastqc/{basenames}_R1_fastqc.zip"
conda: "env/rnaseq.yml"
wrapper: "0.80.3/bio/fastqc"
rule multiqc:
input:
raw_qc = expand(SCRATCH + "fastqc/{basenames}_R1_fastqc.zip", basenames=SAMPLE_LIST),
trim_qc = expand(SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip", basenames=SAMPLE_LIST)
output:
raw_multi_html = SCRATCH + "fastqc/raw_multiqc.html",
raw_multi_stats = SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
trim_multi_html = SCRATCH + "fastqc/trimmed_multiqc.html",
trim_multi_stats = SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"
conda: "env/rnaseq.yml"
shell:
"""
multiqc -n multiqc.html {input.raw_qc} #run multiqc
mv multiqc.html {output.raw_multi_html} #rename html
mv multiqc_data/multiqc_general_stats.txt {output.raw_multi_stats} #move and rename stats
rm -rf multiqc_data #clean-up
#repeat for trimmed data
multiqc -n multiqc.html {input.trim_qc} #run multiqc
mv multiqc.html {output.trim_multi_html} #rename html
mv multiqc_data/multiqc_general_stats.txt {output.trim_multi_stats} #move and rename stats
rm -rf multiqc_data #clean-up
"""
rule trimmmomatic_se:
input:
reads= INPUTDIR + "{basenames}_R1.fastq.gz",
adapters= REFERENCE + SE_ADAPTER,
output:
reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
unpaired = SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz"
conda: "env/rnaseq.yml"
log: SCRATCH + "logs/fastqc/{basenames}_R1_trim_unpaired.log"
shell:
"""
trimmomatic SE {input.reads} \
{output.reads} {output.unpaired} \
ILLUMINACLIP:{input.adapters}:2:0:15 LEADING:2 TRAILING:2 \
SLIDINGWINDOW:4:2 MINLEN:25
"""
rule fastqc_trim:
input: SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz"
output:
html = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
zip = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip"
log: SCRATCH + "logs/fastqc/{basenames}_R1_trimmed.log"
conda: "env/rnaseq.yml"
wrapper: "0.35.2/bio/fastqc"
rule salmon_quant:
input:
reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
index_dir = OUTPUTDIR + "quant/sc_ensembl_index"
output: OUTPUTDIR + "{basenames}_quant/quant.sf"
params: OUTPUTDIR + "{basenames}_quant"
log: SCRATCH + "logs/salmon/{basenames}_quant.log"
conda: "env/rnaseq.yml"
shell:
"""
salmon quant -i {input.index_dir} --libType A -r {input.reads} -o {params} --seqBias --gcBias --validateMappings
"""
def aggregate_decompress_h1n1(wildcards):
checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
filenames = expand(
SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
OUTPUTDIR + "{basenames}_quant/quant.sf",
basenames = glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
return filenames
rule salmon_index:
input: REFERENCE + SPECIES
output: directory(OUTPUTDIR + "quant/sc_ensembl_index")
conda: "env/rnaseq.yml"
shell: "salmon index --index {output} --transcripts {input} # --type quasi"
rule done:
input: aggregate_decompress_h1n1
output: "final.txt"
shell: "touch {output}"
I think it's due to you used expand
function in a wrong way, expand
only accepts two positional arguments, where the first one is pattern and the second one is function (optional). If you want to supply multiple patterns you should wrap these patterns in list.
After some studying on source code of snakemake, it turns out expand
function doesn't check if user provides < 3 positional arguments, there is a variable combinator
in if-else
that would only be created when there are 1 or 2 positional arguments, the massive amount of positional arguments you provide skip this part and lead to the error when it tries to use combinator
later.
Source code: https://snakemake.readthedocs.io/en/v6.5.4/_modules/snakemake/io.html