Search code examples
pythonsnakemake

Weird NameError with python function in Snakemake script


This is an extension of a question I asked yesterday. I have looked all over StackOverflow and have not found an instance of this specific NameError:

Building DAG of jobs...
Updating job done.
InputFunctionException in line 148 of /home/nasiegel/2022-h1n1/Snakefile:
Error:
  NameError: free variable 'combinator' referenced before assignment in enclosing scope
Wildcards:

Traceback:
  File "/home/nasiegel/2022-h1n1/Snakefile", line 131, in aggregate_decompress_h1n1

I assumed it was an issue having to do with the symbolic file paths in my function:

def aggregate_decompress_h1n1(wildcards):
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]    
    filenames = expand(
     SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
     SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
     SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
     SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
     SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
     SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
     OUTPUTDIR + "{basenames}_quant/quant.sf",
     basenames = glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
    return filenames

However, hardcoding the paths does not resolve the issue. I've attached the full Snakefile below any advice would be appreciated.

Original file

# Snakemake file - input raw reads to generate quant files for analysis in R
configfile: "config.yaml"

import io 
import os
import pandas as pd
import pathlib
from snakemake.exceptions import print_exception, WorkflowError

#----SET VARIABLES----#
PROJ = config["proj_name"]
INPUTDIR = config["raw-data"]
SCRATCH = config["scratch"]
REFERENCE = config["ref"]
OUTPUTDIR = config["outputDIR"]

# Adapters
SE_ADAPTER = config['seq']['SE']
SE_SEQUENCE = config['seq']['trueseq-se']

# Organsim
TRANSCRIPTOME = config['transcriptome']['rhesus']
SPECIES = config['species']['rhesus']

SAMPLE_LIST = glob_wildcards(INPUTDIR + "{basenames}_R1.fastq.gz").basenames

rule all:
    input:
        "final.txt",
        # dowload referemce files
        REFERENCE + SE_ADAPTER,
        REFERENCE + SPECIES,
        # multiqc
        SCRATCH + "fastqc/raw_multiqc.html", 
        SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        SCRATCH + "fastqc/trimmed_multiqc.html", 
        SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"

rule download_trimmomatic_adapter_file:
    output: REFERENCE + SE_ADAPTER
    shell: "curl -L -o {output} {SE_SEQUENCE}"

rule download_transcriptome:
    output: REFERENCE + SPECIES
    shell: "curl -L -o {output} {TRANSCRIPTOME}"

rule download_data:
    output: "high_quality_files.tgz"
    shell: "curl -L -o {output} https://osf.io/pcxfg/download"

checkpoint decompress_h1n1:
    output: directory(INPUTDIR)
    input: "high_quality_files.tgz"
    shell: "tar xzvf {input}"

rule fastqc:
    input:  INPUTDIR + "{basenames}_R1.fastq.gz"
    output:
        raw_html = SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
        raw_zip = SCRATCH + "fastqc/{basenames}_R1_fastqc.zip"
    conda: "env/rnaseq.yml"
    wrapper: "0.80.3/bio/fastqc"

rule multiqc:
    input:
        raw_qc = expand(SCRATCH + "fastqc/{basenames}_R1_fastqc.zip", basenames=SAMPLE_LIST),
        trim_qc = expand(SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip", basenames=SAMPLE_LIST)
    output:
        raw_multi_html = SCRATCH + "fastqc/raw_multiqc.html", 
        raw_multi_stats = SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        trim_multi_html = SCRATCH + "fastqc/trimmed_multiqc.html", 
        trim_multi_stats = SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"
    conda: "env/rnaseq.yml"
    shell: 
        """
        multiqc -n multiqc.html {input.raw_qc} #run multiqc
        mv multiqc.html {output.raw_multi_html} #rename html
        mv multiqc_data/multiqc_general_stats.txt {output.raw_multi_stats} #move and rename stats
        rm -rf multiqc_data #clean-up
        #repeat for trimmed data
        multiqc -n multiqc.html {input.trim_qc} #run multiqc
        mv multiqc.html {output.trim_multi_html} #rename html
        mv multiqc_data/multiqc_general_stats.txt {output.trim_multi_stats} #move and rename stats
        rm -rf multiqc_data #clean-up
        """ 

rule trimmmomatic_se:
    input: 
        reads= INPUTDIR + "{basenames}_R1.fastq.gz",
        adapters= REFERENCE + SE_ADAPTER,
    output: 
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        unpaired = SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz"
    conda: "env/rnaseq.yml"
    log: SCRATCH + "logs/fastqc/{basenames}_R1_trim_unpaired.log"
    shell:
        """
        trimmomatic SE {input.reads} \
        {output.reads} {output.unpaired} \
        ILLUMINACLIP:{input.adapters}:2:0:15 LEADING:2 TRAILING:2 \
        SLIDINGWINDOW:4:2 MINLEN:25    
        """
rule fastqc_trim:
    input: SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz"
    output:
      html = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
      zip = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip"
    log: SCRATCH + "logs/fastqc/{basenames}_R1_trimmed.log"
    conda: "env/rnaseq.yml"
    wrapper: "0.35.2/bio/fastqc"

rule salmon_quant:
    input:
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        index_dir = OUTPUTDIR + "quant/sc_ensembl_index"
    output: OUTPUTDIR + "{basenames}_quant/quant.sf"
    params: OUTPUTDIR + "{basenames}_quant"
    log: SCRATCH + "logs/salmon/{basenames}_quant.log"
    conda: "env/rnaseq.yml"
    shell:
        """
        salmon quant -i {input.index_dir} --libType A -r {input.reads} -o {params} --seqBias --gcBias --validateMappings
        """

def aggregate_decompress_h1n1(wildcards):
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]    
    filenames = expand(
     SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
     SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
     SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
     SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
     SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
     SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
     OUTPUTDIR + "{basenames}_quant/quant.sf",
     basenames = glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
    return filenames

rule salmon_index:
    input: REFERENCE + SPECIES
    output: directory(OUTPUTDIR + "quant/sc_ensembl_index")
    conda: "env/rnaseq.yml"
    shell: "salmon index --index {output} --transcripts {input} # --type quasi"

rule done:
    input: aggregate_decompress_h1n1
    output: "final.txt"
    shell: "touch {output}"

Solution

  • I think it's due to you used expand function in a wrong way, expand only accepts two positional arguments, where the first one is pattern and the second one is function (optional). If you want to supply multiple patterns you should wrap these patterns in list.

    After some studying on source code of snakemake, it turns out expand function doesn't check if user provides < 3 positional arguments, there is a variable combinator in if-else that would only be created when there are 1 or 2 positional arguments, the massive amount of positional arguments you provide skip this part and lead to the error when it tries to use combinator later.

    Source code: https://snakemake.readthedocs.io/en/v6.5.4/_modules/snakemake/io.html