环境创建及软件安装(必须)

conda create -n snakemake -y &&
conda activate snakemake &&
conda install mamba -y  &&
mamba install python=3.10 -y &&
mamba install snakemake -y &&
mamba install -y sra-tools fastqc trim-galore hisat2 subread multiqc samtools salmon fastp

文件夹(这个无所谓,Snakemake会自动创建)

mkdir -p project/rnaseq
cd project/rnaseq

mkdir -p  data/rawdata/sra  data/rawdata/fastq data/rawdata/qc data/cleandata/trim_galore  data/cleandata/fastqc data/cleandata/multiqc
mkdir -p Mapping/Hisat2  
mkdir -p Expression/featureCounts  
tree

参考基因组(必须)

https://www.jianshu.com/p/6298877b6253

https://www.ensembl.org/info/data/ftp/index.html?redirect=no

参考基因组文件

nohup wget  https://ftp.ensembl.org/pub/release-100/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz &

基因组注释文件

 wget https://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz

数据下载及解压 (非必须内容,这只是前期写的一个下载转换代码)

#!/bin/bash

# 将此脚本的所有输出重定向到日志文件
exec > download_and_convert.log 2>&1

# SRR ID数组
srr_ids=("SRR17214819" "SRR17214820") # 根据需要添加更多SRR ID

# 原始数据目录
sra_data_dir="./data/rawdata/sra"
# FASTQ文件的输出目录
fastq_data_dir="./data/rawdata/fastq"

# 检查给定SRR ID的FASTQ文件是否已存在的函数
check_fastq_files_exist() {
    local srr_id=$1
    # 查找格式为 "${srr_id}_*.fastq.gz" 的FASTQ文件
    local files=("${fastq_data_dir}/${srr_id}"_*.fastq.gz)
    
    # 检查是否找到任何文件
    if [ -e "${files[0]}" ]; then
        return 0 # 文件存在
    else
        return 1 # 文件不存在
    fi
}

# 遍历每个SRR ID
for srr_id in "${srr_ids[@]}"; do
    # 检查FASTQ文件是否已存在
    if check_fastq_files_exist "$srr_id"; then
        echo "FASTQ文件 $srr_id 已存在,跳过..."
        continue
    fi

    # SRA文件的路径
    sra_file="${sra_data_dir}/${srr_id}/${srr_id}.sra"

    # 检查SRA文件是否已存在
    if [[ -f "$sra_file" ]]; then
        echo "SRA文件 $srr_id 已存在,但FASTQ文件不存在,进行转换..."
    else
        # Prefetch
        echo "开始预取 $srr_id"
        prefetch "$srr_id" -O "$sra_data_dir" &
        
        # 等待prefetch完成
        wait
    fi

    # Fastq-dump
    echo "开始转换 $srr_id"
    fastq-dump --split-files --gzip "$sra_file" --outdir "$fastq_data_dir" &
    
    # 等待fastq-dump完成
    wait
done

echo "所有过程已完成。"

Snakemake正式流程(重要!!)

运行一个样本做例子

此流程中,你需要指定SAMPLE的SRR号,index路径,GTF路径即可!

SAMPLE = ["SRR17214819", "SRR17214820"]
# 参考基因组需另外下载
index = "/home/shpc_101389/database/GRCh38.100/Hisat2Index/genome"
GTF = "/home/shpc_101389/database/GRCh38.100/Homo_sapiens.GRCh38.100.gtf"

rule all:
    input:
        expand("rawdata/sra/{sample}/{sample}.sra",sample = SAMPLE),
        expand("rawdata/fastq/{sample}_1.fastq.gz",sample = SAMPLE),
        expand("rawdata/fastq/{sample}_2.fastq.gz",sample = SAMPLE),
        expand("rawdata/qc/{sample}_1_fastqc.zip",sample = SAMPLE),
        expand("rawdata/qc/{sample}_2_fastqc.zip",sample = SAMPLE),
        expand("cleandata/trim_galore/{sample}_1_val_1.fq.gz",sample = SAMPLE),
        expand("cleandata/trim_galore/{sample}_2_val_2.fq.gz",sample = SAMPLE),
        expand("Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam",sample = SAMPLE),
        expand("Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam.bai",sample = SAMPLE),
        expand("Mapping/Hisat2/{sample}.flagstat",sample = SAMPLE),
        expand("Expression/featureCounts/all_feature.txt")
# 下载
rule prefetch:
    output:
        "rawdata/sra/{sample}/{sample}.sra"
    shell:
        "prefetch {wildcards.sample} -O rawdata/sra/"

# 转换
rule fasterqdump:
    input:
        "rawdata/sra/{sample}/{sample}.sra"
    output:
        fastq1="rawdata/fastq/{sample}_1.fastq.gz",
        fastq2="rawdata/fastq/{sample}_2.fastq.gz"
    shell:
        """
        fasterq-dump -p -e 10 --split-3 -O rawdata/fastq {input[0]} &&
        gzip -c rawdata/fastq/{wildcards.sample}_1.fastq > {output.fastq1} &&
        gzip -c rawdata/fastq/{wildcards.sample}_2.fastq > {output.fastq2} &&
        rm -rf rawdata/fastq/{wildcards.sample}_1.fastq &&
        rm -rf rawdata/fastq/{wildcards.sample}_2.fastq
        """


# 质控
rule fastqc:
    input:
        "rawdata/fastq/{sample}_1.fastq.gz",
        "rawdata/fastq/{sample}_2.fastq.gz"
    output:
        "rawdata/qc/{sample}_1_fastqc.zip",
        "rawdata/qc/{sample}_2_fastqc.zip"
    shell:
        "fastqc -t 5 -o rawdata/qc {input[0]}>{output[0]}"
        "fastqc -t 5 -o rawdata/qc {input[1]}>{output[1]}"

# 过滤       
rule trim_galore:
    input:
        "rawdata/fastq/{sample}_1.fastq.gz",
        "rawdata/fastq/{sample}_2.fastq.gz"
    output:
        "cleandata/trim_galore/{sample}_1_val_1.fq.gz",
        "cleandata/trim_galore/{sample}_2_val_2.fq.gz"
    log:
        "cleandata/trim_galore/trim_galore_{sample}.log"
    shell:
        "trim_galore --phred33 -q 20 --length 36 --stringency 3 --fastqc --paired --max_n 3 -o cleandata/trim_galore {input[0]} {input[1]}>{log}"
        
# 比对 建索引        
rule hisat2:
    input:
        "cleandata/trim_galore/{sample}_1_val_1.fq.gz",
        "cleandata/trim_galore/{sample}_2_val_2.fq.gz"
    output:
        "Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam",
        "Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam.bai"
    log:
        "Mapping/Hisat2/hisat2_{sample}.log"
    shell:
        """
        hisat2 --threads 5 -x {index} -1 {input[0]} -2 {input[1]} | 
        samtools sort --threads 1 -o {output[0]} &&
        samtools index --threads 5 {output[0]} {output[1]} > {log}
        """
        
# 统计比对结果
rule samtools:
    input:
        "Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam"
    output:
        "Mapping/Hisat2/{sample}.flagstat"
    shell:
        "samtools flagstat -@ 5 {input} > {output}"
 
# 定量       
rule featureCounts:
    input:
        expand("Mapping/Hisat2/{sample}.Hisat_aln.sorted.bam",sample = SAMPLE)
    output:
        "Expression/featureCounts/all_feature.txt"
    log:
        "Expression/featureCounts/featureCounts.log"
    shell:
        "featureCounts -T 8 -p -t exon -g gene_id -a {GTF} -o {output} {input} 2>{log}"  

执行命令

nohup snakemake --cores 16 --keep-going -s snakefile >snakemake.log &