# Snakefile-ega
import os
import csv
from tabulate import tabulate
from snakemake.utils import min_version
min_version("7.25.3")
include: "rules/common.smk"
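# read_metadata() and get_unique_ena_runs(), used below, are presumably
# defined in rules/common.smk (they are not defined in this file)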
metadata_df = read_metadata(config["metadata_path"], config["dataset_id"])

# print the metadata table
headers = ['EGAR ID', 'ENA Run ID', 'Fastq Filename', 'EGAF ID']
print(tabulate(metadata_df, headers=headers, tablefmt="grid"))
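# each row of metadata_df is expected to have the shape
# [EGAR ID, ENA Run ID, Fastq Filename, EGAF ID], e.g. (hypothetical values):
# ["EGAR00001000001", "ERR0000001", "sample_1.fastq.gz", "EGAF00001000001"]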
def get_unique_fastq_filenames(merged_table):
    # extract the third column (fastq filenames)
    fastq_filenames = [row[2] for row in merged_table]
    # get unique values, keeping the order
    unique_fastq_filenames = list(dict.fromkeys(fastq_filenames))
    # drop the .fastq.gz extension
    filenames_without_extension = [filename.replace('.fastq.gz', '') for filename in unique_fastq_filenames]
    return filenames_without_extension
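# e.g. a (hypothetical) "sample_1.fastq.gz" entry becomes "sample_1"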
FASTQ_FILENAMES = get_unique_fastq_filenames(metadata_df)
def get_unique_ega_filenames(merged_table):
    # extract the fourth column (EGAF IDs)
    ega_filenames = [row[3] for row in merged_table]
    # get unique values, keeping the order
    unique_ega_filenames = list(dict.fromkeys(ega_filenames))
    return unique_ega_filenames
EGA_FILENAMES = get_unique_ega_filenames(metadata_df)
SAMPLES = get_unique_ena_runs(metadata_df)
print(f"unique EGA file IDs: {EGA_FILENAMES}")
print(f"unique FASTQ filenames: {FASTQ_FILENAMES}")
print(f"unique ENA run IDs (final sample names): {SAMPLES}")
wildcard_constraints:
    egasample=r"EGAF\d+",
    sample=r"ERR\d+"
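# EGAF* are EGA file accessions; ERR* are ENA run accessions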
rule all:
    input: "out/workflow-ega.done"
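# a typical invocation of this workflow might look like the following
# (the configfile name is hypothetical; it must define metadata_path,
# dataset_id and input_path):
#   snakemake -s Snakefile-ega --configfile config.yaml --cores 4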
rule create_metadata:
    output:
        "out/metadata.csv"
    run:
        metadata_df = read_metadata(config["metadata_path"], config["dataset_id"])
        # write metadata_df to a csv file (csv is imported at the top of this Snakefile)
        with open(output[0], "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(metadata_df)
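# metadata.csv appears to be written without a header row; the shell
# parsing in rename_fastqs below (cut/grep on raw lines) relies on that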
rule check_initial_data:
    """
    Check that the fastq data exists and, when an .md5 file is present,
    verify the checksum.
    """
    input:
        fastq = config["input_path"] + "/" + config["dataset_id"] + "/{egasample}/{fastqsample}.fastq.gz",
        metad = "out/metadata.csv"
    output:
        file_check = temp("out/{egasample}.{fastqsample}.checked")
    log:
        "logs/{egasample}.{fastqsample}.log"
    shell:
        """
        set -e  # stop on error; on the cluster snakemake doesn't otherwise stop when --keep-going is set
        exec &> "{log}"
        # check that the input file exists
        if [ ! -f "{input.fastq}" ]; then
            echo "ERROR: Input file {input.fastq} does not exist." >&2
            exit 1
        fi
        echo "Input file {input.fastq} exists, proceeding with the pipeline."
        # if an .md5 file exists, verify the checksum (assumes md5sum-style
        # files, where the checksum is the first whitespace-separated field)
        if [ -f "{input.fastq}.md5" ]; then
            echo "md5 file {input.fastq}.md5 exists" >&2
            expected_md5=$(awk '{{print $1}}' "{input.fastq}.md5")
            actual_md5=$(md5sum "{input.fastq}" | awk '{{print $1}}')
            if [[ "$expected_md5" == "$actual_md5" ]]; then
                echo "MD5 checksum is correct." >&2
            else
                echo "ERROR: MD5 checksum does not match." >&2
                exit 1
            fi
        fi
        touch {output.file_check}
        """
rule rename_fastqs:
    input:
        # expand(..., zip, ...) pairs EGA_FILENAMES and FASTQ_FILENAMES
        # element-wise, so both lists must be equally long and index-aligned
        expand("out/{egasample}.{fastqsample}.checked",
               zip,
               egasample=EGA_FILENAMES,
               fastqsample=FASTQ_FILENAMES)
    params:
        input_path=config["input_path"] + "/" + config["dataset_id"],
        metad="out/metadata.csv"
    output:
        "out/rename_fastqs.done"
    log:
        "logs/rename_fastqs.log"
    shell:
        """
        set -e  # stop on error; on the cluster snakemake doesn't otherwise stop when --keep-going is set
        exec &> "{log}"
        # remove possible trailing whitespace
        sed -i 's/[[:space:]]*$//g' {params.metad}
        # loop over the unique ERR* run accessions in the second column
        cut -d, -f2 {params.metad} | sort -u | while read -r err; do
            echo "ERR: $err"
            mkdir -p "out/$err"
            # for each unique ERR* value, find the associated rows and print
            # the source and target paths
            grep ",$err," {params.metad} | while IFS=, read -r egar_value ena_value sample_file egaf_value; do
                echo "{params.input_path}/$egaf_value/$sample_file"
                # keep the read-pair suffix, e.g. "sample_1.fastq.gz" -> "_1.fastq.gz"
                paired_suffix=$(echo "$sample_file" | awk -F'_' '{{print "_" $2}}')
                echo "out/$err/$err$paired_suffix"
                # check that the file exists, then copy it under its new name
                if [ -f "{params.input_path}/$egaf_value/$sample_file" ]; then
                    echo "Copying file {params.input_path}/$egaf_value/$sample_file >> out/$err/$err$paired_suffix"
                    cp "{params.input_path}/$egaf_value/$sample_file" "out/$err/$err$paired_suffix"
                fi
            done
        done
        touch {output}
        """
rule final_workflow_check:
    input:
        "out/rename_fastqs.done"
    output:
        "out/workflow-ega.done"
    shell:
        """
        set -e  # stop on error; on the cluster snakemake doesn't otherwise stop when --keep-going is set
        touch {output}
        """