-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathVCF_sequence_context.sh
More file actions
73 lines (57 loc) · 2.32 KB
/
VCF_sequence_context.sh
File metadata and controls
73 lines (57 loc) · 2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash -l
#SBATCH --time=2:00:00
#SBATCH --ntasks=5
#SBATCH --mem=24g
#SBATCH --tmp=24g
#SBATCH --mail-type=ALL
#SBATCH --mail-user=pmorrell@umn.edu
#SBATCH -o %j.out
#SBATCH -e %j.err
set -e
set -o pipefail
# Peter L. Morrell - 17 April 2025 - St. Paul, MN
module load bedtools2/2.31.0-gcc-8.2.0-7j35k74
INPUT_DIR="/scratch.global/pmorrell/Selective_Sweeps"
OUTPUT_DIR="/scratch.global/pmorrell/Selective_Sweeps"
REFERENCE="/panfs/jay/groups/9/morrellp/shared/References/Reference_Sequences/Barley/Morex_v2/Barley_Morex_V2_pseudomolecules.fasta.gz"
REFERENCE_INDEX="/panfs/jay/groups/9/morrellp/shared/References/Reference_Sequences/Barley/Morex_v2/Barley_Morex_V2_pseudomolecules.fasta.gz.fai"
# Left flank: 250
LEFT=250
# Right flank: 250
RIGHT=250
mkdir -p "${OUTPUT_DIR}"
log() {
local msg="$1"
echo "$(date +'%Y-%m-%d %H:%M:%S') - ${msg}"
}
process_vcfs() {
log " -> Read SNP information from VCF"
local VCF_FILE="$1"
local SAMPLE_NAME
SAMPLE_NAME=$(basename "${VCF_FILE}" .vcf.gz)
log " -> Processing VCF (chromosome): ${SAMPLE_NAME}"
local intermediate_bed
intermediate_bed=$(mktemp)
zcat < "${VCF_FILE}" | grep -v '#' | awk -v OFS='\t' '{print $1, $2-1, $2, $1 "_" $2}' > "${intermediate_bed}"
log " -> Generating intervals"
local intermediate_bed2
intermediate_bed2=$(mktemp)
bedtools slop -i "${intermediate_bed}" -g "${REFERENCE_INDEX}" -l "${LEFT}" -r "${RIGHT}" > "${intermediate_bed2}"
log " -> Generating contextual sequence output"
local temp_fasta
temp_fasta=$(mktemp)
local final_output="${OUTPUT_DIR}/${SAMPLE_NAME}.fasta"
# Generate FASTA to temp file first
bedtools getfasta -fi "${REFERENCE}" -bed "${intermediate_bed2}" -nameOnly -fo "${temp_fasta}"
# Process with sed to replace center base with N, then write directly to final output
log " -> Converting SNP position ($((LEFT+1))st base) to N"
sed "/^>/!s/\(.\{${LEFT}\}\)./\1N/" "${temp_fasta}" > "${final_output}"
# Clean up temporary files
rm -f "${intermediate_bed}" "${intermediate_bed2}" "${temp_fasta}"
log " -> Completed processing ${SAMPLE_NAME}"
}
log " -> Looking for VCF files in ${INPUT_DIR}..."
find "${INPUT_DIR}" -name "*.vcf.gz" | while read -r VCF_FILE; do
process_vcfs "${VCF_FILE}"
done
log " -> All samples processed."