From 3d63dae4d81ea0442ad21161242910fc53bcc149 Mon Sep 17 00:00:00 2001 From: Max Schubach Date: Thu, 30 Oct 2025 10:03:38 +0100 Subject: [PATCH 1/3] fix: Very long MNVs can cause sequences of != 500 --- .editorconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/.editorconfig b/.editorconfig index 5573fb6..db3f82b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -9,6 +9,7 @@ insert_final_newline = true charset = utf-8 indent_style = space indent_size = 4 +max_line_length = 127 [*.{yml,yaml}] indent_style = space From f1b0aebe893c609973d4a1d06dd339f281c5e3d8 Mon Sep 17 00:00:00 2001 From: Max Schubach Date: Thu, 30 Oct 2025 10:04:20 +0100 Subject: [PATCH 2/3] fix: Very long MNVs can cause sequences of != 500 --- .../regulatorySequence/predictVariants.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/scripts/lib/tools/regulatorySequence/predictVariants.py b/src/scripts/lib/tools/regulatorySequence/predictVariants.py index 875e87f..da1ae16 100644 --- a/src/scripts/lib/tools/regulatorySequence/predictVariants.py +++ b/src/scripts/lib/tools/regulatorySequence/predictVariants.py @@ -49,7 +49,7 @@ def cli( variants_file, model_file, weights_file, reference_file, genome_file, output_file ): import os - os.environ["CUDA_VISIBLE_DEVICES"]="-1" + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" import numpy as np import math import vcfpy @@ -57,7 +57,6 @@ def cli( import copy import tensorflow as tf - from seqiolib import Interval, Encoder, VariantType, Variant from seqiolib import utils @@ -72,9 +71,11 @@ def loadAndPredict(sequences, model, variants=None): for sequence in sequences: if variants is not None: sequence.replace(variants[i]) - X.append(Encoder.one_hot_encode_along_channel_axis(sequence.getSequence())) + seq_encoded = Encoder.one_hot_encode_along_channel_axis(sequence.getSequence()) + X.append(seq_encoded) i += 1 - prediction = model.predict(np.array(X)) + X_array = np.array(X) + prediction = model.predict(X_array) return 
prediction def extendIntervals(intervals, region_length, genome_file): @@ -204,9 +205,16 @@ def pybedtoolsIntervalToInterval(interval_pybed): else: sequence_alt = copy.copy(sequence_ref) sequence_alt.replace(variant) - sequences_alt.append(sequence_alt) - sequences_ref.append(sequence_ref) - predict_avail_idx.add(alt_idx) + if len(sequence_alt.sequence) == input_length: + # FIXME: This is a hack. it seems that for longer MNVs the replacement does not work + sequences_alt.append(sequence_alt) + sequences_ref.append(sequence_ref) + predict_avail_idx.add(alt_idx) + else: + print( + "Cannot use variant %s because of wrong interval %s has wrong size after InDel Correction" + % (str(variant), str(interval)) + ) alt_idx += 1 click.echo("Predict reference...") results_ref = loadAndPredict(sequences_ref, model) @@ -243,7 +251,7 @@ def pybedtoolsIntervalToInterval(interval_pybed): for task_id in range(num_targets): to_add["RegSeq%d" % task_id] = to_add.get( "RegSeq%d" % task_id, [] - ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id],6)] + ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id], 6)] predict_idx += 1 else: for task_id in range(num_targets): From 1672dbca8dc3887cece4cdd1ef3aecb9051519b7 Mon Sep 17 00:00:00 2001 From: Max Schubach Date: Thu, 30 Oct 2025 10:13:05 +0100 Subject: [PATCH 3/3] changelog --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b1416..997a047 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # CHANGELOG +## v1.7.3 + +- fix that long MNVs can cause sequences of != 500 in regseq annotation rule and cause a workflow failure ([#90](https://github.com/kircherlab/CADD-scripts/pull/90), [#89](https://github.com/kircherlab/CADD-scripts/issues/89)). 
+ +## v1.7.2 + +- only snakemake >= 8.25.2 supported +- using only conda-forge and bioconda channels (no default anymore) +- new container docker://visze/cadd-scripts-v1_7:0.1.1 +- only conda >24.7.1 is allowed (no mamba support anymore) +- VCF2vepVCF.py script fix to extend header. Otherwise regseq will fail using the vcf library +- readme update + + ## v1.7.1 - containerization