kircherlab · visze · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/.editorconfig b/.editorconfig
@@ -9,6 +9,7 @@ insert_final_newline = true
 charset = utf-8
 indent_style = space
 indent_size = 4
+max_line_length = 127
 
 [*.{yml,yaml}]
 indent_style = space

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # CHANGELOG
 
+## v1.7.3
+
+- fix: long MNVs could produce sequences with a length != 500 in the regseq annotation rule and cause a workflow failure ([#90](https://github.com/kircherlab/CADD-scripts/pull/90), [#89](https://github.com/kircherlab/CADD-scripts/issues/89)).
+
+## v1.7.2
+
+- only snakemake >= 8.25.2 supported
+- using only conda-forge and bioconda channels (no default anymore)
+- new container docker://visze/cadd-scripts-v1_7:0.1.1
+- only conda >24.7.1 is allowed (no mamba support anymore)
+- fix VCF2vepVCF.py script to extend the header; otherwise regseq will fail when using the vcf library
+- readme update
+
+
 ## v1.7.1
 
 - containerization

diff --git a/src/scripts/lib/tools/regulatorySequence/predictVariants.py b/src/scripts/lib/tools/regulatorySequence/predictVariants.py
@@ -49,15 +49,14 @@ def cli(
     variants_file, model_file, weights_file, reference_file, genome_file, output_file
 ):
     import os
-    os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
     import numpy as np
     import math
     import vcfpy
 
     import copy
 
     import tensorflow as tf
-
     from seqiolib import Interval, Encoder, VariantType, Variant
     from seqiolib import utils
 
@@ -72,9 +71,11 @@ def loadAndPredict(sequences, model, variants=None):
         for sequence in sequences:
             if variants is not None:
                 sequence.replace(variants[i])
-            X.append(Encoder.one_hot_encode_along_channel_axis(sequence.getSequence()))
+            seq_encoded = Encoder.one_hot_encode_along_channel_axis(sequence.getSequence())
+            X.append(seq_encoded)
             i += 1
-        prediction = model.predict(np.array(X))
+        X_array = np.array(X)
+        prediction = model.predict(X_array)
         return prediction
 
     def extendIntervals(intervals, region_length, genome_file):
@@ -204,9 +205,16 @@ def pybedtoolsIntervalToInterval(interval_pybed):
                 else:
                     sequence_alt = copy.copy(sequence_ref)
                     sequence_alt.replace(variant)
-                    sequences_alt.append(sequence_alt)
-                    sequences_ref.append(sequence_ref)
-                    predict_avail_idx.add(alt_idx)
+                    if len(sequence_alt.sequence) == input_length:
+                        # FIXME: This is a hack. It seems that for longer MNVs the sequence replacement does not work.
+                        sequences_alt.append(sequence_alt)
+                        sequences_ref.append(sequence_ref)
+                        predict_avail_idx.add(alt_idx)
+                    else:
+                        print(
+                            "Cannot use variant %s because of wrong interval %s has wrong size after InDel Correction"
+                            % (str(variant), str(interval))
+                        )
                 alt_idx += 1
         click.echo("Predict reference...")
         results_ref = loadAndPredict(sequences_ref, model)
@@ -243,7 +251,7 @@ def pybedtoolsIntervalToInterval(interval_pybed):
                 for task_id in range(num_targets):
                     to_add["RegSeq%d" % task_id] = to_add.get(
                         "RegSeq%d" % task_id, []
-                    ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id],6)]
+                    ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id], 6)]
                 predict_idx += 1
             else:
                 for task_id in range(num_targets):