diff --git a/.editorconfig b/.editorconfig index 5573fb6..db3f82b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -9,6 +9,7 @@ insert_final_newline = true charset = utf-8 indent_style = space indent_size = 4 +max_line_length = 127 [*.{yml,yaml}] indent_style = space diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b1416..997a047 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # CHANGELOG +## v1.7.3 + +- fix that long MNVs can cause sequences of != 500 in regseq annotation rule and cause a workflow failure ([#90](https://github.com/kircherlab/CADD-scripts/pull/90), [#89](https://github.com/kircherlab/CADD-scripts/issues/89)). + +## v1.7.2 + +- only snakemake >= 8.25.2 supported +- using only conda-forge and bioconda channels (no default anymore) +- new container docker://visze/cadd-scripts-v1_7:0.1.1 +- only conda >24.7.1 is allowed (no mamba support anymore) +- VCF2vepVCF.py script fix to extend header. Otherwise regseq will fail using the vcf library +- readme update + + ## v1.7.1 - containerization diff --git a/src/scripts/lib/tools/regulatorySequence/predictVariants.py b/src/scripts/lib/tools/regulatorySequence/predictVariants.py index 875e87f..da1ae16 100644 --- a/src/scripts/lib/tools/regulatorySequence/predictVariants.py +++ b/src/scripts/lib/tools/regulatorySequence/predictVariants.py @@ -49,7 +49,7 @@ def cli( variants_file, model_file, weights_file, reference_file, genome_file, output_file ): import os - os.environ["CUDA_VISIBLE_DEVICES"]="-1" + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" import numpy as np import math import vcfpy @@ -57,7 +57,6 @@ def cli( import copy import tensorflow as tf - from seqiolib import Interval, Encoder, VariantType, Variant from seqiolib import utils @@ -72,9 +71,11 @@ def loadAndPredict(sequences, model, variants=None): for sequence in sequences: if variants is not None: sequence.replace(variants[i]) - X.append(Encoder.one_hot_encode_along_channel_axis(sequence.getSequence())) + seq_encoded = 
Encoder.one_hot_encode_along_channel_axis(sequence.getSequence()) + X.append(seq_encoded) i += 1 - prediction = model.predict(np.array(X)) + X_array = np.array(X) + prediction = model.predict(X_array) return prediction def extendIntervals(intervals, region_length, genome_file): @@ -204,9 +205,16 @@ def pybedtoolsIntervalToInterval(interval_pybed): else: sequence_alt = copy.copy(sequence_ref) sequence_alt.replace(variant) - sequences_alt.append(sequence_alt) - sequences_ref.append(sequence_ref) - predict_avail_idx.add(alt_idx) + if len(sequence_alt.sequence) == input_length: + # FIXME: This is a hack. it seems that for longer MNVs the replacement does not work + sequences_alt.append(sequence_alt) + sequences_ref.append(sequence_ref) + predict_avail_idx.add(alt_idx) + else: + print( + "Cannot use variant %s because of wrong interval %s has wrong size after InDel Correction" + % (str(variant), str(interval)) + ) alt_idx += 1 click.echo("Predict reference...") results_ref = loadAndPredict(sequences_ref, model) @@ -243,7 +251,7 @@ def pybedtoolsIntervalToInterval(interval_pybed): for task_id in range(num_targets): to_add["RegSeq%d" % task_id] = to_add.get( "RegSeq%d" % task_id, [] - ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id],6)] + ) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id], 6)] predict_idx += 1 else: for task_id in range(num_targets):