Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ insert_final_newline = true
charset = utf-8
indent_style = space
indent_size = 4
max_line_length = 127

[*.{yml,yaml}]
indent_style = space
Expand Down
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# CHANGELOG

## v1.7.3

- fix long MNVs producing sequences with a length != 500 in the regseq annotation rule, which caused a workflow failure ([#90](https://github.com/kircherlab/CADD-scripts/pull/90), [#89](https://github.com/kircherlab/CADD-scripts/issues/89)).

## v1.7.2

- only snakemake >= 8.25.2 supported
- using only the conda-forge and bioconda channels (the defaults channel is no longer used)
- new container docker://visze/cadd-scripts-v1_7:0.1.1
- only conda >24.7.1 is allowed (no mamba support anymore)
- VCF2vepVCF.py script fix to extend the header; otherwise regseq fails when using the vcf library
- readme update


## v1.7.1

- containerization
Expand Down
24 changes: 16 additions & 8 deletions src/scripts/lib/tools/regulatorySequence/predictVariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,14 @@ def cli(
variants_file, model_file, weights_file, reference_file, genome_file, output_file
):
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
import math
import vcfpy

import copy

import tensorflow as tf

from seqiolib import Interval, Encoder, VariantType, Variant
from seqiolib import utils

Expand All @@ -72,9 +71,11 @@ def loadAndPredict(sequences, model, variants=None):
for sequence in sequences:
if variants is not None:
sequence.replace(variants[i])
X.append(Encoder.one_hot_encode_along_channel_axis(sequence.getSequence()))
seq_encoded = Encoder.one_hot_encode_along_channel_axis(sequence.getSequence())
X.append(seq_encoded)
i += 1
prediction = model.predict(np.array(X))
X_array = np.array(X)
prediction = model.predict(X_array)
return prediction

def extendIntervals(intervals, region_length, genome_file):
Expand Down Expand Up @@ -204,9 +205,16 @@ def pybedtoolsIntervalToInterval(interval_pybed):
else:
sequence_alt = copy.copy(sequence_ref)
sequence_alt.replace(variant)
sequences_alt.append(sequence_alt)
sequences_ref.append(sequence_ref)
predict_avail_idx.add(alt_idx)
if len(sequence_alt.sequence) == input_length:
# FIXME: This is a hack. It seems that for longer MNVs the replacement does not work
sequences_alt.append(sequence_alt)
sequences_ref.append(sequence_ref)
predict_avail_idx.add(alt_idx)
else:
print(
"Cannot use variant %s because of wrong interval %s has wrong size after InDel Correction"
% (str(variant), str(interval))
)
alt_idx += 1
click.echo("Predict reference...")
results_ref = loadAndPredict(sequences_ref, model)
Expand Down Expand Up @@ -243,7 +251,7 @@ def pybedtoolsIntervalToInterval(interval_pybed):
for task_id in range(num_targets):
to_add["RegSeq%d" % task_id] = to_add.get(
"RegSeq%d" % task_id, []
) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id],6)]
) + [round(results_alt[predict_idx][task_id] - results_ref[predict_idx][task_id], 6)]
predict_idx += 1
else:
for task_id in range(num_targets):
Expand Down