-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path2_compute_synonym_identities_toHuman.py
More file actions
106 lines (94 loc) · 4.17 KB
/
2_compute_synonym_identities_toHuman.py
File metadata and controls
106 lines (94 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 16 15:55:48 2020
@author: user
"""
import os
import sys
import pandas as pd
def openfasta(filename):
# Open the file
fileHandle = open(filename)
# Read all the lines in the file
seqs = fileHandle.read().split(">") # Split file whenever a > is detected
# Close the file
fileHandle.close()
# Loop over the lines
seqdict = {}
for seq in seqs:
if seq: # Skip empty sequences
splitted = seq.split("\n") # Split different lines of one same sequence
# the first line is the name of the sequence, the rest is the sequence itself
seqdict[splitted[0]] = "".join(splitted[1:])
return seqdict
def translate_codon(codon):
GENETIC_CODE = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}
codon = codon.replace("-","")
if codon in GENETIC_CODE.keys():
aa = GENETIC_CODE[codon]
return aa
def split_codons(humanseq):
n=0
codpositions = [0]
while len(humanseq) > n:
until = 3
while (until - humanseq[n:n+until].count("-"))<3:
until += 1
n = n+until
codpositions.append(n)
return codpositions
#%% Read fastas and compute codon identity
fasta_dir = sys.argv[1]
# Open the file
with open(os.path.join(fasta_dir,"CDS_AAidentities_toHuman_selection.tsv")) as f:
firstColumn = [line.split('\t')[0] for line in f]
#fileselection=pd.read_csv(os.path.join(fasta_dir,"CDS_AAidentities_toHuman_selection.tsv"))
#matrix2 = fileselection[fileselection.columns[0]]
#list2 = matrix2.tolist()
print(firstColumn)
# Get list of fasta files
fasta_files = {file[:-3]:os.path.join(fasta_dir,batch,file) for batch in os.listdir(fasta_dir) if os.path.isdir(os.path.join(fasta_dir,batch)) for file in os.listdir(os.path.join(fasta_dir,batch)) if file.endswith(".fa")}
outdf = pd.DataFrame()
for file in fasta_files.keys():
# Get cds and gene info
splitted_name = file.split("_")
cds = splitted_name[1]
if cds in firstColumn:
#if cds.isin(firstColumn):
print(cds)
outdf.loc[cds,"gene"] = splitted_name[0]
# Open file
alignment = openfasta(fasta_files[file])
# Compare each codon to that of human
codpositions = split_codons(alignment["Human"])
humancod = [alignment["Human"][codpositions[n]:codpositions[n+1]] for n in range(len(codpositions)-1)]
humanaa = [translate_codon(c) for c in humancod]
numberofspecies = 0
for species in alignment.keys():
speciescod = [alignment[species][codpositions[n]:codpositions[n+1]] for n in range(len(codpositions)-1)]
speciesaa = [translate_codon(c) for c in speciescod]
identity = [cod==humancod[n] for n,cod in enumerate(speciescod) if (humanaa[n]==speciesaa[n]) and humanaa[n] and speciesaa[n]]
if len(identity)>=10:
numberofspecies = numberofspecies + 1
outdf.loc[cds,species] = sum(identity)/len(identity)
outdf.loc[cds,"numberofspecies"] = numberofspecies
cols = [col for col in outdf.columns if col not in ["numberofspecies", "average","gene"]]
outdf.loc[cds,"average"] = (outdf.loc[cds ,cols].sum(skipna = True)) / numberofspecies
# Save results
outdf.to_csv(os.path.join(fasta_dir,"CDS_identities_toHuman.tsv"), sep="\t")