CodonUsage/2_compute_synonym_identities_toHuman.py at main · HannahBenisty/CodonUsage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 16 15:55:48 2020

@author: user
"""

import os
import sys
import pandas as pd

def openfasta(filename):
    # Open the file
    fileHandle = open(filename)
    # Read all the lines in the file
    seqs = fileHandle.read().split(">") # Split file whenever a > is detected
    # Close the file
    fileHandle.close()
    # Loop over the lines
    seqdict = {}
    for seq in seqs:
        if seq: # Skip empty sequences
            splitted = seq.split("\n") # Split different lines of one same sequence
            # the first line is the name of the sequence, the rest is the sequence itself
            seqdict[splitted[0]] = "".join(splitted[1:])
    return seqdict

def translate_codon(codon):
    GENETIC_CODE = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}
    codon = codon.replace("-","")
    if codon in GENETIC_CODE.keys():
        aa = GENETIC_CODE[codon]
        return aa

def split_codons(humanseq):
    n=0
    codpositions = [0]
    while len(humanseq) > n:
        until = 3
        while (until - humanseq[n:n+until].count("-"))<3:
            until += 1
        n = n+until
        codpositions.append(n)
    return codpositions

#%% Read fastas and compute codon identity

fasta_dir = sys.argv[1]

# Open the file
with open(os.path.join(fasta_dir,"CDS_AAidentities_toHuman_selection.tsv")) as f:
    firstColumn = [line.split('\t')[0] for line in f]

#fileselection=pd.read_csv(os.path.join(fasta_dir,"CDS_AAidentities_toHuman_selection.tsv"))
#matrix2 = fileselection[fileselection.columns[0]]
#list2 = matrix2.tolist()
print(firstColumn)

# Get list of fasta files
fasta_files = {file[:-3]:os.path.join(fasta_dir,batch,file) for batch in os.listdir(fasta_dir) if os.path.isdir(os.path.join(fasta_dir,batch)) for file in os.listdir(os.path.join(fasta_dir,batch)) if file.endswith(".fa")}

outdf = pd.DataFrame()
for file in fasta_files.keys():
    # Get cds and gene info
    splitted_name = file.split("_")
    cds = splitted_name[1]
    if cds in firstColumn:
    #if cds.isin(firstColumn):
        print(cds)
        outdf.loc[cds,"gene"] = splitted_name[0]
        # Open file
        alignment = openfasta(fasta_files[file])
        # Compare each codon to that of human
        codpositions = split_codons(alignment["Human"])
        humancod = [alignment["Human"][codpositions[n]:codpositions[n+1]] for n in range(len(codpositions)-1)]
        humanaa = [translate_codon(c) for c in humancod]
        numberofspecies = 0
        for species in alignment.keys():
            speciescod = [alignment[species][codpositions[n]:codpositions[n+1]] for n in range(len(codpositions)-1)]
            speciesaa = [translate_codon(c) for c in speciescod]
            identity = [cod==humancod[n] for n,cod in enumerate(speciescod) if (humanaa[n]==speciesaa[n]) and humanaa[n] and speciesaa[n]]
            if len(identity)>=10:
                numberofspecies = numberofspecies + 1
                outdf.loc[cds,species] = sum(identity)/len(identity)
        outdf.loc[cds,"numberofspecies"] = numberofspecies
        cols = [col for col in outdf.columns if col not in ["numberofspecies", "average","gene"]]
        outdf.loc[cds,"average"] = (outdf.loc[cds ,cols].sum(skipna = True)) / numberofspecies

# Save results
outdf.to_csv(os.path.join(fasta_dir,"CDS_identities_toHuman.tsv"), sep="\t")