運用Python解生物資訊問題(2)

練習一

The file orf_exons_chr17.txt contains a list of genes on chromosome 17 and their ORF exon sequences.

ORF exons are the parts of the exons of a gene that contribute to its ORF/CDS (i.e. without the UTRs).

It means that exons that are contained entirely in the UTRs (and thus have no contribution to the ORF at all) are not included in the list.

A) Parse the file into the following data structure: { ‘gene_symbol1’: ‘orf_exon_seq1’, ‘orf_exon_seq2’, ‘orf_exon_seq3’, …, ‘gene_symbol2’: ‘orf_exon_seq1’, ‘orf_exon_seq2’, ‘orf_exon_seq3’, …, ‘gene_symbol3’: ‘orf_exon_seq1’, ‘orf_exon_seq2’, ‘orf_exon_seq3’, …, … }














import os
from collections import defaultdict
file = os.path.abspath('orf_exons_chr17.txt')
with open(file, 'r') as f:
    content = f.readlines()
    last_gene = None
    orf_exons_per_gene = defaultdict(list)
    for line in content:
        line = line.rstrip('\n')
        if line.startswith('ORF Exon #'):
            exon_seq = line[line.find(':') + 2:]
            orf_exons_per_gene[last_gene].append(exon_seq)
        else:
            last_gene = line.replace(':', '')

B) How many genes are there in this data? How many ORF-contributing exons in total? What is the average number of ORF-contributing exons per gene?









num_exons = {}
for gene, exon_seq in orf_exons_per_gene.items():
    num = len(exon_seq)
    num_exons[gene] = num


print('There are %d genes' % (len(num_exons)))
print('There are %d orf contributing exons' % (sum(num_exons.values())))
print('There are average %.2f orf contributing exons per gene' % (sum(num_exons.values()) / len(num_exons)))

C) Count the number of genes per number of ORF-contributing exons (i.e. genes that have that number of ORF-contributing exons).






from collections import Counter

print('Number of genes per number of orf contributing exons')

for n_exon, n_gene in sorted(Counter(num_exons.values()).items(), reverse = True):
    print(f'{n_exon}: {n_gene}')

D)
Find the five genes with the highest number of ORF-contributing exons





def get_value(pair):
    return pair[1]
print('The genes with highest number of orf-contributing exons:')
for gene, num in sorted(num_exons.items(), key=get_value, reverse=True)[:5]:
     print(f'gene:{gene}:({num})')

E)
Write the names of all the genes with only a single ORF-contributing exon into a file.





with open('genes_with_single_exon.txt', 'w') as g:
    g.write('These are genes with only one orf-contributing exon:\n')
    for gene, num in sorted(num_exons.items()):
        if num == 1:
            g.write(f'{gene}\n')

F)
What is the average length of the ORF-contributing parts of exons?







all_exons = []
for exons in orf_exons_per_gene.values():
    all_exons.extend(exons)
total_len = 0
for exons in all_exons:
    total_len += len(exons)
print('Average length of orf contributing exons are: %.2f' % (total_len / len(all_exons)))

G)
Find the five shortest ORF-contributing parts of exons.



print('Five shortest orf-contributing exons:')
for e in sorted(all_exons, key=len)[:5]:
    print(e)

練習二

Use the file orf_exons_chr17.txt that you worked with in the lab exercise.

Recover the coding DNA and protein sequences of the genes. Find the genes on chromosome 17 with the DNA motif GCGCGCGCGC in their coding regions. Find those with the protein motif RKRKRK.


















CODON_TABLE = {
    'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
    'UAU': 'Y', 'UAC': 'Y', 'UAA': '*', 'UAG': '*',
    'UGU': 'C', 'UGC': 'C', 'UGA': '*', 'UGG': 'W',
    'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}



















gene_to_coding_dna = defaultdict(str)
for gene, exon in orf_exons_per_gene.items():
    gene_to_coding_dna[gene] = ''.join(exon)

def translate_dna(gene):
    gene = gene.replace('T', 'U')
    aa_seq = []
    for i in range(0, len(gene), 3):
        codon = gene[i: i + 3]
        aa = CODON_TABLE[codon]
        if aa == '*':
            break
        else:
            aa_seq.append(aa)
    return ''.join(aa_seq)
gene_to_protein = defaultdict(str)
for gene, coding_dna in gene_to_coding_dna.items():
    gene_to_protein[gene] = translate_dna(coding_dna)
    print(translate_dna(coding_dna))



for gene, coding_dna in gene_to_coding_dna.items():
    if 'GCGCGCGCGC' in coding_dna:
        print(gene)



for gene, protein in gene_to_protein.items():
    if 'RKRKRK' in protein:
        print(gene)

練習三

A) Write a program that generates random RNA transcripts, assuming that all 4 RNA nucleotides are of equal probability. Let each sequence begin with a start codon, and end only when a stop codon is introduced (make sure to take the reading frame into account).


















from random import choice
stop_codons = []
for codon, aa in CODON_TABLE.items():
    if aa == '*':
        stop_codons.append(codon)
def rand_nt():
    return choice('AGCU')
def new_codon():
    return rand_nt() + rand_nt() + rand_nt()
def rand_rna():
    rna = ['AUG']
    while True:
        codon = new_codon()
        rna.append(codon)
        if codon in stop_codons:
            break
    return ''.join(rna)
rand_rna()

B) Run the simulation 1,000 times. What would be the average transcript length if RNA sequences were created at random?







i = 0
len_rna = []
while i < 1000:
    len_rna.append(len(rand_rna()))
    i += 1
average_len = sum(len_rna) / len(len_rna)
print(f'The average rna length is {average_len}')

C) What is the mean transcript length of random RNA sequences as a function of (mean) GC content? Calculate for the following GC-content values: 10%, 20%, 30%, …, 90%.
Explain the results (how and why does the length of transcripts depend on GC content?).












from collections import defaultdict
gc_content_of_rna = defaultdict(list)
for _ in range(10000):
    random_rna = rand_rna()
    len_rna = len(random_rna)
    gc_content = (random_rna.count('G') + random_rna.count('C')) / len_rna
    gc_content = round(gc_content, 1)
    if 0.1 <= gc_content <= 0.9:
        gc_content_of_rna[gc_content].append(len_rna)
for gc_content, length in gc_content_of_rna.items():
    mean_length = sum(length) / len(length)
    print(f'Mean transcript length of GC content {gc_content * 100}%: {mean_length}')

運用Python解生物資訊問題(2)

練習一

練習二

練習三

Read more

RNA-Seq overivew slide

RNA-Seq Analysis

三角函數

FASTAQ format