In [4]:
# Read the FASTA file and concatenate every sequence line into one string.
# Lines containing '>' (FASTA headers) are skipped.
# NOTE(review): if the file holds several records, joining them can create
# motif matches that span a record boundary -- confirm whether that matters.
lst = []
with open('GCA_000008865.2_ASM886v2_genomic.fna', mode='r') as fasta:
    # The context manager closes the file even if reading fails.
    for line in fasta:
        x = line.strip()
        if '>' not in x:
            lst.append(x)
sequence = ''.join(lst)

# Count occurrences of the motif by sliding a window across the sequence.
motif = 'GAATTC'
counter = 0
# The last valid window starts at len(sequence) - len(motif); range() excludes
# its stop value, hence the + 1 (the original bound skipped the final window).
for i in range(len(sequence) - len(motif) + 1):
    if sequence[i:i + len(motif)] == motif:
        counter += 1
print(counter)
1133
In [5]:
from scipy import stats

# One-sided binomial test (alternative='less'): probability of observing at
# most `counter` motif hits in len(sequence) trials when each trial succeeds
# with the given probability.
# NOTE(review): 0.0002392315 is presumably the expected per-position
# probability of the motif derived from base frequencies -- TODO confirm and
# compute it in the notebook instead of hard-coding it.
# `counter` comes from the motif-counting cell, so the test always uses the
# actual count rather than a number copied from an earlier output.
if hasattr(stats, 'binomtest'):
    # scipy.stats.binom_test was deprecated and later removed; binomtest
    # returns a result object whose .pvalue is the equivalent float.
    p_value = stats.binomtest(counter, len(sequence), 0.0002392315,
                              alternative='less').pvalue
else:
    # Fallback for SciPy releases that predate binomtest.
    p_value = stats.binom_test(counter, len(sequence), 0.0002392315,
                               alternative='less')
p_value
Out[5]:
7.897818517501663e-79
In [ ]: