SEQUENCE = []
with open("genome.fasta") as genome: # read chromosome sequence from fasta-file
    genome.readline()
    for line in genome:
        SEQUENCE.append(line.strip())
SEQUENCE = "".join(SEQUENCE)
CDS = []
with open("genome.gff") as ft: # read all CDS and information about them from annotation file gff3
    for line in ft:
        row = line.strip().split("\t")
        if (line.startswith("#") or not line.strip()) or row[2] != "CDS":
            continue
        start = int(row[3])
        end = int(row[4])
        strand = row[6]
        info = row[8].split(";")[-3][8:]
        CDS.append((start, end, strand, info))


def complement(seq): # function, that return complement seq for input seq
    pairs = {"A": "T", "G": "C", "C": "G", "T": "A"}
    result = []
    for letter in seq:
        result.append(pairs[letter])
    return "".join(result)[::-1]


def cutter(seq, pos, number): # function, that cut a part with number length before (if number < 0) or after (number > 0) from seq for pos (include pos)
    if number > 0:
        if pos + number > len(seq):
            return seq[pos:] + seq[:pos + number - len(seq)]
        return seq[pos:pos + number]
    if pos + number < 0:
        return seq[pos + number + 1:] + seq[:pos + 1]
    return seq[pos + number:pos]


with open("POSITIVE.fasta", "w") as file: # create fasta-file with positive-control group
    i = 1
    _seq = ""
    for cds in CDS:
        print(f">Seq{i} {cds[3]}", file=file)
        if cds[2] == "+":
            _seq = cutter(SEQUENCE, cds[0], -26)[:-1]
        else:
            _seq = complement(cutter(SEQUENCE, cds[1], 26)[1:])
        print(_seq, file=file)
        i += 1
with open("NEGATIVE.fasta", "w") as file: # create fasta-file with negative-control group
    i = 1
    _seq = ""
    for cds in CDS:
        print(f">Seq{i} {cds[3]}", file=file)
        if cds[2] == "+":
            _seq = cutter(SEQUENCE, cds[0], 25)
        else:
            _seq = complement(cutter(SEQUENCE, cds[1], -25))
        print(_seq, file=file)
        i += 1
with open("TRAIN.fasta", "w") as file: # create fasta-file with train-group
    kws = ["ibosomal", "polymerase", "ranscription", "ranslation"]
    i = 1
    _seq = ""
    for cds in CDS:
        if not any(kw in cds[3] for kw in kws):
            continue
        print(f">Seq{i} {cds[3]}", file=file)
        if cds[2] == "+":
            _seq = cutter(SEQUENCE, cds[0], -26)[:-1]
        else:
            _seq = complement(cutter(SEQUENCE, cds[1], 26)[1:])
        print(_seq, file=file)
        i += 1
