# wk6/pset/dna/dna.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

import csv
import sys


def main():
    """Report which database profile matches a DNA sequence file.

    Expects exactly two command-line arguments after the script name: the
    path of a ``.csv`` database and the path of a ``.txt`` file holding a DNA
    sequence. When the arguments are wrong, prints an explanatory message and
    returns without doing any work.

    Otherwise, computes the longest run of every STR named in the database
    header and prints the first-column value of the first row whose counts
    all equal those runs, or ``No match`` when no row qualifies.

    Raises:
        OSError: if either file cannot be opened.
    """

    # sys.argv[0] is the script itself, so two user arguments means three entries
    if len(sys.argv) != 3:
        print("Oopsie you used the wrong number of arguments D: you should use two next time :3")
        return
    database_path, sequence_path = sys.argv[1], sys.argv[2]

    # check to see if the first argument is a csv file
    if not database_path.endswith('.csv'):
        print("Oh no, that isn't a csv file :( that won't work can you fix that please?")
        return
    # check to see if the second argument is a txt file
    if not sequence_path.endswith('.txt'):
        print("Oh no, that isn't a txt file :( that won't work can you fix that please?")
        return

    # Read database file into a variable
    with open(database_path, newline='') as file:
        reader = csv.DictReader(file)
        # fieldnames is None when the file has no header row at all
        columns = reader.fieldnames or []
        rows = list(reader)

    # Read DNA sequence file into a variable, dropping the trailing newline
    with open(sequence_path) as file:
        sequence = file.read().strip()

    # NOTE(review): assumes the first column identifies the person and every
    # other column header is an STR whose cells hold integer repeat counts --
    # confirm against the database format.
    if len(columns) < 2:
        # Without at least one STR column there is nothing to compare
        print("No match")
        return
    name_column, *strs = columns

    # Find longest match of each STR in DNA sequence
    profile = {str_name: longest_match(sequence, str_name) for str_name in strs}

    # Check database for matching profiles
    for row in rows:
        try:
            is_match = all(int(row[str_name]) == profile[str_name] for str_name in strs)
        except (TypeError, ValueError):
            # Short or non-numeric rows cannot match any profile
            continue
        if is_match:
            print(row[name_column])
            return

    print("No match")
    return


# Given an entire DNA sequence and an STR (e.g. ACTG) as inputs, returns the length of the longest repetition
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is a series of back-to-back, non-overlapping copies of
    ``subsequence``; the result is the largest number of copies found in any
    such series.

    Args:
        sequence: The text to search.
        subsequence: The repeating unit to look for.

    Returns:
        The greatest count of consecutive copies, or 0 when there are none
        or when ``subsequence`` is empty.
    """

    unit_length = len(subsequence)

    # An empty unit matches everywhere without advancing the window, which
    # would loop forever below; treat it as having no runs.
    if unit_length == 0:
        return 0

    longest_run = 0

    # Try every position in the sequence as the possible start of a run
    for origin in range(len(sequence)):

        # Count consecutive copies starting at this position
        count = 0
        window_start = origin

        # Slide the window one whole unit at a time while it keeps matching
        while sequence[window_start:window_start + unit_length] == subsequence:
            count += 1
            window_start += unit_length

        # Remember the best run seen so far
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run


# Run the program only when this file is executed as a script, so that
# importing it (e.g. to reuse longest_match) has no side effects.
if __name__ == "__main__":
    main()