blob: 4f967a98204ecbf2a21b23110c63465c777a35d1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
import csv
import sys
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Expects exactly two command-line arguments: the file name of a CSV
    database (opened under ``databases/``) and the file name of a text file
    holding a DNA sequence (opened under ``sequences/``). The CSV is read
    with a header row; the ``name`` column identifies the person and every
    other column header is treated as an STR whose cell holds that person's
    longest consecutive run of the STR.

    Prints the first matching person's name, or ``No match`` when nobody in
    the database fits. Prints a usage message and returns early when the
    number of arguments is wrong.
    """
    # sys.argv[0] is the script itself, so two user arguments means length 3.
    if len(sys.argv) != 3:
        print("Oopsie you used the wrong number of arguments D: you should use two next time :3")
        return

    # Read the database: one dict per person, keyed by the CSV header names.
    database = 'databases/' + sys.argv[1]
    with open(database, newline='') as file:
        reader = csv.DictReader(file)
        # fieldnames is None for an empty file; treat that as "no columns".
        fields = reader.fieldnames or []
        rows = list(reader)

    # Read the DNA sequence, dropping all whitespace so that a line break in
    # the file can never split a run of repeats.
    text = 'sequences/' + sys.argv[2]
    with open(text) as file:
        sequence = ''.join(file.read().split())

    # Longest run of each STR (every column except the person's name).
    str_names = [field for field in fields if field != 'name']
    counts = {name: longest_match(sequence, name) for name in str_names}

    # A person matches only when every STR count agrees with the sequence.
    for person in rows:
        try:
            # CSV cells are strings; compare them numerically with the counts.
            is_match = all(int(person[name]) == counts[name] for name in str_names)
        except (TypeError, ValueError):
            # A missing or non-numeric cell cannot equal an integer run length.
            is_match = False
        # Without any STR columns there is nothing to match on.
        if str_names and is_match:
            print(person['name'])
            return

    print('No match')
# Given an entire DNA sequence and an STR (e.g. "AGAT") as inputs, returns the
# length of the longest run of consecutive repetitions of that STR.
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A run is a series of back-to-back, non-overlapping copies of
    ``subsequence``; the result is the number of copies in the longest such
    series found starting at any position of ``sequence``.

    Returns 0 when ``subsequence`` does not occur, when ``sequence`` is
    empty, or when ``subsequence`` itself is empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern compares equal to every empty slice, so the scan below
    # would never stop counting; define its longest run as 0 instead.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Try every position in the sequence as the start of a run.
    for i in range(len(sequence)):
        count = 0

        # Keep stepping forward one whole subsequence at a time for as long
        # as the next window is another copy of the subsequence.
        while sequence.startswith(subsequence, i + count * subsequence_length):
            count += 1

        # Remember the best run seen so far.
        longest_run = max(longest_run, count)

    return longest_run
# Run the program only when this file is executed as a script, so that
# importing it (e.g. to reuse longest_match) does not trigger file I/O.
if __name__ == "__main__":
    main()
|