This is my code for the Week 6 problem set, DNA. When I test it with small.csv it works correctly, but when I test it with large.csv it seems to count the repeating sequences incorrectly. Can anyone help me find the error in my code? I am very new to this.
import csv
import sys

# Exactly two arguments are required: the STR-count CSV and the DNA sequence file.
if len(sys.argv) != 3:
    sys.exit("Usage: python dna.py STRcounts DNASequence")

# Load the database: the first row is the header, every later row is one person.
with open(sys.argv[1], "r") as database:
    rows = list(csv.reader(database))
markers = rows[0]
people = rows[1:]
# Drop the "name" heading so that only the STR markers remain.
markers.remove("name")

# Load the sequence: keep the first field of the last row in the file.
last_row = []
with open(sys.argv[2], "r") as sequence_file:
    for record in csv.reader(sequence_file):
        last_row = record
dna = last_row[0]

# Tally, for each marker, every position in the sequence where it occurs.
# NOTE: this counts all occurrences (overlapping ones included), not the
# longest run of back-to-back repeats.
tally = dict.fromkeys(markers, 0)
for marker in markers:
    width = len(marker)
    tally[marker] += sum(
        1 for start in range(len(dna)) if dna[start:start + width] == marker
    )

# Report the first person whose every count equals the tally; a database
# without any marker columns never matches anyone.
for person in people:
    if markers and all(
        str(tally[marker]) == str(person[column])
        for column, marker in enumerate(markers, start=1)
    ):
        print(person[0])
        break
else:
    print("no match")
I commented out the unnecessary portions and added code to find the maximum
number of consecutive STR repeats. The rest of your code is unchanged, and I get the expected results.
I didn't examine all of the code for possible improvements, but it does produce the correct results.
Your code was incorrect because it counted every occurrence of the STR in the string instead of counting consecutive repeats and then taking the longest run.
"""Identify whose STR profile matches a DNA sequence.

Usage: python dna.py STRcounts DNASequence

STRcounts is a CSV file whose first column holds a person's name and whose
remaining columns hold STR repeat counts; DNASequence is a text file
containing the DNA sequence to analyse.
"""
import csv
import sys


def _longest_run(sequence, pattern):
    """Return the longest number of back-to-back copies of *pattern*."""
    size = len(pattern)
    if size == 0:
        # An empty pattern "matches" everywhere and would never advance.
        return 0
    # runs[i] is the number of consecutive copies of pattern starting at i.
    # Filling it from the right lets every start position be considered,
    # so a run that begins inside an earlier match is not skipped.
    runs = [0] * (len(sequence) + 1)
    best = 0
    for start in range(len(sequence) - size, -1, -1):
        if sequence.startswith(pattern, start):
            runs[start] = runs[start + size] + 1
            best = max(best, runs[start])
    return best


def _load_database(path):
    """Return ``(str_names, profiles)`` read from the CSV file at *path*."""
    with open(path, newline="") as handle:
        rows = list(csv.reader(handle))
    if not rows:
        return [], []
    header, profiles = rows[0], rows[1:]
    # The first column is the person's name; the rest are STR names.
    return header[1:], profiles


def _load_sequence(path):
    """Return the DNA sequence stored at *path* without surrounding whitespace."""
    with open(path) as handle:
        return handle.read().strip()


def _find_match(profiles, str_names, counts):
    """Return the first name whose counts all equal *counts*, else ``None``."""
    if not str_names:
        # Nothing to compare against, so nobody can be identified.
        return None
    for row in profiles:
        if len(row) <= len(str_names):
            # Blank or truncated row: it cannot hold a full profile.
            continue
        if all(
            str(counts[name]) == value
            for name, value in zip(str_names, row[1:])
        ):
            return row[0]
    return None


def main(argv=None):
    """Run the matcher using *argv* (defaults to ``sys.argv``).

    Prints the matching person's name, or "no match" when no profile in the
    database agrees with the sequence on every STR. Exits with a usage
    message when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        sys.exit("Usage: python dna.py STRcounts DNASequence")

    str_names, profiles = _load_database(argv[1])
    sequence = _load_sequence(argv[2])
    counts = {name: _longest_run(sequence, name) for name in str_names}

    match = _find_match(profiles, str_names, counts)
    print(match if match is not None else "no match")


if __name__ == "__main__":
    main()
This question is from Harvard problem