Search code examples
python cs50 dna-sequence

CS50 pset 6 DNA works with small.csv but not large.csv


This is my code for the problem set week 6 DNA. When I test with the small.csv it works correctly but when testing with the large.csv it seems to incorrectly count the repeating sequence. Can anyone help me find the error in my code? I am very new to this.

import csv
import sys
# Require exactly two command-line arguments: the STR-count database (CSV)
# and the DNA sequence file.
if len(sys.argv) != 3:
        sys.exit("Usage: python dna.py STRcounts DNASequence")
# `check` is True only until the first CSV row (the header) has been consumed.
check = True
STRlist = []
Humanlist = []
# Split the database into its header row (STRlist) and its data rows (Humanlist).
with open(sys.argv[1],"r") as STR:
    readSTR = csv.reader(STR)
    for row in readSTR:
        if check:
            STRlist.append(row)
            check = False
        else:
            Humanlist.append(row)
# Keep only the STR names by dropping the "name" entry from the header row.
Slist = STRlist[0]
Slist.remove("name")
# print(Humanlist)
# print(Slist)
seq=[]
# Read the sequence file through csv.reader; `seq` ends up holding the last row read.
with open(sys.argv[2],"r") as text:
    readtext = csv.reader(text)
    for i in readtext:
        seq = i
# The first field of that row is the DNA string (this rebinds `text`, which
# was the file handle above).
text = seq[0]
# print(text)
# Build a dictionary mapping each STR to a count, starting at zero.

STRdict = {}
for STR in Slist:
    STRdict[STR] = 0
# NOTE: this adds one for every position at which STR occurs anywhere in the
# sequence; it does not measure the longest run of back-to-back repeats.
for STR in Slist:
    for letter in range(len(text)):
        if STR == text[letter:letter+len(STR)]:
            STRdict[STR] += 1
# `check` is reused here as the "a matching person was found" flag.
check = False
for human in range(len(Humanlist)):
    for STR in range(len(Slist)):
        # Offset by one because index 0 of each row is printed as the person's
        # name below. Both sides are compared as strings since CSV fields are text.
        if str(STRdict[Slist[STR]]) == str(Humanlist[human][STR+1]):
            check = True
        else:
            # One differing count rules this person out.
            check = False
            break
    if check:
        # Every STR count agreed: report the first matching person and stop.
        print(Humanlist[human][0])
        break
if not check:
    print("no match")

Solution

  • I commented out the unnecessary portions and added code to find the maximum number of consecutive STR repeats. The rest of your code is unchanged, and I get the expected results.

    I didn't examine all of the code for possible improvements, but it does get the correct results.

    The reason your code wasn't correct is that it counted all occurrences of the STR in the string instead of counting consecutive repeats (and then finding the max number of repeats).

    import csv
    import sys
    def longest_run(sequence, motif):
        """Return the longest run of back-to-back copies of ``motif``.

        Every start position is considered, so a run that begins inside an
        earlier match (possible when the end of the motif overlaps its own
        beginning, e.g. "ATA") is still found.

        Args:
            sequence: DNA text to scan.
            motif: The short tandem repeat (STR) to look for.

        Returns:
            The maximum number of consecutive repeats, or 0 when the motif
            is empty or never occurs.
        """
        width = len(motif)
        if width == 0:
            # An empty motif matches everywhere without advancing; treat it
            # as "no repeats" instead of looping forever.
            return 0
        # runs[i] is the number of consecutive copies that start at index i.
        # The extra ``width`` zeros let runs[i + width] be read past the end.
        runs = [0] * (len(sequence) + width)
        # Walk right to left so runs[i + width] is final before runs[i] is set.
        for i in range(len(sequence) - width, -1, -1):
            if sequence.startswith(motif, i):
                runs[i] = runs[i + width] + 1
        return max(runs)


    def find_match(people, profile):
        """Return the name of the first person whose STR counts equal ``profile``.

        Args:
            people: Database rows, each ``[name, count, count, ...]`` with the
                counts as strings in header order.
            profile: The sequence's counts as strings, in the same order.

        Returns:
            The first field of the matching row, or ``None`` if nobody matches.
        """
        if not profile:
            # With no STR columns there is nothing to compare on.
            return None
        for row in people:
            # Slicing also tolerates blank or short rows: they never match.
            if row[1:len(profile) + 1] == profile:
                return row[0]
        return None


    def main():
        """Print whose DNA the sequence file holds, or "no match"."""
        if len(sys.argv) != 3:
            sys.exit("Usage: python dna.py STRcounts DNASequence")

        # The first CSV row is the header; every later row is one person.
        with open(sys.argv[1], "r") as database:
            rows = list(csv.reader(database))
        if not rows:
            sys.exit("Database file is empty: " + sys.argv[1])
        header, people = rows[0], rows[1:]
        # Column 0 is the name column; the remaining header cells are the STRs.
        strs = header[1:]

        # The sequence file is plain text, so read it directly.
        with open(sys.argv[2], "r") as handle:
            sequence = handle.read().strip()

        # CSV fields are strings, so compare the counts as strings too.
        profile = [str(longest_run(sequence, motif)) for motif in strs]
        name = find_match(people, profile)
        print(name if name is not None else "no match")


    if __name__ == "__main__":
        main()
    

    This question is from a Harvard CS50 problem set.