cs50 dna pset6 Works on small database but not on large - python

This is my solution to CS50 pset6 DNA problem in python. It works fine on small database but gives an
Index error: List Index Out of range.
I tried print to see where is the error.. It prints out large database as well. Not sure what to do next.
import csv
import sys
def main():
# TODO: Check for command-line usage
if len(sys.argv) != 3:
print("Usage: python dna.py database.csv sequence.txt")
sys.exit(1)
# TODO: Read database file into a variable
dna_database =[]
with open(sys.argv[1], "r") as dna_data_file:
reader = csv.DictReader(dna_data_file)
for row in reader:
dna_database.append(row)
# TODO: Read DNA sequence file into a variable
with open(sys.argv[2], "r") as load_sequence:
sequence = load_sequence.read()
# TODO: Find longest match of each STR in DNA sequence
STR = list(dna_database[0].keys())[1:]
STR_match ={}
for i in range(len(dna_database)):
# print(dna_database)
STR_match[STR[i]] = longest_match(sequence,STR[i])
# TODO: Check database for matching profiles
for i in range(len(dna_database)):
matches = 0
for j in range(len(STR)):
if int(STR_match[STR[j]]) == int(dna_database[i][STR[j]]):
matches += 1
if matches == len(STR):
print(dna_database[i]['name'])
sys.exit(0)
print("No Match")
return
def longest_match(sequence, subsequence):
"""Returns length of longest run of subsequence in sequence."""
# Initialize variables
longest_run = 0
subsequence_length = len(subsequence)
sequence_length = len(sequence)
# Check each character in sequence for most consecutive runs of subsequence
for i in range(sequence_length):
# Initialize count of consecutive runs
count = 0
# Check for a subsequence match in a "substring" (a subset of characters) within sequence
# If a match, move substring to next potential match in sequence
# Continue moving substring and checking for matches until out of consecutive matches
while True:
# Adjust substring start and end
start = i + count * subsequence_length
end = start + subsequence_length
# If there is a match in the substring
if sequence[start:end] == subsequence:
count += 1
# If there is no match in the substring
else:
break
# Update most consecutive matches found
longest_run = max(longest_run, count)
# After checking for runs at each character in seqeuence, return longest run found
return longest_run
main()

Related

how to count the biggest consecutive occurences of substring in string?

I'm doing an exercise (cs50 - DNA) where I have to count specific consecutive substrings (STRS) mimicking DNA sequences, I'm finding myself overcomplicating my code and I'm having a hard time figuring out how to proceed.
I have a list of substrings:
strs = ['AGATC', 'AATG', 'TATC']
And a String with a random sequence of letters:
AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
I want to count the biggest consecutive substrings that match each strs.
So:
'AGATC' - AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
'AATG' - AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
'TATC' - AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
resulting in [4, 1, 5]
(Note that this isn't the best example since there are no random repeating patterns scatered around but I think it illustrates what I'm looking for)
I know that I should be something of the likes of re.match(rf"({strs}){2,}", string) because str.count(strs) will give me ALL consecutive and non consecutive items.
My code so far:
#!/usr/bin/env python3
import csv
import sys
from cs50 import get_string
# sys.exit to terminate the program
# sys.exit(2) UNIX default for wrong args
if len(sys.argv) != 3:
print("Usage: python dna.py data.csv sequence.txt")
sys.exit(2)
# open file, make it into a list, get STRS, remove header
with open(sys.argv[1], "r") as database:
data = list(csv.reader(database))
STRS = data[0]
data.pop(0)
# remove "name" so only thing remaining are STRs
STRS.pop(0)
# open file to compare agaist db
with open(sys.argv[2], "r") as seq:
sequence = seq.read()
sequenceCount = []
# for each STR count the occurences
# sequence.count(s) returns all
for s in STRS:
sequenceCount.append(sequence.count(s))
print(STRS)
print(sequenceCount)
"""
sequenceCount = {}
# for each STR count the occurences
for s in STRS:
sequenceCount[s] = sequence.count(s)
for line in data:
print(line)
for item in line[1:]:
continue
# rf"({STRS}){2,}"
"""
Regular expression for finding repeating strings is like r"(AGATC)+".
For example,
import re
sequence = "AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG"
pattern = "AGATC"
r = re.search(r"({})+".format(pattern), sequence)
if r:
print("start at", r.start())
print("end at", r.end())
If a match is found, then you can access the starting and ending position by .start and .end methods. You can calculate the repetition using them.
If you need to find all matches in the sequence, then you can use re.finditer, which gives you match objects iteratively.
You can loop over target patterns and find the longest one.
Here using two for loops; one to grab each string (sequence) from strs, and the other to iterate over our dna strand to match each string from strs against it, and a while loop is used if a match was found to keep looking for consecutive (back2back) matches. (Added inline comments to give brief explanations on each step)
dna = 'AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATAGATCTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG'
strs = ['AGATC', 'AATG', 'TATC']
def seq_finder(sequence, dna):
start = 0 # Will allow us to skip scanned sequences
counter = [0] * len(sequence) # Create a list of zeros to store sequence occurrences
for idx, seq in enumerate(sequence): # Iterate over every entry in our sequence "strs"
k = len(seq)
holder = 0 # A temporarily holder that will store #occurrences of *consecutive* sequences
for i in range(start, len(dna)): # For each sequence, iterate over our "dna" strand
if dna[i:i+k] == strs[idx]: # If match is found:
holder += 1 # Increment our holder by 1
while dna[i:i+k] == dna[i+k:i+k*2]: # If our match has an identical match ahead (consecutively):
holder += 1 # Increment our holder by 1
i += k # Start the next list indexing from our new match
start = i + 1 # To skip repetitive iterations over same matches
if holder > counter[idx]:
counter[idx] = holder # Only replace counter if new holder > old holder
holder = 0 # Reset the holder when we existed our of our while loop (finished finding consecutives)
return counter

Scrabble cheater: scoring wildcard characters to zero in Python

I'm new to python world, and I made a code of scrabble finder with two wildcards (* and ?) in it. When scoring the word, I would like to score wildcard letters to zero, but it looks like it doesn't work. I'm wondering what is missing here.
When you look into the line after "# Add score and valid word to the empty list", I tried to code if a letter in the word is not in the rack, I removed the letter so that I can only score other characters that are not coming from wildcards and matches with the letter in the rack. For example, if I have B* in my rack and the word is BO, I would like to remove O and only score B so that I can score wildcard to zero.
But the result is not what I expected.
import sys
if len(sys.argv) < 2:
print("no rack error.")
exit(1)
rack = sys.argv[1]
rack_low = rack.lower()
# Turn the words in the sowpods.txt file into a Python list.
with open("sowpods.txt","r") as infile:
raw_input = infile.readlines()
data = [datum.strip('\n') for datum in raw_input]
# Find all of the valid sowpods words that can be made
# up of the letters in the rack.
valid_words = []
# Call each word in the sowpods.txt
for word in data:
# Change word to lowercase not to fail due to case.
word_low = word.lower()
candidate = True
rack_letters = list(rack_low)
# Iterate each letter in the word and check if the letter is in the
# Scrabble rack. If used once in the rack, remove the letter from the rack.
# If there's no letter in the rack, skip the letter.
for letter in word_low:
if letter in rack_letters:
rack_letters.remove(letter)
elif '*' in rack_letters:
rack_letters.remove('*')
elif '?' in rack_letters:
rack_letters.remove('?')
else:
candidate = False
if candidate == True:
# Add score and valid word to the empty list
total = 0
for letter in word_low:
if letter not in rack_letters:
word_strip = word_low.strip(letter)
for letter in word_strip:
total += scores[letter]
valid_words.append([total, word_low])
I'm going to go a slightly different route with my answer and hopefully speed the overall process up. We're going to import another function from the standard library -- permutations -- and then find possible results by trimming the total possible word list by the length of the rack (or, whatever argument is passed).
I've commented accordingly.
import sys
from itertools import permutations # So we can get our permutations from all the letters.
if len(sys.argv) < 2:
print("no rack error.")
exit(1)
rack = sys.argv[1]
rack_low = rack.lower()
# Turn the words in the sowpods.txt file into a Python list.
txt_path = r'C:\\\\\sowpods.txt'
with open(txt_path,'r') as infile:
raw_input = infile.readlines()
# Added .lower() here.
data = [i.strip('\n').lower() for i in raw_input]
## Sample rack of 7 letters with wildcard character.
sample_rack = 'jrnyoj?'
# Remove any non-alphabetic characters (i.e. - wildcards)
# We're using the isalpha() method.
clean_rack = ''.join([i for i in sample_rack if i.isalpha()])
# Trim word list to the letter count in the rack.
# (You can skip this part, but it might make producing results a little quicker.)
trimmed_data = [i for i in data if len(i) <= len(clean_rack)]
# Create all permutations from the letters in the rack
# We'll iterate over a count from 2 to the length of the rack
# so that we get all relevant permutations.
all_permutations = list()
for i in range(2, len(clean_rack) + 1):
all_permutations.extend(list(map(''.join, permutations(clean_rack, i))))
# We'll use set().intersection() to help speed the discovery process.
valid_words = list(set(all_permutations).intersection(set(trimmed_data)))
# Print sorted list of results to check.
print(f'Valid words for a rack containing letters \'{sample_rack}\' are:\n\t* ' + '\n\t* '.join(sorted(valid_words)))
Our output would be the following:
Valid words for a rack containing letters 'jrnyoj?' are:
* jo
* jor
* joy
* no
* nor
* noy
* ny
* on
* ony
* or
* oy
* yo
* yon
If you want to verify that the results are actually in the sowpods.txt file, you can just index the sowpods.txt list by where the word you want to look up is indexed:
trimmed_data[trimmed_data.index('jor')]
When you are totalling the scores you are using the words from the wordlist and not the inputted words:
total=0
for letter in word_low:
...
Rather, this should be:
total=0
for letter in rack_low:
...
Also, You do not need to loop and remove the letters with strip at the end.
you can just have:
total = 0
for letter in rack_low:
if letter not in rack_letters:
try:
total += scores[letter]
except KeyError: # If letter is * or ? then a KeyError occurs
pass
valid_words.append([total, word_low])

I want to create a program that finds the longest substring in alphabetical order using python using basics

my code for finding longest substring in alphabetical order using python
what I mean by longest substring in alphabetical order?
if the input was"asdefvbrrfqrstuvwxffvd" the output wil be "qrstuvwx"
#we well use the strings as arrays so don't be confused
s='abcbcd'
#give spaces which will be our deadlines
h=s+' (many spaces) '
#creat outputs
g=''
g2=''
#list of alphapets
abc='abcdefghijklmnopqrstuvwxyz'
#create the location of x"the character the we examine" and its limit
limit=len(s)
#start from 1 becouse we substract one in the rest of the code
x=1
while (x<limit):
#y is the curser that we will move the abc array on it
y=0
#putting our break condition first
if ((h[x]==' ') or (h[x-1]==' ')):
break
for y in range(0,26):
#for the second character x=1
if ((h[x]==abc[y]) and (h[x-1]==abc[y-1]) and (x==1)):
g=g+abc[y-1]+abc[y]
x+=1
#for the third to the last character x>1
if ((h[x]==abc[y]) and (h[x-1]==abc[y-1]) and (x!=1)):
g=g+abc[y]
x+=1
if (h[x]==' '):
break
print ("Longest substring in alphabetical order is:" +g )
it doesn't end,as if it's in infinite loop
what should I do?
I am a beginner so I want some with for loops not functions from libraries
Thanks in advance
To avoid infinite loop add x += 1 in the very end of your while-loop. As a result your code works but works wrong in general case.
The reason why it works wrong is that you use only one variable g to store the result. Use at least two variables to compare previous found substring and new found substring or use list to remember all substrings and then choose the longest one.
s = 'abcbcdiawuhdawpdijsamksndaadhlmwmdnaowdihasoooandalw'
longest = ''
current = ''
for idx, item in enumerate(s):
if idx == 0 or item > s[idx-1]:
current = current + item
if idx > 0 and item <= s[idx-1]:
current = ''
if len(current)>len(longest):
longest = current
print(longest)
Output:
dhlmw
For your understanding 'b'>'a' is True, 'a'>'b' is False etc
Edit:
For longest consecutive substring:
s = 'asdefvbrrfqrstuvwxffvd'
abc = 'abcdefghijklmnopqrstuvwxyz'
longest = ''
current = ''
for idx, item in enumerate(s):
if idx == 0 or abc.index(item) - abc.index(s[idx-1]) == 1:
current = current + item
else:
current = item
if len(current)>len(longest):
longest = current
print(longest)
Output:
qrstuvwx
def sub_strings(string):
substring = ''
string +='\n'
i = 0
string_dict ={}
while i < len(string)-1:
substring += string[i]
if ord(substring[-1])+1 != ord(string[i+1]):
string_dict[substring] = len(substring)
substring = ''
i+=1
return string_dict
s='abcbcd'
sub_strings(s)
{'abc': 3, 'bcd': 3}
To find the longest you can domax(sub_strings(s))
So here which one do you want to be taken as the longest??. Now that is a problem you would need to solve
You can iterate through the string and keep comparing to the last character and append to the potentially longest string if the current character is greater than the last character by one ordinal number:
def longest_substring(s):
last = None
current = longest = ''
for c in s:
if not last or ord(c) - ord(last) == 1:
current += c
else:
if len(current) > len(longest):
longest = current
current = c
last = c
if len(current) > len(longest):
longest = current
return longest
so that:
print(longest_substring('asdefvbrrfqrstuvwxffvd'))
would output:
qrstuvwx

How to find the largest set of consecutive integers in python?

I'm looking at a very large set of binary data which is in a separate file. The challenge is to find the largest consecutive number of 1's and 0's. I have already accessed the file from within Python (I'm using Python btw) and have been able to code to find out the total number of 0's and 1's. any help would be much appreciated as I am a total beginner to coding when using Python. Cheers.
This what I've done thus far:
filename = "C:/01.txt"
file = open(filename, "r")
count_1 = 0
count_0 = 0
for line in file:
count_0 = count_0 + line.count("0")
count_1 = count_1 + line.count("1")
pass
print("Number of 1s = " + str(count_1))
print("Number of 0s = " + str(count_0))
I have not actually started the coding to find the consecutive numbers.
To find the longest occurrence of a certain substring, you could use a function like this one:
def longest_segment(sub, string):
return max(m.group() for m in re.finditer(r'(%s)\1*' % sub, string))
This works by finding all occurrences of the provided substring, sub, in the string, and returning the longest one.
Here is a simple solution: Loop through the data, count consecutive 1s read and when reading a 0 (meaning you reached the end of one segment) compare it's length to the longest segment of consecutive 1s found so far.
def getMaxSegmentLength(readable):
current_length= 0
max_length= 0
for x in readable:
if x == '1':
current_length+= 1
else:
max_length= max(max_length, current_length)
current_length= 0
return max(max_length, current_length)
def main():
# open a file located in G:/input.txt in read mode and name the file object: inputf
with open('G:/input.txt', 'r') as inputf:
# put all the text in filef in s
s= inputf.read()
# get the longest streak of 1s in string s
n= getMaxSegmentLength(s)
print(n)
if __name__ == '__main__':
main()
s=raw_input() #read s from file in this case
zero=0
one=0
zz=0
oo=0
for i in list(s):
if i=='1':
if zz>=1:
zero=max(zero,zz)
zz=0
oo+=1
else:
if oo>=1:
one=max(one,oo)
oo=0
zz+=1
if oo>=1:
one=max(oo,one)
if zz>=1:
zero=max(zero,zz)
print zero,one
#O(n)

Transforming source word into target word

I need some help with my code. I need to convert one input word into another, changing one letter at a time. currently my program does this but very inefficiently and does not find the shortest route. Any help would be appreciated.
import re
def same(item, target):
return len([c for (c, t) in zip(item, target) if c == t])
def build(pattern, words, seen, list):
return [word for word in words
if re.search(pattern, word) and word not in seen.keys() and
word not in list]
def find(word, words, seen, target, path):
list = []
for i in range(len(word)):
list += build(word[:i] + "." + word[i + 1:], words, seen, list)
if len(list) == 0:
return False
list = sorted([(same(w, target), w) for w in list])
for (match, item) in list:
if match >= len(target) - 1:
if match == len(target) - 1:
path.append(item)
return True
seen[item] = True
for (match, item) in list:
path.append(item)
if find(item, words, seen, target, path):
return True
path.pop()
fname = 'dictionary.txt'
file = open(fname)
lines = file.readlines()
while True:
start = input("Enter start word:")
words = []
for line in lines:
word = line.rstrip()
if len(word) == len(start):
words.append(word)
target = input("Enter target word:")
break
count = 0
path = [start]
seen = {start : True}
if find(start, words, seen, target, path):
path.append(target)
print(len(path) - 1, path)
else:
print("No path found")
edit: Below is another failed attempt by me to fix this problem by trying a different approach. This time it does not seem to loop properly.
def find(start, words, target): # find function. Word = start word, words =
start=list(start)
target=list(target)
print("Start word is ", start)
print("Target word is ", target)
letter = 0
while start != target:
if letter == len(target):
letter = 0
continue
elif start[letter] == target[letter]:
letter = letter + 1
continue
else:
testword = list(start)
testword[letter] = target[letter]
testword = ''.join(testword)
if testword in words:
start[letter] = target[letter]
letter = letter + 1
print(start)
continue
else:
letter = letter + 1
continue
letter = letter + 1
continue
fname = "dictionary.txt"
file = open(fname) # Open the dictionary
lines = file.readlines() # Read each line from the dictionary and store it in lines
while True: # Until ended
start = input("Enter start word:") # Take a word from the user
words = [] # Inititialise Array 'words'
for line in lines: # For each line in the dictionary
word = line.rstrip() #strip all white space and characters from the end of a string
if len(word) == len(start):
words.append(word)
if start not in words:
print("Your start word is not valid")
continue
target = input("Enter target word:")
if len(start) != len(target):
print("Please choose two words of equal length")
continue
if target not in words:
print("Your target word is not valid")
continue
break
edit: Here is the basic algorithm to the code. (Both variants are compatiable with my purpose).
-input start word
-input target word
- if len(start) = len(target)
continue
-check dictionary to see if target and start words are present
- find what letters are different from the start to target word
- change one different letter in the start word until start word
=target
word #after each letter change, output must be valid word in dictionary
The goal is to achieve this in the least amount of steps which is not achieved, the first section of code does this, I think but in a huge amount of steps I know could be far more efficient
Here's a breadth-first search that doesn't use any 3rd party modules. I don't guarantee that it finds the shortest solutions, but it appears to work. ;) It stops when it finds a solution, but due to the random order of sets each run of the program may find a different solution for a given start & target pair.
import re
# The file containing the dictionary
fname = '/usr/share/dict/words'
start, target = 'hide', 'seek'
wlen = len(start)
wrange = range(wlen)
words = set()
with open(fname) as f:
for word in f:
w = word.rstrip()
# Grab words of the correct length that aren't proper nouns
# and that don't contain non-alpha chars like apostrophe or hyphen
if len(w) == wlen and w.islower() and w.isalpha():
words.add(w)
print('word set size:', len(words))
# Build a regex to find words that differ from `word` by one char
def make_pattern(word):
pat = '|'.join(['{}.{}'.format(word[:i], word[i+1:]) for i in wrange])
return re.compile(pat)
# Find words that extend each chain of words in `seq`
def find(seq):
result = []
seen = set()
for current in seq:
pat = make_pattern(current[-1])
matches = {w for w in words if pat.match(w)} - seen
if target in matches:
return current + (target,)
result.extend(current + (w,) for w in matches)
seen.update(matches)
words.difference_update(matches)
seq[:] = result
# Search for a solution
seq = [(start,)]
words.discard(start)
while True:
solution = find(seq)
if solution:
print(solution)
break
size = len(seq)
print(size)
if size == 0:
print('No solutions found')
break
typical output
word set size: 2360
9
55
199
479
691
('hide', 'hire', 'here', 'herd', 'heed', 'seed', 'seek')
I ought to mention that all those word chains chew up a bit of RAM, I'll try to think of a more compact approach. But it shouldn't really be a problem on modern machines, unless you're working with really large words.
Using a bit of preprocessing to group equal length words, you can use the networkx 3rd party library to build a graph, then use its shortest_path algorithm to retrieve it. Note that I've used the default dictionary available on most *nix systems and limited it to words of 5 characters or less.
from collections import defaultdict
import networkx as nx
# Group the words into equal length so we only compare within words of equal length
with open('/usr/share/dict/words') as fin:
words = defaultdict(set)
for word in (line.strip() for line in fin if line.islower() and len(line) <= 6):
words[len(word)].add(word)
graph = nx.Graph()
for k, g in words.items():
while g:
word = g.pop()
matches = {w for w in g if sum(c1 != c2 for c1, c2 in zip(word, w)) == 1}
graph.add_edges_from((word, match) for match in matches)
Then, get the shortest route, eg:
In [1]: ' -> '.join(nx.shortest_path(graph, 'hide', 'seek'))
Out[1]: 'hide -> hire -> here -> herd -> heed -> seed -> seek'
In [2]: ' -> '.join(nx.shortest_path(graph, 'cat', 'dog'))
Out[2]: 'cat -> cot -> cog -> dog'

Categories

Resources