https://cs50.harvard.edu/x/2020/psets/6/dna/#:~:text=python%20dna.py%20databases/large.csv%20sequences/5.txt
I'm trying to solve this problem from CS50 but it just works for the small database, when I try it for the large one the program overcounts.
import csv
if len(argv) != 3:
print("DIGITA DIREITO, IMBECIL")
exit()
with open(argv[1], "r") as source:
reader = list(csv.reader(source))
reader[0].remove("name")
i = reader[0]
with open(argv[2], "r") as sequence:
seq = sequence.read()
values = []
for j in range(len(i)):
value = 0
counter = 0
pos = 0
prevpos = 0
while pos < len(seq):
pos = seq.find(i[j], pos)
if pos == -1:
counter = 0
break
elif (pos != 1):
counter += 1
prevpos = pos
pos += len(i[j])
if value < counter:
value = counter
values.append(value)
for row in range(len(reader)):
print(reader[row])
print(values)
values = list(map(str, values))
search = list(reader)
search.pop(0)
for result in search:
if result[1:] == values:
print(f"{result[0]}")
break
elif result == search[-1]:
print("No match")
I think you are just counting the STRs repetitions in the sequence, not the maximum consecutive STR repetitions. This is what the problem asks
I want to create a dictionary within my function, but I cannot. The only possible solution I have found is to create the name of the dictionary outside the function and before calling it.
argentina = {}
def get_dic(file,phone):
for line in file.readlines():
if (line[0] == '#'):
names = line.rstrip().strip()
phone[names] = ''
else:
phone[names] = phone[names] + line.rstrip().strip()
get_dic(open(sys.argv[1],'r'), argentina)
get_dic(open(sys.argv[1],'r'), argentina) will open a dictionary called 'argentina', but I wont need to create argentina = {} in advance.
Just create the dict inside and return it from the function:
def get_dic(file):
phone = {}
names = None
for line in file.readlines():
if line[0] == '#':
names = line.strip()
phone[names] = ""
else:
phone[names] += line.strip()
return phone
argentina = get_dic(open(sys.argv[1],'r'))
I try to write a Python 3.4 code to index text document from external
and this my attempt. when run it error message:
raw input is not defined
What I want is:
to tokenize the document which is out of python 34 folder
to remove stop words
to stem
indexing
The code:
import string
def RemovePunc():
line = []
i = 0
text_input = ""
total_text_input = "C:Users\Kelil\Desktop\IRS_Assignment\project.txt"
#This part removes the punctuation and converts input text to lowercase
while i != 1:
text_input = raw_input
if text_input == ".":
i = 1
else:
new_char_string = ""
for char in text_input:
if char in string.punctuation:
char = " "
new_char_string = new_char_string + char
line = line + [new_char_string.lower()]
#This is a list with all of the text that was entered in
total_text_input = (total_text_input + new_char_string).lower()
return line
def RemoveStopWords(line):
line_stop_words = []
stop_words = ['a','able','about','across','after','all','almost','also','am','among',
'an','and','any','are','as','at','be','because','been','but','by','can',
'cannot','could','dear','did','do','does','either','else','ever','every',
'for','from','get','got','had','has','have','he','her','hers','him','his',
'how','however','i','if','in','into','is','it','its','just','least','let',
'like','likely','may','me','might','most','must','my','neither','no','nor',
'not','of','off','often','on','only','or','other','our','own','rather','said',
'say','says','she','should','since','so','some','than','that','the','their',
'them','then','there','these','they','this','tis','to','too','twas','us',
'wants','was','we','were','what','when','where','which','while','who',
'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your']
#this part removes the stop words for the list of inputs
line_stop_words = []
sent = ""
word = ""
test = []
for sent in line:
word_list = string.split(sent)
new_string = ""
for word in word_list:
if word not in stop_words:
new_string = new_string + word + " "
new_string = string.split(new_string)
line_stop_words = line_stop_words + [new_string]
return(line_stop_words)
def StemWords(line_stop_words):
leaf_words = "s","es","ed","er","ly","ing"
i=0
while i < 6:
count = 0
length = len(leaf_words[i])
while count < len(line_stop_words):
line = line_stop_words[count]
count2 = 0
while count2 < len(line):
#line is the particular list(or line) that we are dealing with, count if the specific word
if leaf_words[i] == line[count2][-length:]:
line[count2] = line[count2][:-length]
count2 = count2 + 1
line_stop_words[count] = line
count2 = 0
count = count + 1
count = 0
i = i + 1
return(line_stop_words)
def indexDupe(lineCount,occur):
if str(lineCount) in occur:
return True
else:
return False
def Indexing(line_stop_words):
line_limit = len(line_stop_words)
index = []
line_count = 0
while line_count < line_limit:
for x in line_stop_words[line_count]:
count = 0
while count <= len(index):
if count == len(index):
index = index + [[x,[str(line_count+1)]]]
break
else:
if x == index[count][0]:
if indexDupe(line_count+1,index[count][1]) == False:
index[count][1] += str(line_count+1)
break
count = count + 1
line_count = line_count + 1
return(index)
def OutputIndex(index):
print ("Index:")
count = 0
indexLength = len(index)
while count < indexLength:
print (index[count][0],)
count2 = 0
lineOccur = len(index[count][1])
while count2 < lineOccur:
print (index[count][1][count2],)
if count2 == lineOccur -1:
print ("")
break
else:
print (",",)
count2 += 1
count += 1
line = RemovePunc()
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)
index = Indexing(line_stop_words)
OutputIndex(index)
#smichak already put the right answer in the comments. raw_input was renamed to input in Python 3. So you want:
text_input = input()
Don't forget those parentheses, since you want to call the function.
import sys
import pickle
import string
def Menu():
print ("\n***********MENU************")
print ("0. Quit")
print ("1. Read text file")
print ("2. Display counts")
print ("3. Display statistics of word lengths")
print ("4. Print statistics to file")
def Loop():
choice = -1
while choice !=0:
Menu()
choice = (int(input("Please choose 1-4 to perform function. Press 0 to exit the program. Thank you. \n")))
if choice == 0:
print ("Exit program. Thank you.")
sys.exit
elif choice == 1:
user_File = ReadTextFile()
elif choice == 2:
DisplayCounts(user_File)
elif choice == 3:
DisplayStats(user_File)
elif choice == 4:
PrintStats(aDictionary)
else:
print ("Error.")
def ReadTextFile():
print "\n"
while True:
InputFile = input("Please enter a file name (NOTE: must have quotation marks around name and extension): ")
if (InputFile.lower().endswith('.txt')):
break
else:
print("That was an incorrect file name. Please try again.")
continue
return InputFile
def DisplayCounts(InputFile):
print "\n"
numCount = 0
dotCount = 0
commaCount = 0
lineCount = 0
wordCount = 0
with open(InputFile, 'r') as f:
for line in f:
wordCount+=len(line.split())
lineCount+=1
for char in line:
if char.isdigit() == True:
numCount+=1
elif char == '.':
dotCount+=1
elif char == ',':
commaCount+=1
print("Number count: " + str(numCount))
print("Comma count: " + str(commaCount))
print("Dot count: " + str(dotCount))
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
def DisplayStats(InputFile):
print "\n"
temp1 = []
temp2 = []
lengths = []
myWords = []
keys = []
values = []
count = 0
with open(InputFile, 'r') as f:
for line in f:
words = line.split()
for word in words:
temp2.append(word)
temp1.append(len(word))
for x in temp1:
if x not in lengths:
lengths.append(x)
lengths.sort()
dictionaryStats = {}
for x in lengths:
dictionaryStats[x] = []
for x in lengths:
for word in temp2:
if len(word) == x:
dictionaryStats[x].append(word)
for key in dictionaryStats:
print("Key = " + str(key) + " Total number of words with " + str(key) + " characters = " + str(len(dictionaryStats[key])))
return dictionaryStats
def PrintStats(aDictionary):
print "\n"
aFile = open("statsWords.dat", 'w')
for key in aDictionary:
aFile.write(str(key) + " : " + str(aDictionary[key]) + "\n")
aFile.close()
Loop()
There's something with that last function that is really tripping me up. I keep getting errors. I know aDictionary is not defined but I do not even know what to define it as! Any of you guys have an idea? Thanks.
with open("some_file.txt","W") as f:
print >> f, "Something goes here!"
its hard to say without your errors. .. but you almost certainly need to have aDictionary defined
also probably something like
def DisplayCounts(InputFile):
print "\n"
numCount = 0
dotCount = 0
commaCount = 0
lineCount = 0
wordCount = 0
with open(InputFile, 'r') as f:
for line in f:
wordCount+=len(line.split())
lineCount+=1
for char in line:
if char.isdigit() == True:
numCount+=1
elif char == '.':
dotCount+=1
elif char == ',':
commaCount+=1
return dict(numbers=numCount,
comma=commaCount,
dot=dotCount,
line=lineCount,
word=wordCount)
result = DisplayCounts("someinput.txt")
print result
In the following function, i upload a file from a template and pass it to the following function.But the data gets crippled if there is a \n or \t(This is a tab separated file).
1.If there is a \n or some special characters it stores the data in the next row.How to avoid this .
2.data is not None or data != "" still stores a null value
def save_csv(csv_file,cnt):
ret = 1
arr = []
try:
flag = 0
f = open(csv_file)
for l in f:
if flag == 0:
flag += 1
continue
parts = l.split("\t")
counter = 1
if(len(parts) > 6):
ret = 2
else:
taggeddata = Taggeddata()
for data in parts:
data = str(data.strip())
if counter == 1 and (data is not None or data != ""):
taggeddata.field1 = data
elif counter == 2 and (data is not None or data != ""):
taggeddata.field2 = data
elif counter == 3 and (data is not None or data != ""):
taggeddata.field3 = data
elif counter == 4 and (data is not None or data != ""):
taggeddata.field4 = data
elif counter == 5 and (data is not None or data != ""):
taggeddata.field5 = data
elif counter == 6 and (data is not None or data != ""):
taggeddata.field6 = data
counter += 1
taggeddata.content_id = cnt.id
taggeddata.save()
arr.append(taggeddata)
return ret
except:
write_exception("Error while processing data and storing")
Use the stdlib's csv module to parse your text, it will be much better at it.
Your expression data is not None or data != "" is always true, you meant data is not None and data != "". Note that you can simplify this to just elif counter == 3 and data: