I am grabbing a lot of data from and SQL query that takes a long time to run. Since the SQL query takes so long to run, I am grabbing the data from the database in its most granular form. I then cycle through this data once and aggregate it in the forms that are useful to me.
My problem is that I am repeating myself over and over again. However, I am not sure of the best way to refactor this control flow. Thanks in advance!
def processClickOutData(cls, raw_data):
singles = {}
total={}
absolute_total = 0
channels = {}
singles_true = {}
total_true={}
channels_true = {}
absolute_total_true = 0
list_channels = set([])
list_tids = set([])
total_position = {}
total_position_true = {}
tid_position = {}
channel_position = {}
channel_position_true = {}
tid_position_true = {}
for row in raw_data:
gap=row[0]
count=row[1]
tid=row[2]
prefered=row[3]
channel=row[4]
position=row[5]
list_channels.add(channel)
list_tids.add(tid)
absolute_total += int(count)
if total.has_key(gap):
total[gap] += count
else:
total[gap] = count
if singles.has_key(gap) and singles[gap].has_key(tid):
singles[gap][tid] += count
elif singles.has_key(gap):
singles[gap][tid] = count
else:
singles[gap] = {}
singles[gap][tid] = count
if channels.has_key(gap) and channels[gap].has_key(channel):
channels[gap][channel] += count
elif channels.has_key(gap):
channels[gap][channel] = count
else:
channels[gap] = {}
channels[gap][channel] = count
if total_position.has_key(position):
total_position[position] += count
else:
total_position[position] = count
if tid_position.has_key(position) and tid_position[position].has_key(tid):
tid_position[position][tid] += count
elif tid_position.has_key(position):
tid_position[position][tid] = count
else:
tid_position[position] = {}
tid_position[position][tid] = count
if channel_position.has_key(position) and channel_position[position].has_key(channel):
channel_position[position][channel] += count
elif channel_position.has_key(position):
channel_position[position][channel] = count
else:
channel_position[position] = {}
channel_position[position][channel] = count
if prefered == 0:
absolute_total_true += count
if total_true.has_key(gap):
total_true[gap] += count
else:
total_true[gap] = count
if singles_true.has_key(gap) and singles_true[gap].has_key(tid):
singles_true[gap][tid] += count
elif singles_true.has_key(gap):
singles_true[gap][tid] = count
else:
singles_true[gap] = {}
singles_true[gap][tid] = count
if channels_true.has_key(gap) and channels_true[gap].has_key(channel):
channels_true[gap][channel] += count
elif channels_true.has_key(gap):
channels_true[gap][channel] = count
else:
channels_true[gap] = {}
channels_true[gap][channel] = count
if total_position_true.has_key(position):
total_position_true[position] += count
else:
total_position_true[position] = count
if tid_position_true.has_key(position) and tid_position_true[position].has_key(tid):
tid_position_true[position][tid] += count
elif tid_position_true.has_key(position):
tid_position_true[position][tid] = count
else:
tid_position_true[position] = {}
tid_position_true[position][tid] = count
if channel_position_true.has_key(position) and channel_position_true[position].has_key(channel):
channel_position_true[position][channel] += count
elif channel_position_true.has_key(position):
channel_position_true[position][channel] = count
else:
channel_position_true[position] = {}
channel_position_true[position][channel] = count
final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
"total_position" : total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
return final_values
The entire structure you're using to store the data is probably wrong, but since I don't know how you're using it, I can't help you with that.
You can get rid of all of those has_key() calls by using collections.defaultdict. Note thedict.has_key(key) is deprecated anyway, you should just use key in thedict instead.
Look at how I change the for loop too -- you can assign to names right in the for statement, no need to do it separately.
from collections import defaultdict
def processClickOutData(cls, raw_data):
absolute_total = 0
absolute_total_true = 0
list_channels = set()
list_tids = set()
total = defaultdict(int)
total_true = defaultdict(int)
total_position = defaultdict(int)
total_position_true = defaultdict(int)
def defaultdict_int():
return defaultdict(int)
singles = defaultdict(defaultdict_int)
singles_true = defaultdict(defaultdict_int)
channels = defaultdict(defaultdict_int)
channels_true = defaultdict(defaultdict_int)
tid_position = defaultdict(defaultdict_int)
tid_position_true = defaultdict(defaultdict_int)
channel_position = defaultdict(defaultdict_int)
channel_position_true = defaultdict(defaultdict_int)
for gap, count, prefered, channel, position in raw_data:
list_channels.add(channel)
list_tids.add(tid)
absolute_total += count
total[gap] += count
singles[gap][tid] += count
channels[gap][channel] += count
total_position[position] += count
tid_position[position][tid] += count
channel_position[position][channel] += count
if prefered == 0:
absolute_total_true += count
total_true[gap] += count
singles_true[gap][tid] += count
channels_true[gap][channel] += count
total_position_true[position] += count
tid_position_true[position][tid] += count
channel_position_true[position][channel] += count
final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
"total_position" : total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
return final_values
What this does is automatically fill in the correct default values if the keys don't exist. You've got two kinds here. Where you're adding ints, you want to start with 0 if it doesn't exist -- that's what int returns, hence defaultdict(int). Where you're adding a dictionary that adds ints, you need to use a function that returns a defaultdict(int) which is what defaultdict_int does.
Edit: Suggested alternate dictionary structure:
position = defaultdict(lambda: defaultdict(defaultdict_int))
gap = defaultdict(lambda: defaultdict(defaultdict_int))
absolute_total = 0
for gap, count, prefered, channel, position in raw_data:
absolute_total += count
posd = position[position]
posd.setdefault('total', 0)
posd['total'] += count
posd['tid'][tid] += count
posd['channel'][channel] += count
gapd = gap[gap]
gapd.setdefault('total', 0)
gapd['total'] += count
gapd['tid'][tid] += count
gapd['channel'][channel] += count
Do the same with the _true versions as well, and you've gone from 12 dicts to 4.
Related
https://cs50.harvard.edu/x/2020/psets/6/dna/#:~:text=python%20dna.py%20databases/large.csv%20sequences/5.txt
I'm trying to solve this problem from CS50 but it just works for the small database, when I try it for the large one the program overcounts.
import csv
if len(argv) != 3:
print("DIGITA DIREITO, IMBECIL")
exit()
with open(argv[1], "r") as source:
reader = list(csv.reader(source))
reader[0].remove("name")
i = reader[0]
with open(argv[2], "r") as sequence:
seq = sequence.read()
values = []
for j in range(len(i)):
value = 0
counter = 0
pos = 0
prevpos = 0
while pos < len(seq):
pos = seq.find(i[j], pos)
if pos == -1:
counter = 0
break
elif (pos != 1):
counter += 1
prevpos = pos
pos += len(i[j])
if value < counter:
value = counter
values.append(value)
for row in range(len(reader)):
print(reader[row])
print(values)
values = list(map(str, values))
search = list(reader)
search.pop(0)
for result in search:
if result[1:] == values:
print(f"{result[0]}")
break
elif result == search[-1]:
print("No match")
I think you are just counting the STRs repetitions in the sequence, not the maximum consecutive STR repetitions. This is what the problem asks
i am coding python in PyCharm and it is giving me this warning:
type 'list' doesn't have expected attribute 'tolist'
however I have declared my variable list, and here is my code:
...
my_list = []
big_list = []
i= 0
count = 0
while i < len(data):
if data[i][3] < 0:
i += 1
continue
my_list.append([data[i][0], data[i][1], data[i][2], data[i][3]])
if i == len(df) - 1:
count += 1
self.myfancyfunction(my_list, count)
big_list.append(my_list)
elif i < len(data) - 1 and data[i][3] != data[i + 1][3]:
count += 1
self.myfancyfunction(my_list, count)
big_list.append(my_list)
my_list = []
cluster += 1
i += 1
in the two instances of self.myfancyfunction(my_list, count), the my_list variable is underlined and shows the above error.
Could you please help to fix this?
update
here is my fancy function:
def myfancyfunction(self, array_ls, count):
dict1 = {}
dict2 = {}
array_ls = np.asarray(array_ls)
array_ls = array_ls[:, 2].astype(int)
self.info_ids.append(array_ls.tolist())
sys.stdout.flush()
with open('myfile.txt', "r") as myfile:
for line in myfile:
if int(line.split()[0]) in array_ls:
for element in line.split()[1:]:
key = element.split(":")[0]
value = float(element.split(":")[1])
if key in self.reference:
if not str(key) in dict1:
dict1[str(key)] = 1
dict2[str(key)] = value
else:
dict1[str(key)] += 1
dict2[str(key)] += value
self.info_freq.append(dict1)
self.info_vals.append(dict2)
I am inside a class and have a couple of functions.
def get_data_list(file_object,column_number):
contents = []
for string in file_object:
contents.append(tuple(string.split(',')))
list = []
for i in range(len(contents) - 1):
list.append((contents[i + 1][0], float(contents[i + 1][column_number])))
list.sort()
return list
def average_data(list_of_tuples):
dict = {'01':'January','02':'Februday','03':'March','04':'April','05':'May','06':'June','07':'July','08':'August','09':'September','10':'October','11':'November','12':'December'}
current_month = 0
total = 0
count = 1
average_data = []
for k in list_of_tuples:
for data in k:
data = str(data)
month = data[4:6]
if month == current_month:
total += k[1]
if count != 1:
count += 1
else:
current_month = month
average = float(total/count)
average_data.append((float(average),dict[data[4:6]]+data[0:4]))
total = 0
average_data = sorted(average_data)
return average_data
These are my code but when I try to run it returns an error:
KeyError:'28'or some other keyError with numbers
But I thought I already set all the numbers in dict...
And also, the data[4:6] comes from the date number, such as 20160407, 20141105.
The problem I am trying to solve is reading in a file in that contains a list of words. Then counting the number of vowels in each word and display each word in a table along with the number of its vowels and the total vowels in the word, and at the end display the total number of vowels in all of the words.
I am trying to solve the problem by reading the file in through a for loop and creating a dictionary that is associated with every word like
mississippi['a_count' : 0, 'e_ocunt' : 0, 'i_count' : 4 ,'o_count' : 0, 'u_count' : 0, 'y_count' : 0]
My problem is that I am not sure how to create the dictionaries as the variable changes due to a loop. I am just ending up with empty dictionaries.
here's a screenshot of my output http://imgur.com/mksgdTc
my test code in the file is Mississippi California Wisconsin all on different lines.
try:
word_file = open("vowel.txt", "r")
count = 0
dic = {}
a_count = 0
e_count = 0
i_count = 0
o_count = 0
u_count = 0
y_count = 0
total_count = 0
#this establishes the top of the table
print('Number','{:>8}'.format('word'),'{:>8}'.format('A'),'{:>4}'.format('E'),'{:>4}'.format('I'),'{:>4}'.format('O'),'{:>4}'.format('U'),'{:>4}'.format('Y'),'{:>8}'.format('Total'))
print("__________________________________________________________")
for word in word_file:
count+=1
word = {}
print(word)
word_a_count = 0
word_e_count = 0
word_i_count = 0
word_o_count = 0
word_u_count = 0
word_y_count = 0
word_total_count = 0
for letters in word:
print(letters)
if letters.lower() == "a":
a_count+= 1
total_count += 1
word_a_count +=1
word['a_count'] = word_a_count
if letters.lower() == "e":
e_count+= 1
total_count += 1
word_e_count +=1
word['e_count'] = word_e_count
if letters.lower() == "i":
i_count+= 1
total_count += 1
word_i_count +=1
word['i_count'] = word_i_count
if letters.lower() == "o":
o_count+= 1
total_count += 1
word_o_count +=1
word['o_count'] = word_o_count
if letters.lower() == "u":
u_count+= 1
total_count += 1
word_u_count +=1
word['u_count'] = word_u_count
if letters.lower() == "y":
y_count+= 1
total_count += 1
word_y_count +=1
word['y_count'] = word_y_count
print('Totals','{:>8}'.format(' '),'{:>8}'.format(word['a_count']),'{:>4}'.format\
(word['e_count']),'{:>4}'.format(word['i_count']),'{:>4}'.format\
(word['o_count']),'{:>4}'.format(word['u_count']),'{:>4}'.\
format(word['y_count']))
#this creates the bottom barrier of the table
print("__________________________________________________________")
#code for totals print
print('Totals','{:>8}'.format(' '),'{:>8}'.format(a_count),'{:>4}'.format(e_count),'{:>4}'.format(i_count),'{:>4}'.format(o_count),'{:>4}'.format(u_count),'{:>4}'.format(y_count),'{:>6}'.format(total_count))
except IOError:
print("The file does not seem to exists. The program is halting.")
Focus on this section -- word is re-assigned as an empty dict on every iteration of the loop:
for word in word_file:
count+=1
word = {}
However, commenting word = {} out now throws an error when the first vowel is read from file (since now the dict isn't empty). Remember that word is the current line in the text file that you are iterating over, so word['u_count'] = word_u_count is interpreted as an instruction to change a character in the string. Python strings are immutable, so an error is thrown.
Your program is much longer than it needs to be - when you notice repetition in your code consider refactoring to take advantage of loops and iteration, to make your program more concise. You could separate all the logic for counting the letters in a word into one procedure:
def countletters(word, letterstocount):
count = {}
word = word.lower()
for char in word:
if char in letterstocount:
if char in count:
count[char] += 1
else:
count[char] = 1
return count
#example call
vowels = "aeiou"
print(countletters('Arizona', vowels))
which you then call for each word in your file.
In Python 2 I'd do something like this...
#! /usr/bin/env python
'''
Count vowels in a list of words & show a grand total
Words come from a plain text file with one word per line
'''
import sys
vowels = 'aeiouy'
def make_count_dict():
''' Create a dict for counting vowels with all values initialised to 0 '''
return dict(zip(vowels, (0,)*len(vowels)))
def get_counts(d):
return ' '.join('%2d' % d[k] for k in vowels)
def count_vowels(wordlist):
hline = '_'*45
print '%3s: %-20s: %s' % ('Num', 'Word', ' '.join('%2s' % v for v in vowels))
print hline
total_counts = make_count_dict()
for num, word in enumerate(wordlist, start=1):
word_counts = make_count_dict()
for ch in word.lower():
if ch in vowels:
word_counts[ch] += 1
total_counts[ch] += 1
print '%3d: %-20s: %s' % (num, word, get_counts(word_counts))
print hline
print '%-25s: %s' % ('Total', get_counts(total_counts))
def main():
fname = len(sys.argv) > 1 and sys.argv[1]
if fname:
try:
with open(fname, 'r') as f:
wordlist = f.read().splitlines()
except IOError:
print "Can't find file '%s'; aborting." % fname
exit(1)
else:
wordlist = ['Mississippi', 'California', 'Wisconsin']
count_vowels(wordlist)
if __name__ == '__main__':
main()
I have iterated over a database and created a dictionary. The key is records in field 1 and the values are the averages of in each column for their corresponding record in field 1. The questions I have is what is the best way to output my dictionary to a table?
myDict = {}
def Calculate(key, fields, dt):
results = {}
for rec in arcpy.da.SearchCursor(table, "*"):
header[names] = row[1]
if results.has_key(key):
result = results[key]
i = 0
while i < len(fields):
result[i] += rec[fields[i]]
i += 1
result[len(fields)] += 1
else:
temp = []
i = 0
while i < len(fields):
temp.append(rec[fields[i]])
i += 1
temp.append (1)
results[rec[key]] = temp
endResults = {}
for k in results:
j = 0
tempEndResults = []
while j < len(results[k]) - 1:
tempEndResults.append(results[k][j] / results[k][len(results[k])-1])
j += 1
endResults[k] = tempEndResults
i += 1
return endResults
Calculate(1, [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23], myDict)
The questions I have is what is the best way
to output my dictionary to a table?
To create a CSV, you can use a nested loop to print the inner values in tabular form:
for k, seq in myDict.items():
for elem in seq:
print '%s,%s' % (k, elem)
This should give you a nice looking CSV table.