Adding list in form of tuples to a dictionary - python

Assuming there is a list with sublists like this
[[2013, 'Patric', 'M', 1356], [2013, 'Helena', 'F', 202], [2013, 'Patric', 'F', 6],[1993, 'Patric', 'F', 7]......]
This is the output of list_of_names(), where 2013 is the year, 'M' is the gender and 1356 is the number of male births, etc.
I want to create a dictionary that maps each name to a list of tuples (year, number_of_males, number_of_females). For example:
{ .. 'Patric':[... , (1993, 0, 7), (2013, 1356, 6), ... ], ... }.
Here 1993 is the year, 0 is the number of males and 7 is the number of females; the tuples should be ordered by year.
I'm stuck on how to add this information to a dictionary:
def name_Index(names):
d = dict()
L = readNames() #the list with from previous def which outputs different names and info as above
newlist = []
for sublist in L:

from collections import defaultdict
def list_of_names():
    """Return the sample birth records as ``[year, name, sex, count]`` rows."""
    return [[2013, 'Patric', 'M', 1356],
            [2013, 'Helena', 'F', 202],
            [2013, 'Patric', 'F', 6],
            [1993, 'Patric', 'F', 7]]


def name_Index():
    """Index the records from ``list_of_names()`` by name.

    Returns:
        dict: maps each name to a list of ``(year, males, females)`` tuples,
        ordered by ascending year. Any ``sex`` value other than ``'M'`` is
        counted in the female slot.
    """
    # name -> year -> [male_count, female_count]
    tmp = defaultdict(lambda: defaultdict(lambda: [0, 0]))
    for year, name, sex, N in list_of_names():
        i = 0 if sex == 'M' else 1
        tmp[name][year][i] += N
    d = {}
    for name, entries in tmp.items():
        # Dict order follows insertion (or hashing on old interpreters), so
        # sort explicitly to get the tuples in year order.
        d[name] = [(year, entries[year][0], entries[year][1])
                   for year in sorted(entries)]
    return d


print(name_Index())

This was my attempt at the problem:
from collections import defaultdict, namedtuple
from itertools import groupby
data = [[2013, 'Patric', 'M', 1356],
        [2013, 'Helena', 'F', 202],
        [2013, 'Patric', 'F', 6],
        [1993, 'Patric', 'F', 7]]

# name -> list of (year, gender, number) records for that name
names = defaultdict(list)
datum = namedtuple('datum', 'year gender number')
for year, name, gender, number in data:
    names[name].append(datum(year, gender, number))

# name -> list of (year, males, females) tuples in ascending year order
final_dict = defaultdict(list)
for n in names:
    # groupby only merges *adjacent* items with equal keys, so the records
    # must be sorted by year first; this also yields the years in order.
    by_year = sorted(names[n], key=lambda rec: rec.year)
    for k, g in groupby(by_year, key=lambda rec: rec.year):
        males = 0
        females = 0
        for rec in g:
            if rec.gender == 'M':
                males += rec.number
            elif rec.gender == 'F':
                females += rec.number
        final_dict[n].append((k, males, females))
print(final_dict)

The most convenient approach is to use collections.defaultdict. It returns a dictionary-like object that supplies a default value when a key is missing. In your case, you can nest defaultdicts (name => year => counts per sex), accumulate the births in your loop, and then collect the results into tuples:
from collections import defaultdict
names = [[2013, 'Patric', 'M', 1356],
         [2013, 'Helena', 'F', 202],
         [2013, 'Patric', 'F', 6],
         [1993, 'Patric', 'F', 7]]


def name_Index(data):
    """Index birth records by name.

    Args:
        data: iterable of ``[year, name, sex, births]`` rows, where ``sex``
            is ``'M'`` or ``'F'``.

    Returns:
        dict: maps each name to a list of ``(year, males, females)`` tuples
        ordered by ascending year.

    Raises:
        KeyError: if a row carries a ``sex`` other than ``'M'`` or ``'F'``.
    """
    # name => year => sex
    d = defaultdict(lambda: defaultdict(lambda: {'F': 0, 'M': 0}))
    for year, name, sex, births in data:
        d[name][year][sex] += births
    # if you are fine with defaultdict result: return d
    # else collect results into tuples:
    result = {}
    for name, by_year in d.items():
        # Sort the years explicitly; dict iteration order is not year order.
        result[name] = [(year, by_year[year]['M'], by_year[year]['F'])
                        for year in sorted(by_year)]
    return result


print(name_Index(names))
# {'Patric': [(1993, 0, 7), (2013, 1356, 6)], 'Helena': [(2013, 0, 202)]}

I didn't understand why you take names as an argument of the name_Index function and then call readNames; presumably your work requires it. Hence, I just added a dummy readNames function and passed None as the argument to name_Index. Using classes is a good technique for handling complicated data structures. By the way, this is a nicely written question, I must admit.
def readNames():
    """Return the sample birth records as ``[year, name, gender, count]`` rows."""
    return [[2013, 'Patric', 'M', 1356], [2013, 'Helena', 'F', 202],
            [2013, 'Patric', 'F', 6], [1993, 'Patric', 'F', 7]]


class YearOb(object):
    """Male and female birth totals for a single year."""

    def __init__(self):
        self.male = 0
        self.female = 0

    def add_birth_data(self, gender, birth_count):
        """Add ``birth_count`` to the male total for "M", else to the female total."""
        if gender == "M":
            self.male += birth_count
        else:
            self.female += birth_count


class NameOb(object):
    """Per-year birth totals for a single name."""

    def __init__(self):
        # year -> YearOb
        self.yearobs = dict()

    def add_record(self, year, gender, birth_count):
        """Accumulate one record, creating the year entry on first use."""
        if year not in self.yearobs:
            self.yearobs[year] = YearOb()
        self.yearobs[year].add_birth_data(gender, birth_count)

    def get_as_list(self):
        """Return ``(year, males, females)`` tuples ordered by ascending year."""
        return [(year, self.yearobs[year].male, self.yearobs[year].female)
                for year in sorted(self.yearobs)]


def name_Index(names):
    """Index the records from ``readNames()`` by name.

    Args:
        names: not used; kept so the signature matches the question's stub.

    Returns:
        dict: maps each name to a list of ``(year, males, females)`` tuples
        ordered by ascending year.
    """
    d = dict()
    L = readNames()  # the list from the previous def, rows as shown above
    for year, name, gender, birth_count in L:
        if name not in d:
            d[name] = NameOb()
        d[name].add_record(year, gender, birth_count)
    # Build a fresh dict instead of rebinding values of the one being iterated.
    return {name: nameob.get_as_list() for name, nameob in d.items()}


print(name_Index(None))

Related

Find most common word in a list of sets

I'm currently working on a university NLP project. I'd like to display the most common words contained in this list of sets:
[{'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'}, {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'}]
This is what I've accomplished until now:
def word_list(sent):
    """Return the set of stemmed tokens found in ``sent``.

    ``sent`` may be a single string or an iterable of strings. The text is
    lower-cased and tokenised with ``word_tokenize``, each token is passed
    through ``stemmer.stem``, and every entry of ``stopword_final`` is then
    dropped from the result.
    """
    if isinstance(sent, str):
        raw_tokens = set(word_tokenize(sent.lower()))
    else:
        raw_tokens = {
            token
            for sentence in sent
            for token in word_tokenize(sentence.lower())
        }
    stems = {stemmer.stem(token) for token in raw_tokens}
    for stopword in stopword_final:
        stems.discard(stopword)
    return stems
# Builds one token set per definition via word_list(), prints every word, and
# returns a dict built from Counter.most_common(3).
def get_most_relevant_words(definitions):
list_of_words = list()
most_common_word_dict = dict()
for d1 in definitions:
list_of_words.append(word_list(d1))
for elem in list_of_words:
for word in elem:
print(word)
# NOTE(review): `word` is a single string here, so Counter(word) tallies the
# characters of that one word instead of counting words across the sets.
word_counter = Counter(word)
most_occurrences = word_counter.most_common(3)
# The stored value is the list of (character, count) pairs from most_common(3).
most_common_word_dict.update({word: most_occurrences})
return most_common_word_dict
The desired output should be: {fear: 2, feel: 2}
The output that it prints is: {'feel': [('e', 2), ('f', 1), ('l', 1)]}
Use collections.Counter:
from collections import Counter
list_of_sets = [
    {'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'},
    {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'},
]
# Flatten every set into one list of words, keeping the sets' iteration order.
words = []
for my_set in list_of_sets:
    words.extend(my_set)
# Each word's count is the number of sets it appears in.
c = Counter()
c.update(words)
print(c)
output:
Counter({
'fear': 2,
'face': 2,
'feel': 2,
'properti': 1,
'despit': 1,
'allow': 1,
'situat': 1,
'ani': 1,
'someth': 1,
'unpleas': 1,
'make': 1,
'abil': 1,
'us': 1,
'scar': 1
})
You can simply iterate through the 2 sets, find common terms, and update the count in a dictionary. By the way, 'face' should also be included in your result.
lst = [{'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'},
       {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'}]
# A word common to both sets occurs exactly twice overall. The set
# intersection finds those words directly, without comparing every pair.
dic = {word: 2 for word in lst[0] & lst[1]}
print(dic)
# {'fear': 2, 'feel': 2, 'face': 2}  (key order may vary)

List that resembles a dict to dict

I have a list that already quite resembles a dictionary:
l=["'S':'NP''VP'", "'NP':'DET''N'", "'VP':'V'", "'DET':'a'", "'DET':'an'", "'N':'elephant'", "'N':'elephants'", "'V':'talk'", "'V':'smile'"]
I want to create a dictionary keeping all information:
dict= {'S': [['NP','VP']],
'NP': [['DET', 'N']],
'VP': [['V']], 'DET': [['a'], ['an']],
'N': [['elephants'], ['elephant']],
'V': [['talk'], ['smile']]}
I tried using this:
# First attempt: strip the quotes and pair alternating elements as key/value.
d = {}
# NOTE(review): `l` is the list of strings shown above; the call chain below
# invokes .replace() on that list object, which is a str method.
elems = filter(str.isalnum,l.replace('"',"").split("'"))
values = elems[1::2]
keys = elems[0::2]
# dict.update with (key, value) pairs keeps a single value per key.
d.update(zip(keys,values))
and this:
# Second attempt: split each "key:value" entry on ":" and strip the quotes.
# NOTE(review): `l` is the list of strings shown above; .split(",") is called
# on the list object itself here, not on its string elements.
s = l.split(",")
dictionary = {}
for i in s:
# Plain assignment means a repeated key keeps only the last value assigned.
dictionary[i.split(":")[0].strip('\'').replace("\"", "")] = i.split(":")[1].strip('"\'')
print(dictionary)
You can use collections.defaultdict with re:
import re, collections
l = ["'S':'NP''VP'", "'NP':'DET''N'", "'VP':'V'", "'DET':'a'", "'DET':'an'",
     "'N':'elephant'", "'N':'elephants'", "'V':'talk'", "'V':'smile'"]
# key -> list of right-hand sides, each a list of symbols
d = collections.defaultdict(list)
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
word_pattern = re.compile(r'\w+')
for i in l:
    tokens = word_pattern.findall(i)
    if not tokens:
        # Nothing to index in an entry without any word characters.
        continue
    # The first token is the key; the remaining tokens form one alternative.
    d[tokens[0]].append(tokens[1:])
print(dict(d))
Output:
{'S': [['NP', 'VP']], 'NP': [['DET', 'N']], 'VP': [['V']], 'DET': [['a'], ['an']], 'N': [['elephant'], ['elephants']], 'V': [['talk'], ['smile']]}

Output to screen and csv format python

I have a Python nested dictionary as output. I have been able to remove the first set of curly brackets using RocketDict, but 1) I can't remove the second set of curly brackets, and 2) I tried to export it to a CSV file with the column names, and that doesn't work because I can't figure out how to get the int#/# values that increment in the rows. For example, here was my initial output:
Before RocketDict:
{ intx/x : {'value1: 'A', 'value2: 'B', value3: 'C'},
inty/y : {'value1: 'X', 'value2: 'Y', value3: 'Z'}}
After the RocketDict:
intx/x : {'value1: 'A', 'value2: 'B', value3: 'C'},
inty/y : {'value1: 'X', 'value2: 'Y', value3: 'Z'}
Desired output:
intx/x : 'value1: 'A', 'value2: 'B', value3: 'C',
inty/y : 'value1: 'X', 'value2: 'Y', value3: 'Z'
Desired output to the csv:
Here is the full script:
# Fetch the inventory as JSON and keep its 'config' entry.
# NOTE(review): `url` and `headers` are not defined in this excerpt.
results = requests.get(url, headers=headers)
inventory = results.json()
data = inventory['config']
# Dict wrapper whose str() lists one "key : value" entry per line.
class RocketDict(UserDict):
def __str__(self):
# The leading empty string makes the joined text start with ",\n".
r = ['']
r.extend(['\t{} : {}'.format(k, v)
for k, v in self.items()])
return ',\n'.join(r)
if __name__ == '__main__':
#standard dict object
# inventory = {('key-%02d' % v): v for v in range(1, 10)}
# print(inventory, '\n')
# Wrap that dict object into a RocketDict.
d2 = RocketDict(data)
print(d2)
csv_columns = ['value1','value2','value3']
dict_data = d2
csv_file = 'mycsv.csv'
try:
with open(csv_file, 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
writer.writeheader()
# NOTE(review): the loop variable `data` is never used; every iteration passes
# the whole `d2` mapping to writerow() instead of the row for the current key.
for data in dict_data:
writer.writerow(d2)
except IOError:
print("I/O error")
Use pandas -
import pandas as pd

# Outer keys become columns of the frame; transposing turns them into the row
# index, so each inner dict is written as one CSV row.
RocketDict = {
    'intx/x': {'value1': 'A', 'value2': 'B', 'value3': 'C'},
    'inty/y': {'value1': 'X', 'value2': 'Y', 'value3': 'Z'},
}
frame = pd.DataFrame(RocketDict)
frame.T.to_csv('out.csv', index=True)

Row comparison and append loop by columns

I have a bunch of school data that I maintain on a master list of monthly test scores. Every time a child takes a test and there is an update to 'Age', 'Score' or 'School', I want to insert a new row with the updated data so that I keep track of all the changes. I am trying to figure out a Python script to do this, but since I am a newbie, I keep running into issues.
I tried writing a loop but keep getting errors, including "False", "The truth value of a Series is ambiguous" and "tuple indices must be integers, not str".
# Current master list: one row per ID.
master_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
'Age':[15,14,17,13],
'School':['AB', 'CD', 'EF', 'GH'],
'Score':[80, 75, 62, 100],
'Date': ['3/1/2019', '3/1/2019', '3/1/2019', '3/1/2019']})
# Incoming monthly update: same IDs, some with changed Age/School/Score.
updates_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
'Age':[16,14,17,13],
'School':['AB', 'ZX', 'EF', 'GH'],
'Score':[80, 90, 62, 100],
'Date': ['4/1/2019', '4/1/2019', '4/1/2019', '4/1/2019']})
# What I am trying to get is:
updated_master = pd.DataFrame({'ID': ['A', 'A', 'B', 'B', 'C','D'],
'Age':[15,16,14,14,17,13],
'School':['AB', 'AB', 'CD', 'ZX', 'EF', 'GH'],
'Score':[80, 80, 75, 90, 62, 100],
'Date': ['3/1/2019', '4/1/2019', '3/1/2019', '4/1/2019', '3/1/2019', '3/1/2019']})
temp_delta_list = []
# NOTE(review): iloc[1:, ...] starts at position 1, so the first row of each
# frame is left out of these column slices.
m_score = master_df.iloc[1:, master_df.columns.get_loc('Score')]
m_age = master_df.iloc[1:, master_df.columns.get_loc('Age')]
m_school = master_df.iloc[1:, master_df.columns.get_loc('School')]
u_score = updates_df.iloc[1:, updates_df.columns.get_loc('Score')]
u_age = updates_df.iloc[1:, updates_df.columns.get_loc('Age')]
u_school = updates_df.iloc[1:, updates_df.columns.get_loc('School')]
for i in updates_df['ID'].values:
# NOTE(review): each `x = a, b` below builds a 2-tuple (filtered frame, whole
# column slice); it does not select the column value for this ID.
updated_temp_score = updates_df[updates_df['ID'] == i], u_score
updated_temp_age = updates_df[updates_df['ID'] == i], u_age
updated_temp_school = updates_df[updates_df['ID'] == i], u_school
master_temp_score = master_df[master_df['ID'] == i], m_score
master_temp_age = master_df[master_df['ID'] == i], m_age
# NOTE(review): this line filters updates_df with a mask built from master_df,
# unlike the two lines above it - confirm which frame was intended.
master_temp_school = updates_df[master_df['ID'] == i], m_school
# As written, the else branch is reached only when none of the three
# comparisons is true, i.e. when score, age and school all differ.
if (updated_temp_score == master_temp_score) | (updated_temp_age == master_temp_age) | (updated_temp_school == master_temp_school):
pass
else:
temp_deltas = updates_df[(updates_df['ID'] == i)]
temp_delta_list.append(temp_deltas)
Ultimately, I want the loop to compare the values of each row for each ID, return the rows that differ in any way, and then append those rows to master_df.

reportlab dynamic data-driven header outputs wrong subtitle

I have created some fictitious, though representative, clinical trial type data using Pandas, and now come to some test reporting in ReportLab.
The data has a block (~50 rows) where the treatment column is 'Placebo' and the same amount where the treatment is 'Active'. I simply want to list the data using a sub-heading of 'Treatment Group: Placebo' for the first set and 'Treatment Group: Active' for the second.
There are some hits on a similar topic and, indeed, I've used one of the suggested techniques, namely extending the arguments of the header function using partial from functools.
title1 = "ACME Corp CONFIDENTIAL"
title2 = "XYZ123 / Anti-Hypertensive Draft"
title3 = "Protocol XYZ123"
title4 = "Study XYZ123"
title5 = "Listing of Demographic Data by Treatment Arm"
title6 = "All subjects"


def title(canvas, doc, bytext):
    """Draw the six fixed title lines and the treatment-group sub-heading.

    Args:
        canvas: drawing surface; its state is saved before and restored
            after the drawing calls.
        doc: accepted but not used by this function.
        bytext: text appended directly after "Treatment Group:".
    """
    canvas.saveState()
    canvas.setFont(styleN.fontName, styleN.fontSize)
    # (fraction of page height, text) for the left-aligned lines
    left_lines = ((.975, title1), (.950, title2), (.925, title3))
    # (fraction of page height, text) for the horizontally centred lines
    centred_lines = ((.900, title4), (.875, title5), (.850, title6))
    for height_factor, text in left_lines:
        canvas.drawString(DOCMARGIN, PAGE_HEIGHT*height_factor, text)
    for height_factor, text in centred_lines:
        canvas.drawCentredString(PAGE_WIDTH/2.0, PAGE_HEIGHT*height_factor, text)
    canvas.drawString(DOCMARGIN, PAGE_HEIGHT*.825, "Treatment Group:" + bytext)
    canvas.restoreState()
This is then called as follows. n_groups has the value of 2 from a summary query and 0 maps to 'Placebo' and 1 maps to active.
# Registers one page template per treatment group (ids 'PT0', 'PT1', ...),
# then queues a template switch, a data table and a page break for each group.
def build_pdf(doc):
ptemplates = []
for armcd in range(n_groups):
# partial() binds this group's sub-heading text to the title() callback.
ptemplates.append(PageTemplate(id = 'PT' + str(armcd), frames = [dataFrame,],
onPage = partial(title, bytext=t_dict[armcd]),
onPageEnd = foot))
doc.addPageTemplates(ptemplates)
elements = []
for armcd in range(n_groups):
# NOTE(review): from the second group on, this NextPageTemplate is queued
# after the PageBreak appended at the end of the previous iteration.
elements.append(NextPageTemplate('PT' + str(armcd)))
# Rows whose first field equals the 1-based group code.
sublist = [t for t in lista if t[0] == (armcd+1)]
sublist.insert(0,colheads)
data_table = Table(sublist, 6*[40*mm], len(sublist)*[DATA_CELL_HEIGHT], repeatRows=1)
data_table.setStyle(styleC)
elements.append(data_table)
elements.append(PageBreak())
doc.build(elements)
The report produces 6 pages. The first 3 pages of placebo data are correct and pages 5 & 6 of active data are correct, but page 4 - which should be the first page of the second ('Active') group - has the sub-title 'Treatment Group: Placebo'.
I have re-organized the order of the statements multiple times, but can't get Page 4 to sub-title correctly. Any help, suggestions or magic would be much appreciated.
[Edit 1: sample data structure]
The 'top' of the data starts as:
[
[1, 'Placebo', '000001-000015', '1976-09-20', 33, 'F', 'Black'],
[1, 'Placebo', '000001-000030', '1959-04-26', 50, 'M', 'Asian'],
[1, 'Placebo', '000001-000031', '1946-02-07', 64, 'F', 'Asian'],
[1, 'Placebo', '000001-000046', '1947-11-08', 62, 'M', 'Asian'],
etc for 50 rows, then continues with
[2, 'Active', '000001-000002', '1962-02-28', 48, 'F', 'Black'],
[2, 'Active', '000001-000008', '1975-10-20', 34, 'M', 'Black'],
[2, 'Active', '000001-000013', '1959-01-19', 51, 'M', 'White'],
[2, 'Active', '000001-000022', '1962-01-12', 48, 'F', 'Black'],
[2, 'Active', '000001-000036', '1976-10-17', 33, 'F', 'Asian'],
[2, 'Active', '000001-000045', '1980-12-31', 29, 'F', 'White'],
for another 50.
The column header inserted is:
['Treatment Arm Code',
'Treatment Arm',
'Site ID - Subject ID',
'Date of Birth',
'Age (Years)',
'Gender',
'Ethnicity'],
[Edit 2: A solution - move the PageBreak() and make it conditional:]
def build_pdf(doc):
    """Build the listing with one page template and one table per group.

    A template with id 'PT<n>' is registered for each of the ``n_groups``
    groups. For each group the flowables are queued in this order: the
    template switch, a page break (skipped for the first group), then the
    group's data table.
    """
    ptemplates = [
        PageTemplate(id = 'PT' + str(armcd), frames = [dataFrame,],
                     onPage = partial(title, bytext=t_dict[armcd]),
                     onPageEnd = foot)
        for armcd in range(n_groups)
    ]
    doc.addPageTemplates(ptemplates)

    elements = []
    for armcd in range(n_groups):
        elements.append(NextPageTemplate('PT' + str(armcd)))
        if armcd > 0:
            elements.append(PageBreak())
        # Column headings first, then the rows carrying this group's 1-based code.
        rows = [colheads] + [t for t in lista if t[0] == (armcd+1)]
        data_table = Table(rows, 6*[40*mm], len(rows)*[DATA_CELL_HEIGHT], repeatRows=1)
        data_table.setStyle(styleC)
        elements.append(data_table)
    doc.build(elements)

Categories

Resources