Storing data into namedtuples with empty fields to add other stuff - python

['Date,Open,High,Low,Close,Volume,Adj Close',
'2014-02-12,1189.00,1190.00,1181.38,1186.69,1724500,1186.69',
'2014-02-11,1180.17,1191.87,1172.21,1190.18,2050800,1190.18',
'2014-02-10,1171.80,1182.40,1169.02,1172.93,1945200,1172.93',
'2014-02-07,1167.63,1177.90,1160.56,1177.44,2636200,1177.44',
'2014-02-06,1151.13,1160.16,1147.55,1159.96,1946600,1159.96',
'2014-02-05,1143.38,1150.77,1128.02,1143.20,2394500,1143.20',
'2014-02-04,1137.99,1155.00,1137.01,1138.16,2811900,1138.16',
'2014-02-03,1179.20,1181.72,1132.01,1133.43,4569100,1133.43']
I need to make a namedtuple for each of the lines in this list of lines; the fields would be the words in the first line, 'Date,Open,High,Low,Close,Volume,Adj Close'. I will then be making some calculations and will need to add 2 more fields at the end of each namedtuple. Any help on how I can do this?

from collections import namedtuple
data = ['Date,Open,High,Low,Close,Volume,Adj Close',
'2014-02-12,1189.00,1190.00,1181.38,1186.69,1724500,1186.69',
'2014-02-11,1180.17,1191.87,1172.21,1190.18,2050800,1190.18',
'2014-02-10,1171.80,1182.40,1169.02,1172.93,1945200,1172.93',
'2014-02-07,1167.63,1177.90,1160.56,1177.44,2636200,1177.44',
'2014-02-06,1151.13,1160.16,1147.55,1159.96,1946600,1159.96',
'2014-02-05,1143.38,1150.77,1128.02,1143.20,2394500,1143.20',
'2014-02-04,1137.99,1155.00,1137.01,1138.16,2811900,1138.16',
'2014-02-03,1179.20,1181.72,1132.01,1133.43,4569100,1133.43']
def convert_to_named_tuples(data):
    # get the names for the named tuple
    field_names = data[0].split(",")
    # these are your two extra custom fields
    field_names.append("extra1")
    field_names.append("extra2")
    # field names can't have spaces in them (they have to be valid python identifiers
    # and "Adj Close" isn't)
    field_names = [field_name.replace(" ", "_") for field_name in field_names]
    # you can do this as many times as you like..
    # personally I'd do it manually once at the start and just check you're getting
    # the field names you expect here...
    ShareData = namedtuple("ShareData", field_names)
    # unpack the data into the named tuples
    share_data_list = []
    for row in data[1:]:
        fields = row.split(",")
        fields += [None, None]
        share_data = ShareData(*fields)
        share_data_list.append(share_data)
    return share_data_list

# check it works..
share_data_list = convert_to_named_tuples(data)
for share_data in share_data_list:
    print share_data
Actually this is better, I think, since it converts the fields into the right types. On the downside it won't take arbitrary data...
from collections import namedtuple
from datetime import datetime
data = [...same as before...]
field_names = ["Date","Open","High","Low","Close","Volume", "AdjClose", "Extra1", "Extra2"]
ShareData = namedtuple("ShareData", field_names)
def convert_to_named_tuples(data):
    share_data_list = []
    for row in data[1:]:
        row = row.split(",")
        fields = (datetime.strptime(row[0], "%Y-%m-%d"),  # date
                  float(row[1]), float(row[2]),
                  float(row[3]), float(row[4]),
                  int(row[5]),    # volume
                  float(row[6]),  # adj close
                  None, None)     # extras
        share_data = ShareData(*fields)
        share_data_list.append(share_data)
    return share_data_list

# test
share_data_list = convert_to_named_tuples(data)
for share_data in share_data_list:
    print share_data
But I agree with the other posts.. why use a namedtuple when you could use a class definition?
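For illustration, a minimal sketch (not from the original answer) of what that class-based version could look like, reusing the same column layout; extra1/extra2 are just placeholder attributes:

from datetime import datetime

class ShareData(object):
    """One row of the CSV, with room for extra computed attributes."""
    def __init__(self, csv_row):
        date, open_, high, low, close, volume, adj_close = csv_row.split(",")
        self.date = datetime.strptime(date, "%Y-%m-%d")
        self.open = float(open_)
        self.high = float(high)
        self.low = float(low)
        self.close = float(close)
        self.volume = int(volume)
        self.adj_close = float(adj_close)
        # extra attributes can simply be assigned later,
        # e.g. share.extra1 = some_calculation(share)
        self.extra1 = None
        self.extra2 = None

share_data_list = [ShareData(row) for row in data[1:]]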

Any special reason why you want to use namedtuples? If you want to add fields later, maybe you should use a dictionary. If you really want to go the namedtuple way though, you could use a placeholder like:
from collections import namedtuple
field_names = data[0].replace(" ", "_").lower().split(",")
field_names += ['placeholder_1', 'placeholder_2']
Entry = namedtuple('Entry', field_names)
list_of_named_tuples = []
mock_data = [None, None]
for row in data[1:]:
    row_data = row.split(",") + mock_data
    list_of_named_tuples.append(Entry(*row_data))
If, instead, you want to parse your data into a list of dictionaries (more pythonic IMO) you should do:
field_names = data[0].split(",")
list_of_dicts = [dict(zip(field_names, row.split(','))) for row in data[1:]]
EDIT: Note that even though dictionaries work fine for the small dataset in your example, using them instead of namedtuples for large amounts of data will translate into a higher memory footprint for your program.
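To get a rough feel for that difference, here is a quick, implementation-dependent comparison using sys.getsizeof; the exact numbers vary across Python versions, but the dict is consistently the larger of the two per row:

import sys
from collections import namedtuple

Entry = namedtuple("Entry", ["date", "open", "close"])
as_tuple = Entry("2014-02-12", 1189.00, 1186.69)
as_dict = {"date": "2014-02-12", "open": 1189.00, "close": 1186.69}

# shallow sizes only, but enough to show the dict's extra per-row overhead
print(sys.getsizeof(as_tuple), sys.getsizeof(as_dict))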

Why don't you use a dictionary for the data? Adding additional keys is then easy:
dataList = []
keys = myData[0].split(',')
for row in myData[1:]:  # skip the header row
    tempdict = dict()
    for index, value in enumerate(row.split(',')):
        tempdict[keys[index]] = value
    # if your additional values are going to be determined here then
    # you can do whatever calculations you need and add them
    # otherwise you can work with this list elsewhere
    dataList.append(tempdict)
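For example, the extra values could be computed and attached right inside that loop or afterwards. A small illustrative sketch, where 'Range' and 'Change' are just hypothetical names for whatever calculations are needed:

for entry in dataList:
    # hypothetical derived fields -- substitute your own calculations
    entry['Range'] = float(entry['High']) - float(entry['Low'])
    entry['Change'] = float(entry['Close']) - float(entry['Open'])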

Related

Trying to Access keys in Dict from their values

I'm importing a CSV into a dictionary, where there are a number of households labelled (i.e. 1A, 1B, ...).
Rows are labelled with an item such as 'coffee', etc. The table contains data indicating how much of each item each household needs.
(Excel screenshot)
What I am trying to do is check the values of the key-value pairs in the dictionary for anything that isn't blank (containing either 1 or 2), and then take the key-value pair and the 'PRODUCT NUMBER' (from the CSV) and append those to a new list.
I want to create a shopping list that will contain which item I need, in what quantity, for which household.
The column containing 'week' is not important for this.
I import the CSV into python as a dictionary like this:
import csv
import pprint
from typing import List, Dict
input_file_1 = csv.DictReader(open("DATA CWK SHOPPING DATA WEEK 1 FILE B.xlsb.csv"))
table: List[Dict[str, int]] = [] #list
for row in input_file_1:
    string_row: Dict[str, int] = {} #dictionary
    for column in row:
        string_row[column] = row[column]
    table.append(string_row)
I found on GeeksforGeeks how to access a pair by its value. However, when I try this with my dictionary, it only seems to be able to search the last row.
# creating a new dictionary
my_dict = {"java": 100, "python": 112, "c": 11}
# list out keys and values separately
key_list = list(my_dict.keys())
val_list = list(my_dict.values())
# print key with val 100
position = val_list.index(100)
print(key_list[position])
I also tried to do a for in range loop, but that didn't seem to work either:
for row in table:
    if row["PRODUCT NUMBER"] == '1' and row["Week"] == '1':
        for i in range(8):
            if string_row.values() != ' ':
                print(row[i])
If I am unclear anywhere, please let me know and I will clear it up!
Here is a loop I made that should do what you want.
values = list(table.values())
keys = list(table.keys())
new_table = {}
index = -1
for i in range(values.count("")):
    index = values.index("", index + 1)
    new_table[keys[index]] = values[index]
If you want to remove those values from the original dict, you can just add table.pop(keys[index]) inside the loop.
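For the shopping list described in the question, a sketch along these lines may be closer to the goal; it assumes (as a guess from the question) that every column other than 'PRODUCT NUMBER' and 'Week' is a household, and that blank cells are empty strings or spaces:

shopping_list = []
for row in table:
    for column, value in row.items():
        if column in ("PRODUCT NUMBER", "Week"):
            continue  # skip the non-household columns
        if value and value.strip():  # anything that isn't blank, e.g. '1' or '2'
            # (household, product number, quantity)
            shopping_list.append((column, row["PRODUCT NUMBER"], value))

for household, product, quantity in shopping_list:
    print(f"House {household}: {quantity} x product {product}")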

creating Dataframe from a lot of lists

I want to create a DataFrame from my data. What I do is essentially a grid search over different parameters for my algorithm. Do you have any idea how this could be done better? Right now, if I need to add two more parameters to my grid, or add more data on which I perform my analysis, I have to manually add a lot of lists, append values to them, and then add another column to the DataFrame dict. Is there another way? Because right now it looks really ugly.
type_preds = []
type_models = []
type_lens = []
type_smalls = []
lfc1s = []
lfc2s = []
lfc3s = []
lv2s = []
sfp1s = []
len_small_fils = []
ratio_small_fills = []
ratio_big_fils = []
for path_to_config in path_to_results.iterdir():
    try:
        type_pred, type_model, type_len, type_small, len_small_fil, ratio_big_fil, ratio_small_fill = path_to_config.name[:-4].split('__')
    except:
        print(path_to_config)
        continue
    path_to_trackings = sorted([str(el) for el in list(path_to_config.iterdir())])[::-1]
    sfp1, lv2, lfc3, lfc2, lfc1 = display_metrics(path_to_gts, path_to_trackings)
    type_preds.append(type_pred)
    type_models.append(type_model)
    type_lens.append(type_len)
    type_smalls.append(type_small)
    len_small_fils.append(len_small_fil)
    ratio_big_fils.append(ratio_big_fil)
    ratio_small_fills.append(ratio_small_fill)
    lfc1s.append(lfc1)
    lfc2s.append(lfc2)
    lfc3s.append(lfc3)
    lv2s.append(lv2)
    sfp1s.append(sfp1)

df = pd.DataFrame({
    'type_pred': type_preds,
    'type_model': type_models,
    'type_len': type_lens,
    'type_small': type_smalls,
    'len_small_fil': len_small_fils,
    'ratio_small_fill': ratio_small_fills,
    'ratio_big_fill': ratio_big_fils,
    'lfc3': lfc3s,
    'lfc2': lfc2s,
    'lfc1': lfc1s,
    'lv2': lv2s,
    'sfp1': sfp1s
})
Something along these lines might make it easier:
data = []
for path_to_config in path_to_results.iterdir():
    row = []
    try:
        row.extend(path_to_config.name[:-4].split('__'))
    except:
        print(path_to_config)
        continue
    path_to_trackings = sorted([str(el) for el in list(path_to_config.iterdir())])[::-1]
    row.extend(display_metrics(path_to_gts, path_to_trackings))
    data.append(row)

df = pd.DataFrame(
    data,
    columns=[
        "type_pred",
        "type_model",
        "type_len",
        "type_small",
        "len_small_fil",
        "ratio_big_fil",
        "ratio_small_fill",
        # End of first half
        "sfp1",
        "lv2",
        "lfc3",
        "lfc2",
        "lfc1",
    ])
Then every time you add an extra return variable to either the first or second function you just need to add an extra column to the final DataFrame creation.
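A variation on the same idea (a sketch using the question's own names such as path_to_results, path_to_gts and display_metrics) is to collect each row as a dict, so the column names sit right next to the values and no separate columns list has to be kept in sync:

data = []
for path_to_config in path_to_results.iterdir():
    try:
        (type_pred, type_model, type_len, type_small,
         len_small_fil, ratio_big_fil, ratio_small_fill) = path_to_config.name[:-4].split('__')
    except ValueError:
        print(path_to_config)
        continue
    path_to_trackings = sorted(str(el) for el in path_to_config.iterdir())[::-1]
    sfp1, lv2, lfc3, lfc2, lfc1 = display_metrics(path_to_gts, path_to_trackings)
    # one dict per row; pandas builds the columns from the keys
    data.append({
        'type_pred': type_pred, 'type_model': type_model,
        'type_len': type_len, 'type_small': type_small,
        'len_small_fil': len_small_fil, 'ratio_big_fill': ratio_big_fil,
        'ratio_small_fill': ratio_small_fill,
        'lfc1': lfc1, 'lfc2': lfc2, 'lfc3': lfc3, 'lv2': lv2, 'sfp1': sfp1,
    })

df = pd.DataFrame(data)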

generating duplicate values (fill down?) when parsing XML into Dataframe

I have a problem parsing XML into a DataFrame using Python. When I print out the values, some values seem to 'fill down', or repeat themselves (see column adres). Does anyone know what could be wrong?
import xml.etree.ElementTree as et
import pandas as pd
import xmltodict
import json
tree = et.parse('20191125_DMG_PI.xml')
root = tree.getroot()
df_cols = ["status", "priref", "full_name", "detail", "adres"]
rows = []
for record in root:
    for child in record:
        s_priref = child.get('priref')
        for field in child.findall('Address'):
            s_address = field.find('address').text
            #for sub in field.findall('address.country'):
            #    s_country = sub.find('value').text if s_country is not None else None
        for field in child.findall('name'):
            s_full_name = field.find('value').text
        for field in child.findall('name.status'):
            s_status = field.find('value').text
        for field in child.findall('level_of_detail'):
            s_detail = field.find('value').text
        rows.append({"status": s_status,
                     "priref": s_priref,
                     "full_name": s_full_name,
                     "detail": s_detail,
                     "adres": s_address})

out_df = pd.DataFrame(rows, columns=df_cols)
print(out_df)
First off, findall() returns an empty list if there is nothing found which matches the search criteria, so in the loop
for field in child.findall("..."):
    # this body is only executed if child.findall() returns a non-empty list
The consequence of this, in this case, is that s_address, s_full_name, s_status, and s_detail do not necessarily get assigned a new value on each iteration of the outer loop. Hence, they retain the value from the most recent iteration in which the respective child.findall() call returned a non-empty list.
The simple way to fix this is to assign them all to some initial value on each iteration of the outer loop, i.e.
for child in record:
    s_priref = child.get('priref')
    s_address = ''
    s_full_name = ''
    s_detail = ''
    s_status = ''
    # ...
Although it might be better (perhaps more 'pythonic') to do something like this:
# Map the child.findall() keys to the tag that field.find() should look up
find_map = {'Address': 'address',
            'name': 'value',
            'name.status': 'value',
            'level_of_detail': 'value'}
# The reference keys (output column names), in the same order, plus priref
ref = ["adres", "full_name", "status", "detail", "priref"]

for record in root:
    for child in record:
        # Initialise a second dict from the same keys, mapping to empty strings instead
        s = dict.fromkeys(find_map, '')
        s["priref"] = child.get('priref')
        for key in find_map:
            for field in child.findall(key):
                s[key] = field.find(find_map[key]).text
        rows.append(dict(zip(ref, s.values())))
This should work just the same as the other method but makes it easier to add more keys/fields as needed.

create a filtered list of dictionaries based on existing list of dictionaries

I have a list of dictionaries read in from csv DictReader that represent rows of a csv file:
rows = [{"id":"123","date":"1/1/18","foo":"bar"},
{"id":"123","date":"2/2/18", "foo":"baz"}]
I would like to create a new dictionary, where only unique ID's are stored. But I would like to only keep the row entry with the most recent date. Based on the above example, it would keep the row with date 2/2/18.
I was thinking of doing something like this, but I am having trouble translating the pseudocode in the else statement into actual Python.
I can figure out the part that checks which of the two dates is more recent, but I am having the most trouble figuring out how to check the new list for the dictionary that contains the same id and then retrieve the date from that row.
Note: Unfortunately, due to resource constraints on our platform I am unable to use pandas for this project.
new_data = []
for row in rows:
    if row['id'] not in new_data:
        new_data.append(row)
    else:
        check the element in new_data with the same id as row['id']
        if that element's date value is less recent:
            replace it with the current row
        else:
            continue to next row in rows
You'll need a function to convert your date (as string) to a date (as date).
import datetime

def to_date(date_str):
    d1, m1, y1 = [int(s) for s in date_str.split('/')]
    return datetime.date(y1, m1, d1)
I assumed your date format is d/m/yy. Consider using datetime.strptime to parse your dates, as illustrated by Alex Hall's answer.
Then, the idea is to loop over your rows and store them in a new structure (here, a dict whose keys are the IDs). If a key already exists, compare its date with the current row, and take the right one. Following your pseudo-code, this leads to:
rows = [{"id": "123", "date": "1/1/18", "foo": "bar"},
        {"id": "123", "date": "2/2/18", "foo": "baz"}]

new_data = dict()
for row in rows:
    existing = new_data.get(row['id'], None)
    if existing is None or to_date(existing['date']) < to_date(row['date']):
        new_data[row['id']] = row
If you want your new_data variable to be a list, use new_data = list(new_data.values()).
import datetime

rows = [{"id": "123", "date": "1/1/18", "foo": "bar"},
        {"id": "123", "date": "2/2/18", "foo": "baz"}]

def parse_date(d):
    return datetime.datetime.strptime(d, "%d/%m/%y").date()

tmp_dict = {}
for row in rows:
    if row['id'] not in tmp_dict:
        tmp_dict[row['id']] = row
    else:
        if parse_date(row['date']) > parse_date(tmp_dict[row['id']]['date']):
            tmp_dict[row['id']] = row

print tmp_dict.values()
output
[{'date': '2/2/18', 'foo': 'baz', 'id': '123'}]
Note: you can merge the two ifs into if row['id'] not in tmp_dict or parse_date(row['date']) > parse_date(tmp_dict[row['id']]['date']) for cleaner and shorter code.
Firstly, work with proper date objects, not strings. Here is how to parse them:
from datetime import datetime, date

rows = [{"id": "123", "date": "1/1/18", "foo": "bar"},
        {"id": "123", "date": "2/2/18", "foo": "baz"}]

for row in rows:
    row['date'] = datetime.strptime(row['date'], '%d/%m/%y').date()
(check if the format is correct)
Then for the actual task:
new_data = {}
for row in rows:
    new_data[row['id']] = max(new_data.get(row['id'], date.min),
                              row['date'])
print(new_data.values())
Alternatively:
Here are some generic utility functions that work well here which I use in many places:
from collections import defaultdict

def group_by_key_func(iterable, key_func):
    """
    Create a dictionary from an iterable such that the keys are the result of evaluating
    a key function on elements of the iterable and the values are lists of elements
    all of which correspond to the key.
    """
    result = defaultdict(list)
    for item in iterable:
        result[key_func(item)].append(item)
    return result

def group_by_key(iterable, key):
    return group_by_key_func(iterable, lambda x: x[key])
Then the solution can be written as:
by_id = group_by_key(rows, 'id')
for id_num, group in list(by_id.items()):
    by_id[id_num] = max(group, key=lambda r: r['date'])
print(by_id.values())
This is less efficient than the first solution because it creates lists along the way that are discarded, but I use the general principles in many places and I thought of it first, so here it is.
If you like to utilize classes as much as I do, then you could make your own class to do this:
from datetime import date

rows = [
    {"id": "123", "date": "1/1/18", "foo": "bar"},
    {"id": "123", "date": "2/2/18", "foo": "baz"},
    {"id": "456", "date": "3/3/18", "foo": "bar"},
    {"id": "456", "date": "1/1/18", "foo": "bar"}
]

class unique(dict):
    def __setitem__(self, key, value):
        # Add key if missing or replace key if date is newer
        if key not in self or self[key]["date"] < value["date"]:
            dict.__setitem__(self, key, value)

data = unique()  # Initialize new class based on dict
for row in rows:
    d, m, y = map(int, row["date"].split('/'))  # Split date into parts
    row["date"] = date(y, m, d)                 # Replace date value
    data[row["id"]] = row  # Set new data. Will overwrite same ids with more recent

print data.values()
Outputs:
[
{'date': datetime.date(18, 2, 2), 'foo': 'baz', 'id': '123'},
{'date': datetime.date(18, 3, 3), 'foo': 'bar', 'id': '456'}
]
Keep in mind that data is a dict subclass that overrides the __setitem__ method and uses the IDs as keys. The dates are date objects so they can be compared easily.

Fast distinct list of elements in an array python

I need to speed up counting the distinct elements in this code, and I'm not really sure how to make the count faster.
def process_columns(columns):
    with open(columns, 'r') as src:
        data = csv.reader(src, delimiter='\t', skipinitialspace=False)
        category = []
        group = columns.split("/")
        group = group[-1].split(".")
        if group[0] in ["data_1", "data_2"]:
            for row in data:
                if row[0] not in category:
                    category.append(row[0])
            message = "\t%d distinct elements from %ss" % (len(category), group[0])
            print message
A simple way to count the distinct elements in a Python list is:
array = [1,1,2,3,3,4,5,6,6]
n_elts = len(set(array))
print(n_elts)
Output:
6
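Applied to the code in the question, the same idea would replace the membership-test loop with a set built from the first column; a sketch, assuming data is the csv.reader from above:

# inside the same function, instead of the list + membership test:
distinct = {row[0] for row in data}   # set comprehension over the csv rows
message = "\t%d distinct elements from %ss" % (len(distinct), group[0])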
Without much knowledge on your data, here's a quick way to maintain a set of unique words for your groups, using collections.defaultdict.
from collections import defaultdict

def process_columns(columns):
    categories = defaultdict(set)  # initialises a default dict with values as sets
    with open(columns, 'r') as src:
        data = csv.reader(src, delimiter='\t', skipinitialspace=False)
        group = columns.split("/")[-1].split('.')
        for row in data:
            categories[group[0]].add(row[0])  # add() the whole string, not update()
        for k in categories:
            message = "\t%d distinct elements from %ss" % (len(categories[k]), k)
            print message
Initialise category as a set, and remove the if block used to add data into category; replace it with category.add:
category = set()
group = columns.split("/")
group = group[-1].split(".")
if group[0] in ["data_1", "data_2"]:
    for row in data:
        category.add(row[0])
Hope this is clear
