Convert a multi-column table to a dict of dicts - python

I have a custom class to simulate a row in a table (in the database sense); each column is a string.
class Row:
    """One table row; each column is kept as a plain attribute."""

    def __init__(self, filename, message, version):
        # Bind all three columns in a single tuple assignment.
        self.filename, self.message, self.version = filename, message, version
And I use a list to store them.
Assume I don't know the range of values in each column. I want to transform this 'table' into a dict of dicts,
so that it is easier to query for all rows where filename = OOO and version = XXXX.
What would be a better way to do it? Right now I could iterate through all rows and build the range for particular column but it's kind of spaghetti code.

Easiest is probably something like this. If you know your rows are immutable, you could provide a hash method though - that might look a little nicer.
#!/usr/local/cpython-3.3/bin/python
class Row:
    """A table row that prints as its three columns separated by spaces."""

    def __init__(self, filename, message, version):
        self.filename, self.message, self.version = filename, message, version

    def __str__(self):
        # Attribute lookups happen inside the format string itself.
        return '{0.filename} {0.message} {0.version}'.format(self)

    # repr() and str() deliberately produce the same text.
    __repr__ = __str__
def main():
    """Index sample rows by (filename, version) and print one looked-up row."""
    rows = [
        Row('abc', 'message1', 'version1'),
        Row('def', 'message2', 'version2'),
        Row('ghi', 'message3', 'version3'),
        Row('jkl', 'message4', 'version4'),
        Row('mno', 'message5', 'version5'),
    ]
    # Key every row by the pair of columns we want to query on; a later
    # row with the same pair replaces an earlier one.
    rows_by_key = {(entry.filename, entry.version): entry for entry in rows}
    wanted = ('def', 'version2')
    print(rows_by_key[wanted])


main()

Related

Sorting objects using CSV data

Before I start: I am a beginner.
So, I made objects using data from a CSV file which was in the form 'doe,john,m,20', which can be seen below:
class FitClinic:
    """A clinic customer built from one CSV record.

    The four fields are stored as name-mangled private attributes and are
    read through ``get_*`` and changed through ``set_*``.

    Each setter takes the new value as an optional argument. Calling a
    setter with no argument (or with ``None``) leaves the field unchanged,
    which keeps the old zero-argument call form working.
    """

    def __init__(self, lname, fname, gender, age):
        self.__lname = lname
        self.__fname = fname
        self.__gender = gender
        self.__age = age

    def __del__(self):
        # Runs when the instance is finalised; timing is up to the interpreter.
        print("Customer has been deleted")

    def set_lname(self, lname=None):
        """Replace the last name when a value is given."""
        if lname is not None:
            self.__lname = lname

    def get_lname(self):
        """Return the last name."""
        return self.__lname

    def set_fname(self, fname=None):
        """Replace the first name when a value is given."""
        if fname is not None:
            self.__fname = fname

    def get_fname(self):
        """Return the first name."""
        return self.__fname

    def set_gender(self, gender=None):
        """Replace the gender when a value is given."""
        if gender is not None:
            self.__gender = gender

    def get_gender(self):
        """Return the gender."""
        return self.__gender

    def set_age(self, age=None):
        """Replace the age when a value is given."""
        if age is not None:
            self.__age = age

    def get_age(self):
        """Return the age."""
        return self.__age
# Build one FitClinic object per line of the CSV file.
listofcustomers = []
# The with-block closes the file even if a line fails to parse.
with open('fit_clinic_20.csv', 'r') as fh:
    # NOTE(review): presumably skips a 3-byte marker at the start of the
    # file (e.g. a UTF-8 BOM) - confirm against the actual file.
    fh.seek(3)
    for row in fh:
        row = row.strip()
        if not row:
            # Ignore blank lines such as a trailing newline at end of file.
            continue
        c = row.split(",")
        # Pass the age field itself (c[3]); the slice c[3:] would hand the
        # constructor a list instead of a single value.
        listofcustomers.append(FitClinic(c[0], c[1], c[2], c[3]))
What I need to do is sort these objects by the fname attribute, but I have no idea how to do that. Please help, thanks.
You can use sorted with key
# Copy the customer list, then sort the copy by first name.
sorted_list = list(listofcustomers)
sorted_list.sort(key=lambda customer: customer.get_fname())
Refer to Sorting HOW TO
To check the result, you can just print the information with the method you have implemented:
# Show the first names in their sorted order, one per line.
for customer in sorted_list:
    print(customer.get_fname())
In a more complicated situation, as advised by @Maurice Reeves, you can also add __str__ and __repr__ methods.
def __str__(self):
    """Render the four fields as comma-separated ``key:value`` pairs."""
    # Adjacent f-strings are concatenated into one string at compile time.
    return (
        f'lname:{self.__lname},'
        f'fname:{self.__fname},'
        f'gender:{self.__gender},'
        f'age:{self.__age}'
    )


# repr() reuses the same rendering as str().
__repr__ = __str__
Then you can print by:
# print() calls each customer's __str__.
for customer in sorted_list:
    print(customer)
Refer to Python doc.
BTW, you can use pandas to load csv file conveniently.
import pandas

# Load the CSV file into a DataFrame.
csv_pd = pandas.read_csv('fit_clinic_20.csv')
# sort_values returns a new sorted DataFrame rather than sorting in place,
# so the result has to be kept.
csv_pd = csv_pd.sort_values(by=['fname'])  # If fname is the head of your csv file. If not, just add it.
Refer to pandas.DataFrame.sort_values

Python class containing self-referencing array

I have a class like this...
class person:
    """A named person with one attribute per kind of asset data."""

    def __init__(self, name):
        self.name = name
        # Both asset attributes start out as the same empty-string placeholder.
        self.carData = self.HouseData = ""
        #other assets
And I'm looping through it like this...
john = person('John')
# The same processing is written out once per asset attribute.
for idx, record in john.carData.iterrows():
    pass  # do something
for idx, record in john.HouseData.iterrows():
    pass  # do something
But in this case "do something" is repetitive code - I want to do the same thing for carData, HouseData and all the other assets a person can own.
What's the best way to loop through these in a non-repetitive way? I was thinking of something like this in the class definition but I'm pretty sure it's at least inelegant and probably wrong...
class person:
    """A named person whose asset data sets are also collected in a list."""

    def __init__(self, name):
        self.name = name
        self.carData = ""
        self.HouseData = ""
        #other assets
        # The list holds references to the objects bound above. Rebinding
        # self.carData or self.HouseData later does NOT update this list.
        self.assets = [self.carData, self.HouseData]
john = person('John')
# One nested loop handles every asset data set in turn.
for data_set in john.assets:
    for idx, record in data_set.iterrows():
        pass  # do something
Is that bad practise to define a list of objects (in this case self.assets) that itself refers to the same objects stored in the class? Apologies if this is vague - I'm just trying to figure out the cleanest way to deal with this scenario.
If you plan on applying the same code to every attribute of your object, you can try john.__dict__:
john = person('John')
# vars(obj) returns the same mapping as obj.__dict__.
for attr, value in vars(john).items():
    # e.g. attr == 'carData'
    for idx, record in value.iterrows():
        pass  # do something
See this question.
It's kind of hackish, but if (for example) you want to apply the code to every attribute except "name", do this:
john = person('John')
for attr, value in vars(john).items():
    # e.g. attr == 'carData'
    if attr == 'name':
        # The name is not an asset data set, so skip it.
        continue
    for idx, record in value.iterrows():
        pass  # do something

Python Building Object from two sources

I need to be able to build my buildObject using data extracted from csv file columns
class BuildObject:
    """One object assembled from the columns of the source data.

    The class no longer lists ``ObjectID`` as a base class: that name is
    only a constructor parameter, so using it as a base raised NameError
    when the class statement ran.
    """

    def __init__(self, ObjectID, ObjectName, ObjectPrice, ObjectLocation, ObjectColour, ObjectAge, ObjectTag):
        # Keep every constructor argument; previously only the first two
        # were stored and the rest were silently dropped.
        self.ObjectID = ObjectID
        self.ObjectName = ObjectName
        self.ObjectPrice = ObjectPrice
        self.ObjectLocation = ObjectLocation
        self.ObjectColour = ObjectColour
        self.ObjectAge = ObjectAge
        self.ObjectTag = ObjectTag
def main():
    """Pair up the rows of two CSV files and collect three fields per pair.

    Returns a list of (ObjectName, ObjectId, ObjectPrice) tuples; the name
    is read from the second file, the id and price from the first.
    """
    with open(filename1, "r") as csv1, open(filename2, "r") as csv2:
        first_rows = csv.DictReader(csv1)
        second_rows = csv.DictReader(csv2)
        # zip pairs rows purely by position and stops at the shorter file.
        return [
            (second["ObjectName"], first["ObjectId"], first["ObjectPrice"])
            for first, second in zip(first_rows, second_rows)
        ]
Comment: My concern with this answer is that it will work fine provided the csv files have exactly the same objectIDs in the same order - but what will happen if an objectID/Object is missing from only one of the csv files?
Therefore, you can't use zip(csvReader1, csvReader2); you
need random access to a Data_Record using the ObjectID as key/index.
As you mentioned large amounts of data, I would recommend going with SQL.
If you want to do it using Python objects change the following:
def __init__(self):
    # Dict instead of list: records are looked up directly by ObjectID.
    self._data_store = {}

@data_store.setter
def data_store(self, data):
    # The elided part builds `record` from `data`.
    ...
    # A record exposes its id as the ObjectID attribute, so use attribute
    # access rather than subscripting when choosing the dict key.
    self._data_store[record.ObjectID] = record
Question: The one topic would be to create a BuildObject for every unique itemID, using the data from the csv files and the sql query
Checking your code, got the following Error:
class BuildObject(ObjectID):
NameError: name 'ObjectID' is not defined
Why do you inherit from ObjectID?
Where is this class defined?
Consider the following:
class Data_Record():
    """
    Holds all the data that belongs to ONE record.
    """

    def __init__(self, ObjectID, ObjectName):
        self.ObjectID, self.ObjectName = ObjectID, ObjectName
        # ... (omitted for brevity)
class Data_Store():
    """
    Collects Data_Record objects read from csv, sql or anywhere else.

    Assigning to ``data_store`` does not replace the collection: the
    setter turns the assigned value into ONE Data_Record and appends it.
    """

    def __init__(self):
        # List to hold all Data_Record objects
        self._data_store = []

    @property
    def data_store(self):
        """Return the list of stored Data_Record objects."""
        return self._data_store

    @data_store.setter
    def data_store(self, data):
        """Build ONE Data_Record from ``data`` and append it.

        ``data`` is either a dict keyed by the Data_Record constructor's
        parameter names, or a list/tuple of positional values in
        constructor order (ObjectID first, then ObjectName).

        Raises:
            ValueError: if ``data`` is of any other type.
        """
        if isinstance(data, dict):
            record = Data_Record(**data)
        elif isinstance(data, (list, tuple)):
            # Positional values are unpacked with a single star; ``**``
            # only accepts mappings.
            record = Data_Record(*data)
        else:
            # Raise an exception instance; ``raise(ValueError, msg)`` would
            # try to raise a tuple and fail with TypeError.
            raise ValueError("Data of type({}) are not supported!".format(type(data)))
        self._data_store.append(record)

    def read_csv(self, fname1, fname2):
        """Add one record per pair of rows from the two csv readers."""
        # ... (omitted for brevity)
        csvReader1, csvReader2 = ([], [])
        for csv1, csv2 in zip(csvReader1, csvReader2):
            # Constructor order: ObjectID first, then ObjectName.
            self.data_store = (csv1["ObjectId"], csv2["ObjectName"])

    def read_sql(self, sql, query):
        """Add one record per row returned by ``sql.query(query)``."""
        result = sql.query(query)
        for record in result:
            self.data_store = record
Alternative: Without @property/getter/setter.
Here the read(...) functions have to know how to add a new Data_Record object to self.data_store. Note: self.data_store is now a public attribute.
If you decide, later on, not to store the data in memory, you have to rewrite both read(...) functions.
class Data_Record():
    """Holds the data of one record, built from a dict or a sequence.

    ``data`` may be:
      * a dict with the keys 'ObjectID' and 'ObjectName';
      * a list or tuple in the predefined order ObjectID, ObjectName;
      * anything else (including None), which leaves both fields as None.
    """

    def __init__(self, data=None):
        # Condition type(data)
        if isinstance(data, dict):
            self.ObjectID = data['ObjectID']
            self.ObjectName = data['ObjectName']
        elif isinstance(data, (list, tuple)):
            # Tuples are accepted as well as lists; otherwise a tuple would
            # fall into the else branch and yield an all-None record.
            # Values have to be in predefined order
            # e.g ObjectID == Index 0 ObjectName == Index 1 etc.
            self.ObjectID = data[0]
            self.ObjectName = data[1]
        else:
            self.ObjectID = None
            self.ObjectName = None
class Data_Store():
    """Keeps Data_Record objects in the public list ``data_store``."""

    def __init__(self):
        self.data_store = []

    def read_csv(self, fname1, fname2):
        """Append one Data_Record per pair of rows from the two csv files.

        Rows are paired by position, so both files must list the same
        objects in the same order; reading stops at the shorter file.
        """
        import csv

        with open(fname1, "r") as file1, open(fname2, "r") as file2:
            csvReader1 = csv.DictReader(file1)
            csvReader2 = csv.DictReader(file2)
            for csv1, csv2 in zip(csvReader1, csvReader2):
                # Data_Record reads index 0 as ObjectID and index 1 as
                # ObjectName, and handles lists, so pass a list in that order.
                self.data_store.append(Data_Record([csv1["ObjectId"], csv2["ObjectName"]]))

    def read_sql(self, query):
        """Append one Data_Record per row returned by ``SQL.query(query)``."""
        for record in SQL.query(query):
            self.data_store.append(Data_Record(record))

Best approach for extensible return object in python

I'm not that familiar with python, so I would like some advice on the best approach for the following situation:
I have created a function analyzing logs and returning tuples like this (example code):
def analyse(log):
    """Analyse ``log`` and return a (timestamps, values) pair."""
    # do magic
    return list_of_timestamps, list_of_values
Usage:
log = import_my_log()
timestamps, data = analyse(log)
if timestamps:
plot(log)
Then we needed plots of other data, so we extended analyse():
def analyse(log):
    """Analyse ``log`` and return (timestamps, values, other values)."""
    # do magic
    return list_of_timestamps, list_of_values, list_of_other_values
so we needed to change the line timestamps, data = analyse(log) to timestamps, data, other_data = analyse(log), because it changes the interface. In C code, I would return a struct and access it like mydata.timestamps. My only solution seems to be to create a class myData with members, which I would return and access like the C struct.
Is there a better/more pythonic way?
Simply use a dictionary:
return {'timestamps': list_of_timestamps, 'values': list_of_values}
If you need to later add things, you just add more keys:
return {'timestamps': list_of_timestamps, 'values': list_of_values, 'avg_coffee_consumption': 12e9}
With dicts, you get the values back via indexing with the keys, i.e.:
result = analyse(log)
v = result['values']
You can also actually use your struct approach; now, Python objects are run-time extendable, so your analyze might contain
class return_t:
    """Empty container; result fields are attached as attributes at run time."""
def analyze(log):
    """Return the analysis results as attributes of a fresh return_t."""
    bundle = return_t()
    bundle.values = list_of_values
    bundle.timestamps = list_of_timestamps
    bundle.avg_coffee_consumption = 12e9
    return bundle
...
result = analyze(l)
# Read the fields from `result`, the object analyze() returned. `ret_val`
# exists only inside analyze(), and the attribute it sets is named
# avg_coffee_consumption (no trailing "s").
coffee_used = result.avg_coffee_consumption * (max(result.timestamps) - min(result.timestamps))
Create a dict:
def analyse(log):
    """Analyse ``log`` and return the results as a dict keyed by name."""
    # do magic
    return dict(timestamps=list_of_timestamps, values=list_of_values)
Or create a class:
class MeasureItem():
    """A single measurement: one timestamp and the value recorded at it."""

    def __init__(self, timestamp, value):
        self.timestamp = timestamp
        self.value = value
def analyse(log):
    """Analyse ``log`` and return the result wrapped in a MeasureItem."""
    # do magic
    item = MeasureItem(timestamp, value)
    return item

python generator

I have homework that I am stuck on. I have gone as far as I can; can someone point me in the right direction? I am getting stuck on making each data row a new object. Normally I would think I could just iterate over the rows, but that will only return the last row.
Question:
Modify the classFactory.py source code so that the DataRow class returned by the build_row function has another method:
retrieve(self, curs, condition=None)
self is (as usual) the instance whose method is being called, curs is a database cursor on an existing database connection, and condition (if present) is a string of condition(s) which must be true of all received rows.
The retrieve method should be a generator, yielding successive rows of the result set until it is completely exhausted. Each row should be a new object of type DataRow.
This is what I have------
the test:
import unittest
from classFactory import build_row
class DBTest(unittest.TestCase):
    """Checks the row class that build_row produces for a three-column table."""

    def setUp(self):
        row_class = build_row("user", "id name email")
        self.c = row_class([1, "Steve Holden", "steve#holdenweb.com"])

    def test_attributes(self):
        # Each column name must have become an attribute holding its value.
        expected = (
            ("id", 1),
            ("name", "Steve Holden"),
            ("email", "steve#holdenweb.com"),
        )
        for column, value in expected:
            self.assertEqual(getattr(self.c, column), value)

    def test_repr(self):
        expected = "user_record(1, 'Steve Holden', 'steve#holdenweb.com')"
        self.assertEqual(repr(self.c), expected)


if __name__ == "__main__":
    unittest.main()
the script I am testing
def build_row(table, cols):
    """Build a class that creates instances of specific rows

    ``table`` is the table name used in the repr; ``cols`` is a
    whitespace-separated string of column names. The returned class takes
    one sequence of values, one per column, and exposes each value as an
    attribute named after its column.
    """

    class DataRow:
        """Generic data row class, specialized by surrounding function"""

        def __init__(self, data):
            """Uses data and column names to inject attributes"""
            assert len(data) == len(self.cols)
            for colname, dat in zip(self.cols, data):
                setattr(self, colname, dat)

        def __repr__(self):
            # Values are joined with ", " only. An extra leading space per
            # value would give "user_record( 1,  'x')" rather than
            # "user_record(1, 'x')".
            values = ", ".join("{0!r}".format(getattr(self, c)) for c in self.cols)
            return "{0}_record({1})".format(self.table, values)

    # Attach the specialisation to the class so every instance shares it.
    DataRow.table = table
    DataRow.cols = cols.split()
    return DataRow
It should roughly be something like the following:
def retrieve(self, curs, condition=None):
    """Yield one new row object for each matching record of this table.

    ``curs`` is a database cursor on an open connection. ``condition``,
    when given, is a string of SQL condition(s) every returned row must
    satisfy. It may be written with or without a leading ``WHERE``.

    The query selects this row class's own columns from its own table, so
    each result row has the length the row constructor expects.

    NOTE: ``condition`` is inserted into the SQL text as-is, so it must
    come from trusted code, never from user input.
    """
    query_ = "SELECT {0} FROM {1}".format(", ".join(self.cols), self.table)
    if condition is not None and condition.strip():
        clause = condition.strip()
        # Add the keyword unless the caller already supplied it.
        if clause.upper().split()[:1] != ["WHERE"]:
            clause = "WHERE " + clause
        query_ += " " + clause
    curs.execute(query_)
    row_class = self.__class__
    for row in curs.fetchall():  # iterate over the retrieved results
        # Wrap each result in a fresh object of the same row class.
        yield row_class(row)
Iterate over the rows as normal, but use yield instead of return.

Categories

Resources