I need to be able to build my BuildObject using data extracted from csv file columns:
import csv

class BuildObject(ObjectID):
    def __init__(self, ObjectID, ObjectName, ObjectPrice, ObjectLocation, ObjectColour, ObjectAge, ObjectTag):
        self.ObjectID = ObjectID
        self.ObjectName = ObjectName

def main():
    with open(filename1, "r") as csv1, open(filename2, "r") as csv2:
        csvReader1 = csv.DictReader(csv1)
        csvReader2 = csv.DictReader(csv2)
        csvList = []
        for row1, row2 in zip(csvReader1, csvReader2):
            csvList.append((row2["ObjectName"], row1["ObjectId"], row1["ObjectPrice"]))
        return csvList
Comment: My concern with this answer is that it will work fine provided the csv files have the exact same ObjectIDs in the same order - but what will happen if an ObjectID/Object is missing in only one of the csv files?
Therefore, you can't use zip(csvReader1, csvReader2); you need random access to a Data_Record using the ObjectID as key/index.
As you mentioned large amounts of data, I would recommend going with SQL.
If you want to do it using Python objects, change the following:
    def __init__(self):
        self._data_store = {}

    @data_store.setter
    def data_store(self, data):
        ...
        self._data_store[record['ObjectID']] = record
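As a concrete illustration, here is a minimal sketch of that idea, merging two csv files through dicts keyed by ObjectID (the helper name merge_csv_files and the column names are illustrative, not from the original code):
import csv

def merge_csv_files(filename1, filename2):
    """Index both csv files by ObjectID so records pair up correctly
    even when one file is missing an ObjectID or is ordered differently."""
    with open(filename1, "r") as csv1, open(filename2, "r") as csv2:
        records1 = {row["ObjectId"]: row for row in csv.DictReader(csv1)}
        records2 = {row["ObjectId"]: row for row in csv.DictReader(csv2)}
    merged = {}
    # Only ObjectIDs present in both files yield a complete record.
    for object_id in records1.keys() & records2.keys():
        merged[object_id] = (records2[object_id]["ObjectName"],
                             records1[object_id]["ObjectPrice"])
    return merged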
Question: The main topic is how to create a BuildObject for every unique ObjectID, using the data from the csv files and an SQL query.
Checking your code, I got the following error:
class BuildObject(ObjectID):
NameError: name 'ObjectID' is not defined
Why do you inherit from ObjectID?
Where is this class defined?
Consider the following:
class Data_Record():
    """
    This class object holds all data for ONE record.
    """
    def __init__(self, ObjectID, ObjectName):
        self.ObjectID = ObjectID
        self.ObjectName = ObjectName
        # ... (omitted for brevity)

class Data_Store():
    """
    This class object handles Data_Record objects, reading from csv or sql or anywhere.
    """
    def __init__(self):
        # List to hold all Data_Record objects
        self._data_store = []

    # Access (read only) the Data_Record objects
    @property
    def data_store(self):
        return self._data_store

    # Add ONE Data_Record from either csv or sql or anywhere
    @data_store.setter
    def data_store(self, data):
        # Condition type(data)
        if isinstance(data, dict):
            record = Data_Record(**data)
        elif isinstance(data, (list, tuple)):
            record = Data_Record(*data)
        else:
            raise ValueError("Data of type({}) are not supported!".format(type(data)))
        self._data_store.append(record)

    # Method to read from csv
    def read_csv(self, fname1, fname2):
        # ... (omitted for brevity)
        csvReader1, csvReader2 = ([], [])
        for csv1, csv2 in zip(csvReader1, csvReader2):
            self.data_store = {"ObjectID": csv1["ObjectId"], "ObjectName": csv2["ObjectName"]}

    # Method to read from sql
    def read_sql(self, sql, query):
        result = sql.query(query)
        for record in result:
            self.data_store = record
Alternative: without @property/getter/setter.
Here the read_...() functions have to know how to add a new Data_Record object to self.data_store. Note: self.data_store is now a public attribute.
If you decide, later on, to store the data somewhere other than in memory, you have to rewrite both read_...() functions.
class Data_Record():
    def __init__(self, data=None):
        # Condition type(data)
        if isinstance(data, dict):
            self.ObjectID = data['ObjectID']
            self.ObjectName = data['ObjectName']
        elif isinstance(data, (list, tuple)):
            # List has to be in a predefined order,
            # e.g. ObjectID == index 0, ObjectName == index 1, etc.
            self.ObjectID = data[0]
            self.ObjectName = data[1]
        else:
            self.ObjectID = None
            self.ObjectName = None

class Data_Store():
    def __init__(self):
        self.data_store = []

    def read_csv(self, fname1, fname2):
        # ... open the files and create csvReader1/csvReader2 as above ...
        for csv1, csv2 in zip(csvReader1, csvReader2):
            self.data_store.append(Data_Record((csv1["ObjectId"], csv2["ObjectName"])))

    def read_sql(self, query):
        for record in SQL.query(query):
            self.data_store.append(Data_Record(record))
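Usage would then look roughly like this (a sketch; the file names are placeholders):
store = Data_Store()
store.read_csv("objects1.csv", "objects2.csv")
for record in store.data_store:
    print(record.ObjectID, record.ObjectName)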
In the code below, as you can see, I have a test class that inherits from the SqliteDict class. There is also a get_terms() method that returns the keys of the dictionary. In the main part, I first make an instance of the class, create a new SqliteDict file, and assign simple data to it through a context-manager block. Up to this point everything works, but when I try to read the data back from the same file through the second context-manager block, it seems the data was not saved in the file.
from collections import defaultdict
from sqlitedict import SqliteDict

class test(SqliteDict):
    def __init__(self, filename: str = "inverted_index.sqlite", new=False):
        super().__init__(filename, flag="n" if new else "c")
        self._index = defaultdict(list) if new else self

    def get_terms(self):
        """Returns all unique terms in the index."""
        return self._index.keys()

if __name__ == "__main__":
    with test("test.sqlite", new=True) as d:
        d._index["test"] = ["ok"]
        print("first attempt: ", [t for t in d.get_terms()])
        d.commit()
    with test("test.sqlite", new=False) as f:
        print("second attempt: ", [t for t in f.get_terms()])
and the result is:
first attempt: ['test']
second attempt: []
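For what it's worth, the symptom is consistent with _index pointing at a plain in-memory defaultdict when new=True: d._index["test"] then never touches the SqliteDict that commit() persists, and the second open reads the (still empty) SqliteDict. A minimal sketch of one possible fix, assuming sqlitedict's standard dict-style API, is to always back _index with the SqliteDict itself:
from sqlitedict import SqliteDict

class test(SqliteDict):
    def __init__(self, filename: str = "inverted_index.sqlite", new=False):
        # flag="n" clears the file, flag="c" opens or creates it
        super().__init__(filename, flag="n" if new else "c")
        # Write through to the SqliteDict itself so commit() persists the data
        self._index = self

    def get_terms(self):
        """Returns all unique terms in the index."""
        return self._index.keys()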
Instead of using a dict to store and pass data, we are going with a completely OOP approach of storing the data as class attributes and calling the get methods defined as needed.
In Java I was able to achieve this, but I am having some trouble in Python. Any solution would be helpful.
import json

class InputModel:
    def __init__(self, input_payload):
        self.id1 = input_payload["id1"]
        self.route = RouteModel(input_payload["route"])
        self.id2 = input_payload["id2"]
        self.id3 = input_payload["id3"]
        self.id4 = input_payload["id4"]
        self.id5 = input_payload["id5"]

        def get_id1(self):
            return self.id1
        # similar for other ids

class RouteModel:
    def __init__(self, input_payload_route):
        self.id6 = input_payload_route["id6"]
        self.id7 = input_payload_route["id7"]

        def get_id6(self):
            return self.id6
        # similar for other ids

json_str = '{"id1":"string","route":{"id6":"string","id7":"string"},"id2": "string","id3": "string","id4": "string","id5": "string"}'
json_dict = json.loads(json_str)
im = InputModel(json_dict)
print(im.get_id1())
print(im.get_id6())
I am not able to access the nested class attributes.
Seems like you went for one extra indent in your class methods, so you couldn't reach them.
Also, to reach id6 of RouteModel, you have to refer to 'route' first:
import json

class InputModel:
    def __init__(self, input_payload):
        self.id1 = input_payload["id1"]
        self.route = RouteModel(input_payload["route"])
        self.id2 = input_payload["id2"]
        self.id3 = input_payload["id3"]
        self.id4 = input_payload["id4"]
        self.id5 = input_payload["id5"]

    def get_id1(self):
        return self.id1
    # similar for other ids

class RouteModel:
    def __init__(self, input_payload_route):
        self.id6 = input_payload_route["id6"]
        self.id7 = input_payload_route["id7"]

    def get_id6(self):
        return self.id6
    # similar for other ids

json_str = '{"id1":"string","route":{"id6":"string","id7":"string"},"id2": "string","id3": "string","id4": "string","id5": "string"}'
json_dict = json.loads(json_str)
im = InputModel(json_dict)
print(im.get_id1())
print(im.route.get_id6())
Output:
string
string
The problem is that you are only defining get_id* in the local scope; you need to assign it to the instance if you insist on defining it inside the __init__ method.
I minimized your code example to isolate your issue.
class RouteModel:
    def __init__(self):
        self.id6 = "foo"

        def get_id6(self_=self):
            return self_.id6

        self.get_id6 = get_id6

rm = RouteModel()
print(rm.get_id6())
>>> foo
If I understand your question correctly, you want to be able to access the ids directly as attributes, no matter how deeply they are nested in the dictionary.
This solution creates the attributes recursively:
import json

class InputModel:
    def __init__(self, payload):
        self.create_attrs(payload)

    def create_attrs(self, d):
        for key, value in d.items():
            # if the value is a dict, call create_attrs recursively
            if isinstance(value, dict):
                self.create_attrs(value)
            else:
                # create an attribute key=value, e.g. id1="string"
                setattr(self, key, value)

json_str = '{"id1":"string","route":{"id6":"string","id7":"string"},"id2": "string","id3": "string","id4": "string","id5": "string"}'
json_dict = json.loads(json_str)
im = InputModel(json_dict)
print(im.id1)
print(im.id6)
After going through the answers provided, most have defined instance attributes and not class attributes.
Correct me if I'm wrong here, but I think this is how class attributes are defined, right?
import json

class InputModel:
    def __init__(self, input_payload):
        InputModel.id1 = input_payload["id1"]
        InputModel.route = RouteModel(input_payload["route"])
        InputModel.id2 = input_payload["id2"]
        InputModel.id3 = input_payload["id3"]
        InputModel.id4 = input_payload["id4"]
        InputModel.id5 = input_payload["id5"]

    def get_id1():
        return InputModel.id1
    # OR
    # @classmethod
    # def get_id1(cls):
    #     return cls.id1
    # similar for other ids

class RouteModel:
    def __init__(self, input_payload_route):
        RouteModel.id6 = input_payload_route["id6"]
        RouteModel.id7 = input_payload_route["id7"]

    def get_id6():
        return RouteModel.id6
    # similar for other ids

json_str = '{"id1":"string","route":{"id6":"string","id7":"string"},"id2": "string","id3": "string","id4": "string","id5": "string"}'
json_dict = json.loads(json_str)
InputModel(json_dict)
print(InputModel.get_id1())
print(InputModel.route.get_id6())
print(RouteModel.get_id6())
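For reference, a minimal sketch of the distinction (names are illustrative): an assignment on the class object creates a class attribute shared by all instances, while an assignment on self creates a per-instance attribute.
class Model:
    shared = "class attribute"   # one value shared by every instance

    def __init__(self, value):
        self.own = value         # a new value per instance

a = Model("first")
b = Model("second")
print(a.shared is b.shared)   # True: both resolve to Model.shared
print(a.own, b.own)           # first second: independent instance attributes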
I have the following code to insert documents into MongoDB. The problem is that it's quite slow, since I'm unable to multiprocess it, and considering I have to check whether each inserted document already exists, I believe it's impossible to use bulk inserts. I'm wondering if there is a faster approach to this problem. After profiling the code below I found that check_record() and update_upstream() are the two most time-consuming functions, so optimising them would increase the overall speed. Any input on how to optimise the code below would be highly appreciated. Thank you!
import os
import multiprocessing

import pymongo
from directory import Directory
from pymongo import ASCENDING
from pymongo import DESCENDING
from pymongo import MongoClient
from storage_config import StorageConfig
from tqdm import tqdm

dir = Directory()

def DB_collections(collection_type):
    types = {'p': 'player_stats',
             't': 'team_standings',
             'f': 'fixture_stats',
             'l': 'league_standings',
             'pf': 'fixture_players_stats'}
    return types.get(collection_type)

class DB():
    def __init__(self, league, season, func=None):
        self.db_user = os.environ.get('DB_user')
        self.db_pass = os.environ.get('DB_pass')
        self.MONGODB_URL = f'mongodb+srv://{self.db_user}:{self.db_pass}@cluster0-mbqxj.mongodb.net/<dbname>?retryWrites=true&w=majority'
        self.league = league
        self.season = str(season)
        self.client = MongoClient(self.MONGODB_URL)
        self.DATABASE = self.client[self.league + self.season]
        self.pool = multiprocessing.cpu_count()
        self.playerfile = f'{self.league}_{self.season}_playerstats.json'
        self.teamfile = f'{self.league}_{self.season}_team_standings.json'
        self.fixturefile = f'{self.league}_{self.season}_fixturestats.json'
        self.leaguefile = f'{self.league}_{self.season}_league_standings.json'
        self.player_fixture = f'{self.league}_{self.season}_player_fixture.json'
        self.func = func

    def execute(self):
        if self.func is not None:
            return self.func(self)

def import_json(file):
    """Imports a json file in read mode

    Args:
        file (str): Name of file
    """
    return dir.load_json(file, StorageConfig.DB_DIR)

def load_file(file):
    try:
        loaded_file = import_json(file)
        return loaded_file
    except FileNotFoundError:
        print("Please check that", file, "exists")

def check_record(collection, index_dict):
    """Check if a record exists in the collection

    Args:
        index_dict (dict): key, value
    """
    return collection.find_one(index_dict)

def collection_index(collection, index, *args):
    """Checks if an index exists for the collection,
    and creates a new index if not

    Args:
        collection (str): Name of collection in database
        index (str): Dict key to be used as an index
        args (str): Additional dict keys to create compound indexes
    """
    compound_index = tuple((arg, ASCENDING) for arg in args)
    if index not in collection.index_information():
        return collection.create_index([(index, DESCENDING), *compound_index], unique=True)

def push_upstream(collection, record):
    """Insert a record into the collection

    Args:
        collection (str): Name of collection in database
        record (dict): Data to be pushed into the collection
    """
    return collection.insert_one(record)

def update_upstream(collection, index_dict, record):
    """Update a record in the collection

    Args:
        collection (str): Name of collection in database
        index_dict (dict): key, value
        record (dict): Data to be updated in the collection
    """
    return collection.update_one(index_dict, {"$set": record}, upsert=True)

def executePushPlayer(db):
    playerstats = load_file(db.playerfile)
    collection_name = DB_collections('p')
    collection = db.DATABASE[collection_name]
    collection_index(collection, 'p_id')
    for player in tqdm(playerstats):
        existingPost = check_record(collection, {'p_id': player['p_id']})
        if existingPost:
            update_upstream(collection, {'p_id': player['p_id']}, player)
        else:
            push_upstream(collection, player)

if __name__ == '__main__':
    db = DB('EN_PR', '2019')
    executePushPlayer(db)
You can combine the check/insert/update logic into a single update_one() command using upsert=True, then use the bulk operators with something like:
from pymongo import UpdateOne

updates = []
for player in tqdm(playerstats):
    # upsert=True inserts the document if no record matches the filter
    updates.append(UpdateOne({'p_id': player['p_id']}, {'$set': player}, upsert=True))
collection.bulk_write(updates)
Finally, check that your index is being used with the following command in the MongoDB shell:
db.mycollection.aggregate([{ $indexStats: {} }])
And review the accesses.ops metric.
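Put together, executePushPlayer() could then look roughly like this (a sketch reusing the helpers above; error handling and batching of very large lists are omitted):
from pymongo import UpdateOne

def executePushPlayer(db):
    playerstats = load_file(db.playerfile)
    collection = db.DATABASE[DB_collections('p')]
    collection_index(collection, 'p_id')
    # One UpdateOne per player: upsert=True inserts missing documents,
    # so the per-document find_one() round trip disappears.
    updates = [UpdateOne({'p_id': player['p_id']}, {'$set': player}, upsert=True)
               for player in playerstats]
    if updates:
        # ordered=False lets the server apply the operations in any order
        # and continue past individual failures.
        collection.bulk_write(updates, ordered=False)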
I have a custom class to simulate a row in a table (in the database sense); each column is a string.
class Row:
    def __init__(self, filename, message, version):
        self.filename = filename
        self.message = message
        self.version = version
And I use a list to store them.
Assume I don't know the range of each column, and I want to transform this 'table' into a dict of dicts, such that it would be easier to query for all rows where filename = OOO and version = XXXX.
What would be a better way to do it? Right now I iterate through all rows and build the range for a particular column, but it's kind of spaghetti code.
Easiest is probably something like this. If you know your rows are immutable, you could provide a __hash__ method though - that might look a little nicer.
#!/usr/local/cpython-3.3/bin/python

class Row:
    def __init__(self, filename, message, version):
        self.filename = filename
        self.message = message
        self.version = version

    def __str__(self):
        return '{} {} {}'.format(self.filename, self.message, self.version)

    __repr__ = __str__

def main():
    list_ = [
        Row('abc', 'message1', 'version1'),
        Row('def', 'message2', 'version2'),
        Row('ghi', 'message3', 'version3'),
        Row('jkl', 'message4', 'version4'),
        Row('mno', 'message5', 'version5'),
    ]
    dict_ = {}
    for row in list_:
        tuple_ = (row.filename, row.version)
        dict_[tuple_] = row
    sought = ('def', 'version2')
    print(dict_[sought])

main()
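For the __hash__ route mentioned above, a minimal sketch might look like this (assuming rows are treated as immutable once created, and that filename plus version identify a row):
class Row:
    def __init__(self, filename, message, version):
        self.filename = filename
        self.message = message
        self.version = version

    def __eq__(self, other):
        return (self.filename, self.version) == (other.filename, other.version)

    def __hash__(self):
        # Rows hash on the lookup key, so a dict or set of Rows can be
        # probed directly with another Row carrying the same key fields.
        return hash((self.filename, self.version))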
I have homework that I am stuck on. I have gone as far as I can, but can someone point me in the right direction? I am getting stuck on making each data row a new object. Normally I would think I could just iterate over the rows, but that will only return the last row.
Question:
Modify the classFactory.py source code so that the DataRow class returned by the build_row function has another method:
retrieve(self, curs, condition=None)
self is (as usual) the instance whose method is being called, curs is a database cursor on an existing database connection, and condition (if present) is a string of condition(s) which must be true of all received rows.
The retrieve method should be a generator, yielding successive rows of the result set until it is completely exhausted. Each row should be a new object of type DataRow.
This is what I have:
the test:
import unittest
from classFactory import build_row

class DBTest(unittest.TestCase):
    def setUp(self):
        C = build_row("user", "id name email")
        self.c = C([1, "Steve Holden", "steve@holdenweb.com"])

    def test_attributes(self):
        self.assertEqual(self.c.id, 1)
        self.assertEqual(self.c.name, "Steve Holden")
        self.assertEqual(self.c.email, "steve@holdenweb.com")

    def test_repr(self):
        self.assertEqual(repr(self.c),
                         "user_record(1, 'Steve Holden', 'steve@holdenweb.com')")

if __name__ == "__main__":
    unittest.main()
the script I am testing:
def build_row(table, cols):
    """Build a class that creates instances of specific rows"""
    class DataRow:
        """Generic data row class, specialized by surrounding function"""
        def __init__(self, data):
            """Uses data and column names to inject attributes"""
            assert len(data) == len(self.cols)
            for colname, dat in zip(self.cols, data):
                setattr(self, colname, dat)

        def __repr__(self):
            return "{0}_record({1})".format(
                self.table, ", ".join("{0!r}".format(getattr(self, c)) for c in self.cols))

    DataRow.table = table
    DataRow.cols = cols.split()
    return DataRow
It should roughly be something like the following:
def retrieve(self, curs, condition=None):
    # Query the table this class was built for
    query_ = "SELECT * FROM {0}".format(self.table)
    if condition is not None:
        query_ += " WHERE %s" % condition
    curs.execute(query_)
    for row in curs.fetchall():   # iterate over the retrieved results
        yield type(self)(row)     # and yield each row as a new DataRow object
Iterate over the rows as normal, but use yield instead of return.
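A hypothetical usage sketch with an in-memory sqlite3 database (the table and seed row here are made up to match the test above):
import sqlite3

conn = sqlite3.connect(":memory:")
curs = conn.cursor()
curs.execute("CREATE TABLE user (id INTEGER, name TEXT, email TEXT)")
curs.execute("INSERT INTO user VALUES (1, 'Steve Holden', 'steve@holdenweb.com')")

User = build_row("user", "id name email")
seed = User([1, "Steve Holden", "steve@holdenweb.com"])
for record in seed.retrieve(curs, "id = 1"):
    print(repr(record))   # user_record(1, 'Steve Holden', 'steve@holdenweb.com')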