How to increase write speed on inserts, pymongo? - python

I have the following code to insert documents into MongoDB. The problem is that it's quite slow, because I'm unable to parallelize it with multiprocessing, and since I have to check whether each document already exists before inserting it, I believe bulk inserts are impossible. I'm wondering if there is a faster approach to this problem. After profiling the code below I found that check_record() and update_upstream() are the two most time-consuming functions, so optimising them would increase the overall speed. Any input on how to optimise the code below would be highly appreciated. Thank you!
import os
import multiprocessing
import pymongo
from directory import Directory
from pymongo import ASCENDING
from pymongo import DESCENDING
from pymongo import MongoClient
from storage_config import StorageConfig
from tqdm import tqdm

dir = Directory()

def DB_collections(collection_type):
    types = {'p': 'player_stats',
             't': 'team_standings',
             'f': 'fixture_stats',
             'l': 'league_standings',
             'pf': 'fixture_players_stats'}
    return types.get(collection_type)

class DB():
    def __init__(self, league, season, func=None):
        self.db_user = os.environ.get('DB_user')
        self.db_pass = os.environ.get('DB_pass')
        self.MONGODB_URL = f'mongodb+srv://{self.db_user}:{self.db_pass}@cluster0-mbqxj.mongodb.net/<dbname>?retryWrites=true&w=majority'
        self.league = league
        self.season = str(season)
        self.client = MongoClient(self.MONGODB_URL)
        self.DATABASE = self.client[self.league + self.season]
        self.pool = multiprocessing.cpu_count()
        self.playerfile = f'{self.league}_{self.season}_playerstats.json'
        self.teamfile = f'{self.league}_{self.season}_team_standings.json'
        self.fixturefile = f'{self.league}_{self.season}_fixturestats.json'
        self.leaguefile = f'{self.league}_{self.season}_league_standings.json'
        self.player_fixture = f'{self.league}_{self.season}_player_fixture.json'
        self.func = func

    def execute(self):
        if self.func is not None:
            return self.func(self)

def import_json(file):
    """Imports a json file in read mode

    Args:
        file (str): Name of file
    """
    return dir.load_json(file, StorageConfig.DB_DIR)

def load_file(file):
    try:
        loaded_file = import_json(file)
        return loaded_file
    except FileNotFoundError:
        print("Please check that", file, "exists")

def check_record(collection, index_dict):
    """Check if record exists in collection

    Args:
        index_dict (dict): key, value
    """
    return collection.find_one(index_dict)

def collection_index(collection, index, *args):
    """Checks if index exists for collection,
    and creates a new index if not

    Args:
        collection (str): Name of collection in database
        index (str): Dict key to be used as an index
        args (str): Additional dict keys to create compound indexes
    """
    compound_index = tuple((arg, ASCENDING) for arg in args)
    if index not in collection.index_information():
        return collection.create_index([(index, DESCENDING), *compound_index], unique=True)

def push_upstream(collection, record):
    """Insert record into collection

    Args:
        collection (str): Name of collection in database
        record (dict): Data to be pushed to collection
    """
    return collection.insert_one(record)

def update_upstream(collection, index_dict, record):
    """Update record in collection

    Args:
        collection (str): Name of collection in database
        index_dict (dict): key, value
        record (dict): Data to be updated in collection
    """
    return collection.update_one(index_dict, {"$set": record}, upsert=True)

def executePushPlayer(db):
    playerstats = load_file(db.playerfile)

    collection_name = DB_collections('p')
    collection = db.DATABASE[collection_name]
    collection_index(collection, 'p_id')

    for player in tqdm(playerstats):
        existingPost = check_record(collection, {'p_id': player['p_id']})
        if existingPost:
            update_upstream(collection, {'p_id': player['p_id']}, player)
        else:
            push_upstream(collection, player)

if __name__ == '__main__':
    db = DB('EN_PR', '2019')
    executePushPlayer(db)

You can combine the check/insert/update logic into a single update_one() command using upsert=True, and then use the bulk operators with something like:

from pymongo import UpdateOne

updates = []
for player in tqdm(playerstats):
    updates.append(UpdateOne({'p_id': player['p_id']}, {'$set': player}, upsert=True))

collection.bulk_write(updates)
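If playerstats is large, you may also want to flush the batch periodically instead of building one huge list; a rough sketch (the batch size is an arbitrary choice):

BATCH_SIZE = 1000  # assumed batch size, tune for your workload

updates = []
for player in tqdm(playerstats):
    updates.append(UpdateOne({'p_id': player['p_id']}, {'$set': player}, upsert=True))
    if len(updates) >= BATCH_SIZE:
        # ordered=False keeps going after individual errors and is often faster
        collection.bulk_write(updates, ordered=False)
        updates = []
if updates:
    collection.bulk_write(updates, ordered=False)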
Finally, check that your index is being used with the following command at the MongoDB shell:
db.mycollection.aggregate([{ $indexStats: {} }])
And review the accesses.ops metric.
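If you would rather check index usage from Python than from the shell, roughly the same aggregation can be run through pymongo against the collection object used above, for example:

# Sketch: inspect index usage via pymongo rather than the mongo shell
stats = list(collection.aggregate([{'$indexStats': {}}]))
for s in stats:
    print(s['name'], s['accesses']['ops'])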

Related

How to delete a document in MongoDB

I am trying to create a delete method in order to delete a document that has the key "name" and the value "Rhonda". Whenever I execute my current code, I get an AttributeError saying: "'AnimalShelter' object has no attribute 'delete'". How do I get the method to return the deleted document's JSON contents? Here is my code:
testing_script.ipynb
from animal_shelter import AnimalShelter
# now need to create the object from the class
shelter = AnimalShelter("aacuser","Superman")
data = {"age_upon_outcome":"2 years","animal_type":"Dog","breed":"Dachshund","color":"Black and tan","name":"Rhonda","outcome_subtype":"Partner","outcome_type":"Adopt","sex_upon_outcome":"Female"}
new_values = {"$set": {"age_upon_outcome":"3 years"}}
# if shelter.create(data):
#     print("Animal added")
# else:
#     print("Failed to add animal")
# Calls the read function
# shelter.read(data)
# Calls the update function
# shelter.update(data, new_values)
# Calls the delete function
shelter.delete(data)
output
AttributeError Traceback (most recent call last)
<ipython-input-5-60b1d887dfb8> in <module>
17
18 # Calls the delete function
---> 19 shelter.delete(data)
20
AttributeError: 'AnimalShelter' object has no attribute 'delete'
animal_shelter.py
from pymongo import MongoClient
from bson.objectid import ObjectId

class AnimalShelter(object):
    """ CRUD operations for Animal collection in MongoDB """

    def __init__(self,username,password):
        # Initializing the MongoClient. This helps to
        # access the MongoDB databases and collections.
        # init to connect to mongodb without authentication
        self.client = MongoClient('mongodb://localhost:55996')
        # init connect to mongodb with authentication
        # self.client = MongoClient('mongodb://%s:%s@localhost:55996/?authMechanism=DEFAULT&authSource=AAC'%(username, password))
        self.database = self.client['AAC']

# Complete this create method to implement the C in CRUD.
def create(self, data):
    if data is not None:
        self.database.animals.insert(data)  # data should be dictionary
        return True  # Tells whether the create function ran successfully
    else:
        raise Exception("Nothing to save ...")

# Create method to implement the R in CRUD.
def read(self, data):
    return self.database.animals.find_one(data)  # returns only one

# Update method to implement the U in CRUD.
def update(self, data, new_values):
    if self.database.animals.count(data):
        self.database.animals.update(data, new_values)
        return self.database.animals.find({"age_upon_outcome":"3 years"})
    else:
        raise Exception("Nothing to update ...")

# Delete method to implement the D in CRUD
def delete(self, data)
    result = self.database.animals.find_one_and_delete(data)
    # print the _id key only if the result is not None
    if("_id" in result):
        print("find_one_and_delete ID:",result["_id"])
    else:
        print("Nothing to delete")
The problem is that the functions you are defining are outside the class. You have to indent the functions so that they sit inside class AnimalShelter.
Also, as pointed out in the comments, you are missing a : in delete.
Updated animal_shelter.py
from pymongo import MongoClient
from bson.objectid import ObjectId

class AnimalShelter(object):
    """ CRUD operations for Animal collection in MongoDB """

    def __init__(self,username,password):
        # Initializing the MongoClient. This helps to
        # access the MongoDB databases and collections.
        # init to connect to mongodb without authentication
        self.client = MongoClient('mongodb://localhost:55996')
        # init connect to mongodb with authentication
        # self.client = MongoClient('mongodb://%s:%s@localhost:55996/?authMechanism=DEFAULT&authSource=AAC'%(username, password))
        self.database = self.client['AAC']

    # Complete this create method to implement the C in CRUD.
    def create(self, data):
        if data is not None:
            self.database.animals.insert(data)  # data should be dictionary
            return True  # Tells whether the create function ran successfully
        else:
            raise Exception("Nothing to save ...")

    # Create method to implement the R in CRUD.
    def read(self, data):
        return self.database.animals.find_one(data)  # returns only one

    # Update method to implement the U in CRUD.
    def update(self, data, new_values):
        if self.database.animals.count(data):
            self.database.animals.update(data, new_values)
            return self.database.animals.find({"age_upon_outcome":"3 years"})
        else:
            raise Exception("Nothing to update ...")

    # Delete method to implement the D in CRUD
    def delete(self, data):
        result = self.database.animals.find_one_and_delete(data)
        # print the _id key only if the result is not None
        if result is not None:
            print("find_one_and_delete ID:", result["_id"])
        else:
            print("Nothing to delete")

Python Class Object not callable after loop

I'm using a class object for building a database. I'm using the object in two separate loops, such that when I start the second iteration of the first (outer) loop, after having gone through both loops once, I get a TypeError saying the 'parameter' object is not callable. How do I fix this?
import xml.etree.ElementTree as ET
import datetime
import re
import sqlite3
import os
import pathlib

class parameter:
    # defines object
    def __init__(self, id, value):
        self.id = id
        self.value = value

    # defines equality for eliminating duplicates further on
    def __eq__(self, other):
        if (isinstance(other, parameter)):
            return self.id == other.id and self.value == other.value
        return False

db_path = "/Users/miguelnobremenezes/Documents/Code/xml_echo_script/echo.db"
filepath = "/Users/miguelnobremenezes/Documents/Code/xml_echo_script/xml_files"

conn = sqlite3.connect(db_path)
db = conn.cursor()

# Deletes old table
db.execute("DROP TABLE IF EXISTS echo")
# Creates new table with unique ID column
db.execute("CREATE TABLE echo (UNIQUE_ID)")

# Creates a list for adding parameters ie columns to DB
parameter_ids_added_to_db = []

#tree = ET.parse("/Users/mbp2013/Library/Mobile Documents/com~apple~CloudDocs/Documents/Code/xml_echo_script/exemplo.xml")

"""Extracts parameter_value pairs for each XML in directory i.e. patient AND adds unique parameter ids only if non-existent"""

# Iterating every file and respective directory on root folder
for root, dirs, files in os.walk(filepath):
    for filename in files:
        individual_filepath = os.path.join(root, filename)
        print("File name:", filename)
        print("File path:", individual_filepath)
        tree = ET.parse(individual_filepath)
        print("Found XML file\n")

        parameters = []
        unique_parameters = []
        parameter_unique_ids = []
        counter_elem = 0
        counter_display = 0

        for elem in tree.iter():
            print("looping")
            if elem.tag == "Birthdate":
                new_id = elem.tag
                new_id = re.sub("[^\w\s]", "_", new_id)
                new_id = new_id.replace(" ", "_")
                new_id = "pt" + "_" + str(new_id)
                print(new_id)
                if new_id not in parameter_unique_ids:
                    parameter_unique_ids.append(new_id)
                    counter_elem += 1
                new_value = elem.text
                print(new_value)
                counter_display += 1
                new_parameter = parameter(new_id, new_value)
                print("added")
                parameters.append(new_parameter)
                print("added")
            if elem.tag == "StudyInstanceUID":
                unique_ID = elem.text
                db.execute("INSERT INTO echo (UNIQUE_ID) VALUES (:value)",
                           {"value": unique_ID})

        temp_ids = []
        for parameter in parameters:
            if parameter.id not in temp_ids:
                temp_ids.append(parameter.id)
                unique_parameters.append(parameter)

        """Inserts data into database"""
        # Adds columns to DB if they don't already exist
        for parameter_unique_id in parameter_unique_ids:
            if parameter_unique_id not in parameter_ids_added_to_db:
                db.execute('''ALTER TABLE echo ADD COLUMN ''' + parameter_unique_id)
                parameter_ids_added_to_db.append(parameter_unique_id)

        # Inserts values in specific column for each case
        for unique_parameter in unique_parameters:
            #print(unique_parameter.id)
            #print(unique_parameter.value)
            db.execute("UPDATE echo SET ("+unique_parameter.id+") = :value WHERE UNIQUE_ID = :unique_ID",
                       {"value": unique_parameter.value, "unique_ID": unique_ID})

# Confirm changes to database and close it
conn.commit()
conn.close()
The error is TypeError: 'parameter' object is not callable
I know the problem comes from lines:
temp_ids = []
for parameter in parameters:
    if parameter.id not in temp_ids:
        temp_ids.append(parameter.id)
        unique_parameters.append(parameter)
You have
class parameter: ...
Then you have the loop:
for parameter in parameters:
inside an outer loop.
In the second iteration of the outer loop, the name parameter still refers to the variable from the last iteration of for parameter in parameters.
This means that any attempt to use parameter to refer to the class again, ie
new_parameter = parameter(new_id, new_value)
will fail.
The solution is to rename one of the two uses of parameter, preferably the class, since a lowercase class name violates PEP 8 anyway.
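A minimal, self-contained sketch of that rename (the sample data here is made up):

class Parameter:
    """Renamed from 'parameter' so the class is never shadowed by a loop variable."""
    def __init__(self, id, value):
        self.id = id
        self.value = value

parameters = [Parameter("pt_Birthdate", "1970-01-01"),
              Parameter("pt_Birthdate", "1970-01-01")]

# The loop variable can keep its lowercase name without breaking later instantiation
temp_ids = []
unique_parameters = []
for parameter in parameters:
    if parameter.id not in temp_ids:
        temp_ids.append(parameter.id)
        unique_parameters.append(parameter)

# Creating another instance still works, because the name Parameter was never rebound
new_parameter = Parameter("pt_Birthdate", "1980-05-05")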

Python Building Object from two sources

I need to be able to build my BuildObject using data extracted from the csv file columns:
class BuildObject(ObjectID):
    def __init__(self, ObjectID, ObjectName, ObjectPrice, ObjectLocation, ObjectColour, ObjectAge, ObjectTag):
        self.ObjectID = ObjectID
        self.ObjectName = ObjectName

def main():
    with open(filename1, "r") as csv1, open(filename2, "r") as csv2:
        csvReader1 = csv.DictReader(csv1)
        csvReader2 = csv.DictReader(csv2)
        csvList = []
        for row1, row2 in zip(csvReader1, csvReader2):
            csvList.append((row2["ObjectName"], row1["ObjectId"], row1["ObjectPrice"]))
        return csvList
Comment: My concern with this answer is that it will work fine provided the csv files have exactly the same ObjectIDs in the same order - but what will happen if an ObjectID/Object is missing from only one of the csv files?

Therefore, you can't use zip(csvReader1, csvReader2); you need random access to a Data_Record using the ObjectID as key/index.
As you mentioned large amounts of data, I would recommend going with SQL.
If you want to do it using Python objects, change the following:

def __init__(self):
    self._data_store = {}

@data_store.setter
def data_store(self, data):
    ...
    self._data_store[record['ObjectID']] = record
Question: The one topic would be to create a BuildObject for every unique itemID using the data from the csv files and sql query.

Checking your code, I got the following error:

class BuildObject(ObjectID):
NameError: name 'ObjectID' is not defined

Why do you inherit from ObjectID? Where is this class defined?
Consider the following:
class Data_Record():
    """
    This class object holds all data for ONE Record
    """
    def __init__(self, ObjectID, ObjectName):
        self.ObjectID = ObjectID
        self.ObjectName = ObjectName
        # ... (omitted for brevity)

class Data_Store():
    """
    This class object handles Data_Record, reading from csv or sql or anywhere
    """
    def __init__(self):
        # List to hold all Data_Record objects
        self._data_store = []

    # Access read only the Data_Record objects
    @property
    def data_store(self):
        return self._data_store

    # Add ONE Data_Record from either csv or sql or anywhere
    @data_store.setter
    def data_store(self, data):
        # Condition type(data)
        if isinstance(data, dict):
            record = Data_Record(**data)
        elif isinstance(data, (list, tuple)):
            record = Data_Record(*data)
        else:
            raise ValueError("Data of type({}) is not supported!".format(type(data)))
        self._data_store.append(record)

    # Method to read from csv
    def read_csv(self, fname1, fname2):
        # ... (omitted for brevity)
        csvReader1, csvReader2 = ([], [])
        for csv1, csv2 in zip(csvReader1, csvReader2):
            self.data_store = (csv1["ObjectId"], csv2["ObjectName"])

    # Method to read from sql
    def read_sql(self, sql, query):
        result = sql.query(query)
        for record in result:
            self.data_store = record
Alternative: Without @property/getter/setter.
Here the read...() functions have to know how to add a new Data_Record object to self.data_store. Note: self.data_store is now a public attribute.
If you decide, later on, not to store the records in memory, you will have to rewrite both read...() functions.
class Data_Record():
    def __init__(self, data=None):
        # Condition type(data)
        if isinstance(data, dict):
            self.ObjectID = data['ObjectID']
            self.ObjectName = data['ObjectName']
        elif isinstance(data, (list, tuple)):
            # Sequence has to be in predefined order,
            # e.g. ObjectID == index 0, ObjectName == index 1, etc.
            self.ObjectID = data[0]
            self.ObjectName = data[1]
        else:
            self.ObjectID = None
            self.ObjectName = None

class Data_Store():
    def __init__(self):
        self.data_store = []

    def read_csv(self, fname1, fname2):
        for csv1, csv2 in zip(csvReader1, csvReader2):
            self.data_store.append(Data_Record((csv1["ObjectId"], csv2["ObjectName"])))

    def read_sql(self, query):
        for record in SQL.query(query):
            self.data_store.append(Data_Record(record))
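For completeness, a hypothetical usage sketch of the classes above (the sample values are made up and bypass the csv/sql readers so it runs on its own):

# Hypothetical usage of the Data_Record/Data_Store sketches above
store = Data_Store()
store.data_store.append(Data_Record(("id-001", "Widget")))
store.data_store.append(Data_Record({"ObjectID": "id-002", "ObjectName": "Gadget"}))
for record in store.data_store:
    print(record.ObjectID, record.ObjectName)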

running transactions in db.Model subclassed put

I'm trying to create a google app engine data model with the following attributes:
store string, value pair into BigTable
if string, value pair DOES NOT exist, create the record
if string, value pair DOES exist, update the record, incrementing a counter
code:
class stringListRecord(db.Model):
    type = db.StringProperty();
    value = db.StringProperty();
    refs = db.IntegerProperty(default=1);

    def __init__(self, *args, **kw):
        key = db.GqlQuery("SELECT __key__ FROM stringListRecord WHERE type = :1 AND value = :2", kw['type'], kw['value']).get();
        if key != None:
            kw['key'] = key;
        db.Model.__init__(self, *args, **kw);

    def increment_counter(self, key):
        obj = db.get(key);
        obj.refs += 1;
        db.Model.put(obj);

    def put(self):
        if self.key() != None:
            self.increment_counter(self.key());
            #db.run_in_transaction(self.increment_counter, self.key());
        else:
            db.Model.put(self);
When I run the commented out code, i.e. db.run_in_transaction() I get:
Only ancestor queries are allowed inside transactions.
Is there a better way to get this functionality out of GAE?
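One possible approach (a sketch, not a tested answer): skip the GQL query entirely by deriving a deterministic key_name from the type/value pair, since key lookups are allowed inside transactions. The class layout, helper name and key-encoding scheme below are assumptions:

from google.appengine.ext import db

class StringListRecord(db.Model):
    type = db.StringProperty()
    value = db.StringProperty()
    refs = db.IntegerProperty(default=0)

def add_or_increment(type_, value):
    # Deterministic key_name, so no query is needed to find an existing record
    key_name = u'%s|%s' % (type_, value)

    def txn():
        # get_by_key_name is a key lookup, so it is allowed inside a transaction
        rec = StringListRecord.get_by_key_name(key_name)
        if rec is None:
            rec = StringListRecord(key_name=key_name, type=type_, value=value)
        rec.refs += 1
        rec.put()
        return rec

    return db.run_in_transaction(txn)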

Expressing multiple columns in berkeley db in python?

Say I have a simple table that contains username, firstname, lastname.
How do I express this in berkeley Db?
I'm currently using bsddb as the interface.
Cheers.
You have to pick one "column" as the key (it must be unique; I imagine that would be "username" in your case) -- the only way searches will ever possibly happen. The other columns can be packed into the single string value for that key any way you like, from pickling to simply joining them with a character that's guaranteed never to occur in any of the columns, such as '\0' for many kinds of "readable text strings".
If you need to be able to search by different keys you'll need other, supplementary and separate bsddb databases set up as "indices" into your main table -- it's lots of work, and there's lots of literature on the subject. (Alternatively, you move to a higher-abstraction technology, such as sqlite, which handles the indexing neatly on your behalf;-).
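A minimal sketch of that single-key layout with bsddb (the join character, file name and sample data are arbitrary choices; bsddb keys and values must be plain strings):

import bsddb

# Open (or create) a hash-based Berkeley DB file
db = bsddb.hashopen('users.db', 'c')

# username is the key; the remaining "columns" are joined with '\0'
db['adent'] = '\0'.join(['Arthur', 'Dent'])
db['fprefect'] = '\0'.join(['Ford', 'Prefect'])

# Lookup by key, then split the value back into its columns
firstname, lastname = db['adent'].split('\0')
print(firstname, lastname)

db.close()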
tl;dr: To express multiple columns in an ordered key/value store like Berkeley DB you need to learn about key composition. Look up my other answers about bsddb to learn more.

There are several ways to do that using an ordered key/value store.
The simplest solution is to store documents as json values with a correct key.
Now you probably want to build indices over those columns, to retrieve documents without having to iterate over the whole hashmap to find the correct object. For that you can use a secondaryDB that will automatically build the index for you (a sketch follows below), or you can build the index yourself.
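As a rough illustration of the secondaryDB approach (a sketch only; the file names, the json layout and the last-name index are my own assumptions, not part of the original answer):

from bsddb3.db import DB, DB_BTREE, DB_CREATE, DB_DUP
from json import dumps, loads

primary = DB()
primary.open('people.db', None, DB_BTREE, DB_CREATE)

by_lastname = DB()
by_lastname.set_flags(DB_DUP)    # several records can share a last name
by_lastname.open('people.lastname.db', None, DB_BTREE, DB_CREATE)

def lastname_key(key, value):
    # called by Berkeley DB on every put; returns the secondary key for this record
    return loads(value)['lastname'].encode('utf-8')

primary.associate(by_lastname, lastname_key)

primary.put('adent', dumps({'firstname': 'Arthur', 'lastname': 'Dent'}))
print(by_lastname.get('Dent'))   # fetches the primary record via the index

by_lastname.close()
primary.close()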
If you don't want to deal with key packing (and that's a good idea when starting up), you can take advantage of DB.set_bt_compare, which will allow you to use cpickle, json or msgpack for both keys and values while still having an order that makes sense for creating indices and doing queries. This is a slower method but it introduces the pattern of key composition.
To take full advantage of ordered keys, you can use Cursor.set_range(key) to set the position of the db at the beginning of a query.
Another pattern, called the EAV pattern, stores tuples that follow the scheme (entity, attribute, value) and then builds various indices by using permutations of that tuple. I learned this pattern studying Datomic.
For a less resource-hungry database, you would go the "static typed" way and store as much as possible of the common information in a "metadata" table, splitting documents (which are really RDBMS tables) into their own hashmap.
To get you started, here is an example database using bsddb (but you could build it using another ordered key/value store like wiredtiger or leveldb) that implements the EAV pattern. In this implementation I swap EAV for IKV, which translates to Unique identifier, Key, Value. The overall result is that you have a fully indexed, schemaless document database. I think it's a good compromise between efficiency and ease of use.
import struct

from json import dumps
from json import loads

from bsddb3.db import DB
from bsddb3.db import DBEnv
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_INIT_MPOOL
from bsddb3.db import DB_LOG_AUTO_REMOVE


def pack(*values):
    def __pack(value):
        if type(value) is int:
            return '1' + struct.pack('>q', value)
        elif type(value) is str:
            return '2' + struct.pack('>q', len(value)) + value
        else:
            data = dumps(value, encoding='utf-8')
            return '3' + struct.pack('>q', len(data)) + data
    return ''.join(map(__pack, values))


def unpack(packed):
    kind = packed[0]
    if kind == '1':
        value = struct.unpack('>q', packed[1:9])[0]
        packed = packed[9:]
    elif kind == '2':
        size = struct.unpack('>q', packed[1:9])[0]
        value = packed[9:9+size]
        packed = packed[size+9:]
    else:
        size = struct.unpack('>q', packed[1:9])[0]
        value = loads(packed[9:9+size])
        packed = packed[size+9:]
    if packed:
        values = unpack(packed)
        values.insert(0, value)
    else:
        values = [value]
    return values


class TupleSpace(object):
    """Generic database"""

    def __init__(self, path):
        self.env = DBEnv()
        self.env.set_cache_max(10, 0)
        self.env.set_cachesize(5, 0)
        flags = (
            DB_CREATE |
            DB_INIT_MPOOL
        )
        self.env.log_set_config(DB_LOG_AUTO_REMOVE, True)
        self.env.set_lg_max(1024 ** 3)
        self.env.open(
            path,
            flags,
            0
        )

        # create vertices and edges k/v stores
        def new_store(name):
            flags = DB_CREATE
            elements = DB(self.env)
            elements.open(
                name,
                None,
                DB_BTREE,
                flags,
                0,
            )
            return elements
        self.tuples = new_store('tuples')
        self.index = new_store('index')
        self.txn = None

    def get(self, uid):
        cursor = self.tuples.cursor()

        def __get():
            record = cursor.set_range(pack(uid, ''))
            if not record:
                return

            key, value = record
            while True:
                other, key = unpack(key)
                if other == uid:
                    value = unpack(value)[0]
                    yield key, value
                    record = cursor.next()
                    if record:
                        key, value = record
                        continue
                    else:
                        break
                else:
                    break

        tuples = dict(__get())
        cursor.close()
        return tuples

    def add(self, uid, **properties):
        for key, value in properties.items():
            self.tuples.put(pack(uid, key), pack(value))
            self.index.put(pack(key, value, uid), '')

    def delete(self, uid):
        # delete item from main table and index
        cursor = self.tuples.cursor()
        index = self.index.cursor()
        record = cursor.set_range(pack(uid, ''))
        if record:
            key, value = record
        else:
            cursor.close()
            raise Exception('not found')
        while True:
            other, key = unpack(key)
            if other == uid:
                # remove tuple from main index
                cursor.delete()

                # remove it from index
                value = unpack(value)[0]
                index.set(pack(key, value, uid))
                index.delete()

                # continue
                record = cursor.next()
                if record:
                    key, value = record
                    continue
                else:
                    break
            else:
                break
        cursor.close()

    def update(self, uid, **properties):
        self.delete(uid)
        self.add(uid, **properties)

    def close(self):
        self.index.close()
        self.tuples.close()
        self.env.close()

    def debug(self):
        for key, value in self.tuples.items():
            uid, key = unpack(key)
            value = unpack(value)[0]
            print(uid, key, value)

    def query(self, key, value=''):
        """return `(key, value, uid)` tuples that where
        `key` and `value` are expressed in the arguments"""
        cursor = self.index.cursor()
        match = (key, value) if value else (key,)

        record = cursor.set_range(pack(key, value))
        if not record:
            cursor.close()
            return

        while True:
            key, _ = record
            other = unpack(key)
            ok = reduce(
                lambda previous, x: (cmp(*x) == 0) and previous,
                zip(match, other),
                True
            )
            if ok:
                yield other
                record = cursor.next()
                if not record:
                    break
            else:
                break
        cursor.close()
db = TupleSpace('tmp')

# you can use a tuple to store a counter
db.add(0, counter=0)

# And then have a procedure doing the required work
# to always have a fresh uid
def make_uid():
    counter = db.get(0)
    counter['counter'] += 1
    db.update(0, **counter)  # persist the incremented counter
    return counter['counter']

amirouche = make_uid()
db.add(amirouche, username="amirouche", age=30)
print(db.get(amirouche))
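A quick sketch of querying through the index that add() maintains (each match unpacks as [key, value, uid]; the exact values depend on what has been added):

# exact key/value lookup via the secondary index
for key, value, uid in db.query('username', 'amirouche'):
    print(key, value, uid)

# scan every record that has a given string-valued key
for key, value, uid in db.query('username'):
    print(key, value, uid)

db.close()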
