Updating a MongoDB document if a field doesn't exist - Python

Whenever I add a new field to the document I pass to insert_one, I have had to delete the old posts in the collection. I know there are manual ways of patching such fields with update_many, but that is inefficient.
For example:
posts.insert_one({
    "id": random.randint(1, 10000),
    "value1": "value1",
    "value2": "value2"
})
I use the following code to check if the document exists or not. How would this work for a field?
if posts.find({'id': 12312}).count() > 0:
I know I could simply overwrite the previous data, but people won't enjoy having their data wiped every other month.
Is there a way to add the field to a document in Python?

How would this work for a field?
You can use $exists to check whether a field exists in a doc.
In your case, you can combine this with find:
find({"id": 1, "fieldToCheck": {"$exists": True}})
It will return the doc with id = 1 only if fieldToCheck is present in that doc.
You can skip id = 1; in that case it will return all docs where fieldToCheck exists.
Is there a way to add the field to a document in Python?
You could use update with a $set on the new field; it will update the field if it is already present, else it will add it.
update({"_id": 1}, {"$set": {"field": "x"}})
If field is present it will be set to x, else it will be added with the value x.
Beware of update options like multi and upsert.
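Putting both pieces together in PyMongo, a minimal sketch might look like this (the database and collection names are placeholders, and fieldToCheck is just the example field from above):
import pymongo

posts = pymongo.MongoClient("mongodb://localhost:27017").maindb.items

# Return the doc only if it has id 1 AND fieldToCheck is present
doc = posts.find_one({"id": 1, "fieldToCheck": {"$exists": True}})

# Set the field on the doc with _id 1; upsert=True inserts a new doc if none matches
posts.update_one({"_id": 1}, {"$set": {"fieldToCheck": "x"}}, upsert=True)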

Yes, you can use the update command in the MongoDB shell to do that. Check here
This is the command to use...
db.collection.update({},{$set : {"newfield":1}},false,true)
The above will work in the MongoDB shell. It will set newfield to 1 in every document, adding it where it is not already present.
If you want to use Python, use PyMongo.
In PyMongo, the equivalent is:
db.collection.update_many({}, {"$set": {"newfield": 1}})
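If you only want to write to documents that are actually missing the field, you can add an $exists filter to the query; a small sketch under the same assumptions (database and collection names are placeholders):
import pymongo

db = pymongo.MongoClient("mongodb://localhost:27017").mydatabase

# Only touch documents that don't have the field yet
db.collection.update_many(
    {"newfield": {"$exists": False}},
    {"$set": {"newfield": 1}}
)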

Thanks to john's answer, I have made a complete solution that updates documents automatically, without the need to run a separate task, which means you don't update inactive documents.
import datetime
import pymongo

database = pymongo.MongoClient("mongodb://localhost:27017")  # MongoDB connection
db = database.maindb  # Database
posts = db.items  # Collection within a database


# A schema-equivalent function that returns the document
def user_details(name, dob):
    return {
        "username": name,  # a username/id
        "dob": dob,  # some data
        "level": 0,  # some other data
        "latest_update": datetime.datetime.fromtimestamp(1615640176)
        # Must be kept to ensure you aren't doing it that often
    }


# The first schema changed, for example after adding a new feature
def user_details2(name, dob, cake):
    return {
        "username": name,  # a username/id
        "dob": dob,  # Some data
        "level": 0,  # Some other data
        "cake": cake,  # Some new data that isn't in the document
        "latest_update": datetime.datetime.utcnow()  # Must be kept to ensure you aren't doing it that often
    }


def check_if_update(find, main_document, collection):
    # parameters: what you find a document with, the schema dictionary, then the MongoDB collection
    if collection.count_documents(find) > 0:  # How many documents match; only proceed if it exists
        fields = {}  # Init a dictionary
        for x in collection.find(find):  # You only want one for this to work
            fields = x
        if "latest_update" in fields:  # Just in case it doesn't exist yet
            last_time = fields["latest_update"]  # Get the time that it was last updated
            time_diff = datetime.datetime.utcnow() - last_time  # Time difference between utc now and the last update
            if time_diff.total_seconds() < 3600:  # If the total seconds of the difference is smaller than an hour
                print("return")
                return
        db_schema = main_document  # Better naming
        db_schema["_id"] = 0  # Adds the _id schema_key into the dictionary
        if db_schema.keys() != fields.keys():
            print("in")
            for schema_key, schema_value in db_schema.items():
                if schema_key not in fields.keys():  # Main key, e.g. cake was added and doesn't exist in the fetched fields
                    collection.update_one(find, {"$set": {schema_key: schema_value}})
                else:  # Everything exists and you want to check whether a dictionary within that dictionary changed
                    try:
                        sub_dict = dict(schema_value)  # Make the value of it a dictionary
                        # It exists in the schema dictionary but not in the db-fetched document
                        for key2, value2 in sub_dict.items():
                            if key2 not in fields[schema_key].keys():
                                new_value = schema_value
                                new_value[key2] = value2  # Adding the key and value from the schema dictionary
                                collection.update_one(find, {"$set": {schema_key: new_value}})
                        # It exists in the db-fetched document but not in the schema dictionary
                        for key2, value2 in fields[schema_key].items():
                            if key2 not in sub_dict.keys():
                                new_dict = {}  # Get all values, filtered so only the schema-existent ones are passed back
                                for item in sub_dict:
                                    if item != key2:
                                        new_dict[item] = sub_dict.get(item)
                                collection.update_one(find, {"$set": {schema_key: new_dict}})
                    except (TypeError, ValueError):  # Wasn't a dict
                        pass
            # You removed a value from the schema dictionary and want to update it in the db
            for key2, value2 in fields.items():
                if key2 not in db_schema:
                    collection.update_one(find, {"$unset": {key2: 1}})
    else:
        collection.insert_one(main_document)  # Insert it because it doesn't exist yet


print("start")
print(posts.find_one({"username": "john"}))
check_if_update({"username": "john"}, user_details("john", "13/03/2021"), posts)
print("inserted")
print(posts.find_one({"username": "john"}))
check_if_update({"username": "john"}, user_details2("john", "13/03/2021", "Lemon drizzle"), posts)
print("Results:")
print(posts.find_one({"username": "john"}))
It is available as a gist

Related

Accessing specific UID in array

I have the following problem: I'm trying to pull the specific entry in the "warnings" array that has the given UID, but I can't figure out why it's not working.
The output (everything prints out successfully): https://i.imgur.com/ZslJ0rV.png
My MongoDB structure: https://i.imgur.com/3bRegAD.png
import pymongo

client = pymongo.MongoClient("")
database = client["LateNight"]
ModlogsCollection = database["modlogs"]

theUID = "63TF-lYv0-72m7-9f4I"
theGuild = 1063516188988153896

all_mod_docs = ModlogsCollection.find({"_id": str(theGuild)})
all_uids = []

for doc in all_mod_docs:
    doc_keys = [key for key in doc.keys() if key != "_id"]
    for key in doc_keys:
        sub_doc = doc[key]
        if warnings := sub_doc.get("warnings"):
            for warning in warnings:
                if warning["UID"] == theUID:
                    print(warning)
                    print("Warning")

result = ModlogsCollection.update_one(
    {"_id": str(theGuild)},
    {"$pull": {
        "warnings": {"UID": theUID}
    }}
)
print(result)
print(result.modified_count)
As you yourself said, you are trying to "pull the specific field in the warnings array that has the given UID". Before reading the UID value you must index into the array (index 0). After that you get a dictionary with the keys moderator, reason, time and UID.
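A small sketch of what that indexing looks like in the question's code (it reuses ModlogsCollection, theGuild and theUID from above; the per-guild document shape and the field names are assumptions taken from the screenshots and this answer):
doc = ModlogsCollection.find_one({"_id": str(theGuild)})
for key, sub_doc in doc.items():
    if key == "_id" or not isinstance(sub_doc, dict):
        continue
    warnings = sub_doc.get("warnings", [])
    if warnings:
        first = warnings[0]  # index into the array before reading its fields
        if first["UID"] == theUID:
            print(first["moderator"], first["reason"], first["time"])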

Dictionary iteration with conditions

I have a dictionary whose keys are tuples of (child table, parent table) and whose values are tuples of a list of child fields and a list of parent fields.
RELS = {( "ACCESS_PROFILE_RELATIONSHIP","INVOLVED_PARTY_RELATIONSHIP"):(["INVOLVED_PARTY_RELATIONSHIP_ID","INVOLVED_PARTY_RELATIONSHIP_UUID"],["ID","INVOLVED_PARTY_RELATIONSHIP_UUID"]),
("AGREEMENT_IDENTFIER","AGREEMENT"):(["AGREEMENT_ID"],["ID"]),
("AGREEMENT_PARTY_ROLE_RELATIONSHIP","INVOLVED_PARTY"):(["INVOLVEP_PARTY_ID","PARTY_UUID"],["ID","PARTY_UUID"]),
("AGREEMENT_PARTY_ROLE_RELATIONSHIP","AGREEMENT"):(["AGREEMENT_ID","AGREEMENT_UUID"],["ID","AGREEMENT_UUID"])}
I need to transform it into a dictionary of dictionaries where the keys are child tables and each value is a dictionary mapping a child field to a tuple of (parent table, parent field):
{
    'ACCESS_PROFILE_RELATIONSHIP': {
        'INVOLVED_PARTY_RELATIONSHIP_ID': ('INVOLVED_PARTY_RELATIONSHIP', 'ID'),
        'INVOLVED_PARTY_RELATIONSHIP_UUID': ('INVOLVED_PARTY_RELATIONSHIP', 'INVOLVED_PARTY_RELATIONSHIP_UUID')
    },
    'AGREEMENT_IDENTFIER': {
        'AGREEMENT_ID': ('AGREEMENT', 'ID')
    },
    'AGREEMENT_PARTY_ROLE_RELATIONSHIP': {
        'INVOLVED_PARTY_ID': ('INVOLVED_PARTY', 'ID'),
        'PARTY_UUID': ('INVOLVED_PARTY', 'PARTY_UUID'),
        'AGREEMENT_ID': ('AGREEMENT', 'ID'),
        'AGREEMENT_UUID': ('AGREEMENT', 'AGREEMENT_UUID')
    }
}
I have performed a loop like this:
refs = {}
for tables, fields in RELS.items():
    refs[tables[0]] = {}
    for i, _ in enumerate(fields[0]):
        fk = {fields[0][i]: (tables[1], fields[1][i])}
        if tables[0] in refs.keys():
            refs[tables[0]].update(fk)
Yet since a dictionary cannot have two identical keys, the relationship of AGREEMENT_PARTY_ROLE_RELATIONSHIP with INVOLVED_PARTY is overwritten by the relationship of AGREEMENT_PARTY_ROLE_RELATIONSHIP with AGREEMENT.
How can I add a condition to my loop so that it adds the {child_field: (parent_table, parent_field)} dictionary when the child_table key already exists in my end result?
Thank you in advance!
You are not checking whether the key already exists. If a table is referenced multiple times, the line
refs[tables[0]] = {}
inside your loop will clear the already existing value stored in a previous loop run.
Change your loop to the following:
RELS = {("ACCESS_PROFILE_RELATIONSHIP","INVOLVED_PARTY_RELATIONSHIP"):(["INVOLVED_PARTY_RELATIONSHIP_ID","INVOLVED_PARTY_RELATIONSHIP_UUID"], ["ID","INVOLVED_PARTY_RELATIONSHIP_UUID"]),
("AGREEMENT_IDENTFIER","AGREEMENT"):(["AGREEMENT_ID"],["ID"]),
("AGREEMENT_PARTY_ROLE_RELATIONSHIP","INVOLVED_PARTY"):(["INVOLVEP_PARTY_ID","PARTY_UUID"],["ID","PARTY_UUID"]),
("AGREEMENT_PARTY_ROLE_RELATIONSHIP","AGREEMENT"):(["AGREEMENT_ID","AGREEMENT_UUID"],["ID","AGREEMENT_UUID"])}
refs = {}
for tables, fields in RELS.items():
inner = refs.setdefault(tables[0],{}) # create if not exist, else get its reference
for i,_ in enumerate(fields[0]):
fk = {fields[0][i]:(tables[1],fields[1][i])}
inner.update(fk)
print(refs)
Output:
{'ACCESS_PROFILE_RELATIONSHIP': {'INVOLVED_PARTY_RELATIONSHIP_ID': ('INVOLVED_PARTY_RELATIONSHIP', 'ID'),
'INVOLVED_PARTY_RELATIONSHIP_UUID': ('INVOLVED_PARTY_RELATIONSHIP', 'INVOLVED_PARTY_RELATIONSHIP_UUID')},
'AGREEMENT_IDENTFIER': {'AGREEMENT_ID': ('AGREEMENT', 'ID')},
'AGREEMENT_PARTY_ROLE_RELATIONSHIP': {'INVOLVEP_PARTY_ID': ('INVOLVED_PARTY', 'ID'),
'PARTY_UUID': ('INVOLVED_PARTY', 'PARTY_UUID'),
'AGREEMENT_ID': ('AGREEMENT', 'ID'),
'AGREEMENT_UUID': ('AGREEMENT', 'AGREEMENT_UUID')}}
This is essentially what @Anentropic meant with his concise comment.
See the setdefault documentation.
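For reference, an equivalent sketch using collections.defaultdict and tuple unpacking produces the same structure (it uses the RELS dictionary from above):
from collections import defaultdict

refs = defaultdict(dict)
for (child, parent), (child_fields, parent_fields) in RELS.items():
    for child_field, parent_field in zip(child_fields, parent_fields):
        refs[child][child_field] = (parent, parent_field)

print(dict(refs))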

Incrementing a counter in DynamoDB when value to be updated is in a map field

I have a lambda function that needs to retrieve an item from DynamoDB and update the counter of that item. But..
The DynamoDB table is structured as:
id: int
options: map
    some_option: 0
    some_other_option: 0
I need to first retrieve the item of the table that has a certain id and a certain option listed as a key in the options.
Then I want to increment that counter by some value.
Here is what I have so far:
import boto3
from botocore.exceptions import ClientError

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('options')

response = None
try:
    response = table.get_item(Key={'id': id})
except ClientError as e:
    print(e.response['Error']['Message'])

option = response.get('Item', None)
if option:
    option['options'][some_option] = int(option['options'][some_option]) + some_value
    # how to update item in DynamoDB now?
My issue is how to update the record now, and more importantly: will such a solution cause data races? Could two simultaneous lambda calls that try to update the same item at the same option cause data races? If so, what's the way to solve this?
Any pointers/help is appreciated.
Ok, I found the answer:
All I need is:
from decimal import Decimal

response = table.update_item(
    Key={
        'id': my_id,
    },
    UpdateExpression='SET options.#s = options.#s + :val',
    ExpressionAttributeNames={
        "#s": my_option
    },
    ExpressionAttributeValues={
        ':val': Decimal(some_value)
    },
    ReturnValues="UPDATED_NEW"
)
This is inspired by Step 3.4: Increment an Atomic Counter, which provides an atomic approach to incrementing values. According to the documentation:
DynamoDB supports atomic counters, which use the update_item method to
increment or decrement the value of an existing attribute without
interfering with other write requests. (All write requests are applied
in the order in which they are received.)
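If the option key might not be present in the map yet, the same atomic update can seed it with if_not_exists; a small sketch under that assumption (my_id, my_option and some_value are the same placeholders as above):
response = table.update_item(
    Key={'id': my_id},
    # Seed the counter with 0 if the key is missing, then apply the increment atomically
    UpdateExpression='SET options.#s = if_not_exists(options.#s, :zero) + :val',
    ExpressionAttributeNames={'#s': my_option},
    ExpressionAttributeValues={':zero': Decimal(0), ':val': Decimal(some_value)},
    ReturnValues='UPDATED_NEW'
)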

How does allocateIds() work in Cloud Datastore Mode?

In the new Datastore Mode documentation, there is a mention of the allocateIds() method. However, beyond a single paragraph, there is no example code that illustrates how this method is used.
I am trying to allocate an ID each time I create a new entity so that I can save the ID as a property of the entity itself.
I assume that in pseudocode, it works like this:
user_id = allocateIds(number_of_ids=1)
user_key = datastore_client.key('User', user_id)
user = datastore.Entity(key=user_key)
user.update({'user_id': user_id})  # Allows a get_user_by_id() query
datastore_client.put(user)
How exactly does allocateIds() work in practice?
When you call the allocateIds() function, it creates a new instance of the Key class. When the constructor of Key is called, it takes all of the arguments you provided to allocateIds and recombines them through a _combine_args method. That is what produces your key.
(And if you want to see the code yourself:)
source: https://googleapis.dev/python/datastore/latest/_modules/google/cloud/datastore/key.html#Key
Yes, allocateIds() should work for the case where you want to get an ID from Datastore mode and use it as both an ID and property value:
from google.cloud import datastore

client = datastore.Client()

# Allocate a single ID in kind User
# Returns list of keys
keys = client.allocate_ids(client.key('User'), 1)

# Get key from list
key = keys[0]
print(key.id)

# Create a User entity using our key
user = datastore.Entity(key)

# Add ID as a field
user.update({
    'user_id': key.id
})

# Commit to database
client.put(user)

# Query based on full key
query = client.query(kind='User')
query.key_filter(user.key, '=')
results = list(query.fetch())
print(results)
For most other cases where you just want a single auto-ID, you can skip allocate_ids:
# Create a User entity
# Use an incomplete key so Datastore assigns an ID
user = datastore.Entity(client.key('User'))

# Add some data
user.update({
    'foo': 'bar'
})

# Datastore allocates an ID when you call client.put
client.put(user)

# user.key now contains an ID
user_id = user.key.id
print(user_id)

# Query with the ID and key
query = client.query(kind='User')
query.key_filter(user.key, '=')
results = list(query.fetch())
print(results)

Empty a DynamoDB table with boto

How can I optimally (in terms of financial cost) empty a DynamoDB table with boto? (As we can do in SQL with a truncate statement.)
boto.dynamodb2.table.delete() or boto.dynamodb2.layer1.DynamoDBConnection.delete_table() deletes the entire table, while boto.dynamodb2.table.delete_item() or boto.dynamodb2.table.BatchTable.delete_item() only deletes the specified items.
While I agree with Johnny Wu that dropping the table and recreating it is much more efficient, there may be cases, such as when many GSIs or trigger events are associated with a table, where you don't want to have to re-associate those. The script below should work: it recursively scans the table and uses the batch function to delete all items in the table. For massively large tables, though, this may not work, as it requires all items in the table to be loaded into memory.
import boto3

dynamo = boto3.resource('dynamodb')


def truncateTable(tableName):
    table = dynamo.Table(tableName)

    # get the table keys
    tableKeyNames = [key.get("AttributeName") for key in table.key_schema]

    """
    NOTE: there are reserved attributes for key names, please see https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ReservedWords.html
    if a hash or range key is in the reserved word list, you will need to use the ExpressionAttributeNames parameter
    described at https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Table.scan
    """

    # Only retrieve the keys for each item in the table (minimize data transfer)
    ProjectionExpression = ", ".join(tableKeyNames)

    response = table.scan(ProjectionExpression=ProjectionExpression)
    data = response.get('Items')

    while 'LastEvaluatedKey' in response:
        response = table.scan(
            ProjectionExpression=ProjectionExpression,
            ExclusiveStartKey=response['LastEvaluatedKey'])
        data.extend(response['Items'])

    with table.batch_writer() as batch:
        for each in data:
            batch.delete_item(
                Key={key: each[key] for key in tableKeyNames}
            )


truncateTable("YOUR_TABLE_NAME")
As Johnny Wu mentioned, deleting a table and re-creating it is more efficient than deleting individual items. You should make sure your code doesn't try to create a new table before it is completely deleted.
import boto3

# The snippet assumes a low-level client and a resource are available
client = boto3.client('dynamodb')
dynamodb = boto3.resource('dynamodb')


def deleteTable(table_name):
    print('deleting table')
    return client.delete_table(TableName=table_name)


def createTable(table_name):
    waiter = client.get_waiter('table_not_exists')
    waiter.wait(TableName=table_name)
    print('creating table')
    table = dynamodb.create_table(
        TableName=table_name,
        KeySchema=[
            {
                'AttributeName': 'YOURATTRIBUTENAME',
                'KeyType': 'HASH'
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'YOURATTRIBUTENAME',
                'AttributeType': 'S'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 1,
            'WriteCapacityUnits': 1
        },
        StreamSpecification={
            'StreamEnabled': False
        }
    )


def emptyTable(table_name):
    deleteTable(table_name)
    createTable(table_name)
Deleting a table is much more efficient than deleting items one-by-one. If you are able to control your truncation points, then you can do something similar to rotating tables as suggested in the docs for time series data.
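A minimal sketch of what such table rotation can look like (the per-month naming scheme and the table names here are purely illustrative assumptions; the real layout would follow the time-series guidance in the docs):
import datetime

import boto3

dynamodb = boto3.resource('dynamodb')


def current_table_name(base='events'):
    # Writes always go to the table for the current month, e.g. "events_2023_09"
    return f"{base}_{datetime.datetime.utcnow():%Y_%m}"


def drop_previous_table(base='events'):
    # "Truncating" old data is just deleting last month's table (assumes it exists)
    first_of_month = datetime.datetime.utcnow().replace(day=1)
    prev = first_of_month - datetime.timedelta(days=1)
    dynamodb.Table(f"{base}_{prev:%Y_%m}").delete()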
This builds on the answer given by Persistent Plants. If the table already exists, you can extract the table definitions and use that to recreate the table.
import boto3

dynamodb = boto3.resource('dynamodb', region_name='us-east-2')


def delete_table_ddb(table_name):
    table = dynamodb.Table(table_name)
    return table.delete()


def create_table_ddb(table_name, key_schema, attribute_definitions,
                     provisioned_throughput, stream_enabled, billing_mode):
    settings = dict(
        TableName=table_name,
        KeySchema=key_schema,
        AttributeDefinitions=attribute_definitions,
        StreamSpecification={'StreamEnabled': stream_enabled},
        BillingMode=billing_mode
    )
    if billing_mode == 'PROVISIONED':
        settings['ProvisionedThroughput'] = provisioned_throughput
    return dynamodb.create_table(**settings)


def truncate_table_ddb(table_name):
    table = dynamodb.Table(table_name)
    key_schema = table.key_schema
    attribute_definitions = table.attribute_definitions
    if table.billing_mode_summary:
        billing_mode = 'PAY_PER_REQUEST'
    else:
        billing_mode = 'PROVISIONED'
    if table.stream_specification:
        stream_enabled = True
    else:
        stream_enabled = False
    capacity = ['ReadCapacityUnits', 'WriteCapacityUnits']
    provisioned_throughput = {k: v for k, v in table.provisioned_throughput.items() if k in capacity}
    delete_table_ddb(table_name)
    table.wait_until_not_exists()
    return create_table_ddb(
        table_name,
        key_schema=key_schema,
        attribute_definitions=attribute_definitions,
        provisioned_throughput=provisioned_throughput,
        stream_enabled=stream_enabled,
        billing_mode=billing_mode
    )
Now call the function:
table_name = 'test_ddb'
truncate_table_ddb(table_name)
