How to create MongoDB Time Series Collection using pymongo - python

The documentation shows how to do it with mongosh, but how do you create Time Series Collection using pymongo from within a python script?
import pymongo
import time
from datetime import datetime
client = pymongo.MongoClient()
db = client['time-series-db']
col = db['time-series-col']
# ... do something here to make it 'time-series collection' ...
js = {
    "1": "A",
    "2": "B",
    "3": "C",
    "4": "D",
    "5": "E",
}
# create BSON type datetime object needed for 'time-series collection'
ts = time.time()
js['timestamp'] = datetime.utcfromtimestamp(ts)
col.insert_one(js)

You can try this:
conn = pymongo.MongoClient('mongodb://localhost')
db = conn.testDB
db.create_collection('testColl', timeseries={ 'timeField': 'timestamp' })
# - OR -
db.command('create', 'testColl', timeseries={ 'timeField': 'timestamp', 'metaField': 'data', 'granularity': 'hours' })
General Reference: Time Series Collections
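Putting the answer together with the question's snippet, here is a minimal end-to-end sketch (assuming MongoDB 5.0+ and the database/collection names from the question):
import pymongo
from datetime import datetime, timezone

client = pymongo.MongoClient()
db = client['time-series-db']

# Create the collection as a time series collection once, up front;
# every inserted document must then carry the configured timeField.
if 'time-series-col' not in db.list_collection_names():
    db.create_collection('time-series-col', timeseries={'timeField': 'timestamp'})

col = db['time-series-col']
col.insert_one({
    "1": "A",
    "timestamp": datetime.now(timezone.utc),  # BSON datetime for the timeField
})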

Related

Inserting data using PyMongo based on a defined data model

I have a dataset consisting of 250 rows that looks like the following:
In MongoDB Compass, I inserted the first row as follows:
db.employees.insertOne([{"employee_id": 412153,
"first_name": "Carrol",
"last_name": "Dhin",
"email": "carrol.dhin#company.com",
"managing": [{"manager_id": 412153, "employee_id": 174543}],
"department": [{"department_name": "Accounting", "department_budget": 500000}],
"laptop": [{"serial_number": "CSS49745",
"manufacturer": "Lenovo",
"model": "X1 Gen 10",
"date_assigned": {$date: 01-15-2022},
"installed_software": ["MS Office", "Adobe Acrobat", "Slack"]}]})
If I wanted to insert all 250 rows into the database using PyMongo in Python, how would I ensure that every row is entered following the format that I used when I inserted it manually in the Mongo shell?
from pymongo import MongoClient
import pandas as pd

client = MongoClient('localhost', 27017)
db = client.MD
collection = db.gammaCorp

df = pd.read_csv(' ')  # insert CSV name here

for i in df.index:
    # build a fresh document for every row; reusing one dict across iterations
    # would carry over the _id that insert_one adds after the first insert
    data = {}
    data['employee_id'] = df['employee_id'][i]
    data['first_name'] = df['first_name'][i]
    data['last_name'] = df['last_name'][i]
    data['email'] = df['email'][i]
    data['managing'] = [{'manager_id': df['employee_id'][i], 'employee_id': df['managing'][i]}]
    data['department'] = [{'department_name': df['department'][i], 'department_budget': df['department_budget'][i]}]
    data['laptop'] = [{'serial_number': df['serial_number'][i],
                       'manufacturer': df['manufacturer'][i],
                       'model': df['model'][i],
                       'date_assigned': df['date_assigned'][i],
                       'installed_software': df['installed_software'][i]}]
    collection.insert_one(data)
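As a follow-up, the 250 per-row insert_one calls can also be collapsed into a single insert_many. A sketch under the same assumed column names (NumPy integers are cast to plain int so pymongo can encode them as BSON):
docs = []
for i in df.index:
    docs.append({
        'employee_id': int(df['employee_id'][i]),
        'first_name': df['first_name'][i],
        'last_name': df['last_name'][i],
        'email': df['email'][i],
        'managing': [{'manager_id': int(df['employee_id'][i]),
                      'employee_id': int(df['managing'][i])}],
        'department': [{'department_name': df['department'][i],
                        'department_budget': int(df['department_budget'][i])}],
        'laptop': [{'serial_number': df['serial_number'][i],
                    'manufacturer': df['manufacturer'][i],
                    'model': df['model'][i],
                    'date_assigned': df['date_assigned'][i],
                    'installed_software': df['installed_software'][i]}],
    })

collection.insert_many(docs)  # one round trip for all 250 rows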

Python Object of type Binary is not JSON serializable

I'm trying to pull data over the MongoDB port from a list of hosts with MongoClient. However, I am getting an error when writing the results to a JSON file. I think there is a Binary data type in the incoming data. How do I fix this error?
My code:
import json
import pymongo

def get_ip_data():
    db_datas = []
    for ip_line in json_data:
        try:
            replace_ip = ip_line.replace("\n", "")
            client = pymongo.MongoClient(replace_ip, 27017)
            client.server_info()
            db_names = client.list_database_names()
            list_collections = client.admin.command({'listCollections': 1.0}, {'authorizedCollections': True})
            host_info = client.admin.command({'hostInfo': 1})
            server_status = client.admin.command({'serverStatus': 1})
            lock_info = client.admin.command({'lockInfo': 1})
            build_info = client.admin.command({'buildInfo': 1})
            db_data = {f"{ip_line}, db_names": db_names, "collections_list": list_collections,
                       "server_status": server_status, "host_info": host_info, "lock_info": lock_info,
                       "build_info": build_info}
            db_datas.append(db_data)
        except Exception as e:
            print(e)
        finally:
            with open("./data.json", "wt") as write_data:
                write_data.write(json.dumps(db_datas, indent=4))

if __name__ == "__main__":
    with open("./ip_list.txt", "r") as json_data:
        json_data = json_data.readlines()
    get_ip_data()
The solution depends on what you want to get in the result.
The recommended way is to use pymongo's bson.json_util to correctly dump a dictionary containing BSON-typed objects to JSON. In that case, they will be represented as something like this:
"your_binary_field": {
"$binary": {
"base64": "Xi6aRRv9SWAJtTK4ScAoKDrw4cU5bek=",
"subType": "00"
}
},
which can then be read back correctly by the bson package.
Alternatively, you can recursively walk through your db_datas items and cast all bson.Binary objects to strings or other JSON-serializable values, for example using bson.Binary.decode.
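For the json_util route, the only change to the original code is the dump call; a minimal sketch (json_util.dumps passes indent and other options straight through to json.dumps):
from bson import json_util

# json_util.dumps serializes BSON types (Binary, ObjectId, datetime, ...)
# as MongoDB Extended JSON instead of raising a TypeError.
with open("./data.json", "wt") as write_data:
    write_data.write(json_util.dumps(db_datas, indent=4))

# json_util.loads restores the BSON types when reading the file back.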

what is the correct syntax here? getting TypeError: unhashable type: 'dict'

query={"colourCode" : "orange" },{"createdOn":{ "$gt" : my_datetime}},{"assignmentRef":{'$ne':None}}
cursor = collection.find({query},{'createdOn':1,'assignmentRef.name':1,'_id':0,'colourCode':1})
list_cur = list(cursor)
df = DataFrame(list_cur)
print(df)
Result
TypeError: unhashable type: 'dict'
What is the problem here? Please rewrite the code with the correct syntax so that I can clearly understand it.
You have two issues: the query needs to be constructed as a single dictionary (yours creates a tuple of three dictionaries), and the first parameter of find needs to be just query, not {query}; wrapping the dict in braces tries to build a set containing it, which is what raises the unhashable type: 'dict' error.
This should be closer to what you need:
import datetime
from pandas import DataFrame
from pymongo import MongoClient
db = MongoClient()['mydatabase']
collection = db.mycollection
my_datetime = datetime.datetime.now()
query = {"colourCode": "orange", "createdOn": {"$gt": my_datetime}, "assignmentRef": {'$ne': None}}
cursor = collection.find(query, {'createdOn': 1, 'assignmentRef.name': 1, '_id': 0, 'colourCode': 1})
list_cur = list(cursor)
df = DataFrame(list_cur)
print(df)

batch_write_item using dynamodb.client() to write large csv to dynamodb table in python

I am trying to insert a large CSV file (5M records) into DynamoDB using dynamodb_client.batch_write_item().
When I insert using dynamodb_client.put_item(), it works fine, but I need to be able to use batch_write_item() too.
Here is my code snippet for a few records (more than one):
import json
import boto3
import csv
import pandas as pd
from datetime import datetime

roleARN = 'arn:aws:iam::123:role/xyz_role'
boto3.setup_default_session(profile_name='test_profile')
client = boto3.client('sts')
response = client.assume_role(RoleArn=roleARN,
                              RoleSessionName='RoleSessionName',
                              DurationSeconds=1800)

dynamodb_client = boto3.client('dynamodb', region_name='ap-south-1',
                               aws_access_key_id=response['Credentials']['AccessKeyId'],
                               aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                               aws_session_token=response['Credentials']['SessionToken'])

# Fetching time for population
current_time = datetime.utcnow().isoformat()[:-3] + 'Z'

def convert_csv_to_json_list(file):
    items = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data = {}
            data['col1'] = row['col1']
            data['col2'] = int(row['col2'])
            data['col3'] = int(row['col3'])
            data['Row_Created'] = current_time
            data['col4'] = row['col4']
            data['col5'] = int(row['col5'])
            data['Row_Updated'] = current_time
            items.append(data)
    return items

def batch_write(items):
    table = "sample_table"
    # writing batch
    try:
        print(type(items))
        dynamodb_client.batch_write_item(RequestItems={
            table: [{'PutRequest':
                     {
                         'Item': items
                     }}]
        })
        print(f'resource, specify all types : write succeeded.')
    except Exception as e:
        print(f'resource, specify all types : write failed: {e}')

inp_file = "sample_mapping.csv"
json_data = convert_csv_to_json_list(inp_file)
batch_write(json_data)
I keep getting:
<class 'list'>
resource, specify all types : write failed: Parameter validation failed:
Invalid type for parameter RequestItems.sample_table[0][{'col1': 'abc', 'col2': 59, 'col3': 0
, 'Row_Created': '2021-10-08T04:36:04.787Z', 'col4': 'dfrwfr', 'col5': 1, 'Row_Updated': '2021-10-08T04:36:04.787Z'}, {'col1': 'sffr', 'col2': 45, 'col3': 0
, 'Row_Created': '2021-10-08T04:36:04.787Z', 'col4': 'gty7u', 'col5': 1, 'Row_Updated': '2021-10-08T04:36:04.787Z'}], type: <class 'list'>, valid types: <class
'dict'>
Can someone help me see where I am going wrong with the batch insertion? I have tried looking up the documentation too.
Each item should be wrapped in its own PutRequest entry; you are passing the whole list of items as a single Item.
RequestItems = {
    table: [
        {'PutRequest': {'Item': {}}},
        {'PutRequest': {'Item': {}}}
    ]
}
There are certain limitations when using batch_write_item, such as a maximum of 25 items per request.
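For completeness, here is a hedged sketch of how batch_write could be restructured along those lines, assuming the plain-Python dicts produced by convert_csv_to_json_list above. TypeSerializer converts them to the low-level {'S': ...}/{'N': ...} attribute format the client expects (skip that step if your items are already typed), and each call is capped at 25 items:
from boto3.dynamodb.types import TypeSerializer

serializer = TypeSerializer()

def batch_write(items, table="sample_table", batch_size=25):
    for start in range(0, len(items), batch_size):
        chunk = items[start:start + batch_size]
        request_items = {
            table: [
                # one PutRequest per item, values serialized to DynamoDB attribute types
                {'PutRequest': {'Item': {k: serializer.serialize(v) for k, v in item.items()}}}
                for item in chunk
            ]
        }
        response = dynamodb_client.batch_write_item(RequestItems=request_items)
        # batch_write_item can return unprocessed items; a production loop should retry them
        if response.get('UnprocessedItems'):
            dynamodb_client.batch_write_item(RequestItems=response['UnprocessedItems'])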

How to pass a CustomDataAsset to a DataContext to run custom expectations on a batch?

I have a CustomPandasDataset with a custom expectation
from great_expectations.data_asset import DataAsset
from great_expectations.dataset import PandasDataset
from datetime import date, datetime, timedelta

class CustomPandasDataset(PandasDataset):
    _data_asset_type = "CustomPandasDataset"

    @DataAsset.expectation(["column", "datetime_match", "datetime_diff"])
    def expect_column_max_value_to_match_datetime(self, column: str, datetime_match: datetime = None, datetime_diff: tuple = None) -> dict:
        """
        Check if data is constantly updated by matching the max datetime column to a
        datetime value or to a datetime difference.
        """
        max_datetime = self[column].max()
        if datetime_match is None:
            datetime_match = date.today()
        if datetime_diff:
            success = (datetime_match - timedelta(*datetime_diff)) <= max_datetime <= datetime_match
        else:
            success = (max_datetime == datetime_match)
        result = {
            "data_max_value": max_datetime,
            "expected_max_value": str(datetime_match),
            "expected_datetime_diff": datetime_diff
        }
        return {
            "success": success,
            "result": result
        }
I want to run the expectation expect_column_max_value_to_match_datetime on a given pandas dataframe:
expectation_suite_name = "df-raw-expectations"
suite = context.create_expectation_suite(expectation_suite_name, overwrite_existing=True)
df_ge = ge.from_pandas(df, dataset_class=CustomPandasDataset)
batch_kwargs = {'dataset': df_ge, 'datasource': 'df_raw_datasource'}
# Get batch of data
batch = context.get_batch(batch_kwargs, suite)
which I get from a DataContext. Now when I run the expectation on this batch
datetime_diff = 4,
batch.expect_column_max_value_to_match_datetime(column='DATE', datetime_diff=datetime_diff)
I get the following error:
AttributeError: 'PandasDataset' object has no attribute 'expect_column_max_value_to_match_datetime'
As per the docs, I've specified the dataset_class=CustomPandasDataset attribute when constructing the Great Expectations dataset; indeed, running the expectation on df_ge works, but not on the batch of data.
According to the docs
To use custom expectations in a datasource or DataContext, you need to define the custom DataAsset in the datasource configuration or batch_kwargs for a specific batch.
so pass CustomPandasDataset through the data_asset_type parameter of the get_batch() function:
# Get batch of data
batch = context.get_batch(batch_kwargs, suite, data_asset_type=CustomPandasDataset)
or define it in the context configuration:
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.data_context import BaseDataContext

data_context_config = DataContextConfig(
    ...
    datasources={
        "sales_raw_datasource": {
            "data_asset_type": {
                "class_name": "CustomPandasDataset",
                "module_name": "custom_dataset",
            },
            "class_name": "PandasDatasource",
            "module_name": "great_expectations.datasource",
        }
    },
    ...
)
context = BaseDataContext(project_config=data_context_config)
where CustomPandasDataset is available from the module/script custom_dataset.py.
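With either option in place, the batch returned by get_batch is a CustomPandasDataset, so the custom expectation from the question becomes available on it. A quick usage sketch, reusing the names from the question:
# The batch is now a CustomPandasDataset, so the custom expectation resolves.
batch = context.get_batch(batch_kwargs, suite, data_asset_type=CustomPandasDataset)

datetime_diff = (4,)  # tuple, unpacked into timedelta inside the expectation
print(batch.expect_column_max_value_to_match_datetime(column='DATE', datetime_diff=datetime_diff))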
