I have a dataset consisting of 250 rows that looks like the following:
In MongoDB Compass, I inserted the first row as follows:
db.employees.insertOne({"employee_id": 412153,
    "first_name": "Carrol",
    "last_name": "Dhin",
    "email": "carrol.dhin@company.com",
    "managing": [{"manager_id": 412153, "employee_id": 174543}],
    "department": [{"department_name": "Accounting", "department_budget": 500000}],
    "laptop": [{"serial_number": "CSS49745",
        "manufacturer": "Lenovo",
        "model": "X1 Gen 10",
        "date_assigned": new Date("2022-01-15"),
        "installed_software": ["MS Office", "Adobe Acrobat", "Slack"]}]})
If I wanted to insert all 250 rows into the database using PyMongo in Python, how would I ensure that every row is entered following the format that I used when I inserted it manually in the Mongo shell?
from pymongo import MongoClient
import pandas as pd
client = MongoClient('localhost', 27017)
db = client.MD
collection = db.gammaCorp
df = pd.read_csv(' ') #insert CSV name here
for i in df.index:
    data = {}
    data['employee_id'] = df['employee_id'][i]
    data['first_name'] = df['first_name'][i]
    data['last_name'] = df['last_name'][i]
    data['email'] = df['email'][i]
    data['managing'] = [{'manager_id': df['employee_id'][i]}, {'employee_id': df['managing'][i]}]
    data['department'] = [{'department_name': df['department'][i]}, {'department_budget': df['department_budget'][i]}]
    data['laptop'] = [{'serial_number': df['serial_number'][i]}, {'manufacturer': df['manufacturer'][i]}, {'model': df['model'][i]}, {'date_assigned': df['date_assigned'][i]}, {'installed_software': df['installed_software'][i]}]
    collection.insert_one(data)
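A sketch of one way to do this with PyMongo (my suggestion, not part of the question): build each row as a nested document with the same shape as the manual insert, convert the date column so it is stored as a BSON date rather than a string, and send everything in one insert_many call. The column names and the comma-separated installed_software format are assumptions based on the fields above, so adjust them to match your CSV.
from pymongo import MongoClient
import pandas as pd

client = MongoClient('localhost', 27017)
collection = client.MD.gammaCorp

df = pd.read_csv('employees.csv')  # hypothetical file name
df['date_assigned'] = pd.to_datetime(df['date_assigned'])  # real datetimes become BSON dates

docs = []
for row in df.to_dict('records'):
    docs.append({
        'employee_id': int(row['employee_id']),
        'first_name': row['first_name'],
        'last_name': row['last_name'],
        'email': row['email'],
        # one dict holding both keys, matching the manually inserted document
        'managing': [{'manager_id': int(row['employee_id']),
                      'employee_id': int(row['managing'])}],
        'department': [{'department_name': row['department'],
                        'department_budget': int(row['department_budget'])}],
        'laptop': [{'serial_number': row['serial_number'],
                    'manufacturer': row['manufacturer'],
                    'model': row['model'],
                    'date_assigned': row['date_assigned'].to_pydatetime(),
                    # assumes a comma-separated string such as "MS Office, Slack"
                    'installed_software': [s.strip() for s in str(row['installed_software']).split(',')]}],
    })

collection.insert_many(docs)  # one round trip for all 250 rows
This keeps every document identical in structure to the one inserted manually, and insert_many is much faster than 250 separate insert_one calls.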
I have the following problem. I am creating a MongoDB collection. The rows look like this:
{'_id': ObjectId('62136674d12e4f7384caf6c0'), 'visit_id': 595002379, 'referal': '', 'pageviews': [], 'ab_variants': [{'id': 1, 'var': 0}]}.
I am inserting new rows. I want to check if visit_id already exists. If yes, I want to extend ab_variants for example like this:
{'_id': ObjectId('62136674d12e4f7384caf6c0'), 'visit_id': 595002379, 'referal': '', 'pageviews': [], 'ab_variants': [{'id': 1, 'var': 0}, {'id': 2, 'var': 1}]}.
What I tried so far:
from pymongo import MongoClient
try:
    conn = MongoClient()
    print("Connected successfully!!!")
except:
    print("Could not connect to MongoDB")
# database
db = conn.database
# Created or Switched to collection names: my_gfg_collection
collection = db.my_gfg_collection
# drop
db.my_gfg_collection.drop()
print("Old data dropped successfully!")
if collection.find_one({"visit_id": 595002379}) is None:
    emp_rec = {
        "visit_id": 595002379,
        "referal": "",
        "pageviews": [],
        "ab_variants": [{"id": 1,
                         "var": 0}]
    }
else:
    # I WANT TO UPDATE HERE
    pass

# Insert Data
rec_id = collection.insert_one(emp_rec)
How can I do this, please?
You can do something like this:
from pymongo import MongoClient
try:
    conn = MongoClient()
    print("Connected successfully!!!")
except:
    print("Could not connect to MongoDB")
# database
db = conn.database
# Created or Switched to collection names: my_gfg_collection
collection = db.my_gfg_collection
# drop
db.my_gfg_collection.drop()
print("Old data dropped successfully!")
# store the document in a variable, if present
record = collection.find_one({"visit_id": 595002379})
if record is None:
    emp_rec = {
        "visit_id": 595002379,
        "referal": "",
        "pageviews": [],
        "ab_variants": [{"id": 1,
                         "var": 0}]
    }
    # Insert Data
    rec_id = collection.insert_one(emp_rec)
else:
    # I WANT TO UPDATE HERE
    if 'ab_variants' not in record:
        record['ab_variants'] = []
    # append the record that you want to add to the ab_variants list
    record['ab_variants'].append({"id": 2, "var": 1})
    # update the document
    collection.update_one({'_id': record['_id']}, {"$set": record}, upsert=True)
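If the goal is only to append to the array, a shorter alternative (a sketch on top of the answer above, not part of it) is to let MongoDB do the append atomically with $push, so you do not need to read the document first:
from pymongo import MongoClient

collection = MongoClient().database.my_gfg_collection

# appends {"id": 2, "var": 1} to ab_variants of the matching visit
collection.update_one(
    {"visit_id": 595002379},
    {"$push": {"ab_variants": {"id": 2, "var": 1}}}
)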
I have a table named employees.
I need to push the employees to an Elasticsearch index using Python.
import MySQLdb
import json
from elasticsearch import Elasticsearch
db = MySQLdb.connect("localhost", "admin", "password", "dbname")
cursor = db.cursor()
Here is my quick idea 😎
from sqlalchemy import create_engine
import pymysql
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
#Replaceme
CONSTR = 'mysql+pymysql://root:<password>@127.0.0.1/<dbname>'
sqlEngine = create_engine(CONSTR, pool_recycle=3600)
dbConnection = sqlEngine.connect()

df = pd.read_sql("select * from employees", dbConnection)
rows = df.to_dict(orient='records')   # one dict per row, so item['id'] works below

es = Elasticsearch()

actions = []
for item in rows:
    action = {
        #replace me if need to
        "_id": "employee_%s" % item['id'],
        "_source": item
    }
    actions.append(action)
response = helpers.bulk(es, actions, index="employees", doc_type='_doc')
dbConnection.close()
Dump out a CSV file (SELECT ... INTO OUTFILE) from MySQL, then load that into Elasticsearch.
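A minimal sketch of that second step (my addition, assuming the dump is a CSV with a header row; the file and index names are placeholders):
import csv
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()

with open('employees.csv', newline='') as f:  # hypothetical dump file
    reader = csv.DictReader(f)  # the header row supplies the field names
    actions = ({"_index": "employees", "_source": row} for row in reader)
    helpers.bulk(es, actions)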
I'm looking for a way to replicate the functionality of SQL Server's JSON_VALUE function using a SQLAlchemy query. I'm using metadata.reflect to define my existing db tables in SQLAlchemy.
SQL:
SELECT Id,
JSON_VALUE(BankDataJSON,'$.AccountName')
FROM BankData
SQLAlchemy Model:
db = SQLAlchemy()
db.Model.metadata.reflect(db.engine)
class BankData(db.Model):
__table__ = db.Model.metadata.tables['BankData']
Endpoint / Query:
@cust_accts_bp.route('/api/CustomerAccts')
def get_cust_accts():
    custId = request.args.get('custId')
    db = SQLAlchemy(app)
    BankData = models.bank_data.BankData
    BankAccounts = models.bank_accounts.BankAccounts
    qry = db.session.query(BankAccounts.Id, BankAccounts.AccountNumber, BankAccounts.BankName,
                           BankData.AppId, BankData.CustomerId, BankAccounts.Filename, BankData.BankDataJSON) \
        .filter(
            and_(BankData.Id == BankAccounts.BankDataId, BankData.CustomerId == custId)
        )
    engine = app.config['SQLALCHEMY_DATABASE_URI']
    df = pd.read_sql(qry.statement, engine)
    df['BankDataJSON'] = df['BankDataJSON'].apply(json.loads)  # convert string representation of JSON
    df['BankDataJSON'] = df['BankDataJSON'].map(lambda x: [x[i] for i in x if i == 'AccountName'][0])
    df = df.rename(columns={'BankDataJSON': 'BusinessName'})
    response = json.loads(df.to_json(orient="records"))
    return json.dumps(response)
Using this method, I have to manually deserialize the JSON object (BankDataJSON) to a Python dict, and parse it to get the value I want ('AccountName'). If I were to use SQL Server's JSON_VALUE function, this is all done for you.
JSON response:
[
{
"Id": 3003,
"AccountNumber": "111111111",
"BankName": "Wells Fargo",
"AppId": 111111,
"CustomerId": "555555",
"Filename": "some filename.pdf",
"BusinessName": "Some BusinessName"
},
{
"Id": 3004,
"AccountNumber": "22222222",
"BankName": "Wells Fargo",
"AppId": 111111,
"CustomerId": "555555",
"Filename": "Some filename",
"BusinessName": "Some Businessname"
}
]
How can I go about doing this? I also want to be able to replicate SQL Server's CROSS APPLY OPENJSON functionality for working with arrays of JSON objects in the future. Do I need to define the BankDataJSON column as a JSON type in my model? When I do this, I get an error regarding pyodbc's inability to deserialize JSON in the MSSQL dialect.
Maybe you can try to call the server's function in your query, something like this:
from sqlalchemy.sql import func
db = SQLAlchemy(app)
BankData = models.bank_data.BankData
qry = db.session.query(BankData.Id,
                       func.JSON_VALUE(BankData.BankDataJSON, '$.AccountName'))
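As a follow-up sketch (my addition, not part of the original answer), you can also label the extracted value so the column comes back already named, which removes the json.loads post-processing in pandas:
from sqlalchemy.sql import func

qry = db.session.query(
    BankData.Id,
    func.JSON_VALUE(BankData.BankDataJSON, '$.AccountName').label('BusinessName')
)
df = pd.read_sql(qry.statement, app.config['SQLALCHEMY_DATABASE_URI'])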
I have a case study where I need to take data from a REST API, do some analysis on the data using aggregate functions, joins, etc., and use the response data in JSON format to plot some retail graphs.
Approaches being followed till now:
Read the data from JSON, store it in Python variables, and use INSERT to hit the SQL query. Obviously this is a costly operation because it inserts into the database for every JSON line read. For 33k rows it takes more than 20 minutes, which is inefficient.
This can be handled in Elasticsearch for faster processing, but complex operations like joins are not available in Elasticsearch.
If anybody can suggest the best approach (like preprocessing or post-processing in Python) for handling such scenarios, it would be helpful.
Thanks in advance
SQL script:
def store_data(AccountNo):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    insert_query = "INSERT INTO cstore (AccountNo) VALUES (%s)"
    cursor.execute(insert_query, (AccountNo,))
    db.commit()
    cursor.close()
    db.close()
    return
def on_data(file_path):
    #This is the meat of the script...it connects to your mongoDB and stores the tweet
    try:
        # Decode the JSON from Twitter
        testFile = open(file_path)
        datajson = json.load(testFile)
        #print (len(datajson))
        #grab the wanted data from the Tweet
        for i in range(len(datajson)):
            for cosponsor in datajson[i]:
                AccountNo = cosponsor['AccountNo']
                store_data(AccountNo)
Edit 1: JSON added
{
    "StartDate": "1/1/18",
    "EndDate": "3/30/18",
    "Transactions": [
        {
            "CSPAccountNo": "41469300",
            "ZIP": "60098",
            "ReportDate": "2018-03-08T00:00:00",
            "POSCode": "00980030003",
            "POSCodeModifier": "0",
            "Description": "TIC TAC GUM WATERMEL",
            "ActualSalesPrice": 1.59,
            "TotalCount": 1,
            "Totalsales": 1.59,
            "DiscountAmount": 0,
            "DiscountCount": 0,
            "PromotionAmount": 0,
            "PromotionCount": 0,
            "RefundAmount": 0,
            "RefundCount": 0
        },
        {
            "CSPAccountNo": "41469378",
            "ZIP": "60098",
            "ReportDate": "2018-03-08T00:00:00",
            "POSCode": "01070080727",
            "POSCodeModifier": "0",
            "Description": "PAYDAY KS",
            "ActualSalesPrice": 2.09,
            "TotalCount": 1,
            "Totalsales": 2.09,
            "DiscountAmount": 0,
            "DiscountCount": 0,
            "PromotionAmount": 0,
            "PromotionCount": 0,
            "RefundAmount": 0,
            "RefundCount": 0
        }
    ]
}
I do not have your JSON file, so I don't know if it is runnable, but I would have tried something like below: I read just your account info into a list and then write to the db at once with executemany. I expect it to have a better (lower) execution time than 20 minutes.
def store_data(AccountNo):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    # MySQLdb uses the pyformat paramstyle, so named placeholders look like %(name)s
    insert_query = "INSERT INTO cstore (AccountNo, ZIP, ReportDate) VALUES (%(AccountNo)s, %(ZIP)s, %(ReportDate)s)"
    cursor.executemany(insert_query, AccountNo)
    db.commit()
    cursor.close()
    db.close()
    return
def on_data(file_path):
    # This is the meat of the script...it connects to your mongoDB and stores the tweet
    try:
        # declare an empty list for all the account numbers
        accountno_list = list()
        # Decode the JSON from Twitter
        testFile = open(file_path)
        datajson = json.load(testFile)
        # print (len(datajson))
        # grab the wanted data from the Tweet
        for row in datajson['Transactions']:
            values = dict()
            values['AccountNo'] = row['CSPAccountNo']
            values['ZIP'] = row['ZIP']
            values['ReportDate'] = row['ReportDate']
            # from here on you can populate the attributes you need in a similar way..
            accountno_list.append(values)
    except:
        pass
    store_data(accountno_list)
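A small follow-up sketch (my addition, not part of the original answer): if the list grows very large, you can send it to MySQL in chunks so a single executemany call does not build one huge packet; the chunk size below is an arbitrary choice.
CHUNK = 1000  # rows per batch

def store_data_chunked(records):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    insert_query = ("INSERT INTO cstore (AccountNo, ZIP, ReportDate) "
                    "VALUES (%(AccountNo)s, %(ZIP)s, %(ReportDate)s)")
    for start in range(0, len(records), CHUNK):
        cursor.executemany(insert_query, records[start:start + CHUNK])
    db.commit()
    cursor.close()
    db.close()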
Trying to push CSV data into MongoDB using Python. I'm a beginner with Python & MongoDB. I used the following code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient
#CSV to JSON Conversion
csvfile = open('C://test//final-current.csv', 'r')
jsonfile = open('C://test//6.json', 'a')
reader = csv.DictReader( csvfile )
header= [ "S.No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V" ,"Time"]
#fieldnames=header
output=[]
for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    output.append(row)
json.dump(output, jsonfile, indent=None, sort_keys=False, encoding="UTF-8")
mongo_client=MongoClient()
db=mongo_client.october_mug_talk
db.segment.drop()
data=pd.read_csv('C://test//6.json', error_bad_lines=0)
df = pd.DataFrame(data)
records = csv.DictReader(df)
db.segment.insert(records)
but the output is given in this format
/* 0 */
{
"_id" : ObjectId("54891c4ffb2a0303b0d43134"),
"[{\"AverageTradedPrice\":\"0\"" : "BuyPrice:\"349.75\""
}
/* 1 */
{
"_id" : ObjectId("54891c4ffb2a0303b0d43135"),
"[{\"AverageTradedPrice\":\"0\"" : "BuyQuantity:\"3000\""
}
/* 2 */
{
"_id" : ObjectId("54891c4ffb2a0303b0d43136"),
"[{\"AverageTradedPrice\":\"0\"" : "ClosePrice:\"350\""
}
/* 3 */
{
"_id" : ObjectId("54891c4ffb2a0303b0d43137"),
"[{\"AverageTradedPrice\":\"0\"" : "HighPrice:\"0\""
}
Actually, I want the output such that, for a single _id, all the other fields are shown as subfields,
e.g.:
"_id" : ObjectId("54891c4ffb2a0303b0d43137")
AveragetradedPrice :0
HighPrice:0
ClosePrice:350
buyprice:350.75
Please help me out. Thanks in advance.
Thank you for the suggestion. This is the corrected code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient
#CSV to JSON Conversion
csvfile = open('C://test//final-current.csv', 'r')
reader = csv.DictReader( csvfile )
mongo_client=MongoClient()
db=mongo_client.october_mug_talk
db.segment.drop()
header= [ "S No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V" ,"Time"]
for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    db.segment.insert_one(row)
Why do you insert data one by one? Take a look at this one.
import pandas as pd
from pymongo import MongoClient
client = MongoClient(<your_credentials>)
database = client['YOUR_DB_NAME']
collection = database['your_collection']
def csv_to_json(filename, header='infer'):
    # 'infer' uses the CSV's header row for the field names
    data = pd.read_csv(filename, header=header)
    return data.to_dict('records')
collection.insert_many(csv_to_json('your_file_path'))
Please be aware that it might crash your app when the file is too big.
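If the file is too big to load at once, a variant (my sketch, not the original answer) is to stream it in chunks with pandas and insert each chunk separately, so the whole CSV never sits in memory:
import pandas as pd
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # hypothetical connection string
collection = client['YOUR_DB_NAME']['your_collection']

# read and insert 10,000 rows at a time instead of loading the whole file
for chunk in pd.read_csv('your_file_path', chunksize=10000):
    collection.insert_many(chunk.to_dict('records'))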
The easiest way is by using pandas. My code is:
import json
import pymongo
import pandas as pd
myclient = pymongo.MongoClient()
df = pd.read_csv('yourcsv.csv',encoding = 'ISO-8859-1') # loading csv file
df.to_json('yourjson.json', orient='records') # saving to json file, one object per row
jdf = open('yourjson.json').read() # loading the json file
data = json.loads(jdf) # parsing the json into a list of dicts
Now you can insert this JSON into your MongoDB database :-]
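For completeness, a sketch of that last step (the database and collection names below are placeholders, not from the original answer):
db = myclient['your_db']  # hypothetical database name
collection = db['your_collection']  # hypothetical collection name
collection.insert_many(data)  # data is the list of row dicts parsed above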
There is a better way with fewer imports, assuming you have a header row in your CSV.
from pymongo import MongoClient
import csv
# DB connectivity
client = MongoClient('localhost', 27017)
db = client.db
collection = db.collection
# Function to parse csv to dictionary
def csv_to_dict():
    reader = csv.DictReader(open(FILEPATH))
    result = {}
    for row in reader:
        key = row.pop('First_value')
        result[key] = row
    return result
# Final insert statement
collection.insert_one(csv_to_dict())
Hope that helps
from pymongo import MongoClient
import csv
import json
# DB connectivity
client = MongoClient('localhost', 27017)
db = client["database name"]
col = db["collection"]
# Function to parse csv to dictionary
def csv_to_dict():
    reader = csv.DictReader(open('File with path', 'r'))
    result = {}
    for row in reader:
        key = row.pop('id')
        result[key] = row
    return result
# Final insert statement
x=col.insert_one(csv_to_dict())
print(x.inserted_id)
# to insert one row
# and to insert many rows, the following code is to be executed
from pymongo import MongoClient
import csv
# read csv file as a list of lists
client = MongoClient('localhost', 27017)
db = client["data base name"]
col = db["Collection Name"]
with open('File with path', 'r') as read_obj:
    # pass the file object to DictReader() to get the reader object
    csv_reader = csv.DictReader(read_obj)
    # pass the reader object to list() to get a list of dicts (one per row)
    mylist = list(csv_reader)
    #print(list_of_rows)
x = col.insert_many(mylist)
#print list of the _id values of the inserted documents:
print(x.inserted_ids)
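One caveat worth adding (my note, not part of the original answer): csv.DictReader yields every value as a string, so numeric columns end up stored in MongoDB as strings unless you convert them before inserting, for example:
for doc in mylist:
    # hypothetical numeric columns; convert them before calling insert_many
    doc['Buy Price'] = float(doc['Buy Price'])
    doc['Buy Quantity'] = int(doc['Buy Quantity'])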