Export pymongo collection to a JSON file - python

How do I export a pymongo collection to a JSON file? I have a huge collection with approximately 1 GB of data, and I need an efficient way to export it and create a JSON file from it.
I am using the code below, taken from one of the Stack Overflow answers.
def getJSONFromDB():
    db = GetMongo_client()
    collection = db['collection_name']
    cursor = collection.find({})
    file = open("collection.json", "w")
    file.write('[')
    for document in cursor:
        file.write(json.dumps(document))
        file.write(',')
    file.write(']')
But it gives me the following error:
TypeError: Object of type ObjectId is not JSON serializable

The pymongo documentation you pointed to is obsolete. If you're using version 1.7, I recommend updating. With a more recent version you can do this:
from bson.json_util import dumps
dumps(l)
https://pymongo.readthedocs.io/en/stable/api/bson/json_util.html
Side answer: u'name', u'date', u'_id', etc. are the names of the fields of the document in the database.
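For example, here is a minimal sketch of the export loop from the question rewritten with bson.json_util.dumps (assuming GetMongo_client() is the helper from the question that returns a database handle):
from bson.json_util import dumps

def getJSONFromDB():
    db = GetMongo_client()  # assumed helper from the question
    collection = db['collection_name']
    cursor = collection.find({})
    with open("collection.json", "w") as file:
        file.write('[')
        for i, document in enumerate(cursor):
            if i:
                file.write(',')
            # dumps() from bson.json_util serializes ObjectId, datetime and
            # other BSON types that the plain json module rejects.
            file.write(dumps(document))
        file.write(']')
This also avoids the trailing comma before the closing bracket, so the output stays valid JSON.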

Here is an end-to-end solution.
Step 1: export all the needed collections to JSON files.
Step 2: import those JSON files into another database.
import pymongo
import os
import json
from bson.json_util import dumps
import sys
import Consts

def replace_word(infile, old_word, new_word):
    if not os.path.isfile(infile):
        print("Error on replace_word, not a regular file: " + infile)
        sys.exit(1)
    # read the file, replace the word, and write it back
    with open(infile, 'r') as f1:
        content = f1.read()
    with open(infile, 'w') as f2:
        f2.write(content.replace(old_word, new_word))

def replase_coletion(db_name_list, client_prod, client_stg):
    for db_name in db_name_list:
        db_prod = client_prod[db_name]
        os.makedirs(db_name, exist_ok=True)
        tables_prod = db_prod.list_collection_names()
        print(tables_prod)
        for table in tables_prod:
            print("exporting data for table", table)
            data = list(db_prod[table].find())
            json_data = dumps(data, indent=4)
            # write data in json file
            with open(f"{db_prod.name}/{table}.json", 'w') as file:
                file.write(json_data)
            replace_word(db_prod.name + "/" + table + ".json", "$oid", "oid")
        db_stg = client_stg[db_name]
        tables_stg = db_stg.list_collection_names()
        print(tables_stg)
        for table in tables_stg:
            db_stg.drop_collection(table)
        for table in tables_prod:
            try:
                with open(f"{db_prod.name}/{table}.json") as read_file:
                    data = json.load(read_file)
                print(data)
                client_stg[db_name][table].insert_many(data)
            except Exception:
                print("empty list")
        db_stg = client_stg[db_name]
        tables_stg = db_stg.list_collection_names()
        print(tables_stg)

if __name__ == "__main__":
    db_name_list = Consts.db_lists
    client_prod = pymongo.MongoClient(Consts.from_bd)
    client_stg = pymongo.MongoClient(Consts.to_db)
    replase_coletion(db_name_list, client_prod, client_stg)
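The Consts module referenced above is not shown in the answer; a hypothetical version might look like this (the connection strings and database names below are placeholders, not values from the original):
# Consts.py -- hypothetical configuration module assumed by the script above
db_lists = ["db_one", "db_two"]          # databases to copy (placeholder names)
from_bd = "mongodb://prod-host:27017/"   # source (production) connection string
to_db = "mongodb://stg-host:27017/"      # target (staging) connection string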

Related


How to dump data into Json file
As you can see in the Python code below, I am trying to dump data into a JSON file, but I am struggling to get it working.
import time
import json
import os

def long_function(name):
    cache_path = 'cache.json'
    if not os.path.isfile(cache_path):
        with open(cache_path, 't') as json_file:
            cache_file_data = [name]
            jsondump(cache_file_data, json_file)
    else:
        with open(cache_path, 'r') as json_file:
            cache_file_data = json.load(json_file)
        if name in cache_file_data:
            print("Name already exist")
            return name
        else:
            cache_file_data.append(name)
            for e in range(5):
                time.sleep(1)
                print(e+1)
            with open(cache_path, 'w') as json_file:
                jsondump(cache_file_data, json_file)
            print("New Name added in cache")
            return name

print(long_function('nitu'))
Please help me resolve this problem.
import json

# JSON data:
x = '{"organization": "New_holn", "city": "Noida", "country": "India"}'

# python object to be appended
y = {"pin": 117845}

# parsing JSON string:
z = json.loads(x)

# appending the data
z.update(y)

# the result is a JSON string:
print(json.dumps(z))
Follow the pattern above. The errors in your code are, first, that the file mode in the if branch is not defined correctly; you wrote
with open(cache_path, "t") as json_file:
instead of
with open(cache_path, "w") as json_file:
And second, you are not actually dumping the data: jsondump is not defined, it should be json.dump.
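Putting those fixes together, a corrected sketch of the question's function might look like this (the caching logic is kept as in the question; the file mode and the json.dump calls are changed, and a return is added in the first branch):
import time
import json
import os

def long_function(name):
    cache_path = 'cache.json'
    if not os.path.isfile(cache_path):
        # "w" creates the cache file; "t" alone is not a valid mode
        with open(cache_path, 'w') as json_file:
            json.dump([name], json_file)
        return name
    with open(cache_path, 'r') as json_file:
        cache_file_data = json.load(json_file)
    if name in cache_file_data:
        print("Name already exists")
        return name
    cache_file_data.append(name)
    for e in range(5):
        time.sleep(1)
        print(e + 1)
    with open(cache_path, 'w') as json_file:
        json.dump(cache_file_data, json_file)
    print("New name added in cache")
    return name

print(long_function('nitu'))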

Python memory error when saving a split CSV in Python

I retrieve all the data of the Lead object from Salesforce with Python and save it as a CSV, but since there is a lot of information I get a Python memory error.
**This code produces the memory error:**
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'

def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead Column setting to be acquired
    columns = [
        "CreatedDate"
    ]
    sosl = 'SELECT {0[0]} FROM Lead'.format(columns)

    # Data acquisition with SOSL
    data = sf.query_all(sosl)

    # Delete CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="") as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    writer.writerow(data)

if __name__ == '__main__':
    main()
That's why, when there are more than 1000 rows, I want the CSV to be recorded in parts, as follows:
1. output1.csv (1000 rows)
2. output2.csv (1000 rows)
3. output3.csv ...
To split the CSV I added iterator=True, chunksize=1000 to the open() call, but I get the following error. What do I need to do to make this work?
Code
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'

def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead Column setting to be acquired
    columns = [
        "CreatedDate"
    ]
    sosl = 'SELECT {0[0]} FROM Lead'.format(columns)

    # Data acquisition with SOSL
    data = sf.query_all(sosl)

    # Delete CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="",iterator=True,chunksize=1000) as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    writer.writerow(data)

if __name__ == '__main__':
    main()
Error message
Traceback (most recent call last):
  File "c:/Users/test/Documents/test/test5.py", line 44, in <module>
    main()
  File "c:/Users/test/Documents//test5.py", line 36, in main
    with open(output_csv, 'w', newline="",iterator=True,chunksize=1000) as f:
TypeError: 'iterator' is an invalid keyword argument for open()
I thought this approach would avoid the Python memory error. If there is another way to do this, please let me know.
data = sf.query_all(sosl)
This call retrieves all information into memory for the given query, which is SOQL, not SOSL.
Instead, use
data = sf.query_all_iter(sosl)
and iterate over the resulting iterator instead of data.items(), which will be much more memory-efficient as it won't attempt to retrieve all items at once.
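A sketch of how the writing loop could use query_all_iter and split the output into files of 1000 rows each, as requested in the question (the chunking helper below is an illustration, not part of the original answer; the name export_leads_in_chunks is made up):
import csv

def export_leads_in_chunks(sf, columns, chunk_size=1000):
    soql = 'SELECT {0} FROM Lead'.format(', '.join(columns))
    records = sf.query_all_iter(soql)  # lazy iterator over individual records

    writer = None
    file = None
    file_index = 0
    for i, record in enumerate(records):
        if i % chunk_size == 0:
            # start a new output file every chunk_size rows
            if file:
                file.close()
            file_index += 1
            file = open('output{}.csv'.format(file_index), 'w', newline='')
            writer = csv.DictWriter(file, fieldnames=columns)
            writer.writeheader()
        record.pop('attributes', None)  # drop Salesforce metadata
        writer.writerow(record)
    if file:
        file.close()
Called as export_leads_in_chunks(sf, ["CreatedDate"]), this writes output1.csv, output2.csv, ... with at most 1000 data rows each, without holding the whole result set in memory.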

Remove very first character in file

I'm trying to remove the very first character (") from a file which contains a JSON String. I'm using Python for this. Below is my code:
jsonOutput = 'JsonString_{}.{}'.format(str(uuid.uuid1()), "json")
jsonOutput_File = os.path.join(arcpy.env.scratchFolder, jsonOutput)
with open(jsonOutput_File, 'w') as json_file:
    json.dump(jsonString, json_file)

# I was able to remove the very last character using the code below
with open(jsonOutput_File, 'r+') as read_json_file:
    read_json_file.seek(-1, os.SEEK_END)
    read_json_file.truncate()
Basically when I dump the JSON String to a file, the String is getting surrounded by double quotes. I'm trying to remove these double quotes from the first & last position of the file.
If you already have a JSON string, simply write it to the file.
Encoding the JSON string to JSON again using json.dump() is a bad idea, and the result cannot be fixed as simply as removing a leading and a trailing quote.
Consider the following minimal and complete example:
import json
import os
import uuid
myobject = {"hello": "world"}
jsonString = json.dumps(myobject)
jsonOutput = 'JsonString_{}.{}'.format(str(uuid.uuid1()), "json")
jsonOutput_File = os.path.join("d:\\", jsonOutput)
with open(jsonOutput_File, 'w') as json_file:
    json.dump(jsonString, json_file)
The output is a file with the content:
"{\"hello\": \"world\"}"
Removing the quotes will not make it valid JSON.
Instead, avoid the duplicate JSON creation, either by removing json.dumps() which converts the object to JSON one time, or by removing json.dump(), which does it a second time.
Solution 1:
import json
import os
import uuid
myobject = {"hello": "world"}
# <-- deleted line here
jsonOutput = 'JsonString_{}.{}'.format(str(uuid.uuid1()), "json")
jsonOutput_File = os.path.join("d:\\", jsonOutput)
with open(jsonOutput_File, 'w') as json_file:
    json.dump(myobject, json_file)  # <-- changed to object here
Solution 2:
import json
import os
import uuid
myobject = {"hello": "world"}
jsonString = json.dumps(myobject)
jsonOutput = 'JsonString_{}.{}'.format(str(uuid.uuid1()), "json")
jsonOutput_File = os.path.join("d:\\", jsonOutput)
with open(jsonOutput_File, 'w') as json_file:
    json_file.write(jsonString)  # <-- Note this line

How to dump a collection to json file using pymongo

I am trying to dump a collection to a .json file, but after looking through the pymongo tutorial I cannot find anything that relates to it.
Tutorial link: https://api.mongodb.com/python/current/tutorial.html
Just get all documents and save them to file e.g.:
from bson.json_util import dumps
from pymongo import MongoClient

if __name__ == '__main__':
    client = MongoClient()
    db = client.db_name
    collection = db.collection_name
    cursor = collection.find({})
    with open('collection.json', 'w') as file:
        file.write('[')
        for document in cursor:
            file.write(dumps(document))
            file.write(',')
        file.write(']')
The accepted solution produces invalid JSON: it leaves a trailing comma before the closing square bracket ], and the JSON spec does not allow trailing commas. See this answer and this reference.
To build on the accepted solution I used the following:
from bson.json_util import dumps
from pymongo import MongoClient
import json

if __name__ == '__main__':
    client = MongoClient()
    db = client.db_name
    collection = db.collection_name
    cursor = collection.find({})
    with open('collection.json', 'w') as file:
        json.dump(json.loads(dumps(cursor)), file)
Here's another way to avoid writing a comma before the closing square bracket, also using with open to save some space.
filter = {"type": "something"}
type_documents = db['cluster'].find(filter)
type_documents_count = db['cluster'].count_documents(filter)
with open("type_documents.json", "w") as file:
    file.write('[')
    # Start from one as type_documents_count also starts from 1.
    for i, document in enumerate(type_documents, 1):
        file.write(json.dumps(document, default=str))
        if i != type_documents_count:
            file.write(',')
    file.write(']')
It simply doesn't write the comma when the iteration count equals the number of documents, i.e. for the last document it saves.
Complementing #kamilitw's answer, I use the length of the cursor to build the JSON file correctly, using count() and if-else:
def writeToJSONFile(collection):
    cursor = collection.find({})
    file = open("collection.json", "w")
    file.write('[')
    qnt_cursor = 0
    for document in cursor:
        qnt_cursor += 1
        num_max = cursor.count()
        if (num_max == 1):
            file.write(json.dumps(document, indent=4, default=json_util.default))
        elif (num_max >= 1 and qnt_cursor <= num_max-1):
            file.write(json.dumps(document, indent=4, default=json_util.default))
            file.write(',')
        elif (qnt_cursor == num_max):
            file.write(json.dumps(document, indent=4, default=json_util.default))
    file.write(']')
    return file
So the JSON file will be correct in the end: before it was written like [{"test": "test"},], now it is written like [{"test":"test1"},{"test":"test2"}].
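Note that Cursor.count() was removed in PyMongo 4, so on recent driver versions a variant of the same idea could use collection.count_documents() instead (a sketch, not part of the original answer):
import json
from bson import json_util

def writeToJSONFile(collection):
    # count_documents() replaces the removed Cursor.count() in PyMongo 4+
    num_max = collection.count_documents({})
    with open("collection.json", "w") as file:
        file.write('[')
        for i, document in enumerate(collection.find({}), 1):
            file.write(json.dumps(document, indent=4, default=json_util.default))
            if i < num_max:
                file.write(',')
        file.write(']')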
"""
#Author: Aseem Jain
#profile: https://www.linkedin.com/in/premaseem/
"""
import os
import pymongo
# configure credentials / db name
db_user = os.environ["MONGO_ATLAS_USER"]
db_pass = os.environ["MONGO_ATLAS_PASSWORD"]
db_name = "sample_mflix"
connection_string = f"mongodb+srv://{db_user}:{db_pass}#sharedcluster.lv3wx.mongodb.net/{db_name}?retryWrites=true&w=majority"
client = pymongo.MongoClient(connection_string)
db = client[db_name]
# create database back directory with db_name
os.makedirs(db_name, exist_ok=True)
# list all tables in database
tables = db.list_collection_names()
# dump all tables in db
for table in tables:
print("exporting data for table", table )
data = list(db[table].find())
# write data in json file
with open(f"{db.name}/{table}.json","w") as writer:
writer.write(str(data))
exit(0)
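One caveat: writer.write(str(data)) saves a Python repr (single quotes, ObjectId(...)), not valid JSON. A small variant of the write step using bson.json_util.dumps would produce parseable JSON instead (a sketch reusing db and tables from the script above, not part of the original answer):
from bson.json_util import dumps

for table in tables:
    print("exporting data for table", table)
    data = list(db[table].find())
    # dumps() serializes ObjectId, datetime, etc. into MongoDB Extended JSON
    with open(f"{db.name}/{table}.json", "w") as writer:
        writer.write(dumps(data, indent=4))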
Using pymongo's json_util:
from bson.json_util import dumps
from pymongo import MongoClient

db_client = MongoClient(mongo_connection_string)
collection = db_client.db_name.collection_name

# open the file once and write one JSON document per line
with open("collection.json", 'w') as file:
    for document in collection.find():
        file.write(dumps(document))
        file.write('\n')

How to create an index using Whoosh

I am trying to use Whoosh for text searching for the first time. I want to search for documents containing the word "XML". But because I am new to Whoosh, I just wrote a program that searches for a word in a document, where the document is a text file (myRoko.txt):
import os, os.path
from whoosh import index
from whoosh.index import open_dir
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import *

if not os.path.exists("indexdir3"):
    os.mkdir("indexdir3")

schema = Schema(name=ID(stored=True), content=TEXT)
ix = index.create_in("indexdir3", schema)
writer = ix.writer()

path = "myRoko.txt"
with open(path, "r") as f:
    content = f.read()
f.close()

writer.add_document(name=path, content=content)
writer.commit()

ix = open_dir("indexdir3")
query_b = QueryParser('content', ix.schema).parse('XML')

with ix.searcher() as srch:
    res_b = srch.search(query_b)
    print res_b[0]
The above code is supposed to print the document that contain the word "XML". However the code return the following error:
raise ValueError("%r is not unicode or sequence" % value)
ValueError: 'A large number of documents are now represented and stored
as XML document on the web. Thus ................
What could be the cause of this error?
You have a Unicode problem. You should pass unicode strings to the indexer. For that, you need to open the text file as unicode:
import codecs
with codecs.open(path, "r", "utf-8") as f:
    content = f.read()
and use unicode string for file name:
path = u"myRoko.txt"
After fixes I got this result:
<Hit {'name': u'myRoko.txt'}>
writer.add_document(name=unicode(path), content=unicode(content))
It has to be UNICODE
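Putting the two fixes together, the indexing part of the question's script might look like this under Python 2 (a sketch combining the answers above, not a separate solution):
import os
import codecs
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

if not os.path.exists("indexdir3"):
    os.mkdir("indexdir3")

schema = Schema(name=ID(stored=True), content=TEXT)
ix = index.create_in("indexdir3", schema)
writer = ix.writer()

path = u"myRoko.txt"  # unicode file name, as suggested above
with codecs.open(path, "r", "utf-8") as f:  # decode the file contents to unicode
    content = f.read()

writer.add_document(name=path, content=content)
writer.commit()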
