I retrieve all the data from the Lead object in Salesforce with Python and save it to a CSV file, but since there is a lot of information I get a Python MemoryError.
**This code raises the Python MemoryError**
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'

def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead Column setting to be acquired
    columns = [
        "CreatedDate"
    ]
    sosl = 'SELECT {0[0]} FROM Lead'.format(
        columns)

    # Data acquisition with SOSL
    data = sf.query_all(sosl)

    # Delete CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="") as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    writer.writerow(data)

if __name__ == '__main__':
    main()
That's why, when there are more than 1000 rows, I want the CSV files recorded as follows:
1. output1.csv (1000 rows)
2. output2.csv (1000 rows)
3. output3.csv ...
And I get the following error. What do I need to do to get output like this?
I wanted to split the CSV, so I passed iterator=True and chunksize=1000 to open().
Code
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'

def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead Column setting to be acquired
    columns = [
        "CreatedDate"
    ]
    sosl = 'SELECT {0[0]} FROM Lead'.format(
        columns)

    # Data acquisition with SOSL
    data = sf.query_all(sosl)

    # Delete CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="", iterator=True, chunksize=1000) as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    writer.writerow(data)

if __name__ == '__main__':
    main()
Error message
Traceback (most recent call last):
File "c:/Users/test/Documents/test/test5.py", line 44, in <module>
main()
File "c:/Users/test/Documents//test5.py", line 36, in main
with open(output_csv, 'w', newline="",iterator=True,chunksize=1000) as f:
TypeError: 'iterator' is an invalid keyword argument for open()
I thought this approach would avoid the Python memory error. Is there another way to do this?
If anyone knows, please let me know.
data = sf.query_all(sosl)
This call retrieves all information into memory for the given query, which is SOQL, not SOSL.
Instead, use
data = sf.query_all_iter(sosl)
and iterate over the resulting iterator instead of data.items(), which will be much more memory-efficient as it won't attempt to retrieve all items at once.
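To also split the output into files of 1000 rows each, you can count records as you iterate and start a new file whenever the counter rolls over. A minimal sketch building on query_all_iter, assuming the same Lead columns as above (CHUNK_SIZE and the output1.csv/output2.csv naming are just illustrative):

from simple_salesforce import Salesforce
import csv

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'
CHUNK_SIZE = 1000  # rows per CSV file

def main():
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    columns = ["CreatedDate"]
    soql = 'SELECT {0} FROM Lead'.format(', '.join(columns))

    # query_all_iter returns a lazy generator, so records are fetched
    # page by page instead of all being held in memory at once.
    records = sf.query_all_iter(soql)

    f = None
    writer = None
    file_number = 0
    for i, record in enumerate(records):
        if i % CHUNK_SIZE == 0:
            # Start a new file every CHUNK_SIZE rows: output1.csv, output2.csv, ...
            if f is not None:
                f.close()
            file_number += 1
            f = open('output{}.csv'.format(file_number), 'w', newline='')
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
        record.pop('attributes', None)  # drop Salesforce metadata before writing
        writer.writerow(record)
    if f is not None:
        f.close()

if __name__ == '__main__':
    main()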
Related
This is my JSON file:
{"_index":"core-bvd-locations","_type":"_doc","_id":"75b82cba4a80784f4fa36d14c86f6d85","_score":1,"_source":{"a_id":"FR518077177","a_id_type":"BVD ID","a_name":"Moisan Patrick Roger","a_name_normal":"MOISAN PATRICK ROGER","a_country_code":"FR","a_country":"France","a_in_compliance_db":false,"a_nationality":"FR","a_street_address":"Les Carmes","a_city":"Decize","a_postcode":"58300","a_region":"Bourgogne-Franche-Comte|Nievre","a_phone":"+33 603740000","a_latitude":46.79402777777778,"a_longitude":3.496277777777778,"a_national_ids":{"European VAT number":["FR58 518077177"],"SIREN number":["518077177"],"TIN":["518077177"],"SIRET number":["518077177-00013"]},"relationship":"Location info","file_name":"/media/hedwig/iforce/data/BvD/s3-transfer/SuperTable_v3_json/locations/part-00021-1f62c713-17a0-410d-9b18-32328d9836d6-c000.json","a_geo_point":{"lat":46.79402777777778,"lon":3.496277777777778}}}
This is my code:
import csv
import json
import sys
import codecs

def trans(path):
    jsonData = codecs.open('F:\\1.json', 'r', 'utf-8')
    # csvfile = open(path + '.csv', 'w')
    # csvfile = open(path + '.csv', 'wb')  # python2
    csvfile = open('F:\\1.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csvfile, delimiter=',')
    flag = True
    for line in jsonData:
        dic = json.loads(line)
        if flag:
            keys = list(dic.keys())
            print(keys)
            writer.writerow(keys)
            flag = False
        writer.writerow(list(dic.values()))
    jsonData.close()
    csvfile.close()

if __name__ == '__main__':
    path = str(sys.argv[0])
    print(path)
    trans(path)
C:\Users\jeri\PycharmProjects\pythonProject9\venv\Scripts\python.exe C:\Users\jeri\PycharmProjects\pythonProject9\zwc_count_file.py
C:\Users\jeri\PycharmProjects\pythonProject9\zwc_count_file.py
['_index', '_type', '_id', '_score', '_source']
Process finished with exit code 0
The information nested inside the JSON file cannot be parsed. How can I modify the code?
import json
import pandas as pd
file_data = open("json_filname.json",'r').read()
data= json.loads(file_data)
df = pd.json_normalize(data)
df
json.load(): json.load() accepts a file object, parses the JSON data, populates a Python dictionary with the data, and returns it back to you.
import json
# Opening JSON file
f = open('data.json')
# returns JSON object as
# a dictionary
data = json.load(f)
writer.writerow writes the entire row; the right syntax is:
writer.writerow(iterable_object)
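Applying that to the nested file in the question: one option is to flatten each JSON document before handing its keys and values to writer.writerow, so the fields inside _source become their own columns. A rough sketch, assuming one JSON document per line in F:\1.json; the flatten helper and the dotted column names are illustrative additions, not part of the original code:

import csv
import json
import codecs

def flatten(d, parent_key=''):
    # Recursively flatten nested dicts, joining keys with a dot
    # (e.g. "_source.a_city"), so each leaf value gets its own column.
    items = {}
    for key, value in d.items():
        new_key = parent_key + '.' + key if parent_key else key
        if isinstance(value, dict):
            items.update(flatten(value, new_key))
        else:
            items[new_key] = value
    return items

def trans(json_path, csv_path):
    with codecs.open(json_path, 'r', 'utf-8') as jsonData, \
         open(csv_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        flag = True
        for line in jsonData:
            dic = flatten(json.loads(line))
            if flag:
                writer.writerow(list(dic.keys()))  # header row from the first record
                flag = False
            writer.writerow(list(dic.values()))

if __name__ == '__main__':
    trans('F:\\1.json', 'F:\\1.csv')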
How do I export a pymongo collection to a JSON file? I have a huge collection of approximately 1 GB of data and need an efficient way to export it and create a JSON file from it.
I am using the code below, as written in one of the Stack Overflow answers.
def getJSONFromDB():
    db = GetMongo_client()
    collection = db['collection_name']
    cursor = collection.find({})
    file = open("collection.json", "w")
    file.write('[')
    for document in cursor:
        file.write(json.dumps(document))
        file.write(',')
    file.write(']')
But it gives me the following error:
TypeError: Object of type ObjectId is not JSON serializable
The pymongo documentation you pointed to is obsolete. If you're using version 1.7, I recommend updating. With a more recent version you can do this:
from bson.json_util import dumps
dumps(l)
https://pymongo.readthedocs.io/en/stable/api/bson/json_util.html
Side answer: u'name', u'date', u'_id' etc are the names of the fields of the document on the database.
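Applied to the loop in the question, that might look like the sketch below: bson.json_util.dumps replaces json.dumps so ObjectId values serialize, and the comma handling avoids a trailing comma so the file is one valid JSON array (the client and collection names are placeholders, not the asker's GetMongo_client helper):

import pymongo
from bson.json_util import dumps

def getJSONFromDB():
    client = pymongo.MongoClient()            # placeholder for GetMongo_client()
    db = client['db_name']                    # placeholder database name
    cursor = db['collection_name'].find({})

    with open("collection.json", "w") as f:
        f.write('[')
        first = True
        for document in cursor:
            if not first:
                f.write(',')                   # comma only between documents
            f.write(dumps(document))           # json_util handles ObjectId, dates, etc.
            first = False
        f.write(']')

if __name__ == '__main__':
    getJSONFromDB()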
Here is an end-to-end solution.
Step 1: export all needed collections to JSON files.
Step 2: import those JSON files into another database.
import pymongo
import os
import json
from bson.json_util import dumps
import sys
import Consts

def replace_word(infile, old_word, new_word):
    if not os.path.isfile(infile):
        print("Error on replace_word, not a regular file: " + infile)
        sys.exit(1)
    f1 = open(infile, 'r').read()
    f2 = open(infile, 'w')
    m = f1.replace(old_word, new_word)
    f2.write(m)

def replace_collection(db_name_list, client_prod, client_stg):
    for db_name in db_name_list:
        db_prod = client_prod[db_name]
        os.makedirs(db_name, exist_ok=True)
        tables_prod = db_prod.list_collection_names()
        print(tables_prod)
        for table in tables_prod:
            print("exporting data for table", table)
            data = list(db_prod[table].find())
            json_data = dumps(data, indent=4)
            # write data in json file
            with open(f"{db_prod.name}/{table}.json", 'w') as file:
                file.write(json_data)
            replace_word(db_prod.name + "/" + table + ".json", "$oid", "oid")
        db_stg = client_stg[db_name]
        tables_stg = db_stg.list_collection_names()
        print(tables_stg)
        for table in tables_stg:
            db_stg.drop_collection(table)
        for table in tables_prod:
            try:
                with open(f"{db_prod.name}/{table}.json") as read_file:
                    data = json.load(read_file)
                print(data)
                client_stg[db_name][table].insert_many(data)
            except:
                print("empty list")
        db_stg = client_stg[db_name]
        tables_stg = db_stg.list_collection_names()
        print(tables_stg)

if __name__ == "__main__":
    db_name_list = Consts.db_lists
    client_prod = pymongo.MongoClient(Consts.from_bd)
    client_stg = pymongo.MongoClient(Consts.to_db)
    replace_collection(db_name_list, client_prod, client_stg)
Below is the JSON structure I am pulling from my online weather station. I am also including a json-to-csv Python script that is supposed to convert the JSON data to CSV output, but it only returns a KeyError. I want to pull data from "current_observation" only.
{
  "response": {
    "features": {
      "conditions": 1
    }
  },
  "current_observation": {
    "display_location": {
      "latitude": "40.466442",
      "longitude": "-85.362709",
      "elevation": "280.4"
    },
    "observation_time_rfc822": "Fri, 26 Jan 2018 09:40:16 -0500",
    "local_time_rfc822": "Sun, 28 Jan 2018 11:22:47 -0500",
    "local_epoch": "1517156567",
    "local_tz_short": "EST",
    "weather": "Clear",
    "temperature_string": "44.6 F (7.0 C)"
  }
}
import csv, json, sys

inputFile = open("pywu.cache.json", 'r')   # open json file
outputFile = open("CurrentObs.csv", 'w')   # load csv file

data = json.load(inputFile)  # load json content
inputFile.close()            # close the input file

output = csv.writer(outputFile)  # create a csv.writer
output.writerow(data[0].keys())

for row in data:
    output.writerow(row.values())  # values row
What's the best method to retrieve the temperature string and convert to .csv format? Thank you!
import pandas as pd
df = pd.read_json("pywu.cache.json")
df = df.loc[["local_time_rfc822", "weather", "temperature_string"],"current_observation"].T
df.to_csv("pywu.cache.csv")
Maybe pandas can help you here. The .read_json() function creates a nice DataFrame, from which you can easily choose the desired rows and columns. And it can save as CSV as well.
To add latitude and longitude to the CSV line, you can do this:
df = pd.read_json("pywu.cache.json")
df = df.loc[["local_time_rfc822", "weather", "temperature_string", "display_location"],"current_observation"].T
df = df.append(pd.Series([df["display_location"]["latitude"], df["display_location"]["longitude"]], index=["latitude", "longitude"]))
df = df.drop("display_location")
df.to_csv("pywu.cache.csv")
To print the location as numeric values, you can do this:
df = pd.to_numeric(df, errors="ignore")
print(df['latitude'], df['longitude'])
This will find all keys (e.g. "temperature_string") specified inside of the json blob and then write them to a csv file. You can modify this code to get multiple keys.
import csv, json, sys

def find_deep_value(d, key):
    # Find the value of keys hidden within a dict[dict[...]]
    # Modified from https://stackoverflow.com/questions/9807634/find-all-occurrences-of-a-key-in-nested-python-dictionaries-and-lists
    # @param d dictionary to search through
    # @param key to find
    if key in d:
        yield d[key]
    for k in d.keys():
        if isinstance(d[k], dict):
            for j in find_deep_value(d[k], key):
                yield j

inputFile = open("pywu.cache.json", 'r')  # open json file
outputFile = open("mypws.csv", 'w')       # load csv file

data = json.load(inputFile)  # load json content
inputFile.close()            # close the input file

output = csv.writer(outputFile)  # create a csv.writer

# Gives you a list of temperature_strings from within the json
temps = list(find_deep_value(data, "temperature_string"))
output.writerow(temps)
outputFile.close()
I have a CSV file with several hundred organism IDs and a second CSV file with several thousand organism IDs plus additional characteristics (taxonomic information, abundances per sample, etc.).
I am trying to write code that extracts the information from the larger CSV using the smaller CSV file as a reference. It should look at both files and, if an ID is in both, extract all the information for that ID from the larger file and write it to a new file (basically write the entire row for that ID).
So far I have written the following, and while the code does not error out on me, I get a blank file in the end and I don't know exactly why. I am a graduate student who knows some simple coding, but I'm still very much a novice.
Thank you.
import sys
import csv
import os.path

SparCCnames = open(sys.argv[1], "rU")
OTU_table = open(sys.argv[2], "rU")
new_file = open(sys.argv[3], "w")

Sparcc_OTUs = csv.writer(new_file)

d = csv.DictReader(SparCCnames)
ids = csv.DictReader(OTU_table)

for record in ids:
    idstopull = record["OTUid"]
    if idstopull[0] == "OTUid":
        continue
    if idstopull[0] in d:
        new_id.writerow[idstopull[0]]

SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code but you can try this:
import csv

def csv_to_dict(csv_file_path):
    csv_file = open(csv_file_path, 'rb')
    csv_file.seek(0)
    sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
    csv_file.seek(0)
    dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
    csv_file.seek(0)

    dict_data = []
    for record in dict_reader:
        dict_data.append(record)
    csv_file.close()

    return dict_data

def dict_to_csv(csv_file_path, dict_data):
    csv_file = open(csv_file_path, 'wb')
    writer = csv.writer(csv_file, dialect='excel')

    headers = dict_data[0].keys()
    writer.writerow(headers)

    # headers must be the same with dat.keys()
    for dat in dict_data:
        line = []
        for field in headers:
            line.append(dat[field])
        writer.writerow(line)

    csv_file.close()

if __name__ == "__main__":
    big_csv = csv_to_dict('/path/to/big_csv_file.csv')
    small_csv = csv_to_dict('/path/to/small_csv_file.csv')

    output = []
    for s in small_csv:
        for b in big_csv:
            if s['id'] == b['id']:
                output.append(b)

    if output:
        dict_to_csv('/path/to/output.csv', output)
    else:
        print "Nothing."
Hope that will help.
You need to read the data into a data structure. Assuming OTUid is unique, you can store it in a dictionary for fast lookup:
with open(sys.argv[1], "rU") as SparCCnames:
    d = csv.DictReader(SparCCnames)
    fieldnames = d.fieldnames
    data = {i['OTUid']: i for i in d}

with open(sys.argv[2], "rU") as OTU_table, open(sys.argv[3], "w") as new_file:
    Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
    ids = csv.DictReader(OTU_table)
    for record in ids:
        if record['OTUid'] in data:
            Sparcc_OTUs.writerow(data[record['OTUid']])
Thank you everyone for your help. I played with things and consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv

input_file = csv.DictReader(open(sys.argv[1], "rU"))  # has all info
ref_list = csv.DictReader(open(sys.argv[2], "rU"))    # reference list

output_file = csv.DictWriter(
    open(sys.argv[3], "w"), input_file.fieldnames)    # to write output file with headers
output_file.writeheader()                             # write headers in output file

white_list = {}                             # create empty dictionary
for record in ref_list:                     # for every line in my reference list
    white_list[record["Sample_ID"]] = None  # store the IDs into the dictionary as keys

for record in input_file:                   # for every line in my input file
    record_id = record["Sample_ID"]         # store IDs into variable record_id
    if (record_id in white_list):           # if the ID is in the reference list
        output_file.writerow(record)        # write the entire row into the new file
    else:                                   # if it is not in my reference list
        continue                            # ignore it and continue iterating through the file
Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names; the reader uses CSV to read the file and the field names to dump the file to a JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader(line, fieldnames), which loops through each line, but it dumps the entire file on one line, then dumps the entire file again on the next line, and continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on its own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"
The problem with your desired output is that it is not a valid JSON document; it's a stream of JSON documents!
That's okay if it's what you need, but it means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')
You can use a Pandas DataFrame to achieve this, with the following example:
import pandas as pd
csv_file = pd.DataFrame(pd.read_csv("path/to/file.csv", sep = ",", header = 0, index_col = False))
csv_file.to_json("/path/to/new/file.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
import csv
import json

file = 'csv_file_name.csv'
json_file = 'output_file_name.json'

# Read CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert csv data into json
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))  # pretty-printed
        # f.write(json.dumps(data))  # compact alternative

read_CSV(file, json_file)
Documentation of json.dumps()
I took #SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys
for row in csv.DictReader(sys.stdin):
    json.dump(row, sys.stdout)
    sys.stdout.write('\n')
You can try this
import csvmapper

# how does the object look
mapper = csvmapper.DictMapper([
    [
        {'name': 'FirstName'},
        {'name': 'LastName'},
        {'name': 'IDNumber', 'type': 'int'},
        {'name': 'Messages'}
    ]
])

# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)

print converter.doConvert(pretty=True)
Edit:
Simpler approach
import csvmapper

fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = csvmapper.CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)

print converter.doConvert(pretty=True)
I see this is old, but I needed the code from SingleNegationElimination. However, I had an issue with the data containing non-UTF-8 characters. These appeared in fields I was not overly concerned with, so I chose to ignore them, although that took some effort. I am new to Python, so with some trial and error I got it to work. The code is a copy of SingleNegationElimination's with extra handling of UTF-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The code below worked.
import csv, json

csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("Scope", "Comment", "OOS Code", "In RMF", "Code", "Status", "Name", "Sub Code", "CAT", "LOB", "Description", "Owner", "Manager", "Platform Owner")
reader = csv.DictReader(csvfile, fieldnames)

code = ''
for row in reader:
    try:
        print('+' + row['Code'])
        for key in row:
            row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
        json.dump(row, jsonfile)
        jsonfile.write('\n')
    except:
        print('-' + row['Code'])
        raise
Add the indent parameter to json.dumps
data = {'this': ['has', 'some', 'things'],
'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)
Use pandas and the json library:
import pandas as pd
import json
filepath = "inputfile.csv"
output_path = "outputfile.json"
df = pd.read_csv(filepath)
# Create a multiline json
json_list = json.loads(df.to_json(orient = "records"))
with open(output_path, 'w') as f:
    for item in json_list:
        f.write(json.dumps(item) + "\n")  # write each record as valid JSON on its own line
How about using Pandas to read the csv file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values) and finally converting the DataFrame back to JSON (pd.DataFrame.to_json).
Note: I haven't checked how efficient this will be but this is definitely one of the easiest ways to manipulate and convert a large csv to json.
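A minimal sketch of that approach, assuming the same headerless sample input as above; to_json with orient="records" and lines=True writes one JSON object per line, which matches the desired output:

import pandas as pd

# Read the headerless CSV, naming the columns explicitly.
df = pd.read_csv('file.csv',
                 names=("FirstName", "LastName", "IDNumber", "Message"))

# Manipulate columns here if needed, e.g.:
# df = df.drop(columns=["Message"])

# Write newline-delimited JSON: one record per line.
df.to_json('file.json', orient='records', lines=True)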
As a slight improvement to @MONTYHS' answer, iterating through a tuple of fieldnames:
import csv
import json
csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'
csvfile = open(csvfilename, 'r')
jsonfile = open(jsonfilename, 'w')
reader = csv.DictReader(csvfile)
fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')
output = []
for each in reader:
    row = {}
    for field in fieldnames:
        row[field] = each[field]
    output.append(row)
json.dump(output, jsonfile, indent=2, sort_keys=True)
import csv
import json

def read():
    noOfElem = 200  # no of data you want to import
    csv_file_name = "hashtag_donaldtrump.csv"  # csv file name
    json_file_name = "hashtag_donaldtrump.json"  # json file name

    with open(csv_file_name, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with open(json_file_name, 'w') as json_file:
            i = 0
            json_file.write("[")

            for row in csv_reader:
                i = i + 1
                if i == noOfElem:
                    json_file.write("]")
                    return

                json_file.write(json.dumps(row))
                if i != noOfElem - 1:
                    json_file.write(",")

Change the above three parameters and everything will be done.
import csv
import json

csvfile = csv.DictReader(open('filename.csv', 'r'))

output = []
for each in csvfile:
    row = {}
    row['FirstName'] = each['FirstName']
    row['LastName'] = each['LastName']
    row['IDNumber'] = each['IDNumber']
    row['Message'] = each['Message']
    output.append(row)

json.dump(output, open('filename.json', 'w'), indent=4, sort_keys=False)