How to push CSV data to MongoDB using Python

I'm trying to push CSV data into MongoDB using Python. I'm a beginner with both Python and MongoDB. I used the following code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient

# CSV to JSON conversion
csvfile = open('C://test//final-current.csv', 'r')
jsonfile = open('C://test//6.json', 'a')
reader = csv.DictReader(csvfile)
header = ["S.No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V", "Time"]
#fieldnames=header
output = []
for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    output.append(row)
json.dump(output, jsonfile, indent=None, sort_keys=False, encoding="UTF-8")

mongo_client = MongoClient()
db = mongo_client.october_mug_talk
db.segment.drop()
data = pd.read_csv('C://test//6.json', error_bad_lines=0)
df = pd.DataFrame(data)
records = csv.DictReader(df)
db.segment.insert(records)
but the output comes out in this format:
/* 0 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43134"),
    "[{\"AverageTradedPrice\":\"0\"" : "BuyPrice:\"349.75\""
}
/* 1 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43135"),
    "[{\"AverageTradedPrice\":\"0\"" : "BuyQuantity:\"3000\""
}
/* 2 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43136"),
    "[{\"AverageTradedPrice\":\"0\"" : "ClosePrice:\"350\""
}
/* 3 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43137"),
    "[{\"AverageTradedPrice\":\"0\"" : "HighPrice:\"0\""
}
Actually, I want all the other fields to show up as subfields of a single _id, e.g.:
_id" : ObjectId("54891c4ffb2a0303b0d43137")
AveragetradedPrice :0
HighPrice:0
ClosePrice:350
buyprice:350.75
Please help me out. Thanks in advance.

Thank you for the suggestion. This is the corrected code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient

# CSV to JSON conversion
csvfile = open('C://test//final-current.csv', 'r')
reader = csv.DictReader(csvfile)

mongo_client = MongoClient()
db = mongo_client.october_mug_talk
db.segment.drop()

header = ["S No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V", "Time"]
for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    db.segment.insert(row)
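Note: Collection.insert was deprecated in PyMongo 3.0 and removed in 4.0, so on a current driver the per-row insert above needs insert_one instead. A minimal sketch of the same loop:
# insert_one replaces the deprecated insert() on PyMongo 3.x and later
for each in reader:
    row = {field: each[field] for field in header}
    db.segment.insert_one(row)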

Why do you insert data one by one? Take a look at this one.
import pandas as pd
from pymongo import MongoClient

client = MongoClient(<your_credentials>)
database = client['YOUR_DB_NAME']
collection = database['your_collection']

def csv_to_json(filename, header=None):
    data = pd.read_csv(filename, header=header)
    return data.to_dict('records')

collection.insert_many(csv_to_json('your_file_path'))
Be aware that this might crash your app if the file is too big.
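If the file is too large for memory, one hedged workaround is to stream it in chunks with pandas' chunksize parameter instead of loading everything at once. A minimal sketch, assuming the same client and collection names as above:
import pandas as pd
from pymongo import MongoClient

client = MongoClient()  # adjust credentials as needed
collection = client['YOUR_DB_NAME']['your_collection']

# read_csv with a chunksize yields DataFrames lazily,
# so only one chunk is held in memory at a time
for chunk in pd.read_csv('your_file_path', chunksize=10000):
    collection.insert_many(chunk.to_dict('records'))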

The easiest way is by using pandas. My code is:
import json
import pymongo
import pandas as pd

myclient = pymongo.MongoClient()
df = pd.read_csv('yourcsv.csv', encoding='ISO-8859-1')  # load the CSV file
df.to_json('yourjson.json')  # save it to a JSON file
jdf = open('yourjson.json').read()  # load the JSON file
data = json.loads(jdf)  # parse the JSON
Now you can insert this JSON into your MongoDB database. :-]
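One caveat: df.to_json() defaults to a column-oriented layout, so data above is a dict of columns rather than a list of row documents. A sketch of the missing insert step, using orient='records'; the database and collection names are placeholders:
# orient='records' emits one JSON object per row, which maps
# directly to one MongoDB document per row
records = json.loads(df.to_json(orient='records'))
myclient['mydb']['mycol'].insert_many(records)  # 'mydb' and 'mycol' are placeholders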

There is a better way with fewer imports, assuming you have a header row in your CSV.
from pymongo import MongoClient
import csv

# DB connectivity
client = MongoClient('localhost', 27017)
db = client.db
collection = db.collection

# Function to parse the CSV into a dictionary
def csv_to_dict():
    reader = csv.DictReader(open(FILEPATH))
    result = {}
    for row in reader:
        key = row.pop('First_value')
        result[key] = row
    return result

# Final insert statement
collection.insert_one(csv_to_dict())
Hope that helps

from pymongo import MongoClient
import csv
import json

# DB connectivity
client = MongoClient('localhost', 27017)
db = client["database name"]
col = db["collection"]

# Function to parse the CSV into a dictionary
def csv_to_dict():
    reader = csv.DictReader(open('File with path', 'r'))
    result = {}
    for row in reader:
        key = row.pop('id')
        result[key] = row
    return result

# Final insert statement
x = col.insert_one(csv_to_dict())
print(x.inserted_id)
That inserts one document. To insert many rows, the following code can be executed:
from pymongo import MongoClient
import csv

# read the CSV file as a list of dictionaries
client = MongoClient('localhost', 27017)
db = client["data base name"]
col = db["Collection Name"]

with open('File with path', 'r') as read_obj:
    # pass the file object to DictReader() to get the reader object
    csv_reader = csv.DictReader(read_obj)
    # pass the reader object to list() to get a list of dictionaries
    mylist = list(csv_reader)

x = col.insert_many(mylist)
# print the list of the _id values of the inserted documents:
print(x.inserted_ids)

Related

Inserting data using PyMongo based on a defined data model

I have a dataset consisting of 250 rows that looks like the following:
In MongoDB Compass, I inserted the first row as follows:
db.employees.insertOne({"employee_id": 412153,
    "first_name": "Carrol",
    "last_name": "Dhin",
    "email": "carrol.dhin@company.com",
    "managing": [{"manager_id": 412153, "employee_id": 174543}],
    "department": [{"department_name": "Accounting", "department_budget": 500000}],
    "laptop": [{"serial_number": "CSS49745",
                "manufacturer": "Lenovo",
                "model": "X1 Gen 10",
                "date_assigned": {$date: 01-15-2022},
                "installed_software": ["MS Office", "Adobe Acrobat", "Slack"]}]})
If I wanted to insert all 250 rows into the database using PyMongo in Python, how would I ensure that every row is entered following the format that I used when I inserted it manually in the Mongo shell?
from pymongo import MongoClient
import pandas as pd

client = MongoClient('localhost', 27017)
db = client.MD
collection = db.gammaCorp

df = pd.read_csv(' ')  # insert CSV name here

for i in df.index:
    data = {}  # build a fresh document for each row
    data['employee_id'] = df['employee_id'][i]
    data['first_name'] = df['first_name'][i]
    data['last_name'] = df['last_name'][i]
    data['email'] = df['email'][i]
    data['managing'] = [{'manager_id': df['employee_id'][i]}, {'employee_id': df['managing'][i]}]
    data['department'] = [{'department_name': df['department'][i]}, {'department_budget': df['department_budget'][i]}]
    data['laptop'] = [{'serial_number': df['serial_number'][i]}, {'manufacturer': df['manufacturer'][i]}, {'model': df['model'][i]}, {'date_assigned': df['date_assigned'][i]}, {'installed_software': df['installed_software'][i]}]
    collection.insert_one(data)
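For 250 rows the loop is fine, but the same documents can also be sent in a single round trip with insert_many. A sketch under the same column-name assumptions (the laptop field follows the same pattern and is omitted for brevity):
# build every document first, then insert them all in one call
docs = []
for i in df.index:
    docs.append({
        'employee_id': df['employee_id'][i],
        'first_name': df['first_name'][i],
        'last_name': df['last_name'][i],
        'email': df['email'][i],
        'managing': [{'manager_id': df['employee_id'][i]}, {'employee_id': df['managing'][i]}],
        'department': [{'department_name': df['department'][i]}, {'department_budget': df['department_budget'][i]}],
    })
collection.insert_many(docs)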

Databricks to Cosmos DB: uploading data is very slow

I have the code below, which has been running for five days and still has not completed. The input file is a CSV of around 9 GB with millions of rows. Is there any way to speed up uploading the data to Cosmos DB?
import json
import logging
import sys
import azure.cosmos.cosmos_client as cosmos_client
import azure.cosmos.exceptions as exceptions
from azure.cosmos.partition_key import PartitionKey
from typing import Optional

configs = {
    "dev": {
        "file_location": "/FileStore/tables/docs/dpidata_pfile_20050523-20221009.csv",
        "file_type": "csv",
        "infer_schema": False,
        "first_row_is_header": True,
        "delimiter": ",",
        "cdb_url": "https://xyxxxxxxxxxxxxxxx:443/",
        "db_name": "abc",
        "container_name": "dpi",
        "partition_key": "/dpi"
    },
    "stg": {},
    "prd": {}
}

class LoadToCdb():
    def dpi_data_load(self) -> Optional[bool]:
        try:
            # The applied options are for CSV files. For other file types, these will be ignored.
            df = spark.read.format(self.configs["file_type"]) \
                .option("inferSchema", self.configs["infer_schema"]) \
                .option("header", self.configs["first_row_is_header"]) \
                .option("sep", self.configs["delimiter"]) \
                .load(self.configs["file_location"])
            df = df.select('dpi', 'Entity Type Code')
            df = (df.withColumnRenamed("dpi", "dpi")
                  .withColumnRenamed("Entity Type Code", "entity_type_code"))

            df_json = df.toJSON()
            for row in df_json.collect():
                print(row)
                data = json.loads(row)
                data.setdefault('dpi', None)
                data["id"] = data["dpi"]
                # this method call will update to cosmos db
                self.cosmos_db.create_items(data)
        except Exception as e:
            self.log.error("Could not load to Cosmos DB from the CSV file")
            self.log.error(e)

load_to_cdb = LoadToCdb()
load_to_cdb.dpi_data_load()
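No answer was posted, but the bottleneck is visible in the loop: collect() pulls every row to the driver and documents are created one at a time, with a print per row on top. A hedged sketch of a faster shape, writing from each Spark partition in parallel with the azure-cosmos v4 client; the account URL, key, and database/container names below are placeholders:
import json
from azure.cosmos import CosmosClient

def write_partition(rows):
    # one client per partition, so the executors write in parallel
    # instead of funnelling 9 GB through the driver
    client = CosmosClient("https://<account>.documents.azure.com:443/", credential="<key>")
    container = client.get_database_client("abc").get_container_client("dpi")
    for row in rows:
        data = json.loads(row)
        data["id"] = str(data.get("dpi"))  # Cosmos DB requires a string id
        container.upsert_item(data)

df.toJSON().foreachPartition(write_partition)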

Sentiment Analysis data not showing up in csv file

I put data into a csv file (called "Essential Data_posts"). In my main, I extract a particular column from this file (called 'Post Texts') so that I can analyze the post texts for sentiment entity analysis using Google Cloud NLP. I then put this analysis in another csv file (called "SentimentAnalysis"). To do this, I put all of the information pertaining to sentiment entity analysis into an array (one for each piece of information).
The problem I am having is that when I execute my code, nothing shows up in the SentimentAnalysis file other than the headers (e.g. "Representative Name"). When I checked the lengths of all the arrays, I found that each had a length of 0, so no information was being added to them.
I am using Ubuntu 21.04 and Google Cloud Natural Language. I am running this all in Terminal, not the Google Cloud Platform. I am also using Python3 and emacs text editor.
from google.cloud import language_v1
import pandas as pd
import csv
import os

# lists we are appending to
representativeName = []
entity = []
salienceScore = []
entitySentimentScore = []
entitySentimentMagnitude = []
metadataNames = []
metadataValues = []
mentionText = []
mentionType = []

def sentiment_entity(postTexts):
    client = language_v1.LanguageServiceClient()

    type_ = language_v1.Document.Type.PLAIN_TEXT
    language = "en"
    document = {"content": post_texts, "type": type_, "language": language}

    encodingType = language_v1.EncodingType.UTF8
    response = client.analyze_entity_sentiment(request = {'document': document, 'encoding type': encodingType})

    # loop through entities returned from the API
    for entity in response.entities:
        representativeName.append(entity.name)
        entity.append(language_v1.Entity.Type(entity.type_).name)
        salienceScore.append(entity.salience)
        entitySentimentScore.append(sentiment.score)
        entitySentimentMagnitude.append(sentiment.magnitude)

    # loop over metadata associated with entity
    for metadata_name, metadata_value in entity.metadata.items():
        metadataNames.append(metadata_name)
        metadataValues.append(metadata_value)

    # loop over the mentions of this entity in the input document
    for mention in entity.mentions:
        mentionText.append(mention.text.content)
        mentionType.append(mention.type_)

    # put the lists into the csv file (using pandas)
    data = {
        "Representative Name": representativeName,
        "Entity": entity,
        "Salience Score": salienceScore,
        "Entity Sentiment Score": entitySentimentScore,
        "Entity Sentiment Magnitude": entitySentimentMagnitude,
        "Metadata Name": metadataNames,
        "Metadata Value": metadataValues,
        "Mention Text": mentionText,
        "Mention Type": mentionType
    }
    df = pd.DataFrame(data)
    df
    df.to_csv("SentimentAnalysis.csv", encoding='utf-8', index=False)

def main():
    import argparse

    # read the csv file containing the post text we need to analyze
    filename = open('Essential Data_posts.csv', 'r')
    # create DictReader object
    file = csv.DictReader(filename)

    postTexts = []
    # iterate over each row and append values to the list
    for col in file:
        postTexts.append(col['Post Text'])

    parser = arg.parse.ArgumentParser()
    parser.add_argument("--postTexts", type=str, default=postTexts)
    args = parser.parse_args()
    sentiment_entity(args.postTexts)
I tried running your code and I encountered the following errors:
You did not use the passed parameter postTexts in sentiment_entity(), so it errors at document = {"content": post_texts, "type": type_, "language": language}.
A list cannot be passed to "content": it should be a string. See the Document reference.
In the request variable, 'encoding type' should be 'encoding_type'.
The local loop variable entity should not have the same name as the list entity = []. Python will try to append values to the loop variable entity, which is not a list.
It should be entity.sentiment.score and entity.sentiment.magnitude instead of sentiment.score and sentiment.magnitude.
The loops for metadata and mentions should be inside the loop for entity in response.entities:.
I edited your code and fixed the errors mentioned above. In your main(), I included a step to convert the list postTexts to a string so it can be used in your sentiment_entity() function. metadataNames and metadataValues are temporarily commented out since I do not have an example that would populate these values.
from google.cloud import language_v1
import pandas as pd
import csv
import os

# lists we are appending to
representativeName = []
entity_arr = []
salienceScore = []
entitySentimentScore = []
entitySentimentMagnitude = []
metadataNames = []
metadataValues = []
mentionText = []
mentionType = []

def listToString(s):
    """Transform a list into a string"""
    str1 = " "
    return str1.join(s)

def sentiment_entity(postTexts):
    client = language_v1.LanguageServiceClient()

    type_ = language_v1.Document.Type.PLAIN_TEXT
    language = "en"
    document = {"content": postTexts, "type_": type_, "language": language}

    encodingType = language_v1.EncodingType.UTF8
    response = client.analyze_entity_sentiment(request = {'document': document, 'encoding_type': encodingType})

    # loop through entities returned from the API
    for entity in response.entities:
        representativeName.append(entity.name)
        entity_arr.append(language_v1.Entity.Type(entity.type_).name)
        salienceScore.append(entity.salience)
        entitySentimentScore.append(entity.sentiment.score)
        entitySentimentMagnitude.append(entity.sentiment.magnitude)

        # loop over the mentions of this entity in the input document
        for mention in entity.mentions:
            mentionText.append(mention.text.content)
            mentionType.append(mention.type_)

        # loop over metadata associated with this entity
        for metadata_name, metadata_value in entity.metadata.items():
            metadataNames.append(metadata_name)
            metadataValues.append(metadata_value)

    data = {
        "Representative Name": representativeName,
        "Entity": entity_arr,
        "Salience Score": salienceScore,
        "Entity Sentiment Score": entitySentimentScore,
        "Entity Sentiment Magnitude": entitySentimentMagnitude,
        #"Metadata Name": metadataNames,
        #"Metadata Value": metadataValues,
        "Mention Text": mentionText,
        "Mention Type": mentionType
    }
    df = pd.DataFrame(data)
    df.to_csv("SentimentAnalysis.csv", encoding='utf-8', index=False)

def main():
    # read the csv file containing the post text we need to analyze
    filename = open('test.csv', 'r')
    # create DictReader object
    file = csv.DictReader(filename)

    postTexts = []
    # iterate over each row and append values to the list
    for col in file:
        postTexts.append(col['Post Text'])

    content = listToString(postTexts)  # convert list to string
    print(content)
    sentiment_entity(content)

if __name__ == "__main__":
    main()
test.csv:
col_1,Post Text
dummy,Grapes are good.
dummy,Bananas are bad.
When the code is run, the converted string is printed and SentimentAnalysis.csv is generated:
SentimentAnalysis.csv:
Representative Name,Entity,Salience Score,Entity Sentiment Score,Entity Sentiment Magnitude,Mention Text,Mention Type
Grapes,OTHER,0.8335162997245789,0.800000011920929,0.800000011920929,Grapes,2
Bananas,OTHER,0.16648370027542114,-0.699999988079071,0.699999988079071,Bananas,2

Not getting expected output in python when converting a csv to json

I have an Excel file in which data is saved in CSV format as shown below, under column A (the CSV file is generated by LabVIEW code I wrote to generate the data). I have also attached an image of the CSV file for reference at the end of my question.
RPM,Load Current,Battery Output,Power Capacity
1200,30,12,37
1600,88,18,55
I want to create a JSON file in the following format:
{
    "power_capacity_data" :
    {
        "rpm" : ["1200","1600"],
        "load_curr" : ["30","88"],
        "batt_output" : ["12","18"],
        "power_cap" : ["37","55"]
    }
}
This is my code:
import csv
import json

def main():
    # created a dictionary so that I can append data to it afterwards
    power_data = {"rpm": [], "load_curr": [], "batt_output": [], "power_cap": []}
    with open('power1.lvm') as f:
        reader = csv.reader(f)
        # trying to append the data of the "RPM" column to the dictionary
        rowcount = 0
        for row in reader:
            if rowcount == 0:
                # trying to skip the first row
                rowcount = rowcount + 1
            else:
                power_data['rpm'].append(row[0])
            print(row)
    json_report = {}
    json_report['pwr_capacity_data'] = power_data
    with open('LVMJSON', "w") as f1:
        f1.write(json.dumps(json_report, sort_keys=False, indent=4, separators=(',', ': '), encoding="utf-8", ensure_ascii=False))
    f1.close()

if __name__ == "__main__":
    main()
The output JSON file that I am getting is this (please ignore the print(row) statement in my code):
{
    "pwr_capacity_data":
    {
        "load_curr": [],
        "rpm": [
            "1200,30,12.62,37.88",
            "1600,88,18.62,55.88"
        ],
        "batt_output": [],
        "power_cap": []
    }
}
The whole row is getting saved in the list, but I just want the values under the RPM column to be saved. Can someone help me out with what I may be doing wrong? Thanks in advance. I have attached an image of the CSV file just in case it helps.
You could use Python's defaultdict to make it a bit easier, along with a dictionary to map all your header values.
from collections import defaultdict
import csv
import json

power_data = defaultdict(list)
header_mappings = {
    'RPM': 'rpm',
    'Load Current': 'load_curr',
    'Battery Output': 'batt_output',
    'Power Capacity': 'power_cap'}

with open('power1.lvm', newline='') as f_input:
    csv_input = csv.DictReader(f_input)
    for row in csv_input:
        for key, value in row.items():
            power_data[header_mappings[key]].append(value)

with open('LVMJSON.json', 'w') as f_output:
    json.dump({'power_capacity_data': power_data}, f_output, indent=2)
Giving you an output JSON file looking like:
{
  "power_capacity_data": {
    "batt_output": [
      "12",
      "18"
    ],
    "power_cap": [
      "37",
      "55"
    ],
    "load_curr": [
      "30",
      "88"
    ],
    "rpm": [
      "1200",
      "1600"
    ]
  }
}

Python 2.7 CSV to JSON POST

I'm transferring my movie ratings from IMDB to Trakt. I use a Python script to do so and can't get it to turn my list into serializable JSON.
My script consists of a JSON uploader and a CSV reader; both work fine separately.
I've looked into list vs. tuple, json.dumps options and syntax and into json.encoder. There is a lot on the topic available online but no complete CSV to JSON example.
The following script includes all steps and a few lines of example data. If you want to test this script, you need the username, pass-SHA1 and API key of your Trakt account.
Current error:
raise TypeError(repr(o) + " is not JSON serializable")
TypeError: set(['["tt1535108", "Elysium", "8", "2013"]']) is not JSON serializable
#===============================================================================
# Used CSV file (imdb_ratings.csv)
#===============================================================================
# position,const,created,modified,description,Title,Title type,Directors,You rated,IMDb Rating,Runtime (mins),Year,Genres,Num. Votes,Release Date (month/day/year),URL
# 1,tt1683526,Sat Feb 1 00:00:00 2014,,,Detachment,Feature Film,Tony Kaye,8,7.7,97,2011,drama,36556,2011-04-25,http://www.imdb.com/title/tt1683526/
# 2,tt1205537,Wed Jan 29 00:00:00 2014,,,Jack Ryan: Shadow Recruit,Feature Film,Kenneth Branagh,6,6.6,105,2014,"action, mystery, thriller",11500,2014-01-15,http://www.imdb.com/title/tt1205537/
# 3,tt1535108,Tue Jan 28 00:00:00 2014,,,Elysium,Feature Film,Neill Blomkamp,8,6.7,109,2013,"action, drama, sci_fi, thriller",176354,2013-08-07,http://www.imdb.com/title/tt1535108/
#===============================================================================
# Imports etc.
#===============================================================================
import csv
import json
import urllib2

ifile = open('imdb_ratings.csv', "rb")
reader = csv.reader(ifile)
included_cols = [1, 5, 8, 11]

#===============================================================================
# CSV to JSON
#===============================================================================
rownum = 0
for row in reader:
    # Save header row.
    if rownum == 0:
        header = row
    else:
        content = list(row[i] for i in included_cols)
        print(content)
    rownum += 1
ifile.close()

#===============================================================================
# POST of JSON
#===============================================================================
data = {
    "username": "<username>",
    "password": "<SHA1>",
    "movies": [
        {
            # Expected format:
            # "imdb_id": "tt0114746",
            # "title": "Twelve Monkeys",
            # "year": 1995,
            # "rating": 9
            json.dumps(content)
        }
    ]
}

req = urllib2.Request('http://api.trakt.tv/rate/movies/<api>')
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data))
Construct the dict:
{
    "imdb_id": "tt0114746",
    "title": "Twelve Monkeys",
    "year": 1995,
    "rating": 9
}
instead of calling json.dumps(content), which creates a string.
You could create the list of dicts using a list comprehension and a dict comprehension:
movies = [{field: row[i] for field, i in zip(fields, included_cols)} for row in reader]
import csv
import json
import urllib2

with open('imdb_ratings.csv', "rb") as ifile:
    reader = csv.reader(ifile)
    next(reader)  # skip header row
    included_cols = [1, 5, 8, 11]
    fields = ['imdb_id', 'title', 'rating', 'year']
    movies = [{field: row[i] for field, i in zip(fields, included_cols)}
              for row in reader]

data = {"username": "<username>",
        "password": "<SHA1>",
        "movies": movies}
req = urllib2.Request('http://api.trakt.tv/rate/movies/<api>')
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data))
