Databricks to Cosmos DB uploading data is very slow - python

I have the code below, which reads an input file of around 9 GB of CSV with millions of rows. It has been executing for 5 days and still has not completed. Is there any way to speed up uploading the data to Cosmos DB?
import json
import logging
import sys
import azure.cosmos.cosmos_client as cosmos_client
import azure.cosmos.exceptions as exceptions
from azure.cosmos.partition_key import PartitionKey
from typing import Optional

configs = {
    "dev": {
        "file_location": "/FileStore/tables/docs/dpidata_pfile_20050523-20221009.csv",
        "file_type": "csv",
        "infer_schema": False,
        "first_row_is_header": True,
        "delimiter": ",",
        "cdb_url": "https://xyxxxxxxxxxxxxxxx:443/",
        "db_name": "abc",
        "container_name": "dpi",
        "partition_key": "/dpi"
    },
    "stg": {},
    "prd": {}
}

class LoadToCdb():
    def dpi_data_load(self) -> Optional[bool]:
        try:
            # The applied options are for CSV files. For other file types, these will be ignored.
            df = spark.read.format(self.configs["file_type"]) \
                .option("inferSchema", self.configs["infer_schema"]) \
                .option("header", self.configs["first_row_is_header"]) \
                .option("sep", self.configs["delimiter"]) \
                .load(self.configs["file_location"])

            df = df.select('dpi', 'Entity Type Code')
            df = (df.withColumnRenamed("dpi", "dpi")
                  .withColumnRenamed("Entity Type Code", "entity_type_code"))

            df_json = df.toJSON()
            for row in df_json.collect():
                print(row)
                data = json.loads(row)
                data.setdefault('dpi', None)
                data["id"] = data["dpi"]
                # this method call will update to cosmos db
                self.cosmos_db.create_items(data)
        except Exception as e:
            self.log.error("Could not able to load to cosmos db from csv file")
            self.log.error(e)

load_to_cdb = LoadToCdb()
load_to_cdb.dpi_data_load()
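One likely bottleneck in the code above is that df_json.collect() pulls every row back to the driver and then create_items is called once per row over a single connection. A faster pattern is to let the executors write the DataFrame in parallel. The sketch below assumes the Azure Cosmos DB Spark 3 OLTP connector (format "cosmos.oltp") is attached to the Databricks cluster, and cosmos_key is a placeholder for the account key (for example read from a secret scope):

# Sketch only: assumes the azure-cosmos-spark_3 connector is installed on the cluster
# and that cosmos_key holds the account's master key (not shown in the question).
cfg = configs["dev"]

df = (spark.read.format(cfg["file_type"])
      .option("header", cfg["first_row_is_header"])
      .option("sep", cfg["delimiter"])
      .load(cfg["file_location"])
      .select("dpi", "Entity Type Code")
      .withColumnRenamed("Entity Type Code", "entity_type_code"))

# Cosmos DB requires an "id" field; reuse the partition key value.
df = df.withColumn("id", df["dpi"])

(df.write.format("cosmos.oltp")
   .option("spark.cosmos.accountEndpoint", cfg["cdb_url"])
   .option("spark.cosmos.accountKey", cosmos_key)   # assumed secret, e.g. from dbutils.secrets
   .option("spark.cosmos.database", cfg["db_name"])
   .option("spark.cosmos.container", cfg["container_name"])
   .option("spark.cosmos.write.strategy", "ItemOverwrite")
   .mode("append")
   .save())

Even with parallel writes, throughput is ultimately capped by the RU/s provisioned on the target container, so raising it (or using autoscale) during the initial load also helps.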

Related

How to migrate a table which contains a JSON column from AWS RedShift to BigQuery in Python?

I want to migrate data tables from an AWS database to BigQuery. I have a specific table named sampletable which includes id, user_id and log. log is a JSON field that contains a dictionary of keys and their respective values.
'reason': {
    'id': 5,
    'name': 'Sample name',
    'contact': {
        'number': 123,
        'address': None
    }
},
'subreason': {
    'id': 80,
    'name': 'Sample name',
    'is_active': True,
    'created_at': '2022-07-18T18:33:28.911Z',
    'deleted_at': None,
    'complaint_id': 5,
},
This is the function that loads the data from the table to BigQuery:
def load_data(table_id, data):
    print("load_data::Writing records to table", table_id)
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("user_id", "INT64"),
            bigquery.SchemaField("log", "JSON"),
        ],
    )
    try:
        start = time.time()
        job = client.load_table_from_dataframe(
            data, table_id, job_config=job_config
        )
        job.result()
        end = time.time()
        print("load_data::Time taken for writing " + str(data.shape[0]) + " records: ", end - start, "s")
    except Exception as e:
        print("load_data::exception", e)
        print("load_data::Could not establish connection with Google BigQuery. Terminating program")
        conn.close()
        sys.exit()
However, an exception arises: "exception cannot mix list and non-list, non-null values".
I tried changing the schema in this way:
schema=[
    bigquery.SchemaField("id", "INT64"),
    bigquery.SchemaField("user_id", "INT64"),
    bigquery.SchemaField("log", "RECORD", fields=[
        bigquery.SchemaField("reason", "RECORD", fields=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("contact", "RECORD", fields=[
                bigquery.SchemaField("number", "STRING"),
                bigquery.SchemaField("address", "STRING")
            ])
        ]),
        bigquery.SchemaField("subreason", "RECORD", fields=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("is_active", "BOOLEAN"),
            bigquery.SchemaField("created_at", "TIMESTAMP"),
            bigquery.SchemaField("deleted_at", "TIMESTAMP"),
            bigquery.SchemaField("complaint_id", "INT64")
        ])
    ])
])
However, I get the exception "with type dict: was expecting tuple of (key, value) pair".
Can anyone guide me on this issue, as I am new to migrating JSON columns in tables? What is the proper way to modify the schema to accept JSON columns for migration?
You can try and consider the approach below.
In this approach, you load the data as the JSON data type in BigQuery. However, the JSON file needs some manual adjustment, since BigQuery accepts newline-delimited JSON for data ingestion. See the sample updated JSON file below.
{"log":{"reason":{"contact":{"address": null,"number": 123},"id": 5,"name": "Sample name"},"subreason": {"complaint_id": 5,"created_at": "2022-07-18T18:33:28.911Z","deleted_at": "None","id": 80,"is_active": true,"name": "Sample name"}}}
Notice that I compressed the JSON into one key named "log" and also compressed it into one line to satisfy newline-delimited JSON.
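If the adjustment needs to be automated rather than done by hand, a small sketch (assuming the source rows are already available as Python dicts, for example from the Redshift query) could wrap each row under a "log" key and write newline-delimited JSON:

import json

# Sketch: rows is assumed to be a list of dicts taken from the source table's log column.
rows = [
    {"reason": {"id": 5, "name": "Sample name", "contact": {"number": 123, "address": None}},
     "subreason": {"id": 80, "name": "Sample name", "is_active": True,
                   "created_at": "2022-07-18T18:33:28.911Z", "deleted_at": None, "complaint_id": 5}},
]

with open("your_json_file.json", "w") as out:
    for row in rows:
        # one JSON object per line, wrapped under the "log" key
        out.write(json.dumps({"log": row}) + "\n")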
Below is the python code I used to ingest the data:
table_id = "your-project.-your-dataset.your-table"
file_path = "/path/of/your_json_file.json"

def load_table_file(file_path, table_id):
    # [START bigquery_load_from_file]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,
        # write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("log", "JSON"),
        ],
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    # [END bigquery_load_from_file]
    return table

load_table_file(file_path, table_id)

How to load MySQL to Elasticsearch using Python

I have a table named employees.
I need to push the employees to an Elasticsearch index using Python.
import MySQLdb
import json
from elasticsearch import Elasticsearch
db = MySQLdb.connect("localhost", "admin", "password", "dbname")
cursor = db.cursor()
Here is my quick idea 😎
from sqlalchemy import create_engine
import pymysql
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Replace me
CONSTR = 'mysql+pymysql://root:#127.0.0.1'
sqlEngine = create_engine(CONSTR, pool_recycle=3600)
dbConnection = sqlEngine.connect()

df = pd.read_sql("select * from employees", dbConnection)
rows = df.to_dict(orient='records')  # list of dicts, one per table row

es = Elasticsearch()
actions = []
for item in rows:
    action = {
        # replace me if need to
        "_id": "employee_%s" % item['id'],
        "_source": item  # the document body to index
    }
    actions.append(action)

response = helpers.bulk(es, actions, index="employees", doc_type='_doc')
dbConnection.close()
Dump out a CSV file (SELECT .. INTO OUTFILE) from MySQL, then load that into Elasticsearch.
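A minimal sketch of that two-step approach, assuming the MySQL server is allowed to write to /var/lib/mysql-files/ (see secure_file_priv) and using hypothetical column names for the employees table, since SELECT ... INTO OUTFILE does not write a header row:

import csv
import MySQLdb
from elasticsearch import Elasticsearch, helpers

# Step 1: have MySQL dump the table to a CSV file on the server.
db = MySQLdb.connect("localhost", "admin", "password", "dbname")
cursor = db.cursor()
cursor.execute("""
    SELECT * FROM employees
    INTO OUTFILE '/var/lib/mysql-files/employees.csv'
    FIELDS TERMINATED BY ',' ENCLOSED BY '"'
    LINES TERMINATED BY '\\n'
""")

# Step 2: stream the CSV into Elasticsearch with the bulk helper.
fieldnames = ["id", "first_name", "last_name", "salary"]  # hypothetical columns
es = Elasticsearch()
with open('/var/lib/mysql-files/employees.csv') as f:
    reader = csv.DictReader(f, fieldnames=fieldnames)
    helpers.bulk(es, ({"_index": "employees", "_source": row} for row in reader))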

Query SQL Server JSON columns using SQLAlchemy

I'm looking for a way to replicate the functionality of SQL Server's JSON_VALUE function using a SQLAlchemy query. I'm using metadata.reflect to define my existing db tables in SQLAlchemy.
SQL:
SELECT Id,
JSON_VALUE(BankDataJSON,'$.AccountName')
FROM BankData
SQLAlchemy Model:
db = SQLAlchemy()
db.Model.metadata.reflect(db.engine)

class BankData(db.Model):
    __table__ = db.Model.metadata.tables['BankData']
Endpoint / Query:
@cust_accts_bp.route('/api/CustomerAccts')
def get_cust_accts():
    custId = request.args.get('custId')
    db = SQLAlchemy(app)
    BankData = models.bank_data.BankData
    BankAccounts = models.bank_accounts.BankAccounts
    qry = db.session.query(BankAccounts.Id, BankAccounts.AccountNumber, BankAccounts.BankName,
                           BankData.AppId, BankData.CustomerId, BankAccounts.Filename, BankData.BankDataJSON) \
        .filter(
            and_(BankData.Id == BankAccounts.BankDataId, BankData.CustomerId == custId)
        )
    engine = app.config['SQLALCHEMY_DATABASE_URI']
    df = pd.read_sql(qry.statement, engine)
    df['BankDataJSON'] = df['BankDataJSON'].apply(json.loads)  # convert string representation of JSON
    df['BankDataJSON'] = df['BankDataJSON'].map(lambda x: [x[i] for i in x if i == 'AccountName'][0])
    df = df.rename(columns={'BankDataJSON': 'BusinessName'})
    response = json.loads(df.to_json(orient="records"))
    return json.dumps(response)
Using this method, I have to manually deserialize the JSON object (BankDataJSON) to a Python dict and parse it to get the value I want ('AccountName'). If I were to use SQL Server's JSON_VALUE function, this would all be done for me.
JSON response:
[
    {
        "Id": 3003,
        "AccountNumber": "111111111",
        "BankName": "Wells Fargo",
        "AppId": 111111,
        "CustomerId": "555555",
        "Filename": "some filename.pdf",
        "BusinessName": "Some BusinessName"
    },
    {
        "Id": 3004,
        "AccountNumber": "22222222",
        "BankName": "Wells Fargo",
        "AppId": 111111,
        "CustomerId": "555555",
        "Filename": "Some filename",
        "BusinessName": "Some Businessname"
    }
]
How can I go about doing this? I also want to be able to replicate SQL Server's CROSS APPLY OPENJSON functionality for working with arrays of JSON objects in the future. Do I need to define the BankDataJSON column as a JSON type in my model? When I do this, I get an error about pyodbc's inability to deserialize JSON in the MSSQL dialect.
Maybe you can try to invoke the server's function in your query, something like this:
from sqlalchemy.sql import func

db = SQLAlchemy(app)
BankData = models.bank_data.BankData

qry = db.session.query(BankData.Id,
                       func.JSON_VALUE(BankData.BankDataJSON, '$.AccountName'))
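If the rest of the original endpoint query should stay as it is, the same generic func call can be dropped into the select list with a label, so the JSON value arrives already extracted and the pandas post-processing is no longer needed. A sketch reusing the names from the question (BankData, BankAccounts, custId, engine):

from sqlalchemy import and_
from sqlalchemy.sql import func

qry = db.session.query(
        BankAccounts.Id, BankAccounts.AccountNumber, BankAccounts.BankName,
        BankData.AppId, BankData.CustomerId, BankAccounts.Filename,
        # JSON_VALUE is evaluated by SQL Server, not in Python
        func.JSON_VALUE(BankData.BankDataJSON, '$.AccountName').label('BusinessName')) \
    .filter(and_(BankData.Id == BankAccounts.BankDataId, BankData.CustomerId == custId))

df = pd.read_sql(qry.statement, engine)  # the BusinessName column needs no json.loads step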

Python code breaks when attempting to download larger zipped csv file, works fine on smaller file

While working with small zip files (about 8 MB) containing 25 MB of CSV files, the code below works exactly as it should. As soon as I attempt to download larger files (a 45 MB zip file containing a 180 MB CSV), the code breaks and I get the following error message:
(venv) ufulu#ufulu awr % python get_awr_ranking_data.py
https://api.awrcloud.com/v2/get.php?action=get_topsites&token=REDACTED&project=REDACTED Client+%5Bw%5D&fileName=2017-01-04-2019-10-09
Traceback (most recent call last):
File "get_awr_ranking_data.py", line 101, in <module>
getRankingData(project['name'])
File "get_awr_ranking_data.py", line 67, in getRankingData
processRankingdata(rankDateData['details'])
File "get_awr_ranking_data.py", line 79, in processRankingdata
domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
AttributeError: 'float' object has no attribute 'split'
My goal is to download data for 170 projects and save the data to a SQLite DB.
Please bear with me as I am a novice in the field of programming and Python. I would greatly appreciate any help fixing the code below, as well as any other suggestions and improvements for making the code more robust and Pythonic.
Thanks in advance.
from dotenv import dotenv_values
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from sqlalchemy import create_engine

# SQL Alchemy setup
engine = create_engine('sqlite:///rankingdata.sqlite', echo=False)

# Excerpt from the initial API call
data = {'projects': [{'name': 'Client1',
                      'id': '168',
                      'frequency': 'daily',
                      'depth': '5',
                      'kwcount': '80',
                      'last_updated': '2019-10-01',
                      'keywordstamp': 1569941983},
                     {
                         "depth": "5",
                         "frequency": "ondemand",
                         "id": "194",
                         "kwcount": "10",
                         "last_updated": "2019-09-30",
                         "name": "Client2",
                         "timestamp": 1570610327
                     },
                     {
                         "depth": "5",
                         "frequency": "ondemand",
                         "id": "196",
                         "kwcount": "100",
                         "last_updated": "2019-09-30",
                         "name": "Client3",
                         "timestamp": 1570610331
                     }]}

# setup
api_url = 'https://api.awrcloud.com/v2/get.php?action='
urls = []        # processed URLs
urlbacklog = []  # URLs that didn't return a downloadable file

# API call to receive URL containing downloadable zip and csv
def getRankingData(project):
    action = 'get_dates'
    response = requests.get(''.join([api_url, action]),
                            params=dict(token=dotenv_values()['AWR_API'],
                                        project=project))
    response = response.json()
    action2 = 'topsites_export'
    rankDateData = requests.get(''.join([api_url, action2]),
                                params=dict(token=dotenv_values()['AWR_API'],
                                            project=project,
                                            startDate=response['details']['dates'][0]['date'],
                                            stopDate=response['details']['dates'][-1]['date']))
    rankDateData = rankDateData.json()
    print(rankDateData['details'])
    urls.append(rankDateData['details'])
    processRankingdata(rankDateData['details'])

# API call to download and unzip csv data and process it in pandas
def processRankingdata(url):
    content = requests.get(url)
    # {"response_code":25,"message":"Export in progress. Please come back later"}
    if "response_code" not in content:
        f = ZipFile(BytesIO(content.content))
        # print(f.namelist()) to get all filenames in Zip
        with f.open(f.namelist()[0], 'r') as g:
            rankingdatadf = pd.read_csv(g)
        rankingdatadf = rankingdatadf[rankingdatadf['Search Engine'].str.contains("Google")]
        domain = []
        for row in rankingdatadf['URL']:
            domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
        rankingdatadf['Domain'] = domain
        rankingdatadf['Domain'] = rankingdatadf['Domain'].str.replace('www.', '')
        rankingdatadf = rankingdatadf.drop(columns=['Title', 'Meta description', 'Snippet', 'Page'])
        print(rankingdatadf['Search Engine'][0])
        writeData(rankingdatadf)
    else:
        urlbacklog.append(url)
        pass

# Finally write the data to the database
def writeData(rankingdatadf):
    table_name_from_file = project['name']
    check = engine.has_table(table_name_from_file)
    print(check)  # boolean
    if check == False:
        rankingdatadf.to_sql(table_name_from_file, con=engine)
        print(project['name'] + ' ...Done')
    else:
        print(project['name'] + ' ... already in DB')

for project in data['projects']:
    getRankingData(project['name'])
The problem seems to be the split call on a float and not necessarily the download. Try changing line 79
from
domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
to
domain.append(str(str(str(row).split("//")[-1]).split("/")[0]).split('?')[0])
It looks like you're trying to parse the network location portion of the URL here, you can also use urllib.parse to make this easier instead of chaining all the splits:
from urllib.parse import urlparse
...
for row in rankingdatadf['URL']:
domain.append(urlparse(row).netloc)
I think a malformed URL is causing you issues; to diagnose it, try:
for row in rankingdatadf['URL']:
    try:
        domain.append(urlparse(row).netloc)
    except Exception:
        exit(row)
Looks like you figured it out above: you have a database entry with a NULL value for the URL field. Not sure what your fidelity requirements for this data set are, but you might want to enforce database rules for the URL field, or use pandas to drop rows where URL is NaN.
rankingdatadf = rankingdatadf.dropna(subset=['URL'])

How to push CSV data to MongoDB using Python

I am trying to push CSV data into MongoDB using Python. I'm a beginner with Python and MongoDB. I used the following code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient

# CSV to JSON Conversion
csvfile = open('C://test//final-current.csv', 'r')
jsonfile = open('C://test//6.json', 'a')
reader = csv.DictReader(csvfile)
header = ["S.No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V", "Time"]
# fieldnames=header
output = []
for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    output.append(row)

json.dump(output, jsonfile, indent=None, sort_keys=False, encoding="UTF-8")

mongo_client = MongoClient()
db = mongo_client.october_mug_talk
db.segment.drop()
data = pd.read_csv('C://test//6.json', error_bad_lines=0)
df = pd.DataFrame(data)
records = csv.DictReader(df)
db.segment.insert(records)
But the output is given in this format:
/* 0 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43134"),
    "[{\"AverageTradedPrice\":\"0\"" : "BuyPrice:\"349.75\""
}
/* 1 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43135"),
    "[{\"AverageTradedPrice\":\"0\"" : "BuyQuantity:\"3000\""
}
/* 2 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43136"),
    "[{\"AverageTradedPrice\":\"0\"" : "ClosePrice:\"350\""
}
/* 3 */
{
    "_id" : ObjectId("54891c4ffb2a0303b0d43137"),
    "[{\"AverageTradedPrice\":\"0\"" : "HighPrice:\"0\""
}
Actually, I want the output such that for a single id all the other fields are shown as subfields, e.g.:
"_id" : ObjectId("54891c4ffb2a0303b0d43137")
AverageTradedPrice: 0
HighPrice: 0
ClosePrice: 350
BuyPrice: 350.75
Please help me out. Thanks in advance.
Thank you for the suggestion. This is the corrected code:
import csv
import json
import pandas as pd
import sys, getopt, pprint
from pymongo import MongoClient

# CSV to JSON Conversion
csvfile = open('C://test//final-current.csv', 'r')
reader = csv.DictReader(csvfile)

mongo_client = MongoClient()
db = mongo_client.october_mug_talk
db.segment.drop()

header = ["S No", "Instrument Name", "Buy Price", "Buy Quantity", "Sell Price", "Sell Quantity", "Last Traded Price", "Total Traded Quantity", "Average Traded Price", "Open Price", "High Price", "Low Price", "Close Price", "V", "Time"]

for each in reader:
    row = {}
    for field in header:
        row[field] = each[field]
    db.segment.insert(row)
Why do you insert data one by one? Take a look at this one.
import pandas as pd
from pymongo import MongoClient

client = MongoClient(<your_credentials>)
database = client['YOUR_DB_NAME']
collection = database['your_collection']

def csv_to_json(filename, header=None):
    data = pd.read_csv(filename, header=header)
    return data.to_dict('records')

collection.insert_many(csv_to_json('your_file_path'))
Please be aware that it might crash your app if the file is too big.
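If the file is too large to hold in memory, a variation of the same idea (a sketch, with the same placeholder connection details) reads the CSV in chunks and inserts each batch separately:

import pandas as pd
from pymongo import MongoClient

client = MongoClient()  # add credentials/host as needed
collection = client['YOUR_DB_NAME']['your_collection']

# Read and insert the CSV in batches of 10000 rows so the whole file
# never has to be loaded at once.
for chunk in pd.read_csv('your_file_path', chunksize=10000):
    collection.insert_many(chunk.to_dict('records'))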
The easiest way is by using pandas. My code is:
import json
import pymongo
import pandas as pd
myclient = pymongo.MongoClient()
df = pd.read_csv('yourcsv.csv',encoding = 'ISO-8859-1') # loading csv file
df.to_json('yourjson.json') # saving to json file
jdf = open('yourjson.json').read() # loading the json file
data = json.loads(jdf) # reading json file
Now you can insert this JSON into your MongoDB database :-]
There is a better way with fewer imports, assuming you have a header row in your CSV.
from pymongo import MongoClient
import csv

# DB connectivity
client = MongoClient('localhost', 27017)
db = client.db
collection = db.collection

# Function to parse csv to dictionary
def csv_to_dict():
    reader = csv.DictReader(open(FILEPATH))
    result = {}
    for row in reader:
        key = row.pop('First_value')
        result[key] = row
    return result

# Final insert statement
db.collection.insert_one(csv_to_dict())
Hope that helps
from pymongo import MongoClient
import csv
import json

# DB connectivity
client = MongoClient('localhost', 27017)
db = client["database name"]
col = db["collection"]

# Function to parse csv to dictionary
def csv_to_dict():
    reader = csv.DictReader(open('File with path', 'r'))
    result = {}
    for row in reader:
        key = row.pop('id')
        result[key] = row
    return result

# Final insert statement to insert one row
x = col.insert_one(csv_to_dict())
print(x.inserted_id)

# and to insert many rows the following code is to be executed
from pymongo import MongoClient
import csv

# read the csv file as a list of dictionaries
client = MongoClient('localhost', 27017)
db = client["data base name"]
col = db["Collection Name"]

with open('File with path', 'r') as read_obj:
    # pass the file object to DictReader() to get the reader object
    csv_reader = csv.DictReader(read_obj)
    # pass the reader object to list() to get a list of dicts
    mylist = list(csv_reader)
    # print(list_of_rows)
    x = col.insert_many(mylist)
    # print list of the _id values of the inserted documents:
    print(x.inserted_ids)
