I have a table named employees.
I need to push the employees to an Elasticsearch index using Python.
import MySQLdb
import json
from elasticsearch import Elasticsearch
# Connect to local MySQL with placeholder credentials; the positional
# signature is MySQLdb.connect(host, user, passwd, db).
db = MySQLdb.connect("localhost", "admin", "password", "dbname")
cursor = db.cursor()
Here is my quick idea 😎
from sqlalchemy import create_engine
import pymysql
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Replace with real credentials:
# mysql+pymysql://<user>:<password>@<host>/<database>
# (the original 'mysql+pymysql://root:#127.0.0.1' is not a valid URL)
CONSTR = 'mysql+pymysql://root:password@127.0.0.1/dbname'
sqlEngine = create_engine(CONSTR, pool_recycle=3600)
dbConnection = sqlEngine.connect()
df = pd.read_sql("select * from employees", dbConnection)

# to_dict() yields a list of dicts, one per row.  to_json() returns a single
# JSON *string*; iterating a string yields characters, so item['id'] below
# would fail with "string indices must be integers".
rows = df.to_dict(orient='records')

es = Elasticsearch()
actions = []
for item in rows:
    actions.append({
        # assumes the employees table has an `id` column — verify
        "_id": "employee_%s" % item['id'],
        # the bulk helper reads the document body from "_source"
        "_source": item,
    })
# doc_type is deprecated/removed in modern Elasticsearch; pass only the index.
response = helpers.bulk(es, actions, index="employees")
dbConnection.close()
Alternatively, dump a CSV file from MySQL (SELECT .. INTO OUTFILE), then load that into Elasticsearch.
Related
I have a dataset consisting of 250 rows that looks like the following:
In MongoDB Compass, I inserted the first row as follows:
// insertOne() takes a single document, not an array (insertMany() takes arrays).
db.employees.insertOne({
  "employee_id": 412153,
  "first_name": "Carrol",
  "last_name": "Dhin",
  "email": "carrol.dhin@company.com",  // '@', not '#' (paste artifact)
  "managing": [{"manager_id": 412153, "employee_id": 174543}],
  "department": [{"department_name": "Accounting", "department_budget": 500000}],
  "laptop": [{
    "serial_number": "CSS49745",
    "manufacturer": "Lenovo",
    "model": "X1 Gen 10",
    // {$date: 01-15-2022} is not valid shell syntax; use ISODate with an
    // ISO-8601 date string (YYYY-MM-DD), not MM-DD-YYYY
    "date_assigned": ISODate("2022-01-15"),
    "installed_software": ["MS Office", "Adobe Acrobat", "Slack"]
  }]
})
If I wanted to insert all 250 rows into the database using PyMongo in Python, how would I ensure that every row is entered following the format that I used when I inserted it manually in the Mongo shell?
from pymongo import MongoClient
import pandas as pd

# Plain ASCII quotes — the original curly quotes are a SyntaxError.
client = MongoClient('localhost', 27017)
db = client.MD
collection = db.gammaCorp

df = pd.read_csv('')  # insert CSV name here

for i in df.index:
    # Build a fresh document per row.  Re-using one dict would make every
    # insert_one() after the first fail with DuplicateKeyError, because
    # insert_one() writes the generated _id back into the dict it is given.
    data = {
        'employee_id': df['employee_id'][i],
        'first_name': df['first_name'][i],
        'last_name': df['last_name'][i],
        'email': df['email'][i],
        # one dict holding both keys, matching the manually-inserted format
        # (separate single-key dicts would not match it)
        'managing': [{'manager_id': df['employee_id'][i],
                      'employee_id': df['managing'][i]}],
        'department': [{'department_name': df['department'][i],
                        'department_budget': df['department_budget'][i]}],
        'laptop': [{'serial_number': df['serial_number'][i],
                    'manufacturer': df['manufacturer'][i],
                    'model': df['model'][i],
                    'date_assigned': df['date_assigned'][i],
                    'installed_software': df['installed_software'][i]}],
    }
    collection.insert_one(data)
I have the code below; the input file is a CSV of around 9 GB with millions of rows. It has been executing for 5 days and is still not complete. Is there any way to speed up the process of uploading the data to Cosmos DB?
import json
import logging
import sys
import azure.cosmos.cosmos_client as cosmos_client
import azure.cosmos.exceptions as exceptions
from azure.cosmos.partition_key import PartitionKey
from typing import Optional
# Per-environment settings; only "dev" is populated here.
configs = {
    "dev": {
        "file_location": "/FileStore/tables/docs/dpidata_pfile_20050523-20221009.csv",
        "file_type": "csv",
        "infer_schema": False,  # read every column as string
        "first_row_is_header": True,
        "delimiter": ",",
        # Cosmos DB account endpoint (hostname redacted in the post)
        "cdb_url": "https://xyxxxxxxxxxxxxxxx:443/",
        "db_name": "abc",
        "container_name": "dpi",
        "partition_key": "/dpi"
    },
    "stg": {},  # TODO: staging settings
    "prd": {}   # TODO: production settings
}
class LoadToCdb():
    """Load DPI rows from a CSV (read via Spark) into Cosmos DB."""

    def dpi_data_load(self) -> Optional[bool]:
        # NOTE(review): self.configs, self.cosmos_db and self.log are never
        # assigned in the visible code — presumably set in an __init__ that is
        # not shown (self.configs likely the module-level configs["dev"]);
        # `spark` is the Databricks-provided session — confirm.
        try:
            # The applied options are for CSV files. For other file types, these will be ignored.
            df = spark.read.format(self.configs["file_type"]) \
                .option("inferSchema", self.configs["infer_schema"]) \
                .option("header", self.configs["first_row_is_header"]) \
                .option("sep", self.configs["delimiter"]) \
                .load(self.configs["file_location"])

            # keep only the two columns destined for Cosmos DB
            df = df.select('dpi', 'Entity Type Code')
            df = (df.withColumnRenamed("dpi","dpi")
                  .withColumnRenamed("Entity Type Code","entity_type_code"))

            df_json = df.toJSON()
            # NOTE(review): collect() pulls every row to the driver and each
            # document is written with a separate call — this serial loop is
            # the likely reason a 9 GB file takes days; batching/parallel
            # writes would address the question's performance problem.
            for row in df_json.collect():
                print(row)  # NOTE(review): printing millions of rows also costs time
                data = json.loads(row)
                data.setdefault('dpi', None)  # ensure the key exists before using it as id
                data["id"] = data["dpi"]      # Cosmos requires an "id" field
                # this method call will update to cosmos db
                # NOTE(review): the Cosmos SDK method is create_item (singular)
                # — confirm what self.cosmos_db wraps.
                self.cosmos_db.create_items(data)
        except Exception as e:
            self.log.error("Could not able to load to cosmos db from csv file")
            self.log.error(e)

load_to_cdb = LoadToCdb()
load_to_cdb.dpi_data_load()
I'm looking for a way to replicate the functionality of SQL Server's JSON_VALUE function using a SQLAlchemy query. I'm using metadata.reflect to define my existing db tables in SQLAlchemy.
SQL:
SELECT Id,
JSON_VALUE(BankDataJSON,'$.AccountName')
FROM BankData
SQLAlchemy Model:
# Reflect the existing database so models can be declared against its tables.
db = SQLAlchemy()
db.Model.metadata.reflect(db.engine)

class BankData(db.Model):
    # map this model onto the already-existing BankData table
    __table__ = db.Model.metadata.tables['BankData']
Endpoint / Query:
# '@' restored — as '#...' the decorator was a comment and the route was
# never registered on the blueprint.
@cust_accts_bp.route('/api/CustomerAccts')
def get_cust_accts():
    """Return the customer's accounts, extracting BusinessName from BankDataJSON."""
    custId = request.args.get('custId')
    db = SQLAlchemy(app)
    BankData = models.bank_data.BankData
    BankAccounts = models.bank_accounts.BankAccounts
    qry = db.session.query(BankAccounts.Id, BankAccounts.AccountNumber, BankAccounts.BankName,
                           BankData.AppId, BankData.CustomerId, BankAccounts.Filename,
                           BankData.BankDataJSON) \
        .filter(
            and_(BankData.Id == BankAccounts.BankDataId, BankData.CustomerId == custId)
        )
    engine = app.config['SQLALCHEMY_DATABASE_URI']
    df = pd.read_sql(qry.statement, engine)
    # parse the stored JSON string, then read AccountName directly — the
    # original one-element list comprehension did the same thing indirectly
    df['BankDataJSON'] = df['BankDataJSON'].apply(json.loads)
    df['BankDataJSON'] = df['BankDataJSON'].map(lambda x: x['AccountName'])
    df = df.rename(columns={'BankDataJSON': 'BusinessName'})
    response = json.loads(df.to_json(orient="records"))
    return json.dumps(response)
Using this method, I have to manually serialize the JSON object (BankDataJSON) to a Python dict, and parse it to get the value I want ('AccountName'). If I were to use SQL Server's JSON_VALUE function, this is all done for you.
JSON response:
[
{
"Id": 3003,
"AccountNumber": "111111111",
"BankName": "Wells Fargo",
"AppId": 111111,
"CustomerId": "555555",
"Filename": "some filename.pdf",
"BusinessName": "Some BusinessName"
},
{
"Id": 3004,
"AccountNumber": "22222222",
"BankName": "Wells Fargo",
"AppId": 111111,
"CustomerId": "555555",
"Filename": "Some filename",
"BusinessName": "Some Businessname"
    }
]
How can I go about doing this? I also want to be able to replicate SQL Server's CROSS APPLY OPENJSON functionality for working with arrays of JSON objects in the future. Do I need to define the BankDataJSON column as a JSON type in my model? When I do this, I get an error regarding pyodbc's inability to deserialize JSON in the MSSQL dialect.
Maybe you can try to implement the server's function in your query, something like this:
from sqlalchemy.sql import func

db = SQLAlchemy(app)
BankData = models.bank_data.BankData
# func.<name>(...) renders as a raw SQL function call, so this emits
# JSON_VALUE(BankDataJSON, '$.AccountName') on the server side
# (available in SQL Server 2016+).
qry = db.session.query(BankData.Id,
                       func.JSON_VALUE(BankData.BankDataJSON,'$.AccountName'))
Without knowing the structure of the JSON, how can I return a JSON object from the database query? All of the information is there; I just can't figure out how to build the object.
import MySQLdb
import json

db = MySQLdb.connect(host, user, password, db)
cursor = db.cursor()
cursor.execute(query)
rows = cursor.fetchall()
# column names, in SELECT order, from the cursor metadata
field_names = [i[0] for i in cursor.description]
print(field_names)
# dict(rows) pairs column 1 with column 2, collapsing all rows into one
# mapping; zip each row with the column names instead so every row becomes
# its own {"count": ..., "severity": ...} object.
json_string = json.dumps(
    {"data": [dict(zip(field_names, row)) for row in rows]})
print(json_string)
db.close()
count
severity
{"321": "7.2", "1": "5.0", "5": "4.3", "7": "6.8", "1447": "9.3", "176": "10.0"}
The json object would look like:
{"data":[{"count":"321","severity":"7.2"},{"count":"1","severity":"5.0"},{"count":"5","severity":"4.3"},{"count":"7","severity":"6.8"},{"count":"1447","severity":"9.3"},{"count":"176","severity":"10.0"}]}
The problem you are encountering happens because you only turn the fetched items into dicts, without their description.
dict in python expects either another dict, or an iterable returning two-item tuples, where for each tuple the first item will be the key, and the second the value.
Since you only fetch two columns, you get the first one (count) as key, and the second (severity) as value for each fetched row.
What you want to do is also combine the descriptions, like so:
# One dict per row: pair each column name with the matching value.
row_objects = []
for row in rows:
    row_objects.append(dict(zip(field_names, row)))
json_string = json.dumps(row_objects)
1- You can use pymsql DictCursor:
import pymysql

connection = pymysql.connect(db="test")
# DictCursor makes fetchone()/fetchall() return dicts keyed by column name
# instead of plain tuples.
cursor = connection.cursor(pymysql.cursors.DictCursor)
cursor.execute("SELECT ...")
row = cursor.fetchone()
print row["key"]  # Python 2 print statement, as in the original answer
2- MySQLdb also includes DictCursor that you can use. You need to pass cursorclass=MySQLdb.cursors.DictCursor when making the connection.
import MySQLdb
import MySQLdb.cursors

# Passing cursorclass at connect time makes every cursor from this
# connection return rows as dicts keyed by column name.
connection = MySQLdb.connect(db="test",cursorclass=MySQLdb.cursors.DictCursor)
cursor = connection.cursor()
cursor.execute("SELECT ...")
row = cursor.fetchone()
print row["key"]  # Python 2 print statement, as in the original answer
I got this to work using the collections library, although the code is a bit verbose:
import MySQLdb
import json
import collections

db = MySQLdb.connect(host, user, passwd, db)
cursor = db.cursor()
cursor.execute(query)
rows = cursor.fetchall()
# column names, in SELECT order, from the cursor metadata
field_names = [i[0] for i in cursor.description]

# Zip each row with the column names: this handles any number of selected
# columns instead of hard-coding indexes 0 and 1, and produces identical
# output for the original two-column query.  OrderedDict keeps the JSON
# keys in SELECT order.
objects_list = [collections.OrderedDict(zip(field_names, row))
                for row in rows]

json_string = json.dumps(objects_list)
print(json_string)
db.close()
[{"count": 176, "severity": "10.0"}, {"count": 1447, "severity": "9.3"}, {"count": 321, "severity": "7.2"}, {"count": 7, "severity": "6.8"}, {"count": 1, "severity": "5.8"}, {"count": 1, "severity": "5.0"}, {"count": 5, "severity": "4.3"}]
I am using Pyodbc to return a number of rows which are dumped into a JSON and sent to a server. I would like to iterate my SQL table and return all records. I am using cursor.fetchall() now, and the program returns one record. As shown below. When I use fetchone an error is returned AttributeError: 'unicode' object has no attribute 'SRNUMBER' and fetchmany returns one record as well. How do I successfully return all records? I am using Python 2.6.7
Code:
import pyodbc
import json
import collections
import requests

connstr = 'DRIVER={SQL Server};SERVER=server;DATABASE=ServiceRequest; UID=SA;PWD=pwd'
conn = pyodbc.connect(connstr)
cursor = conn.cursor()
cursor.execute("""
    SELECT SRNUMBER, FirstName, LastName, ParentNumber
    FROM MYLA311 """)

# fetchall() returns a list of Row objects.  fetchone() returns a single
# Row, and iterating *that* yields the column values (unicode strings),
# which is why row.SRNUMBER raised AttributeError on 'unicode'.
rows = cursor.fetchall()

objects_list = []
for row in rows:
    d = collections.OrderedDict()
    d['SRNUMBER'] = row.SRNUMBER
    d['FirstName'] = row.FirstName
    d['LastName'] = row.LastName
    d['ParentNumber'] = row.ParentNumber
    objects_list.append(d)

# Use the whole list, not just `d` (which only holds the last row).
output = {"MetaData": {},
          "SRData": objects_list}
print(output)

j = json.dumps(output)
print(json.dumps(output, sort_keys=True, indent=4))
Output for fetchall and fetchmany:
{
"MetaData": {},
"SRData": {
"FirstName": "MyLAG",
"LastName": "ThreeEleven",
"ParentNumber": "021720151654176723",
"SRNUMBER": "1-3580171"
}
}
Use code from my answer here to build a list of dictionaries for the value of output['SRData'], then JSON encode the output dict as normal.
import pyodbc
import json

connstr = 'DRIVER={SQL Server};SERVER=server;DATABASE=ServiceRequest; UID=SA;PWD=pwd'
conn = pyodbc.connect(connstr)
cursor = conn.cursor()
cursor.execute("""SELECT SRNUMBER, FirstName, LastName, ParentNumber FROM MYLA311""")

# The cursor description holds one entry per selected column; its first
# element is the column name, which becomes the dictionary key.
columns = [col[0] for col in cursor.description]
results = [dict(zip(columns, record)) for record in cursor.fetchall()]

output = {"MetaData": {}, "SRData": results}
print(json.dumps(output, sort_keys=True, indent=4))
For starters, the line
objects_list.append(d)
needs to be inside the for loop, not outside.