Dynamically write MySQL table schema into Avro format - Python

I want to collect the data from a MySQL table and convert it to Avro format using Python.
Consider this table in MySQL:
dept_no, dept_name
'd001', 'Marketing'
'd002', 'Finance'
'd003', 'Human Resources'
'd004', 'Production'
'd005', 'Development'
'd006', 'Quality Management'
'd007', 'Sales'
'd008', 'Research'
'd009', 'Customer Service'
mycursor.execute('select * from employees')
results = mycursor.fetchall()
When I fetch the results using the above query, I get them back as a list of tuples.
To convert to Avro, however, the schema has to be defined in the following format. By hard-coding it we can achieve the format below; the following schema is used to generate the Avro file:
schema = {
    'doc': 'A weather reading.',
    'name': 'Weather',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': 'dept_no', 'type': 'string'},
        {'name': 'dept_name', 'type': 'string'},
    ],
}
And the records as
records = [
    {u'dept_no': u'd001', u'dept_name': 'Marketing'},
    {u'dept_no': u'd002', u'dept_name': 'Finance'},
    {u'dept_no': u'd003', u'dept_name': 'Human Resources'},
    {u'dept_no': u'd004', u'dept_name': 'Production'},
]
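The write step itself is not shown above; assuming the fastavro package (whose documentation uses this same schema layout), writing the hard-coded schema and records would look roughly like this sketch:

from fastavro import writer, parse_schema

# Sketch only: 'departments.avro' is an assumed output file name.
parsed_schema = parse_schema(schema)
with open('departments.avro', 'wb') as out:
    writer(out, parsed_schema, records)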
The question here is: how do I map the schema and the data into the above format dynamically using Python?
import mysql.connector

mydb = mysql.connector.connect(
    host="********************",
    user="***********",
    passwd="**********",
    database='***********'
)
def byte_to_string(x):
    temp_table_list = []
    for row in x:
        table = row[0].decode()
        temp_table_list.append(table)
    return temp_table_list
mycursor = mydb.cursor()
#Query to list all the tables
mycursor.execute("show tables")
r = mycursor.fetchall()
r = byte_to_string(r)
print(r)
x = len(r)
#Fetch all the records from table EMPLOYEES using Select *
mycursor.execute('select * from employees')
results = mycursor.fetchall()
print(type(results))
print(results)
#Displays Data of table employee record by record
for i in results:
    print(i)
    print(type(i))
#Fetching data from the 2nd table, departments
mycursor.execute('select * from departments')
data=[i[0] for i in mycursor.fetchall()]
mycursor.execute('select * from departments')
data1=[i[1] for i in mycursor.fetchall()]
print(data)
print(data1)
#zipbObj = zip(data,data1)
#dictOfWords = dict(zipbObj)
#print(dictOfWords)
mycursor.execute('SELECT `COLUMN_NAME`\
FROM `INFORMATION_SCHEMA`.`COLUMNS`\
WHERE `TABLE_SCHEMA`="triggerdb1"\
AND `TABLE_NAME`="departments"')
#Fetching the column names of the table as keys
keys=[i[0] for i in mycursor.fetchall()]
print(keys)
'''
zipbObj = zip(column_schema,data1)
dictOfWords = dict(zipbObj)
print(dictOfWords)
'''
#Finally we get the Header as key and records as values in a dict format
abc = {}
abc[keys[0]] = data1
abc[keys[1]] = data
print(abc)
The result is of the form:
#print(data)
>>>['d001', 'd002', 'd003', 'd004', 'd005', 'd006', 'd007', 'd008', 'd009']
#print(data1)
>>>['Marketing', 'Finance', 'Human Resources', 'Production', 'Development', 'Quality Management', 'Sales', 'Research', 'Customer Service']
#print(keys)
>>>['dept_no', 'dept_name']
#print(abc)
>>>{'dept_no': ['Marketing', 'Finance', 'Human Resources', 'Production', 'Development', 'Quality Management', 'Sales', 'Research', 'Customer Service'], 'dept_name': ['d001', 'd002', 'd003', 'd004', 'd005', 'd006', 'd007', 'd008', 'd009']}
The question here is: how do I dynamically map the schema and the data from the resulting tuples/dictionary and convert them to Avro using Python?
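For illustration, the dynamic mapping being asked about might look roughly like the sketch below (assumptions: the fastavro package is used, every column is treated as an Avro string, and the column names are read from cursor.description instead of being hard-coded):

import fastavro

mycursor.execute('select * from departments')
rows = mycursor.fetchall()

# Column names come from the cursor metadata rather than INFORMATION_SCHEMA.
columns = [col[0] for col in mycursor.description]

# Assumption: every column is mapped to an Avro string.
dyn_schema = {
    'doc': 'departments table',
    'name': 'departments',
    'namespace': 'test',
    'type': 'record',
    'fields': [{'name': c, 'type': 'string'} for c in columns],
}

# One dict per row, keyed by column name, matching the records format above.
dyn_records = [dict(zip(columns, row)) for row in rows]

with open('departments.avro', 'wb') as out:
    fastavro.writer(out, fastavro.parse_schema(dyn_schema), dyn_records)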
Thanks in Advance!

Related

insert data into mysql table using python

I'm a beginner in Python, and I'm trying to insert my JSON file data into my database table using Python. The problem is that I get no errors, I just get:
tweet number 49634 is uploading to the server
I don't see where the problem is; I would appreciate any help.
import mysql.connector
import json

mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_request = 'insert ignore into tweet_location (latitude, longitude, tweet_id) values (%s,%s,%s)'
myJsonFile = open('tweet.json', encoding="utf-8")
c = 0
for line in myJsonFile:
    c = c + 1
    print("tweet number ", c, " is uploading to the server")
    data = json.loads(line)
    #line = line.replace('','')
    tweet = "SELECT * FROM tweet WHERE tweet_id = '" + str(data['tweet_id']) + "'"
    mycursor.execute(tweet)
    myresult = mycursor.fetchall()
    row_count = mycursor.rowcount
    if row_count == 0:
        if 'location' in data.keys() and data['location'] != None and 'address' in data['location']:
            latitude = data['location']['lat']
            longitude = data['location']['lon']
        mycursor.execute(sql_request, (latitude, longitude, data['tweet_id']))
        print('------------')
        mydb.commit()
here's an example of my json file data:
{"tweet_id":"1261276320878788609",
"date":"Fri May 15 12:44:42 +0000 2020",
"raw_text":"برنامج وطني لدعم المبدعين في مواجهة #كورو"
"geo_source":"user_location",
"location":{"address":
{"country":"Tunisia","country_code":"tn","state_district":"غزالة","county":"العرب","state":"Bizerte"},
"response":
"{'place_id': 235309103, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7124228, 'boundingbox': ['37.105957', '37.2033466', '9.4739053', '9.6124953'], 'lat': '37.1551868', 'lon': '9.54834183807249', 'display_name': 'العرب, غزالة, Bizerte, Tunisia', 'class': 'boundary', 'type': 'administrative', 'importance': 0.45, 'icon': '/data/nominatimimages/mapicons/poi_boundary_administrative.p.20.png','address':{'county': 'العرب', 'state_district': 'غزالة', 'state': 'Bizerte', 'country': 'Tunisia', 'country_code': 'tn'}}",
"geohash":"snwg37buskzd","query_term":"arab","lon":9.54834183807249,"lat":37.1551868},
"user_friends_count":61,"user_description":"I love UAE and his great leadership",
"user_created_at":"Wed Oct 09 11:41:41 +0000 2013",
"user_screen_name":"SikandarMirani",
"user_id_str":"706377881",
"user_verified":false,
"user_statuses_count":50804,
"user_followers_count":946,
"user_location":"Dubai United Arab Emirates"}
and these are my tweet table's attributes:
tweet_id,
id_user,
text,
tweet_location,
created_at,
name_screen,
categorie_id,
Don't read the JSON file one line at a time. Use json.load() to load the entire file into a dictionary.
Use a parameter in the query that selects the tweet, rather than concatenating data['tweet_id'] into the SQL.
The code that inserts the new row should be inside all the if statements that set latitude and longitude from the data. In fact, you might as well put all the database code inside the if statement that checks whether location is set in the JSON.
import mysql.connector
import json

mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_request = 'insert ignore into tweet_location (latitude, longitude, tweet_id) values (%s,%s,%s)'

with open('tweet.json', encoding="utf-8") as myJsonFile:
    data = json.load(myJsonFile)

if data.get('location') and 'address' in data['location']:
    tweet = "SELECT 1 FROM tweet WHERE tweet_id = %s"
    mycursor.execute(tweet, (data['tweet_id'],))
    myresult = mycursor.fetchall()
    row_count = len(myresult)
    if row_count == 0:
        print(f"Inserting {data['tweet_id']} to the database")
        latitude = data['location']['lat']
        longitude = data['location']['lon']
        mycursor.execute(sql_request, (latitude, longitude, data['tweet_id']))
        mydb.commit()
        print('------------')
    else:
        print(f"Tweet {data['tweet_id']} is already in the database")

Connect JSON file data to a MySQL database (empty table)

I'm a beginner in Python, trying to access a database from Python with a JSON file loaded in my program so I can read it and eventually analyze it for certain things. But I can't connect to it, and I've tried different approaches and still get the same error.
import mysql.connector
import json
# create the key
from mysql.connector import cursor
mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_tweet = """INSERT INTO tweet ( tweet_id,
id_user,
text,
tweet_location,
created_at,
name_screen,
categorie_id,
)
VALUES (%s,%s,%s,%s,%s,%s,%s)"""
sql_user = """INSERT INTO tweetuser (
id_user,
name_screen,
location_user,
count_followers,
friends_count,
statuse_count)
VALUES (%s,%s,%s,%s,%s,%s)"""
sql_location = """"insert into tweet_location (
location_id,
latitude,
longitude
tweet_id
VALUES(%s,%s,%s,%s)"""
myJsonFile = open('tweets.json', encoding="utf-8")
mycursor.execute("DELETE FROM tweet")
mycursor.execute("DELETE FROM tweetuser")
mycursor.execute("DELETE FROM tweet_location")
c = 0
for line in myJsonFile:
    c = c + 1
    print("tweet number ", c, " is uploading to the server")
    data = json.loads(line)
    # insert into tweet
    val_tweet = (
        data['tweet_id'], data['user_id_str'], data['raw_text'], data['location']['address']['city'], data['date'], data['user_screen_name'])
    mycursor.execute(sql_tweet, sql_location, val_tweet)
    mydb.commit()
    # testing if the user already exists
    user = "SELECT * FROM tweetuser WHERE id_user = '" + str(data['user_id_str']) + "'"
    mycursor.execute(user)
    myresult = mycursor.fetchall()
    row_count = mycursor.rowcount
    if row_count == 0:
        val_user = (data['user_id_str'], data['user_screen_name'], data['location']['address']['city'], data['user_followers_count'],
                    data['user_friends_count'], data['user_statuses_count'])
        mycursor.execute(sql_user, val_user)
        mydb.commit()
print('done')
here's an example of json file data :
{
"tweet_id":"1261276320878788609",
"date":"Fri May 15 12:44:42 +0000 2020",
"raw_text":"برنامج وطني لدعم المبدعين في مواجهة #كورون",
"geo_source":"user_location",
"location":{
"address":{
"country":"Tunisia",
"country_code":"tn",
"state_district":"غزالة",
"county":"العرب",
"state":"Bizerte"
},
"response":"{'place_id': 235309103, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7124228, 'boundingbox': ['37.105957', '37.2033466', '9.4739053', '9.6124953'], 'lat': '37.1551868', 'lon': '9.54834183807249', 'display_name': 'العرب, غزالة, Bizerte, Tunisia', 'class': 'boundary', 'type': 'administrative', 'importance': 0.45, 'icon': '/data/nominatimimages/mapicons/poi_boundary_administrative.p.20.png','address':{'county': 'العرب', 'state_district': 'غزالة', 'state': 'Bizerte', 'country': 'Tunisia', 'country_code': 'tn'}}",
"geohash":"snwg37buskzd",
"query_term":"arab",
"lon":9.54834183807249,
"lat":37.1551868
},
"user_friends_count":61,
"user_description":"I love UAE and his great leadership",
"user_created_at":"Wed Oct 09 11:41:41 +0000 2013",
"user_screen_name":"SikandarMirani",
"user_id_str":"706377881",
"user_verified":false,
"user_statuses_count":50804,
"user_followers_count":946,
"user_location":"Dubai United Arab Emirates"
}
Thanks to you guys, I was able to solve the previous error: I hadn't checked the data type of the user ID; it has to be BIGINT, not INT, since the values are large.
I had no problem connecting my JSON file to my database, but the data got inserted only into the tweetuser table, not into the tweet table.
The tweet table is empty.
I would appreciate any kind of help, thank you.
The error
mysql.connector.errors.DataError: 1264 (22003): Out of range value for column 'id_user' at row 1
suggests that the value you are trying to use as the id_user is numerically too large.
Since you haven't posted the table definitions, my guess is you are using MEDIUMINT or SMALLINT or TINYINT for id_user and the actual user ID that you are trying to write into the database is too large for that data type.
In your example, user_id_str is 706377881, whereas the maximum values for MEDIUMINT are 8388607 (signed) and 16777215 (unsigned).
Check the data types in the table definitions.
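If that is the case, widening the column is enough. A sketch (the table and column names are taken from the question; BIGINT is an assumption, chosen because it comfortably fits Twitter user IDs):

# Reuses mycursor from the question.
# Inspect the current column definitions, then widen id_user.
mycursor.execute("SHOW COLUMNS FROM tweet")
print(mycursor.fetchall())
mycursor.execute("ALTER TABLE tweet MODIFY id_user BIGINT")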
You are connecting to your DB, that is not the problem.
The problem is that the user ID you are trying to insert exceeds the maximum value MySQL allows for the data type of that field. See here and here for more info related to your error.

Dynamic update Dash Datatable from df

I'm creating an auto-updating Dash DataTable. The data is obtained from a SQL Server query, passed to a DataFrame and displayed in the DataTable:
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import dash_table
import pandas as pd
import numpy as np
import pyodbc
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.MINTY, 'https://use.fontawesome.com/releases/v5.9.0/css/all.css'])
sql_query = ('''SELECT * FROM HOSVITAL.dbo.Censo_Diario
ORDER BY Dias_Estancia DESC''')
server = '#srv'
database = '#db'
username = '#usr'
password = '#pwd'
conn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
df = pd.read_sql(sql_query,conn)
df = df.apply(lambda x: x.str.strip()
              if x.dtype == "object" else x)  # Trim whitespaces
After that, I define a function to connect to my DB and another function to get and clean the data and return it as a dict:
def connectSQLServer(conn):
    connSQLServer = conn
    return connSQLServer

def getData():
    df2 = pd.DataFrame()
    for idx in range(10):
        data = pd.read_sql(sql_query, conn)
        df2 = df.apply(lambda x: x.str.strip()
                       if x.dtype == "object" else x)
        df2 = df2.append(data, ignore_index=True)
    df2 = df2.drop_duplicates().sort_index()
    return df2.to_dict('records')
I'm giving custom names to the df columns:
tblcols=[{'name': 'Pabellon', 'id': 'Pabellon'},
{'name': 'Cama', 'id': 'Cama'},
{'name': 'Cedula', 'id': 'Cedula'},
{'name': 'Tipo', 'id': 'Tipo'},
{'name': 'EPS', 'id': 'EPS'},
{'name': 'Nivel', 'id': 'Nivel'},
{'name': 'Ingreso', 'id': 'Ingreso'},
{'name': 'Nombre', 'id': 'Nombre'},
{'name': 'Edad', 'id': 'Edad'},
{'name': 'Cod', 'id': 'Cod'},
{'name': 'Diagnostico', 'id': 'Diagnostico'},
{'name': 'Fecha Ingreso', 'id': 'Fecha_Ingreso'},
{'name': 'Dias Estancia', 'id': 'Dias_Estancia'}
]
Then I create my layout, adding a dcc.Interval to update the data every 5 seconds:
app.layout = html.Div([
    html.H4('CN Dashboard'),
    dcc.Interval('graph-update', interval=5000, n_intervals=0),
    dbc.Container([
        dash_table.DataTable(
            sort_action='native',
            id='table',
            css=[{
                'selector': '.dash-cell div.dash-cell-value',
                'rule': 'display: inline; white-space: inherit; overflow: inherit; text-overflow: inherit;'
            }],
            data=getData(),
            columns=tblcols,
        )
    ])
])
Finally, I create this callback to update the table:
@app.callback(
    dash.dependencies.Output('table', 'data'),
    [dash.dependencies.Input('graph-update', 'n_intervals')])
def updateTable(n):
    return getData()

if __name__ == '__main__':
    app.run_server(debug=False, host='0.0.0.0', port=8070)

app.title = 'Tablero CN'  # appears in browser title bar
So, this app displays the data and it updates as expected, but the problem is that all rows end up duplicated in the output.
Any suggestions?
Thanks in advance.
Error (Invalid prop for this component): basically it is stating that dash_bootstrap_components.Table does not support the data argument you are trying to update through the callback. dash_table does have this feature; it seems you imported it in the code but didn't use it. As far as I know this can't be solved with dbc; use dash_table if you really want this callback.
Query re-usability: if both queries are the same, why not define the query as a variable at the top, as you did with conn, and pass it as an argument?
example:
query = '''select * from table'''
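A sketch of that re-use suggestion, passing the query (and connection) into the data helper instead of repeating it (hypothetical signature, not the poster's exact code):

import pandas as pd

def getData(query, conn):
    # Re-read the query result on every interval tick.
    df = pd.read_sql(query, conn)
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    return df.to_dict('records')

# e.g. in the callback: return getData(query, conn)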

Save dataframe in Postgresql Database with SERIAL Autogenerated ID

I have a dataframe of the following form:
     word classification  counter
0   house           noun        2
1     the        article        2
2   white      adjective        1
3  yellow      adjective        1
I would like to store in Postgresql table with the following definition:
CREATE TABLE public.word_classification (
    id SERIAL,
    word character varying(100),
    classification character varying(10),
    counter integer,
    start_date date,
    end_date date
);
ALTER TABLE public.word_classification OWNER TO postgres;
The current basic configuration I have is as follows:
from sqlalchemy import create_engine
import pandas as pd
# Postgres username, password, and database name
POSTGRES_ADDRESS = 'localhost' ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = 'BVict31C' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD
POSTGRES_DBNAME = 'local-sandbox-dev' ## CHANGE THIS TO YOUR DATABASE NAME
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(username=POSTGRES_USERNAME, password=POSTGRES_PASSWORD, ipaddress=POSTGRES_ADDRESS, port=POSTGRES_PORT, dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)
data = [['the','article',0],['house','noun',1],['yellow','adjective',2],
        ['the','article',4],['house','noun',5],['white','adjective',6]]
df = pd.DataFrame(data, columns=['word','classification','position'])
df_db = pd.DataFrame(columns=['word','classification','counter','start_date','end_date'])
count_series=df.groupby(['word','classification']).size()
new_df = count_series.to_frame(name = 'counter').reset_index()
df_db = new_df.to_sql('word_classification',cnx,if_exists='append',chunksize=1000)
I would like to insert into the table as I am able to do with SQL syntax:
insert into word_classification(word, classification, counter) values ('hello','world',1);
Currently, I am getting an error when inserting into the table because I am passing the index:
(psycopg2.errors.UndefinedColumn) column "index" of relation "word_classification" does not exist
LINE 1: INSERT INTO word_classification (index, word, classification...
^
[SQL: INSERT INTO word_classification (index, word, classification, counter) VALUES (%(index)s, %(word)s, %(classification)s, %(counter)s)]
[parameters: ({'index': 0, 'word': 'house', 'classification': 'noun', 'counter': 2}, {'index': 1, 'word': 'the', 'classification': 'article', 'counter': 2}, {'index': 2, 'word': 'white', 'classification': 'adjective', 'counter': 1}, {'index': 3, 'word': 'yellow', 'classification': 'adjective', 'counter': 1})]
I have been searching for ways to get rid of passing the index with no luck.
Thanks for your help
Turn off the index when storing to the database, as follows:
df_db = new_df.to_sql('word_classification',cnx,if_exists='append',chunksize=1000, index=False)
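With index=False only the word, classification and counter columns are sent, so Postgres fills the SERIAL id itself. A quick way to confirm (a sketch, reusing cnx from the question):

check = pd.read_sql('SELECT id, word, classification, counter FROM word_classification', cnx)
print(check.head())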

Execute user-defined query on list of dictionaries

I have a set of data that a user needs to query using their own query string. The current solution creates a temporary in-memory sqlite database that the query is run against.
The dataset is a list of "flat" dictionaries, i.e. there is no nested data. The query string does not need to be SQL, but it should be simple to define using an existing query framework.
It needs to support ordering (ascending, descending, custom) and filtering.
The purpose of this question is to get a range of different solutions that might work for this use case.
import sqlite3
items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]
# Assemble temporary sqlite database
conn = sqlite3.connect(':memory:')
cur = conn.cursor()
knownTypes = { "id": "real", "height": "real", "comment": "text" }
allKeys = list(set().union(*(d.keys() for d in items)))
allTypes = list(knownTypes.get(k, "text") for k in allKeys)
createTable_query = "CREATE TABLE data ({});".format(", ".join(["{} {}".format(x[0], x[1]) for x in zip(allKeys, allTypes)]))
cur.execute(createTable_query)
conn.commit()
qs = ["?" for i in range(len(allKeys))]
insertRow_query = "INSERT INTO data VALUES ({});".format(", ".join(qs))
for p in items:
    vals = list([p.get(k, None) for k in allKeys])
    cur.execute(insertRow_query, vals)
conn.commit()
# modify user query here
theUserQuery = "SELECT * FROM data"
# Get data from query
data = [row for row in cur.execute(theUserQuery)]
YAQL is what I'm looking for.
It doesn't do SQL, but it does execute a query string - which is a simple way to do complex user-defined sorting and filtering.
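For example, a minimal sketch with the yaql package (the factory/engine usage follows yaql's documented pattern; the filter expression itself is only illustrative and should be checked against the yaql docs):

import yaql

items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]

engine = yaql.factory.YaqlFactory().create()
# Parse a user-supplied query string, then evaluate it against the data.
expression = engine('$.where($.id > 2).orderBy($.id)')
result = list(expression.evaluate(data=items))
print(result)  # the items with id 3 and 4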
There's a library called litebox that does what you want. It is backed by SQLite.
from litebox import LiteBox
items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]
types = {"id": int, "height": float, "comment": str}
lb = LiteBox(items, types)
lb.find("height > 1.5")
Result: [{'id': 4, 'height': 1.78}]
