Moving from HDF5 to PostgreSQL - python

I have many large 1-D HDF5 datasets with the following properties:
init size = (5201,),
maxshape = (6000000,),
dtype ='float32'
chunks = (10000,)
compression = "gzip"
Path example: file["Group"]["1"]["Group1"]["2"]["Dataset"]
I want to move them into PostgreSQL. I have worked out the database structure and the inserts, but each load takes ~650 seconds for a 72.4 MB HDF5 file. Can someone give me tips/advice on how to improve the performance?
What I have now:
def fill_database(self, dog):
    if isinstance(dog, h5py.Dataset):
        name = dog.name.split('/')
        table_name = '{}_{}'.format(name[3], name[5])
        data = dog.value.astype(int).tolist()
        self.cur.execute('CREATE TABLE IF NOT EXISTS {} (cur_id INT PRIMARY KEY, data INT[]);'.format(table_name))
        self.cur.execute('INSERT INTO {} VALUES (%s, %s)'.format(table_name), (name[2], data))
    if isinstance(dog, h5py.Group):
        for k, v in dict(dog).items():
            self.fill_database(v)
What I tried:
import psycopg2
import h5py
from itertools import islice

with h5py.File('full_db.hdf5') as hdf5file:
    with psycopg2.connect(database='hdf5', user='postgres', password='pass', port=5432) as conn:
        cur = conn.cursor()
        cur.execute('drop table if EXISTS mytable;')
        cur.execute('create table mytable (data INT[]);')
        chunksize = 10000
        t = iter(hdf5file["Group"]["1"]["Group1"]["2"]["Dataset"][:].astype(int))
        rows = islice(t, chunksize)
        while rows:
            statement = "INSERT INTO mytable(data) VALUES {}".format(rows)  # I'm stuck here
            cur.execute(statement)
            rows = islice(t, chunksize)
        conn.commit()
I also tried doing something with LIMIT in PostgreSQL and many other approaches, but I was not successful.
I think part of the problem may be the arrays in the database; I use them for more convenient output later.

After almost two weeks I think I can answer my own question.
While searching for an answer, I came across this page: https://github.com/psycopg/psycopg2/issues/179
Also, after reading the documentation, I understood that copying from a file works even quicker, so I tried the StringIO module. This is what I got:
import h5py
import psycopg2
import time
from io import StringIO
conn = psycopg2.connect(database='hdf5', user='postgres', password=' ')
cur = conn.cursor()
file = h5py.File('db.hdf5', 'r')
data_set = file['path/to/large/data_set'].value.astype(int).tolist()
cur.execute('DROP TABLE IF EXISTS table_test;')
cur.execute('CREATE TABLE table_test (data INTEGER[]);')
# ORIGINAL
start = time.time()
cur.execute('INSERT INTO table_test VALUES (%s);', (data_set,))
print('Original: {} sec'.format(round(time.time() - start, 2)))
# STRING FORMAT
start = time.time()
data_str = ','.join(map(str, data_set)).replace('[', '{').replace(']', '}')
cur.execute('INSERT INTO table_test VALUES (ARRAY[{}]);'.format(data_str))
print('String format: {} sec'.format(round(time.time() - start, 2)))
# STRING IO COPY
start = time.time()
data_str = ','.join(map(str, data_set)).replace('[', '{').replace(']', '}')
data_io = StringIO('{{{}}}'.format(data_str))
cur.copy_from(data_io, 'table_test')
print('String IO: {} sec'.format(round(time.time() - start, 2)))
conn.commit()
This gives the following results with a dataset of shape (1200201,):
Original: 1.27 sec
String format: 0.58 sec
String IO: 0.3 sec
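A closely related variant (not benchmarked above) is cursor.copy_expert, which lets you name the target column explicitly instead of relying on copy_from's default column order. A minimal sketch; note the StringIO buffer has to be rebuilt, since copy_from already consumed it:

# Variant sketch: same data, but with an explicit COPY statement.
data_io = StringIO('{{{}}}'.format(data_str))
cur.copy_expert('COPY table_test (data) FROM STDIN', data_io)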

Related

How to speed up Python CSV Read to MySQL Write

I have a 5 GB CSV of IP addresses that I need to parse into a MySQL database.
Currently I am reading rows from the CSV and inserting them into MySQL. It works great, however I would love to make it faster.
Could I parallelize the reading and writing somehow? Or perhaps chunk the CSV and spawn a process to read and write each split CSV?
import csv
from csv import reader
from csv import writer
import mysql.connector

cnx = mysql.connector.connect(user='root', password='', host='127.0.0.1', database='ips')
cursor = cnx.cursor()
i = 1
with open('iplist.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        query = """INSERT INTO ips (ip_start,ip_end,continent) VALUES ('%s','%s','%s')""" % (row[0], row[1], row[2])
        print(query)
        cursor.execute(query)
        cursor.execute('COMMIT')
        print(i)
        i = i + 1
cnx.close()
Any help is appreciated.
Use cursor.executemany to increase speed:
# Tested with:
# docker run --rm -e MYSQL_ALLOW_EMPTY_PASSWORD=y -p 3306:3306 mysql
#
# CREATE DATABASE ips;
# USE ips;
# CREATE TABLE ips (id INT PRIMARY KEY NOT NULL AUTO_INCREMENT, ip_start VARCHAR(15), ip_end VARCHAR(15), continent VARCHAR(20));
import mysql.connector
import csv
import itertools

CHUNKSIZE = 1000  # Number of lines

cnx = mysql.connector.connect(user='root', password='', host='127.0.0.1', database='ips')
cursor = cnx.cursor()
with open('iplist.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    while True:
        records = list(itertools.islice(reader, CHUNKSIZE))
        if not records:
            break
        query = """INSERT INTO ips (ip_start, ip_end, continent) VALUES (%s, %s, %s)"""
        cursor.executemany(query, records)
        cursor.execute('COMMIT')
I created a pseudo-random CSV file where each row is of the form "111.222.333.444,555.666.777.888,A continent". The file contains 33 million rows. The following code was able to insert all rows into a MySQL database table in ~3 minutes:
import mysql.connector
import time
import concurrent.futures
import csv
import itertools

CSVFILE = '/Users/Andy/iplist.csv'
CHUNK = 10_000

def doBulkInsert(rows):
    with mysql.connector.connect(user='andy', password='monster', host='localhost', database='andy') as connection:
        connection.cursor().executemany(f'INSERT INTO ips (ip_start, ip_end, continent) VALUES (%s, %s, %s)', rows)
        connection.commit()

def main():
    _s = time.perf_counter()
    with open(CSVFILE) as csvfile:
        csvdata = csv.reader(csvfile)
        _s = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            while (data := list(itertools.islice(csvdata, CHUNK))):
                executor.submit(doBulkInsert, data)
            executor.shutdown(wait=True)
    print(f'Duration = {time.perf_counter()-_s}')

if __name__ == '__main__':
    main()
My recommendation would be to chunk your list. Break it down into 5,000-row (or similar) chunks, then iterate through those. This will reduce the number of queries you are making; query volume seems to be your biggest bottleneck.
https://medium.com/code-85/two-simple-algorithms-for-chunking-a-list-in-python-dc46bc9cc1a2
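For reference, a minimal chunking helper along the lines of the linked article (the 5,000-row size is just the suggestion above); each chunk can then be passed to cursor.executemany as in the first answer:

def chunks(rows, size=5000):
    # yield successive slices of at most `size` rows
    for i in range(0, len(rows), size):
        yield rows[i:i + size]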

Python pandas not importing values as is on Oracle

I imported a txt file in my Python script and then converted it to a dataframe. Then I created a function that uses cx_Oracle to insert the data into an Oracle database faster. It works pretty well and it only took 15 min to import 1 million+ rows, but it doesn't copy the values as is. This is a chunk of that code:
sqlquery = 'INSERT INTO {} VALUES({})'.format(tablename, inserttext)
df_list = df.values.tolist()
cur = con.cursor()
cur.execute(sql_query1)
logger.info("Completed: %s", sql_query1)
for b in df_list:
    for index, value in enumerate(b):
        if isinstance(value, float) and math.isnan(value):
            b[index] = None
        elif isinstance(value, type(pd.NaT)):
            b[index] = None
Here is sample data of what I expected:

DATE                              STORE   COST          PARTIAL
16-JUN-21 08.00.00.000000000 PM   00006   +00000.0082   false
But instead this is being imported:

DATE        STORE   COST     PARTIAL
16-JUN-21   6       0.0082   F
I need it to be exactly the same, with the zeros, symbols, etc. I've already tried converting the dataframe to strings with df = df.astype(str) but it doesn't work.
Hopefully you can help!
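One thing worth checking (not shown in the question) is whether the coercion happens when the txt file is read rather than at insert time: pandas infers dtypes while parsing, so a later df = df.astype(str) only stringifies the already-converted values. A minimal sketch, assuming a comma-delimited file and a hypothetical file name:

import pandas as pd

# Read every column as text so leading zeros, the '+' sign and the full
# timestamp string survive; keep_default_na=False stops empty fields from
# becoming NaN. ('data.txt' and sep=',' are assumptions.)
df = pd.read_csv('data.txt', sep=',', dtype=str, keep_default_na=False)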
Without going into whether the schema design and architecture are really what you should be using: with this schema:
create table t (d varchar2(31), s varchar2(6), c varchar(12), p varchar(5));
and this data in t.csv:
16-JUN-21 08.00.00.000000000 PM,00006,+00000.0082,false
and this code:
import cx_Oracle
import os
import sys
import csv

if sys.platform.startswith("darwin"):
    cx_Oracle.init_oracle_client(lib_dir=os.environ.get("HOME")+"/Downloads/instantclient_19_8")

username = os.environ.get("PYTHON_USERNAME")
password = os.environ.get("PYTHON_PASSWORD")
connect_string = os.environ.get("PYTHON_CONNECTSTRING")

connection = cx_Oracle.connect(username, password, connect_string)

with connection.cursor() as cursor:

    # Predefine the memory areas to match the table definition
    cursor.setinputsizes(31, 6, 12, 5)

    # Adjust the batch size to meet your memory and performance requirements
    batch_size = 10000

    with open('t.csv', 'r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        sql = "insert into t (d, s, c, p) values (:1, :2, :3, :4)"
        data = []
        for line in csv_reader:
            data.append(line)
            if len(data) % batch_size == 0:
                cursor.executemany(sql, data)
                data = []
        if data:
            cursor.executemany(sql, data)
        connection.commit()

with connection.cursor() as cursor:
    sql = """select * from t"""
    for r in cursor.execute(sql):
        print(r)
the output is:
('16-JUN-21 08.00.00.000000000 PM', '00006', '+00000.0082', 'false')
For general reference see the cx_Oracle documentation Batch Statement Execution and Bulk Loading.

More efficient way to query this SQL table from python?

I need to query rows where a column matches my list of ~60K IDs, out of a table that contains millions of IDs. I think normally you would insert a temporary table into the database and merge on that, but I can't edit this database. I am doing it like this using a loop with a Python wrapper, but is there a better way? I mean, it works, but still:
import pyodbc
import pandas as pd

# connect to the database using windows authentication
conn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=my_fav_server;DATABASE=my_fav_db;Trusted_Connection=yes;')
cursor = conn.cursor()

# read in all the ids
ids_list = [...60K ids in here..]

# query in 10K chunks to prevent memory error
def chunks(l, n):
    # split list into n lists of evenish size
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

chunked_ids_lists = chunks(ids_list, 10000)

# looping through to retrieve all cols
for chunk_num, chunked_ids_list in enumerate(chunked_ids_lists):
    temp_ids_string = "('" + "','".join(chunked_ids_list) + "')"
    temp_sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN {temp_ids_string};"
    temp_data = pd.read_sql_query(temp_sql, conn)
    temp_path = f"temp_chunk_{chunk_num}.txt"
    temp_data.to_csv(temp_path, sep='\t', index=None)

# read the query chunks
all_data_list = []
for chunk_num in range(len(chunked_ids_lists)):
    temp_path = f"temp_chunk_{chunk_num}.txt"
    temp_data = pd.read_csv(temp_path, sep='\t')
    all_data_list.append(temp_data)

all_data = pd.concat(all_data_list)
Another way is to use Psycopg's cursor.
import psycopg2

# Connect to an existing database
conn = psycopg2.connect("dbname=test user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

# get data from the query
# no need to construct an 'SQL-correct syntax' filter by hand;
# psycopg2 adapts a Python tuple into a parenthesised list suitable for IN
cur.execute("SELECT * FROM dbo.my_fav_table WHERE ID IN %(filter)s;", {"filter": tuple(ids_list)})

# loop over the fetched rows
for record in cur:
    # we got one record
    print(record)  # or do some other data treatment
Use parameters rather than concatenating strings.
I don't see the need for the CSV files, if you're just going to read them all into Python in the next loop. Just put everything into all_data_list during the query loop.
all_data_list = []

for chunk in chunked_ids_lists:
    params = ','.join(['?'] * len(chunk))
    sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN ({params});"
    cursor.execute(sql, chunk)
    rows = cursor.fetchall()
    all_data_list.extend(rows)

all_data = pd.DataFrame(all_data_list)
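If the column names matter downstream, they can be recovered from the driver's metadata rather than lost in the plain fetchall result; a sketch, assuming pyodbc's cursor.description is still populated from the last execute:

# build the DataFrame with the column names reported by the driver
columns = [col[0] for col in cursor.description]
all_data = pd.DataFrame.from_records(all_data_list, columns=columns)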

Speeding up performance when writing from pandas to sqlite

Hoping for a few pointers on how I can optimise this code. Ideally I'd like to stick with pandas, but I assume there are some nifty SQLite tricks I can use to get a good speed-up. For additional "points", I would love to know if Cython could help at all here?
In case it's not obvious from the code, for context: I'm having to read millions of very small SQLite files (the files in "uncompressedDir") and output their contents into a much larger "master" SQLite DB ("6th jan.db").
Thanks in advance everyone!
%%cython -a

import os
import pandas as pd
import sqlite3
import time
import sys

def main():
    rootDir = "/Users/harryrobinson/Desktop/dataForMartin/"
    unCompressedDir = "/Users/harryrobinson/Desktop/dataForMartin/unCompressedSqlFiles/"

    with sqlite3.connect(rootDir+'6thJan.db') as conn:
        destCursor = conn.cursor()
        createTable = "CREATE TABLE IF NOT EXISTS userData(TimeStamp, Category, Action, Parameter1Name, Parameter1Value, Parameter2Name, Parameter2Value, formatVersion, appVersion, userID, operatingSystem)"
        destCursor.execute(createTable)

        for i in os.listdir(unCompressedDir):
            try:
                with sqlite3.connect(unCompressedDir+i) as connection:
                    cursor = connection.cursor()
                    cursor.execute('SELECT * FROM Events')
                    df_events = pd.DataFrame(cursor.fetchall())
                    cursor.execute('SELECT * FROM Global')
                    df_global = pd.DataFrame(cursor.fetchall())

                    cols = ['TimeStamp', 'Category', 'Action', 'Parameter1Name', 'Parameter1Value', 'Parameter2Name', 'Parameter2Value']
                    df_events = df_events.drop(0, axis=1)
                    df_events.columns = cols
                    df_events['formatVersion'] = df_global.iloc[0, 0]
                    df_events['appVersion'] = df_global.iloc[0, 1]
                    df_events['userID'] = df_global.iloc[0, 2]
                    df_events['operatingSystem'] = df_global.iloc[0, 3]
            except Exception as e:
                print(e, sys.exc_info()[-1].tb_lineno)

            try:
                df_events.to_sql("userData", conn, if_exists="append", index=False)
            except Exception as e:
                print("Sqlite error, {0} - line {1}".format(e, sys.exc_info()[-1].tb_lineno))
UPDATE: halved the time by adding a transaction instead of to_sql
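Presumably that means batching each file's rows through executemany and committing once, rather than calling to_sql per file; a rough sketch of that idea (column count and variable names taken from the code above, not the poster's actual change):

# sketch: one executemany + one commit per file instead of to_sql
rows = df_events.itertuples(index=False, name=None)
destCursor.executemany(
    'INSERT INTO userData VALUES (?,?,?,?,?,?,?,?,?,?,?)', rows)
conn.commit()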
Reconsider using Pandas as a staging tool (leave the library for data analysis). Simply write pure SQL queries which can be accommodated by using SQLite's ATTACH to query external databases.
with sqlite3.connect(os.path.join(rootDir, '6thJan.db')) as conn:
    destCursor = conn.cursor()

    createTable = """CREATE TABLE IF NOT EXISTS userData(
                        TimeStamp TEXT, Category TEXT, Action TEXT, Parameter1Name TEXT,
                        Parameter1Value TEXT, Parameter2Name TEXT, Parameter2Value TEXT,
                        formatVersion TEXT, appVersion TEXT, userID TEXT, operatingSystem TEXT
                     );"""
    destCursor.execute(createTable)
    conn.commit()

    for i in os.listdir(unCompressedDir):
        # parameters must be passed as a sequence; attach the full path of the current file
        destCursor.execute("ATTACH ? AS curr_db;", (os.path.join(unCompressedDir, i),))

        sql = """INSERT INTO userData
                 SELECT e.*, g.formatVersion, g.appVersion, g.userID, g.operatingSystem
                 FROM curr_db.[events] e
                 CROSS JOIN (SELECT * FROM curr_db.[global] LIMIT 1) g;"""
        destCursor.execute(sql)
        conn.commit()

        destCursor.execute("DETACH curr_db;")

Loop not working for sql update statement (mysqldb)

I have a folder called 'testfolder' that includes two files -- 'Sigurdlogfile' and '2004ADlogfile'. Each file has a list of strings called entries. I need to run my code on both of them and am using glob to do this. My code creates a dictionary for each file and stores data extracted using regex, where the dictionary keys are stored in commonterms below. Then it inserts each dictionary into a MySQL table. It does all of this successfully, but my second SQL statement is not inserting how it should (per file).
import glob
import re

files = glob.glob('/home/user/testfolder/*logfile*')

commonterms = (["freq", "\s?(\d+e?\d*)\s?"],
               ["tx", "#txpattern"],
               ["rx", "#rxpattern"], ...)

terms = [commonterms[i][0] for i in range(len(commonterms))]
patterns = [commonterms[i][1] for i in range(len(commonterms))]

def getTerms(entry):
    for i in range(len(terms)):
        term = re.search(patterns[i], entry)
        if term:
            term = term.groups()[0] if term.groups()[0] is not None else term.groups()[1]
        else:
            term = 'NULL'
        d[terms[i]] += [term]
    return d
for filename in files:
    # code to create 'entries'
    objkey = re.match(r'/home/user/testfolder/(.+?)logfile', filename).group(1)
    d = {t: [] for t in terms}
    for entry in entries:
        d = getTerms(entry)

    import MySQLdb
    db = MySQLdb.connect(host='', user='', passwd='', db='')
    cursor = db.cursor()

    cols = d.keys()
    vals = d.values()

    for i in range(len(entries)):
        lst = [item[i] for item in vals]
        csv = "'{}'".format("','".join(lst))
        sql1 = "INSERT INTO table (%s) VALUES (%s);" % (','.join(cols), csv.replace("'NULL'", "NULL"))
        cursor.execute(sql1)

    # now in my 2nd sql statement I need to update the table with data from an old table,
    # which is where I have the problem...
    sql2 = """UPDATE table, oldtable SET table.key1 = oldtable.key1,
              table.key2 = oldtable.key2 WHERE oldtable.obj = %s;""" % repr(objkey)
    cursor.execute(sql2)
    db.commit()
    db.close()
The problem is that in the second SQL statement it ends up inserting the data from only one of the objkeys into all columns of the table, but I need it to insert different data depending on which file the code is currently running on. I can't figure out why, since I've defined objkey inside my for filename in files loop. How can I fix this?
Instead of doing separate INSERT and UPDATE, do them together to incorporate the fields from the old table.
for i in range(len(entries)):
    lst = [item[i] for item in vals]
    csv = "'{}'".format("','".join(lst))
    sql1 = """INSERT INTO table (key1, key2, %s)
              SELECT o.key1, o.key2, a.*
              FROM (SELECT %s) AS a
              LEFT JOIN oldtable AS o ON o.obj = %s""" % (','.join(cols), csv.replace("'NULL'", "NULL"), repr(objkey))
    cursor.execute(sql1)
