I have a script that reads some files one by one, cleans them, and inserts them into a Postgres database.
I tried to use Python multiprocessing with Pool, but I found that CPU usage only sometimes reaches 30% and sits at around 6% most of the time, so it's really slow.
Any suggestions for speeding it up?
Thank you
import os
import multiprocessing
import psycopg2

path = 'data/'
arr = os.listdir(path)

connection = psycopg2.connect(
    user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
)
cursor = connection.cursor()

postgres_insert_query = """ INSERT INTO mobile (data1, data2) VALUES (%s,%s)
ON CONFLICT (data1)
DO
UPDATE SET data2 = EXCLUDED.data1 ;"""

def insert_data(key, record_to_insert, item):
    print(key)
    try:
        cursor.executemany(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into mobile table", item)
    except (Exception, psycopg2.Error) as error:
        print("Failed to insert record into mobile table", error)

i = 1

def process_data(item):
    print(item)
    global i
    records = []
    i += 1
    with open(path + item, 'r') as file:
        for line in file:
            line = dataCleansing(line)
            records.append((line + '-' + str(i), 'data2-' + str(i) + line))
            if len(records) == 50000:
                insert_data(i, records, item)
                records = []
    insert_data(i, records, item)
    records = []

if __name__ == '__main__':
    a_pool = multiprocessing.Pool(6)
    result = a_pool.map(process_data, arr)
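One likely culprit for the low CPU usage: the connection and cursor are created once in the parent process and inherited by all six workers, so every insert funnels through a single shared session. A minimal sketch of one way to give each worker its own connection, via the pool's initializer (it reuses process_data and insert_data from above; the connection parameters are the same placeholders):

import os
import multiprocessing
import psycopg2

def init_worker():
    # runs once in each worker process, so every worker gets
    # its own connection and cursor instead of a shared one
    global connection, cursor
    connection = psycopg2.connect(
        user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
    )
    cursor = connection.cursor()

if __name__ == '__main__':
    a_pool = multiprocessing.Pool(6, initializer=init_worker)
    result = a_pool.map(process_data, os.listdir('data/'))

If the database is still the bottleneck, bulk loading with COPY (psycopg2's copy_from or copy_expert) is usually much faster than executemany for this kind of insert-heavy workload.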
I am new to Python and started off with sqlite.
I have two CSV files, transactions.csv and users.csv, from which I am reading the data and writing to the sqlite database. Below is the snippet:
import csv
import sqlite3 as db

def readCSV_users():
    with open('users.csv', mode='r') as data:
        dr = csv.DictReader(data, delimiter=',')
        users_data = [(i['user_id'], i['is_active']) for i in dr if i['is_active']=='True']
    #---------------------
    return users_data

def readCSV_transactions():
    with open('transactions.csv', mode='r') as d:
        dr = csv.DictReader(d, delimiter=',')
        trans_data = [(i['user_id'], i['is_blocked'], i['transaction_amount'], i['transaction_category_id']) for i in dr if i['is_blocked']=='False']
    #---------------------
    return trans_data

def SQLite_connection(database):
    try:
        # connect to the database
        conn = db.connect(database)
        print("Database connection is established successfully!")
        conn = db.connect(':memory:')
        print("Established database connection to a database\
            that resides in the memory!")
        cur = conn.cursor()
        return cur, conn
    except exception as Err:
        print(Err)

def dbQuery(users_data, trans_data, cur, conn):
    try:
        cur.executescript(""" CREATE TABLE if not exists users(user_id text,is_active text);
            CREATE TABLE if not exists transactions(user_id text,is_blocked text,transaction_amount text,transaction_category_id text);
            INSERT INTO users VALUES (?,?),users_data;
            INSERT INTO transactions VALUES (?,?,?,?),trans_data""")
        conn.commit()
        a = []
        rows = curr.execute("SELECT * FROM users").fetchall()
        for r in rows:
            a.append(r)
        return a
    except Err:
        print(Err)
    finally:
        conn.close()

if __name__ == "__main__":
    database = 'uit'
    users_data = readCSV_users()
    trans_data = readCSV_transactions()
    curr, conn = SQLite_connection(database)
    print(dbQuery(users_data, trans_data, curr, conn))
But I am facing the below error. I believe the ? is throwing the error in executescript:
cur.executescript(""" CREATE TABLE if not exists users(user_id text,is_active text);
sqlite3.OperationalError: near "users_data": syntax error
Any pointers to resolve this?
Putting users_data directly in the query is wrong: it is treated as part of the SQL string, not as data.
And it seems executescript() can't take arguments at all.
You would have to put the values directly in place of the ?,
or split the inserts out and use executemany() (the data are lists of row tuples, so executemany() rather than execute()):
cur.executemany("INSERT INTO users VALUES (?,?);", users_data)
cur.executemany("INSERT INTO transactions VALUES (?,?,?,?);", trans_data)
I know there are some other posts out there, but I was not able to find the specific question I had in mind.
I'm using the US_baby_names CSV file and want to import it line by line into sqlite3 as a table.
I'm able to create the table, called storage.
I'm then trying to read lines in the CSV file and put them into that table, but I must be doing something wrong.
import sqlite3 as sql
from sqlite3 import Error
import csv

def CreateConnection(dbFileName):
    try:
        conn = sql.connect(dbFileName)
        return conn
    except Error as e:
        print(e)
    return None

def CreateNew(dbConnection, new):
    sql = """INSERT INTO storage (dat, Id, Name, Year, group, subgroup, Count)
             VALUES (?,?,?,?,?,?,?)"""
    try:
        cursor = dbConnection.cursor()
        cursor.execute(sql, new)
        return cursor.lastrowid
    except Error as e:
        print(e)

def Main():
    database = "storage.db"
    dbConnection = CreateConnection(database)
    with open('storage.csv', 'rb') as fin:
        dr = csv.DictReader(fin)
        to_db = [(i['dat'], i['Id'], i['Name'], i['Year'], i['group'], i['subgroup'], i['Count']) \
                 for i in dr]
    cursor.executemany(CreateNew(sql, to_db))
    dbConnection.close()

if __name__ == "__main__":
    Main()
I believe my cursor.executemany is wrong, but I'm not able to figure out what else to do.
Thanks
You are almost right with much of your code, but:
in cursor.execute(sql, new) you are passing an iterable, new, to Cursor.execute(), which expects a single set of parameters; an iterable of parameter sets is what Cursor.executemany() is for.
Moreover, the result of CreateNew() is an integer, lastrowid, and you pass that result to executemany().
You must use Connection.commit() to save the changes to the database, and Connection.rollback() to discard them.
You must open the file for the csv.DictReader class as a text file, in r or rt mode.
Finally, remember that sqlite3.Connection is a context manager, so you can use it in a with statement.
This should be your desired outcome:
import sqlite3 as sql
from sqlite3 import Error
import csv

def create_table(conn):
    sql = "CREATE TABLE IF NOT EXISTS baby_names("\
          "dat TEXT,"\
          "Id INTEGER PRIMARY KEY,"\
          "Name TEXT NOT NULL,"\
          "Year INTEGER NOT NULL,"\
          "Gender TEXT NOT NULL,"\
          "State TEXT NOT NULL,"\
          "Count INTEGER)"
    conn.execute(sql)
    conn.execute("DELETE FROM baby_names")

def select_all(conn):
    for r in conn.execute("SELECT * FROM baby_names").fetchall():
        print(r)

def execute_sql_statement(conn, data):
    sql = "INSERT INTO baby_names "\
          "(dat, Id, Name, Year, Gender, State, Count) "\
          "VALUES (?,?,?,?,?,?,?)"
    try:
        cursor = conn.executemany(sql, data)
    except Error as e:
        print(e)
        conn.rollback()
        return None
    else:
        conn.commit()
        return cursor.lastrowid

def main():
    with sql.connect('baby_names.db') as conn, open('US_Baby_Names_right.csv', 'r') as fin:
        create_table(conn)
        dr = csv.DictReader(fin)
        data = [(i['dat'], i['Id'], i['Name'], i['Year'], i['Gender'], i['State'], i['Count']) for i in dr]
        lastrowid = execute_sql_statement(conn, data)
        select_all(conn)

main()
I added a create_table() function just to test my code. I also made up a sample test file as follows:
dat,Id,Name,Year,Gender,State,Count
1,1,John,1998,M,Washington,2
2,2,Luke,2000,M,Arkansas,10
3,3,Carrie,1999,F,Texas,3
The output of the select_all() function is:
('1',1,'John',1998,'M','Washington',2)
('2',2,'Luke',2000,'M','Arkansas',10)
('3',3,'Carrie',1999,'F','Texas',3)
I have an SQLite DB file and I am parsing the data from each column in a table of the DB to a .txt file. At the moment it writes the column contents to the file, but it won't pull the column names and write those. How can I go about it? I have tried to use this guide: Is there a way to get a list of column names in sqlite?, but I cannot seem to get it to work. Here is my code, with an attempt at pulling the column names from the table.
import sqlite3
from sqlite3 import Error

# create a database connection to the SQLite database specified by the db_file
def create_connection(db_file, detect_types=sqlite3.PARSE_DECLTYPES):
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)
    return None

# Query specific rows in the sms table
def select_data(conn):
    cur = conn.cursor()
    cur.execute("SELECT _id, address, strftime('%d-%m-%Y', date / 1000, 'unixepoch'),read, type, body, seen FROM sms")
    print("Writing the contents of the sms table to an evidence file")
    print("\t")

    # Trying to pull out column names from db table
    def get_col_names():
        conn = sqlite3.connect("mmssms.db")
        c = conn.cursor()
        c.execute("SELECT _id, address, strftime('%d-%m-%Y', date / 1000, 'unixepoch'),read, type, body, seen FROM sms")
        return [member[0] for member in c.description]

    # Write the data to a smsEvidence.txt file
    with open('EvidenceExtractionFiles/smsInfo.txt', 'a+') as f:
        rows = cur.fetchall()
        for row in rows:
            # print(row)
            f.write("%s\n" % str(row))
    print("SMS Data is written to the evidence File")

# path to where the db files are stored
def main():
    database = "H:\College Fourth Year\Development Project\Final Year Project 2018\mmssms.db"
    # create a database connection
    conn = create_connection(database)
    with conn:
        # print("Query specific columns")
        select_data(conn)
    # close db connection
    if (conn):
        conn.close()
        print("Database closed")

if __name__ == '__main__':
    main()
You may use cursor.description, which holds info about the column names:
[ ... ]
cur = cursor.execute('SELECT * FROM test_table LIMIT 100')
col_names = [name[0] for name in cur.description]
print(col_names)
[ ... ]
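Applied to the sms table from the question, a minimal sketch could look like this (the strftime() column is left out so the names in cur.description stay plain; file and table names as in the question):

import sqlite3

conn = sqlite3.connect("mmssms.db")
cur = conn.cursor()
cur.execute("SELECT _id, address, read, type, body, seen FROM sms")

# each entry of cur.description is a 7-tuple whose first element is the column name
col_names = [d[0] for d in cur.description]

with open('EvidenceExtractionFiles/smsInfo.txt', 'a+') as f:
    f.write("%s\n" % str(col_names))  # header line with the column names
    for row in cur.fetchall():
        f.write("%s\n" % str(row))

conn.close()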
My use case is to create a temp table in the Postgres database, fetch records from it, and insert them into a different table.
The code I used is:
from __future__ import print_function
import psycopg2
import sys
import pprint
from os.path import join, dirname, abspath
import xlrd
import os.path

newlist = []
itemidlist = []

def main():
    conn_string = "host='prod-dump.cvv9i14mrv4k.us-east-1.rds.amazonaws.com' dbname='ebdb' user='ebroot' password='*********'"
    # print the connection string we will use to connect
    # print "Connecting to database" % (conn_string)
    # get a connection; if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
    dealer_id = input("Please enter dealer_id: ")
    group_id = input("Please enter group_id: ")
    scriptpath = os.path.dirname('__file__')
    filename = os.path.join(scriptpath, 'Winco - Gusti.xlsx')
    xl_workbook = xlrd.open_workbook(filename, "rb")
    xl_sheet = xl_workbook.sheet_by_index(0)
    print('Sheet Name: %s' % xl_sheet.name)
    row = xl_sheet.row(0)
    from xlrd.sheet import ctype_text
    print('(Column #) type:value')
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        # print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value))
    num_cols = xl_sheet.ncols
    for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
        num_cols = xl_sheet.ncols
        id_obj = xl_sheet.cell(row_idx, 1)  # Get cell object by row, col
        itemid = id_obj.value
        # if itemid not in itemidlist:
        itemidlist.append(itemid)
    # execute our Query
    '''
    cursor.execute("""
    if not exists(SELECT 1 FROM model_enable AS c WHERE c.name = %s);
    BEGIN;
    INSERT INTO model_enable (name) VALUES (%s)
    END;
    """ % (itemid, itemid))
    '''
    cursor.execute("drop table temp_mbp1")
    try:
        cursor.execute("SELECT p.model_no, pc.id as PCid, g.id AS GROUPid into public.temp_mbp1 FROM products p, \
            model_enable me, products_clients pc, groups g WHERE p.model_no = me.name \
            and p.id = pc.product_id and pc.client_id = %s and pc.client_id = g.client_id and g.id = %s" \
            % (dealer_id, group_id))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    cursor.execute("select count(*) from public.temp_mbp1")
    # retrieve the records from the database
    records = cursor.fetchall()
    # print out the records using pretty print
    # note that the NAMES of the columns are not shown, instead just indexes.
    # for most people this isn't very useful so we'll show you how to return
    # columns as a dictionary (hash) in the next example.
    pprint.pprint(records)

if __name__ == "__main__":
    main()
The try/except block in the middle of the program is not throwing any error, but the table is not getting created in the Postgres database, as I see in the data admin.
The output shown is:
Please enter dealer_id: 90
Please enter group_id: 13
Sheet Name: Winco Full 8_15_17
(Column #) type:value
[(3263,)]
Thanks,
Santosh
You didn't commit the changes, so they aren't saved in the database. Add to the bottom, just below the pprint statement:
conn.commit()
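As an aside, a psycopg2 connection can also be used as a context manager, which commits the transaction automatically if the block exits without an exception (a sketch, assuming the conn_string from the question):

import psycopg2

with psycopg2.connect(conn_string) as conn:
    with conn.cursor() as cursor:
        cursor.execute("select count(*) from public.temp_mbp1")
        records = cursor.fetchall()
# the transaction is committed here, though the connection itself stays open
# and still needs conn.close() when you are done with it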
I'm migrating a script from another language to Python. I watered it down on the specifics of the database calls etc., but this is what the file looks like. I intentionally made some queries fail as I was testing the transaction, and it did not rollback() the queries executed prior to the forced error. I am a little confused about how transactions work with Python; the example I followed was this one: a loop with several queries nested within transactions, so I adapted the code according to what I understood from it.
#!/usr/bin/python
import MySQLdb
import thread
import os

# Open database connection
# added local_infile=1 to allow the import to work, otherwise you get an error
db = MySQLdb.connect(CONNECTION ARGS...)

# define our function that will be called from our thread
def import_queued_file(conn, distid):
    # prepare a cursor object using cursor() method
    cursor = conn.cursor()
    # total lines imported for all files for a distributor
    total_lines_imported = 0
    # current lines imported for each file on each iteration
    current_lines_imported = 0
    # default this to 0, this will have the total lines for our imports on each iteration
    previous_lines_imported = 0
    # initialize the file exists flag to 0
    file_exists = 0
    # sql statement to retrieve the file(s) for a specific distributor
    sql = """
        SELECT
        ...
        FROM ...
        WHERE ...
    """
    # execute the sql statement
    cursor.execute(sql)
    # if we have records, execute the code below
    if (cursor.rowcount > 0):
        # set the records to the files variable
        files = cursor.fetchall()
        # set a variable to count iterations
        # we'll use this to determine if we need to drop the table
        cnt = 0
        # keep track of the total number of lines imported per distributor (may be multiple files)
        lines_imported = 0
        # loop the recordset
        for col in files:
            # increment the cnt variable
            cnt += 1
            # set file_exists to 0 at the beginning of the iteration
            file_exists = 0
            # set some variables to be used in our sql load data statement
            var1 = col[1]
            var2 = col[2]
            ....
            # this is the path of our file that we will be using for MySQL LOAD DATA also
            # TODO: REFACTOR SO THAT THE /home/webex/backup/ IS NOT HARD CODED
            inventoryfile = "/path/to/file/%s" % (filepath)
            # check to see if we have a file
            if (os.path.exists(inventoryfile)):
                try:
                    # set file exists to true
                    file_exists = 1
                    # if cnt > 1, it means we have more than 1 file for this distributor
                    # only drop the table if this is the first iteration
                    if (cnt == 1):
                        # drop table sql statement
                        sql = "DROP TABLE IF EXISTS %s" % (temptable)
                        # execute the sql command
                        cur = conn.cursor()
                        cur.execute(sql)
                        cur.close()
                    # assign the create table statement to the sql variable
                    sql = """
                        CREATE TABLE IF NOT EXISTS
                        .......
                        .......
                        ) ENGINE=MyISAM DEFAULT CHARSET=utf8
                    """ % (temptable)
                    # execute the sql statement
                    cur = conn.cursor()
                    cur.execute(sql)
                    cur.close()
                    # query the temptable to see if we have any records
                    sql = "SELECT COUNT(0) AS total FROM %s" % (temptable)
                    cur = conn.cursor()
                    cur.execute(sql)
                    # get the count of how many records exist in the database
                    # (fetch before closing the cursor)
                    number_of_line_items = cur.fetchall()
                    cur.close()
                    previous_lines_imported = number_of_line_items[0][0]
                    # load data local infile sql statement
                    sql = """
                        LOAD DATA LOCAL INFILE ...
                    """
                    # execute the load data infile sql statement
                    cur = conn.cursor()
                    cur.execute(sql)
                    cur.close()
                    # clean up the table by removing...
                    # rows that don't have a part_number,
                    # rows that have part_number's less than 3 characters
                    sql = """
                        DELETE FROM ...
                    """ % (temptable)
                    # execute the delete query
                    cur = conn.cursor()
                    cur.execute(sql)
                    cur.close()
                    # query the temptable to see if we have any records after the import
                    sql = "SELECT COUNT(0) AS total FROM %s" % (temptable)
                    # execute the count query
                    cur = conn.cursor()
                    cur.execute(sql)
                    # get the count of how many records exist in the database after the import
                    # (again, fetch before closing the cursor)
                    number_of_line_items = cur.fetchall()
                    cur.close()
                    # get the current lines imported
                    current_lines_imported = number_of_line_items[0][0] - previous_lines_imported
                    # add the current lines imported to the total lines imported
                    total_lines_imported += current_lines_imported
                    # update distributor_file_settings table last_updated_on field
                    sql = """
                        UPDATE ...
                    """ % (file_id, distributor__id)
                    print sql
                    # execute the update query
                    cur = conn.cursor()
                    cur.execute(sql)
                    cur.close()
                    # commit the changes
                    conn.commit()
                except:
                    conn.rollback()
    # no records exist for this distributor
    else:
        print "dist doesn't exist"
    cursor.close()

import_queued_file(db, 42)

# prepare a cursor object using cursor() method
cursor = db.cursor()
# select distinct file settings
sql = """
    SELECT ...
"""

# disconnect from server
db.close()
After reviewing the code again and again, the issue turned out to be the table type. After changing it to InnoDB, it worked as expected.
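For reference, that amounts to changing the ENGINE clause of the CREATE TABLE statement in the script, roughly like this (temptable as in the original code):

# MyISAM ignores transactions, so the earlier rollback() calls had no effect;
# InnoDB is transactional and honors commit()/rollback()
sql = """
    CREATE TABLE IF NOT EXISTS
    .......
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""" % (temptable)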