Python MySQL Bulk Insertion Error with Character Encoding

I am starting a new project in Python with MySQL.
I am trying to insert millions of records from a CSV file into MySQL through the MySQLdb package.
My code:
import pandas as pd
import MySQLdb
#Connect with MySQL
db = MySQLdb.connect('localhost','root','****','MY_DB')
cur = db.cursor()
#Reading CSV
df = pd.read_csv('/home/shankar/LAB/Python/Rough/******.csv')
for i in df.COMPANY_NAME:
    i = i.replace("'","")
    i = i.replace("\\","")
    #i = i.encode('latin-1', 'ignore')
    cur.execute("INSERT INTO polls_company (name) VALUES ('" + i + "')")
    db.commit()
This code works fine with some CSV files, but has issues with a few others.
Errors :
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-7-aac849862588> in <module>()
13 i = i.replace("\\","")
14 #i = i.encode('latin-1', 'ignore')
---> 15 cur.execute("INSERT INTO polls_company (name) VALUES ('" + i + "')")
16 db.commit()
/home/shankar/.local/lib/python3.5/site-packages/MySQLdb/cursors.py in execute(self, query, args)
211
212 if isinstance(query, unicode):
--> 213 query = query.encode(db.unicode_literal.charset, 'surrogateescape')
214
215 res = None
UnicodeEncodeError: 'latin-1' codec can't encode character '\ufffd' in position 49: ordinal not in range(256)
This "character encoding" issue occurs only with some CSV files, but I want the insertion to work automatically with a common encoding technique,
because the CSV files come in "utf-8", "latin-1" and other encodings.
If I use utf-8, I get an error on the latin-1 files, and vice versa.
So, is there any way to handle all kinds of CSV files with a common encoding,
or any other way to solve this?
[Thanks in advance...]

I would let pandas take care of the encoding, and you don't need to loop through your DataFrame. Let's do it the pandas way:
import pandas as pd
import MySQLdb
#Connect with MySQL
db = MySQLdb.connect('localhost','root','****','MY_DB')
cur = db.cursor()
#Reading CSV
df = pd.read_csv('/home/shankar/LAB/Python/Rough/******.csv')
df.COMPANY_NAME.str.replace(r"['\\]", "", regex=True) \
  .to_frame('name') \
  .to_sql('polls_company', db, if_exists='append', index=False)
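Note that recent pandas versions only accept a SQLAlchemy connectable (or a sqlite3 connection) for to_sql, so passing the raw MySQLdb connection may warn or fail. Below is a minimal sketch, assuming a SQLAlchemy URL for the same database and a hypothetical CSV path, that also tries a couple of encodings when reading the file; because to_sql parameterizes its inserts, the quote-stripping from the original loop is not needed:
import pandas as pd
from sqlalchemy import create_engine

# Hypothetical connection URL and file path, for illustration only.
engine = create_engine('mysql+mysqldb://root:****@localhost/MY_DB?charset=utf8mb4')

def read_csv_any(path, encodings=('utf-8', 'latin-1')):
    # Try each candidate encoding until one decodes the file cleanly.
    # latin-1 decodes any byte sequence, so it acts as a last resort.
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError as err:
            last_err = err
    raise last_err

df = read_csv_any('/home/shankar/LAB/Python/Rough/companies.csv')
df.COMPANY_NAME.to_frame('name').to_sql('polls_company', engine,
                                        if_exists='append', index=False)
Whether latin-1 is the right fallback depends on how the files were produced; it will decode anything, but it can produce mojibake for files that are really in some other encoding.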

Related

Psycopg2 copy_from for CSV to Postgres

I have a CSV file that I read into pandas and am supposed to insert into Postgres. The file contains strings in some fields with the backslash "\" character. This causes a problem because the copy_from function reads it as an escape character. How do I make it ignore "\" and leave it as part of the string? I have tried many different encoding formats but I still get a "cannot decode character" error. The issue is that I cannot replace that character; it is important in the string.
def load_into_db(cur, con, file, table_name):
    f = open(file, mode="r", encoding='utf-8')
    try:
        # print("wrote to csv")
        sqlstr = "COPY {} FROM STDIN DELIMITER '|' CSV".format(table_name)
        cur.copy_from(f, table_name, null="nan", sep="|")
        con.commit()
        f.close()
    except Exception as e:
        print(e)
        print("something went wrong")
Example of the rows causing the issue:
name    age    attribute
name1   23     example/1/test
name2   26     example/2/test
error: invalid byte sequence for encoding "UTF8": 0xa2
import io
import csv

def df2db(df_a, table_name, engine):
    output = io.StringIO()
    # ignore the index
    # df_a.to_csv(output, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
    df_a.to_csv(output, sep='\t', index=False, header=False,
                quoting=csv.QUOTE_NONE, escapechar='\\')
    output.getvalue()
    # jump to start of stream
    output.seek(0)
    # engine <--- from sqlalchemy import create_engine
    connection = engine.raw_connection()
    cursor = connection.cursor()
    # null value becomes ''
    cursor.copy_from(output, table_name, null='')
    connection.commit()
    cursor.close()
Use the function df2db to insert a DataFrame into an existing table; the columns of the table and the DataFrame's columns should be the same.
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://user:psw@localhost:5432/dbname')
df = pd.read_csv(file)
df2db(df, table_name, engine)
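An alternative worth considering, assuming psycopg2: COPY in CSV format does not treat backslashes as escape characters, so streaming real CSV through copy_expert avoids the problem without needing an escapechar. A rough sketch:
import io

def df2db_csv(df_a, table_name, engine):
    # Serialize the DataFrame as regular CSV; quoting protects embedded commas.
    output = io.StringIO()
    df_a.to_csv(output, index=False, header=False)
    output.seek(0)
    connection = engine.raw_connection()
    cursor = connection.cursor()
    # In CSV format, a backslash is ordinary data, not an escape character.
    cursor.copy_expert("COPY {} FROM STDIN WITH (FORMAT csv)".format(table_name), output)
    connection.commit()
    cursor.close()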

Python: loading data from a CSV file, inserting it all into a .db, and operating on tables

I'm currently learning Python. Here is my question: I converted a .txt file to .csv and now want to insert it into a table in a database file. I have a problem with the iteration; I'm pasting the results at the bottom. How can I iterate over it? I've been struggling with this for a few days, so I don't really know how to solve the problem.
txt file (few rows):
id,id2,album,artysta
TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
TRMMMXN128F42936A5,SOZVAPQ12A8C13B63C,David Montgomery,"Symphony No. 1 G minor ""Sinfonie Serieuse""/Allegro con energia"
TRMMMLR128F1494097,SOQVRHI12A6D4FB2D7,Sasha / Turbulence,We Have Got Love
TRMMMBB12903CB7D21,SOEYRFT12AB018936C,Kris Kross,2 Da Beat Ch'yall
Python:
from io import StringIO
import pandas as pd
import numpy as np
import os
import sqlite3, csv
save_path = r"C:\Users\Maticz\Desktop\python"
# converter: txt -> csv
in_file = os.path.join(save_path, "tracks.txt")
out_file = os.path.join(save_path, "Output.csv")
#df = pd.read_csv(in_file, sep="<SEP>", engine='python')
#df.to_csv(out_file, index=False)
#print(df)
df = pd.read_csv(r'C:\Users\Maticz\PycharmProjects\zadanie\tracks.txt', delimiter='<SEP>',
                 engine='python', names=["id", "id2", "album", "artysta"])
print(df.head(5))
sv = df.to_csv(r'C:\Users\Maticz\PycharmProjects\zadanie\tracks.csv', index=None, header=True)
con = sqlite3.connect("artists.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS tabela (id TEXT, id2 TEXT, album TEXT, artysta TEXT);")
with open(r'C:\Users\Maticz\PycharmProjects\zadanie\tracks.csv', 'a+') as fin:
    dr = pd.read_csv(fin, delimiter=',', names=["id", "id2", "album", "artysta"])  # comma is default delimiter
    to_db = [(i['id'], i['id2'], i['album'], i['artysta']) for i in dr]
    cur.executemany("INSERT INTO tabela (id, id2, album, artysta) VALUES (?, ?, ?, ?);", to_db)
    con.commit()
cur.execute("SELECT * FROM artists")
print(cur.fetchall())
con.close()
Output:
id id2 album artysta
0 TRMMMYQ128F932D901 SOQMMHC12AB0180CB8 Faster Pussy cat Silent Night
1 TRMMMKD128F425225D SOVFVAK12A8C1350D9 Karkkiautomaatti Tanssi vaan
2 TRMMMRX128F93187D9 SOGTUKN12AB017F4F1 Hudson Mohawke No One Could Ever
3 TRMMMCH128F425532C SOBNYVR12A8C13558C Yerba Brava Si Vos Querés
4 TRMMMWA128F426B589 SOHSBXH12A8C13B0DF Der Mystic Tangle Of Aspens
Traceback (most recent call last):
File "C:/Users/Maticz/PycharmProjects/zadanie/main.py", line 26, in <module>
to_db = [(i['id'], i['id2'], i['album'], i['artysta']) for i in dr]
File "C:/Users/Maticz/PycharmProjects/zadanie/main.py", line 26, in <listcomp>
to_db = [(i['id'], i['id2'], i['album'], i['artysta']) for i in dr]
TypeError: string indices must be integers
Process finished with exit code 1
I appreciate any help, thank you :)
You could simplify the operation using SQLAlchemy:
from sqlalchemy import create_engine
# sqlite://<nohostname>/<path>
# where <path> is relative:
engine = create_engine('sqlite:///artists.db')
df.to_sql('tabela', con = engine, if_exists = 'append', chunksize=1000)
This removes the need to write to another CSV, since you already have the data in a pandas DataFrame. After this is done you can create a cursor to validate that the data has been written into the SQLite db file (see the sketch below).
The arguments:
if_exists='append'
appends the new data to the table, and even creates the table if it doesn't exist.
chunksize=1000
writes the records 1000 at a time (or all at once if there are fewer than 1000), then commits them, saving the data to the table.
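A minimal sketch of that validation step, assuming the same artists.db file and tabela table:
import sqlite3

# Re-open the database that to_sql wrote to and spot-check the contents.
con = sqlite3.connect("artists.db")
cur = con.cursor()
cur.execute("SELECT COUNT(*) FROM tabela")
print("rows in tabela:", cur.fetchone()[0])
cur.execute("SELECT * FROM tabela LIMIT 5")
print(cur.fetchall())
con.close()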

CSV to SQLite with utf-8

I would like to load a CSV into SQLite. My CSV contains utf-8 characters like (é, à, ü, ♀...).
These characters are displayed as ’ or é in my SQLite database. I have used con.text_factory = str but it doesn't change anything.
I have also tried .decode('utf8') as this question suggests, but I get the error: 'str' object has no attribute 'decode'
import sqlite3
import csv
import os

db = "mydb.sqlite"
con = sqlite3.connect(db)
con.text_factory = str  # allows utf-8 data to be stored
cursor = con.cursor()
csvfile = 'mycsv.csv'
tablename = os.path.splitext(os.path.basename(csvfile))[0]

with open(csvfile, 'r') as f:
    reader = csv.reader(f)
    header_line_from_csv = next(reader)
    columns = [h.strip() for h in header_line_from_csv]  # strips white space in header
    headers = ', '.join([f'{column} text' for column in columns])
    sql = f'CREATE TABLE {tablename} ({headers})'
    print(sql)
    cursor.execute(sql)
    query = 'insert into {0}({1}) values ({2})'
    query = query.format(tablename, ','.join(columns), ','.join('?' * len(columns)))
    print(query)
    cursor = con.cursor()
    for row in reader:
        cursor.execute(query, row)
    con.commit()
    print(cursor.rowcount)
    cursor.execute(f"SELECT * FROM {tablename}")
    print("fetchall:\n", cursor.fetchall())
    print(cursor.description)
con.close()
You can add this line at the beginning:
# -*- coding:utf-8 -*-
to guarantee that your script uses this encoding. Also try to open the CSV file with utf-8 encoding:
with open (csvfile, 'r', encoding='utf-8') as f:
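If characters still come out as ’, the bytes were decoded with the wrong codec somewhere along the way. Here is a minimal sketch of the whole load, assuming Python 3, the same mycsv.csv file, and that the file really is UTF-8 (use 'utf-8-sig' if it may start with a BOM):
import csv
import sqlite3

con = sqlite3.connect("mydb.sqlite")
cur = con.cursor()
# Decode the file explicitly as UTF-8; in Python 3, str values are stored
# by sqlite3 as UTF-8 without any text_factory override.
with open('mycsv.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    header = [h.strip() for h in next(reader)]
    cur.execute('CREATE TABLE IF NOT EXISTS mycsv ({})'.format(
        ', '.join('{} text'.format(col) for col in header)))
    placeholders = ','.join('?' * len(header))
    cur.executemany('INSERT INTO mycsv VALUES ({})'.format(placeholders), reader)
    con.commit()
con.close()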

How to query a unicode database with ASCII characters

I am currently running a query on my PostgreSQL database that ignores German characters (umlauts). However, I do not want to lose these characters and would rather have the German characters, or at least their equivalents (e.g. ä = ae), in the output of the query. Running Python 2.7.12.
When I change the encode object to replace or xmlcharrefreplace I get the following error:
psycopg2.ProgrammingError: syntax error at or near "?"
LINE 1: ?SELECT
Code Snippet:
# -*- coding: utf-8 -*-
connection_str = r'postgresql://' + user + ':' + password + '@' + host + '/' + database

def query_db(conn, sql):
    with conn.cursor() as curs:
        curs.execute(sql)
        rows = curs.fetchall()
        print("fetched %s rows from db" % len(rows))
        return rows

with psycopg2.connect(connection_str) as conn:
    for filename in files:
        # Read SQL
        sql = u""
        f = codecs.open(os.path.join(SQL_LOC, filename), "r", "utf-8")
        for line in f:
            sql += line.encode('ascii', 'replace').replace('\r\n', ' ')
        rows = query_db(conn, f)
How can I pass a query as a unicode object with German characters?
I also tried decoding the query as utf-8, but then I get the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128)
Here is a solution to obtain their encoded equivalent. You will be able to re-encode it later and the query will not create an error:
SELECT convert_from(BYTEA 'foo ᚠ bar'::bytea, 'latin-1');
+----------------+
| convert_from |
|----------------|
| foo á<U+009A>  bar |
+----------------+
SELECT 1
Time: 0.011s
You just need conn.set_client_encoding("utf-8") and then you can simply execute unicode strings; SQL and results will be encoded and decoded on the fly:
$ cat psycopg2-unicode.py
import sys
import os
import psycopg2
import csv

with psycopg2.connect("") as conn:
    conn.set_client_encoding("utf-8")
    for filename in sys.argv[1:]:
        file = open(filename, "r", encoding="utf-8")
        sql = file.read()
        with conn.cursor() as cursor:
            cursor.execute(sql)
            try:
                rows = cursor.fetchall()
            except psycopg2.ProgrammingError as err:
                # No results
                continue
            with open(filename+".out", "w", encoding="utf-8", newline="") as outfile:
                csv.writer(outfile, dialect="excel-tab").writerows(rows)
$ cat sql0.sql
create temporary table t(v) as
select 'The quick brown fox jumps over the lazy dog.'
union all
select 'Zwölf große Boxkämpfer jagen Viktor quer über den Sylter Deich.'
union all
select 'Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч.'
union all
select 'Mężny bądź, chroń pułk twój i sześć flag.'
;
$ cat sql1.sql
select * from t;
$ python3 psycopg2-unicode.py sql0.sql sql1.sql
$ cat sql1.sql.out
The quick brown fox jumps over the lazy dog.
Zwölf große Boxkämpfer jagen Viktor quer über den Sylter Deich.
Любя, съешь щипцы, — вздохнёт мэр, — кайф жгуч.
Mężny bądź, chroń pułk twój i sześć flag.
A Python 2 version of this program is a little bit more complicated, as we need to tell the driver that we'd like return values as unicode objects. Also, the csv module I used for output does not support unicode, so it needs a workaround. Here it is:
$ cat psycopg2-unicode2.py
from __future__ import print_function
import sys
import os
import csv
import codecs
import psycopg2
import psycopg2.extensions

psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)

with psycopg2.connect("") as conn:
    conn.set_client_encoding("utf-8")
    for filename in sys.argv[1:]:
        file = codecs.open(filename, "r", encoding="utf-8")
        sql = file.read()
        with conn.cursor() as cursor:
            cursor.execute(sql)
            try:
                rows = cursor.fetchall()
            except psycopg2.ProgrammingError as err:
                # No results from SQL
                continue
            with open(filename+".out", "wb") as outfile:
                for row in rows:
                    row_utf8 = [v.encode('utf-8') for v in row]
                    csv.writer(outfile, dialect="excel-tab").writerow(row_utf8)

Problem encoding characters with Python 2.7

It works fine with regular characters, but it doesn't work with accented characters like é, à, etc.
Here is the program:
def search():
    connection = sqlite3.connect('vocab.sqlite')
    cursor = connection.cursor()
    sql = "SELECT French, English value FROM Ami "
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    file_open = open('vraiamis.html', 'w')
    for i in data:
        a = '<a href="' + 'http://www.google.fr/#hl=fr&gs_nf=1&cp=4&gs_id=o&xhr=t&q='
        a = a + str(i[0]).encode('latin-1') + '">' + str(i[0]).encode('latin-1') + '</a>' + '<br>'
        file_open.write(a)
    file_open.close()
    webbrowser.open('vraiamis.html')
When the value in the database contains special characters like é, à, ç, it doesn't work; I get the following error message:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 1: ordinal not in range(128)
Thanks in advance for your help
Try
a = a + i[0].encode('latin-1') + '">' + i[0].encode('latin-1') + '</a>' + '<br>'
etc. Your str() calls are trying to convert the unicode values to bytestrings with the default ASCII codec, which is what raises the error; call encode('latin-1') on the unicode values directly instead.
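A tiny illustration of why the str() calls blow up (Python 2; u'caf\xe9' just stands in for a value containing é):
>>> u_name = u'caf\xe9'               # unicode value as returned by sqlite3
>>> str(u_name)                       # implicit ASCII encoding fails
Traceback (most recent call last):
  ...
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)
>>> u_name.encode('latin-1')          # encode explicitly instead
'caf\xe9'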
You can write your vraiamis.html in utf-8 encoding, so that your special characters can be encoded:
def search():
    import codecs
    connection = sqlite3.connect('vocab.sqlite')
    cursor = connection.cursor()
    sql = "SELECT French, English value FROM Ami "
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    file_open = codecs.open('vraiamis.html', 'w', encoding='utf-8')
    for i in data:
        a = u'<a href="' + u'http://www.google.fr/#hl=fr&gs_nf=1&cp=4&gs_id=o&xhr=t&q='
        a = a + i[0] + u'">' + i[0] + u'</a>' + u'<br>'
        file_open.write(a)
    file_open.close()
    webbrowser.open('vraiamis.html')
