Python UNICODE csv reader GZIPPED file - python

I have read every thread related to unicode reading, but I can't seem to get it to work.
I'm trying to read a csv which is UTF-8 encoded and also happens to have a UTF-8 BOM signature on it.
So, after opening the file, reading it with unicodecsv library, I've tried different things.
def _extract_gz(self): # fd
    # First attempt from the question: open the gzip file, then build a
    # unicodecsv reader over the decoded, line-split text.
    logging.info("Gz detected")
    self.fp = gzip.open(self.path)
    # NOTE(review): .read() is called on self.path (the value handed to
    # gzip.open above), not on the self.fp handle just opened -- confirm.
    return unicodecsv.reader(self.path.read().decode('utf-8-sig').splitlines(), encoding='utf-8')
Still fails at row 226. UnicodeEncodeError: 'ascii' codec can't encode character u'\xf1' in position 226: ordinal not in range(128)
Also tried this approach but failed as well.
def _extract_gz(self): # fd
    # Second attempt from the question: open the gzip file and return the
    # generator produced by self.unicode_csv_reader().
    logging.info("Gz detected")
    self.fp = gzip.open(self.path)
    self.f = self.unicode_csv_reader()
    return self.f
def unicode_csv_reader(self):
    # Reads everything from self.fp, decodes it with 'utf-8-sig', splits it
    # into lines and feeds those to the built-in csv.reader; every parsed
    # cell is then encoded to UTF-8 before the row is yielded.
    csv_reader = csv.reader(self.fp.read().decode('utf-8-sig').splitlines())
    for row in csv_reader:
        yield [cell.encode('utf-8', 'replace') for cell in row]
What am I doing wrong?
Thanks everyone.
Version is Python 2.7.12

The built-in csv module does not support Unicode (assuming Python 2.x), but there is a drop-in replacement unicodecsv module which does (and which you've apparently tried, unsuccessfully) and it should be fairly straightforward:
import gzip
import unicodecsv as csv
def read_csv(filename, has_bom=True, **kwargs):
    """Yield the rows of a gzip-compressed CSV file.

    filename -- path of the .gz file to read.
    has_bom  -- when true (the default), a leading UTF-8 BOM is skipped.
    kwargs   -- forwarded unchanged to csv.reader (e.g. delimiter=";").
    """
    with gzip.open(filename, "rb") as f:
        # Skip the first three bytes only when they really are the UTF-8 BOM;
        # a blind seek(3) would eat data from a file that has no BOM.
        if has_bom and f.read(3) != b"\xef\xbb\xbf":
            f.seek(0)
        reader = csv.reader(f, **kwargs)
        for row in reader:
            yield row
for row in read_csv("path/to/your.csv.gz", delimiter=";"):  # delimiter is passed on to csv.reader
    print(row)  # or do whatever you want with it
Should do the trick.
UPDATE - The above code works with your uploaded file and doesn't throw any errors (since your files are delimited by a semicolon I've added that as well), however there is a bug in the unicodecsv module - it doesn't remove quotes around the first column name when parsing a file with BOM so I've updated the code to reflect that.
When running it on your uploaded file you get the following output (YMMV, depends how your console prints unicode):
[u'Name', u'Ref', u'POS', u'POS', u'Status', u'City', u'']
[u'Hotel Flamero', u'3365', u'ES', u'0.27', u'No Change', u'Matalascañas', u'']
(the last empty entry is due to your CSV having the last entry as empty)
UPDATE#2 - Don't have a MySQL instance at hand, but you can check that it parses just fine using an in-memory SQLite DB:
import sqlite3
db = sqlite3.connect(":memory:")  # create an in-memory DB
c = db.cursor()
c.execute("CREATE TABLE test (Name TEXT, Ref TEXT, POS TEXT, Status TEXT, City TEXT)")
header = None
query = None
for row in read_csv("path/to/your.csv.gz", delimiter=";"):
    del row[-1]  # remove the last element as it's always empty
    if header is None:  # get the header first
        header = row
        # Column names cannot be bound as parameters, so they are formatted
        # into the statement once; every value is bound through a "?"
        # placeholder, which keeps apostrophes in the data from breaking
        # (or injecting into) the SQL.
        query = u"INSERT INTO test ({}) VALUES ({})".format(
            u", ".join(header),
            u", ".join([u"?"] * len(header)),
        )
        continue
    c.execute(query, row)
# now let's read our data from the DB
c.execute("SELECT * FROM test")
for row in c.fetchall():
    print(row)
which happily prints:
(u'Hotel Flamero', u'3365', u'ES', u'No Change', u'Matalascañas')

Related

Get MySQL Data Using Python. But Why Does Python Corrupt The Data?

I am working on a new project. I want to connect the database, download the file from here and upload it again after making changes. But there is a problem. When I pull the data with Python, the result should be exactly the same. However, when I open the file, I see that the spaces are removed, adds parentheses to the beginning and end, the UTF-8 structure is broken, and the lines are completely removed. Why is this happening and how can it be resolved?
My Code:
# -*- coding: utf-8 -*-
# Output file for the query result, opened as UTF-8 text.
f = open('sonuc.txt','w', encoding='utf-8')
import MySQLdb
db=MySQLdb.connect(host='host',user='usr',password='ps',db='db',)
mycursor = db.cursor()
mycursor.execute('SELECT message FROM mybb_posts WHERE pid=1;')
# fetchall() returns all rows at once; str() below writes the textual
# representation of that whole result rather than the message itself.
sonuc = mycursor.fetchall()
f.write(str(sonuc))
f.close()
The original data is as follows:
Lets Try This!
Line 2
Line 3
Try other charecter:
like "ş", "i", "ü", "ğ", "İ"
Line 6
Python result (sonuc.txt):
(('Lets Try This!\r\nLine 2\r\nLine 3\r\nTry other charecter:\r\nlike "?", "i", "ü", "?", "?"\r\nLine 6\r\n',),)
Edit:
for UTF-8 problem, add charset='utf8mb4', to MySQLdb.connect()
There's nothing corrupt about that. That's just the Python representation of a 1-tuple containing a 1-tuple containing a string, since .fetchall() returns a tuple of tuples with the columns you requested.
If you want to write the first column of each row returned by your query,
# Write the first column of every row the cursor yields, then close the file.
f.writelines(row[0] for row in mycursor)
f.close()
While you're at it, you should practice proper open hygiene:
import MySQLdb
with MySQLdb.connect(host="host", user="usr", password="ps", db="db") as db:
    mycursor = db.cursor()
    mycursor.execute("SELECT message FROM mybb_posts WHERE pid=1;")
    with open("sonuc.txt", "w", encoding="utf-8") as f:
        # one write per row, first column only
        f.writelines(row[0] for row in mycursor)

Properly encoding sc.textFile data (python 2.7)

My CSV was originally created by Excel. Anticipating encoding anomalies, I opened and re-saved the file with UTF-8 BOM encoding using Sublime Text.
Imported into the notebook:
filepath = "file:///Volumes/PASSPORT/Inserts/IMAGETRAC/csv/universe_wcsv.csv"
uverse = sc.textFile(filepath)
# Treat the first line as the header and keep every line that differs from it.
header = uverse.first()
data = uverse.filter(lambda x:x<>header)
Formatted my fields:
fields = header.replace(" ", "_").replace("/", "_").split(",")
Structured the data:
import csv
from StringIO import StringIO
from collections import namedtuple
# Record type with one field per entry in `fields`.
Products = namedtuple("Products", fields, verbose=True)
def parse(row):
    # Parse one CSV line with the built-in csv module and wrap the
    # resulting cells in a Products tuple.
    reader = csv.reader(StringIO(row))
    row = reader.next()
    return Products(*row)
products = data.map(parse)
If I then do products.first(), I get the first record as expected. However, if I want to, say, see the count by brand and so run:
products.map(lambda x: x.brand).countByValue()
I still get an UnicodeEncodeError related Py4JJavaError:
File "<ipython-input-18-4cc0cb8c6fe7>", line 3, in parse
UnicodeEncodeError: 'ascii' codec can't encode character u'\xab' in
position 125: ordinal not in range(128)
How can I fix this code?
csv module in legacy Python versions doesn't support Unicode input. Personally I would recommend using Spark csv data source:
# Read the file through Spark's CSV data source with the "header" option on.
df = spark.read.option("header", "true").csv(filepath)
# Strip each column name and replace spaces and slashes with underscores.
fields = [c.strip().replace(" ", "_").replace("/", "_") for c in df.columns]
df.toDF(*fields).rdd
For most applications Row objects should work as well as namedtuple (it extends tuple and provides similar attribute getters) but you can easily convert one into another.
You could also try reading the data without decoding:
uverse = sc.textFile(filepath, use_unicode=False)
and decoding fields manually after initial parsing:
(data
    .map(parse)
    # decode every field of the parsed record from UTF-8 afterwards
    .map(lambda prod: Products(*[x.decode("utf-8") for x in prod])))
Related question Reading a UTF8 CSV file with Python

CSV new-line character seen in unquoted field error

the following code worked until today when I imported from a Windows machine and got this error:
new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
import csv
class CSV:
    """Small helper that parses ``self.file`` with csv.reader on every call."""

    def __init__(self, file=None):
        self.file = file

    def read_file(self):
        """Return every row produced by csv.reader(self.file) as a list."""
        data = []
        file_read = csv.reader(self.file)
        for row in file_read:
            data.append(row)
        return data

    def get_row_count(self):
        """Return the number of parsed rows."""
        return len(self.read_file())

    def get_column_count(self):
        """Return the number of cells in the first parsed row."""
        new_data = self.read_file()
        return len(new_data[0])

    def get_data(self, rows=1):
        """Return the first ``rows`` parsed rows."""
        data = self.read_file()
        return data[:rows]
How can I fix this issue?
def upload_configurator(request, id=None):
    """
    A view that allows the user to configure the uploaded CSV.
    """
    upload = Upload.objects.get(id=id)
    csvobject = CSV(upload.filepath)
    # Store the row and column counts on the upload record.
    upload.num_records = csvobject.get_row_count()
    upload.num_columns = csvobject.get_column_count()
    upload.save()
    form = ConfiguratorForm()
    # Each accessor below parses the file again through CSV.read_file().
    row_count = csvobject.get_row_count()
    colum_count = csvobject.get_column_count()
    first_row = csvobject.get_data(rows=1)
    # NOTE(review): named "first_two_rows" but requests rows=5 -- confirm.
    first_two_rows = csvobject.get_data(rows=5)
It'll be good to see the csv file itself, but this might work for you, give it a try, replace:
file_read = csv.reader(self.file)
with:
file_read = csv.reader(self.file, dialect=csv.excel_tab)
Or, open a file with universal newline mode and pass it to csv.reader, like:
reader = csv.reader(open(self.file, 'rU'), dialect=csv.excel_tab)
Or, use splitlines(), like this:
def read_file(self):
    """Return all CSV rows of ``self.file`` as a list of lists.

    The file content is read in one go and split with splitlines() before
    being handed to csv.reader.
    """
    with open(self.file, 'r') as handle:
        lines = handle.read().splitlines()
        return list(csv.reader(lines))
I realize this is an old post, but I ran into the same problem and don't see the correct answer so I will give it a try
Python Error:
_csv.Error: new-line character seen in unquoted field
Caused by trying to read Macintosh (pre OS X formatted) CSV files. These are text files that use CR for end of line. If using MS Office make sure you select either plain CSV format or CSV (MS-DOS). Do not use CSV (Macintosh) as save-as type.
My preferred EOL version would be LF (Unix/Linux/Apple), but I don't think MS Office provides the option to save in this format.
For Mac OS X, save your CSV file in "Windows Comma Separated (.csv)" format.
If this happens to you on mac (as it did to me):
Save the file as CSV (MS-DOS Comma-Separated)
Run the following script
with open(csv_filename, 'rU') as csvfile:
csvreader = csv.reader(csvfile)
for row in csvreader:
print ', '.join(row)
Try to run dos2unix on your windows imported files first
This is an error that I faced. I had saved .csv file in MAC OSX.
While saving, save it as "Windows Comma Separated Values (.csv)" which resolved the issue.
This worked for me on OSX.
# allow variable to opened as files
from io import StringIO
# library to map other strange (accented) characters back into UTF-8
from unidecode import unidecode
# cleanse input file with Windows formating to plain UTF-8 string
with open(filename, 'rb') as fID:
    uncleansedBytes = fID.read()
# decode the file using the correct encoding scheme
# (probably this old windows one)
uncleansedText = uncleansedBytes.decode('Windows-1252')
# normalise line endings: CRLF first, so a Windows line break becomes a single
# '\n' rather than two, then any remaining bare carriage-return
cleansedText = uncleansedText.replace('\r\n', '\n').replace('\r', '\n')
# transliterate accented characters with unidecode
# NOTE(review): asciiText is never used below -- the reader parses
# cleansedText; confirm which of the two was intended.
asciiText = unidecode(cleansedText)
# iterate the csv rows as dicts, using the first line as the field names
reader = csv.DictReader(StringIO(cleansedText))
for line_entry in reader:
    # do something with your read data
    pass
I know this has been answered for quite some time, but none of the answers solved my problem. I am using DictReader and StringIO for my csv reading due to some other complications. I was able to solve the problem more simply by replacing the line delimiters explicitly:
# Download the payload and decode it with the charset announced in the
# response headers, falling back to 'utf8'.
with urllib.request.urlopen(q) as response:
    raw_data = response.read()
    encoding = response.info().get_content_charset('utf8')
data = raw_data.decode(encoding)
if '\r\n' not in data:
    # no CRLF anywhere, so any '\r' present stands alone: expand each one
    # to '\r\n'
    data = data.replace('\r', '\r\n')
Might not be reasonable for enormous CSV files, but worked well for my use case.
Alternative and fast solution : I faced the same error. I reopened the "wierd" csv file in GNUMERIC on my lubuntu machine and exported the file as csv file. This corrected the issue.

python unicode csv export using pyramid

I'm trying to export mongodb that has non ascii characters into csv format.
Right now, I'm dabbling with pyramid and using pyramid.response.
from pyramid.response import Response
from mycart.Member import Member
#view_config(context="mycart:resources.Member", name='', request_method="POST", permission = 'admin')
def member_export( context, request):
    # Dump the members' first and last names to a timestamped CSV file under
    # static/downloads and return that file as a download response.
    filename = 'member-'+time.strftime("%Y%m%d%H%M%S")+".csv"
    download_path = os.getcwd() + '/MyCart/mycart/static/downloads/'+filename
    # NOTE(review): the import above brings in Member, but Members is
    # instantiated here -- confirm the name.
    member = Members(request)
    # Header row: a single cell whose text contains a comma.
    my_list = [['First Name,Last Name']]
    record = member.get_all_member( )
    for r in record:
        mystr = [ r['fname'], r['lname']]
        my_list.append(mystr)
    with open(download_path, 'wb') as f:
        fileWriter = csv.writer(f, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for l in my_list:
            print(l)
            fileWriter.writerow(l)
    size = os.path.getsize(download_path)
    response = Response(content_type='application/force-download', content_disposition='attachment; filename=' + filename)
    # The response streams the file that was just written.
    response.app_iter = open(download_path , 'rb')
    response.content_length = size
    return response
In mongoDB, first name is showing 王, when I'm using print, it too is showing 王. However, when I used excel to open it up, it shows random stuff - ç¾…
However, when I tried to view it in shell
$ more member-20130227141550.csv
It managed to display the non ascii character correctly.
How should I rectify this problem?
I'm not a Windows guy, so I am not sure whether the problem may be with your code or with excel just not handling non-ascii characters nicely. But I have noticed that you are writing your file with python csv module, which is notorious for headaches with unicode.
Other users have reported success with using unicodecsv as a replacement for the csv module. Perhaps you could try dropping in this module as a csv writer and see if your problem magically goes away.

Python CSV DictReader with UTF-8 data

AFAIK, the Python (v2.6) csv module can't handle unicode data by default, correct? In the Python docs there's an example on how to read from a UTF-8 encoded file. But this example only returns the CSV rows as a list.
I'd like to access the row columns by name as it is done by csv.DictReader but with UTF-8 encoded CSV input file.
Can anyone tell me how to do this in an efficient way? I will have to process CSV files in 100's of MByte in size.
I came up with an answer myself:
def UnicodeDictReader(utf8_data, **kwargs):
    """Yield each row of UTF-8 encoded CSV data as a dict of unicode strings.

    utf8_data -- iterable of UTF-8 encoded lines (e.g. an open file).
    kwargs    -- forwarded unchanged to csv.DictReader.
    """
    def decode(item):
        # DictReader pads short rows with restval (None by default) and puts
        # surplus cells as a list under restkey, so only byte strings are
        # decoded; anything else is passed through.
        if isinstance(item, list):
            return [decode(part) for part in item]
        if isinstance(item, str):
            return unicode(item, 'utf-8')
        return item

    csv_reader = csv.DictReader(utf8_data, **kwargs)
    for row in csv_reader:
        yield {decode(key): decode(value) for key, value in row.iteritems()}
Note: This has been updated so keys are decoded per the suggestion in the comments
For me, the key was not in manipulating the csv DictReader args, but the file opener itself. This did the trick:
with open(filepath, mode="r", encoding="utf-8-sig", newline="") as csv_file:
    # "utf-8-sig" drops a leading BOM when there is one; newline="" leaves
    # line-ending handling to the csv module, as its documentation asks.
    csv_reader = csv.DictReader(csv_file)
No special class required. Now I can open files either with or without BOM without crashing.
First of all, use the 2.6 version of the documentation. It can change for each release. It says clearly that it doesn't support Unicode but it does support UTF-8. Technically, these are not the same thing. As the docs say:
The csv module doesn’t directly support reading and writing Unicode, but it is 8-bit-clean save for some problems with ASCII NUL characters. So you can write functions or classes that handle the encoding and decoding for you as long as you avoid encodings like UTF-16 that use NULs. UTF-8 is recommended.
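The example below (from the docs) shows how to create two functions that correctly read text as UTF-8 as CSV. You should know that csv.reader() returns a plain reader object that yields each row as a list of strings; it is csv.DictReader that yields rows as dicts keyed by column name.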
import csv
def utf_8_encoder(unicode_csv_data):
    """Yield each unicode line of *unicode_csv_data* encoded as UTF-8."""
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """Yield each CSV row as a list of unicode cells.

    unicode_csv_data -- iterable of unicode lines.
    dialect, kwargs  -- forwarded unchanged to csv.reader.
    """
    # csv.py doesn't do Unicode; encode temporarily as UTF-8.
    # csv.reader yields plain lists; iterating a DictReader row here would
    # only walk the dict's keys, i.e. the header names.
    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
                            dialect=dialect, **kwargs)
    for row in csv_reader:
        # decode UTF-8 back to Unicode, cell by cell:
        yield [unicode(cell, 'utf-8') for cell in row]
A class-based approach to @LMatter's answer: with this approach you still get all the benefits of DictReader, such as getting the fieldnames and the line number, plus it handles UTF-8.
import csv
class UnicodeDictReader(csv.DictReader, object):
    """csv.DictReader whose rows come back with unicode keys and values."""

    @staticmethod
    def _decode(item):
        # DictReader pads short rows with restval (None by default) and puts
        # surplus cells as a list under restkey, so only byte strings are
        # decoded; anything else is passed through.
        if isinstance(item, list):
            return [UnicodeDictReader._decode(part) for part in item]
        if isinstance(item, str):
            return unicode(item, 'utf-8')
        return item

    def next(self):
        """Return the next row as a dict decoded from UTF-8."""
        row = super(UnicodeDictReader, self).next()
        decode = self._decode
        return {decode(key): decode(value) for key, value in row.iteritems()}
That's easy with the unicodecsv package.
# pip install unicodecsv
import unicodecsv as csv
# Open in binary mode: the reader does the decoding itself, and the csv
# documentation asks for the 'b' flag wherever it makes a difference.
with open('your_file.csv', 'rb') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row)
The csvw package has other functionality as well (for metadata-enriched CSV for the Web), but it defines a UnicodeDictReader class wrapping around its UnicodeReader class, which at its core does exactly that:
class UnicodeReader(Iterator):
"""Read Unicode data from a csv file."""
[…]
def _next_row(self):
self.lineno += 1
return [
s if isinstance(s, text_type) else s.decode(self._reader_encoding)
for s in next(self.reader)]
It did catch me off a few times, but csvw.UnicodeDictReader really, really needs to be used in a with block and breaks otherwise. Other than that, the module is nicely generic and compatible with both py2 and py3.
The answer doesn't have the DictWriter methods, so here is the updated class:
class DictUnicodeWriter(object):
    """DictWriter-style CSV writer for rows holding unicode values.

    Each row is first written as UTF-8 into an in-memory buffer by
    csv.DictWriter, then re-encoded to ``encoding`` and written to ``f``.
    """

    def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
        self.stream = f
        self.fieldnames = fieldnames  # list of keys for the dict
        # csv.DictWriter writes into this scratch buffer, never into f directly
        self.queue = cStringIO.StringIO()
        self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Encode one row dict and append it to the target stream."""
        utf8_row = dict((key, value.encode("utf-8")) for key, value in row.items())
        self.writer.writerow(utf8_row)
        # pull the UTF-8 text back out of the buffer, transcode it to the
        # target encoding and hand it to the real stream
        buffered = self.queue.getvalue()
        self.stream.write(self.encoder.encode(buffered.decode("utf-8")))
        # reset the scratch buffer for the next row
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row dict in *rows*, in order."""
        for entry in rows:
            self.writerow(entry)

    def writeheader(self):
        """Write a header row in which each field name maps to itself."""
        self.writerow(dict((name, name) for name in self.fieldnames))

Categories

Resources