export data from csv file containing unicode characters - python

I would like to export data from a csv file which contains unicode strings.
Previously I tried a Python script which works fine for ASCII data only. But it won't support unicode stuff either:
#! /usr/bin/env python
import csv
csv.register_dialect('custom',delimiter=','
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = csv.reader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
Traceback (most recent call last): for record in data: _csv.Error:
line contains NULL byte

use this unicode-csv library instead
https://github.com/jdunck/python-unicodecsv
import unicodecsv as csv
with open('input.csv') as ifile:
rows = [row for row in csv.reader(ifile, encoding='utf-8')]
print rows

You can wrap the csv.reader in a class to handle it for you. The following is taken from the csv documentation examples and works for me:
#! /usr/bin/env python
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
csv.register_dialect('custom', delimiter=',',
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = UnicodeReader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
There is also a UnicodeWriter class there if you need that functionality.

It seems you are using Python 3. Follow the very first code example in the docs:
#!/usr/bin/env python3
import csv
with open('input.csv', newline='', encoding=encoding) as csvfile:
reader = csv.reader(csvfile, dialect="custom")
for row in reader:
print(", ".join(row))
where "custom" dialect is defined in the code in your question and encoding is the character encoding of your file such as "utf-16". If you omit encoding argument; the encoding returned by locale.getpreferredencoding(False) is used.

Related

Python encoding in Windows command line: Chcp 932 doesn't work?

I've looked at other answers, and have done what they recommended:
1. Changed system locale to Japanese
2. Chcp 932 (Japanese)
3. Python file saved as UTF-8
4. All inputs are subject to the unicode(input, 'utf-8') function as seen below.
NOTE: I've also tried using chcp 65001, but this doesn't work either.
I'm trying to read a csv file in Japanese, but the following error keeps coming up.
Traceback (most recent call last):
...
...
UnicodeEncodeError: 'cp932' codec can't encode character u'\ufeff' in position 0: illegal multibyte sequence
My code and sample file contents:
def setFood(self):
reader = self.unicode_csv_reader(open("food.csv"))
aDict = {}
for field1, field2 in reader:
if field2 not in aDict.keys():
aDict[field2] = [field1]
else:
aDict[field2] += [field1]
return aDict
def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs):
reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
for row in reader:
yield [unicode(cell, 'utf-8') for cell in row]
def recFood(self, inp):
print inp
for key in self.foodDict.keys():
for value in self.foodDict[key]:
print(key)
print(value)
Sample csv
ヤクルト,飲み物
カキフライ,洋食
エビフライ,洋食
豚カツ,洋食
The example at the bottom of the Python 2.7 csv module documentation is what you want, but use utf-8-sig for the encoding. The \ufeff is a byte order mark (BOM) character and that encoding will handle it correctly if present.
You will need to have the Japanese system locale to print in the Windows console. Better, switch to Python 3.6 which will print in the console using Unicode APIs...all you need is a font that supports Japanese. The csv module in Python 3 is Unicode-aware as well and works much better.
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
with open('food.csv','rb') as f:
r = UnicodeReader(f)
for key,value in r:
print key,value
ヤクルト 飲み物
カキフライ 洋食
エビフライ 洋食
豚カツ 洋食

decoding Ascii 7 bits to a readable UTF8 .CSV file

I'd like someone to help me with part of my code, there is a problem on the output file that should come out in .csv format using unicode, easy to read on excel. The problem is that the output file comes out without format and the text in it comes in ASCII (7bit).
I really apreaciate your help i've been on this for 4 hours now and can't find the problem yet :/
The last part of the script:
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8").replace("\n"," ").replace("\r"," ").replace("\t",'') for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
Python Version is 2.7 on windows 10
is in Ascii
Writing .csv format using unicode, for instance:
import io, csv
outfile = 'test/out.csv'
fieldnames = ['field1', 'field2']
content_dict = {'field1':'John', 'field2':'Doo'}
with io.open(outfile, 'w', newline='', encoding='utf-8') as csv_out:
writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
writer.writeheader()
for row_dict in content_dict:
writer.writerow(row_dict)

I want an **unicode type** of the following code ::

I got the below code from SO expert but it's working for ANSI Strings and my input is UNICODE STRING. How to make this code work for both of the versions? TIA
import csv
from collections import defaultdict
summary = defaultdict(list)
csvin = csv.reader(open('qwetry.txt'), delimiter='|')
for row in csvin:
summary[(row[1].split()[0], row[2])].append(int(row[5]))
csvout = csv.writer(open('datacopy.out','wb'), delimiter='|')
for who, what in summary.iteritems():
csvout.writerow( [' '.join(who), len(what), sum(what)] )
courtsey: Jon Clements
The csv module doesn’t directly support reading and writing Unicode. you can find the details here. The generator for the same is as below::
import csv
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
dialect=dialect, **kwargs)
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')

Trouble with UTF-8 CSV input in Python

This seems like it should be an easy fix, but so far a solution has eluded me. I have a single column csv file with non-ascii chars saved in utf-8 that I want to read in and store in a list. I'm attempting to follow the principle of the "Unicode Sandwich" and decode upon reading the file in:
import codecs
import csv
with codecs.open('utf8file.csv', 'rU', encoding='utf-8') as file:
input_file = csv.reader(file, delimiter=",", quotechar='|')
list = []
for row in input_file:
list.extend(row)
This produces the dread 'codec can't encode characters in position, ordinal not in range(128)' error.
I've also tried adapting a solution from this answer, which returns a similar error
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
for row in csv_reader:
yield [unicode(cell, 'utf-8') for cell in row]
filename = 'inputs\encode.csv'
reader = unicode_csv_reader(open(filename))
target_list = []
for field1 in reader:
target_list.extend(field1)
A very similar solution adapted from the docs returns the same error.
def unicode_csv_reader(utf8_data, dialect=csv.excel):
csv_reader = csv.reader(utf_8_encoder(utf8_data), dialect)
for row in csv_reader:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
filename = 'inputs\encode.csv'
reader = unicode_csv_reader(open(filename))
target_list = []
for field1 in reader:
target_list.extend(field1)
Clearly I'm missing something. Most of the questions that I've seen regarding this problem seem to predate Python 2.7, so an update here might be useful.
Your first snippet won't work. You are feeding unicode data to the csv reader, which (as documented) can't handle it.
Your 2nd and 3rd snippets are confused. Something like the following is all that you need:
f = open('your_utf8_encoded_file.csv', 'rb')
reader = csv.reader(f)
for utf8_row in reader:
unicode_row = [x.decode('utf8') for x in utf8_row]
print unicode_row
At it fails from the first char to read, you may have a BOM. Use codecs.open('utf8file.csv', 'rU', encoding='utf-8-sig') if your file is UTF8 and has a BOM at the beginning.
I'd suggest trying just:
input_file = csv.reader(open('utf8file.csv', 'r'), delimiter=",", quotechar='|')
or
input_file = csv.reader(open('utf8file.csv', 'rb'), delimiter=",", quotechar='|')
csv should be unicode aware, and it should just work.

Python DictWriter writing UTF-8 encoded CSV files

I have a list of dictionaries containing unicode strings.
csv.DictWriter can write a list of dictionaries into a CSV file.
I want the CSV file to be encoded in UTF8.
The csv module cannot handle converting unicode strings into UTF8.
The csv module documentation has an example for converting everything to UTF8:
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
It also has a UnicodeWriter class.
But... how do I make DictWriter work with these? Wouldn't they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don't get it.
UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There's also a Python 3 solution that doesn't required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3's built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)
There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it's something one should do anyway.
You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow(). For example:
import csv
rows = [
{'name': u'Anton\xedn Dvo\u0159\xe1k','country': u'\u010cesko'},
{'name': u'Bj\xf6rk Gu\xf0mundsd\xf3ttir', 'country': u'\xcdsland'},
{'name': u'S\xf8ren Kierkeg\xe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark
You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))
When you call csv.writer with your content, the idea is to pass the content through utf_8_encoder as it would give you the (utf-8) encoded content.
My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)

Categories

Resources