Python encoding in Windows command line: Chcp 932 doesn't work? - python

I've looked at other answers, and have done what they recommended:
1. Changed system locale to Japanese
2. Chcp 932 (Japanese)
3. Python file saved as UTF-8
4. All inputs are subject to the unicode(input, 'utf-8') function as seen below.
NOTE: I've also tried using chcp 65001, but this doesn't work either.
I'm trying to read a csv file in Japanese, but the following error keeps coming up.
Traceback (most recent call last):
...
...
UnicodeEncodeError: 'cp932' codec can't encode character u'\ufeff' in position 0: illegal multibyte sequence
My code and sample file contents:
def setFood(self):
reader = self.unicode_csv_reader(open("food.csv"))
aDict = {}
for field1, field2 in reader:
if field2 not in aDict.keys():
aDict[field2] = [field1]
else:
aDict[field2] += [field1]
return aDict
def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs):
reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
for row in reader:
yield [unicode(cell, 'utf-8') for cell in row]
def recFood(self, inp):
print inp
for key in self.foodDict.keys():
for value in self.foodDict[key]:
print(key)
print(value)
Sample csv
ヤクルト,飲み物
カキフライ,洋食
エビフライ,洋食
豚カツ,洋食

The example at the bottom of the Python 2.7 csv module documentation is what you want, but use utf-8-sig for the encoding. The \ufeff is a byte order mark (BOM) character and that encoding will handle it correctly if present.
You will need to have the Japanese system locale to print in the Windows console. Better, switch to Python 3.6 which will print in the console using Unicode APIs...all you need is a font that supports Japanese. The csv module in Python 3 is Unicode-aware as well and works much better.
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
with open('food.csv','rb') as f:
r = UnicodeReader(f)
for key,value in r:
print key,value
ヤクルト 飲み物
カキフライ 洋食
エビフライ 洋食
豚カツ 洋食

Related

How to write Unicode data as Cyrillic symbols to file? [duplicate]

This question already has answers here:
Read and Write CSV files including unicode with Python 2.7
(7 answers)
Closed 5 years ago.
I have some variables in Unicode.
title
u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
type(title)
unicode
If I print this vaiable, I get:
print (title)
Администратор интернет-магазин
When I try to write this data (Cyrillic symbols) to CSV file:
with open('avito.csv','a') as f:
writer=csv.writer(f)
writer.writerow((title))
This error occurs:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u0410' in position 0: ordinal not in range(128)
How can I write this variable as Cyrillic symbols to a CSV?
You have to write to the file with the correct encoding, and from your comment I guess, it is cp1251:
import io
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
with io.open('avito.csv', 'a', encoding='cp1251') as output:
output.write(title + '\n')
Three ways on Python 2.7. Note that to open the files in Excel that program likes a UTF-8 BOM encoded at the start of the file. I write it manually in the brute force method, but the utf-8-sig codec will handle it for you otherwise. Skip the BOM signature if you aren't dealing with lame editors (Windows Notepad) or Excel.
import csv
import codecs
import cStringIO
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
print(title)
# Brute force
with open('avito.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8')) # writes "byte order mark" UTF-8 signature
writer=csv.writer(f)
writer.writerow([title.encode('utf8')])
# Example from the documentation for csv module
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
with open('avito2.csv','wb') as f:
w = UnicodeWriter(f)
w.writerow([title])
# 3rd party module, install from pip
import unicodecsv
with open('avito3.csv','wb') as f:
w = unicodecsv.writer(f,encoding='utf-8-sig')
w.writerow([title])

Saving UTF-8 CSV with Python

I've been struggling with this, and have read numerous threads, but I can't seem to get this working. I need to save a UTF-8 CSV file.
Firstly, here's my super-simple approach:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
f = codecs.open("output.csv", "w", "utf-8-sig")
writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
writer.writerow(cells)
That results in an error:
Traceback (most recent call last):
File "./makesimplecsv.py", line 10, in <module>
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc7 in position 1: ordinal not in range(128)
I've also tried using the UnicodeWriter class that's listed in the Python docs (https://docs.python.org/2/library/csv.html#examples ):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
import cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
f = codecs.open("output.csv", "w", "utf-8-sig")
writer = UnicodeWriter(f)
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
writer.writerow(cells)
That results in the same error:
Traceback (most recent call last):
File "./makesimplecsvwithunicodewriter.sh", line 40, in <module>
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc7 in position 1: ordinal not in range(128)
I thought I'd gone through the checklist of things I've found in other similar questions:
My file has an encoding statement.
I'm opening the file for writing with UTF-8.
I'm encoding the individual strings in UTF-8 before I pass them to the CSV writer.
I've tried with and without adding a UTF-8 BOM, but that doesn't seem to make any difference, or indeed be critical, from what I've read.
Any ideas on what I'm doing wrong?
You are writing encoded byte strings to your CSV file. There is little point in doing this when you are expecting Unicode objects.
Don't encode, decode:
cells = ["hello".decode("utf-8"), "nǐ hǎo".decode("utf-8"), "你好".decode("utf-8")]
or use u'...' unicode string literals:
cells = [u"hello", u"nǐ hǎo", u"你好"]
You cannot use a codecs.open() file object with the Python 2 csv module. Either use the UnicodeWriter approach (with a regular file object) and pass in Unicode objects, or encode your cells to byte strings and use the csv.writer() object directly (again with a regular file object), as that's what the UnicodeWriter does; pass encoded byte strings to the csv.writer() object.
UPDATE - SOLUTION
Thanks to the accepted answer I was able to get this working. Here is the full working example for future reference:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
import cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
f = open("output.csv", "w")
writer = UnicodeWriter(f)
cells = ["hello".decode("utf-8"), "nǐ hǎo".decode("utf-8"), "你好".decode("utf-8")]
writer.writerow(cells)

export data from csv file containing unicode characters

I would like to export data from a csv file which contains unicode strings.
Previously I tried a Python script which works fine for ASCII data only. But it won't support unicode stuff either:
#! /usr/bin/env python
import csv
csv.register_dialect('custom',delimiter=','
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = csv.reader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
Traceback (most recent call last): for record in data: _csv.Error:
line contains NULL byte
use this unicode-csv library instead
https://github.com/jdunck/python-unicodecsv
import unicodecsv as csv
with open('input.csv') as ifile:
rows = [row for row in csv.reader(ifile, encoding='utf-8')]
print rows
You can wrap the csv.reader in a class to handle it for you. The following is taken from the csv documentation examples and works for me:
#! /usr/bin/env python
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
csv.register_dialect('custom', delimiter=',',
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = UnicodeReader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
There is also a UnicodeWriter class there if you need that functionality.
It seems you are using Python 3. Follow the very first code example in the docs:
#!/usr/bin/env python3
import csv
with open('input.csv', newline='', encoding=encoding) as csvfile:
reader = csv.reader(csvfile, dialect="custom")
for row in reader:
print(", ".join(row))
where "custom" dialect is defined in the code in your question and encoding is the character encoding of your file such as "utf-16". If you omit encoding argument; the encoding returned by locale.getpreferredencoding(False) is used.

I want an **unicode type** of the following code ::

I got the below code from SO expert but it's working for ANSI Strings and my input is UNICODE STRING. How to make this code work for both of the versions? TIA
import csv
from collections import defaultdict
summary = defaultdict(list)
csvin = csv.reader(open('qwetry.txt'), delimiter='|')
for row in csvin:
summary[(row[1].split()[0], row[2])].append(int(row[5]))
csvout = csv.writer(open('datacopy.out','wb'), delimiter='|')
for who, what in summary.iteritems():
csvout.writerow( [' '.join(who), len(what), sum(what)] )
courtsey: Jon Clements
The csv module doesn’t directly support reading and writing Unicode. you can find the details here. The generator for the same is as below::
import csv
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
dialect=dialect, **kwargs)
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')

Python DictWriter writing UTF-8 encoded CSV files

I have a list of dictionaries containing unicode strings.
csv.DictWriter can write a list of dictionaries into a CSV file.
I want the CSV file to be encoded in UTF8.
The csv module cannot handle converting unicode strings into UTF8.
The csv module documentation has an example for converting everything to UTF8:
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
It also has a UnicodeWriter class.
But... how do I make DictWriter work with these? Wouldn't they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don't get it.
UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There's also a Python 3 solution that doesn't required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3's built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)
There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it's something one should do anyway.
You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow(). For example:
import csv
rows = [
{'name': u'Anton\xedn Dvo\u0159\xe1k','country': u'\u010cesko'},
{'name': u'Bj\xf6rk Gu\xf0mundsd\xf3ttir', 'country': u'\xcdsland'},
{'name': u'S\xf8ren Kierkeg\xe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark
You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))
When you call csv.writer with your content, the idea is to pass the content through utf_8_encoder as it would give you the (utf-8) encoded content.
My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)

Categories

Resources