How to write Unicode data as Cyrillic symbols to file? [duplicate] - python

This question already has answers here:
Read and Write CSV files including unicode with Python 2.7
(7 answers)
Closed 5 years ago.
I have some variables in Unicode.
title
u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
type(title)
unicode
If I print this vaiable, I get:
print (title)
Администратор интернет-магазин
When I try to write this data (Cyrillic symbols) to CSV file:
with open('avito.csv','a') as f:
writer=csv.writer(f)
writer.writerow((title))
This error occurs:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u0410' in position 0: ordinal not in range(128)
How can I write this variable as Cyrillic symbols to a CSV?

You have to write to the file with the correct encoding, and from your comment I guess, it is cp1251:
import io
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
with io.open('avito.csv', 'a', encoding='cp1251') as output:
output.write(title + '\n')

Three ways on Python 2.7. Note that to open the files in Excel that program likes a UTF-8 BOM encoded at the start of the file. I write it manually in the brute force method, but the utf-8-sig codec will handle it for you otherwise. Skip the BOM signature if you aren't dealing with lame editors (Windows Notepad) or Excel.
import csv
import codecs
import cStringIO
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
print(title)
# Brute force
with open('avito.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8')) # writes "byte order mark" UTF-8 signature
writer=csv.writer(f)
writer.writerow([title.encode('utf8')])
# Example from the documentation for csv module
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
with open('avito2.csv','wb') as f:
w = UnicodeWriter(f)
w.writerow([title])
# 3rd party module, install from pip
import unicodecsv
with open('avito3.csv','wb') as f:
w = unicodecsv.writer(f,encoding='utf-8-sig')
w.writerow([title])

Related

Formatting a tab-delimited text file with Python

I’m updating a Python script from 2 to 3. It reads in a manifest (i.e., [batchdate]xmlList.xml), iterates through each XML file identified in the manifest, collects stats, then outputs a stats file in tab-delimited text format. The formatting and encoding of the tab file is off, and I can’t figure out how to fix it.
for encoding in utf-8:
class UnicodeWriter:
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
def writerow(self, row):
self.writer.writerow([str(s).encode("utf-8") for s in row])
data = self.queue.getvalue()
self.stream.write(data)
self.queue.truncate(0)
read in xmllist.xml manifest:
xmlListPath = input('Enter the filepath of the xmlList.xml file: ').replace('"', '')
xmlListFile = codecs.open(xmlListPath)
xmlList = etree.parse(xmlListFile)
listRoot = xmlList.getroot()
xmlListFile.close()
create stats file and write header:
batchID = path.split(xmlListPath)[1]
statsFile = 'S:/Metadata/ETD/Documentation/Statistics/' + batchID.replace('xmlList.xml', '.stats.txt')
stats = open(statsFile, 'w')
wtrStats = UnicodeWriter(stats, delimiter='\t')
statsHeader = ['Author', 'Degree', 'Department', 'Embargo Start Date', 'Date Web Available',
'Embargo Code', 'Identifier', 'PURL', 'Title', 'Comments']
wtrStats.writerow(statsHeader)
Here is how the tab file is coming out:
b'Author' b'Degree' b'Department' b'Embargo Start Date' b'Date Web Available' b'Embargo Code' b'Identifier' b'PURL' b'Title' b'Comments'
b'Confer, Matthew Phelan' b'Ph.D.' b'Chemical & Biological Engineering' b'01/01/2021' b'01/01/2026' b'4' b'u0015_0000001_0003682' b'http://purl.lib.ua.edu/177826' b'EXPERIMENTAL AND COMPUTATIONAL STUDIES OF MATERIALS DECOMPOSITION' b''
Thanks for any help.
The thing is that in Python3, the CSV module readers and writers expect to find strings (unicode text) - when you feed them bytes, by pre-encoding your strings, it uses the representation of those bytes objects, which is a b'...' prefixed string.
TL;DR: simply open your output file in the desired encoding, and point your csv.writer object to it - there is absolutely no need for this UnicodeWriter intermediate class you are listing.
import csv
...
stats = open(statsFile, 'w', encoding="utf-8")
wtrStats = csv.writer(stats, delimiter="\t")
...

Python encoding in Windows command line: Chcp 932 doesn't work?

I've looked at other answers, and have done what they recommended:
1. Changed system locale to Japanese
2. Chcp 932 (Japanese)
3. Python file saved as UTF-8
4. All inputs are subject to the unicode(input, 'utf-8') function as seen below.
NOTE: I've also tried using chcp 65001, but this doesn't work either.
I'm trying to read a csv file in Japanese, but the following error keeps coming up.
Traceback (most recent call last):
...
...
UnicodeEncodeError: 'cp932' codec can't encode character u'\ufeff' in position 0: illegal multibyte sequence
My code and sample file contents:
def setFood(self):
reader = self.unicode_csv_reader(open("food.csv"))
aDict = {}
for field1, field2 in reader:
if field2 not in aDict.keys():
aDict[field2] = [field1]
else:
aDict[field2] += [field1]
return aDict
def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs):
reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
for row in reader:
yield [unicode(cell, 'utf-8') for cell in row]
def recFood(self, inp):
print inp
for key in self.foodDict.keys():
for value in self.foodDict[key]:
print(key)
print(value)
Sample csv
ヤクルト,飲み物
カキフライ,洋食
エビフライ,洋食
豚カツ,洋食
The example at the bottom of the Python 2.7 csv module documentation is what you want, but use utf-8-sig for the encoding. The \ufeff is a byte order mark (BOM) character and that encoding will handle it correctly if present.
You will need to have the Japanese system locale to print in the Windows console. Better, switch to Python 3.6 which will print in the console using Unicode APIs...all you need is a font that supports Japanese. The csv module in Python 3 is Unicode-aware as well and works much better.
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
with open('food.csv','rb') as f:
r = UnicodeReader(f)
for key,value in r:
print key,value
ヤクルト 飲み物
カキフライ 洋食
エビフライ 洋食
豚カツ 洋食

decoding Ascii 7 bits to a readable UTF8 .CSV file

I'd like someone to help me with part of my code, there is a problem on the output file that should come out in .csv format using unicode, easy to read on excel. The problem is that the output file comes out without format and the text in it comes in ASCII (7bit).
I really apreaciate your help i've been on this for 4 hours now and can't find the problem yet :/
The last part of the script:
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8").replace("\n"," ").replace("\r"," ").replace("\t",'') for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
Python Version is 2.7 on windows 10
is in Ascii
Writing .csv format using unicode, for instance:
import io, csv
outfile = 'test/out.csv'
fieldnames = ['field1', 'field2']
content_dict = {'field1':'John', 'field2':'Doo'}
with io.open(outfile, 'w', newline='', encoding='utf-8') as csv_out:
writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
writer.writeheader()
for row_dict in content_dict:
writer.writerow(row_dict)

Saving UTF-8 CSV with Python

I've been struggling with this, and have read numerous threads, but I can't seem to get this working. I need to save a UTF-8 CSV file.
Firstly, here's my super-simple approach:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
f = codecs.open("output.csv", "w", "utf-8-sig")
writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
writer.writerow(cells)
That results in an error:
Traceback (most recent call last):
File "./makesimplecsv.py", line 10, in <module>
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc7 in position 1: ordinal not in range(128)
I've also tried using the UnicodeWriter class that's listed in the Python docs (https://docs.python.org/2/library/csv.html#examples ):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
import cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
f = codecs.open("output.csv", "w", "utf-8-sig")
writer = UnicodeWriter(f)
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
writer.writerow(cells)
That results in the same error:
Traceback (most recent call last):
File "./makesimplecsvwithunicodewriter.sh", line 40, in <module>
cells = ["hello".encode("utf-8"), "nǐ hǎo".encode("utf-8"), "你好".encode("utf-8")]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc7 in position 1: ordinal not in range(128)
I thought I'd gone through the checklist of things I've found in other similar questions:
My file has an encoding statement.
I'm opening the file for writing with UTF-8.
I'm encoding the individual strings in UTF-8 before I pass them to the CSV writer.
I've tried with and without adding a UTF-8 BOM, but that doesn't seem to make any difference, or indeed be critical, from what I've read.
Any ideas on what I'm doing wrong?
You are writing encoded byte strings to your CSV file. There is little point in doing this when you are expecting Unicode objects.
Don't encode, decode:
cells = ["hello".decode("utf-8"), "nǐ hǎo".decode("utf-8"), "你好".decode("utf-8")]
or use u'...' unicode string literals:
cells = [u"hello", u"nǐ hǎo", u"你好"]
You cannot use a codecs.open() file object with the Python 2 csv module. Either use the UnicodeWriter approach (with a regular file object) and pass in Unicode objects, or encode your cells to byte strings and use the csv.writer() object directly (again with a regular file object), as that's what the UnicodeWriter does; pass encoded byte strings to the csv.writer() object.
UPDATE - SOLUTION
Thanks to the accepted answer I was able to get this working. Here is the full working example for future reference:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
import codecs
import cStringIO
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
f = open("output.csv", "w")
writer = UnicodeWriter(f)
cells = ["hello".decode("utf-8"), "nǐ hǎo".decode("utf-8"), "你好".decode("utf-8")]
writer.writerow(cells)

Python DictWriter writing UTF-8 encoded CSV files

I have a list of dictionaries containing unicode strings.
csv.DictWriter can write a list of dictionaries into a CSV file.
I want the CSV file to be encoded in UTF8.
The csv module cannot handle converting unicode strings into UTF8.
The csv module documentation has an example for converting everything to UTF8:
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
It also has a UnicodeWriter class.
But... how do I make DictWriter work with these? Wouldn't they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don't get it.
UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There's also a Python 3 solution that doesn't required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3's built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)
There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it's something one should do anyway.
You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow(). For example:
import csv
rows = [
{'name': u'Anton\xedn Dvo\u0159\xe1k','country': u'\u010cesko'},
{'name': u'Bj\xf6rk Gu\xf0mundsd\xf3ttir', 'country': u'\xcdsland'},
{'name': u'S\xf8ren Kierkeg\xe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark
You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))
When you call csv.writer with your content, the idea is to pass the content through utf_8_encoder as it would give you the (utf-8) encoded content.
My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)

Categories

Resources