decoding Ascii 7 bits to a readable UTF8 .CSV file

decoding Ascii 7 bits to a readable UTF8 .CSV file - python

I'd like someone to help me with part of my code, there is a problem on the output file that should come out in .csv format using unicode, easy to read on excel. The problem is that the output file comes out without format and the text in it comes in ASCII (7bit).
I really apreaciate your help i've been on this for 4 hours now and can't find the problem yet :/
The last part of the script:
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8").replace("\n"," ").replace("\r"," ").replace("\t",'') for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
Python Version is 2.7 on windows 10
is in Ascii

Writing .csv format using unicode, for instance:
import io, csv
outfile = 'test/out.csv'
fieldnames = ['field1', 'field2']
content_dict = {'field1':'John', 'field2':'Doo'}
with io.open(outfile, 'w', newline='', encoding='utf-8') as csv_out:
writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
writer.writeheader()
for row_dict in content_dict:
writer.writerow(row_dict)

Related

Formatting a tab-delimited text file with Python

I’m updating a Python script from 2 to 3. It reads in a manifest (i.e., [batchdate]xmlList.xml), iterates through each XML file identified in the manifest, collects stats, then outputs a stats file in tab-delimited text format. The formatting and encoding of the tab file is off, and I can’t figure out how to fix it.
for encoding in utf-8:
class UnicodeWriter:
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
def writerow(self, row):
self.writer.writerow([str(s).encode("utf-8") for s in row])
data = self.queue.getvalue()
self.stream.write(data)
self.queue.truncate(0)
read in xmllist.xml manifest:
xmlListPath = input('Enter the filepath of the xmlList.xml file: ').replace('"', '')
xmlListFile = codecs.open(xmlListPath)
xmlList = etree.parse(xmlListFile)
listRoot = xmlList.getroot()
xmlListFile.close()
create stats file and write header:
batchID = path.split(xmlListPath)[1]
statsFile = 'S:/Metadata/ETD/Documentation/Statistics/' + batchID.replace('xmlList.xml', '.stats.txt')
stats = open(statsFile, 'w')
wtrStats = UnicodeWriter(stats, delimiter='\t')
statsHeader = ['Author', 'Degree', 'Department', 'Embargo Start Date', 'Date Web Available',
'Embargo Code', 'Identifier', 'PURL', 'Title', 'Comments']
wtrStats.writerow(statsHeader)
Here is how the tab file is coming out:
b'Author' b'Degree' b'Department' b'Embargo Start Date' b'Date Web Available' b'Embargo Code' b'Identifier' b'PURL' b'Title' b'Comments'
b'Confer, Matthew Phelan' b'Ph.D.' b'Chemical & Biological Engineering' b'01/01/2021' b'01/01/2026' b'4' b'u0015_0000001_0003682' b'http://purl.lib.ua.edu/177826' b'EXPERIMENTAL AND COMPUTATIONAL STUDIES OF MATERIALS DECOMPOSITION' b''
Thanks for any help.

The thing is that in Python3, the CSV module readers and writers expect to find strings (unicode text) - when you feed them bytes, by pre-encoding your strings, it uses the representation of those bytes objects, which is a b'...' prefixed string.
TL;DR: simply open your output file in the desired encoding, and point your csv.writer object to it - there is absolutely no need for this UnicodeWriter intermediate class you are listing.
import csv
...
stats = open(statsFile, 'w', encoding="utf-8")
wtrStats = csv.writer(stats, delimiter="\t")
...

How to write Unicode data as Cyrillic symbols to file? [duplicate]

This question already has answers here:
Read and Write CSV files including unicode with Python 2.7
(7 answers)
Closed 5 years ago.
I have some variables in Unicode.
title
u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
type(title)
unicode
If I print this vaiable, I get:
print (title)
Администратор интернет-магазин
When I try to write this data (Cyrillic symbols) to CSV file:
with open('avito.csv','a') as f:
writer=csv.writer(f)
writer.writerow((title))
This error occurs:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u0410' in position 0: ordinal not in range(128)
How can I write this variable as Cyrillic symbols to a CSV?

You have to write to the file with the correct encoding, and from your comment I guess, it is cp1251:
import io
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
with io.open('avito.csv', 'a', encoding='cp1251') as output:
output.write(title + '\n')

Three ways on Python 2.7. Note that to open the files in Excel that program likes a UTF-8 BOM encoded at the start of the file. I write it manually in the brute force method, but the utf-8-sig codec will handle it for you otherwise. Skip the BOM signature if you aren't dealing with lame editors (Windows Notepad) or Excel.
import csv
import codecs
import cStringIO
title = u'\u0410\u0434\u043c\u0438\u043d\u0438\u0441\u0442\u0440\u0430\u0442\u043e\u0440 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d\u0430'
print(title)
# Brute force
with open('avito.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8')) # writes "byte order mark" UTF-8 signature
writer=csv.writer(f)
writer.writerow([title.encode('utf8')])
# Example from the documentation for csv module
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
with open('avito2.csv','wb') as f:
w = UnicodeWriter(f)
w.writerow([title])
# 3rd party module, install from pip
import unicodecsv
with open('avito3.csv','wb') as f:
w = unicodecsv.writer(f,encoding='utf-8-sig')
w.writerow([title])

export an utf-8 csv file in python

I encountered an error while export an utf-8 csv file in python. The error says
AttributeError: 'int' object has no attribute 'encode'
First, I use pyodbc to connect microsoft access database and get data there.
MDB = "E:/Research/2000-01.mdb"; DRV = '{Microsoft Access Driver (*.mdb)}'; PWD = 'pw'
con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV,MDB,PWD))
cur = con.cursor()
SQL = 'SELECT * FROM 200001;'
rows = cur.execute(SQL).fetchall()
cur.close()
con.close()
then use the class,
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
I started to write the utf-8 csv file
with open("E:/Research/200001.txt", 'wb') as f:
writer = UnicodeWriter(f)
writer.writerows(rows)
An exemplary line in rows is
(577540, u'1', datetime.datetime(2000, 1, 1, 0, 0), u'85411000', u'53', u'4403944851', u'44039', u'10', u'116', u'110', u'4', u'01', 89956, 0.15575717389583588, u'\u5916\u5546\u72ec\u8d44\u4f01\u4e1a', u'\u5c71\u7279\u7535\u5b50 (\u6df1\u5733) \u6709\u9650\u516c\u53f8', u'\u6df1\u5733\u5b9d\u5b8972\u533a\u5b9d\u77f3\u8def\u53f7', u'755 27757943', u'', u'518101', u'', u'\u90d1\u66fc\u5a1c', u'\u4e8c\u6781\u7ba1\uff0c\u4f46\u5149\u654f\u4e8c\u6781\u7ba1\u6216\u53d1\u5149\u4e8c\u6781\u7ba1\u9664\u5916', u'\u5e7f\u4e1c\u7701\u6df1\u5733', u'\u65e5\u672c', u'\u6df1\u5733\u6d77\u5173', u'\u4e00\u822c\u8d38\u6613', u'\u6c7d\u8f66\u8fd0\u8f93', u'\u4e2a/\u5957', u'\u9999\u6e2f', u'\u8fdb\u53e3')
It looks like each line contains some integers and datetime stuff. Any idea of solving this problem? Thanks a lot!

You probably need to do something like this first, just before writer.writerows:
rows = [[unicode(x) for x in row] for row in rows]
Or, my guess is it's crapping out on trying to write the database row ID. So you could also probably try slicing that off:
rows = [row[1:] for row in rows]

export data from csv file containing unicode characters

I would like to export data from a csv file which contains unicode strings.
Previously I tried a Python script which works fine for ASCII data only. But it won't support unicode stuff either:
#! /usr/bin/env python
import csv
csv.register_dialect('custom',delimiter=','
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = csv.reader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
Traceback (most recent call last): for record in data: _csv.Error:
line contains NULL byte

use this unicode-csv library instead
https://github.com/jdunck/python-unicodecsv
import unicodecsv as csv
with open('input.csv') as ifile:
rows = [row for row in csv.reader(ifile, encoding='utf-8')]
print rows

You can wrap the csv.reader in a class to handle it for you. The following is taken from the csv documentation examples and works for me:
#! /usr/bin/env python
import csv, codecs
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
csv.register_dialect('custom', delimiter=',',
doublequote=True,
escapechar=None,
quotechar='"',
quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
with open('input.csv') as ifile:
data = UnicodeReader(ifile, dialect='custom')
for record in data:
for i, field in enumerate(record):
print (" <field%s>" % i + field + "</field%s>" % i)
There is also a UnicodeWriter class there if you need that functionality.

It seems you are using Python 3. Follow the very first code example in the docs:
#!/usr/bin/env python3
import csv
with open('input.csv', newline='', encoding=encoding) as csvfile:
reader = csv.reader(csvfile, dialect="custom")
for row in reader:
print(", ".join(row))
where "custom" dialect is defined in the code in your question and encoding is the character encoding of your file such as "utf-16". If you omit encoding argument; the encoding returned by locale.getpreferredencoding(False) is used.

Python DictWriter writing UTF-8 encoded CSV files

I have a list of dictionaries containing unicode strings.
csv.DictWriter can write a list of dictionaries into a CSV file.
I want the CSV file to be encoded in UTF8.
The csv module cannot handle converting unicode strings into UTF8.
The csv module documentation has an example for converting everything to UTF8:
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
It also has a UnicodeWriter class.
But... how do I make DictWriter work with these? Wouldn't they have to inject themselves in the middle of it, to catch the disassembled dictionaries and encode them before it writes them to the file? I don't get it.

UPDATE: The 3rd party unicodecsv module implements this 7-year old answer for you. Example below this code. There's also a Python 3 solution that doesn't required a 3rd party module.
Original Python 2 Answer
If using Python 2.7 or later, use a dict comprehension to remap the dictionary to utf-8 before passing to DictWriter:
# coding: utf-8
import csv
D = {'name':u'马克','pinyin':u'mǎkè'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = csv.DictWriter(f,sorted(D.keys()))
w.writeheader()
w.writerow({k:v.encode('utf8') for k,v in D.items()})
f.close()
You can use this idea to update UnicodeWriter to DictUnicodeWriter:
# coding: utf-8
import csv
import cStringIO
import codecs
class DictUnicodeWriter(object):
def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, D):
self.writer.writerow({k:v.encode("utf-8") for k,v in D.items()})
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for D in rows:
self.writerow(D)
def writeheader(self):
self.writer.writeheader()
D1 = {'name':u'马克','pinyin':u'Mǎkè'}
D2 = {'name':u'美国','pinyin':u'Měiguó'}
f = open('out.csv','wb')
f.write(u'\ufeff'.encode('utf8')) # BOM (optional...Excel needs it to open UTF-8 file properly)
w = DictUnicodeWriter(f,sorted(D.keys()))
w.writeheader()
w.writerows([D1,D2])
f.close()
Python 2 unicodecsv Example:
# coding: utf-8
import unicodecsv as csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
with open('out.csv','wb') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()),encoding='utf-8-sig')
w.writeheader()
w.writerow(D)
Python 3:
Additionally, Python 3's built-in csv module supports Unicode natively:
# coding: utf-8
import csv
D = {u'name':u'马克',u'pinyin':u'mǎkè'}
# Use newline='' instead of 'wb' in Python 3.
with open('out.csv','w',encoding='utf-8-sig',newline='') as f:
w = csv.DictWriter(f,fieldnames=sorted(D.keys()))
w.writeheader()
w.writerow(D)

There is a simple workaround using the wonderful UnicodeCSV module. After having it, just change the line
import csv
to
import unicodecsv as csv
And it automagically begins playing nice with UTF-8.
Note: Switching to Python 3 will also rid you of this problem (thanks jamescampbell for the tip). And it's something one should do anyway.

You can convert the values to UTF-8 on the fly as you pass the dict to DictWriter.writerow(). For example:
import csv
rows = [
{'name': u'Anton\xedn Dvo\u0159\xe1k','country': u'\u010cesko'},
{'name': u'Bj\xf6rk Gu\xf0mundsd\xf3ttir', 'country': u'\xcdsland'},
{'name': u'S\xf8ren Kierkeg\xe5rd', 'country': u'Danmark'}
]
# implement this wrapper on 2.6 or lower if you need to output a header
class DictWriterEx(csv.DictWriter):
def writeheader(self):
header = dict(zip(self.fieldnames, self.fieldnames))
self.writerow(header)
out = open('foo.csv', 'wb')
writer = DictWriterEx(out, fieldnames=['name','country'])
# DictWriter.writeheader() was added in 2.7 (use class above for <= 2.6)
writer.writeheader()
for row in rows:
writer.writerow(dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
out.close()
Output foo.csv:
name,country
Antonín Dvořák,Česko
Björk Guðmundsdóttir,Ísland
Søren Kierkegård,Danmark

You can use some proxy class to encode dict values as needed, like this:
# -*- coding: utf-8 -*-
import csv
d = {'a':123,'b':456, 'c':u'Non-ASCII: проверка'}
class DictUnicodeProxy(object):
def __init__(self, d):
self.d = d
def __iter__(self):
return self.d.__iter__()
def get(self, item, default=None):
i = self.d.get(item, default)
if isinstance(i, unicode):
return i.encode('utf-8')
return i
with open('some.csv', 'wb') as f:
writer = csv.DictWriter(f, ['a', 'b', 'c'])
writer.writerow(DictUnicodeProxy(d))

When you call csv.writer with your content, the idea is to pass the content through utf_8_encoder as it would give you the (utf-8) encoded content.

My solution is a bit different. While all solutions above are focusing on having unicode compatible dict, my solutions makes DictWriter compatible with unicode. This approach is even suggested in python docs (1).
Classes UTF8Recoder, UnicodeReader, UnicodeWriter are taken from python docs. UnicodeWriter->writerow was changed a little bit too.
Use it as regular DictWriter/DictReader.
Here is the code:
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class UnicodeDictWriter(csv.DictWriter, object):
def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds):
super(UnicodeDictWriter, self).__init__(f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds)
self.writer = UnicodeWriter(f, dialect, **kwds)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

decoding Ascii 7 bits to a readable UTF8 .CSV file - python

Related

Formatting a tab-delimited text file with Python

How to write Unicode data as Cyrillic symbols to file? [duplicate]

export an utf-8 csv file in python

export data from csv file containing unicode characters

Python DictWriter writing UTF-8 encoded CSV files

Categories

Resources