charmap codec can't encode characters in position xx - xx - Python

I am trying to use the unicodecsv Python library in Python 2.7.x:
import codecs
import unicodecsv

def read(self, path):
    with codecs.open(path, "rb", encoding="utf-8") as f:
        r = unicodecsv.reader(f, encoding='utf-8')
        row = r.next()
        print row

read("unicode.csv")
Error:
charmap codec can't encode characters in position xx - xx
I have manually converted my CSV file to UTF-8 using text editors, so I am sure the input file is fine.

I see a few problems with your code:
def read(self, path):
You are using self, but not within a class.
Also, after opening the file with codecs.open you can use the standard Python csv reader.
With some modifications:
f = "/home/dzagorulkin/workspace/zont/file.txt"
import codecs
#import unicodecsv

def read(path):
    with codecs.open(path, "rb", encoding="utf-8") as f:
        for line in f:
            print line

read(f)
I used a non-ASCII file and the output was:
Меня Дима зовут! Меня Дима зовут!
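If you do want to stick with unicodecsv, here is a minimal sketch of the original approach that should work, assuming the input file really is UTF-8: unicodecsv expects a byte stream and does the decoding itself, so self is dropped and the file is opened in plain binary mode rather than with codecs.open.
import unicodecsv

def read(path):
    # unicodecsv wants raw bytes and decodes them itself
    with open(path, "rb") as f:
        r = unicodecsv.reader(f, encoding='utf-8')
        for row in r:
            print row  # each row is a list of unicode strings

read("unicode.csv")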

Related

Can't decode from windows-1252 to UTF-8

I know questions about encoding and decoding in UTF-8 have been asked many times, but I could not find an answer to my question.
I have a CSV file in windows-1252 and I want to convert it to UTF-8. Here is the script:
import os
import sys
import inspect
import codecs
import chardet
from bs4 import UnicodeDammit

# Declare the variables
defaultencoding = 'utf-8'
filename = '19-01-2017+06-00-00.csv'

# Open the file and get the content
file_obj = open(filename, "r")
content = file_obj.read()
file_obj.close()

# Check the initial encoding using both UnicodeDammit and chardet
dammit = UnicodeDammit(content)
# Print it
print(dammit.original_encoding)
print(chardet.detect(content)['encoding'])

# Decode to UTF-8
content_decoded = content.decode('windows-1252')
content_encoded = content_decoded.encode(defaultencoding)

# Write the result to a temporary file
file_obj = open('tmp.txt', "w")
try:
    file_obj.write(content_encoded)
finally:
    file_obj.close()

# Read the resulting file back
file_obj = open('tmp.txt', "r")
content = file_obj.read()
file_obj.close()

# Check whether it is really UTF-8, using both UnicodeDammit and chardet
dammit = UnicodeDammit(content)
print(dammit.original_encoding)
print(chardet.detect(content)['encoding'])
Output:
windows-1252
windows-1252
windows-1252
windows-1252
Expected output:
windows-1252
windows-1252
utf-8
utf-8
I used both chardet and UnicodeDammit because I found out that chardet does not give the correct encoding guess all the time.
Why can't I encode the file in UTF-8?
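For what it's worth, here is a minimal conversion sketch that avoids detection entirely, assuming the source really is windows-1252 (io.open behaves the same on Python 2 and 3):
import io

# Read as windows-1252, write back out as UTF-8; no guessing involved.
with io.open('19-01-2017+06-00-00.csv', 'r', encoding='windows-1252') as src:
    text = src.read()
with io.open('tmp.txt', 'w', encoding='utf-8') as dst:
    dst.write(text)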

Writing CSV file with umlauts causing "UnicodeEncodeError: 'ascii' codec can't encode character"

I am trying to write characters with double dots (umlauts) such as ä, ö and Ö. I am able to write them to the file with data.encode("utf-8"), but the result b'\xc3\xa4\xc3\xa4\xc3\x96' is not what I want (the UTF-8 bytes shown as literal characters). I want "ääÖ" stored in the file as written.
How can I write data with umlaut characters to a CSV file in Python 3?
import csv
data = "ääÖ"
with open("test.csv", "w") as fp:
    a = csv.writer(fp, delimiter=";")
    data = resultFile
    a.writerows(data)
Traceback:
File "<ipython-input-280-73b1f615929e>", line 5, in <module>
a.writerows(data)
UnicodeEncodeError: 'ascii' codec can't encode character '\xe4' in position 15: ordinal not in range(128)
Add an encoding parameter to open() and set it to 'utf8'.
import csv
data = "ääÖ"
with open("test.csv", 'w', encoding='utf8') as fp:
    a = csv.writer(fp, delimiter=";")
    a.writerows(data)
Edit: removed the use of the io library, since open is the same as io.open in Python 3.
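As a side note, the csv docs recommend opening the file with newline='' in Python 3, and writerows expects a sequence of rows rather than a single string; a small sketch of what that might look like, writing one row with a single field:
import csv

data = "ääÖ"
# newline='' is what the csv docs recommend for file objects;
# writerow takes one row, i.e. a list of fields.
with open("test.csv", "w", encoding="utf8", newline="") as fp:
    writer = csv.writer(fp, delimiter=";")
    writer.writerow([data])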
This solution should also work on Python 2 (the coding declaration is not needed in Python 3):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
data = "ääÖ"
with open("test.csv", "w") as fp:
    a = csv.writer(fp, delimiter=";")
    a.writerows(data)
Credits to:
Working with utf-8 encoding in Python source

pandas reading csv file encoding error

I have an ISO-8859-9 encoded CSV file and am trying to read it into a dataframe.
Here is the code and the error I got:
iller = pd.read_csv('/Users/me/Documents/Works/map/dist.csv' ,sep=';',encoding='iso-8859-9')
iller.head()
and the error is:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc4 in position 250: ordinal not in range(128)
And the code below works without error:
import codecs
myfile = codecs.open('/Users/me/Documents/Works/map/dist.csv', "r", encoding='iso-8859-9')
for a in myfile:
    print a
My question is: why is pandas not reading my correctly encoded file, and is there any way to make it read it?
It is not possible to see what could be off with your data, of course, but if you can read the data without issues using codecs, then maybe an idea would be to write the file back out in UTF-8 encoding?
import codecs
filename = '/Users/me/Documents/Works/map/dist.csv'
target_filename = '/Users/me/Documents/Works/map/dist-utf-8.csv'
myfile = codecs.open(filename, "r", encoding='iso-8859-9')
f_contents = myfile.read()
or
import codecs
with codecs.open(filename, 'r', encoding='iso-8859-9') as fh:
    f_contents = fh.read()

# write out in UTF-8
with codecs.open(target_filename, 'w', encoding='utf-8') as fh:
    fh.write(f_contents)
I hope this helps!
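Reading the converted file back in with pandas should then work along these lines (a sketch; UTF-8 is the default encoding for read_csv):
import pandas as pd

# target_filename is the UTF-8 copy written above
iller = pd.read_csv(target_filename, sep=';')
print(iller.head())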

I'm trying to encode a CSV file to UTF-8 using Python

I'm using Python to read many files and re-encode them to UTF-8. I tried it with the code below:
import os
from os import listdir

def find_csv_filenames(path_to_dir, suffix=".csv"):
    path_to_dir = os.path.normpath(path_to_dir)
    filenames = listdir(path_to_dir)
    # Check *.csv files in the directory
    fp = lambda f: not os.path.isdir(path_to_dir + "/" + f) and f.endswith(suffix)
    return [path_to_dir + "/" + fname for fname in filenames if fp(fname)]

def convert_files(files, ascii, to="utf-8"):
    count = 0
    lineno = 0
    for name in files:
        lineno = lineno + 1
        with open(name) as f:
            file_target = open(name, mode='r', encoding='latin-1')
            file_content = file_target.read()
            file_target.close
            print(lineno)
            file_source = open("./csv/data{}.csv".format(lineno), mode='w', encoding='utf-8')
            file_source.write(file_content)

csv_files = find_csv_filenames('./csv', ".csv")
convert_files(csv_files, "cp866")
The problem is that after I read the data and write it to the other files with UTF-8 encoding set, it still does not work.
Before you open a file whose encoding is not clear, you could use chardet to detect the file's encoding rather than opening it with a guessed one. Note that chardet.detect works on bytes, not on a file path, so read the raw bytes first:
>>> import chardet
>>> with open('PATH/TO/FILE', 'rb') as f:
...     encoding = chardet.detect(f.read())['encoding']
Then open the file with the detected encoding and write the contents into a file opened with 'utf-8' encoding.
If you're not sure whether the file was converted to 'utf-8' encoding, you can use enca to check whether the encoding of the file is 'ASCII' or 'utf-8', like this in a Linux shell:
$ enca FILENAME
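Putting the two steps together, here is a minimal sketch of that flow (convert_to_utf8 is just an illustrative helper name):
import chardet

def convert_to_utf8(src_path, dst_path):
    # Detect on the raw bytes, then decode and re-encode explicitly.
    with open(src_path, 'rb') as f:
        raw = f.read()
    encoding = chardet.detect(raw)['encoding']
    text = raw.decode(encoding)
    with open(dst_path, 'w', encoding='utf-8') as f:
        f.write(text)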

This is my current way of writing to a file. However, I can't do UTF-8?

f = open("go.txt", "w")
f.write(title)
f.close()
What if "title" is in japanese/utf-8? How do I modify this code to be able to write "title" without having the ascii error?
Edit: Then, how do I read this file in UTF-8?
How to use UTF-8:
import codecs
# ...
# title is a unicode string
# ...
f = codecs.open("go.txt", "w", "utf-8")
f.write(title)
# ...
fileObj = codecs.open("go.txt", "r", "utf-8")
u = fileObj.read() # Returns a Unicode string from the UTF-8 bytes in the file
It depends on whether you want to insert a UTF-8 byte order mark (BOM); the only way I know of to do that is to open a normal file and write:
import codecs
f = open('go.txt', 'wb')
f.write(codecs.BOM_UTF8)
f.write(title.encode('utf-8'))
f.close()
Generally, though, I don't want to add a UTF-8 BOM, and the following will suffice:
import codecs
f = codecs.open('go.txt', 'w', 'utf-8')
f.write(title)
f.close()
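If the file was written with a BOM, reading it back with the 'utf-8-sig' codec strips the mark automatically; a small sketch:
import codecs

f = codecs.open('go.txt', 'r', 'utf-8-sig')  # strips a leading BOM if present
u = f.read()
f.close()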
