Python Encoding Issue with JSON and CSV

I am having an encoding issue when I run my script below.
Here is the error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 9: ordinal not in range(128)
Here is my script:
import logging
import urllib
import csv
import json
import io
import codecs

with open('/home/local/apple.csv', 'rb') as csvinput:
    reader = csv.reader(csvinput, delimiter=',')
    firstline = True
    for row in reader:
        if firstline:
            firstline = False
            continue
        address1 = row[0]
        print row[0]
        locality = row[1]
        admin_area = row[2]
        query = ' '.join(str(x) for x in (address1, locality, admin_area))
        normalized = query.replace(" ", "+")
        BaseURL = 'http://localhost:8080/verify?country=JP&freeform='
        URL = BaseURL + normalized
        print URL
        data = urllib.urlopen(URL)
        response = data.getcode()
        print response
        if response == 200:
            file = json.load(data)
            print file
            output_f = open('output.csv', 'wb')
            csvwriter = csv.writer(output_f)
            count = 0
            for f in file:
                if count == 0:
                    header = f.keys()
                    csvwriter.writerow(header)
                    count += 1
                csvwriter.writerow(f.values())
            output_f.close()
        else:
            print 'error'
Can anyone help me fix this? It's getting really annoying. I need to encode to UTF-8.

Looks like you are using Python 2.x. Instead of Python's standard open, use codecs.open, which lets you optionally pass an encoding to use and say what to do when there are errors. It gets a little less confusing in Python 3, where the standard open can do this itself.
So in the two places where you open a file, do:
with codecs.open('/home/local/apple.csv', 'rb', 'utf-8') as csvinput:

output_f = codecs.open('output.csv', 'wb', 'utf-8')
The optional errors parameter defaults to 'strict', which raises an exception if the bytes can't be mapped to the given encoding. In some contexts you may want to use 'ignore' or 'replace' instead.
See the Python docs for a bit more info.
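A minimal sketch of how the errors parameter changes behavior (assuming Python 2 and an input file containing a stray byte that is not valid UTF-8):

import codecs

# 'strict' (the default) raises UnicodeDecodeError on undecodable bytes,
# 'replace' substitutes U+FFFD, and 'ignore' silently drops them.
with codecs.open('/home/local/apple.csv', 'rb', 'utf-8', errors='replace') as f:
    for line in f:
        print line.rstrip('\n').encode('utf-8')  # lines come back as unicode; re-encode for stdout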

Related

Python 2.7 CSV file read/write \xef\xbb\xbf code

I have a question about reading and writing a CSV file with the 'utf-8-sig' codec in Python 2.7. My CSV header is
['\xef\xbb\xbfID;timestamp;CustomerID;Email']
It starts with some extra bytes ("\xef\xbb\xbfID"). I read from file A.csv and I want to write the same bytes and header to file B.csv.
My print log shows:
['\xef\xbb\xbfID;timestamp;CustomerID;Email']
But the actual output file header looks like
ÔªøID;timestamp
Here is the code:
def remove_gdpr_info_from_csv(file_path, file_name, temp_folder, original_header):
    new_temp_folder = tempfile.mkdtemp()
    new_temp_file = new_temp_folder + "/" + file_name
    # Blanked new file
    with open(new_temp_file, 'wb') as outfile:
        writer = csv.writer(outfile, delimiter=";")
        print original_header
        writer.writerow(original_header)
        # File from SFTP
        with open(file_path, 'r') as infile:
            reader = csv.reader(infile, delimiter=";")
            first_row = next(reader)
            email = first_row.index('Email')
            contract_detractor1 = first_row.index('Contact Detractor (Q21)')
            contract_detractor2 = first_row.index('Contact Detractor (Q20)')
            contract_detractor3 = first_row.index('Contact Detractor (Q43)')
            contract_detractor4 = first_row.index('Contact Detractor(Q26)')
            contract_detractor5 = first_row.index('Contact Detractor(Q27)')
            contract_detractor6 = first_row.index('Contact Detractor(Q44)')
            indexes = []
            for column_name in header_list:
                ind = first_row.index(column_name)
                indexes.append(ind)
            for row in reader:
                output_row = []
                for ind in indexes:
                    data = row[ind]
                    if ind == email:
                        data = ''
                    elif ind == contract_detractor1:
                        data = ''
                    elif ind == contract_detractor2:
                        data = ''
                    elif ind == contract_detractor3:
                        data = ''
                    elif ind == contract_detractor4:
                        data = ''
                    elif ind == contract_detractor5:
                        data = ''
                    elif ind == contract_detractor6:
                        data = ''
                    output_row.append(data)
                writer.writerow(output_row)
    s3core.upload_files(SPARKY_S3, DESTINATION_PATH, new_temp_file)
    shutil.rmtree(temp_folder)
    shutil.rmtree(new_temp_folder)
'\xef\xbb\xbf' is the UTF-8 encoded version of the Unicode ZERO WIDTH NO-BREAK SPACE U+FEFF. It is often used as a Byte Order Mark at the beginning of Unicode text files:
when you have the 3 bytes '\xef\xbb\xbf', the file is UTF-8 encoded
when you have the 2 bytes '\xff\xfe', the file is UTF-16 little endian
when you have the 2 bytes '\xfe\xff', the file is UTF-16 big endian
The 'utf-8-sig' encoding explicitly asks for this BOM to be written at the beginning of the file.
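A quick illustration of the signature codec adding those bytes (Python 2, matching the question):

# encoding with the signature codec prepends the 3 BOM bytes
u'ID'.encode('utf_8_sig')   # -> '\xef\xbb\xbfID'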
To strip it automatically when reading a CSV file in Python 2, you can use the codecs module:

with open(file_path, 'r') as infile:
    reader = csv.reader(codecs.EncodedFile(infile, 'utf-8', 'utf-8-sig'), delimiter=";")

EncodedFile wraps the original file object, decoding it as utf-8-sig (which skips the BOM) and re-encoding it as utf-8 with no BOM.
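A minimal sketch of what that wrapper does to the leading bytes (assuming Python 2; io.BytesIO stands in for a real file):

import codecs
import io

raw = io.BytesIO('\xef\xbb\xbfID;timestamp\r\n')        # utf-8-sig bytes, BOM first
wrapped = codecs.EncodedFile(raw, 'utf-8', 'utf_8_sig')
print repr(wrapped.read())                              # 'ID;timestamp\r\n' -- the BOM is gone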
You want to use the EncodedFile method from the codecs library, as in Serge Ballesta's answer.
However, on Python 2.7 the spelling utf-8-sig was not accepted for me as an alias for the UTF-8-sig codec; use utf_8_sig instead. Additionally, note the order of the parameters: the output data encoding comes first and the file encoding second: codecs.EncodedFile(file, data_encoding, file_encoding=None, errors='strict').
Here's the full result:
import codecs

with open(file_path, 'r') as infile:
    reader = csv.reader(codecs.EncodedFile(infile, 'utf8', 'utf_8_sig'), delimiter=";")
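For comparison, a minimal Python 3 sketch, where the built-in open can strip the BOM directly (assumes Python 3; file_path as in the question):

import csv

# 'utf-8-sig' transparently skips a leading BOM if one is present
with open(file_path, 'r', encoding='utf-8-sig', newline='') as infile:
    reader = csv.reader(infile, delimiter=';')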

Python 3.6 utf-8 UnicodeEncodeError

#!/usr/bin/env python3
import glob
import xml.etree.ElementTree as ET

filenames = glob.glob("C:\\Users\\####\\Desktop\\BNC2\\[A00-ZZZ]*.xml")
out_lines = []
for filename in filenames:
    with open(filename, 'r', encoding="utf-8") as content:
        tree = ET.parse(content)
        root = tree.getroot()
        for w in root.iter('w'):
            lemma = w.get('hw')
            pos = w.get('pos')
            tag = w.get('c5')
            out_lines.append(w.text + "," + lemma + "," + pos + "," + tag)

with open("C:\\Users\\####\\Desktop\\bnc.txt", "w") as out_file:
    for line in out_lines:
        line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
        out_file.write("{}\n".format(line))
Gives the error:
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 0: character maps to <undefined>
I thought this line would have solved that:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
You need to specify the encoding when opening the output file, same as you did with the input file:
with open("C:\\Users\\####\\Desktop\\bnc.txt", "w", encoding="utf-8") as out_file:
for line in out_lines:
out_file.write("{}\n".format(line))
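Background: when no encoding is given, Python 3 opens the file with the locale's preferred encoding, which on many Windows systems is a cp125x 'charmap' codec; that is where the error comes from. A quick way to check (the output shown is only an example):

import locale

print(locale.getpreferredencoding(False))   # e.g. 'cp1252' on a typical Windows setup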
If your script has multiple reads and writes and you want a particular encoding (say, UTF-8) for all of them, you can also change the default encoding; note that this reload(sys) trick exists only in Python 2:
import sys
reload(sys)
sys.setdefaultencoding('UTF8')
We should use it only when we have multiple reads/writes, though, and it should be done at the beginning of the script.
Changing default encoding of Python?

Getting unicode decode error in python?

I am using the Facebook Graph API but I get an error when I try to run graph.py. How should I resolve this charmap problem? I am facing a Unicode decode error.
In graph.py:

table = json2html.convert(json=variable)
htmlfile = table.encode('utf-8')
f = open('Table.html', 'wb')
f.write(htmlfile)
f.close()

# replacing '&gt;' with '>' and '&lt;' with '<'
f = open('Table.html', 'r')
s = f.read()
s = s.replace("&gt;", ">")
s = s.replace("&lt;", "<")
f.close()

# writing content to html file
f = open('Table.html', 'w')
f.write(s)
f.close()

# output
webbrowser.open("Table.html")

else:
    print("We couldn't find anything for", PageName)
I could not understand why I am facing this issue. I am also getting an error on 's = f.read()'.
In the error message you can see that Python tries to guess the encoding used in the file when you read it, and it falls back to cp1250 (probably because Windows uses cp1250 as the system default), but that is the wrong encoding because you saved the file as UTF-8.
So you have to use open(..., encoding='utf-8') and then it will not have to guess.
# replacing '&gt;' with '>' and '&lt;' with '<'
f = open('Table.html', 'r', encoding='utf-8')
s = f.read()
f.close()

s = s.replace("&gt;", ">")
s = s.replace("&lt;", "<")

# writing content to html file
f = open('Table.html', 'w', encoding='utf-8')
f.write(s)
f.close()
But you could change it before you save it. And then you don't have to open it again.
table = json2html.convert(json=variable)
table = table.replace("&gt;", ">").replace("&lt;", "<")

f = open('Table.html', 'w', encoding='utf-8')
f.write(table)
f.close()

# output
webbrowser.open("Table.html")
BTW: Python has the function html.unescape(text) to replace all such entities (like &gt;) in one go:
import html

table = json2html.convert(json=variable)
table = html.unescape(table)

f = open('Table.html', 'w', encoding='utf-8')
f.write(table)
f.close()

# output
webbrowser.open("Table.html")

decoding a .txt - 'utf-8' codec can't decode byte 0xf3

I am taking data (domains) from an Excel file into a text file and then checking the availability of the domains. The problem pops up when I try to use that text file after taking the data from the Excel file.
This is the data in the excel file
arete.cl
cbsanbernardo.cl
ludala.cl
puntotactico.cl
sunriseskateboard.cl
ellegrand.cl
turismosantodomingo.cl
delotroladof.cl
produccionesmandala.cl
So, basically, if I type the domains manually into the text file the script works fine. But if I take the domains from the Excel file into the text file and then run the script, this error pops up:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 194: invalid continuation byte
The same happens if I try to check the domains directly from the excel file.
So should I decode the .txt or the .xlsx? How can I do it?
#!/usr/bin/python
import io      # used below but missing from the original imports
import gzip    # used below but missing from the original imports
import os
import pythonwhois
import openpyxl
from openpyxl import load_workbook

pathx = 'path'
filex = 'file.xlsx'

print('**Availability of domains**')
os.chdir(pathx)
workbook = openpyxl.load_workbook(filex, data_only=True)
sheet = workbook.get_sheet_by_name('Dic')
domainsz = io.open(pathx + '\\domains.txt', 'a')
for i in range(1, 10):
    domainx = sheet["A" + str(i * 2)].value
    if domainx is not None:
        domainsz.write(domainx + '\n')
        print(domainx)
domainsz.close()

with gzip.open('domains.txt' + ".gz", "wb") as outfile:
    outfile.write(bytes(plaintext, 'UTF-8'))   # note: plaintext is never defined above

domains = []
available = []
unavailable = []

def getDomains():
    with io.open('domains.txt', 'r', encoding='latin-1') as f:
        for domainName in f.read().splitlines():
            domains.append(domainName)

def run():
    for dom in domains:
        if dom is not None and dom != '':
            details = pythonwhois.get_whois(dom)
            if details['contacts']['registrant'] is not None:
                unavailable.append(dom)
            else:
                available.append(dom)

def printAvailability():
    print("-----------------------------")
    print("Unavailable Domains: ")
    print("-----------------------------")
    for un in unavailable:
        print(un)
    print("\n")
    print("-----------------------------")
    print("Available Domains: ")
    print("-----------------------------")
    for av in available:
        print(av)

if __name__ == "__main__":
    getDomains()
    run()
    printAvailability()
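As an aside, a minimal sketch of why that byte trips the UTF-8 decoder but survives latin-1, where 0xf3 maps to 'ó' (which suggests the text file was written in a non-UTF-8 encoding):

raw = b'\xf3'                    # the offending byte from the traceback
try:
    raw.decode('utf-8')          # 0xf3 starts a multi-byte UTF-8 sequence, so a lone byte fails
except UnicodeDecodeError as err:
    print(err)
print(raw.decode('latin-1'))     # 'ó' -- latin-1 maps every single byte, so it always succeeds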

Encoding UTF-8 when writing to CSV

I have some simple code to ingest some JSON Twitter data and output a few specific fields into separate columns of a CSV file. My problem is that I cannot for the life of me figure out the proper way to encode the output as UTF-8. Below is the closest I've been able to get, with the help of a member here, but it still isn't running correctly: it fails because of the unique characters in the tweet text field.
import json
import sys
import csv
import codecs

def main():
    writer = csv.writer(codecs.getwriter("utf-8")(sys.stdout), delimiter="\t")
    for line in sys.stdin:
        line = line.strip()
        data = []
        try:
            data.append(json.loads(line))
        except ValueError as detail:
            continue
        for tweet in data:
            ## deletes any rate limited data
            if tweet.has_key('limit'):
                pass
            else:
                writer.writerow([
                    tweet['id_str'],
                    tweet['user']['screen_name'],
                    tweet['text']
                ])

if __name__ == '__main__':
    main()
From Docs:
https://docs.python.org/2/howto/unicode.html
a = "string"
encodedstring = a.encode('utf-8')
If that does not work:
Python DictWriter writing UTF-8 encoded CSV files
I have had the same problem. I have a large amount of data from the Twitter firehose, so every possible complication case has arisen!
I've solved it as follows using try/except:
If the dict value is a string (isinstance(value, basestring)), I try to encode it straight away. If it is not a string, I make it a string and then encode it.
If this fails, it's because some joker is tweeting odd symbols to mess up my script. In that case I first decode and then re-encode: value.decode('utf-8').encode('utf-8') for strings, and decode, make into a string, and re-encode for non-strings.
Have a go with this:
import csv

def export_to_csv(list_of_tweet_dicts, export_name="flat_twitter_output.csv"):
    utf8_flat_tweets = []
    keys = []
    for tweet in list_of_tweet_dicts:
        tmp_tweet = tweet
        for key, value in tweet.iteritems():
            if key not in keys:
                keys.append(key)
            # convert fields to utf-8 if text
            try:
                if isinstance(value, basestring):
                    tmp_tweet[key] = value.encode('utf-8')
                else:
                    tmp_tweet[key] = str(value).encode('utf-8')
            except:
                if isinstance(value, basestring):
                    tmp_tweet[key] = value.decode('utf-8').encode('utf-8')
                else:
                    tmp_tweet[key] = str(value.decode('utf-8')).encode('utf-8')
        utf8_flat_tweets.append(tmp_tweet)
        del tmp_tweet
    list_of_tweet_dicts = utf8_flat_tweets
    del utf8_flat_tweets
    with open(export_name, 'w') as f:
        dict_writer = csv.DictWriter(f, fieldnames=keys, quoting=csv.QUOTE_ALL)
        dict_writer.writeheader()
        dict_writer.writerows(list_of_tweet_dicts)
    print "exported tweets to '" + export_name + "'"
    return list_of_tweet_dicts
hope that helps you.
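For anyone on Python 3, none of this encode/decode juggling is needed, since the csv module works on text directly. A minimal sketch of the same export (the function name is just illustrative; it assumes the same list-of-dicts input):

import csv

def export_to_csv_py3(tweets, export_name="flat_twitter_output.csv"):
    # collect every key that appears in any tweet, preserving first-seen order
    keys = []
    for tweet in tweets:
        for key in tweet:
            if key not in keys:
                keys.append(key)
    # open the file as UTF-8 text; csv then handles unicode natively
    with open(export_name, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=keys, restval='', quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(tweets)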
