decoding a .txt - 'utf-8' codec can't decode byte 0xf3

decoding a .txt - 'utf-8' codec can't decode byte 0xf3 - python

I am taking data, domains, from an excel file to a text file and then check the availability of the domains. The problem pops up when I try to use that text file after taking the data from the excel file.
This is the data in the excel file
arete.cl
cbsanbernardo.cl
ludala.cl
puntotactico.cl
sunriseskateboard.cl
ellegrand.cl
turismosantodomingo.cl
delotroladof.cl
produccionesmandala.cl
So, basically if I type manually the domains in the text file the script works fine. But if I take the domains from an excel file to a text file and then run the script this errors pops up:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 194: invalid continuation byte
The same happens if I try to check the domains directly from the excel file.
So should I decode the .txt or the .xlsx? How can I do it?
#!/usr/bin/python
import pythonwhois
import openpyxl
from openpyxl import load_workbook
import os
pathx = 'path'
filex = 'file.xlsx'
print('**Availability of domains**')
os.chdir(pathx)
workbook = openpyxl.load_workbook(filex, data_only = True)
sheet = workbook.get_sheet_by_name('Dic')
domainsz = io.open(pathx + '\\domains.txt', 'a')
for i in range(1, 10):
domainx = sheet["A" + str(i * 2)].value
if domainx is not None:
domainsz.write(domainx + '\n')
print(domainx)
domainsz.close()
with gzip.open('domains.txt' + ".gz", "wb") as outfile:
outfile.write(bytes(plaintext, 'UTF-8'))
domains = []
available = []
unavailable = []
def getDomains():
with io.open('domains.txt', 'r', encoding='latin-1') as f:
for domainName in f.read().splitlines():
domains.append(domainName)
def run():
for dom in domains:
if dom is not None and dom != '':
details = pythonwhois.get_whois(dom)
if details['contacts']['registrant'] is not None:
unavailable.append(dom)
else:
available.append(dom)
def printAvailability():
print ("-----------------------------")
print ("Unavailable Domains: ")
print ("-----------------------------")
for un in unavailable:
print (un)
print ("\n")
print ("-----------------------------")
print ("Available Domains: ")
print ("-----------------------------")
for av in available:
print (av)
if __name__ == "__main__":
getDomains()
run()
printAvailability()

Related

Unicode Error when I try import a txt file tab separated

(I work on Mac)
When I insert my python code to obtain data from txt file (tab separated) I have the error: "'utf-8' codec can't decode byte 0xa3 in position 4186: invalid start byte".
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
def load_transactions (file_name, sep="\t"):
lines = open(file_name, 'rt').readlines()
transactions_matrix = []
for l in lines:
l = l.rstrip('\n')
transaction = l.split(sep)
transactions_matrix.append(transaction)
return transactions_matrix
groceries=load_transactions("Online_Retail.txt",sep="\t")
len(groceries)
Thank you.

I resolved this form:
First discover the unicode of the file
from chardet.universaldetector import UniversalDetector
usock = open('/Users/leonorbrites/Desktop/Online_Retail.txt', 'rb')
detector = UniversalDetector()
for line in usock.readlines():
detector.feed(line)
if detector.done: break
detector.close()
usock.close()
print (detector.result)
Then change the unicode from my file
def transactions (file_name, sep="\t"):
lines = open(file_name,'rt', encoding='iso-8859-1').readlines()
transactions_matrix = []
for l in lines:
l = l.rstrip('\n')
transaction = l.split(sep)
transactions_matrix.append(transaction)
return transactions_matrix
retail=transactions('/Users/leonorbrites/Desktop/Online_Retail.txt', sep="\t")
len(retail)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position 7458: character maps to <undefined>

I'm trying to open open a file using a CSV module but i recived this error
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position
7458: character maps to
I checked the file, and file encoding is UTF-8...
Below is my code. The error is in line 63
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as PT
import traceback
#Global variables
#Variable to hold file name
FILE_NAME = "CustomLabels.labels"
#Variable to hold delimiter
DELIMETE = ','
#Variable to hold root category in xml hierarchy
CUSTOM_LABELS = "CustomLabels"
#Variable to hold sub element in xml
LABELS = "labels"
#Variable to hold argument for CustomLabels category
XMLNS = 'xmlns'
#Variable to hold value for argument for CustomLabels category
URL = "http://soap.sforce.com/2006/04/metadata"
#variable to save file
SAVE_PATH = ""
#variable to hold file to read name
FILE_TO_READ = "CustomLabels.csv"
#Function to open the file with ugly XML
def openFile():
print('D:M|***| openFile')
try:
customLabelsFile = open(FILE_NAME, 'r+',encoding="utf-8")
except Exception:
traceback.print_exc()
return customLabelsFile
#Function to make pretty XML on output
def prettyXMLfile():
print('D:M|***| prettyXMLfile')
try:
dom = PT.parse(FILE_NAME)
pretty_xml_as_string = dom.toprettyxml()
except Exception:
traceback.print_exc()
return pretty_xml_as_string
#Function to save preetyXML
#para
#xml_file - it is a file from openFile Function
#context - it is a formatted xml
def saveAsPrertyXML(xml_file,context):
try:
n = xml_file.write(context)
xml_file.close()
except Exception:
traceback.print_exc()
with open(FILE_TO_READ,encoding="utf-8",errors='ignore',"rb") as csv_file:
csv_reader = csv.reader(csv_file, encoding='utf-8',delimiter=DELIMETE)
line_count = 0
listOfColumnNames = list()
customLabels = ET.Element(CUSTOM_LABELS)
customLabels.set(XMLNS,URL)
try:
for row in csv_reader:
if line_count == 0:
listOfColumnNames.append(row)
finalListOfColumns = listOfColumnNames[line_count]
line_count += 1
else:
index = 0
while index < len(finalListOfColumns):
if index == 0:
labels = ET.SubElement(customLabels, LABELS)
ET.SubElement(labels, finalListOfColumns[index]).text = row[index]
index += 1
line_count += 1
except Exception:
print(f'The line with error is {line_count}')
traceback.print_exc()
tree = ET.ElementTree(customLabels)
tree.write(FILE_NAME, xml_declaration=True,encoding='utf-8',method="xml")
uglyXML = openFile()
prettyXMLasString = prettyXMLfile()
saveAsPrertyXML(uglyXML,prettyXMLasString)
print(f'Generator pars {line_count} lines')
print('XML file saved succesfull')

Ok i figure out what was wrong
it should be:
with open(FILE_TO_READ,"rt",encoding="utf-8") as csv_file:
instead of
with open(FILE_TO_READ,"rb+",encoding="utf-8") as csv_file:

How to translate encoding by ansi into unicode

When I use the CountVectorizer in sklearn, it needs the file encoding in unicode, but my data file is encoding in ansi.
I tried to change the encoding to unicode using notepad++, then I use readlines, it cannot read all the lines, instead it can only read the last line. After that, I tried to read the line into data file, and write them into the new file by using unicode, but I failed.
def merge_file():
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
resname='resule_final.txt'
if os.path.exists(resname):
os.remove(resname)
result = codecs.open(resname,'w','utf-8')
num = 1
for back_name in os.listdir(r'd:\\workspace\\minibatchk-means\\data\\20_newsgroups'):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num ,":" ,str(filename)
num = num+1
path=current_dir + "\\" +str(filename)
source=open(path,'r')
line = source.readline()
line = line.strip('\n')
line = line.strip('\r')
while line !="":
line = unicode(line,"gbk")
line = line.replace('\n',' ')
line = line.replace('\r',' ')
result.write(line + ' ')
line = source.readline()
else:
print 'End file :'+ str(filename)
result.write('\n')
source.close()
print 'End All.'
result.close()
The error message is :UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence

Oh,I find the way.
First, use chardet to detect string encoding.
Second,use codecs to input or output to the file in the specific encoding.
Here is the code.
import chardet
import codecs
import os
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir("d:\\workspace\\minibatchk-means\\data\\20_newsgroups"):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num,":",str(filename)
num=num+1
path=current_dir+"\\"+str(filename)
content = open(path,'r').read()
source_encoding=chardet.detect(content)['encoding']
if source_encoding == None:
print '??' , filename
failed.append(filename)
elif source_encoding != 'utf-8':
content=content.decode(source_encoding,'ignore')
codecs.open(path,'w',encoding='utf-8').write(content)
print failed
Thanks for all your help.

cp949 codec can't encode character error in python

I am using the code below to parse the XML format wikipedia training data into a pure text file:
from __future__ import print_function
import logging
import os.path
import six
import sys
from gensim.corpora import WikiCorpus
if __name__ == '__main__':
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) != 3:
print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
sys.exit(1)
inp, outp = sys.argv[1:3]
space = " "
i = 0
output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
for text in wiki.get_texts():
if six.PY3:
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
# ###another method###
# output.write(
# space.join(map(lambda x:x.decode("utf-8"), text)) + '\n')
else:
output.write(space.join(text) + "\n")
i = i + 1
if (i % 10000 == 0):
logger.info("Saved " + str(i) + " articles")
output.close()
logger.info("Finished Saved " + str(i) + " articles")
when I run this code, it gives me a following error message:
File "wiki_parser.py", line 42, in <module>
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
UnicodeEncodeError: 'cp949' codec can't encode character '\u1f00' in position 1537: illegal multibyte sequence
When I searched this error online, most answers told me to add 'utf-8' as the encoding which is already there. What could be the possible issue with the code?

Minimal example
The problem is that your file is opened with an implicit encoding (inferred from your system). I can recreate your issue as follows:
a = '\u1f00'
with open('f.txt', 'w', encoding='cp949') as f:
f.write(a)
Error message: UnicodeEncodeError: 'cp949' codec can't encode character '\u1f00' in position 0: illegal multibyte sequence
You have two options. Either open the file using an encoding which can encode the character you are using:
with open('f.txt', 'w', encoding='utf-8') as f:
f.write(a)
Or open the file as binary and write encoded bytes:
with open('f.txt', 'wb') as f:
f.write(a.encode('utf-8'))
Applied to your code:
I would replace this part:
output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
for text in wiki.get_texts():
if six.PY3:
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
# ###another method###
# output.write(
# space.join(map(lambda x:x.decode("utf-8"), text)) + '\n')
else:
output.write(space.join(text) + "\n")
with this:
from io import open
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
with open(outp, 'w', encoding='utf=8') as output:
for text in wiki.get_texts():
output.write(u' '.join(text) + u'\n')
which should work in both Python 2 and Python 3.

Python Encoding Issue with JSON and CSV

I am having an encoding issue when I run my script below:
Here is the error code:
-UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 9: ordinal not in range(128)
Here is my script:
import logging
import urllib
import csv
import json
import io
import codecs
with open('/home/local/apple.csv',
'rb') as csvinput:
reader = csv.reader(csvinput, delimiter=',')
firstline = True
for row in reader:
if firstline:
firstline = False
continue
address1 = row[0]
print row[0]
locality = row[1]
admin_area = row[2]
query = ' '.join(str(x) for x in (address1, locality, admin_area))
normalized = query.replace(" ", "+")
BaseURL = 'http://localhost:8080/verify?country=JP&freeform='
URL = BaseURL + normalized
print URL
data = urllib.urlopen(URL)
response = data.getcode()
print response
if response == 200:
file= json.load(data)
print file
output_f=open('output.csv','wb')
csvwriter=csv.writer(output_f)
count = 0
for f in file:
if count == 0:
header= f.keys()
csvwriter.writerow(header)
count += 1
csvwriter.writerow(f.values())
output_f.close()
else:
print 'error'
can anyone help me fix this its getting really annoying. I need to encode to utf8

Looks like you are using Python 2.x, instead of python's standard open, use codecs.open where you can optionally pass an encoding to use and what to do when there are errors. Gets a little less confusing in Python 3 where the standard Python open can do this.
So in your two lines where you are opening, do:
with codecs.open('/home/local/apple.csv',
'rb', 'utf-8') as csvinput:
output_f = codecs.open('output.csv','wb', 'utf-8')
The optional error parm defaults to "strict" which raises an exception if the bytes can't be mapped to the given encoding. In some contexts you may want to use 'ignore' or 'replace'.
See the python doc for a bit more info.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

decoding a .txt - 'utf-8' codec can't decode byte 0xf3 - python

Related

Unicode Error when I try import a txt file tab separated

UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position 7458: character maps to <undefined>

How to translate encoding by ansi into unicode

cp949 codec can't encode character error in python

Python Encoding Issue with JSON and CSV

Categories

Resources