Incorrect character in saving process for excel - python

I'm creating a new coloumn and this new file and want to save. But in there excel file a coloumn have a character. How can I skip this line the save process or change line to a correct character?
import pandas as pd
path = '/My Documents/Python/'
fileName = "test.xlsx"
# open the excel file
ef = pd.ExcelFile(path+fileName)
# read the contents
df = pd.read_excel(path+fileName, sheet_name=ef.sheet_names[0])
print(df['Content'])
print(df['Engine'])
i = 1
for test in df['Content']:
try:
print(i)
print(test)
except:
print("An exception occurred")
break
i += 1
df['Test'] = 'value'
df.to_excel('My Documents/Python/Test_NEW.xlsx')
Error message
data, consumed = self.encode(object, self.errors)
UnicodeEncodeError: 'utf-8' codec can't encode character '\ude7c' in position 470: surrogates not allowed

df['Content'] = df['Content'].astype(str)

Related

UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position 7458: character maps to <undefined>

I'm trying to open open a file using a CSV module but i recived this error
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position
7458: character maps to
I checked the file, and file encoding is UTF-8...
Below is my code. The error is in line 63
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as PT
import traceback
#Global variables
#Variable to hold file name
FILE_NAME = "CustomLabels.labels"
#Variable to hold delimiter
DELIMETE = ','
#Variable to hold root category in xml hierarchy
CUSTOM_LABELS = "CustomLabels"
#Variable to hold sub element in xml
LABELS = "labels"
#Variable to hold argument for CustomLabels category
XMLNS = 'xmlns'
#Variable to hold value for argument for CustomLabels category
URL = "http://soap.sforce.com/2006/04/metadata"
#variable to save file
SAVE_PATH = ""
#variable to hold file to read name
FILE_TO_READ = "CustomLabels.csv"
#Function to open the file with ugly XML
def openFile():
print('D:M|***| openFile')
try:
customLabelsFile = open(FILE_NAME, 'r+',encoding="utf-8")
except Exception:
traceback.print_exc()
return customLabelsFile
#Function to make pretty XML on output
def prettyXMLfile():
print('D:M|***| prettyXMLfile')
try:
dom = PT.parse(FILE_NAME)
pretty_xml_as_string = dom.toprettyxml()
except Exception:
traceback.print_exc()
return pretty_xml_as_string
#Function to save preetyXML
#para
#xml_file - it is a file from openFile Function
#context - it is a formatted xml
def saveAsPrertyXML(xml_file,context):
try:
n = xml_file.write(context)
xml_file.close()
except Exception:
traceback.print_exc()
with open(FILE_TO_READ,encoding="utf-8",errors='ignore',"rb") as csv_file:
csv_reader = csv.reader(csv_file, encoding='utf-8',delimiter=DELIMETE)
line_count = 0
listOfColumnNames = list()
customLabels = ET.Element(CUSTOM_LABELS)
customLabels.set(XMLNS,URL)
try:
for row in csv_reader:
if line_count == 0:
listOfColumnNames.append(row)
finalListOfColumns = listOfColumnNames[line_count]
line_count += 1
else:
index = 0
while index < len(finalListOfColumns):
if index == 0:
labels = ET.SubElement(customLabels, LABELS)
ET.SubElement(labels, finalListOfColumns[index]).text = row[index]
index += 1
line_count += 1
except Exception:
print(f'The line with error is {line_count}')
traceback.print_exc()
tree = ET.ElementTree(customLabels)
tree.write(FILE_NAME, xml_declaration=True,encoding='utf-8',method="xml")
uglyXML = openFile()
prettyXMLasString = prettyXMLfile()
saveAsPrertyXML(uglyXML,prettyXMLasString)
print(f'Generator pars {line_count} lines')
print('XML file saved succesfull')
Ok i figure out what was wrong
it should be:
with open(FILE_TO_READ,"rt",encoding="utf-8") as csv_file:
instead of
with open(FILE_TO_READ,"rb+",encoding="utf-8") as csv_file:

Failing to Convert Files from CSV to Excel

Attempting to convert a folder list of csv files to excel. Unfortunately most of them do not work and I also get following errors. When I do the same via excel front end, it works fine to save them from csv. Any ideas what I might be doing wrong?
import os
import glob
import csv
import openpyxl # from https://pythonhosted.org/openpyxl/ or PyPI (e.g. via pip)
for csvfile in glob.glob(os.path.join('.', '*.csv')):
wb = openpyxl.Workbook()
ws = wb.active
with open(csvfile, 'rb') as f:
reader = csv.reader(f)
for r, row in enumerate(reader, start=1):
for c, val in enumerate(row, start=1):
ws.cell(row=r, column=c).value = val
wb.save(csvfile + '.xlsx')
Get the following errors:
(most recent call last):
File "C:\Users\test\Documents\ConvertCSVtoXLSX\2007+.py", line 14, in
ws.cell(row=r, column=c).value = val
File "C:\Python27\ArcGIS10.7\lib\site-packages\openpyxl\cell\cell.py", line 272, in value
self._bind_value(value)
File "C:\Python27\ArcGIS10.7\lib\site-packages\openpyxl\cell\cell.py", line 229, in _bind_value
value = self.check_string(value)
File "C:\Python27\ArcGIS10.7\lib\site-packages\openpyxl\cell\cell.py", line 180, in check_string
value = unicode(value, self.encoding)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa0 in position 30: invalid start byte
It looks like pyopenxl expects that it will receive data encoded as UTF-8, but the data you are passing it has some other encoding - probably one of the Windows cp* codepages. You can determine the system's default encoding by calling locale.getpreferredencoding. Let's assume it's cp1252.
In the traceback, we can see that this is the failing line:
unicode(value, self.encoding)
resulting in this error:
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa0 in position 30: invalid start byte
pyopenxl is trying to decode the value it receives from UTF-8, and failing; we can work round this by re-encoding the value before passing it to pyopenxl.
for c, val in enumerate(row, start=1):
fixed_val = unicode(val, 'cp1252').encode('utf-8')
ws.cell(row=r, column=c).value = fixed_val
If it's possible that some of your files are encoded as UTF-8 and some are encoded in your system's default encoding, you may need to wrap the original assignment in a try/except block
for c, val in enumerate(row, start=1):
try:
ws.cell(row=r, column=c).value = val
except UnicodeDecodeError:
fixed_val = unicode(val, 'cp1252').encode('utf-8')
ws.cell(row=r, column=c).value = fixed_val

How to translate encoding by ansi into unicode

When I use the CountVectorizer in sklearn, it needs the file encoding in unicode, but my data file is encoding in ansi.
I tried to change the encoding to unicode using notepad++, then I use readlines, it cannot read all the lines, instead it can only read the last line. After that, I tried to read the line into data file, and write them into the new file by using unicode, but I failed.
def merge_file():
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
resname='resule_final.txt'
if os.path.exists(resname):
os.remove(resname)
result = codecs.open(resname,'w','utf-8')
num = 1
for back_name in os.listdir(r'd:\\workspace\\minibatchk-means\\data\\20_newsgroups'):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num ,":" ,str(filename)
num = num+1
path=current_dir + "\\" +str(filename)
source=open(path,'r')
line = source.readline()
line = line.strip('\n')
line = line.strip('\r')
while line !="":
line = unicode(line,"gbk")
line = line.replace('\n',' ')
line = line.replace('\r',' ')
result.write(line + ' ')
line = source.readline()
else:
print 'End file :'+ str(filename)
result.write('\n')
source.close()
print 'End All.'
result.close()
The error message is :UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence
Oh,I find the way.
First, use chardet to detect string encoding.
Second,use codecs to input or output to the file in the specific encoding.
Here is the code.
import chardet
import codecs
import os
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir("d:\\workspace\\minibatchk-means\\data\\20_newsgroups"):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num,":",str(filename)
num=num+1
path=current_dir+"\\"+str(filename)
content = open(path,'r').read()
source_encoding=chardet.detect(content)['encoding']
if source_encoding == None:
print '??' , filename
failed.append(filename)
elif source_encoding != 'utf-8':
content=content.decode(source_encoding,'ignore')
codecs.open(path,'w',encoding='utf-8').write(content)
print failed
Thanks for all your help.

Python 2.7 ascii' codec can't encode character u'\xe4

I have experienced a code problem in Python 2.7, I already used UTF-8, but it still got the exception
"UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 81: ordinal not in range(128)"
My files and contains so many this kind of shit, but for some reason, I'm not allowed to delete it.
desktop,[Search] Store | Automated Titles,google / cpc,Titles > Kesäkaverit,275285048,13
I have tried the below method to avoid, but still, haven't fix it. Can anyone help me ?
1.With "#!/usr/bin/python" in my file header
2.Set setdefaultencoding
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
3.content = unicode(s3core.download_file_to_memory(S3_PROFILE, S3_RAW + file), "utf-8", "ignore")
My code below
content = unicode(s3core.download_file_to_memory(S3_PROFILE, S3_RAW + file), "utf8", "ignore")
rows = content.split('\n')[1:]
for row in rows:
if not row:
continue
try:
# fetch variables
cols = row.rstrip('\n').split(',')
transaction = cols[0]
device_category = cols[1]
campaign = cols[2]
source = cols[3].split('/')[0].strip()
medium = cols[3].split('/')[1].strip()
ad_group = cols[4]
transactions = cols[5]
data_list.append('\t'.join(
['-'.join([dt[:4], dt[4:6], dt[6:]]), country, transaction, device_category, campaign, source,
medium, ad_group, transactions]))
except:
print 'ignoring row: ' + row

decoding a .txt - 'utf-8' codec can't decode byte 0xf3

I am taking data, domains, from an excel file to a text file and then check the availability of the domains. The problem pops up when I try to use that text file after taking the data from the excel file.
This is the data in the excel file
arete.cl
cbsanbernardo.cl
ludala.cl
puntotactico.cl
sunriseskateboard.cl
ellegrand.cl
turismosantodomingo.cl
delotroladof.cl
produccionesmandala.cl
So, basically if I type manually the domains in the text file the script works fine. But if I take the domains from an excel file to a text file and then run the script this errors pops up:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 194: invalid continuation byte
The same happens if I try to check the domains directly from the excel file.
So should I decode the .txt or the .xlsx? How can I do it?
#!/usr/bin/python
import pythonwhois
import openpyxl
from openpyxl import load_workbook
import os
pathx = 'path'
filex = 'file.xlsx'
print('**Availability of domains**')
os.chdir(pathx)
workbook = openpyxl.load_workbook(filex, data_only = True)
sheet = workbook.get_sheet_by_name('Dic')
domainsz = io.open(pathx + '\\domains.txt', 'a')
for i in range(1, 10):
domainx = sheet["A" + str(i * 2)].value
if domainx is not None:
domainsz.write(domainx + '\n')
print(domainx)
domainsz.close()
with gzip.open('domains.txt' + ".gz", "wb") as outfile:
outfile.write(bytes(plaintext, 'UTF-8'))
domains = []
available = []
unavailable = []
def getDomains():
with io.open('domains.txt', 'r', encoding='latin-1') as f:
for domainName in f.read().splitlines():
domains.append(domainName)
def run():
for dom in domains:
if dom is not None and dom != '':
details = pythonwhois.get_whois(dom)
if details['contacts']['registrant'] is not None:
unavailable.append(dom)
else:
available.append(dom)
def printAvailability():
print ("-----------------------------")
print ("Unavailable Domains: ")
print ("-----------------------------")
for un in unavailable:
print (un)
print ("\n")
print ("-----------------------------")
print ("Available Domains: ")
print ("-----------------------------")
for av in available:
print (av)
if __name__ == "__main__":
getDomains()
run()
printAvailability()

Categories

Resources