'ascii' codec can't encode... while uploading csv file - python

I've got a class for uploading my csv files with holidays to my fullcalendar. It looks like this:
class UploadVacationsView(APIView):
def put(self, request, *args, **kwargs):
try:
# check file type
mime = MimeTypes()
url = urllib.pathname2url(request.FILES['file']._name)
mime_type = mime.guess_type(url)
if 'text/csv' not in mime_type:
raise APIException(code=400, detail='File type must be CSV')
vacations_list =[]
csv_file = StringIO(request.data.get('file', None).read().decode('utf-8'))
user_tz = pytz.timezone(request.user.common_settings.time_zone)
schedule_file = ScheduleFile.objects.create(user=request.user)
instance_hebcal = HebcalService()
events = instance_hebcal.process_csv(csv_file, user_tz)
...
And in the other class, I've got a method that works with csv files:
class HebcalService(...):
def process_csv(self, csv_file, user_tz):
events = []
csv_input = csv.reader(csv_file.readlines(), dialect=csv.excel)
curr_row = 1
start_date = None
end_date = None
start_name = None
holiday_name = ''
last_event = {'subject': '',
'date': '',
}
for row in list(csv_input)[1:]:
subject, date, time, _, _, _, _ = row[:7]
curr_row += 1
row = [unicode(cell.strip(), 'utf-8') for cell in row]
if 'lighting' in subject and not start_date:
start_date = user_tz.localize(format_datetime(date, time))
if date == last_event['date']:
start_name = last_event['subject']
Everything is OK when working with English holiday names, but when I encounter Hebrew names it throws an error:
Traceback (most recent call last):
File "/home/stas/work/vacation/vmode/apps/marketplaces/base/api/views.py", line 47, in put
events = instance_hebcal.process_csv(csv_file, user_tz)
File "/home/stas/work/vacation/vmode/apps/marketplaces/base/services/hebcal.py", line 106, in process_csv
for row in list(csv_input)[1:]:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-23: ordinal not in range(128)
I've read about converting all strings to Unicode, but I don't understand where that default ASCII encoding comes from. How can I handle it and save the string with holiday_name from the csv file?

Related

Python Trouble Parsing a .max translated to OLE File => output unreadable in text format

The following script outputs files unreadable in .txt format. Please advise.
I took inspiration from: https://area.autodesk.com/m/drew.avis/tutorials/writing-and-reading-3ds-max-scene-sidecar-data-in-python
This is to replicate a mako shark into a mechanical robot.
import olefile
# set this to your file
f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'
def cleanString(data,isArray=False):
# remove first 6 bytes + last byte
data = data[6:]
if isArray:
data = data[:-1]
return data
with olefile.OleFileIO(f) as ole:
ole.listdir()
print(ole.listdir())
i = 0
for entry in ole.listdir():
i = i + 1
print(entry)
if i > 2:
fin = ole.openstream(entry)
# myString = fin.read().decode("utf-16")
# myString = cleanString(myString, isArray=True)
fout = open(entry[0], "wb")
print(fout)
while True:
s = fin.read(8192)
if not s:
break
fout.write(s)
Please advise.
https://www.turbosquid.com/fr/3d-models/max-shortfin-mako-shark-rigged/991102#
I also tried this:
with olefile.OleFileIO(f) as ole:
ole.listdir()
print(ole.listdir())
i = 0
for entry in ole.listdir():
i = i + 1
print(entry)
if i > 2:
fin = ole.openstream(entry)
#myString = fin.read().decode("utf-16")
#myString = cleanString(myString, isArray=True)
fout = open(entry[0], "w")
print(fout)
while True:
s = fin.read(8192)
if not s:
break
fout.write(cleanString(s, isArray = True).decode("utf-8"))
# stream = ole.openstream('CustomFileStreamDataStorage/MyString')
# myString = stream.read().decode('utf-16')
# myString = cleanString(myString)
# stream = ole.openstream('CustomFileStreamDataStorage/MyGeometry')
# myGeometry = stream.read().decode('utf-16')
# myGeometry = cleanString(myGeometry, isArray=True)
# myGeometry = myGeometry.split('\x00')
# stream = ole.openstream('CustomFileStreamDataStorage/MyLayers')
# myLayers = stream.read().decode('utf-16')
# myLayers = cleanString(myLayers, isArray=True)
# myLayers = myLayers.split('\x00')
# print ("My String: {}\nMy Geometry: {}\nMy Layers: {}".format (myString, myGeometry, myLayers))
What is the right encoding to decode from?
Exception has occurred: UnicodeDecodeError
'utf-8' codec can't decode bytes in position 4-5: invalid continuation byte
File "C:\MRP\ALG_LIN.py", line 59, in <module>
fout.write(cleanString(s, isArray = True).decode("utf-8"))
Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode characters in position 2-5: character maps to <undefined>
File "C:\MRP\ALG_LIN.py", line 59, in <module>
fout.write(cleanString(s, isArray = True).decode("utf-16"))
KR,
Ludo

Python script for converting CSV to JSON for use with Auth0 has problems after system upgrade

This script is designed to convert a CSV to JSON for use with Auth0, and was previously working until I moved to a new machine - after multiple Python upgrades, module installs, and attempted fixes, I've reached the end of my bug-whacking prowess.
import csv, json, bcrypt, sys
csvPath = sys.argv[1]
jsonPath = sys.argv[2]
data = []
f = open( csvPath, 'r' )
reader = csv.DictReader( f, fieldnames = ( "name","email","password" ))
next(reader)
for row in reader:
entry = {}
sub1 = {}
sub2 = {}
pwd = row['password']
password = pwd.encode('utf-8')
salt = bcrypt.gensalt(rounds=10)
sub2['value'] = bcrypt.hashpw(password, salt)
entry['name'] = row['name']
entry['email'] = row['email']
entry['email_verified'] = True
sub1['algorithm'] = 'bcrypt'
sub1['hash'] = sub2
entry['custom_password_hash'] = sub1
data.append(entry)
out = json.dumps( data )
print ("JSON parsed!")
f = open( jsonPath , 'w')
f.write(out)
print ("JSON saved")
I was initially getting a TypeError: Unicode-objects must be encoded before hashing error, which prompted me to add the .encode on line 16.
That changed the error to this:
Traceback (most recent call last):
File "python-auth0.py", line 28, in <module>
out = json.dumps( data )
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/__init__.py", line 231, in dumps
return _default_encoder.encode(obj)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type bytes is not JSON serializable
I attempted to add a decode attribute to 'data' in line 28, which led to AttributeError: 'list' object has no attribute 'decode'
So I'm clearly just making wild stabs in the dark here - any help to get this working again would be appreciated!
I think what you can do is that you decode the hashed back into a python string
import json, bcrypt
pwd = "mypassword"
password = pwd.encode('utf-8')
salt = bcrypt.gensalt(rounds=10)
hashed_pwd = bcrypt.hashpw(password, salt)
decoded_hash = hashed_pwd.decode('utf-8')
# this will raise exception
print(json.dumps(hashed_pwd))
# this will not
print(json.dumps(decoded_hash))
in your case, try:
import csv, json, bcrypt, sys
csvPath = sys.argv[1]
jsonPath = sys.argv[2]
data = []
f = open( csvPath, 'r' )
reader = csv.DictReader( f, fieldnames = ( "name","email","password" ))
next(reader)
for row in reader:
entry = {}
sub1 = {}
sub2 = {}
pwd = row['password']
password = pwd.encode('utf-8')
salt = bcrypt.gensalt(rounds=10)
sub2['value'] = bcrypt.hashpw(password, salt).decode('utf-8') # <-- here
entry['name'] = row['name']
entry['email'] = row['email']
entry['email_verified'] = True
sub1['algorithm'] = 'bcrypt'
sub1['hash'] = sub2
entry['custom_password_hash'] = sub1
data.append(entry)
out = json.dumps( data )
print ("JSON parsed!")
f = open( jsonPath , 'w')
f.write(out)
print ("JSON saved")

UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position 7458: character maps to <undefined>

I'm trying to open a file using the CSV module, but I received this error:
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position
7458: character maps to <undefined>
I checked the file, and file encoding is UTF-8...
Below is my code. The error is in line 63
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as PT
import traceback
#Global variables
#Variable to hold file name
FILE_NAME = "CustomLabels.labels"
#Variable to hold delimiter
DELIMETE = ','
#Variable to hold root category in xml hierarchy
CUSTOM_LABELS = "CustomLabels"
#Variable to hold sub element in xml
LABELS = "labels"
#Variable to hold argument for CustomLabels category
XMLNS = 'xmlns'
#Variable to hold value for argument for CustomLabels category
URL = "http://soap.sforce.com/2006/04/metadata"
#variable to save file
SAVE_PATH = ""
#variable to hold file to read name
FILE_TO_READ = "CustomLabels.csv"
#Function to open the file with ugly XML
def openFile():
print('D:M|***| openFile')
try:
customLabelsFile = open(FILE_NAME, 'r+',encoding="utf-8")
except Exception:
traceback.print_exc()
return customLabelsFile
#Function to make pretty XML on output
def prettyXMLfile():
print('D:M|***| prettyXMLfile')
try:
dom = PT.parse(FILE_NAME)
pretty_xml_as_string = dom.toprettyxml()
except Exception:
traceback.print_exc()
return pretty_xml_as_string
#Function to save preetyXML
#para
#xml_file - it is a file from openFile Function
#context - it is a formatted xml
def saveAsPrertyXML(xml_file,context):
try:
n = xml_file.write(context)
xml_file.close()
except Exception:
traceback.print_exc()
with open(FILE_TO_READ,encoding="utf-8",errors='ignore',"rb") as csv_file:
csv_reader = csv.reader(csv_file, encoding='utf-8',delimiter=DELIMETE)
line_count = 0
listOfColumnNames = list()
customLabels = ET.Element(CUSTOM_LABELS)
customLabels.set(XMLNS,URL)
try:
for row in csv_reader:
if line_count == 0:
listOfColumnNames.append(row)
finalListOfColumns = listOfColumnNames[line_count]
line_count += 1
else:
index = 0
while index < len(finalListOfColumns):
if index == 0:
labels = ET.SubElement(customLabels, LABELS)
ET.SubElement(labels, finalListOfColumns[index]).text = row[index]
index += 1
line_count += 1
except Exception:
print(f'The line with error is {line_count}')
traceback.print_exc()
tree = ET.ElementTree(customLabels)
tree.write(FILE_NAME, xml_declaration=True,encoding='utf-8',method="xml")
uglyXML = openFile()
prettyXMLasString = prettyXMLfile()
saveAsPrertyXML(uglyXML,prettyXMLasString)
print(f'Generator pars {line_count} lines')
print('XML file saved succesfull')
OK, I figured out what was wrong.
It should be:
with open(FILE_TO_READ,"rt",encoding="utf-8") as csv_file:
instead of
with open(FILE_TO_READ,"rb+",encoding="utf-8") as csv_file:

Python/JSON: Errors one after another

I am following an online tutorial for making a chatbot...here is the code for the beginning portion
import sqlite3
import json
from datetime import datetime
timeframe = '2015-01'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
def format_data(data):
data = data.replace('\n', ' newlinechar ').replace('\r', ' newlinechar ').replace('"', "'")
return data
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open('C:/Users/oriba/Desktop/Month of Reddit/RC_2015-01'.format(timeframe.split('-')[0], timeframe), encoding='ISO-8859-1', buffering=1000) as f:
for row in f:
print(row)
row_counter += 1
row = json.load(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
When that runs, I get this error:
Traceback (most recent call last):
File "C:/Users/oriba/Desktop/Month of Reddit/chatbot.py", line 37, in <module>
for row in f:
File "C:\Users\oriba\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 97: character maps to <undefined>
After searching online, I found that adding "encoding='ISO-8859-1'" to 'with open()' should fix it...then I get this error:
Traceback (most recent call last):
File "C:/Users/oriba/Desktop/Month of Reddit/chatbot.py", line 40, in <module>
row = json.load(row)
File "C:\Users\oriba\AppData\Local\Programs\Python\Python36\lib\json\__init__.py", line 296, in load
BZh91AY&SYÔAÙÕÿî»ÿÿÿúÿÿÿÿÿÿÿÿc*è` 1Ï. ñÕ ¢U±Ã$'¤;\=# ÝX9kl´ÜιKW; É# Ò PQáGF PÝ Û P :è
return loads(fp.read(),
AttributeError: 'str' object has no attribute 'read'
And now I'm at a loss. I know this is a lot, this is complex for me. I appreciate any help :)
json.loads() does the job.
loads() reads from a str object, while load() reads from a File object.
Your code is:
for row in f:
...
row here is a str

'ascii' codec can't encode character

I am trying to parse an HTML link in the code and take its source code as a list of strings. As I have to get some relevant data from it, I am decoding everything into the UTF-8 scheme.
I am also using beautifulsoup4 which extracts the text in decoded form.
This is my code that I have used.
def do_underline(line,mistakes):
last = u'</u></font>'
first = u"<u><font color='red'>"
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
lenm = len(mistakes)
for i in range(lenm):
a.insert(mistakes[lenm-i-1][2],last)
a.insert(mistakes[lenm-i-1][1],first)
b = u''
return b.join(a)
def readURL(u):
"""
URL -> List
Opens a webpage's source code and extract it text
along with blank and new lines.
enumerate all lines.(including blank and new lines
"""
global line_dict,q
line_dict = {}
p = opener.open(u)
p1 = p.readlines()
q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
q1 = [BeautifulSoup(i).get_text() for i in q]
q2 = list(enumerate(q1))
line_dict = {i:j for (i,j) in enumerate(q)}
return q2
def process_file(f):
"""
(.html file) -> List of Spelling Mistakes
"""
global line_dict
re = readURL(f)
de = del_blankempty(re)
fd = form_dict(de)
fflist = []
chklst = []
for i in fd:
chklst = chklst + list_braces(i,line_dict)
fflist = fflist + find_index_mistakes(i,fd)
final_list = list(set(is_inside_braces_or_not(chklst,fflist)))
final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}
for i in line_dict:
if i in fd:
line_dict[i] = do_underline(line_dict[i],final_dict[i])
else:
line_dict[i] = line_dict[i]
create_html_file(line_dict)
print "Your Task is completed"
def create_html_file(a):
import io
fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
for i in a:
fl.write(a[i])
print "Your HTML text file is created"
I am getting the following error every time i run the script.
Traceback (most recent call last):
File "checker.py", line 258, in <module>
process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
File "checker.py", line 243, in process_file
line_dict[i] = do_underline(line_dict[i],final_dict[i])
File "checker.py", line 89, in do_underline
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
Any suggestions on how I can remove this error?
If there is a way to decode everything coming from the given link into UTF-8, I think that will solve the problem.

Categories

Resources