I am getting an error while opening a file - python

import os
import sys
file = os.path.expanduser('~/Desktop/python_programing/python_projects/12dicts-6.0.2/agid.txt')
try:
with open(file) as in_file:
loaded_txt = in_file.read()
loaded_txt = [x.lower() for x in loaded_txt]
print (loaded_txt)
except IOError as e:
print("{}\nError opening {}. Terminating program." .format(e, file), file=sys.stderr)
sys,exit(1)
file_to_open = os.path.expanduser('~/Desktop/movie_quotes.txt')
While running the above code, I am getting the following error:
Traceback (most recent call last):
File "/Users/pavandadi/Desktop/python_programing/python_projects/exception.py", line 6, in <module>
loaded_txt = in_file.read()
File "/Users/pavandadi/opt/anaconda3/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2921: invalid continuation byte
What should I do?

I think you need to decode it after reading it, like this:
with open(file) as in_file:
loaded_txt = in_file.read()
loaded_txt = [x.lower() for x in loaded_txt]
print (loaded_txt.decode('UTF-8')) #or any encoding type you want

Related

UnicodeDecodeError when trying to encode/decode CSV file

I am trying to figure out how to make the following function work. Basically, what I'm trying to achieve with this function is to create a CSV file from a DataFrame, encode it, and then decode it for download:
def filedownload(df):
csv = df.to_csv(index=False, encoding='utf-8')
# strings <-> bytes conversion
encoded = base64.b64encode(csv)
decoded = base64.b64decode(encoded)
href = f'Download Predictions'
return href
However, when running the entire program, I get the following error:
File "/app/.heroku/python/lib/python3.9/site-packages/streamlit/script_runner.py", line 354, in _run_script
exec(code, module.__dict__)
File "/app/bioactivity_app.py", line 97, in <module>
build_model(desc_subset)
File "/app/bioactivity_app.py", line 35, in build_model
load_model = pickle.load(open('sars_cov_proteinase_model.pkl'))
File "/app/.heroku/python/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
This is an example of what the input CSV file looks like: https://ufile.io/sbl163ty
This is part of the code I'm using to generate this file:
prediction_output = pd.Series(Y_pred, name = 'pIC50')
molecule_name = pd.Series(load_data['molecular_id'], name = 'Molecule Name')
df2 = pd.concat([molecule_name, prediction_output], axis=1)
csv = df2.to_csv('example_generated.csv')
I believe it has something to do with how the file is getting encoded but am not sure. Any help would be appreciated!

Test/Remove un-decodable bytes when preprocessing

I am trying to clean the dataset that I am working with. It is comprised of .json files that I parse with the following code:
def parse_single_file(filename):
with open("/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Big_Tech_Regulation_data/" + filename) as f:
content = ijson.items(f, "value")
row_list = []
for o in content:
print(filename)
for i in range(0, len(o)):
print(type(o[i]["Document"]))
if o[i]["Document"] is not None: #Drops rows where there is no content
row_list.append(Row_object(o[i]["Jurisdiction"], o[i]["Location"], o[i]["ContentType"], o[i]["Byline"], o[i]["WordLength"],o[i]["Date"],o[i]["Title"],LDA_clean(o[i]["Document"]["Content"]),o[i]["Source"]["Name"]))
return row_list
After running the first 100 or so files correctly, I got the following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte.
Here is the Traceback:
Traceback (most recent call last):
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 240, in <module>
list_of_row_objects = parse_single_file(filename)
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 208, in parse_single_file
for o in content:
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/venv/lib/python3.8/site-packages/ijson/compat.py", line 32, in read
return self.str_reader.read(n).encode('utf-8')
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Thanks in advance, I think it has to do with the character decoding, but I thought start bytes were usually not the issue.

ignore encoding error when parsing pdf with pdfminer

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
fn='test.pdf'
with open(fn, mode='rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
item = {}
for i in fields:
field = resolve1(i)
name, value = field.get('T'), field.get('V')
item[name]=value
Hello, I need help with this code, as it is giving me a Unicode error on some characters:
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 80, in resolve1
x = x.resolve(default=default)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 67, in resolve
return self.doc.getobj(self.objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 673, in getobj
stream = stream_value(self.getobj(strmid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 676, in getobj
obj = self._getobj_parse(index, objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 648, in _getobj_parse
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py", line 85, in __repr__
return self.name.decode('ascii')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
Is there anything I can add so it "ignores" the characters that it's not able to decode, or at least returns the name with the value as blank in name, value = field.get('T'), field.get('V')?
Any help is appreciated.
Here is one way you can fix it
nano "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py"
then in line 85
def __repr__(self):
return self.name.decode('ascii', 'ignore') # this fixes it
I don't believe it's recommended to edit source scripts; you should also post an issue on GitHub.

Python3 Tarfile: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position

I am trying to port a piece of python2 code to python3. The code works perfectly in python2, but fails in python3. In the original python2 code, data is being compressed into a tarfile as follows:
_tar = tarfile.open(name, mode="w|")
data = StringIO()
data.write(compress(dumps(probe, HIGHEST_PROTOCOL)))
data.seek(0)
info = tarfile.TarInfo()
info.name = 'Probe_%s.lzo' % dest
info.uid = 0
info.gid = 0
info.size = len(data.buf)
info.mode = S_IMODE(0o0444)
info.mtime = mktime(probe.circs[0].created.timetuple())
_tar.addfile(tarinfo=info, fileobj=data)
Now, in another script, this code is being read in the following way:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
while True:
cprobe = tar.next()
if not cprobe:
raise StopIteration()
tarx = tar.extractfile(cprobe)
if not tarx:
continue
yield tarx.read()
The second script is intended to be called in the following way:
cat outputOfFirst | python ./second.py 1> outputOfSecond
This works fine in python2. If I use the output of the first script generated through python2, and pass it to the second script with python3, I get the following error:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
File "/usr/lib/python3.6/tarfile.py", line 1601, in open
t = cls(name, filemode, stream, **kwargs)
File "/usr/lib/python3.6/tarfile.py", line 1482, in __init__
self.firstmember = self.next()
File "/usr/lib/python3.6/tarfile.py", line 2297, in next
tarinfo = self.tarinfo.fromtarfile(self)
File "/usr/lib/python3.6/tarfile.py", line 1092, in fromtarfile
buf = tarfile.fileobj.read(BLOCKSIZE)
File "/usr/lib/python3.6/tarfile.py", line 539, in read
buf = self._read(size)
File "/usr/lib/python3.6/tarfile.py", line 547, in _read
return self.__read(size)
File "/usr/lib/python3.6/tarfile.py", line 572, in __read
buf = self.fileobj.read(self.bufsize)
File "/usr/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position 512: invalid continuation byte
What would be the python3 equivalent to this? My understanding is that I have to somehow encode the stdin part to something like "latin-1". But I am not sure how that would be done.

Python 'charmap' codec can't decode byte 0x9d

I am trying to run the following python script:
#! python
import textmining
import glob
tdm = textmining.TermDocumentMatrix()
files = glob.glob("C:/Users/farre/Desktop/matrix/blurbs/*")
print(files)
for f in files:
content = open(f).read()
content = content.replace('\n', ' ')
tdm.add_doc(content)
tdm.write_csv('matrix.csv', cutoff=1)
But I am getting an error:
File "matrix.py", line 13, in <module>
content = open(f).read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 688: character maps to <undefined>
I have tried a few things I have seen on here, but nothing has worked. I tried using io.open(filename,encoding="utf8") but I got:
File "matrix.py", line 11, in <module>
content = io.open(f, encoding="utf8").read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 310: invalid start byte
Anyone know how I can fix this?

Categories

Resources