UnicodeDecodeError when trying to encode/decode CSV file - python

I am trying to figure out how to make the following function work. Basically, what I'm trying to achieve with this function is to create a CSV file from a DataFrame, encode it, and then decode it for download:
def filedownload(df):
    """Build an HTML download link that embeds ``df`` as a base64 CSV.

    Args:
        df: pandas DataFrame to serialise (written without the index).

    Returns:
        str: An ``<a>`` element whose ``href`` is a ``data:`` URI carrying the
        base64-encoded CSV, labelled "Download Predictions".
    """
    csv = df.to_csv(index=False)
    # strings <-> bytes conversion: b64encode() only accepts bytes, so encode
    # the CSV text first, then turn the ASCII-only base64 result back into a
    # str so it can be interpolated into the link.
    b64 = base64.b64encode(csv.encode('utf-8')).decode('ascii')
    href = (
        f'<a href="data:text/csv;base64,{b64}" '
        f'download="prediction.csv">Download Predictions</a>'
    )
    return href
However, when running the entire program, I get the following error:
File "/app/.heroku/python/lib/python3.9/site-packages/streamlit/script_runner.py", line 354, in _run_script
exec(code, module.__dict__)
File "/app/bioactivity_app.py", line 97, in <module>
build_model(desc_subset)
File "/app/bioactivity_app.py", line 35, in build_model
load_model = pickle.load(open('sars_cov_proteinase_model.pkl'))
File "/app/.heroku/python/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
This is an example of what the input CSV file looks like: https://ufile.io/sbl163ty
This is part of the code I'm using to generate this file:
# Put the molecule identifiers and the predicted pIC50 values side by side
# (axis=1 concatenates column-wise) and write the result to disk.
prediction_output = pd.Series(Y_pred, name = 'pIC50')
molecule_name = pd.Series(load_data['molecular_id'], name = 'Molecule Name')
df2 = pd.concat([molecule_name, prediction_output], axis=1)
# NOTE(review): when given a path, DataFrame.to_csv writes the file and
# returns None, so `csv` presumably ends up as None -- confirm nothing reads it.
csv = df2.to_csv('example_generated.csv')
I believe it has something to do with how the file is getting encoded but am not sure. Any help would be appreciated!

Related

Test/Remove un-decodable bytes when preprocessing

I am trying to clean the dataset that I am working with. It is comprised of .json files that I parse with the following code:
def parse_single_file(
    filename,
    data_dir="/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Big_Tech_Regulation_data/",
):
    """Parse one JSON export into a list of ``Row_object`` records.

    Args:
        filename: Name of the JSON file, appended to ``data_dir``.
        data_dir: Directory prefix (including the trailing separator) that is
            prepended to ``filename``. Defaults to the original hard-coded
            location, so existing one-argument calls keep working.

    Returns:
        list: One ``Row_object`` per entry whose "Document" is not None.
    """
    row_list = []
    # Decode explicitly as UTF-8 and substitute U+FFFD for undecodable bytes
    # (such as a stray 0x80) instead of aborting the whole file with a
    # UnicodeDecodeError. NOTE(review): this hides corrupt input; log or count
    # replacements if the affected documents need to be identified.
    with open(data_dir + filename, encoding="utf-8", errors="replace") as f:
        content = ijson.items(f, "value")
        for o in content:
            print(filename)
            for entry in o:
                document = entry["Document"]
                print(type(document))
                if document is None:  # Drops rows where there is no content
                    continue
                row_list.append(
                    Row_object(
                        entry["Jurisdiction"],
                        entry["Location"],
                        entry["ContentType"],
                        entry["Byline"],
                        entry["WordLength"],
                        entry["Date"],
                        entry["Title"],
                        LDA_clean(document["Content"]),
                        entry["Source"]["Name"],
                    )
                )
    return row_list
After running the first 100 or so files correctly, I got the following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte.
Here is the Traceback:
Traceback (most recent call last):
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 240, in <module>
list_of_row_objects = parse_single_file(filename)
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 208, in parse_single_file
for o in content:
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/venv/lib/python3.8/site-packages/ijson/compat.py", line 32, in read
return self.str_reader.read(n).encode('utf-8')
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Thanks in advance, I think it has to do with the character decoding, but I thought start bytes were usually not the issue.

ignore encoding error when parsing pdf with pdfminer

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
# Collect the AcroForm fields of the PDF into a {field name: field value} dict.
fn = 'test.pdf'
item = {}
# The whole walk stays inside the `with` so the file is still open while
# pdfminer resolves the indirect objects.
with open(fn, mode='rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    acro_form = resolve1(doc.catalog['AcroForm'])
    fields = acro_form['Fields']
    for field_ref in fields:
        field = resolve1(field_ref)
        # 'T' is read as the field's name and 'V' as its value.
        item[field.get('T')] = field.get('V')
Hello, I need help with this code as it is giving me Unicode error on some characters
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 80, in resolve1
x = x.resolve(default=default)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 67, in resolve
return self.doc.getobj(self.objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 673, in getobj
stream = stream_value(self.getobj(strmid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 676, in getobj
obj = self._getobj_parse(index, objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 648, in _getobj_parse
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py", line 85, in __repr__
return self.name.decode('ascii')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
Is there anything I can add so that it "ignores" the characters that it's not able to decode, or at least returns the name with the value as blank in name, value = field.get('T'), field.get('V')?
any help is appreciated
Here is one way you can fix it
nano "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py"
then in line 85
def __repr__(self):
    """Return ``self.name`` decoded as ASCII, silently dropping other bytes."""
    raw_name = self.name
    return raw_name.decode(encoding='ascii', errors='ignore')  # this fixes it
I don't believe it's recommended to edit source scripts, you should also post an issue on Github

Python3 Tarfile: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position

I am trying to port a piece of python2 code to python3. The code works perfectly in python2, but fails in python3. In the original python2 code, data is being compressed into a tarfile as follows:
# Python 2 writer: serialise and compress `probe`, then append it to a tar
# stream (mode "w|") as a single member.
_tar = tarfile.open(name, mode="w|")
# NOTE(review): `data.buf` below suggests the Python 2 StringIO.StringIO class;
# on Python 3 a binary payload would presumably need io.BytesIO -- confirm.
data = StringIO()
data.write(compress(dumps(probe, HIGHEST_PROTOCOL)))
# Rewind so that addfile() reads the payload from its first byte.
data.seek(0)
info = tarfile.TarInfo()
info.name = 'Probe_%s.lzo' % dest
info.uid = 0
info.gid = 0
# The member size must be set explicitly; addfile() copies `size` bytes.
info.size = len(data.buf)
# 0o444: read-only for owner, group and others.
info.mode = S_IMODE(0o0444)
# Timestamp the member with the creation time of the probe's first circuit.
info.mtime = mktime(probe.circs[0].created.timetuple())
_tar.addfile(tarinfo=info, fileobj=data)
Now, in another script, this code is being read in the following way:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
while True:
cprobe = tar.next()
if not cprobe:
raise StopIteration()
tarx = tar.extractfile(cprobe)
if not tarx:
continue
yield tarx.read()
The second script is intended to be called in the following way:
cat outputOfFirst | python ./second.py 1> outputOfSecond
This works fine in python2. If I use the output of the first script generated through python2, and pass it to the second script with python3, I get the following error:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
File "/usr/lib/python3.6/tarfile.py", line 1601, in open
t = cls(name, filemode, stream, **kwargs)
File "/usr/lib/python3.6/tarfile.py", line 1482, in __init__
self.firstmember = self.next()
File "/usr/lib/python3.6/tarfile.py", line 2297, in next
tarinfo = self.tarinfo.fromtarfile(self)
File "/usr/lib/python3.6/tarfile.py", line 1092, in fromtarfile
buf = tarfile.fileobj.read(BLOCKSIZE)
File "/usr/lib/python3.6/tarfile.py", line 539, in read
buf = self._read(size)
File "/usr/lib/python3.6/tarfile.py", line 547, in _read
return self.__read(size)
File "/usr/lib/python3.6/tarfile.py", line 572, in __read
buf = self.fileobj.read(self.bufsize)
File "/usr/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position 512: invalid continuation byte
What would be the python3 equivalent of this? My understanding is that I have to somehow encode the stdin part to something like "latin-1", but I am not sure how that would be done.

UnicodeDecodeError in converting shapefile into GeoJSON file

I've tried to convert 'shapefile' into 'geojson' format using python and this is my code
import shapefile
import json
# Convert every record of the shapefile into one GeoJSON FeatureCollection.
path = "shapefiles/WTL_VALV_PS"
# NOTE(review): if the attribute table is not UTF-8 (e.g. a legacy Korean
# code page), Reader presumably needs a matching encoding= argument --
# confirm against the data before relying on the default.
sf = shapefile.Reader(path)
fields = sf.fields[1:]  # the first field entry is not used as an attribute
field_names = [field[0] for field in fields]
buffer = []
for sr in sf.shapeRecords():
    atr = dict(zip(field_names, sr.record))
    geom = sr.shape.__geo_interface__
    buffer.append(dict(type="Feature", geometry=geom, properties=atr))
# The context manager closes the output file even if serialisation raises.
with open("test4.geojson", "w", encoding='utf-8') as geojson:
    geojson.write(json.dumps({"type": "FeatureCollection", "features": buffer}, indent=2, ensure_ascii=False))
but I got this error
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/ConvertGeoJSON/geoJSON.py", line 16, in <module>
for sr in sf.shapeRecords():
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1039, in shapeRecords
for rec in zip(self.shapes(), self.records())])
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1012, in records
r = self.__record(oid=i)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 987, in __record
value = u(value, self.encoding, self.encodingErrors)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 104, in u
return v.decode(encoding, encodingErrors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc1 in position 1: invalid start byte
I thought it was because the shapefile contains Korean characters, but I could convert a file with Arabic characters like 'أفغانستان ' and of course I could convert a file with English.
I lost my way and I don't know where I'm supposed to start from

UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 1266: invalid start byte

I am trying to train some text data using scikit. The same code is being used on other PC without any error but on my system it gives error:
File "/root/Desktop/karim/svn/questo-anso/v5/trials/classify/domain_detection_final/test_classifier_temp.py", line 130, in trainClassifier
X_train = self.vectorizer.fit_transform(self.data_train.data)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 1270, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 808, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 741, in _count_vocab
for feature in analyze(doc):
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 233, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 111, in decode
doc = doc.decode(self.encoding, self.decode_error)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 1266: invalid start byte
I already checked similar threads, but nothing helped.
UPDATE:
self.data_train = self.fetch_data(cache, subset='train')
if not os.path.exists(self.root_dir+"/autocreated/vectorizer.txt"):
    # No saved vectorizer at this path yet: fit a new one and persist it.
    self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                      stop_words='english')
    start_time = time()
    print("Transforming the dataset")
    X_train = self.vectorizer.fit_transform(self.data_train.data)  # Error is here
    joblib.dump(self.vectorizer, self.root_dir+"/autocreated/vectorizer.txt")
Your file is actually encoded in ISO-8859-1, not UTF-8. You need to properly decode it before you can encode it again.
0xBA is the masculine ordinal indicator (º) in ISO-8859-1.
There was an issue in dealing with the training data. One thing that solved my issue was ignoring the error using decode_error='ignore'; there might be some other solutions.
# decode_error='ignore' makes the vectorizer skip bytes it cannot decode
# instead of raising UnicodeDecodeError.
self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english',decode_error='ignore')

Categories

Resources