I am trying to figure out how to make the following function work. Basically, what I'm trying to achieve with this function is to create a csv file from a DataFrame, encode it, and then decode it for download.:
def filedownload(df):
csv = df.to_csv(index=False, encoding='utf-8')
# strings <-> bytes conversion
encoded = base64.b64encode(csv)
decoded = base64.b64decode(encoded)
href = f'Download Predictions'
return href
However, when running the entire program, I get the following error:
File "/app/.heroku/python/lib/python3.9/site-packages/streamlit/script_runner.py", line 354, in _run_script
exec(code, module.__dict__)
File "/app/bioactivity_app.py", line 97, in <module>
build_model(desc_subset)
File "/app/bioactivity_app.py", line 35, in build_model
load_model = pickle.load(open('sars_cov_proteinase_model.pkl'))
File "/app/.heroku/python/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
This is an example of what the input CSV file looks like: https://ufile.io/sbl163ty
This is part of the code I'm using to generate this file:
prediction_output = pd.Series(Y_pred, name = 'pIC50')
molecule_name = pd.Series(load_data['molecular_id'], name = 'Molecule Name')
df2 = pd.concat([molecule_name, prediction_output], axis=1)
csv = df2.to_csv('example_generated.csv')
I believe it has something to do with how the file is getting encoded but am not sure. Any help would be appreciated!
Related
I am trying to clean the dataset that I am working with. It is comprised of .json files that I parse with the following code:
def parse_single_file(filename):
with open("/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Big_Tech_Regulation_data/" + filename) as f:
content = ijson.items(f, "value")
row_list = []
for o in content:
print(filename)
for i in range(0, len(o)):
print(type(o[i]["Document"]))
if o[i]["Document"] is not None: #Drops rows where there is no content
row_list.append(Row_object(o[i]["Jurisdiction"], o[i]["Location"], o[i]["ContentType"], o[i]["Byline"], o[i]["WordLength"],o[i]["Date"],o[i]["Title"],LDA_clean(o[i]["Document"]["Content"]),o[i]["Source"]["Name"]))
return row_list
After running the first 100 or so file correctly, I got the following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte.
Here is the Traceback:
Traceback (most recent call last):
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 240, in <module>
list_of_row_objects = parse_single_file(filename)
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 208, in parse_single_file
for o in content:
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/venv/lib/python3.8/site-packages/ijson/compat.py", line 32, in read
return self.str_reader.read(n).encode('utf-8')
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Thanks in advance, I think it has to do with the character decoding, but I thought start bytes were usually not the issue.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
fn='test.pdf'
with open(fn, mode='rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
item = {}
for i in fields:
field = resolve1(i)
name, value = field.get('T'), field.get('V')
item[name]=value
Hello, I need help with this code as it is giving me Unicode error on some characters
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 80, in resolve1
x = x.resolve(default=default)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 67, in resolve
return self.doc.getobj(self.objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 673, in getobj
stream = stream_value(self.getobj(strmid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 676, in getobj
obj = self._getobj_parse(index, objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 648, in _getobj_parse
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py", line 85, in __repr__
return self.name.decode('ascii')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
is there anything I can add so it "ingores" the charchters that its not able to decode or at least return the name with the value as blank in name, value = field.get('T'), field.get('V').
any help is appreciated
Here is one way you can fix it
nano "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py"
then in line 85
def __repr__(self):
return self.name.decode('ascii', 'ignore') # this fixes it
I don't believe it's recommended to edit source scripts, you should also post an issue on Github
I am trying to port a piece of python2 code to python3. The code works perfectly in python2, but fails in python3. In the original python2 code, data is being compressed into a tarfile as follows:
_tar = tarfile.open(name, mode="w|")
data = StringIO()
data.write(compress(dumps(probe, HIGHEST_PROTOCOL)))
data.seek(0)
info = tarfile.TarInfo()
info.name = 'Probe_%s.lzo' % dest
info.uid = 0
info.gid = 0
info.size = len(data.buf)
info.mode = S_IMODE(0o0444)
info.mtime = mktime(probe.circs[0].created.timetuple())
_tar.addfile(tarinfo=info, fileobj=data)
Now, in another script, this code is being read in the following way:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
while True:
cprobe = tar.next()
if not cprobe:
raise StopIteration()
tarx = tar.extractfile(cprobe)
if not tarx:
continue
yield tarx.read()
The second script is intended to be called in the following way:
cat outputOfFirst | python ./second.py 1> outputOfSecond
This works fine in python2. If I use the output of the first script generated through python2, and pass it to the second script with python3, i get the following error:
with tarfile.open(fileobj=stdin, mode="r|") as tar:
File "/usr/lib/python3.6/tarfile.py", line 1601, in open
t = cls(name, filemode, stream, **kwargs)
File "/usr/lib/python3.6/tarfile.py", line 1482, in __init__
self.firstmember = self.next()
File "/usr/lib/python3.6/tarfile.py", line 2297, in next
tarinfo = self.tarinfo.fromtarfile(self)
File "/usr/lib/python3.6/tarfile.py", line 1092, in fromtarfile
buf = tarfile.fileobj.read(BLOCKSIZE)
File "/usr/lib/python3.6/tarfile.py", line 539, in read
buf = self._read(size)
File "/usr/lib/python3.6/tarfile.py", line 547, in _read
return self.__read(size)
File "/usr/lib/python3.6/tarfile.py", line 572, in __read
buf = self.fileobj.read(self.bufsize)
File "/usr/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position 512: invalid continuation byte
What would be the python3 equivalent to this? My understanding is that i have to somehow encode the stdin part to something like "latin-1". But i am not sure how that would be done.
I've tried to convert 'shapefile' into 'geojson' format using python and this is my code
import shapefile
import json
path = "shapefiles/WTL_VALV_PS"
sf = shapefile.Reader(path)
fields = sf.fields[1:]
field_names = [field[0] for field in fields]
buffer = []
for sr in sf.shapeRecords():
atr = dict(zip(field_names, sr.record))
geom = sr.shape.__geo_interface__
buffer.append(dict(type="Feature", geometry=geom, properties=atr))
geojson = open("test4.geojson", "w", encoding='utf-8')
geojson.write(json.dumps({"type": "FeatureCollection", "features": buffer}, indent=2, ensure_ascii=False))
geojson.close()
but I got this error
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/ConvertGeoJSON/geoJSON.py", line 16, in <module>
for sr in sf.shapeRecords():
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1039, in shapeRecords
for rec in zip(self.shapes(), self.records())])
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1012, in records
r = self.__record(oid=i)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 987, in __record
value = u(value, self.encoding, self.encodingErrors)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 104, in u
return v.decode(encoding, encodingErrors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc1 in position 1: invalid start byte
I thought it is becuase shapefile contains 'Korean character' but I could convert file with Arabic character like 'أفغانستان ' and of course I could convert file with English.
I lost my way and I don't know where I'm supposed to start from
I am trying to train some text data using scikit. The same code is being used on other PC without any error but on my system it gives error:
File "/root/Desktop/karim/svn/questo-anso/v5/trials/classify/domain_detection_final/test_classifier_temp.py", line 130, in trainClassifier
X_train = self.vectorizer.fit_transform(self.data_train.data)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 1270, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 808, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 741, in _count_vocab
for feature in analyze(doc):
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 233, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 111, in decode
doc = doc.decode(self.encoding, self.decode_error)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 1266: invalid start byte
I already checked similar threads but no helps.
UPDATE:
self.data_train = self.fetch_data(cache, subset='train')
if not os.path.exists(self.root_dir+"/autocreated/vectorizer.txt"):
self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
start_time = time()
print("Transforming the dataset")
X_train = self.vectorizer.fit_transform(self.data_train.data) // Error is here
joblib.dump(self.vectorizer, self.root_dir+"/autocreated/vectorizer.txt")
Your file is actually encoded in ISO-8869-1, not UTF-8. You need to properly decode it before you can encode it again.
0xBA is the numero sign (º) in ISO-8869-1.
There was issue in dealing with the training data. One thing that solved my issue is ignoring error using decode_error='ignore', there might be some other solutions.
self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english',decode_error='ignore')