Test/Remove un-decodable bytes when preprocessing - python

I am trying to clean the dataset I am working with. It consists of .json files that I parse with the following code:
def parse_single_file(filename):
    with open("/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Big_Tech_Regulation_data/" + filename) as f:
        content = ijson.items(f, "value")
        row_list = []
        for o in content:
            print(filename)
            for i in range(0, len(o)):
                print(type(o[i]["Document"]))
                if o[i]["Document"] is not None:  # Drops rows where there is no content
                    row_list.append(Row_object(o[i]["Jurisdiction"], o[i]["Location"], o[i]["ContentType"],
                                               o[i]["Byline"], o[i]["WordLength"], o[i]["Date"], o[i]["Title"],
                                               LDA_clean(o[i]["Document"]["Content"]), o[i]["Source"]["Name"]))
    return row_list
After running the first 100 or so files correctly, I got the following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte.
Here is the Traceback:
Traceback (most recent call last):
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 240, in <module>
list_of_row_objects = parse_single_file(filename)
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 208, in parse_single_file
for o in content:
File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/venv/lib/python3.8/site-packages/ijson/compat.py", line 32, in read
return self.str_reader.read(n).encode('utf-8')
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Thanks in advance. I think it has to do with the character decoding, but I thought start bytes were usually not the issue.
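
One way to test for or strip un-decodable bytes before parsing is to scan the raw bytes first, or to open the file with a non-strict error handler. A minimal sketch, assuming the files are meant to be UTF-8 and that replacing offending bytes with U+FFFD is acceptable; the helper names and the sample path are illustrative, not from the original code:

import ijson

def find_first_bad_byte(path):
    # Strictly decode the raw bytes just to report where UTF-8 breaks down
    with open(path, "rb") as f:
        raw = f.read()
    try:
        raw.decode("utf-8")
        return None
    except UnicodeDecodeError as e:
        return e.start, raw[e.start]

def open_lenient(path):
    # errors="replace" turns invalid bytes such as 0x80 into U+FFFD instead of raising
    return open(path, encoding="utf-8", errors="replace")

print(find_first_bad_byte("example.json"))   # e.g. (3131, 128), or None if the file is clean
with open_lenient("example.json") as f:
    for o in ijson.items(f, "value"):
        pass  # same per-item processing as in parse_single_file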

Related

UnicodeDecodeError when trying to encode/decode CSV file

I am trying to figure out how to make the following function work. What I'm trying to achieve with this function is to create a CSV file from a DataFrame, encode it, and then decode it for download:
def filedownload(df):
    csv = df.to_csv(index=False, encoding='utf-8')
    # strings <-> bytes conversion: b64encode expects bytes, so encode the str first
    encoded = base64.b64encode(csv.encode('utf-8'))
    decoded = base64.b64decode(encoded)
    href = f'Download Predictions'
    return href
However, when running the entire program, I get the following error:
File "/app/.heroku/python/lib/python3.9/site-packages/streamlit/script_runner.py", line 354, in _run_script
exec(code, module.__dict__)
File "/app/bioactivity_app.py", line 97, in <module>
build_model(desc_subset)
File "/app/bioactivity_app.py", line 35, in build_model
load_model = pickle.load(open('sars_cov_proteinase_model.pkl'))
File "/app/.heroku/python/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
This is an example of what the input CSV file looks like: https://ufile.io/sbl163ty
This is part of the code I'm using to generate this file:
prediction_output = pd.Series(Y_pred, name = 'pIC50')
molecule_name = pd.Series(load_data['molecular_id'], name = 'Molecule Name')
df2 = pd.concat([molecule_name, prediction_output], axis=1)
csv = df2.to_csv('example_generated.csv')
I believe it has something to do with how the file is getting encoded but am not sure. Any help would be appreciated!
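
The traceback points at the pickle.load call rather than the CSV itself: 0x80 at position 0 is the first byte of a pickle protocol-2+ stream, and it fails because the file is opened in the default text mode. A minimal sketch, assuming the model file named in the question exists on disk; the DataFrame here is placeholder data standing in for the real predictions:

import base64
import pickle
import pandas as pd

# Pickle files are binary: open with 'rb' so Python never tries to decode them as UTF-8
with open('sars_cov_proteinase_model.pkl', 'rb') as f:
    load_model = pickle.load(f)

df = pd.DataFrame({'Molecule Name': ['mol_1'], 'pIC50': [5.0]})  # placeholder data
csv_text = df.to_csv(index=False)                       # str
encoded = base64.b64encode(csv_text.encode('utf-8'))    # b64encode wants bytes
decoded = base64.b64decode(encoded)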

I am getting an error while opening a file

import os
import sys

file = os.path.expanduser('~/Desktop/python_programing/python_projects/12dicts-6.0.2/agid.txt')
try:
    with open(file) as in_file:
        loaded_txt = in_file.read()
        loaded_txt = [x.lower() for x in loaded_txt]
        print(loaded_txt)
except IOError as e:
    print("{}\nError opening {}. Terminating program.".format(e, file), file=sys.stderr)
    sys.exit(1)
file_to_open = os.path.expanduser('~/Desktop/movie_quotes.txt')
While running the above code I am getting the following error:
Traceback (most recent call last):
File "/Users/pavandadi/Desktop/python_programing/python_projects/exception.py", line 6, in <module>
loaded_txt = in_file.read()
File "/Users/pavandadi/opt/anaconda3/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2921: invalid continuation byte
What should I do?
I think you need to read the raw bytes and decode them after reading, like this:
with open(file, 'rb') as in_file:
    loaded_txt = in_file.read().decode('UTF-8')  # or any encoding type you want
loaded_txt = [x.lower() for x in loaded_txt]
print(loaded_txt)
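
Since the byte that breaks is 0xE9 ('é' in Latin-1/CP-1252), the file is probably not UTF-8 at all, so decoding the bytes as UTF-8 will still fail. A minimal sketch, assuming the word list is Latin-1 encoded and that substituting anything that still fails to decode is acceptable:

import os

file = os.path.expanduser('~/Desktop/python_programing/python_projects/12dicts-6.0.2/agid.txt')

# Pass an explicit encoding instead of relying on the strict UTF-8 default;
# errors='replace' keeps reading even if the guess is wrong for a few bytes.
with open(file, encoding='latin-1', errors='replace') as in_file:
    loaded_txt = in_file.read().lower()
print(loaded_txt[:200])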

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 49: for textacy

I am using the textacy method to get synonyms.
import textacy.resources
rs = textacy.resources.ConceptNet()
syn=rs.get_synonyms('happy')
I get the below error
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 353, in get_synonyms
return self._get_relation_values(self.synonyms, term, lang=lang, sense=sense)
File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 338, in synonyms
self._synonyms = self._get_relation_data("/r/Synonym", is_symmetric=True)
File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 162, in _get_relation_data
for row in rows:
File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\io\csv.py", line 96, in read_csv
for row in csv_reader:
File "C:\Python37\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 49: character maps to <undefined>
I have tried to force encoding='utf8' in both concept_net.py (line 162) and io\csv.py (line 96, in read_csv), but that gives another error:
raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
What can be done?
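
The charmap error comes from Windows falling back to cp1252 when the ConceptNet dump is read, and the EOFError after forcing UTF-8 suggests the cached, compressed dump itself is truncated by an interrupted download. A minimal sketch, assuming re-fetching the resource is acceptable; the download() call follows textacy's resource API, but its exact options may vary by version:

import textacy.resources

rs = textacy.resources.ConceptNet()
# A truncated ConceptNet archive keeps raising EOFError until it is replaced;
# re-running download() (after deleting the cached file, or with a force flag
# if the installed version supports one) fetches a fresh copy.
rs.download()
print(rs.get_synonyms('happy'))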

'utf-8' codec can't decode byte 0xec in position 14: invalid continuation byte

Here is my error:
Traceback (most recent call last):
File "C:/Users/dani ibrahim/PycharmProjects/TestPy/Test.py", line 16, in <module>
load_content=True, encoding='utf-8', shuffle=True, random_state=42)
File "C:\Users\dani ibrahim\PycharmProjects\TestPy\venv\lib\site-packages\sklearn\datasets\base.py", line 197, in load_files
data = [d.decode(encoding, decode_error) for d in data]
File "C:\Users\dani ibrahim\PycharmProjects\TestPy\venv\lib\site-packages\sklearn\datasets\base.py", line 197, in <listcomp>
data = [d.decode(encoding, decode_error) for d in data]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xec in position 14: invalid continuation byte
Here is my code. How can I fix this?
docs_to_train = sklearn.datasets.load_files("/Users/dani ibrahim/PycharmProjects/TestPy/data/",
                                            description=None, categories=categories,
                                            load_content=True, encoding='utf-8',
                                            shuffle=True, random_state=42)
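
load_files can be told what to do with bytes that are not valid UTF-8 via its decode_error argument, which is forwarded to bytes.decode(). A minimal sketch, assuming it is acceptable to replace (or ignore) the offending bytes rather than fail; the path and categories are taken from the question:

import sklearn.datasets

docs_to_train = sklearn.datasets.load_files(
    "/Users/dani ibrahim/PycharmProjects/TestPy/data/",
    description=None, categories=categories,
    load_content=True, encoding='utf-8',
    decode_error='replace',   # or 'ignore'; the default 'strict' raises UnicodeDecodeError
    shuffle=True, random_state=42)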

Python 'charmap' codec can't decode byte 0x9d

I am trying to run the following python script:
#! python
import textmining
import glob

tdm = textmining.TermDocumentMatrix()
files = glob.glob("C:/Users/farre/Desktop/matrix/blurbs/*")
print(files)
for f in files:
    content = open(f).read()
    content = content.replace('\n', ' ')
    tdm.add_doc(content)
tdm.write_csv('matrix.csv', cutoff=1)
But I am getting an error:
File "matrix.py", line 13, in <module>
content = open(f).read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 688: character maps to <undefined>
I have tried a few things I have seen on here but nothing has worked. I tried using io.open(filename, encoding="utf8") but I got:
File "matrix.py", line 11, in <module>
content = io.open(f, encoding="utf8").read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 310: invalid start byte
Anyone know how I can fix this?
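
The two failures (0x9D is undefined in cp1252, and 0x94 is not valid UTF-8) suggest the blurbs are not all in one encoding. A minimal sketch, assuming it is acceptable to try UTF-8 first, fall back to cp1252, and replace anything that still cannot be decoded; the paths and the textmining calls follow the question:

import glob
import textmining

def read_text(path):
    # Try strict decoders first, then fall back to a lossy read that never raises
    for enc in ("utf-8", "cp1252"):
        try:
            with open(path, encoding=enc) as fh:
                return fh.read()
        except UnicodeDecodeError:
            continue
    with open(path, encoding="utf-8", errors="replace") as fh:
        return fh.read()

tdm = textmining.TermDocumentMatrix()
for f in glob.glob("C:/Users/farre/Desktop/matrix/blurbs/*"):
    tdm.add_doc(read_text(f).replace('\n', ' '))
tdm.write_csv('matrix.csv', cutoff=1)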
