Python 'charmap' codec can't decode byte 0x9d - python

I am trying to run the following Python script:
#! python
import textmining
import glob

tdm = textmining.TermDocumentMatrix()
files = glob.glob("C:/Users/farre/Desktop/matrix/blurbs/*")
print(files)
for f in files:
    content = open(f).read()
    content = content.replace('\n', ' ')
    tdm.add_doc(content)
tdm.write_csv('matrix.csv', cutoff=1)
But I am getting an error:
File "matrix.py", line 13, in <module>
content = open(f).read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 688: character maps to <undefined>
I have tried a few things I've seen on here, but nothing has worked. I tried using io.open(filename, encoding="utf8"), but I got:
File "matrix.py", line 11, in <module>
content = io.open(f, encoding="utf8").read()
File "C:\Users\farre\AppData\Local\Programs\Python\Python36\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 310: invalid start byte
Anyone know how I can fix this?
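A sketch of one way out, not from the original thread: the two tracebacks together show the files are neither clean UTF-8 (0x94 fails) nor clean cp1252 (0x9d is undefined there), so either pick an encoding that accepts every byte value, such as latin-1, or keep cp1252 and tell Python to substitute the odd bytes instead of raising:

import glob
import textmining

tdm = textmining.TermDocumentMatrix()
for f in glob.glob("C:/Users/farre/Desktop/matrix/blurbs/*"):
    # errors="replace" swaps any byte the codec cannot map for U+FFFD
    # instead of raising UnicodeDecodeError; encoding="latin-1" would
    # also work, since latin-1 maps all 256 byte values.
    with open(f, encoding="cp1252", errors="replace") as fh:
        content = fh.read().replace('\n', ' ')
    tdm.add_doc(content)
tdm.write_csv('matrix.csv', cutoff=1)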

Related

UnicodeDecodeError when trying to encode/decode CSV file

I am trying to figure out how to make the following function work. Basically, what I'm trying to achieve with this function is to create a CSV file from a DataFrame, encode it, and then decode it for download:
import base64

def filedownload(df):
    csv = df.to_csv(index=False, encoding='utf-8')
    # strings <-> bytes conversion
    encoded = base64.b64encode(csv.encode())
    decoded = base64.b64decode(encoded)
    href = f'<a href="data:file/csv;base64,{encoded.decode()}">Download Predictions</a>'
    return href
However, when running the entire program, I get the following error:
File "/app/.heroku/python/lib/python3.9/site-packages/streamlit/script_runner.py", line 354, in _run_script
exec(code, module.__dict__)
File "/app/bioactivity_app.py", line 97, in <module>
build_model(desc_subset)
File "/app/bioactivity_app.py", line 35, in build_model
load_model = pickle.load(open('sars_cov_proteinase_model.pkl'))
File "/app/.heroku/python/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
This is an example of what the input CSV file looks like: https://ufile.io/sbl163ty
This is part of the code I'm using to generate this file:
prediction_output = pd.Series(Y_pred, name = 'pIC50')
molecule_name = pd.Series(load_data['molecular_id'], name = 'Molecule Name')
df2 = pd.concat([molecule_name, prediction_output], axis=1)
csv = df2.to_csv('example_generated.csv')
I believe it has something to do with how the file is getting encoded, but I'm not sure. Any help would be appreciated!
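Note that the traceback points at pickle.load(open('sars_cov_proteinase_model.pkl')), not at the CSV/base64 code. Pickle files are binary, and protocol 2+ pickles start with byte 0x80, which is exactly the invalid start byte reported; opening the file in text mode makes Python try to decode those bytes as UTF-8. A minimal sketch of the usual fix, opening the model in binary mode:

import pickle

# 'rb' hands pickle raw bytes; no text decoding is attempted.
with open('sars_cov_proteinase_model.pkl', 'rb') as f:
    load_model = pickle.load(f)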

I am getting an error while opening a file

import os
import sys

file = os.path.expanduser('~/Desktop/python_programing/python_projects/12dicts-6.0.2/agid.txt')
try:
    with open(file) as in_file:
        loaded_txt = in_file.read()
        loaded_txt = [x.lower() for x in loaded_txt]
        print(loaded_txt)
except IOError as e:
    print("{}\nError opening {}. Terminating program.".format(e, file), file=sys.stderr)
    sys.exit(1)

file_to_open = os.path.expanduser('~/Desktop/movie_quotes.txt')
While running the above code I am getting the following error:
Traceback (most recent call last):
  File "/Users/pavandadi/Desktop/python_programing/python_projects/exception.py", line 6, in <module>
    loaded_txt = in_file.read()
  File "/Users/pavandadi/opt/anaconda3/lib/python3.8/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2921: invalid continuation byte
What should I do?
I think you need to read the raw bytes and decode them yourself, like this:
with open(file, 'rb') as in_file:  # read raw bytes, not text
    loaded_bytes = in_file.read()
    loaded_txt = loaded_bytes.decode('UTF-8')  # or any encoding type you want
    print(loaded_txt.lower())
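For this particular file the simpler fix is probably to tell open() the real encoding: byte 0xE9 is 'é' in Latin-1, which fits a dictionary word list, so (assuming the 12dicts files are Latin-1 encoded) reading in text mode works directly:

# Decode as Latin-1 while reading; avoids the strict UTF-8 default.
with open(file, encoding='latin-1') as in_file:
    loaded_txt = in_file.read().lower()
    print(loaded_txt)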

Test/Remove un-decodable bytes when preprocessing

I am trying to clean the dataset that I am working with. It is made up of .json files that I parse with the following code:
def parse_single_file(filename):
    with open("/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Big_Tech_Regulation_data/" + filename) as f:
        content = ijson.items(f, "value")
        row_list = []
        for o in content:
            print(filename)
            for i in range(0, len(o)):
                print(type(o[i]["Document"]))
                if o[i]["Document"] is not None:  # Drops rows where there is no content
                    row_list.append(Row_object(o[i]["Jurisdiction"], o[i]["Location"], o[i]["ContentType"], o[i]["Byline"], o[i]["WordLength"], o[i]["Date"], o[i]["Title"], LDA_clean(o[i]["Document"]["Content"]), o[i]["Source"]["Name"]))
    return row_list
After running the first 100 or so files correctly, I got the following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte.
Here is the Traceback:
Traceback (most recent call last):
  File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 240, in <module>
    list_of_row_objects = parse_single_file(filename)
  File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/Import_and_Clean.py", line 208, in parse_single_file
    for o in content:
  File "/Users/njjones14/PycharmProjects/Big_Tech_Regulation/venv/lib/python3.8/site-packages/ijson/compat.py", line 32, in read
    return self.str_reader.read(n).encode('utf-8')
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Thanks in advance. I think it has to do with the character decoding, but I thought start bytes were usually not the issue.
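Since the goal is to test for or remove un-decodable bytes during preprocessing, here is a minimal sketch of both, assuming the files are meant to be UTF-8 with a few stray bytes (the file name and loop body are illustrative):

import ijson

def is_clean_utf8(path):
    # Test: True only if the whole file decodes as strict UTF-8.
    with open(path, 'rb') as f:
        try:
            f.read().decode('utf-8')
            return True
        except UnicodeDecodeError:
            return False

# Remove: errors='replace' (or 'ignore') substitutes (or drops) bad bytes
# like 0x80 instead of raising, so ijson only ever sees valid text.
with open('some_file.json', encoding='utf-8', errors='replace') as f:
    for o in ijson.items(f, 'value'):
        pass  # build Row_object instances as before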

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 49: for textacy

I am using the textacy method to get synonyms.
import textacy.resources
rs = textacy.resources.ConceptNet()
syn=rs.get_synonyms('happy')
I get the below error
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 353, in get_synonyms
    return self._get_relation_values(self.synonyms, term, lang=lang, sense=sense)
  File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 338, in synonyms
    self._synonyms = self._get_relation_data("/r/Synonym", is_symmetric=True)
  File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\resources\concept_net.py", line 162, in _get_relation_data
    for row in rows:
  File "C:\Users\Dhiraj\Desktop\Work\QGen\lib\site-packages\textacy\io\csv.py", line 96, in read_csv
    for row in csv_reader:
  File "C:\Python37\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 49: character maps to <undefined>
I have tried to enforce encoding='utf8' in both concept_net.py (line 162) and io\csv.py (line 96, in read_csv), but that gives another error:
raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
What can be done?
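That EOFError usually means the compressed ConceptNet dump on disk is truncated, i.e. the download was interrupted, rather than something you can patch in the library code. A sketch of the likely remedy, assuming a partial archive is the culprit: remove the partially downloaded file from textacy's data directory and fetch the resource again, e.g.

import textacy.resources

rs = textacy.resources.ConceptNet()
rs.download()  # re-fetch the resource after removing the partial archive
syn = rs.get_synonyms('happy')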

UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 1266: invalid start byte

I am trying to train some text data using scikit-learn. The same code runs on another PC without any error, but on my system it gives an error:
File "/root/Desktop/karim/svn/questo-anso/v5/trials/classify/domain_detection_final/test_classifier_temp.py", line 130, in trainClassifier
X_train = self.vectorizer.fit_transform(self.data_train.data)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 1270, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 808, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 741, in _count_vocab
for feature in analyze(doc):
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 233, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "/root/Desktop/karim/software/scikit-learn-0.15.1/sklearn/feature_extraction/text.py", line 111, in decode
doc = doc.decode(self.encoding, self.decode_error)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 1266: invalid start byte
I have already checked similar threads, but nothing helped.
UPDATE:
self.data_train = self.fetch_data(cache, subset='train')
if not os.path.exists(self.root_dir + "/autocreated/vectorizer.txt"):
    self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                      stop_words='english')
    start_time = time()
    print("Transforming the dataset")
    X_train = self.vectorizer.fit_transform(self.data_train.data)  # Error is here
    joblib.dump(self.vectorizer, self.root_dir + "/autocreated/vectorizer.txt")
Your file is actually encoded in ISO-8859-1, not UTF-8. You need to properly decode it before you can encode it again.
0xBA is the masculine ordinal indicator (º) in ISO-8859-1.
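If that diagnosis is right, the vectorizer can be told the real encoding up front: TfidfVectorizer takes an encoding argument (default 'utf-8') that it uses when decoding the input documents. A sketch:

from sklearn.feature_extraction.text import TfidfVectorizer

# Decode documents as ISO-8859-1 instead of the default UTF-8.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english',
                             encoding='ISO-8859-1')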
There was an issue in dealing with the training data. One thing that solved my issue was ignoring errors using decode_error='ignore'; there might be some other solutions.
self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                  stop_words='english', decode_error='ignore')
