Problems with utf-8 in python

Problems with utf-8 in python - python

My code is below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import codecs
df1 = pd.read_csv(r'E:\내논문자료\wordcloud\test1\1311_1312.csv',encoding='utf-8')
df2 = df1.groupby(['address']).size().reset_index()
df2.rename(columns = {0: 'frequency'}, inplace = True)
print(df2[:100])
But when I execute this code, I get this message:
Traceback (most recent call last):
File "E:/빅데이터 캠퍼스/untitled1/groupby freq.py", line 7, in <module>
df1 = pd.read_csv(r'E:\내논문자료\wordcloud\test1\1311_1312.csv',encoding='utf-8')
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 645, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 400, in _read
data = parser.read()
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 938, in read
ret = self._engine.read(nrows)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 1507, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 846, in pandas.parser.TextReader.read (pandas\parser.c:10364)
File "pandas\parser.pyx", line 868, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10640)
File "pandas\parser.pyx", line 945, in pandas.parser.TextReader._read_rows (pandas\parser.c:11677)
File "pandas\parser.pyx", line 1047, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13111)
File "pandas\parser.pyx", line 1106, in pandas.parser.TextReader._convert_tokens (pandas\parser.c:14065)
File "pandas\parser.pyx", line 1204, in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:16121)
File "pandas\parser.pyx", line 1220, in pandas.parser.TextReader._string_convert (pandas\parser.c:16349)
File "pandas\parser.pyx", line 1452, in pandas.parser._string_box_utf8 (pandas\parser.c:22014)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 0: invalid start byte
How can I solve it?
Should I alter the parser code in pandas?

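It looks like your source data isn't encoded as UTF-8 - it is probably in one of the other codecs. Per this answer, you might want to try encoding='GBK' to start with, or encoding='gb2312'. Since your file paths contain Korean text, the data may instead be in a Korean encoding, so encoding='cp949' or encoding='euc-kr' is also worth trying.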

Related

Can't decode csv file using pandas

This is my code; specifying encoding='utf-8' does not work either:
import pandas as pd
df = pd.read_csv('sunil.csv',encoding ="utf-8")
print(df)
And here is the error:
Traceback (most recent call last):
File "C:/Users/ASUS/Desktop/utube/sunil.py", line 2, in <module>
df = pd.read_csv('sunil.csv',encoding ="utf-8")
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 764, in pandas._libs.parsers.TextReader._get_header
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 0: invalid start byte

Try ISO-8859-1; it worked for me when I had the same issue:
dataset = pd.read_csv(r'filename.csv',encoding = "ISO-8859-1")

Getting the below error while trying to read a CSV file in python

File "<ipython-input-10-9cc4e896b568>", line 1, in <module>
pd.read_csv('temp.csv')
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 401, in _read
data = parser.read()
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 939, in read
ret = self._engine.read(nrows)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1508, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 848, in pandas.parser.TextReader.read (pandas\parser.c:10415)
File "pandas\parser.pyx", line 870, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10691)
File "pandas\parser.pyx", line 947, in pandas.parser.TextReader._read_rows (pandas\parser.c:11728)
File "pandas\parser.pyx", line 1049, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13162)
File "pandas\parser.pyx", line 1108, in pandas.parser.TextReader._convert_tokens (pandas\parser.c:14116)
File "pandas\parser.pyx", line 1206, in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:16172)
File "pandas\parser.pyx", line 1222, in pandas.parser.TextReader._string_convert (pandas\parser.c:16400)
File "pandas\parser.pyx", line 1458, in pandas.parser._string_box_utf8 (pandas\parser.c:22072)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 0: invalid start byte
pd.read_csv('temp.csv')
Traceback (most recent call last):
File "<ipython-input-11-9cc4e896b568>", line 1, in <module>
pd.read_csv('temp.csv')
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 401, in _read
data = parser.read()
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 939, in read
ret = self._engine.read(nrows)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1508, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 848, in pandas.parser.TextReader.read (pandas\parser.c:10415)
File "pandas\parser.pyx", line 870, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10691)
File "pandas\parser.pyx", line 947, in pandas.parser.TextReader._read_rows (pandas\parser.c:11728)
File "pandas\parser.pyx", line 1049, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13162)
File "pandas\parser.pyx", line 1108, in pandas.parser.TextReader._convert_tokens (pandas\parser.c:14116)
File "pandas\parser.pyx", line 1206, in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:16172)
File "pandas\parser.pyx", line 1222, in pandas.parser.TextReader._string_convert (pandas\parser.c:16400)
File "pandas\parser.pyx", line 1458, in pandas.parser._string_box_utf8 (pandas\parser.c:22072)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 0: invalid start byte
import sys
sys.setdefaultencoding("ISO-8859-1")
Traceback (most recent call last):
File "<ipython-input-12-b416bfca896f>", line 2, in <module>
sys.setdefaultencoding("ISO-8859-1")
AttributeError: module 'sys' has no attribute 'setdefaultencoding'
pd.read_csv('temp.csv')
Traceback (most recent call last):
File "<ipython-input-13-9cc4e896b568>", line 1, in <module>
pd.read_csv('temp.csv')
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 401, in _read
data = parser.read()
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 939, in read
ret = self._engine.read(nrows)
File "C:\Users\nivetha.n\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1508, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 848, in pandas.parser.TextReader.read (pandas\parser.c:10415)
File "pandas\parser.pyx", line 870, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10691)
File "pandas\parser.pyx", line 947, in pandas.parser.TextReader._read_rows (pandas\parser.c:11728)
File "pandas\parser.pyx", line 1049, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13162)
File "pandas\parser.pyx", line 1108, in pandas.parser.TextReader._convert_tokens (pandas\parser.c:14116)
File "pandas\parser.pyx", line 1206, in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:16172)
File "pandas\parser.pyx", line 1222, in pandas.parser.TextReader._string_convert (pandas\parser.c:16400)
File "pandas\parser.pyx", line 1458, in pandas.parser._string_box_utf8 (pandas\parser.c:22072)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 0: invalid start byte

The file is not UTF-8 encoded, so pass the encoding explicitly: use pd.read_csv('temp.csv', encoding='latin-1')

Pandas Reading csv - Error

I have a problem with reading a CSV file with pandas (I know there are other topics but I could not solve the problem). My code is:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
The error I get is:
Traceback (most recent call last):
File "/username/username/Documents/first.py", line 362, in <module>
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 562, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 325, in _read
return parser.read()
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 815, in read
ret = self._engine.read(nrows)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 1314, in read
data = self._reader.read(nrows)
File "pandas/parser.pyx", line 805, in pandas.parser.TextReader.read (pandas/parser.c:8748)
File "pandas/parser.pyx", line 827, in pandas.parser.TextReader._read_low_memory (pandas/parser.c:9003)
File "pandas/parser.pyx", line 881, in pandas.parser.TextReader._read_rows (pandas/parser.c:9731)
File "pandas/parser.pyx", line 868, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:9602)
File "pandas/parser.pyx", line 1865, in pandas.parser.raise_parser_error (pandas/parser.c:23325)
pandas.io.common.CParserError: Error tokenizing data. C error: Expected 4 fields in line 4587, saw 5
What am I doing wrong?

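Try adding the argument error_bad_lines=False to read_csv, which skips the malformed lines instead of raising an error. (In pandas 1.3 and later this argument is deprecated; use on_bad_lines='skip' instead.)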

Adding error_bad_lines=False worked for me:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,error_bad_lines=False)

Python Pandas print error in Eclipse's PyDev: unknown encoding: MS874

I am trying to use Pandas library to read csv files, using Eclipse's PyDev.
foo.csv file:
"head1", "head2",
"A", "123"
test.py:
import pandas as pd
data = pd.read_csv('foo.csv');
print data
I ran this and got an error:
Traceback (most recent call last):
File "C:\Users\qqq\studyspace\macd\test3.py", line 4, in <module>
print data
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 666, in __str__
return self.__bytes__()
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 676, in __bytes__
return self.__unicode__().encode(encoding, 'replace')
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 691, in __unicode__
fits_horizontal = self._repr_fits_horizontal_()
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 651, in _repr_fits_horizontal_
d.to_string(buf=buf)
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 1488, in to_string
formatter.to_string()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 314, in to_string
strcols = self._to_str_columns()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 258, in _to_str_columns
str_index = self._get_formatted_index()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 472, in _get_formatted_index
fmt_index = [index.format(name=show_index_names, formatter=fmt)]
File "C:\Python27\lib\site-packages\pandas\core\index.py", line 450, in format
return self._format_with_header(header, **kwargs)
File "C:\Python27\lib\site-packages\pandas\core\index.py", line 472, in _format_with_header
result = _trim_front(format_array(values, None, justify='left'))
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1321, in format_array
return fmt_obj.get_result()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1448, in get_result
return _make_fixed_width(fmt_values, self.justify)
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1495, in _make_fixed_width
max_len = np.max([_strlen(x) for x in strings])
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 184, in _strlen
return len(x.decode(encoding))
LookupError: unknown encoding: MS874
I have tried to run this in IPython, and it does not give the error, so I think the problem is with my Eclipse setting. I use Eclipse Juno and I installed Pandas via Python(x,y).
I have tried to solve it blindly like this
import pandas as pd
data = pd.read_csv('foo.csv');
b = True;
while(b):
try:
print data
b = False
except:
print 'foooo'
And it just printed 'foooo' forever.

I have found the solution.
Right click on the project => Properties => Resource => Text file encoding. Choose other => UTF-8.

Python-Reportlab error: ValueError: format not resolved

When I use python-reportlab to create a PDF document, it sometimes throws an exception: ValueError: format not resolved talk.google.com. I wonder why this happens and how to solve it. The full error stack is below:
File "/usr/lib64/python2.7/threading.py", line 552, in __bootstrap_inner
self.run()
File "/usr/lib64/python2.7/threading.py", line 505, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib/python2.7/site-packages/tweets2pdf/tweets2pdf.py", line 42,
in generate_thread
tpdoc.dump()
File "/usr/lib/python2.7/site-packages/tweets2pdf/pdfgen.py", line 609, in
dump
self.pdfdoc.build(self.elements, onFirstPage = self.on_first_page,
onLaterPages = self.on_later_pages)
File "/usr/lib64/python2.7/site-
packages/reportlab/platypus/doctemplate.py", line 1117, in build
BaseDocTemplate.build(self,flowables, canvasmaker=canvasmaker)
File "/usr/lib64/python2.7/site-
packages/reportlab/platypus/doctemplate.py", line 906, in build
self._endBuild()
File "/usr/lib64/python2.7/site-
packages/reportlab/platypus/doctemplate.py", line 848, in _endBuild
if getattr(self,'_doSave',1): self.canv.save()
File "/usr/lib64/python2.7/site-packages/reportlab/pdfgen/canvas.py", line
1123, in save
self._doc.SaveToFile(self._filename, self)
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 235, in SaveToFile
f.write(self.GetPDFData(canvas))
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 257, in GetPDFData
return self.format()
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 417, in format
IOf = IO.format(self)
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 869, in format
fcontent = format(self.content, document, toplevel=1) # yes this is at
top level
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 102, in format
f = element.format(document)
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 1635, in format
return D.format(document)
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 667, in format
L = [(format(PDFName(k),document)+" "+format(dict[k],document)) for k in
keys]
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 102, in format
f = element.format(document)
File "/usr/lib64/python2.7/site-packages/reportlab/pdfbase/pdfdoc.py",
line 1764, in format
if f is None: raise ValueError, "format not resolved %s" % self.name
ValueError: format not resolved talk.google.com

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Problems with utf-8 in python - python

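It looks like your source data isn't encoded as UTF-8 - it is probably in one of the other codecs. Per this answer, you might want to try encoding='GBK' to start with, or encoding='gb2312'. Since your file paths contain Korean text, the data may instead be in a Korean encoding, so encoding='cp949' or encoding='euc-kr' is also worth trying.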

Related

Can't decode csv file using pandas

Getting the below error while trying to read a CSV file in python

Pandas Reading csv - Error

Python Pandas print error in Eclipse's PyDev: unknown encoding: MS874

Python-Reportlab error: ValueError: format not resolved

Categories

Resources