Can't decode csv file using pandas - python

This is my code; passing encoding='utf-8' does not work either:
import pandas as pd
df = pd.read_csv('sunil.csv',encoding ="utf-8")
print(df)
And here is the error:
Traceback (most recent call last):
File "C:/Users/ASUS/Desktop/utube/sunil.py", line 2, in <module>
df = pd.read_csv('sunil.csv',encoding ="utf-8")
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 764, in pandas._libs.parsers.TextReader._get_header
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 0: invalid start byte

Can you try ISO-8859-1? It worked for me when I had the same issue:
dataset = pd.read_csv(r'filename.csv',encoding = "ISO-8859-1")

Related

I'm getting an error while reading a csv with pandas

I'm getting this error while I'm trying to read a csv in python with pandas
df02 = pd.read_csv('PMDM Full\filename.csv', sep = '|')
Traceback (most recent call last): File "<stdin>", line 1, in <module> File
"C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\util\_decorators.py", line 311, in
wrapper File "C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1250,
in read index, columns, col_dict = self._engine.read(nrows) File
"C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py",
line 225, in read chunks = self._reader.read_low_memory(nrows)
File "pandas\_libs\parsers.pyx", line 805, in
pandas._libs.parsers.TextReader.read_low_memory File
"pandas\_libs\parsers.pyx", line 861, in
pandas._libs.parsers.TextReader._read_rows File
"pandas\_libs\parsers.pyx", line 847, in
pandas._libs.parsers.TextReader._tokenize_rows File
"pandas\_libs\parsers.pyx", line 1960, in
pandas._libs.parsers.raise_parser_error pandas.errors.ParserError:
Error tokenizing data. C error: Expected 109 fields in line 1021, saw
113
Code used:
df02 = pd.read_csv('filepath', sep = '|')
sample file
This error is occurring because line 5 of the csv contains a different number of columns than the other lines.
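To read the file while skipping this line, you can use the following code (on pandas 1.3 and later, pass on_bad_lines='skip' instead; error_bad_lines is deprecated there and was removed in pandas 2.0):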
df = pd.read_csv('sample.csv', sep='|', error_bad_lines=False)

Pandas Reading csv - Error

I have a problem with reading a CSV file with pandas (I know there are other topics but I could not solve the problem). My code is:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
The error I get is:
Traceback (most recent call last):
File "/username/username/Documents/first.py", line 362, in <module>
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 562, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 325, in _read
return parser.read()
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 815, in read
ret = self._engine.read(nrows)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 1314, in read
data = self._reader.read(nrows)
File "pandas/parser.pyx", line 805, in pandas.parser.TextReader.read (pandas/parser.c:8748)
File "pandas/parser.pyx", line 827, in pandas.parser.TextReader._read_low_memory (pandas/parser.c:9003)
File "pandas/parser.pyx", line 881, in pandas.parser.TextReader._read_rows (pandas/parser.c:9731)
File "pandas/parser.pyx", line 868, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:9602)
File "pandas/parser.pyx", line 1865, in pandas.parser.raise_parser_error (pandas/parser.c:23325)
pandas.io.common.CParserError: Error tokenizing data. C error: Expected 4 fields in line 4587, saw 5
What am I doing wrong?
Try adding the argument error_bad_lines=False to read_csv (on pandas 1.3 and later, the equivalent is on_bad_lines='skip').
The following worked for me after adding that argument:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,error_bad_lines=False)

Problems with utf-8 in python

My code is below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import codecs
df1 = pd.read_csv(r'E:\내논문자료\wordcloud\test1\1311_1312.csv',encoding='utf-8')
df2 = df1.groupby(['address']).size().reset_index()
df2.rename(columns = {0: 'frequency'}, inplace = True)
print(df2[:100])
But when I execute this code, I get this message:
Traceback (most recent call last):
File "E:/빅데이터 캠퍼스/untitled1/groupby freq.py", line 7, in <module>
df1 = pd.read_csv(r'E:\내논문자료\wordcloud\test1\1311_1312.csv',encoding='utf-8')
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 645, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 400, in _read
data = parser.read()
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 938, in read
ret = self._engine.read(nrows)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 1507, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 846, in pandas.parser.TextReader.read (pandas\parser.c:10364)
File "pandas\parser.pyx", line 868, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10640)
File "pandas\parser.pyx", line 945, in pandas.parser.TextReader._read_rows (pandas\parser.c:11677)
File "pandas\parser.pyx", line 1047, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13111)
File "pandas\parser.pyx", line 1106, in pandas.parser.TextReader._convert_tokens (pandas\parser.c:14065)
File "pandas\parser.pyx", line 1204, in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:16121)
File "pandas\parser.pyx", line 1220, in pandas.parser.TextReader._string_convert (pandas\parser.c:16349)
File "pandas\parser.pyx", line 1452, in pandas.parser._string_box_utf8 (pandas\parser.c:22014)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 0: invalid start byte
How can I solve it?
Should I alter the parser code in pandas?
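It looks like your source data hasn't been encoded with UTF-8 - it's likely to be one of the other codecs. Per this answer, you might want to try with encoding='GBK' to start with, or encoding='gb2312'. If the file contains Korean text (as the path names in the question suggest), encoding='cp949' or encoding='euc-kr' is also worth trying.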

Issues reading json from txt file

I have a json string in a txt file and I'm trying to read it to do some other procedures afterwards. It looks like this:
with open('code test.txt', 'r', encoding=('UTF-8')) as f:
x = json.load(f)
I know the json is valid, but I'm getting:
Traceback (most recent call last):
File "C:\Python33\lib\json\decoder.py", line 368, in raw_decode
obj, end = self.scan_once(s, idx)
StopIteration
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 334, in <module>
user_input()
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 328, in user_input
child_remover()
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 280, in child_remover
x = json.load(f)
File "C:\Python33\lib\json\__init__.py", line 274, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "C:\Python33\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Python33\lib\json\decoder.py", line 352, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python33\lib\json\decoder.py", line 370, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
I used this website to check if the string is valid. If I use .loads(), I get a different error:
Traceback (most recent call last):
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 334, in <module>
user_input()
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 328, in user_input
child_remover()
File "C:\Users\rodrigof\Desktop\xml test\xml extraction.py", line 280, in child_remover
x = json.loads(f)
File "C:\Python33\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Python33\lib\json\decoder.py", line 352, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
TypeError: expected string or buffer
Originally the json was embedded in my script like this:
json_text="""json stuff here"""
And I didn't get any errors. Any ideas on how to fix this?
Running python 3.3.3 just in case.
Thanks!!
EDIT:
I put just some random (valid) json in the txt file and I get the same issue. This is one of the ones I tried:
{"data":
{"mobileHelp":
{"value":
{
"ID1":{"children": [1,2,3,4,5]},
"ID2":{"children": []},
"ID3":{"children": [6,7,8,9,10]}
}
}
}
}
This is also valid according to jsonlint.com.
Your file contains a UTF-8 BOM character at the start. UTF-8 doesn't need a BOM but especially Microsoft tools insist on adding one anyway.
Open the file with the utf-8-sig encoding instead:
>>> open('/tmp/json.test', 'wb').write(b'\xef\xbb\xbf{"data":\r\n {"mobileHelp":\r\n {"value":\r\n {\r\n "ID1":{"children": [1,2,3,4,5]},\r\n "ID2":{"children": []},\r\n "ID3":{"children": [6,7,8,9,10]}\r\n }\r\n }\r\n }\r\n}')
230
>>> import json
>>> with open('/tmp/json.test', encoding='utf8') as f:
... data = json.load(f)
...
Traceback (most recent call last):
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.3/json/decoder.py", line 367, in raw_decode
obj, end = self.scan_once(s, idx)
StopIteration
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.3/json/__init__.py", line 271, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.3/json/__init__.py", line 316, in loads
return _default_decoder.decode(s)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.3/json/decoder.py", line 351, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.3/json/decoder.py", line 369, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
>>> with open('/tmp/json.test', encoding='utf-8-sig') as f:
... data = json.load(f)
...
>>> data
{'data': {'mobileHelp': {'value': {'ID2': {'children': []}, 'ID3': {'children': [6, 7, 8, 9, 10]}, 'ID1': {'children': [1, 2, 3, 4, 5]}}}}}
Note that from Python 3.4 onwards you get a more helpful error message here:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.4/json/__init__.py", line 268, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python3.4/json/__init__.py", line 314, in loads
raise ValueError("Unexpected UTF-8 BOM (decode using utf-8-sig)")
ValueError: Unexpected UTF-8 BOM (decode using utf-8-sig)
Not sure what your code looks like for the second error, but it looks like you are passing json.loads a file object and not a string. Try:
with open('code test.txt', 'r', encoding=('UTF-8')) as f:
x = json.loads(f.read())
or without newlines with:
with open('code test.txt', 'r', encoding=('UTF-8')) as f:
x = json.loads(f.read().replace('\n', ''))
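As another option, this one-liner is an easier way to fix the issue (Python 2 only, where read() returns a byte string that has .decode(); on Python 3, pass encoding='utf-8-sig' to open() instead):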
json.loads(open('test.txt').read().decode('utf-8-sig'))

Python Pandas print error in Eclipse's PyDev: unknown encoding: MS874

I am trying to use Pandas library to read csv files, using Eclipse's PyDev.
foo.csv file:
"head1", "head2",
"A", "123"
test.py:
import pandas as pd
data = pd.read_csv('foo.csv');
print data
I ran this and got an error:
Traceback (most recent call last):
File "C:\Users\qqq\studyspace\macd\test3.py", line 4, in <module>
print data
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 666, in __str__
return self.__bytes__()
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 676, in __bytes__
return self.__unicode__().encode(encoding, 'replace')
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 691, in __unicode__
fits_horizontal = self._repr_fits_horizontal_()
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 651, in _repr_fits_horizontal_
d.to_string(buf=buf)
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 1488, in to_string
formatter.to_string()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 314, in to_string
strcols = self._to_str_columns()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 258, in _to_str_columns
str_index = self._get_formatted_index()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 472, in _get_formatted_index
fmt_index = [index.format(name=show_index_names, formatter=fmt)]
File "C:\Python27\lib\site-packages\pandas\core\index.py", line 450, in format
return self._format_with_header(header, **kwargs)
File "C:\Python27\lib\site-packages\pandas\core\index.py", line 472, in _format_with_header
result = _trim_front(format_array(values, None, justify='left'))
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1321, in format_array
return fmt_obj.get_result()
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1448, in get_result
return _make_fixed_width(fmt_values, self.justify)
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 1495, in _make_fixed_width
max_len = np.max([_strlen(x) for x in strings])
File "C:\Python27\lib\site-packages\pandas\core\format.py", line 184, in _strlen
return len(x.decode(encoding))
LookupError: unknown encoding: MS874
I have tried to run this in IPython, and it does not give the error, so I think the problem is with my Eclipse setting. I use Eclipse Juno and I installed Pandas via Python(x,y).
I have tried to solve it blindly like this
import pandas as pd
data = pd.read_csv('foo.csv');
b = True;
while(b):
try:
print data
b = False
except:
print 'foooo'
And it just printed 'foooo' forever.
I have found the solution.
Right click on the project => Properties => Resource => Text file encoding. Choose other => UTF-8.

Categories

Resources