Importing TSV file into pandas - python

I am trying to import a tsv file of amazon customer reviews (size 1.2 gb) into pandas.
This is what I have tried:
import pandas as pd
tsv_books = pd.read_csv('/Users/hs/Downloads/amazon_reviews_us_Books_v1_02.tsv', sep='\t')
print(tsv_books)
I am getting these error messages:
Traceback (most recent call last):
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/python_parser.py", line 760, in _next_iter_line
line = next(self.data)
_csv.Error: ' ' expected after '"'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/hs/PycharmProjects/Thesis Analysis/main.py", line 4, in <module>
tsv_books = pd.read_csv('/Users/hs/Downloads/amazon_reviews_us_Books_v1_02.tsv', delimiter='\t', engine="python")
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/util/_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
return _read(filepath_or_buffer, kwds)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 581, in _read
return parser.read(nrows)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1250, in read
index, columns, col_dict = self._engine.read(nrows)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/python_parser.py", line 238, in read
content = self._get_lines(rows)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/python_parser.py", line 1091, in _get_lines
new_row = self._next_iter_line(row_num=self.pos + rows + 1)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/python_parser.py", line 789, in _next_iter_line
self._alert_malformed(msg, row_num)
File "/Users/hs/opt/anaconda3/envs/python39/lib/python3.9/site-packages/pandas/io/parsers/python_parser.py", line 739, in _alert_malformed
raise ParserError(msg)
pandas.errors.ParserError: ' ' expected after '"'
This seems to be a rookie mistake, sorry guys. Nonetheless I would appreciate any help!

Related

File contains data in an unknown format. (m4a load from librosa)

So I am currently working on a DNN that takes in m4a files. I have ffmpeg, it creates a few batches and then dies on this error:
Traceback (most recent call last):
File "/users/work/s163838/./main.py", line 126, in <module>
File "/users/work/s163838/./main.py", line 96, in main
print("e")
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1203, in _next_data
return self._process_data(data)
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1229, in _process_data
data.reraise()
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/_utils.py", line 425, in reraise
raise self.exc_type(msg)
EOFError: Caught EOFError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/librosa/core/audio.py", line 164, in load
y, sr_native = __soundfile_load(path, offset, duration, dtype)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/librosa/core/audio.py", line 195, in __soundfile_load
context = sf.SoundFile(path)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/soundfile.py", line 629, in __init__
self._file = self._open(file, mode_int, closefd)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/soundfile.py", line 1183, in _open
_error_check(_snd.sf_error(file_ptr),
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/soundfile.py", line 1357, in _error_check
raise RuntimeError(prefix + _ffi.string(err_str).decode('utf-8', 'replace'))
RuntimeError: Error opening 'vox2/dev/aac/id08194/QnBYPze-x9A/00079.m4a': File contains data in an unknown format.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/apl/tryton/python/3.9.5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/users/work/s163838/vox_celeb_loader.py", line 53, in __getitem__
load(speaker2utt1, self.num_samples)
File "/users/work/s163838/vox_celeb_loader.py", line 13, in load
wav, sr = librosa.load(path, sr=16000)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/librosa/util/decorators.py", line 88, in inner_f
return f(*args, **kwargs)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/librosa/core/audio.py", line 170, in load
y, sr_native = __audioread_load(path, offset, duration, dtype)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/librosa/core/audio.py", line 226, in __audioread_load
reader = audioread.audio_open(path)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/audioread/__init__.py", line 111, in audio_open
return BackendClass(path)
File "/users/kdm/s163838/.local/lib/python3.9/site-packages/audioread/rawread.py", line 65, in __init__
self._file = aifc.open(self._fh)
File "/apl/tryton/python/3.9.5/lib/python3.9/aifc.py", line 917, in open
return Aifc_read(f)
File "/apl/tryton/python/3.9.5/lib/python3.9/aifc.py", line 358, in __init__
self.initfp(f)
File "/apl/tryton/python/3.9.5/lib/python3.9/aifc.py", line 314, in initfp
chunk = Chunk(file)
File "/apl/tryton/python/3.9.5/lib/python3.9/chunk.py", line 63, in __init__
raise EOFError
EOFError
I am using this command
wav, sr = librosa.load(path, sr=16000)
is it just a broken file? How do I skip such then? Or is it something about loading a m4a file even with ffmpeg and the desired output when tested on a single m4a file?

Error shows up when using df.to_parquet("filename")

I want to save the data set as a parquet file, called power.parquet, and I use df.to_parquet(<filename>). But it gives me this errer "ValueError: Error converting column "Global_reactive_power" to bytes using encoding UTF8. Original error: bad argument type for built-in operation" And I installed the fastparquet package.
from fastparquet import write, ParquetFile
dat.to_parquet("power.parquet")
df_parquet = ParquetFile("power.parquet").to_pandas()
df_parquet.head() # Test your final value
`*Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 259, in convert
out = array_encode_utf8(data)
File "fastparquet/speedups.pyx", line 50, in fastparquet.speedups.array_encode_utf8
TypeError: bad argument type for built-in operation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/folders/4f/bm2th1p56tz4rq_zffc8g3940000gn/T/ipykernel_85477/3080656655.py", line 1, in <module>
dat.to_parquet("power.parquet", compression="GZIP")
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/core.py", line 4560, in to_parquet
return to_parquet(self, path, *args, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 732, in to_parquet
return compute_as_if_collection(
File "/opt/anaconda3/lib/python3.9/site-packages/dask/base.py", line 315, in compute_as_if_collection
return schedule(dsk2, keys, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/threaded.py", line 79, in get
results = get_async(
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 507, in get_async
raise_exception(exc, tb)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 315, in reraise
raise exc
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 220, in execute_task
result = _execute_task(task, data)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/opt/anaconda3/lib/python3.9/site-packages/dask/utils.py", line 35, in apply
return func(*args, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 1167, in write_partition
rg = make_part_file(
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 716, in make_part_file
rg = make_row_group(f, data, schema, compression=compression,
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 701, in make_row_group
chunk = write_column(f, coldata, column,
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 554, in write_column
repetition_data, definition_data, encode[encoding](data, selement), 8 * b'\x00'
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 354, in encode_plain
out = convert(data, se)
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 284, in convert
raise ValueError('Error converting column "%s" to bytes using '
ValueError: Error converting column "Global_reactive_power" to bytes using encoding UTF8. Original error: bad argument type for built-in operation
*
I tried by adding object_coding = "bytes".I want to solve this problem.

I'm getting an error while reading a csv with pandas

I'm getting this error while I'm trying to read a csv in python with pandas
df02 = pd.read_csv('PMDM Full\filename.csv', sep = '|')
Traceback (most recent call last): File "<stdin>", line 1, in <module> File
"C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\util\_decorators.py", line 311, in
wrapper File "C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1250,
in read index, columns, col_dict = self._engine.read(nrows) File
"C:\Users\dm\Google Drive\CS\GV\Tickets\Status
Check\venv\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py",
line 225, in read chunks = self._reader.read_low_memory(nrows)
File "pandas\_libs\parsers.pyx", line 805, in
pandas._libs.parsers.TextReader.read_low_memory File
"pandas\_libs\parsers.pyx", line 861, in
pandas._libs.parsers.TextReader._read_rows File
"pandas\_libs\parsers.pyx", line 847, in
pandas._libs.parsers.TextReader._tokenize_rows File
"pandas\_libs\parsers.pyx", line 1960, in
pandas._libs.parsers.raise_parser_error pandas.errors.ParserError:
Error tokenizing data. C error: Expected 109 fields in line 1021, saw
113
Code used:
df02 = pd.read_csv('filepath', sep = '|')
sample file
This error is occurring because line 5 of the csv contains a different number of columns than the other lines.
To read the file excluding this line you can use the following code:
df = pd.read_csv('sample.csv', sep='|', error_bad_lines=False)

Problem with adding Excel files at Pandas | wrapper return func

Hi everybody I have a problem uploading a excel file with Pandas
I have taken the file in archive, if I uploaded it directly it gaves me an error. If I cope and paste the excel file there is no problem.
The code is very easy:
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\pandas.xlsx")
and this is the error:
Traceback (most recent call last):
File "C:/Users/obett/PycharmProjects/pythonProject6/main.py", line 24, in <module>
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\Aida_Export_67.xlsx")
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 344, in read_excel
data = io.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 1170, in parse
return self._reader.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 492, in parse
data = self.get_sheet_data(sheet, convert_float)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in get_sheet_data
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in <listcomp>
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 514, in _convert_cell
elif cell.is_date:
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 101, in is_date
return Cell.is_date.__get__(self)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\cell.py", line 256, in is_date
self.data_type == 'n' and is_date_format(self.number_format)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 66, in number_format
_id = self.style_array.numFmtId
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 56, in style_array
return self.parent.parent._cell_styles[self._style_id]
IndexError: list index out of range
Thank you very much

Pandas Reading csv - Error

I have a problem with reading a CSV file with pandas (I know there are other topics but I could not solve the problem). My code is:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
The error I get is:
Traceback (most recent call last):
File "/username/username/Documents/first.py", line 362, in <module>
fuck = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 562, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 325, in _read
return parser.read()
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 815, in read
ret = self._engine.read(nrows)
File "/Users/username/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py", line 1314, in read
data = self._reader.read(nrows)
File "pandas/parser.pyx", line 805, in pandas.parser.TextReader.read (pandas/parser.c:8748)
File "pandas/parser.pyx", line 827, in pandas.parser.TextReader._read_low_memory (pandas/parser.c:9003)
File "pandas/parser.pyx", line 881, in pandas.parser.TextReader._read_rows (pandas/parser.c:9731)
File "pandas/parser.pyx", line 868, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:9602)
File "pandas/parser.pyx", line 1865, in pandas.parser.raise_parser_error (pandas/parser.c:23325)
pandas.io.common.CParserError: Error tokenizing data. C error: Expected 4 fields in line 4587, saw 5
What am I doing wrong?
Try adding the argument error_bad_lines=False to read_csv
The following worked for me by adding:
import pandas as pd
f = pd.read_csv('1803Ltem.csv',sep='\t', dtype=object,error_bad_lines=False)

Categories

Resources