Related
I'm getting the following error in my code, and I'm not sure how to fix it. I'm trying to write a function that reads a list of xlsx files into individual pandas DataFrames then concatenates them into a single DataFrame.
It was working originally, but now I'm getting this XLRD Error:
---------------------------------------------------------------------------
XLRDError Traceback (most recent call last)
<ipython-input-123-91bc654495ef> in <module>
11
12 # Returns a single combined dataframe
---> 13 stacked_df = excel_to_dataframe(validation_list, BASEDIR)
<ipython-input-123-91bc654495ef> in excel_to_dataframe(list, file_path)
5 data = []
6 for i in list:
----> 7 df = pd.read_excel(os.path.join(file_path, i), index_col=None)
8 data.append(df)
9 # Concatanates the list of validation dataframes
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
186 else:
187 kwargs[new_arg_name] = new_arg_value
--> 188 return func(*args, **kwargs)
189 return wrapper
190 return _deprecate_kwarg
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
186 else:
187 kwargs[new_arg_name] = new_arg_value
--> 188 return func(*args, **kwargs)
189 return wrapper
190 return _deprecate_kwarg
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\excel.py in read_excel(io, sheet_name, header, names, index_col, parse_cols, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skip_footer, skipfooter, convert_float, mangle_dupe_cols, **kwds)
348
349 if not isinstance(io, ExcelFile):
--> 350 io = ExcelFile(io, engine=engine)
351
352 return io.parse(
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, io, engine)
651 self._io = _stringify_path(io)
652
--> 653 self._reader = self._engines[engine](self._io)
654
655 def __fspath__(self):
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, filepath_or_buffer)
422 self.book = xlrd.open_workbook(file_contents=data)
423 elif isinstance(filepath_or_buffer, compat.string_types):
--> 424 self.book = xlrd.open_workbook(filepath_or_buffer)
425 else:
426 raise ValueError('Must explicitly set engine if not passing in'
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
155 formatting_info=formatting_info,
156 on_demand=on_demand,
--> 157 ragged_rows=ragged_rows,
158 )
159 return bk
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\xlrd\book.py in open_workbook_xls(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
90 t1 = perf_counter()
91 bk.load_time_stage_1 = t1 - t0
---> 92 biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
93 if not biff_version:
94 raise XLRDError("Can't determine file's BIFF version")
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\xlrd\book.py in getbof(self, rqd_stream)
1276 bof_error('Expected BOF record; met end of file')
1277 if opcode not in bofcodes:
-> 1278 bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
1279 length = self.get2bytes()
1280 if length == MY_EOF:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\xlrd\book.py in bof_error(msg)
1270
1271 def bof_error(msg):
-> 1272 raise XLRDError('Unsupported format, or corrupt file: ' + msg)
1273 savpos = self._position
1274 opcode = self.get2bytes()
XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'\rJulian '
Here are my base and log directories:
# Build the output locations for the final validation workbook.
DATE_NOW = datetime.today().strftime('%Y-%m-%d')  # e.g. '2024-01-31'
BASEDIR = Path(r'\\company.Net\NDrive\Project\50813\NJ1\Task 3 Db-Template-Tools Dev\UPL\upl_workbooks\2020_workbooks\Uploaded to S3\log_csv files\NEW_REPROCESSED_NF_OP')
LOGDIR = BASEDIR / 'validation_summary_output'
# Bug fix: LOGDIR already contains BASEDIR, so `BASEDIR / LOGDIR / ...`
# joined the base directory twice; the log file lives directly under LOGDIR.
LOGNAME = LOGDIR / f"final_validation_output_{DATE_NOW}.xlsx"
And here is the code that reads the xlsx files into DataFrames:
def excel_to_dataframe(list, file_path):
    """
    Read a list of Excel workbooks into individual DataFrames and
    concatenate them into a single DataFrame.

    Parameters
    ----------
    list : iterable of str
        Workbook file names. (The name is kept for backward compatibility
        with existing keyword callers, although it shadows the builtin.)
    file_path : str or pathlib.Path
        Directory that contains the workbooks.

    Returns
    -------
    pandas.DataFrame
        All workbooks stacked row-wise, columns sorted; an empty
        DataFrame when no file names are given.
    """
    file_names = list  # alias so the builtin `list` is not shadowed below
    data = []
    for name in file_names:
        # index_col=None: keep the default RangeIndex for every file
        df = pd.read_excel(os.path.join(file_path, name), index_col=None)
        data.append(df)
    # pd.concat raises ValueError on an empty sequence; guard against it.
    if not data:
        return pd.DataFrame()
    # Concatenates the list of validation dataframes (sort=True aligns columns)
    return pd.concat(data, sort=True)
# Returns a single combined dataframe
# NOTE(review): BASEDIR already appears to be a pathlib.Path, so
# pathlib.Path(BASEDIR) looks like a redundant (harmless) re-wrap — confirm
# against the definition of BASEDIR and simplify if so.
stacked_df = excel_to_dataframe(validation_list, pathlib.Path(BASEDIR))
How do I fix this issue? Thanks.
I am trying to import a large excel file (400k x 40) to a Pandas DataFrame. While it works well on my local machine it breaks when ported to a linux server with Python=3.7, Pandas=1.2.4 and Openpyxl=3.0.7. Locally I have slightly older packages. I have tried all constellations of parameters regarding types and dates:
df = pd.read_excel(fpath)
df = pd.read_excel(fpath, dtype=str, parse_dates=['the_only_actual_date_column']) # all dates are within 2017
df = pd.read_excel(fpath, dtype={k:str for k in column_names})
df = pd.read_excel(fpath, converters={k:str for k in column_names})
but nothing worked, I am always getting the same error:
OverflowError: date value out of range
I suspect that somewhere a random cell is treated as a date but how is it possible that the date value is a concern if I declare to treat everything as a string? The same happens even if I specify usecols to only one column which is definitely not a date.
Edit: full error message
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-12-d38de141bf91> in <module>
----> 1 fu = pd.read_excel(fpath, nrows=10)
/opt/tljh/user/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
297 )
298 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299 return func(*args, **kwargs)
300
301 return wrapper
/opt/tljh/user/lib/python3.7/site-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
365 skipfooter=skipfooter,
366 convert_float=convert_float,
--> 367 mangle_dupe_cols=mangle_dupe_cols,
368 )
369 finally:
/opt/tljh/user/lib/python3.7/site-packages/pandas/io/excel/_base.py in parse(self, sheet_name, header, names, index_col, usecols, squeeze, converters, true_values, false_values, skiprows, nrows, na_values, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
1188 convert_float=convert_float,
1189 mangle_dupe_cols=mangle_dupe_cols,
-> 1190 **kwds,
1191 )
1192
/opt/tljh/user/lib/python3.7/site-packages/pandas/io/excel/_base.py in parse(self, sheet_name, header, names, index_col, usecols, squeeze, dtype, true_values, false_values, skiprows, nrows, na_values, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
490 sheet = self.get_sheet_by_index(asheetname)
491
--> 492 data = self.get_sheet_data(sheet, convert_float)
493 usecols = maybe_convert_usecols(usecols)
494
/opt/tljh/user/lib/python3.7/site-packages/pandas/io/excel/_openpyxl.py in get_sheet_data(self, sheet, convert_float)
546 data: List[List[Scalar]] = []
547 last_row_with_data = -1
--> 548 for row_number, row in enumerate(sheet.rows):
549 converted_row = [self._convert_cell(cell, convert_float) for cell in row]
550 if not all(cell == "" for cell in converted_row):
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/worksheet/_read_only.py in _cells_by_row(self, min_col, min_row, max_col, max_row, values_only)
77 data_only=self.parent.data_only, epoch=self.parent.epoch,
78 date_formats=self.parent._date_formats)
---> 79 for idx, row in parser.parse():
80 if max_row is not None and idx > max_row:
81 break
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/worksheet/_reader.py in parse(self)
153 element.clear()
154 elif tag_name == ROW_TAG:
--> 155 row = self.parse_row(element)
156 element.clear()
157 yield row
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/worksheet/_reader.py in parse_row(self, row)
284 self.row_dimensions[str(self.row_counter)] = attrs
285
--> 286 cells = [self.parse_cell(el) for el in row]
287 return self.row_counter, cells
288
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/worksheet/_reader.py in <listcomp>(.0)
284 self.row_dimensions[str(self.row_counter)] = attrs
285
--> 286 cells = [self.parse_cell(el) for el in row]
287 return self.row_counter, cells
288
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/worksheet/_reader.py in parse_cell(self, element)
205 try:
206 value = from_excel(
--> 207 value, self.epoch, timedelta=style_id in self.timedelta_formats
208 )
209 except ValueError:
/opt/tljh/user/lib/python3.7/site-packages/openpyxl/utils/datetime.py in from_excel(value, epoch, timedelta)
120 if 0 < value < 60 and epoch == WINDOWS_EPOCH:
121 day += 1
--> 122 return epoch + datetime.timedelta(days=day) + diff
123
124
OverflowError: date value out of range
If you face this issue while reading an Excel/CSV file using pandas, check the file: one or more columns may contain values displayed as #############, which means the value is negative (e.g. -11111) or the date is too long to fit in the cell.
import pandas as pd
df = pd.read_excel(file_path, dtype='string')  # this will convert all the column types to string
or if you want to convert specific column then
df = pd.read_excel(file_path, converters={'column name': str})
change the dtype according your need
df = pd.read_excel(fpath)
df = pd.read_excel(fpath, dtype='string')
you don't need to loop through each column to change the dtype; you can do it once
The goal is to combine the important information from sheet1 & sheet2 of 952 excel files. Then save them as csvs in path as a name according to a cell value. Thanks to the stackoverflow community this is mostly working. Now, it gives the error: "List index out of range". This happens around the half way point specifically 475 to 516 files saved correctly.
Would anyone be able to make this work for the entire list?
# https://stackoverflow.com/questions/44776793/copy-all-csv-files-in-a-directory-of-folders-to-one-folder-in-python
# https://stackoverflow.com/questions/59292999/modifying-multiple-csv-files-from-same-directory-in-python
# https://stackoverflow.com/questions/38101009/changing-multiple-column-names-but-not-all-of-them-pandas-python
# https://stackoverflow.com/questions/28465779/how-do-i-limit-the-amount-of-letters-in-a-string
# https://stackoverflow.com/questions/37952797/pandas-dataframe-column-name-remove-special-character
import glob
import pandas as pd
excel_files = glob.glob('data1/*.xlsx')
path = Path('data2')
for excel in excel_files:
df1 = pd.read_excel(excel, sheet_name=0, dtype=str, index_col=None)
df2 = pd.read_excel(excel, sheet_name=1, dtype=str, index_col=None)
i = df1.iat[0,1]
j = df1.iat[0,15]
df2.rename(columns={'Date':'Date','Sales':i+j}, inplace=True)
df2.columns=df2.columns.str.replace('(','')
df2.columns=df2.columns.str.replace('/','')
df2.columns=df2.columns.str.replace(')','')
df2.columns=df2.columns.str.replace(' ','-')
df2.columns=df2.columns.str.replace('<','-')
df2.columns=df2.columns.str.replace('>','-')
k = df2.columns[1]
l = (k)[:19]
m = l + '.csv'
df2.to_csv(path/m, encoding='utf-8', index=False)
->Edit below:
Stack-trace as requested. Thanks for taking a look at least.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-10-3f727a40755a> in <module>
23 for excel in excel_files:
24 df1 = pd.read_excel(excel, sheet_name=0, dtype=str, index_col=None)
---> 25 df2 = pd.read_excel(excel, sheet_name=1, dtype=str, index_col=None)
26 i = df1.iat[0,1]
27 j = df1.iat[0,15]
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
332 convert_float=convert_float,
333 mangle_dupe_cols=mangle_dupe_cols,
--> 334 **kwds,
335 )
336
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel/_base.py in parse(self, sheet_name, header, names, index_col, usecols, squeeze, converters, true_values, false_values, skiprows, nrows, na_values, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
886 convert_float=convert_float,
887 mangle_dupe_cols=mangle_dupe_cols,
--> 888 **kwds,
889 )
890
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel/_base.py in parse(self, sheet_name, header, names, index_col, usecols, squeeze, dtype, true_values, false_values, skiprows, nrows, na_values, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
439 sheet = self.get_sheet_by_name(asheetname)
440 else: # assume an integer if not a string
--> 441 sheet = self.get_sheet_by_index(asheetname)
442
443 data = self.get_sheet_data(sheet, convert_float)
~/anaconda3/lib/python3.7/site-packages/pandas/io/excel/_xlrd.py in get_sheet_by_index(self, index)
44
45 def get_sheet_by_index(self, index):
---> 46 return self.book.sheet_by_index(index)
47
48 def get_sheet_data(self, sheet, convert_float):
~/anaconda3/lib/python3.7/site-packages/xlrd/book.py in sheet_by_index(self, sheetx)
464 :returns: A :class:`~xlrd.sheet.Sheet`.
465 """
--> 466 return self._sheet_list[sheetx] or self.get_sheet(sheetx)
467
468 def sheet_by_name(self, sheet_name):
IndexError: list index out of range
Chris's comment solved the problem:
It means your particular excel workbook only has one sheet unlike the others and read_excel fails if parameter sheet_name is 1. If you want to handle that case, you need to wrap your pandas.read_excel call inside a try / except clause.
So, I'm trying to select a few lines and columns of an Excel file and turn them into a Pandas dataframe. The problem is: I must select only columns C:G and lines 19 to 245. I tried using df = pd.read_excel("Energy Indicators.xls", skiprows=18, usecols="C:G") without success.
Unfortunately I cannot change the XLS file, so I have to leave it as it is.
I always get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-ff45621b0e89> in <module>()
4 file_loc = "Energy Indicators.xls"
5
----> 6 df = pd.read_excel(file_loc, skiprows=18, usecols="A:D")
/opt/conda/lib/python3.6/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, true_values, false_values, engine, squeeze, **kwds)
198 skip_footer=skip_footer, converters=converters,
199 true_values=true_values, false_values=false_values, squeeze=squeeze,
--> 200 **kwds)
201
202
/opt/conda/lib/python3.6/site-packages/pandas/io/excel.py in _parse_excel(self, sheetname, header, skiprows, names, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, true_values, false_values, verbose, squeeze, **kwds)
502 skipfooter=skip_footer,
503 squeeze=squeeze,
--> 504 **kwds)
505
506 output[asheetname] = parser.read()
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in TextParser(*args, **kwds)
1669 """
1670 kwds['engine'] = 'python'
-> 1671 return TextFileReader(*args, **kwds)
1672
1673
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
728 self.options['has_index_names'] = kwds['has_index_names']
729
--> 730 self._make_engine(self.engine)
731
732 def close(self):
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
927 elif engine == 'python-fwf':
928 klass = FixedWidthFieldParser
--> 929 self._engine = klass(self.f, **self.options)
930
931 def _failover_to_python(self):
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, **kwds)
1818 # infer column indices from self.usecols if is is specified.
1819 self._col_indices = None
-> 1820 self.columns, self.num_original_columns = self._infer_columns()
1821
1822 # Now self.columns has the set of columns that we will process.
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in _infer_columns(self)
2181 columns = [names]
2182 else:
-> 2183 columns = self._handle_usecols(columns, columns[0])
2184 else:
2185 try:
/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py in _handle_usecols(self, columns, usecols_key)
2234 for u in self.usecols:
2235 if isinstance(u, string_types):
-> 2236 col_indices.append(usecols_key.index(u))
2237 else:
2238 col_indices.append(u)
ValueError: 'A' is not in list
Could someone please help me fix this? Thank you very much
#Would this work?
df = pd.read_excel("Energy Indicators.xls")
df=df[list('CDEFG')]
df=df[19:245]
It may make a difference if you're loading a very big file. Otherwise this should be just fine.
By default read_excel use the first line of the Excel sheet to set the column names. Maybe by adding header=None ?
df = pd.read_excel("Energy Indicators.xls", header=None, skiprows=18, usecols="C:G")
My problem is that I have multiple text files with size of 200mb+ with this format (very little example):
john,smith,3;sasha,dilma,4;sofia,vergara,5;etc.
I need to read all those files and analyze the information, graph, sum, etc.
I've been thinking in different methods to save the data and use it in Python. However, the line terminator ';' is causing problems every time I try to load data into a DataBase or directly in Python (also tried with lineterminator parameter), for example:
import pandas as pd
userHeader = ['name', 'last_name', 'number']
users = pd.read_table('C:/prueba.txt', engine='python', sep=',', header=None, names=userHeader)
# print 3 first users
print '# 3 first users: \n%s' % users[:2]
Result:
# 3 first users:
name last_name number
0 john,smith,3 sasha,dilma,4 sofia,vergara,5
Edit. When I implement lineterminator just like this:
users = pd.read_table('C:/prueba.txt', engine='python', sep=',', lineterminator=';', header=None, names=userHeader)
I get the following:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-23a80631d090> in <module>()
1 import pandas as pd
2 userHeader = ['user_id', 'gender', 'age']
----> 3 users = pd.read_table('C:/prueba.txt', engine='python', sep=';', lineterminator=';', header=None, names=userHeader)
4
5 # print 5 first users
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
472 skip_blank_lines=skip_blank_lines)
473
--> 474 return _read(filepath_or_buffer, kwds)
475
476 parser_f.__name__ = name
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
248
249 # Create the parser.
--> 250 parser = TextFileReader(filepath_or_buffer, **kwds)
251
252 if (nrows is not None) and (chunksize is not None):
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in __init__(self, f, engine, **kwds)
564 self.options['has_index_names'] = kwds['has_index_names']
565
--> 566 self._make_engine(self.engine)
567
568 def _get_options_with_defaults(self, engine):
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in _make_engine(self, engine)
709 elif engine == 'python-fwf':
710 klass = FixedWidthFieldParser
--> 711 self._engine = klass(self.f, **self.options)
712
713 def _failover_to_python(self):
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in __init__(self, f, **kwds)
1420 # Set self.data to something that can read lines.
1421 if hasattr(f, 'readline'):
-> 1422 self._make_reader(f)
1423 else:
1424 self.data = f
C:\Users\molmos\Anaconda\lib\site-packages\pandas\io\parsers.pyc in _make_reader(self, f)
1495 if sep is None or len(sep) == 1:
1496 if self.lineterminator:
-> 1497 raise ValueError('Custom line terminators not supported in '
1498 'python parser (yet)')
1499
ValueError: Custom line terminators not supported in python parser (yet)
Do you have any idea of how to read and manipulate all this information stored in text files?
I appreciate your help.
Add parameter lineterminator=";".
import pandas as pd
import io
temp=u"""john,smith,3;sasha,dilma,4;sofia,vergara,5"""
userHeader = ['name', 'last_name', 'number']
users = pd.read_table(io.StringIO(temp), sep=',', lineterminator=";",header=None, names=userHeader)
print users
# name last_name number
#0 john smith 3
#1 sasha dilma 4
#2 sofia vergara 5
You have to omit engine='python', because error:
ValueError: Custom line terminators not supported in python parser (yet)
Docs:
lineterminator : string (length 1), default None,
Character to break file into lines. Only valid with C parser
sep is the separator for fields. The line terminator is given in lineterminator.
users = pd.read_table('C:/prueba.txt', engine='c', sep=',', lineterminator=';', header=None, names=userHeader)
Use lineterminator:
df = pd.read_table('C:/prueba.txt', sep=',', lineterminator=';', header=None, names=userHeader)
In [62]: df
Out[62]:
john smith 3
0 sasha dilma 4
1 sofia vergara 5