Copying CSV files to XLSX with formats - python

Here is the snippet of my code.
import pandas as pd
import os

dpath = '//xxx//HEM'
frames = []  # collect one cleaned DataFrame per CSV file
for filename in os.listdir(dpath):
    df = pd.read_csv(dpath + '/' + filename)
    # Label-based COLUMN slicing requires .loc; plain df['ab':'af'] is a
    # row slice, which raises TypeError on the default RangeIndex.
    df = df.loc[:, 'ab':'af']  # select required columns based on your requirement
    df["ab"] = pd.to_numeric(df["ab"])  # convert datatype of the column based on your need
    df["af"] = pd.to_numeric(df["af"])  # convert datatype of the column based on your need
    frames.append(df)  # DataFrame.append returns a new object, so build a list instead
    del df

# Combine all files once and write a single workbook after the loop.
df1 = pd.concat(frames, ignore_index=True)
df1.to_excel('test.xlsx')
On each CSV sheet in the folder I am reading from, columns AB and AF should be numeric values. I get the following error.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-c4c944eb874e> in <module>
4 for filename in os.listdir('//xxx//HEM'):
5 df = pd.read_csv(dpath + '/' + filename)
----> 6 df = df['ab':'af'] #select required columns based on your requirement.
7 df["ab"] = pd.to_numeric(df["ab"]) # convert datatype of the column based on your need
8 df1.append(df)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2906
2907 # Do we have a slicer (on rows)?
-> 2908 indexer = convert_to_index_sliceable(self, key)
2909 if indexer is not None:
2910 return self._slice(indexer, axis=0)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in convert_to_index_sliceable(obj, key)
2454 idx = obj.index
2455 if isinstance(key, slice):
-> 2456 return idx._convert_slice_indexer(key, kind='getitem')
2457
2458 elif isinstance(key, compat.string_types):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _convert_slice_indexer(self, key, kind)
2926 """
2927 if self.is_integer() or is_index_slice:
-> 2928 return slice(self._validate_indexer('slice', key.start, kind),
2929 self._validate_indexer('slice', key.stop, kind),
2930 self._validate_indexer('slice', key.step, kind))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_indexer(self, form, key, kind)
4708 pass
4709 elif kind in ['iloc', 'getitem']:
-> 4710 self._invalid_indexer(form, key)
4711 return key
4712
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
3065 "indexers [{key}] of {kind}".format(
3066 form=form, klass=type(self), key=key,
-> 3067 kind=type(key)))
3068
3069 # --------------------------------------------------------------------
TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [ab] of <class 'str'>
Is there something I am doing wrong? I am guessing it's the format of the data?

Related

How do I create a function that pulls two values from my dataframe?

I want to create a function that basically pulls two specific values or "cells" from my data frame. The data frame is a little large so I attached a picture of it. My current code for the function
def return_riskscore(CountryA, CountryB):
    """Return the sentiment/risk value stored in ``main7`` at row
    ``CountryA`` and column ``CountryB`` (scalar lookup via ``.at``)."""
    # NOTE(review): relies on the module-level DataFrame ``main7``;
    # both labels must exist exactly (including any stray whitespace).
    return main7.at[CountryA, CountryB]
Then I used the following to apply the function to the dataframe.
main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine ', CountryB = 'Russia'))
The error I receive is as follows:
KeyError Traceback (most recent call last)
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/4285294519.py in <module>
----> 1 main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine', CountryB = 'Russia'))
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/1369879570.py in return_riskscore(CountryA, CountryB)
2 i = CountryA
3 j = CountryB
----> 4 senti_value = main7.at[i,j]
5 return senti_value
6 #for i in range(len(main7)):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2273 return self.obj.loc[key]
2274
-> 2275 return super().__getitem__(key)
2276
2277 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2220
2221 key = self._convert_key(key)
-> 2222 return self.obj._get_value(*key, takeable=self._takeable)
2223
2224 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in _get_value(self, index, col, takeable)
3577
3578 try:
-> 3579 loc = engine.get_loc(index)
3580 return series._values[loc]
3581 except AttributeError:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Ukraine'
More specifically, I want to be able to pull country A's and country B's value.
Is there another way to achieve this? an example output is the value (number) at the specified Row , Country title. For example if Country A = Ukraine and Country B = Russia, then the output I expect is 1.0 Data Frame

How to solve this problem in Python Importing Stocks data?

I'm sorry that I'm a beginner, so this might be something simple, but I don't know what might be wrong.
import pandas as pd
from pandas_datareader import data as wb

tickers = ['F', 'MSFT', 'BP']
new_data = pd.DataFrame()
for t in tickers:
    # DataReader returns a whole DataFrame (High/Low/Open/Close/...),
    # but a single DataFrame column can only hold one Series — assigning
    # the full frame raises "Cannot set a frame with no defined index...".
    # Pick one column (e.g. 'Open') per ticker.
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')['Open']
This is the error I get:
ValueError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3524 try:
-> 3525 value = Series(value)
3526 except (ValueError, NotImplementedError, TypeError):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
312
--> 313 data = SingleBlockManager(data, index, fastpath=True)
314
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, block, axis, do_integrity_check, fastpath)
1515 if not isinstance(block, Block):
-> 1516 block = make_block(block, placement=slice(0, len(axis)), ndim=1)
1517
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype, fastpath)
3266
-> 3267 return klass(values, ndim=ndim, placement=placement)
3268
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
2774
-> 2775 super().__init__(values, ndim=ndim, placement=placement)
2776
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
127 "Wrong number of items passed {val}, placement implies "
--> 128 "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs))
129 )
ValueError: Wrong number of items passed 6, placement implies 1318
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-11-6f48653a5e0e> in <module>
2 new_data = pd.DataFrame()
3 for t in tickers:
----> 4 new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3470 else:
3471 # set column
-> 3472 self._set_item(key, value)
3473
3474 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3546 """
3547
-> 3548 self._ensure_valid_index(value)
3549 value = self._sanitize_column(key, value)
3550 NDFrame._set_item(self, key, value)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3526 except (ValueError, NotImplementedError, TypeError):
3527 raise ValueError(
-> 3528 "Cannot set a frame with no defined index "
3529 "and a value that cannot be converted to a "
3530 "Series"
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
I imagine this has got something to do with missing information? Reading int as text or something? Can someone help?
This is actually from the answer key of an exercise I am doing from an online course, and it's returning this error.
You can only assign a column with in a dataframe with data of the same size: a column or Series (in Pandas terms). Let's look at what DataReader returns:
wb.DataReader('F', data_source='yahoo', start='2015-1-1').head()
Date High Low Open Close Volume AdjClose
2015-01-02 15.65 15.18 15.59 15.36 24777900.0 11.452041
2015-01-05 15.13 14.69 15.12 14.76 44079700.0 11.004693
2015-01-06 14.90 14.38 14.88 14.62 32981600.0 10.900313
2015-01-07 15.09 14.77 14.78 15.04 26065300.0 11.213456
2015-01-08 15.48 15.23 15.40 15.42 33943400.0 11.496773
It returns a dataframe. If you'd like to pick one column from here - say, Open - you can assign that to your new table.
tickers = ['F','MSFT','BP']
# Build the frame in one shot: one 'Open' price Series per ticker,
# keyed by ticker symbol (columns keep the tickers' order).
new_data = pd.DataFrame(
    {t: wb.DataReader(t, data_source='yahoo', start='2015-1-1')['Open']
     for t in tickers}
)
new_data.head()
Date F MSFT BP
2015-01-02 15.59 46.660000 38.209999
2015-01-05 15.12 46.369999 36.590000
2015-01-06 14.88 46.380001 36.009998
2015-01-07 14.78 45.980000 36.000000
2015-01-08 15.40 46.750000 36.430000
You could assign to a dictionary;
import pandas as pd
from pandas_datareader import data as wb

tickers = ['F','MSFT','BP']
# A dict can hold one full DataFrame per ticker, unlike a DataFrame column.
new_data = {t: wb.DataReader(t, data_source='yahoo', start='2015-1-1')
            for t in tickers}
print(new_data['F'])
You are trying to assign a DataFrame to each column of a DataFrame.

Trying to filter my pandas column with another list but recieving a memory error

My dataframe ag_data contains a column called state that holds state abbreviations; however, not all of the values in state are valid, recognized US state abbreviations. SA is a list of valid state abbreviations. I wanted to check whether each abbreviation in my state column is also in the SA list and filter my dataset down to those rows, but I keep getting an error. Is there another way to do this?
ag_data[ag_data.state.isin(SA)]
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-26-8e3eefc5127e> in <module>()
1 #ag_data["state"] = ag_data[ag_data[ag_data.columns[0]].isin(SA)]
----> 2 ag_data[ag_data.state.isin(SA)]
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1956 if isinstance(key, (Series, np.ndarray, Index, list)):
1957 # either boolean or fancy integer index
-> 1958 return self._getitem_array(key)
1959 elif isinstance(key, DataFrame):
1960 return self._getitem_frame(key)
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
1998 key = check_bool_indexer(self.index, key)
1999 indexer = key.nonzero()[0]
-> 2000 return self.take(indexer, axis=0, convert=False)
2001 else:
2002 indexer = self.loc._convert_to_indexer(key, axis=1)
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\generic.py in take(self, indices, axis, convert, is_copy, **kwargs)
1926 new_data = self._data.take(indices,
1927 axis=self._get_block_manager_axis(axis),
-> 1928 convert=True, verify=True)
1929 result = self._constructor(new_data).__finalize__(self)
1930
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\internals.py in take(self, indexer, axis, verify, convert)
4009 new_labels = self.axes[axis].take(indexer)
4010 return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
-> 4011 axis=axis, allow_dups=True)
4012
4013 def merge(self, other, lsuffix='', rsuffix=''):
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3895 new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
3896 fill_value if fill_value is not None else blk.fill_value,))
-> 3897 for blk in self.blocks]
3898
3899 new_axes = list(self.axes)
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\internals.py in <listcomp>(.0)
3895 new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
3896 fill_value if fill_value is not None else blk.fill_value,))
-> 3897 for blk in self.blocks]
3898
3899 new_axes = list(self.axes)
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\internals.py in take_nd(self, indexer, axis, new_mgr_locs, fill_tuple)
1044 fill_value = fill_tuple[0]
1045 new_values = algos.take_nd(values, indexer, axis=axis,
-> 1046 allow_fill=True, fill_value=fill_value)
1047
1048 if new_mgr_locs is None:
~\AppData\Local\Continuum\anaconda3\envs\acnPJ\lib\site-packages\pandas\core\algorithms.py in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
1465 out = np.empty(out_shape, dtype=dtype, order='F')
1466 else:
-> 1467 out = np.empty(out_shape, dtype=dtype)
1468
1469 func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
MemoryError:

Pandas: When can we get a TypeError doing agg

This is a rather generic question: I would like to know in what cases df.groupby(level = 'id').agg(['count']) can result in the following type error:
TypeError Traceback (most recent call last)
<ipython-input-116-b422fc0967b1> in <module>()
----> 4 df.groupby(level = 'id', sort = False).agg(['count'])
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2798 if hasattr(func_or_funcs, '__iter__'):
2799 ret = self._aggregate_multiple_funcs(func_or_funcs,
-> 2800 (_level or 0) + 1)
2801 else:
2802 cyfunc = self._is_cython_func(func_or_funcs)
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_multiple_funcs(self, arg, _level)
2867 # reset the cache so that we
2868 # only include the named selection
-> 2869 if name in self._selected_obj:
2870 obj = copy.copy(obj)
2871 obj._reset_cache()
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __contains__(self, key)
905 def __contains__(self, key):
906 """True if the key is in the info axis"""
--> 907 return key in self._info_axis
908
909 #property
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __contains__(self, key)
1326 hash(key)
1327 try:
-> 1328 self.get_loc(key)
1329 return True
1330 except LookupError:
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method)
1964
1965 if not isinstance(key, tuple):
-> 1966 loc = self._get_level_indexer(key, level=0)
1967 return _maybe_to_slice(loc)
1968
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer)
2227 else:
2228
-> 2229 loc = level_index.get_loc(key)
2230 if isinstance(loc, slice):
2231 return loc
/home/user/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2391 key = _values_from_object(key)
2392 try:
-> 2393 return self._engine.get_loc(key)
2394 except KeyError:
2395 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5239)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:4881)()
pandas/_libs/index.pyx in pandas._libs.index._bin_search (pandas/_libs/index.c:8637)()
TypeError: '<' not supported between instances of 'numpy.ndarray' and 'str'
I'm asking this because I have a dataframe where this error occurs in some cases but not always, for example if I use df.head(n).groupby(level = 'id').agg(['count']) then depending on n I get this error. For example it runs ok for head up to 2523475 and fails from 2523476 but I can't find anything wrong with the rows at those positions, they look the same as all the other rows in the dataframe and there are no null values.
I figure there must be something wrong with the data but in order to find what I need to know when this error can happen.
The data is a series in this format:
id date
3531364 2017-04-13 1
1550725 2017-02-21 1
1819411 2017-04-19 1
1636629 2016-12-28 1
3123152 2017-05-03 1
dtype: int64
In case it matters the dates are periods. Converting to_dict() gives me this:
{(1550725, Period('2017-02-21', 'D')): 1,
(1636629, Period('2016-12-28', 'D')): 1,
(1819411, Period('2017-04-19', 'D')): 1,
(3123152, Period('2017-05-03', 'D')): 1,
(3531364, Period('2017-04-13', 'D')): 1}
Some extra bits that might help somebody diagnose the problem:
Once I get the error if I do a aux = df.head(5) and then aux.groupby(level = 'id').agg(['count']) it fails BUT aux.groupby(level = 'id').count() works fine. This is also puzzling because with the original dataframe df.head(5).groupby(level = 'id').agg(['count']) works perfectly. Maybe something is cached? How can this happen?
If you try it with the Series I pasted here it will work for you and it will work for me but if I get the error and then get the head(5) of the df then it fails... (!). I guess eventually something is making agg(['count']) fail and that something is in some way cached by pandas but I really don't know what is going on under the hood.
I would appreciate any help with this. I know it is not easy.

Get keyerror accessing row by index in 16.0 Pandas dataframe in Python 3.4

Why do I keep getting a key error?
[edit] Here is the data:
GEO,LAT,LON
AALBORG DENMARK,57.0482206,9.9193939
AARHUS DENMARK,56.1496278,10.2134046
ABBOTSFORD BC CANADA,49.0519047,-122.3290473
ABEOKUTA NIGERIA,7.161,3.348
ABERDEEN SCOTLAND,57.1452452,-2.0913745
[end edit]
I can't find the row by index, but it's clearly there:
geocache = pd.read_csv('geolog.csv',index_col=['GEO']) # index_col=['GEO']
geocache.head()
Shows
LAT LON
GEO
AALBORG DENMARK 57.048221 9.919394
AARHUS DENMARK 56.149628 10.213405
ABBOTSFORD BC CANADA 49.051905 -122.329047
ABEOKUTA NIGERIA 7.161000 3.348000
ABERDEEN SCOTLAND 57.145245 -2.091374
So then I test it:
x = 'AARHUS DENMARK'
print(x)
geocache[x]
And this is what I get:
AARHUS DENMARK
KeyError Traceback (most recent call last)
in ()
2 x = u'AARHUS DENMARK'
3 print(x)
----> 4 geocache[x]
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1785 return self._getitem_multilevel(key)
1786 else:
-> 1787 return self._getitem_column(key)
1788
1789 def _getitem_column(self, key):
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1792 # get column
1793 if self.columns.is_unique:
-> 1794 return self._get_item_cache(key)
1795
1796 # duplicate columns & possible reduce dimensionaility
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1077 res = cache.get(item)
1078 if res is None:
-> 1079 values = self._data.get(item)
1080 res = self._box_item_values(item, values)
1081 cache[item] = res
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
2841
2842 if not isnull(item):
-> 2843 loc = self.items.get_loc(item)
2844 else:
2845 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\index.py in get_loc(self, key, method)
1435 """
1436 if method is None:
-> 1437 return self._engine.get_loc(_values_from_object(key))
1438
1439 indexer = self.get_indexer([key], method=method)
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3824)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3704)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12349)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12300)()
KeyError: 'AARHUS DENMARK'
No extra spaces or non-visible chars, Tried putting r and u before the string assignment with no change in behavior.
Ok, what am I missing?
As you didn't pass a sep (separator) arg to read_csv the default is comma separated. As your csv contained spaces/tabs after the commas then these get treated as part of the data hence your index data contains embedded spaces.
So you need to pass additional params to read_csv:
pd.read_csv('geolog.csv',index_col=['GEO'], sep=',\s+', engine='python')
The sep arg means that it will look for commas with optional 1 or more spaces in front of the commas, we pass engine='python' as the c engine does not accept a regex for separators.

Categories

Resources