This script:
for x in df.index:
if df.loc[x,'medicament1'] in dicoprix:
df.loc[x,'coutmed1'] = dicoprix[df.loc[x,'medicament1']]
gives this error :
File "<ipython-input-35-097fdb2220b8>", line 3, in <module>
df.loc[x,'coutmed1'] = dicoprix[df.loc[x,'medicament1']]
File "//anaconda/lib/python2.7/site-packages/pandas/core/indexing.py", line 115, in __setitem__
self._setitem_with_indexer(indexer, value)
File "//anaconda/lib/python2.7/site-packages/pandas/core/indexing.py", line 346, in _setitem_with_indexer
value = self._align_series(indexer, value)
File "//anaconda/lib/python2.7/site-packages/pandas/core/indexing.py", line 613, in _align_series
raise ValueError('Incompatible indexer with Series')
ValueError: Incompatible indexer with Series
But the script is working, meaning df.loc[x,'coutmed1'] takes the value that I want.
I don't understand what I am doing wrong.
I think that the problem comes from this
dicoprix[df.loc[x,'medicament1']]
This problem occurs when a key in the dict refers to more than one value !
Solution: Remove the duplicate indexes from the series (i.e. dicoprix) and keep them unique
You got it, the problem is in dicoprix[df.loc[x,'medicament1']]
There are duplicates in the indexes of the series dicoprix, which cannot be put as one value in the dataframe.
Below is the demonstration:
In [1]:
import pandas as pd
dum_ser = pd.Series(index=['a','b','b','c'], data=['apple', 'balloon', 'ball', 'cat' ])
[Out 1]
a apple
b balloon
b ball
c cat
dtype: object
In [2]:
df = pd.DataFrame({'letter':['a','b','c','d'], 'full_form':['aley', 'byue', 'case', 'cible']}, index=[0,1,2,3])
df
Out [2]:
letter full_form
0 a aley
1 b byue
2 c case
3 d cible
The following command will run fine, as 'a' is not a duplicate index in the dum_ser series:
In [3]:
df.loc[0,'full_form'] = dum_ser['a']
df
Out [3]:
letter full_form
0 a apple
1 b byue
2 c case
3 d cible
An error will occur when the command tries to insert two records from the series (there are two records for the index 'b' in dum_ser; to check, run the command dum_ser['b']) into one value-space of the DataFrame. Refer below:
In [4]:
df.loc[1,'full_form'] = dum_ser['b']
Out [4]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-af11b9b3a776> in <module>()
----> 1 df.loc[1,'full_form'] = dum_ser['b']
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
635 # setting for extensionarrays that store dicts. Need to decide
636 # if it's worth supporting that.
--> 637 value = self._align_series(indexer, Series(value))
638
639 elif isinstance(value, ABCDataFrame):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
775 return ser.reindex(ax)._values
776
--> 777 raise ValueError('Incompatible indexer with Series')
778
779 def _align_frame(self, indexer, df):
ValueError: Incompatible indexer with Series
The line of code above corresponds to one iteration of the for loop, i.e. the case x = 1.
Solution: Remove the duplicate indexes from the series (i.e. dum_ser here) and keep them unique
Use indexing like this:
dicoprix[df.loc[x,'medicament1']][0]
It did work for me.
Related
I'm trying to merge the columns kw1, kw2, kw3 shown here:
and have it in one separate column called keywords. This is what I tried:
df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
df
but I'm getting this error:
ValueError Traceback (most recent call last)
Input In [13], in <cell line: 1>()
----> 1 df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
2 df
File /lib/python3.10/site-packages/pandas/core/frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
-> 3655 self._set_item(key, value)
File /lib/python3.10/site-packages/pandas/core/frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(...)
3830 ensure homogeneity.
3831 """
-> 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File /lib/python3.10/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File /lib/python3.10/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (3) does not match length of index (141)
Is there a way to make it so that it turns it into a list like this [{value of kw1}, {value of kw2}, {value of kw3}]
You can do it like this
df['keywords'] = np.stack([df['kw1'], df['kw2'], df['kw3']], axis=1).tolist()
Pandas treats each element in the outermost list as a single value, so it complains that you only have three values (which are your three series) while you need 141 values for a new column, since your original frame has 141 rows.
Stacking the underlying numpy arrays of the three series on the last dimension gives you a shape (141,3) and converting them to list gives you a list of length 141, with each element being another list of length 3.
A more concise way is to extract three columns as another df and let pandas do the stacking for you
df['keywords'] = df[['kw1', 'kw2', 'kw3']].values.tolist()
I am new to python and I can't figure out why I get this error: ValueError: Incompatible indexer with Series.
I am trying to add a date to my data frame.
The date I am trying to add:
date = (chec[(chec['Día_Sem']=='Thursday') & (chec['ID']==2011957)]['Entrada'])
date
Date output:
56 1900-01-01 07:34:00
Name: Entrada, dtype: datetime64[ns]
Then I try to add 'date' to my data frame using loc:
rep.loc[2039838,'Thursday'] = date
rep
And I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-347-3e0678b0fdbf> in <module>
----> 1 rep.loc[2039838,'Thursday'] = date
2 rep
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
188 key = com.apply_if_callable(key, self.obj)
189 indexer = self._get_setitem_indexer(key)
--> 190 self._setitem_with_indexer(indexer, value)
191
192 def _validate_key(self, key, axis):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
640 # setting for extensionarrays that store dicts. Need to decide
641 # if it's worth supporting that.
--> 642 value = self._align_series(indexer, Series(value))
643
644 elif isinstance(value, ABCDataFrame):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
781 return ser.reindex(ax)._values
782
--> 783 raise ValueError('Incompatible indexer with Series')
784
785 def _align_frame(self, indexer, df):
ValueError: Incompatible indexer with Series
I was also facing a similar issue, but in a different scenario. I came across threads about duplicate indices, but of course that was not the case with me. What worked for me was to use .at in place of .loc. So you can try and see if it works:
rep['Thursday'].at[2039838] = date.values[0]
Try date.iloc[0] instead of date:
rep.loc[2039838,'Thursday'] = date.iloc[0]
Because date is actually a Series (so basically like a list/array) of the values, and .iloc[0] actually selects the value.
You use loc to set a specific cell, but date is a Series (or DataFrame), so the types do not match. You can instead assign the underlying value of date to rep.loc[2039838,'Thursday']. For example, if date is a non-empty Series, you can do this:
rep.loc[2039838,'Thursday'] = date.values[0]
I'm trying to plot a pandas series variable, which has a numeric id in one column and frequency of that id in the next column. I wish to plot these two as a bar graph with freq on the y-axis and id no. on the x-axis. However, there are too many rows, i.e. id numbers. Is there a way I can only plot the top 10 most frequently occurring ids?
executing this code - area_count.plot.bar (note: without parentheses this does not call the plotting method; it merely displays the bound method object shown below)
gives this output -
<bound method SeriesPlotMethods.bar of
<pandas.plotting._core.SeriesPlotMethods object at 0x0000019C68029908>>
I tried storing the top 20 values from this series into another variable using the following code:
for i in range(1,20):
f[i,:] = area_count[i,:]
But it showed this error:
ValueError Traceback (most recent call last)
<ipython-input-88-1020cb7bdfc3> in <module>
1 for i in range(1,20):
----> 2 f[i,:] = area_count[i,:]
~\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
909 key = check_bool_indexer(self.index, key)
910
--> 911 return self._get_with(key)
912
913 def _get_with(self, key):
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_with(self, key)
921 elif isinstance(key, tuple):
922 try:
--> 923 return self._get_values_tuple(key)
924 except Exception:
925 if len(key) == 1:
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_values_tuple(self, key)
966
967 if not isinstance(self.index, MultiIndex):
--> 968 raise ValueError('Can only tuple-index with a MultiIndex')
969
970 # If key is contained, would have returned by now
ValueError: Can only tuple-index with a MultiIndex
If I understand you correctly, you now need the top 10 frequently occurring ids, do it by turning your series into a dataframe like:
x = df['id'].value_counts().sort_values(ascending = False).head(10).to_frame().reset_index()
x.rename(columns = {'index':'id', 'id': 'freq'}, inplace = True)
Now plot the graph:
x.plot.bar(x = 'id', y = 'freq')
Sample Output:
I am trying to find a solution to the problem given below, but it seems like I am going wrong with the approach.
I have a set of Excel files with some columns like ISBN, Title, etc. The column names in the Excel files are not consistently formatted. ISBN is named ISBN in some of the Excel files, while it is named ISBN-13, Alias, ISBN13, etc. in others. The same applies to Title and other columns.
I have read all these Excel files as data frames in Python using read_excel, and used str.contains to find the columns based on a substring. Please find the code below:
searchfor = ['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN
13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13
DIGITS)','EAN','ALIAS','ITEMCODE']
searchfor1 = ['TITLE','BOOK NAME','NAME','TITLE
NAME','TITLES','BOOKNAME','BKDESC','PRODUCT NAME','ITEM DESCRIPTION','TITLE
18','COMPLETETITLE']
for f, i in zip(files_txt1, num1):
df = pd.read_excel(f,encoding='sys.getfilesystemencoding()')
df.columns = df.columns.str.upper()
df1['Isbn'] = df[df.columns[df.columns.str.contains('|'.join(searchfor))]]
df1['Title']=
df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor1))]]
The code works fine when the Excel file has a column whose name matches an entry in the list. However, it throws an error when the Excel file does not have any column with a name similar to those in the list. Also, the code does not work for ISBN.
Please see detailed error below:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\common.py in _asarray_tuplesafe(values, dtype)
376 result = np.empty(len(values), dtype=object)
--> 377 result[:] = values
378 except ValueError:
ValueError: could not broadcast input array from shape (31807,0) into shape (31807)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value) 2375 try:
-> 2376 value = Series(value) 2377 except:
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
247 data = _sanitize_array(data, index, dtype, copy,
--> 248 raise_cast_failure=True)
249
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure) 3028 else:
-> 3029 subarr = _asarray_tuplesafe(data, dtype=dtype) 3030
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\common.py in _asarray_tuplesafe(values, dtype)
379 # we have a list-of-list
--> 380 result[:] = [tuple(x) for x in values]
381
ValueError: cannot copy sequence with size 0 to array axis with dimension 31807
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) <ipython-input-23-9e043c13fef2> in <module>()
11 df.columns = df.columns.str.upper()
12 #print(list(df.columns))
---> 13 df1['Isbn'] = df[df.columns[df.columns.str.contains('|'.join(searchfor))]]
14 df1['Title'] = df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor1))]]
15 df1['Curr'] = df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor2))]]
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value) 2329 else: 2330
# set column
-> 2331 self._set_item(key, value) 2332 2333 def _setitem_slice(self, key, value):
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value) 2394 """ 2395
-> 2396 self._ensure_valid_index(value) 2397 value = self._sanitize_column(key, value) 2398 NDFrame._set_item(self, key, value)
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value) 2376 value = Series(value) 2377 except:
-> 2378 raise ValueError('Cannot set a frame with no defined index ' 2379 'and a value that cannot be converted to a ' 2380 'Series')
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
You don't need all of this. If you know your columns beforehand, just select them at the time of reading the file into pandas itself — that way you will also reduce memory usage significantly.
df = pd.read_csv(file_name, usecols=['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN 13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13 DIGITS)','EAN','ALIAS','ITEMCODE']).fillna('')
This would work as long as you have no match or exactly 1 match
searchfor = ['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN 13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13 DIGITS)','EAN','ALIAS','ITEMCODE']
searchfor1 = ['TITLE','BOOK NAME','NAME','TITLE NAME','TITLES','BOOKNAME','BKDESC','PRODUCT NAME','ITEM DESCRIPTION','TITLE 18','COMPLETETITLE']
for f, i in zip(files_txt1, num1):
df = pd.read_excel(f,encoding='sys.getfilesystemencoding()')
df.columns = df.columns.str.upper()
cols = df.columns
is_isbn = cols.isin(searchfor)
df1['Isbn'] = df[cols[is_isbn]] if is_isbn.any() else None
is_title = cols.isin(searchfor1)
df1['Title'] = df[cols[is_title]] if is_title.any() else None
I am trying to calculate the Median of Groups over columns. I found a very clear example at
Pandas: Calculate Median of Group over Columns
This question and answer is exactly the answer I needed. I recreated the exact example posted to work through the details on my own:
import pandas
import numpy
data_3 = [2,3,4,5,4,2]
data_4 = [0,1,2,3,4,2]
df = pandas.DataFrame({'COL1': ['A','A','A','A','B','B'],
'COL2': ['AA','AA','BB','BB','BB','BB'],
'COL3': data_3,
'COL4': data_4})
m = df.groupby(['COL1', 'COL2'])[['COL3','COL4']].apply(numpy.median)
When I tried to calculate the median of Group over columns I encounter the error
TypeError: Series.name must be a hashable type
If I run the exact same code with the only difference being that median is replaced with a different statistic (mean, min, max, std), everything works just fine.
I don't understand the cause of this error and why it only occurs for median, which is what I really need to calculate.
Thanks in advance for your help,
Bob
Here is the full error message. I am using python 3.5.2
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-af0ef7da3347> in <module>()
----> 1 m = df.groupby(['COL1', 'COL2'])[['COL3','COL4']].apply(numpy.median)
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
649 # ignore SettingWithCopy here in case the user mutates
650 with option_context('mode.chained_assignment', None):
--> 651 return self._python_apply_general(f)
652
653 def _python_apply_general(self, f):
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
658 keys,
659 values,
--> 660 not_indexed_same=mutated or self.mutated)
661
662 def _iterate_slices(self):
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
3373 coerce = True if any([isinstance(x, Timestamp)
3374 for x in values]) else False
-> 3375 return (Series(values, index=key_index, name=self.name)
3376 ._convert(datetime=True,
3377 coerce=coerce))
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
231 generic.NDFrame.__init__(self, data, fastpath=True)
232
--> 233 self.name = name
234 self._set_axis(0, index, fastpath=True)
235
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
2692 object.__setattr__(self, name, value)
2693 elif name in self._metadata:
-> 2694 object.__setattr__(self, name, value)
2695 else:
2696 try:
/Applications/anaconda3/lib/python3.5/site-packages/pandas/core/series.py in name(self, value)
307 def name(self, value):
308 if value is not None and not com.is_hashable(value):
--> 309 raise TypeError('Series.name must be a hashable type')
310 object.__setattr__(self, '_name', value)
311
TypeError: Series.name must be a hashable type
Somehow the series name at this stage is being interpreted as un-hashable, despite supposedly being a tuple. I think it may be the same bug as the one fixed and closed:
Apply on selected columns of a groupby object - stopped working with 0.18.1 #13568
Basically, single scalar values in groups (as you have in your example) were causing the name of the Series to not be passed through. It is fixed in 0.19.2.
In any case, it shouldn't be a practical concern since you can (and should) call mean, median, etc. on GroupBy objects directly.
>>> df.groupby(['COL1', 'COL2'])[['COL3', 'COL4']].median()
COL3 COL4
COL1 COL2
A AA 2.5 0.5
BB 4.5 2.5
B BB 3.0 3.0