Reading and Using a CSV file in python 3 panda - python

I have CSV file
Firstname Lastname City Province
'Guy', 'Ouell', 'Brossard','QC'
'Michelle', 'Balonne','Stittsville','ON'
'Ben', 'Sluzing','Toronto','ON'
'Theodora', 'Panapoulos','Saint-Constant','QC'
'Kathleen', 'Mercier','St Johns','NL'
...
and I open and check it which is everything is fine:
df = pd.read_csv('a.csv')
df.head(n=5)
When I want to use columns I have two different problems:
Problem1: Only I have access to the first column and when I want to use other columns I get an error:
for mis_column, mis_row in missing_df.iterrows():
print(mis_row['Firstname'])
I get all of the first names but when I want to get all of the cities, for example, I see:
TypeError Traceback (most recent call last)
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2482 try:
-> 2483 return libts.get_value_box(s, key)
2484 except IndexError:
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box
(pandas\_libs\tslib.c:18843)()
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box
(pandas\_libs\tslib.c:18477)()
TypeError: 'str' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-36-55ba81245685> in <module>()
1
2 for mis_column, mis_row in missing_df.iterrows():
----> 3 print(mis_row['City'])
4
5
E:\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_value(self, series, key)
2489 raise InvalidIndexError(key)
2490 else:
-> 2491 raise e1
2492 except Exception: # pragma: no cover
2493 raise e1
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'City'
Problem 2:
for mis_column, mis_row in df.iterrows():
if mis_row['Firstname'] == 'Guy':
print('A')
does not print A
Thanks in advance

Make your CSV's header comma-separated, like this:
Firstname, Lastname, City, Province
'Guy', 'Ouell', 'Brossard','QC'
'Michelle', 'Balonne','Stittsville','ON'
'Ben', 'Sluzing','Toronto','ON'
'Theodora', 'Panapoulos','Saint-Constant','QC'
'Kathleen', 'Mercier','St John's','NL'
As your CSV has white spaces around the values, you can read it into a dataframe by skipping them:
df = pd.read_csv('<your_input>.csv', skipinitialspace=True)
If you want to remove the single quotes as well, then,
df = pd.read_csv('<your_input>.csv', skipinitialspace=True, quotechar="'")
>>> df
Firstname Lastname City Province
0 Guy Ouell Brossard QC
1 Michelle Balonne Stittsville ON
2 Ben Sluzing Toronto ON
3 Theodora Panapoulos Saint-Constant QC
4 Kathleen Mercier St Johns' NL
>>> import pandas as pd
>>> df = pd.read_csv('test2.csv', skipinitialspace=True, quotechar="'")
>>> df
Firstname Lastname City Province
0 Guy Ouell Brossard QC
1 Michelle Balonne Stittsville ON
2 Ben Sluzing Toronto ON
3 Theodora Panapoulos Saint-Constant QC
4 Kathleen Mercier St Johns' NL
>>> for mis_column, mis_row in df.iterrows():
... if mis_row['Firstname'] == 'Guy':
... print('A')
...
A
>>>

Related

How to select columns from different tables based on other facture to create a new dataframe python

I have 2 DataFrames both countain countries
1 - the first has 183 rows
2 - the second has 156 rows
Both of them have important information about each other.
I need one column from the first and one column from the second
My goal is to create a single DataFrame containing both columns that I need, plus the names of the countries that the two dataframes have in common.
This is what I did and the message that I got
for i in range(183) :
for j in range(156):
if df['Country'][i]==df_happy['Country or region'][j]:
df.drop(i,axis=0,inplace=True)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-25-e078ef71e219> in <module>
1 for i in range(183) :
2 for j in range(156):
----> 3 if df['Country'][i]==df_happy['Country or region'][j]:
4 df.drop(i,axis=0,inplace=True)
/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/pandas/core/series.py in __getitem__(self, key)
869 key = com.apply_if_callable(key, self)
870 try:
--> 871 result = self.index.get_value(self, key)
872
873 if not is_scalar(result):
/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
4403 k = self._convert_scalar_indexer(k, kind="getitem")
4404 try:
-> 4405 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4406 except KeyError as e1:
4407 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 1
You can merge both data frames:
newdf=df.merge(df_happy,how='left', left_on='Country', right_on='Country or region')
and then drop the extra columns with:
newdf.drop(columns=['B', 'C'])

Pandas dataframe index causing problems when indexing subset of dataframe. How do I remove the indexes, or prevent the error from occurring?

I have a dataframe x1. I made a subset of the dataframe, x1_sub, where I need to use a for loop to index its items. But because the subset retains the indexing of the original pandas dataframe, it has its rows like so:
x1_sub['words']
1 investment
2 fund
4 company
7 claim
9 customer
20 easy
... ...
So, when I do something like this to index the rows of x1_sub serially:
for i in range(len(x1)):
for j in range(len(x1_sub)):
if (x1['word'][i]==x1_sub['word'][j]):
print(i, j)
it gives the following error:
KeyError Traceback (most recent call last)
<ipython-input-48-e3c9806732a6> in <module>()
3 for i in range(len(x1)):
4 for j in range(len(x1_sub)):
----> 5 if (x1['word'][i]==x1_sub['word'][j]):
6 print(i, j)
7
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
621 key = com._apply_if_callable(key, self)
622 try:
--> 623 result = self.index.get_value(self, key)
624
625 if not is_scalar(result):
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2558 try:
2559 return self._engine.get_value(s, k,
-> 2560 tz=getattr(series.dtype, 'tz', None))
2561 except KeyError as e1:
2562 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 0
EDIT: Some example data:
The following data is saved in a csv file named example.csv:
word score
service 1
customer 4
agent 3
product 6
easy 2
claim 2
fast 1
financial 5
information 1
benefit 4
company 3
helpful 6
time 2
future 2
policy 1
health 5
life 1
fund 4
complicated 3
investment 6
join 2
payment 2
premium 1
excellent 5
experience 1
family 4
nice 3
proces 6
satisfactory 2
And the code is this:
import pandas as pd
x1 = pd.read_csv(r'C:\Users\h473\Documents\Indonesia_verbatims W1 2018\Insurance Data X3\example.csv')
x1_sub = x1[x1['score']<=2]
for i in range(len(x1)):
for j in range(len(x1_sub)):
if (x1['word'][i]==x1_sub['word'][j]):
print(i, j)
And this is the output:
0 0
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-63-08d55a712c99> in <module>()
7 for i in range(len(x1)):
8 for j in range(len(x1_sub)):
----> 9 if (x1['word'][i]==x1_sub['word'][j]):
10 print(i, j)
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
621 key = com._apply_if_callable(key, self)
622 try:
--> 623 result = self.index.get_value(self, key)
624
625 if not is_scalar(result):
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2558 try:
2559 return self._engine.get_value(s, k,
-> 2560 tz=getattr(series.dtype, 'tz', None))
2561 except KeyError as e1:
2562 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 1
EDIT 2: Also, if x1_sub is a list, then the error is different:
import pandas as pd
x1 = pd.read_csv(r'C:\Users\h473\Documents\Indonesia_verbatims W1 2018\Insurance Data X3\example.csv')
#x1_sub = x1[x1['score']<=2]
x1_sub = ['service', 'claim', 'health', 'fund', 'premium', 'nice', 'process']
for i in range(len(x1)):
for j in range(len(x1_sub)):
if (x1['word'][i]==x1_sub['word'][j]):
print(i, j)
Produces the following output:
TypeError Traceback (most recent call last)
<ipython-input-68-dec8c7e33757> in <module>()
8 for i in range(len(x1)):
9 for j in range(len(x1_sub)):
---> 10 if (x1['word'][i]==x1_sub['word'][j]):
11 print(i, j)
TypeError: list indices must be integers or slices, not str
I think looping is best avoid in pandas, because very slow if exist some vectorized solution:
x1_sub = ['service', 'claim', 'health', 'fund', 'premium', 'nice', 'process']
x2 = x1[x1['word'].isin(x1_sub)]
print (x2)
word score
0 service 1
5 claim 2
15 health 5
17 fund 4
22 premium 1
26 nice 3
try to assign separate indices by using Dataframe.set_index(keys,inplace=True)
please refer to this documentation
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.set_index.html

Error in converting datas from string to int

I have a dataframe cleaned_bp['VISITCODE'] which looks like:
0 1
1 2
2 3
3 6
4 9
5 12
6 15
where the non-index column consists of strings.
I wanted to convert them to integers by doing:
for i in range(len(cleaned_bp['VISITCODE'])):
cleaned_bp['VISITCODE'][i] = int(cleaned_bp['VISITCODE'][i])
but I get this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-42-4d6508c1abda> in <module>()
1 for i in range(len(cleaned_bp['VISITCODE'])):
----> 2 cleaned_bp['VISITCODE'][i] = int(cleaned_bp['VISITCODE'][i])
~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 13
What am I doing wrong here?
Try:
for i in range(len(cleaned_bp['VISITCODE'])):
cleaned_bp['VISITCODE'].iloc[i] = int(cleaned_bp['VISITCODE'].iloc[i])
This will use the position in the index, not the index value itself.
if you are using pandas you can try:
cleaned_bp.VISITCODE.astype(int)

Basic json and pandas DataFrame build

I am very new to Python and learning my way up. My task is to crawl data from the web and fill an xlsx file using json and pandas (etc.). I have researched some examples of converting a json dict to a pandas DataFrame, and I can't seem to find the one that I need.
I'm guessing this would be very basic, but please help me out.
so below is my code
js ='{"startDate":"2017-01-01","endDate":"2017-10-31","timeUnit":"month","results":
[{"title":"fruit","keywords":["apple","banana"],"data":
[{"period":"2017-01-01","ratio":19.35608},
{"period":"2017-02-01","ratio":17.33902},
{"period":"2017-03-01","ratio":22.30411},
{"period":"2017-04-01","ratio":20.94646},
{"period":"2017-05-01","ratio":23.8557},
{"period":"2017-06-01","ratio":22.38169},
{"period":"2017-07-01","ratio":27.38557},
{"period":"2017-08-01","ratio":19.16214},
{"period":"2017-09-01","ratio":32.07913},
{"period":"2017-10-01","ratio":41.89293}]},
{"title":"veg","keywords":["carrot","onion"],"data":
[{"period":"2017-01-01","ratio":100.0},
{"period":"2017-02-01","ratio":80.41117},
{"period":"2017-03-01","ratio":89.29402},
{"period":"2017-04-01","ratio":74.32118},
{"period":"2017-05-01","ratio":69.82156},
{"period":"2017-06-01","ratio":66.52444},
{"period":"2017-07-01","ratio":67.84328},
{"period":"2017-08-01","ratio":74.43754},
{"period":"2017-09-01","ratio":65.82621},
{"period":"2017-10-01","ratio":65.55469}]}]}'
And I have tried below
df = pd.DataFrame.from_dict(json_normalize(js), orient='columns')
df
and
df = pd.read_json(js)
results = df['results'].head()
dd = results['data']
results.to_json(orient='split')
and
data = json.loads(js)
data["results"]
data["startDate"]
data2 = json.loads(data["results"])
data2["data"]
And I want my DataFrame to be like below
Date Fruit Veg
0 2017-01-01 19.35608 100.0
1 2017-02-01 17.33902 80.41117
2 2017-03-01 22.30411 89.29402
3 2017-04-01 20.94646 74.32118
4 2017-05-01 23.8557 69.82156
--------------------------------------------------------------------------------------------------------------------edit
The code (from #COLDSPEED) worked perfect until one point. I use your code to my new crawler "Crawler: Combining DataFrame per each loop Python" and it ran perfectly until my DNA reached to 170. The error message is below
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'period'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-30-2a1de403b285> in <module>()
47 d = json.loads(js)
48 lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio' : r['title']})
---> 49 for r in d['results']]
50 df = pd.concat(lst, 1)
51 dfdfdf = Data.join(df)
<ipython-input-30-2a1de403b285> in <listcomp>(.0)
47 d = json.loads(js)
48 lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio' : r['title']})
---> 49 for r in d['results']]
50 df = pd.concat(lst, 1)
51 dfdfdf = Data.join(df)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
2828 names.append(None)
2829 else:
-> 2830 level = frame[col]._values
2831 names.append(col)
2832 if drop:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'period'
I found out that this happens if the js has no value in 'data', as shown below (please disregard the Korean title).
{"startDate":"2016-01-01","endDate":"2017-12-03","timeUnit":"date","results":[{"title":"황금뿔나팔버섯","keywords":["황금뿔나팔버섯"],"data":[]}]}
So I want to check if there is 'data' before using your code. please take a look below and tell me what is wrong with it please.
if ([pd.DataFrame.from_dict(r['data']) for r in d['results']] == []):
#want to put only the column name as 'title' and move on
else:
lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio' : r['title']})
for r in d['results']]
df = pd.concat(lst, 1)
Assuming your structure is consistent, use a list comprehension and then concatenate -
import json
d = json.loads(js)
lst = [
pd.DataFrame.from_dict(r['data'])\
.set_index('period').rename(columns={'ratio' : r['title']})
for r in d['results']
]
df = pd.concat(lst, 1)
df
fruit veg
period
2017-01-01 19.35608 100.00000
2017-02-01 17.33902 80.41117
2017-03-01 22.30411 89.29402
2017-04-01 20.94646 74.32118
2017-05-01 23.85570 69.82156
2017-06-01 22.38169 66.52444
2017-07-01 27.38557 67.84328
2017-08-01 19.16214 74.43754
2017-09-01 32.07913 65.82621
2017-10-01 41.89293 65.55469

Numpy TypeError: an integer is required

This may be quite a specific question, but I don't know who else to ask — I hope somebody can help. Thanks! I have installed Python using Anaconda and am using a Jupyter notebook. I have 2 csv files of data.
products.head()
ID_FUPID FUPID
0 1 674563
1 2 674597
2 3 674606
3 4 694776
4 5 694788
Products contains the id of the product and the product number.
ratings.head()
ID_CUSTOMER ID_FUPID RATING
0 1 216 1
1 2 390 1
2 3 851 5
3 4 5897 1
4 5 9341 1
Ratings contains the id of the customer, the product ID, and the rating which the customer gave to the product.
I have created table as:
M = ratings.pivot_table(index=['ID_CUSTOMER'],columns=['ID_FUPID'],values='RATING')
Which is showing data correctly in matrix with productID= columns and customerID as rows.
I wanted to compute the Pearson correlation between products, so here is the pearson function:
def pearson(s1, s2):
import numpy as np
"""take two pd.series objects and return a pearson correlation"""
s1_c = s1 - s1.mean()
s2_c = s2 - s2.mean()
return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))
When I try to compute pearson(M['17'], M['21']), I get the following errors:
TypeError Traceback (most recent call last)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: '17'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-277-d4ead225b6ab> in <module>()
----> 1 pearson(M['17'], M['21'])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: '17'
I will really appreciate any help ! thanks a million.
There were two places in the error message with the following line:
KeyError: '17'
This indicates there is no key '17' in M. This is likely because your index is an integer. However, you are currently accessing the DataFrame M with a string. The code to call pearson might be as follows:
pearson(M[17], M[21])

Categories

Resources