Basic JSON to pandas DataFrame build - Python

I am very new to Python and learning as I go. My task is to crawl data from the web and fill an xlsx file, using json and pandas (among other libraries). I have looked through several examples of converting a JSON dict to a pandas DataFrame, but I can't seem to find the one I need.
I'm guessing this is very basic, but please help me out.
Below is my code:
js ='{"startDate":"2017-01-01","endDate":"2017-10-31","timeUnit":"month","results":
[{"title":"fruit","keywords":["apple","banana"],"data":
[{"period":"2017-01-01","ratio":19.35608},
{"period":"2017-02-01","ratio":17.33902},
{"period":"2017-03-01","ratio":22.30411},
{"period":"2017-04-01","ratio":20.94646},
{"period":"2017-05-01","ratio":23.8557},
{"period":"2017-06-01","ratio":22.38169},
{"period":"2017-07-01","ratio":27.38557},
{"period":"2017-08-01","ratio":19.16214},
{"period":"2017-09-01","ratio":32.07913},
{"period":"2017-10-01","ratio":41.89293}]},
{"title":"veg","keywords":["carrot","onion"],"data":
[{"period":"2017-01-01","ratio":100.0},
{"period":"2017-02-01","ratio":80.41117},
{"period":"2017-03-01","ratio":89.29402},
{"period":"2017-04-01","ratio":74.32118},
{"period":"2017-05-01","ratio":69.82156},
{"period":"2017-06-01","ratio":66.52444},
{"period":"2017-07-01","ratio":67.84328},
{"period":"2017-08-01","ratio":74.43754},
{"period":"2017-09-01","ratio":65.82621},
{"period":"2017-10-01","ratio":65.55469}]}]}'
I have tried the following:
df = pd.DataFrame.from_dict(json_normalize(js), orient='columns')
df
and
df = pd.read_json(js)
results = df['results'].head()
dd = results['data']
results.to_json(orient='split')
and
data = json.loads(js)
data["results"]
data["startDate"]
data2 = json.loads(data["results"])
data2["data"]
I want my DataFrame to look like this:
Date Fruit Veg
0 2017-01-01 19.35608 100.0
1 2017-02-01 17.33902 80.41117
2 2017-03-01 22.30411 89.29402
3 2017-04-01 20.94646 74.32118
4 2017-05-01 23.8557 69.82156
Edit:
The code (from @COLDSPEED) worked perfectly until one point. I used it in my new crawler ("Crawler: Combining DataFrame per each loop Python") and it ran fine until my DNA reached 170. The error message is below:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'period'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-30-2a1de403b285> in <module>()
47 d = json.loads(js)
48 lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio' : r['title']})
---> 49 for r in d['results']]
50 df = pd.concat(lst, 1)
51 dfdfdf = Data.join(df)
<ipython-input-30-2a1de403b285> in <listcomp>(.0)
47 d = json.loads(js)
48 lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio' : r['title']})
---> 49 for r in d['results']]
50 df = pd.concat(lst, 1)
51 dfdfdf = Data.join(df)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
2828 names.append(None)
2829 else:
-> 2830 level = frame[col]._values
2831 names.append(col)
2832 if drop:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'period'
I found out that the error occurs when the js has no values in 'data', as shown below (please disregard the Korean title):
{"startDate":"2016-01-01","endDate":"2017-12-03","timeUnit":"date","results":[{"title":"황금뿔나팔버섯","keywords":["황금뿔나팔버섯"],"data":[]}]}
So I want to check whether 'data' is empty before using your code. Please take a look at my attempt below and tell me what is wrong with it:
if ([pd.DataFrame.from_dict(r['data']) for r in d['results']] == []):
    # want to put only the column name as 'title' and move on
else:
    lst = [pd.DataFrame.from_dict(r['data']).set_index('period').rename(columns={'ratio': r['title']})
           for r in d['results']]
    df = pd.concat(lst, 1)

Assuming your structure is consistent, use a list comprehension and then concatenate -
import json
d = json.loads(js)
lst = [
    pd.DataFrame.from_dict(r['data'])
      .set_index('period')
      .rename(columns={'ratio': r['title']})
    for r in d['results']
]
df = pd.concat(lst, 1)
df
fruit veg
period
2017-01-01 19.35608 100.00000
2017-02-01 17.33902 80.41117
2017-03-01 22.30411 89.29402
2017-04-01 20.94646 74.32118
2017-05-01 23.85570 69.82156
2017-06-01 22.38169 66.52444
2017-07-01 27.38557 67.84328
2017-08-01 19.16214 74.43754
2017-09-01 32.07913 65.82621
2017-10-01 41.89293 65.55469
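For the empty 'data' case raised in the edit, one way to guard, sketched here and not tested against the original crawler: build a frame per result only when 'data' is non-empty, and otherwise fall back to an empty column named after the title.
import json
import pandas as pd

d = json.loads(js)  # js as defined in the question

lst = []
for r in d['results']:
    if r['data']:  # non-empty list of {'period': ..., 'ratio': ...} records
        frame = (pd.DataFrame.from_dict(r['data'])
                   .set_index('period')
                   .rename(columns={'ratio': r['title']}))
    else:
        # No data points for this result: keep just the title as an empty column
        frame = pd.DataFrame(columns=[r['title']])
    lst.append(frame)

df = pd.concat(lst, axis=1)
Note that the if-check in the edit never triggers, because the list comprehension returns a list of (possibly empty) DataFrames rather than an empty list; checking r['data'] per result avoids that.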

Related

Getting KeyError: 'Date' when trying to create dataframes for S&P 500

I am trying to create DataFrames for each company in the S&P 500. Here is the code so far:
import pandas as pd
import pandas_datareader as web
import datetime as dt
from datetime import date
import numpy as np
data = []
#Get the asset data
def get_data():
    start = dt.datetime(2020, 9, 30)
    end = date.today()
    csv_file = pd.read_csv(os.path.expanduser("/Users/benitocano/Downloads/copyOfSandP500.csv"), delimiter=',')
    tickers = pd.read_csv("/Users/benitocano/Downloads/copyOfSandP500.csv", delimiter=',', names=['Symbol', 'Name', 'Sector'])
    for i in tickers['Symbol'][0:100]:
        df = web.DataReader(i, 'yahoo', start, end)
        df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], axis=1, inplace=True)
        data.append(df)
The code works fine until I index the first 100 companies or more; then the error I get is KeyError: 'Date'. I think this is happening because there are new companies added to the S&P 500, but I can't find any newly added companies that do not have at least 3 months of data. Why is this error happening?
Thanks!
Full error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2645 try:
-> 2646 return self._engine.get_loc(key)
2647 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Date'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-100-b1ba6beb8cbb> in <module>
31 # non_stationary.append(i)
32
---> 33 get_data()
34 # ADF_Test()
35
<ipython-input-100-b1ba6beb8cbb> in get_data()
16 tickers = pd.read_csv("/Users/benitocano/Downloads/copyOfSandP500.csv", delimiter=',', names = ['Symbol', 'Name', 'Sector'])
17 for i in tickers['Symbol'][0:100]:
---> 18 df = web.DataReader(i, 'yahoo', start, end)
19 df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], axis=1, inplace=True)
20 data.append(df)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
212 else:
213 kwargs[new_arg_name] = new_arg_value
--> 214 return func(*args, **kwargs)
215
216 return cast(F, wrapper)
~/opt/anaconda3/lib/python3.8/site-packages/pandas_datareader/data.py in DataReader(name, data_source, start, end, retry_count, pause, session, api_key)
374
375 if data_source == "yahoo":
--> 376 return YahooDailyReader(
377 symbols=name,
378 start=start,
~/opt/anaconda3/lib/python3.8/site-packages/pandas_datareader/base.py in read(self)
251 # If a single symbol, (e.g., 'GOOG')
252 if isinstance(self.symbols, (string_types, int)):
--> 253 df = self._read_one_data(self.url, params=self._get_params(self.symbols))
254 # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
255 elif isinstance(self.symbols, DataFrame):
~/opt/anaconda3/lib/python3.8/site-packages/pandas_datareader/yahoo/daily.py in _read_one_data(self, url, params)
163 prices = DataFrame(data["prices"])
164 prices.columns = [col.capitalize() for col in prices.columns]
--> 165 prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
166
167 if "Data" in prices.columns:
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
2798 if self.columns.nlevels > 1:
2799 return self._getitem_multilevel(key)
-> 2800 indexer = self.columns.get_loc(key)
2801 if is_integer(indexer):
2802 indexer = [indexer]
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2646 return self._engine.get_loc(key)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2650 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Date'
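Since the traceback shows the KeyError is raised inside pandas_datareader when a ticker comes back with no usable price rows, one common defensive pattern, sketched below and not tested against the actual S&P list, is to wrap the download in try/except and skip symbols that fail, so one bad ticker does not abort the whole loop.
import datetime as dt
from datetime import date

import pandas as pd
import pandas_datareader as web

def get_data(symbols):
    """Download data for each symbol, skipping any that fail."""
    start = dt.datetime(2020, 9, 30)
    end = date.today()
    data = []
    for symbol in symbols:
        try:
            df = web.DataReader(symbol, 'yahoo', start, end)
        except KeyError:
            # No 'Date'/price data came back for this ticker; skip it
            # (other fetch errors could be handled the same way)
            print(f'Skipping {symbol}: no price data returned')
            continue
        df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], axis=1, inplace=True)
        data.append(df)
    return data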

Impute missing values using apply and lambda functions

I am trying to impute the missing values in the "Item_Weight" variable by taking the average of the variable for each "Item_Type", as per the code below. But when I run it, I get the KeyError shown below. Is it the pandas version that does not allow this, or is something wrong with the code?
Item_Weight_Average = train.dropna(subset=['Item_Weight']).pivot_table(values='Item_Weight', index='Item_Type')
missing = train['Item_Weight'].isnull()
train.loc[missing, 'Item_Weight'] = train.loc[missing, 'Item_Type'].apply(lambda x: Item_Weight_Average[x])
KeyError Traceback (most recent call last)
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
KeyError: 'Snack Foods'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-25-c9971d0bdaf7> in <module>()
1 Item_Weight_Average = train.dropna(subset=['Item_Weight']).pivot_table(values='Item_Weight',index='Item_Type')
2 missing = train['Item_Weight'].isnull()
----> 3 train.loc[missing,'Item_Weight'] = train.loc[missing,'Item_Type'].apply(lambda x: Item_Weight_Average[x])
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
2353 else:
2354 values = self.asobject
-> 2355 mapped = lib.map_infer(values, f, convert=convert_dtype)
2356
2357 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\src\inference.pyx in pandas._libs.lib.map_infer (pandas\_libs\lib.c:66645)()
<ipython-input-25-c9971d0bdaf7> in <lambda>(x)
1 Item_Weight_Average = train.dropna(subset=['Item_Weight']).pivot_table(values='Item_Weight',index='Item_Type')
2 missing = train['Item_Weight'].isnull()
----> 3 train.loc[missing,'Item_Weight'] = train.loc[missing,'Item_Type'].apply(lambda x: Item_Weight_Average[x])
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\m1013523\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
KeyError: 'Snack Foods'
any ideas or workarounds for this one?
If I understand what you're trying to do, then there's an easier way to solve your problem. Instead of making a new series of averages, you can calculate the average item_weight by item_type using groupby, transform, and np.mean(), and fill in the missing spots in item_weight using fillna().
# Setting up some toy data
import pandas as pd
import numpy as np
df = pd.DataFrame({'item_type': [1, 1, 1, 2, 2, 2],
                   'item_weight': [2, 4, np.nan, 10, np.nan, np.nan]})
# The solution
df.item_weight.fillna(df.groupby('item_type').item_weight.transform(np.mean), inplace=True)
The result:
item_type item_weight
0 1 2.0
1 1 4.0
2 1 3.0
3 2 10.0
4 2 10.0
5 2 10.0
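As an aside on the original error: pivot_table returns a DataFrame, so Item_Weight_Average[x] looks for a column named after the item type (e.g. 'Snack Foods') and raises a KeyError. If you prefer to keep the pivot_table approach, a minimal sketch, assuming the column names from the question:
# Squeeze the single-column pivot into a Series keyed by Item_Type,
# then map each missing row's Item_Type to its average weight.
Item_Weight_Average = (train.dropna(subset=['Item_Weight'])
                            .pivot_table(values='Item_Weight', index='Item_Type')
                            .squeeze())
missing = train['Item_Weight'].isnull()
train.loc[missing, 'Item_Weight'] = train.loc[missing, 'Item_Type'].map(Item_Weight_Average)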

How to plot a histogram of one column colored by another in Python?

I have a dataset that contains, among other columns, 3 columns titled Gender (either M or F), House (either A or B or C), and Indicator (either 0 or 1). I want to plot the histogram of House A colored by Gender. This is my code to do this:
import pandas as pd
df = pd.read_csv('dataset.csv', usecols=['House', 'Gender', 'Indicator'])
A = df[df['House'] == 'A']
A = pd.DataFrame(A, columns=['Indicator', 'Gender'])
This imports the values of House A for the respective genders correctly, as shown by its contents:
print(A)
Indicator Gender
0 1 Male
1 1 Male
2 1 Male
4 1 Female
7 1 Male
8 1 Male
11 1 Male
14 1 Male
17 1 Male
18 1 Female
19 1 Female
20 1 Female
21 1 Male
24 1 Male
26 1 Female
27 1 Male
... ... ...
Now when I want to plot the histogram of A colored by gender the way I did in MATLAB, it gives an error:
import matplotlib.pyplot as plt
plt.hist(A)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-130-81c3aef1748b> in <module>()
----> 1 plt.hist(A)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\pyplot.py in hist(x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, normed, hold, data, **kwargs)
3130 histtype=histtype, align=align, orientation=orientation,
3131 rwidth=rwidth, log=log, color=color, label=label,
-> 3132 stacked=stacked, normed=normed, data=data, **kwargs)
3133 finally:
3134 ax._hold = washold
~\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, *args, **kwargs)
1853 "the Matplotlib list!)" % (label_namer, func.__name__),
1854 RuntimeWarning, stacklevel=2)
-> 1855 return func(ax, *args, **kwargs)
1856
1857 inner.__doc__ = _add_data_doc(inner.__doc__,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\axes\_axes.py in hist(***failed resolving arguments***)
6512 for xi in x:
6513 if len(xi) > 0:
-> 6514 xmin = min(xmin, xi.min())
6515 xmax = max(xmax, xi.max())
6516 bin_range = (xmin, xmax)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\_methods.py in _amin(a, axis, out, keepdims)
27
28 def _amin(a, axis=None, out=None, keepdims=False):
---> 29 return umr_minimum(a, axis, None, out, keepdims)
30
31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
TypeError: '<=' not supported between instances of 'int' and 'str'
It seems I need to specify the exact column to make a histogram of. Unlike MATLAB, it cannot automatically work out that it should color according to the other column. So the following plots the histogram, but with no color indicating the Gender:
plt.hist(A['Indicator'])
So, how do I make either a stacked histogram, or a side-by-side one colored by gender? Something like this, except there'll be only 2 bars for each Indicator, at x=0 and x=1:
x = np.random.randn(1000, 2)
colors = ['red', 'green']
plt.hist(x, color=colors)
plt.legend(['Male', 'Female'])
plt.title('Male and Female indicator by gender')
I have tried to imitate the above by copying the 2 dataframe columns into 2 columns of a list, and then trying to plot the histogram:
y=[]
y[0] = A[A['Gender'=='M']].tolist()
y[1] = A[A['Gender'=='F']].tolist()
plt.hist(y)
But this gives the following error:
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: False
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-152-138cb74b6e00> in <module>()
2 A= pd.DataFrame(A, columns=['Indicator', 'Gender'])
3 y=[]
----> 4 y[0] = A[A['Gender'=='M']].tolist()
5 y[1] = A[A['Gender'=='F']].tolist()
6 plt.hist(y)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2484 res = cache.get(item)
2485 if res is None:
-> 2486 values = self._data.get(item)
2487 res = self._box_item_values(item, values)
2488 cache[item] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3063 return self._engine.get_loc(key)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: False
The following should work, not tested with your data though.
genders = A.Gender.unique()
plt.hist([A.loc[A.Gender == x, 'Indicator'] for x in genders], label=genders)
Your code fails on A[A['Gender'=='M']] because it should be A[A['Gender'] == 'M'] to get the Male elements, but you also need to select the column that you want.
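A minimal self-contained sketch of the same idea, using made-up toy data with the 'Gender' and 'Indicator' column names from the question:
import matplotlib.pyplot as plt
import pandas as pd

# Hypothetical stand-in for House A
A = pd.DataFrame({'Indicator': [0, 1, 1, 0, 1, 1, 0, 1],
                  'Gender':    ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'M']})

genders = A.Gender.unique()
# One array of Indicator values per gender; matplotlib draws them side by side
plt.hist([A.loc[A.Gender == g, 'Indicator'] for g in genders],
         bins=2, label=genders)
plt.legend()
plt.xlabel('Indicator')
plt.show()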

Reading and using a CSV file in Python 3 pandas

I have a CSV file:
Firstname Lastname City Province
'Guy', 'Ouell', 'Brossard','QC'
'Michelle', 'Balonne','Stittsville','ON'
'Ben', 'Sluzing','Toronto','ON'
'Theodora', 'Panapoulos','Saint-Constant','QC'
'Kathleen', 'Mercier','St Johns','NL'
...
I open and check it, and everything looks fine:
df = pd.read_csv('a.csv')
df.head(n=5)
When I want to use the columns, I have two different problems.
Problem 1: I only have access to the first column; when I try to use the other columns I get an error:
for mis_column, mis_row in missing_df.iterrows():
    print(mis_row['Firstname'])
I get all of the first names, but when I want to get all of the cities, for example, I see:
TypeError Traceback (most recent call last)
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2482 try:
-> 2483 return libts.get_value_box(s, key)
2484 except IndexError:
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box
(pandas\_libs\tslib.c:18843)()
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box
(pandas\_libs\tslib.c:18477)()
TypeError: 'str' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-36-55ba81245685> in <module>()
1
2 for mis_column, mis_row in missing_df.iterrows():
----> 3 print(mis_row['City'])
4
5
E:\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_value(self, series, key)
2489 raise InvalidIndexError(key)
2490 else:
-> 2491 raise e1
2492 except Exception: # pragma: no cover
2493 raise e1
E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'City'
Problem 2:
for mis_column, mis_row in df.iterrows():
    if mis_row['Firstname'] == 'Guy':
        print('A')
does not print 'A'.
Thanks in advance
Make your CSV's header comma-separated, like this:
Firstname, Lastname, City, Province
'Guy', 'Ouell', 'Brossard','QC'
'Michelle', 'Balonne','Stittsville','ON'
'Ben', 'Sluzing','Toronto','ON'
'Theodora', 'Panapoulos','Saint-Constant','QC'
'Kathleen', 'Mercier','St John's','NL'
As your CSV has whitespace around the values, you can read it into a DataFrame by skipping it:
df = pd.read_csv('<your_input>.csv', skipinitialspace=True)
If you want to remove the single quotes as well, then,
df = pd.read_csv('<your_input>.csv', skipinitialspace=True, quotechar="'")
>>> df
Firstname Lastname City Province
0 Guy Ouell Brossard QC
1 Michelle Balonne Stittsville ON
2 Ben Sluzing Toronto ON
3 Theodora Panapoulos Saint-Constant QC
4 Kathleen Mercier St Johns' NL
>>> import pandas as pd
>>> df = pd.read_csv('test2.csv', skipinitialspace=True, quotechar="'")
>>> df
Firstname Lastname City Province
0 Guy Ouell Brossard QC
1 Michelle Balonne Stittsville ON
2 Ben Sluzing Toronto ON
3 Theodora Panapoulos Saint-Constant QC
4 Kathleen Mercier St Johns' NL
>>> for mis_column, mis_row in df.iterrows():
... if mis_row['Firstname'] == 'Guy':
... print('A')
...
A
>>>

Numpy TypeError: an integer is required

This may be quite a basic question, but I don't know who else to ask, so I hope somebody can help. Thanks! I have installed Python using Anaconda and I am using a Jupyter notebook. I have 2 CSV files of data.
products.head()
ID_FUPID FUPID
0 1 674563
1 2 674597
2 3 674606
3 4 694776
4 5 694788
Products contains the product ID and the product number.
ratings.head()
ID_CUSTOMER ID_FUPID RATING
0 1 216 1
1 2 390 1
2 3 851 5
3 4 5897 1
4 5 9341 1
Ratings contains the customer ID, the product ID, and the rating the customer gave to the product.
I have created a table as follows:
M = ratings.pivot_table(index=['ID_CUSTOMER'], columns=['ID_FUPID'], values='RATING')
which shows the data correctly as a matrix with product IDs as columns and customer IDs as rows.
I want to compute the Pearson correlation between products, so here is my pearson function:
def pearson(s1, s2):
    """Take two pd.Series objects and return a Pearson correlation."""
    import numpy as np
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))
When I try to run pearson(M['17'], M['21']), I get the following errors:
TypeError Traceback (most recent call last)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: '17'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-277-d4ead225b6ab> in <module>()
----> 1 pearson(M['17'], M['21'])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: '17'
I will really appreciate any help! Thanks a million.
There were two places in the error message with the following line:
KeyError: '17'
This indicates there is no key '17' in M. That is most likely because the column labels of M (the ID_FUPID values) are integers, while you are accessing the DataFrame M with a string. The call to pearson should instead be:
pearson(M[17], M[21])
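A quick way to confirm this, as a sketch assuming M is the pivot table built in the question, is to inspect the column labels before indexing:
print(M.columns.dtype)   # e.g. int64, i.e. the column labels are integers, not strings
print(17 in M.columns)   # True if that product ID exists as a column
pearson(M[17], M[21])    # index with integers, not '17'/'21'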
