Specify lambda function to continue even if there is an error - python

I am trying to run this line of code:
df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
But for some addresses and zipcodes the function get_zillow_id() fails. I want the lambda function to just ignore the error for that particular address and zipcode and continue. How do I do that?
Here is the entire code:
from pyzillow.pyzillow import ZillowWrapper, GetDeepSearchResults, GetUpdatedPropertyDetails
import pandas as pd
import numpy as np
key = "X1-ZWz1gtmiat11xn_7ew1d"
# Create function to get zillow_id
def get_zillow_id(key, address, zipcode):
    zillow_data = ZillowWrapper(key)
    deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
    result = GetDeepSearchResults(deep_search_response)
    return result.zillow_id

# Create function to get property data
def get_property_data(key, address, zipcode):
    zillow_data = ZillowWrapper(key)
    updated_property_details_response = zillow_data.get_updated_property_details(get_zillow_id(key, address, zipcode))
    result = GetUpdatedPropertyDetails(updated_property_details_response)
    return result.year_built
# Import data into dataframe
df = pd.read_csv('test.csv')
# Get zillow ids
df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
Here is a picture of the data frame:
Here is the error I am getting:
ZillowError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self, code_obj, result)
2861 #rprint('Running code', repr(code_obj)) # dbg
-> 2862 exec(code_obj, self.user_global_ns, self.user_ns)
2863 finally:
<ipython-input-40-55f38b77eeea> in <module>()
1 # Get zillow ids
----> 2 df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
<ipython-input-40-55f38b77eeea> in <lambda>(row)
1 # Get zillow ids
----> 2 df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
<ipython-input-37-ce158395fdb8> in get_zillow_id(key, address, zipcode)
3 zillow_data = ZillowWrapper(key)
----> 4 deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
5 result = GetDeepSearchResults(deep_search_response)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyzillow\pyzillow.py in get_deep_search_results(self, address, zipcode)
30 }
---> 31 return self.get_data(url, params)
32
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyzillow\pyzillow.py in get_data(self, url, params)
81 if response.findall('message/code')[0].text is not '0':
---> 82 raise ZillowError(int(response.findall('message/code')[0].text))
83 else:
<class 'str'>: (<class 'TypeError'>, TypeError('__str__ returned non-string (type dict)',))
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self, code_obj, result)
2877 if result is not None:
2878 result.error_in_exec = sys.exc_info()[1]
-> 2879 self.showtraceback(running_compiled_code=True)
2880 else:
2881 outflag = False
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
1809 value, tb, tb_offset=tb_offset)
1810
-> 1811 self._showtraceback(etype, value, stb)
1812 if self.call_pdb:
1813 # drop into debugger
~\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py in _showtraceback(self, etype, evalue, stb)
541 u'traceback' : stb,
542 u'ename' : unicode_type(etype.__name__),
--> 543 u'evalue' : py3compat.safe_unicode(evalue),
544 }
545
~\AppData\Local\Continuum\anaconda3\lib\site-packages\ipython_genutils\py3compat.py in safe_unicode(e)
63 """
64 try:
---> 65 return unicode_type(e)
66 except UnicodeError:
67 pass
TypeError: __str__ returned non-string (type dict)

You should first try to understand exactly why your function fails, then use a try / except clause to ignore the specific problem you wish to avoid. For example, to ignore TypeError and ZillowError:
def get_zillow_id(key, address, zipcode):
    try:
        zillow_data = ZillowWrapper(key)
        deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
        result = GetDeepSearchResults(deep_search_response)
        return result.zillow_id
    except (TypeError, ZillowError):
        return None

df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']),
                           axis=1)
If ZillowError is an actual exception class, you may need to import it from the pyzillow library.
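A more defensive variant of the same idea is to catch failures broadly but record which rows failed, so silently skipped lookups don't go unnoticed. This is only a sketch built on the question's own functions; get_zillow_id_safe and failed_rows are illustrative names:
failed_rows = []

def get_zillow_id_safe(key, address, zipcode):
    try:
        zillow_data = ZillowWrapper(key)
        deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
        return GetDeepSearchResults(deep_search_response).zillow_id
    except Exception:  # broad catch: assumes any per-row failure should simply be skipped
        failed_rows.append((address, zipcode))
        return None

df['Zillow ID'] = df.apply(lambda row: get_zillow_id_safe(key, row['Address'], row['Zipcode']), axis=1)
Rows that fail end up as None (NaN) in the new column, and failed_rows tells you which lookups to retry.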

Related

trying to create a pivot table with pandas and numpy

I'm using the following to try to get a pivot table that multiplies the quantity by the effective price and groups it by product name:
df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
This is the stack trace - not sure why this isn't working.
ValueError Traceback (most recent call last)
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
970 try:
--> 971 result = self._aggregate_multiple_funcs([func], _axis=self.axis)
972
/usr/local/lib/python3.9/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
544 if not len(results):
--> 545 raise ValueError("no results")
546
ValueError: no results
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-35-f616d7b46a13> in <module>
----> 1 df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
/usr/local/lib/python3.9/site-packages/pandas/core/frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
6824 from pandas.core.reshape.pivot import pivot_table
6825
-> 6826 return pivot_table(
6827 self,
6828 values=values,
/usr/local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
981 # raised directly by _aggregate_multiple_funcs
982 raise
--> 983 result = self._aggregate_frame(func)
984 except AttributeError:
985 # catch exception from line 969
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in _aggregate_frame(self, func, *args, **kwargs)
1173 if axis != obj._info_axis_number:
1174 for name, data in self:
-> 1175 fres = func(data, *args, **kwargs)
1176 result[name] = fres
1177 else:
ValueError: invalid number of arguments
My understanding is that you cannot apply a multi-column operation in pivot_table: aggfunc works on one column at a time. Computing the product first and then aggregating with groupby can do it. Based on that, I recommend this code (I am not sure the expected result is what you want):
df_sorted['TotalPrice'] = df_sorted['Quantity']*df_sorted['EffectivePrice']
result = df_sorted.groupby('ProductName')['TotalPrice'].sum()
For this sample dataframe:
Quantity EffectivePrice ProductName
0 1 12 A
1 1 13 B
2 2 14 A
The output is like this:
ProductName
A 40
B 13
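If a pivot table is still preferred over the groupby, a sketch under the same assumptions (compute the product first, then aggregate the single derived column):
df_sorted['TotalPrice'] = df_sorted['Quantity'] * df_sorted['EffectivePrice']
result = df_sorted.pivot_table(values='TotalPrice', index='ProductName', aggfunc='sum')
This yields the same sums as the groupby above, just as a one-column DataFrame instead of a Series.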

How to use .map on an integer column in python pandas

I'm trying to take an integer column and map discrete values to another column. Basically, if a credit tier is marked 1, 2, or 3, another column maps those to no_credit_state, no_hit, or thin_file. Then the null values are filled with valid. However, I keep getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-129-926e6625f2b6> in <module>
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-129-926e6625f2b6> in <lambda>(row)
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
<ipython-input-126-462888d46184> in discrete_credit(row, variable)
6
7 """
----> 8 score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
9 score = row[score].fillna('valid')
10 score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
AttributeError: ("'numpy.int64' object has no attribute 'map'", 'occurred at index 0')
Here is a code example that is throwing the same error:
import pandas as pd

credit = {'credit_52278': [1, 2, 3, 500, 550, 600, 650, 700, 750, 800, 900]}
df = pd.DataFrame(credit)

def discrete_credit(row, variable):
    """
    allows thin files, no hits and no credit scores to float, which will then allow
    the rest of the credit score to be fit with a spline
    """
    score = row[variable].map({1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'})
    score = row[score].fillna('valid')
    score = pd.Categorical(row[score], ['valid', 'no_credit_state', 'thin_file', 'no_hit'])
    return score

df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis=1)
map is a Series method, but you are trying to use it on a scalar (numpy.int64) value.
You could simply do something like:
df['discrete_52278'] = (
df['credit_52278']
.map({
1: 'no_credit_state',
2: 'thin_file',
3: 'no_hit'
})
.fillna('valid')
.astype('category')
)
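If the explicit, ordered category list from the original attempt matters (astype('category') infers unordered categories from the data), a sketch reusing the same labels:
df['discrete_52278'] = pd.Categorical(
    df['credit_52278'].map({1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'}).fillna('valid'),
    categories=['valid', 'no_credit_state', 'thin_file', 'no_hit'],
    ordered=True,  # assumption: the original pd.Categorical call implies an intended order
)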

Give row number as index when reading csv

I have a csv file like the one here below:
30,60,14.3,53.6,0.71,403,0
30,60,15.3,54.9,0.72,403,0
30,60,16.5,56.2,0.73,403,0
30,60,17.9,57.5,0.74,403,0
No header, just data. The columns are
colNames = {
    'doa_in1': np.float64, 'doa_in2': np.float64,
    'doa_est1': np.float64, 'doa_est2': np.float64,
    'rho': np.float64,
    'seed': np.int32, 'matl_chan': np.int32
}
I read the csv with:
tmp_df = pd.read_csv(
    io.BytesIO(tmp_csv), encoding='utf8',
    header=None,
    names=colNames.keys(), dtype=colNames,
    converters={
        'matl_chan': lambda x: bool(int(x))
    }
)
This gives a warning because I'm giving two possible conversions for matl_chan, but it's just a warning that pandas will only use what is in converters (i.e. the lambda function).
I would like to have as index for each row a number or something unique.
That's because, then I process tmp_df with this function
def remove_lines(df):
    THRES = 50
    THRES_angle = 10  # degrees
    is_converging = True
    for idx, row in df.iterrows():
        if idx == 0:
            is_converging = False
        # check if MUSIC started converging
        if abs(row['doa_est1'] - row['doa_in1']) < THRES_angle:
            if abs(row['doa_est2'] - row['doa_in2']) < THRES_angle:
                is_converging = True
        # calc error
        err = abs(row['doa_est1'] - row['doa_in1']) + abs(row['doa_est2'] - row['doa_in2'])
        if err > THRES and is_converging:
            df = df.drop(idx)
    return df
All rows, though, have index 30, so the function cannot drop rows individually and I get this error:
KeyError: '[30] not found in axis'
The full stacktrace is
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-143-b61c0402f9d7> in <module>
----> 1 df=get_dataframe()
<ipython-input-121-b76aab8b17ee> in get_dataframe()
24 continue
25
---> 26 tmp_df_sanitized = remove_lines(tmp_df)
27 all_dataframes.append(tmp_df_sanitized)
28
<ipython-input-142-31019390251a> in remove_lines(df)
62 err = abs(row['doa_est1']-row['doa_in1'])+abs(row['doa_est2']-row['doa_in2'])
63 if err > THRES and is_converging:
---> 64 df=df.drop(idx)
65 print("dropped {}".format(idx))
66 return df
/usr/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3938 index=index, columns=columns,
3939 level=level, inplace=inplace,
-> 3940 errors=errors)
3941
3942 #rewrite_axis_style_signature('mapper', [('copy', True),
/usr/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3778 for axis, labels in axes.items():
3779 if labels is not None:
-> 3780 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3781
3782 if inplace:
/usr/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
3810 new_axis = axis.drop(labels, level=level, errors=errors)
3811 else:
-> 3812 new_axis = axis.drop(labels, errors=errors)
3813 result = self.reindex(**{axis_name: new_axis})
3814
/usr/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
4962 if mask.any():
4963 if errors != 'ignore':
-> 4964 raise KeyError(
4965 '{} not found in axis'.format(labels[mask]))
4966 indexer = indexer[~mask]
KeyError: '[30] not found in axis'
Is there anyone who has a solution?
edit: to be clearer, I'd like the row index to be [0,1,2,3] for the four rows I put above
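One hedged guess, since the thread shows no accepted fix: the symptom (every row indexed 30, the first column's value) looks like pandas fell back to using the first data column as the index. read_csv documents index_col=False specifically to force a default RangeIndex, which would give the [0,1,2,3] index asked for:
tmp_df = pd.read_csv(
    io.BytesIO(tmp_csv), encoding='utf8',
    header=None,
    names=list(colNames.keys()), dtype=colNames,  # list(...) avoids passing a dict view
    converters={'matl_chan': lambda x: bool(int(x))},
    index_col=False,  # force a default 0..n-1 index instead of the first column
)
# alternatively, after reading: tmp_df = tmp_df.reset_index(drop=True)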

Pandas error when filtering rows based on multiple column conditionals, "ValueError: invalid literal for int() with base 10: "

I am getting a 'ValueError: invalid literal for int() with base 10:' when I try to filter the dataframe using multiple column conditionals.
Here is the code to set up the pandas dataframe. Warning: it will download 6 MB of data. You can run it in Google Colab if you're concerned.
Code to import stuff and download the data
#Import stuff
import re
import os
import zipfile
from urllib.request import urlretrieve
from os.path import isfile, isdir
import requests
import pandas as pd  # needed below for pd.read_csv
#Define Download Function
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

#Download data
download_file_from_google_drive('1sZk3WWgdyHLru7q1KSWQwCT4nwwzHlpY', 'TheAnimeList.csv')
Code to set up the pandas dataframe
download_file_from_google_drive('1sZk3WWgdyHLru7q1KSWQwCT4nwwzHlpY', 'TheAnimeList.csv')
animeuser = pd.read_csv('TheAnimeList.csv' )
animeuser = animeuser[['anime_id','title_english', 'popularity', 'rank']]
animeuser.head()
anime_id title_english popularity rank
0 11013 Inu X Boku Secret Service 231 1274.0
1 2104 My Bride is a Mermaid 366 727.0
2 5262 Shugo Chara!! Doki 1173 1508.0
3 721 Princess Tutu 916 307.0
4 12365 Bakuman. 426 50.0
I am trying to filter rows based on column conditionals. First I tried
animeuser = animeuser[ (animeuser.popularity >= 3000) | (animeuser.rank >= 3000) ]
But that gave me this error
TypeError Traceback (most recent call last)
<ipython-input-39-8fb6d8508f25> in <module>()
----> 1 animeuser = animeuser[ (animeuser.popularity >= 3000) | (animeuser.rank >= 3000) ]
TypeError: '>=' not supported between instances of 'method' and 'int'
Then I tried
animeuser = animeuser[ ( animeuser.astype(int)['popularity'] >= 3000 ) | ( animeuser.astype(int)['rank'] >= 3000 ) ]
But that gave me this error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-40-a2ea65786b2a> in <module>()
----> 1 animeuser = animeuser[ ( animeuser.astype(int)['popularity'] >= 3000 ) | ( animeuser.astype(int)['rank'] >= 3000 ) ]
/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
116 else:
117 kwargs[new_arg_name] = new_arg_value
--> 118 return func(*args, **kwargs)
119 return wrapper
120 return _deprecate_kwarg
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
4002 # else, only a single dtype is given
4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004 **kwargs)
4005 return self._constructor(new_data).__finalize__(self)
4006
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
3460
3461 def astype(self, dtype, **kwargs):
-> 3462 return self.apply('astype', dtype=dtype, **kwargs)
3463
3464 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3327
3328 kwargs['mgr'] = self
-> 3329 applied = getattr(b, f)(**kwargs)
3330 result_blocks = _extend_blocks(applied, result_blocks)
3331
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
543 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544 **kwargs)
545
546 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
623
624 # _astype_nansafe works fine with 1-d only
--> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
626 values = values.reshape(self.shape)
627
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
690 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
691 # work around NumPy brokenness, #1987
--> 692 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
693
694 if dtype.name in ("datetime64", "timedelta64"):
pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src/util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Inu X Boku Secret Service'
The string 'Inu X Boku Secret Service' belongs to the 'title_english' column in the very first row of the dataframe. But the 'rank' and 'popularity' columns seem to be floats and ints.
I even tried looking at the datatypes
animeuser.dtypes
anime_id int64
title_english object
popularity int64
rank float64
dtype: object
And everything seems to be in order.
The first error you are facing occurs because rank is a method of pandas.DataFrame, and methods take precedence over column access via attribute notation. So in order to access the data you need to use bracket notation: animeuser['rank'].
The second error occurs because you try to cast the whole data frame to int, which is not possible for several columns; it would only work for the numeric 'rank' and 'popularity' columns.
With the statement
animeuser.astype(int)['popularity']
you are trying to convert all animeuser columns to int, and the string column raises the error. Use just
animeuser['popularity']
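Putting both points together, the filter from the question becomes:
animeuser = animeuser[(animeuser['popularity'] >= 3000) | (animeuser['rank'] >= 3000)]
Bracket notation sidesteps the DataFrame.rank method, and no astype cast is needed because both columns are already numeric.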

Lambda - apply combination on pandas dataframe

Starting from the following dataframe:
I created the following function:
def campaign_name(name, ID, prefix):
    campaign = "S[" + prefix + ID + "]: " + name
    return campaign
I would like to use it in a dataframe like this:
keywords_merge_temporary["campaign name"] = keywords_merge_temporary.apply(lambda x: campaign_name(x.name,x.id,x.prefix), axis=1)
The problem is that, for some reason, I get the following error, which I never had in the past when using this kind of lambda + apply combination:
<ipython-input-...> in <module>()
----> 1 keywords_merge_temporary["campaign name"] = keywords_merge_temporary.apply(lambda z: campaign_name(z.name,z.id,z.prefix), axis=1)
/Users/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds) 4150 if reduce is None: 4151 reduce = True
-> 4152 return self._apply_standard(f, axis, reduce=reduce) 4153 else: 4154 return self._apply_broadcast(f, axis)
/Users/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in
_apply_standard(self, func, axis, ignore_failures, reduce) 4246 try: 4247 for i, v in enumerate(series_gen):
-> 4248 results[i] = func(v) 4249 keys.append(v.name) 4250 except Exception as e:
<ipython-input-...> in <lambda>(z)
----> 1 keywords_merge_temporary["campaign name"] = keywords_merge_temporary.apply(lambda z: campaign_name(z.name,z.id,z.prefix), axis=1)
<ipython-input-52-f727ebf9b9ee> in campaign_name(name, ID, prefix)
1 def campaign_name(name,ID,prefix):
----> 2 campaign = "S[" + prefix + ID + "]: " + name
3 return campaign
TypeError: ("ufunc 'add' did not contain a loop with signature matching types dtype('<U21') dtype('<U21') dtype('<U21')", 'occurred at index 0')
