In the following code, I try to define a function and then apply it to a dataframe to reset the geozone.
import pandas as pd
testdata ={'country': ['USA','AUT','CHE','ABC'], 'geozone':[0,0,0,0]}
d =pd.DataFrame.from_dict(testdata, orient = 'columns')
def setgeozone(dataframe, dcountry, dgeozone):
dataframe.loc[dataframe['dcountry'].isin(['USA','CAN']),'dgeozone'] =1
dataframe.loc[dataframe['dcountry'].isin(['AUT','BEL']),'dgeozone'] =2
dataframe.loc[dataframe['dcountry'].isin(['CHE','DNK']),'dgeozone'] =3
setgeozone(d, country, geozone)
I got an error message saying:
Traceback (most recent call last):
File "<ipython-input-56-98dad4781f73>", line 1, in <module>
setgeozone(d, country, geozone)
NameError: name 'country' is not defined
Can someone help me understand what I did wrong?
Many thanks.
You don't need to pass parameters other than the DataFrame itself to your function. Try this:
def setgeozone(df):
    """Set the ``geozone`` column of ``df`` in place from its ``country`` column.

    Rows whose country is USA/CAN get geozone 1, AUT/BEL get 2 and
    CHE/DNK get 3. Rows with any other country are left untouched.

    Args:
        df: DataFrame with a ``country`` column. It is modified in place.

    Returns:
        None.
    """
    # One (zone, countries) pair per rule keeps the rules in a single table
    # instead of three nearly identical statements.
    zone_rules = (
        (1, ['USA', 'CAN']),
        (2, ['AUT', 'BEL']),
        (3, ['CHE', 'DNK']),
    )
    for zone, countries in zone_rules:
        df.loc[df['country'].isin(countries), 'geozone'] = zone
setgeozone(df)
Here are two other (and better) ways to accomplish what you need:
Use map:
df["geozone"] = df["country"].map({"USA": 1, "CAN": 1, "AUT": 2, "BEL": 2, "CHE": 3, "DNK": 3})
Use numpy.select:
import numpy as np
df["geozone"] = np.select([df["country"].isin(["USA", "CAN"]), df["country"].isin(["AUT", "BEL"]), df["country"].isin(["CHE", "DNK"])],
[1, 2, 3])
Related
I am using the following code to print a ratio by applying a function, but I am getting the following error.
Code
import investpy
import pandas as pd
import numpy as np
import sys
def main(stock1_name, stock2_name):
stock1 = investpy.get_stock_historical_data(stock=stock1_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
stock2 = investpy.get_stock_historical_data(stock=stock2_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
new_df = pd.merge(stock1, stock2, on='Date')
new_df = new_df.drop(['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x', 'Low_y','Volume_y', 'Currency_y', 'Open_y', 'High_y'], axis = 1)
new_df['ratio'] = np.log10(new_df['Close_x']/new_df['Close_y'])
return new_df
x = main("IOC","HPCL")
print(x)
Error
NameError Traceback (most recent call last)
<ipython-input-2-c17535375449> in <module>
12 return new_df
13 x = main("IOC","HPCL")
---> 14 print(x)
NameError: name 'x' is not defined
You are calling x = main("IOC","HPCL") inside the function main.
This makes x defined only inside the scope of the function main.
When you call print(x) outside the function main, the interpreter raises an error, as it should, because x is not defined there.
Does this correction solve the issue?
import investpy
import pandas as pd
import numpy as np
import sys
def main(stock1_name, stock2_name, *, country='india',
         from_date='01/01/2020', to_date='08/03/2021'):
    """Return the merged price history of two stocks with a log10 close ratio.

    Historical data for both stocks is fetched with
    ``investpy.get_stock_historical_data``, merged on ``Date``, stripped of
    every non-close column, and extended with a ``ratio`` column holding
    ``log10(Close_x / Close_y)``.

    Args:
        stock1_name: Symbol of the first stock (numerator of the ratio).
        stock2_name: Symbol of the second stock (denominator of the ratio).
        country: Passed through to investpy; defaults to ``'india'``.
        from_date: Start date string passed through to investpy.
        to_date: End date string passed through to investpy.

    Returns:
        The merged DataFrame after dropping the open/high/low/volume/currency
        columns of both stocks, with the added ``ratio`` column.
    """
    # Same request for both symbols; only the stock name differs.
    stock1, stock2 = (
        investpy.get_stock_historical_data(
            stock=name, country=country, from_date=from_date, to_date=to_date
        )
        for name in (stock1_name, stock2_name)
    )
    new_df = pd.merge(stock1, stock2, on='Date')
    new_df = new_df.drop(
        ['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x',
         'Low_y', 'Volume_y', 'Currency_y', 'Open_y', 'High_y'],
        axis=1,
    )
    new_df['ratio'] = np.log10(new_df['Close_x'] / new_df['Close_y'])
    return new_df
x = main("IOC","HPCL") # Edit moving this line outside the function main()
print(x)
I am writing a short piece of code to remove web browser version numbers from the name in a column of data in a pandas dataframe. i.e. replace a string containing alpha and numerical characters with just the alpha characters.
I have written:
df_new=(rename())
str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
I am getting this error message and I don't understand what it's telling me
AttributeError Traceback (most recent call last)
<ipython-input-4-d8c6f9119b9f> in <module>
3 df_new=(rename())
4
----> 5 str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
AttributeError: 'method_descriptor' object has no attribute 'df_new'
The code above is following this code/function in a Jupyter Notebook
import pandas as pd
import numpy as np
import re
#write a function to change column names using a dictionary
def rename(path='dummy_webchat_data.csv'):
    """Read the web-chat CSV and return it with renamed columns.

    Args:
        path: File path or file-like object handed to ``pd.read_csv``.
            Defaults to the file name that was previously hard-coded, so
            ``rename()`` keeps working unchanged.

    Returns:
        A new DataFrame in which 'Browser', 'Page', 'URL' and 'Visitor ID'
        are renamed to 'new_browser', 'web_page', 'web_address' and
        'visitor_ID'. Any other columns keep their names.
    """
    dict_col = {'Browser':'new_browser', 'Page':'web_page', 'URL':'web_address', 'Visitor ID':'visitor_ID'}
    df = pd.read_csv(path)
    # DataFrame.rename skips mapping keys that are not columns, so no
    # per-column loop is needed and a result exists even if nothing matches.
    return df.rename(columns=dict_col)
rename()
I've been having trouble with the dataframe updates not being recognised when I next call it. Usually in JN I just keep writing the amends to the df and it retains the updates. But even the code df_new.head(1) needs to be written like this to work after the first function is run for some reason (mentioning as it feels like a similar problem even though the error messages are different):
df_new=(rename())
df_new.head(1)
can anyone help me please?
Best
Miriam
The error tells you that you are not using the Series.str.replace() method correctly.
You have:
str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
when what you want is:
df_new[new_browser].str.replace('[.*0-9]',"",regex=True)
See this:
>>> import pandas as pd
>>>
>>> s = pd.Series(['a1', 'b2', 'c3', 'd4'])
>>> s.str.replace('[.*0-9]',"",regex=True)
0 a
1 b
2 c
3 d
dtype: object
and compare it with this (which is what you get, s here is your df_new[new_browser]):
>>> import pandas as pd
>>>
>>> s = pd.Series(['a1', 'b2', 'c3', 'd4'])
>>> str.replace.s('[.*0-9]',"",regex=True)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'method_descriptor' object has no attribute 's'
When I try to remove some elements which satisfy a particular condition, Python throws the following error:
TypeError Traceback (most recent call last)
<ipython-input-25-93addf38c9f9> in <module>()
4
5 df = pd.read_csv('fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv');
----> 6 df = filter(df,~('-02-29' in df['Date']))
7 '''tmax = []; tmin = []
8 for dates in df['Date']:
TypeError: 'int' object is not iterable
The following is the code :
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data/C2A2_data/BinnedCsvs_d400/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv');
df = filter(df,~('-02-29' in df['Date']))
What could I be doing wrong?
Following is sample data
Sample Data
Use df.filter() (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html)
Also please attach the csv so we can run it locally.
Another way to do this is to use one of pandas' string methods for Boolean indexing:
df = df[~ df['Date'].str.contains('-02-29')]
You will still have to make sure that all the dates are actually strings first.
Edit:
Seeing the picture of your data, maybe this is what you want (slashes instead of hyphens):
df = df[~ df['Date'].str.contains('/02/29')]
Below is the problem, the code and the error that arises. top_10_movies has two columns, which are rating and name.
import babypandas as bpd
top_10_movies = top_10_movies = bpd.DataFrame().assign(
Rating = top_10_movie_ratings,
Name = top_10_movie_names
)
top_10_movies
You can use the assign method to add a column to an already-existing
table, too. Create a new DataFrame called with_ranking by adding a
column named "Ranking" to the table in top_10_movies
import babypandas as bpd
Ranking = my_ranking
with_ranking = top_10_movies.assign(Ranking)
TypeError Traceback (most recent call last)
<ipython-input-41-a56d9c05ae19> in <module>
1 import babypandas as bpd
2 Ranking = my_ranking
----> 3 with_ranking = top_10_movies.assign(Ranking)
TypeError: assign() takes 1 positional argument but 2 were given
When using assign, you have to pass the new column as a keyword argument (the keyword becomes the column name). You can do:
with_ranking = top_10_movies.assign(ranking = Ranking)
Here's a simple example to check:
df = pd.DataFrame({'col': ['a','b']})
ranks = [1, 2]
df.assign(ranks) # causes the same error
df.assign(rank = ranks) # works
According to an article, vectorization is much faster than applying a function to a pandas dataframe column.
But I have a somewhat special case like this:
import pandas as pd
df = pd.DataFrame({'IP': [ '1.0.64.2', '100.23.154.63', '54.62.1.3']})
def compare3rd(ip):
    """Check if the 3rd part of an IP is greater than 100 or not.

    Args:
        ip: A single dotted IP string such as ``'100.23.154.63'``.

    Returns:
        True if the third dot-separated part, read as an integer, is
        greater than 100; False otherwise.
    """
    ip_3rd = ip.split('.')[2]
    # The comparison already yields a bool, so no if/else is needed.
    return int(ip_3rd) > 100
# This works but very slow
df['check_results'] = df.IP.apply(lambda x: compare3rd(x))
print df
# This is supposed to be much faster
# But it doesn't work ...
df['check_results_2'] = compare3rd(df['IP'].values)
print df
The full error traceback looks like this:
Traceback (most recent call last):
File "test.py", line 16, in <module>
df['check_results_2'] = compare3rd(df['IP'].values)
File "test.py", line 6, in compare3rd
ip_3rd = ip.split('.')[2]
AttributeError: 'numpy.ndarray' object has no attribute 'split'
My question is: how do I properly use this vectorization method in this case?
Use the pandas .str accessor, which applies string methods to the whole column:
df.IP.str.split('.').str[2].astype(int)>100
0 False
1 True
2 False
Name: IP, dtype: bool
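Since you mention vectorization, you can also wrap your function with np.vectorize (note that NumPy documents it as a convenience, not a performance feature; it is essentially a loop):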
import numpy as np
np.vectorize(compare3rd)(df.IP.values)
array([False, True, False])