from fastai import *
from fastai.tabular import *
from fastai.tabular.all import *
import pandas as pd
# set seed for reproducibility
# NOTE(review): custom_set_seed is neither defined nor explicitly imported in
# this snippet -- presumably a user-defined helper; confirm.
custom_set_seed(42)
# Load the data: first CSV column becomes the index, empty strings are read as missing.
df = pd.read_csv('credit_card_default.csv', index_col=0, na_values='')
df.head()
# Name of the target (label) column.
DEP_VAR = 'default_payment_next_month'
# Continuous features: every numeric column except the target.
num_features = list(df.select_dtypes('number').columns)
num_features.remove(DEP_VAR)
# Categorical features: every object-dtype column.
cat_features = list(df.select_dtypes('object').columns)
# Preprocessing steps handed to fastai through `procs`.
preprocessing = [FillMissing, Categorify, Normalize]
# Build the data bunch: random split with 20% held out for validation,
# labelled by DEP_VAR.
# NOTE: this is the line that raises `NameError: name 'TabularList' is not
# defined`. Per the replies that follow, it works with fastai==1.0.61 and
# `from fastai.tabular.data import TabularList`.
data = (TabularList.from_df(df, cat_names=cat_features, cont_names=num_features, procs=preprocessing).split_by_rand_pct(valid_pct=0.2, seed=42).label_from_df(cols=DEP_VAR).databunch())
I have been trying to run this piece of code but it keeps running into this error:
NameError Traceback (most recent call last)
<ipython-input-42-5ca7e57a8e36> in <module>
1 # Create a TabularDataBunch from the DataFrame
2
----> 3 data = (TabularList.from_df(df, cat_names=cat_features, cont_names=num_features, procs=preprocessing).split_by_rand_pct(valid_pct=0.2, seed=42).label_from_df(cols=DEP_VAR).databunch())
NameError: name 'TabularList' is not defined
I believe I have imported all the modules that were needed. Can someone suggest a solution for this?
Try importing it via its full module path, as below:
from fastai.tabular.data import TabularList
I got this working by installing an older fastai i.e.
pip install fastai==1.0.61
then
from fastai.tabular.data import TabularList
works with no problems.
Related
Here is my working code at the moment
import os as os
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
#extract specific columns from dataframe
# Names of the two factor columns and of the response column used in the
# two-way ANOVA below.
F1_para1 = 'ROI' #x-axis
F2_para2 = 'Diet'
value = 'Time(s)'
# Combined group label of the form "<ROI>+<Diet>", used for the pairwise
# comparison further down.
df['comb'] = df[F1_para1].map(str) + "+" + df[F2_para2].map(str)
# Builds the formula string "Time(s)~ROI+Diet+ROI*Diet" and fits an OLS model.
# NOTE(review): the formula is evaluated like Python code, so `Time(s)` is read
# as a call to a function named `Time` (hence the NameError reported below).
# patsy's Q() quoting, e.g. 'Q("Time(s)")' on the left-hand side, is presumably
# the way to reference such a column name -- confirm against the patsy docs.
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
# NOTE(review): anova_lm's keyword appears to be `typ`, not `type` -- confirm
# that `type=2` is actually honored here.
aov = anova_lm(mod, type=2)
# Tukey HSD across the combined ROI+Diet groups; the HTML summary is parsed
# back into a DataFrame.
comparison=MultiComparison(df[value], df['comb'])
tdf = pd.read_html(comparison.tukeyhsd().summary().as_html())[0] #unfiltered
I am getting the following error on line
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
I presume the error occurs because the formula does not recognize 'Time(s)' on account of the brackets, since it works without them. However, the original dataframe needs to keep those brackets in the column heading.
Error code:
Exception has occurred: PatsyError
Error evaluating factor: NameError: name 'Time' is not defined
Time(s)~ROI+Diet+ROI*Diet
^^^^^^^
File "", line 1, in
NameError: name 'Time' is not defined
The above exception was the direct cause of the following exception:
File "C:\Users\temp.py", line 55, in bar_2para
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
File "C:\Users\temp.py", line 215, in
bar_2para(df,F1_para1,F2_para2,para3,flt_para3a,x,x,x)
patsy.PatsyError: Error evaluating factor: NameError: name 'Time' is not defined
Time(s)~ROI+Diet+ROI*Diet
^^^^^^^
I don't know what to try, as sometimes I also need to include '(%)' as part of the value name in the two-way ANOVA formula.
I am using the following code to print ratio by applying a function, but am getting the following errors.
Code
import investpy
import pandas as pd
import numpy as np
import sys
# NOTE: indentation was lost when this snippet was pasted. Per the traceback
# line numbers and the reply that follows, everything down to and including
# `x = main("IOC","HPCL")` sat inside main() in the original; only `print(x)`
# was at module level.
def main(stock1_name, stock2_name):
# Fetch the price history of both stocks (country 'india', 01/01/2020 to 08/03/2021).
stock1 = investpy.get_stock_historical_data(stock=stock1_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
stock2 = investpy.get_stock_historical_data(stock=stock2_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
# Join the two histories on Date; same-named columns get the _x / _y suffixes.
new_df = pd.merge(stock1, stock2, on='Date')
# Drop the listed Open/High/Low/Volume/Currency columns of both stocks.
new_df = new_df.drop(['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x', 'Low_y','Volume_y', 'Currency_y', 'Open_y', 'High_y'], axis = 1)
# Base-10 logarithm of the ratio of the two closing prices.
new_df['ratio'] = np.log10(new_df['Close_x']/new_df['Close_y'])
return new_df
# In the original this call came after `return` inside main(), so it never ran
# and `x` was never bound at module level.
x = main("IOC","HPCL")
print(x)
Error
NameError Traceback (most recent call last)
<ipython-input-2-c17535375449> in <module>
12 return new_df
13 x = main("IOC","HPCL")
---> 14 print(x)
NameError: name 'x' is not defined
You are calling x = main("IOC","HPCL") inside the function main.
This means x is defined only inside the scope of the function main.
When you call print(x) outside the function main, the interpreter raises an error, as it should, because x is not defined there.
Does this correction solve the issue:
import investpy
import pandas as pd
import numpy as np
import sys
def main(stock1_name, stock2_name, *, country='india',
         from_date='01/01/2020', to_date='08/03/2021'):
    """Return the closing prices of two stocks plus their log10 price ratio.

    Args:
        stock1_name: Symbol of the first stock (numerator of the ratio).
        stock2_name: Symbol of the second stock (denominator of the ratio).
        country: Country passed through to investpy for both lookups.
        from_date: Start date string passed through to investpy.
        to_date: End date string passed through to investpy.

    Returns:
        A DataFrame joined on ``Date`` that keeps the columns not listed in
        the drop below (``Close_x``, ``Close_y``) and adds ``ratio``, the
        base-10 logarithm of ``Close_x / Close_y``.
    """
    stock1 = investpy.get_stock_historical_data(
        stock=stock1_name, country=country,
        from_date=from_date, to_date=to_date)
    stock2 = investpy.get_stock_historical_data(
        stock=stock2_name, country=country,
        from_date=from_date, to_date=to_date)

    # Same-named columns of the two histories receive the _x / _y suffixes.
    new_df = pd.merge(stock1, stock2, on='Date')
    new_df = new_df.drop(
        ['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x',
         'Low_y', 'Volume_y', 'Currency_y', 'Open_y', 'High_y'],
        axis=1,
    )
    new_df['ratio'] = np.log10(new_df['Close_x'] / new_df['Close_y'])
    return new_df
x = main("IOC","HPCL") # Edit moving this line outside the function main()
print(x)
I am writing a short piece of code to remove web browser version numbers from the name in a column of data in a pandas dataframe. i.e. replace a string containing alpha and numerical characters with just the alpha characters.
I have written:
df_new=(rename())
str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
I am getting this error message and I don't understand what it's telling me
AttributeError Traceback (most recent call last)
<ipython-input-4-d8c6f9119b9f> in <module>
3 df_new=(rename())
4
----> 5 str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
AttributeError: 'method_descriptor' object has no attribute 'df_new'
The code above is following this code/function in a Jupyter Notebook
import pandas as pd
import numpy as np
import re
#write a function to change column names using a dictionary
def rename(path='dummy_webchat_data.csv'):
    """Load the web-chat CSV and return it with its columns renamed.

    Args:
        path: CSV file to read; defaults to the file used in the notebook.

    Returns:
        A new DataFrame in which 'Browser', 'Page', 'URL' and 'Visitor ID'
        are renamed to 'new_browser', 'web_page', 'web_address' and
        'visitor_ID'. Columns not in the mapping are left untouched.
    """
    dict_col = {'Browser':'new_browser', 'Page':'web_page', 'URL':'web_address', 'Visitor ID':'visitor_ID'}
    df = pd.read_csv(path)
    # DataFrame.rename skips mapping keys that are not present, so a single
    # call replaces the per-column loop and always yields a result -- even
    # when none of the columns match.
    return df.rename(columns=dict_col)
rename()
I've been having trouble with the dataframe updates not being recognised when I next call it. Usually in JN I just keep writing the amends to the df and it retains the updates. But even the code df_new.head(1) needs to be written like this to work after the first function is run for some reason (mentioning as it feels like a similar problem even though the error messages are different):
df_new=(rename())
df_new.head(1)
can anyone help me please?
Best
Miriam
The error tells you that you are not using the Series.str.replace() method correctly.
You have:
str.replace.df_new[new_browser]('[.*0-9]',"",regex=True)
when what you want is:
# Call .str.replace on the column itself. The column label is the string
# 'new_browser', and the result has to be assigned back to take effect.
df_new['new_browser'] = df_new['new_browser'].str.replace('[.*0-9]',"",regex=True)
See this:
>>> import pandas as pd
>>>
>>> s = pd.Series(['a1', 'b2', 'c3', 'd4'])
>>> s.str.replace('[.*0-9]',"",regex=True)
0 a
1 b
2 c
3 d
dtype: object
and compare it with this (which is what you get, s here is your df_new[new_browser]):
>>> import pandas as pd
>>>
>>> s = pd.Series(['a1', 'b2', 'c3', 'd4'])
>>> str.replace.s('[.*0-9]',"",regex=True)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'method_descriptor' object has no attribute 's'
When I want to remove some elements which satisfy a particular condition, python is throwing up the following error:
TypeError Traceback (most recent call last)
<ipython-input-25-93addf38c9f9> in <module>()
4
5 df = pd.read_csv('fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv;
----> 6 df = filter(df,~('-02-29' in df['Date']))
7 '''tmax = []; tmin = []
8 for dates in df['Date']:
TypeError: 'int' object is not iterable
The following is the code :
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data/C2A2_data/BinnedCsvs_d400/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv');
df = filter(df,~('-02-29' in df['Date']))
What could I be doing wrong?
Following is sample data
Sample Data
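Use df.filter() (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html) rather than Python's built-in filter(). Note that DataFrame.filter selects by index/column labels, not by cell values, so filtering on the contents of 'Date' needs boolean indexing.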
Also please attach the csv so we can run it locally.
Another way to do this is to use one of pandas' string methods for Boolean indexing:
df = df[~ df['Date'].str.contains('-02-29')]
You will still have to make sure that all the dates are actually strings first.
Edit:
Seeing the picture of your data, maybe this is what you want (slashes instead of hyphens):
df = df[~ df['Date'].str.contains('/02/29')]
I am trying to clean some data from a csv file. I need to make sure that whatever is in the 'Duration' category matches a certain format. This is how I went about that:
import re
import pandas as pd
data_path = './ufos.csv'
ufos = pd.read_csv(data_path)
valid_duration = re.compile('^[0-9]+ (seconds|minutes|hours|days)$')
ufos_clean = ufos[valid_duration.match(ufos.Duration)]
ufos_clean.head()
This gives me the following error:
TypeErrorTraceback (most recent call last)
<ipython-input-4-5ebeaec39a83> in <module>()
6
7 valid_duration = re.compile('^[0-9]+ (seconds|minutes|hours|days)$')
----> 8 ufos_clean = ufos[valid_duration.match(ufos.Duration)]
9
10 ufos_clean.head()
TypeError: expected string or buffer
I used a similar method to clean data before without the regular expressions. What am I doing wrong?
Edit:
MaxU got me the closest, but what ended up working was:
valid_duration_RE = '^[0-9]+ (seconds|minutes|hours|days)$'
ufos_clean = ufos
ufos_clean = ufos_clean[ufos.Duration.str.contains(valid_duration_RE)]
There's probably a lot of redundancy in there, I'm pretty new to python, but it worked.
You can use vectorized .str.match() method:
# Anchored pattern: "<digits> <unit>" and nothing else.
valid_duration_RE = '^[0-9]+ (seconds|minutes|hours|days)$'
# Series.str.match applies the regex element-wise. na=False makes rows with a
# missing Duration count as non-matching, so the mask is purely boolean.
ufos_clean = ufos[ufos.Duration.str.match(valid_duration_RE, na=False)]
I guess you want it the other way round (not tested):
import re
import pandas as pd
data_path = './ufos.csv'
ufos = pd.read_csv(data_path)
def cleanit(val):
    """Clean a single ``Duration`` value (placeholder).

    Intended to be applied element-wise, e.g. via
    ``ufos['Duration'].apply(cleanit)``.

    Args:
        val: One entry of the ``Duration`` column.

    Returns:
        None for now; the regex-based cleaning still has to be filled in.
    """
    # your regex solution here
    pass
ufos['ufos_clean'] = ufos['Duration'].apply(cleanit)
After all, ufos is a DataFrame.