drop a DataFrame column in python - python
I desperately need help here. I am trying to get the dimensions of a DataFrame. I always get 31 columns instead of 30: "Value should be 30, found 31". I tried reset_index(drop=True) but I still get the same error. Any help is appreciated. Stay safe.
def read_data(dataset_id):
    """Load one of the supported disease datasets into a pandas DataFrame.

    For every dataset the class/label column is moved to position 0, '?'
    placeholders are replaced with NaN where the raw files use them, and
    numeric columns read as strings are coerced with ``pd.to_numeric``.

    Parameters
    ----------
    dataset_id : str
        One of ``'breast_cancer'``, ``'hyperthyroidism'``,
        ``'cervical_cancer'`` or ``'liver_cancer'``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded dataset, or ``None`` for an unrecognized ``dataset_id``.
    """
    data = None
    # Begin CODE
    if dataset_id == 'breast_cancer':
        datafile = 'wdbc.data'
        # NOTE(review): trailing spaces were removed from the three
        # '*_symmetry' names below -- they looked unintentional and made
        # column lookups error-prone.
        bc_columns = ['ptid', 'diagnosis', 'mean_radius', 'mean_texture',
                      'mean_perimeter', 'mean_area',
                      'mean_smoothness', 'mean_compactness', 'mean_concavity',
                      'mean_concave_pts', 'mean_symmetry',
                      'mean_fractal_dim', 'std_err_radius', 'std_err_texture',
                      'std_err_perimeter', 'std_err_area',
                      'std_err_smoothness', 'std_err_compactness',
                      'std_err_concavity', 'std_err_concave_pts',
                      'std_err_symmetry', 'std_err_fractal_dim',
                      'worst_radius', 'worst_texture', 'worst_perimeter',
                      'worst_area', 'worst_smoothness', 'worst_compactness',
                      'worst_concavity', 'worst_concave_pts',
                      'worst_symmetry', 'worst_fractal_dim']
        data = pd.read_csv(datafile, skipinitialspace=True, names=bc_columns)
        # The patient id is an identifier, not a feature -- drop it.
        data.drop(labels=['ptid'], axis=1, inplace=True)
        bc_diag_class = get_class_list_dict(data['diagnosis'])
    elif dataset_id == 'hyperthyroidism':
        datafile1 = 'allhyper.data'  # tab delimited, no header
        datafile2 = 'allhyper.test'  # comma delimited, no header
        ht_columns = ['age', 'Gender', 'on thyroxine', 'query on thyroxine',
                      'on antithyroid medication', 'sick',
                      'pregnant', 'thyroid surgery', 'I131 treatment',
                      'query hypothyroid', 'query hyperthyroid',
                      'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
                      'TSH measured', 'TSH', 'T3 measured',
                      'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
                      'FTI measured', 'FTI', 'TBG measured', 'TBG',
                      'referral source', 'diag_class']
        data1 = pd.read_csv(datafile1, sep='\t', skipinitialspace=True,
                            names=ht_columns)
        data2 = pd.read_csv(datafile2, skipinitialspace=True, names=ht_columns)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported equivalent.
        data = pd.concat([data1, data2], ignore_index=True)
        data = data.replace(to_replace='?', value=float('nan'))
        # The raw class field carries a trailing '.|<patient id>' suffix;
        # split it into the class label and the (discarded) patient id.
        data[['diag_class', 'ptid']] = data['diag_class'].str.split(
            pat='.\|', expand=True)
        # Move the class column to the front.
        diag_class = data['diag_class']
        data.drop(labels=['diag_class', 'ptid'], axis=1, inplace=True)
        data.insert(0, 'diag_class', diag_class)
        data[['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']] \
            = data[['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI',
                    'TBG']].apply(pd.to_numeric)
    elif dataset_id == 'cervical_cancer':
        datafile = 'risk_factors_cervical_cancer.csv'
        cc_columns = ('Age', 'Num_sex_partners', 'First_sex_intercourse',
                      'Num_pregnancies',
                      'Smokes', 'Smokes_years', 'Smokes_packs_year',
                      'Hormonal_Contraceps',
                      'Hormonal_Contraceps_years', 'IUD', 'IUD_years', 'STD',
                      'STD_number',
                      'STD_condylomatosis', 'STDscervical_condylomatosis',
                      'STD_vaginal_condylomatosis',
                      'STD_vulvo_perin_condylomatosis', 'STD_syphilis',
                      'STD_pelvic_inflam_disease',
                      'STD_genital_herpes', 'STD_molluscum_contagiosum',
                      'STD_AIDS', 'STD_HIV', 'STD_HepB',
                      'STD_HPV', 'STD_Num_diagnosis',
                      'STD_Time_since_first_diag', 'STDs_Time_since_last_diag',
                      'Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann',
                      'Schiller', 'Citology', 'Biopsy')
        # This file has a header row, so read it and then rename the columns.
        data = pd.read_csv(datafile, skipinitialspace=True)
        data.columns = cc_columns
        data = data.replace(to_replace='?', value=float('nan'))
        # Keep 'Biopsy' as the class column (moved to the front); the other
        # diagnosis/test-result columns are alternative targets -- drop them.
        biopsy_class = data['Biopsy']
        data.drop(labels=['Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann',
                          'Schiller', 'Citology', 'Biopsy'],
                  axis=1, inplace=True)
        data.insert(0, 'Biopsy', biopsy_class)
        data[['Num_sex_partners', 'First_sex_intercourse', 'Num_pregnancies',
              'Smokes_years', 'Smokes_packs_year',
              'Hormonal_Contraceps_years', 'IUD_years',
              'STD_number', 'STD_Time_since_first_diag',
              'STDs_Time_since_last_diag']] \
            = data[['Num_sex_partners', 'First_sex_intercourse',
                    'Num_pregnancies', 'Smokes_years', 'Smokes_packs_year',
                    'Hormonal_Contraceps_years', 'IUD_years',
                    'STD_number', 'STD_Time_since_first_diag',
                    'STDs_Time_since_last_diag']].apply(pd.to_numeric)
    elif dataset_id == 'liver_cancer':
        # comma delimited, no header
        datafile = 'Indian Liver Patient Dataset (ILPD).csv'
        ld_columns = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot',
                      'TP', 'ALB', 'A/G Ratio', 'Selector']
        data = pd.read_csv(datafile, skipinitialspace=True, names=ld_columns)
        # Normalize gender spelling to single-letter codes.
        data.loc[data['Gender'] == 'Male', 'Gender'] = 'M'
        data.loc[data['Gender'] == 'Female', 'Gender'] = 'F'
        # Move the class column ('Selector') to the front.
        selector_class = data['Selector']
        data.drop(labels=['Selector'], axis=1, inplace=True)
        data.insert(0, 'Selector', selector_class)
        data.reset_index(drop=True, inplace=True)
    # End CODE
    print(data.head(20))
    return data
def dimensions(dataset_id, dataset):
    """Return the dimensions of *dataset* as ``(num_instances, num_features)``.

    Parameters
    ----------
    dataset_id : str
        Unused; kept for interface compatibility with the other
        per-dataset helpers.
    dataset : pandas.DataFrame
        The dataset whose shape is reported.

    Returns
    -------
    tuple of int
        ``(number of rows, number of columns)``.
    """
    # DataFrame.shape is the direct (rows, cols) answer.  The previous
    # approach -- len(dataset.iloc[0].reset_index()) -- materialized the
    # first row as a Series, crashed on an empty frame, and was the kind of
    # indirection that produced the off-by-one column count reported above.
    dim = dataset.shape
    return dim
If you want to drop a column from a DataFrame, you can do it like this.
If you want to drop single column:
df.drop(['column_name'], axis = 1)
If you want to drop multiple columns:
df.drop(['Column1', 'Column2'], axis = 1)
If you want to drop based on some other condition instead of the column name, you can comment below and I'll update the answer accordingly. Hope it helps!
Related
Removing utc info from yfinance dataframe
How can I remove the utc portion of a DF created from a yfinance? Every example I and approach I seen has failed. eg: df = yf.download('2022-01-01', '2023-01-06', interval = '60m' ) pd.to_datetime(df['Datetime']) error: 3806 #If we have a listlike key, _check_indexing_error will raise KeyError: 'Datetime' As well as the following approaches enter code heredf = df.reset_index() df = pd.DataFrame(df, columns = ['Datetime', "Close"]) df.rename(columns = {'Date': 'ds'}, inplace = True) df.rename(columns = {'Close':'y'}, inplace = True) #df['ds'] = df['ds'].dt.date #df['ds'] = datetime.fromtimestamp(df['ds'], tz = None) #df['ds'] = df['ds'].dt.floor("Min") #df['ds'] = pd.to_datetime(df['ds'].dt.tz_convert(None)) #df['ds'] = pd.to_datetime['ds'] #pd.to_datetime(df['ds']) df['ds'].dt.tz_localize(None) print(df) with similar errors, Any help or pointer will greatly appreciated I have spent the entire morning on this. Thanks in advance BTT
Your code interprets '2022-01-01' as the first and required argument tickers. This date is not a valid ticker, so yf.download() does not return any price and volume data. Try: df = yf.download(tickers='AAPL', start='2022-01-01', end='2023-01-06', interval = '60m' ) df.index = df.index.tz_localize(None)
Deleting empty columns with a few columns between data
I'm fetching data from a Google sheet: values1 = pd.DataFrame(values) aux = values1.head(1) values1.drop(index={0}, inplace=True) senal1 = (values1[2] == "SEÑAL") senal = values1[senal1] senal.dropna(axis=1, inplace=True) print(senal) This is my result after running the code:
How can I convert .append to .concat pandas python
I have this data entry: [{'id': 2269396, 'from': 1647086100, 'at': 1647086160000000000, 'to': 1647086160, 'open': 1.072652, 'close': 1.072691, 'min': 1.072641, 'max': 1.072701, 'volume': 0},..] Apllying this indexing pandas: current = self.getAllCandles(self.active_id,start_candle) main = pd.DataFrame() useful_frame = pd.DataFrame() for candle in current: useful_frame = pd.DataFrame(list(candle.values()),index = list(candle.keys())).T.drop(columns = ['at']) useful_frame = useful_frame.set_index(useful_frame['from']).drop(columns = ['id']) main = main.append(useful_frame) main.drop_duplicates() final_data = main.drop(columns = {'to'}) final_data = final_data.loc[~final_data.index.duplicated(keep = 'first')] return final_data After that I have the following result: from open close min max volume from 1.647086e+09 1.647086e+09 1.072652 1.072691 1.072641 1.072701 0.0 ... ... ... ... ... ... ... Since df.append() will be deprecated, I'm struggling to execute the same instructions using df.concat(). But I'm not getting it, how could I change that? Thank you all, I made a small modification to the code suggested by our friend Stuart Berg #stuart-berg, and it was perfect: current = self.getAllCandles(self.active_id, start_candle) frames = [] useful_frame = pd.DataFrame.from_dict(current, orient='columns') useful_frame = useful_frame.set_index('from') useful_frame = useful_frame.drop(columns=['at', 'id']) frames.append(useful_frame) main = pd.concat(frames).drop_duplicates() final_data = main.drop(columns='to') final_data = final_data.loc[~final_data.index.duplicated()] return final_data
I think this is what you're looking for: current = self.getAllCandles(self.active_id, start_candle) frames = [] for candle in current: useful_frame = pd.DataFrame.from_dict(candle, orient='columns') #useful_frame['from'] = datetime.datetime.fromtimestamp(int(useful_frame['from'])).strftime('%Y-%m-%d %H:%M:%S') useful_frame = useful_frame.set_index('from') useful_frame = useful_frame.drop(columns=['at', 'id']) frames.append(useful_frame) main = pd.concat(frames).drop_duplicates() final_data = main.drop(columns='to') final_data = final_data.loc[~final_data.index.duplicated()]
Create an empty python list and then append all the series to the list. Finally call pandas' concat on that list, this will give you that dataframe.
Pandas DataFrame row items IF comparison doesn't return the correct result
I retrieve data from quandl and load it to a pandas DF object. Afterwards I calculate SMA values (SMA21, SMA55) based on "Last Price". Adding those SMA values as a column do my DF object. I iterate through DF to catch a buy signal. I know the buy condition is holding true for some dates but my code does not printing anything out. I am expecting to print the buy condition at the very least. as below you can see the following condition: kitem['SMA21'] >= kitem['Last'] My code: import requests import pandas as pd import json class URL_Params: def __init__ (self, endPoint, symboll, startDate, endDate, apiKey): self.endPoint = endPoint self.symboll = symboll self.startDate = startDate self.endDate = endDate self.apiKey = apiKey def createURL (self): return self.endPoint + self.symboll + '?start_date=' + self.startDate + '&end_date=' + self.endDate + '&api_key=' + self.apiKey def add_url(self, _url): self.url_list my_portfolio = {'BTC':1.0, 'XRP':0, 'DSH':0, 'XMR':0, 'TotalBTCValue':1.0} _endPoint = 'https://www.quandl.com/api/v3/datasets/BITFINEX/' _symbolls = ['BTCEUR','XRPBTC','DSHBTC','IOTBTC','XMRBTC'] _startDate = '2017-01-01' _endDate = '2019-03-01' _apiKey = '' #needs to be set for quandl my_data = {} my_conns = {} my_col_names = ['Date', 'High', 'Low', 'Mid', 'Last', 'Bid', 'Ask', 'Volume'] orderbook = [] #create connection and load data for each pair/market. 
#load them in a dict for later use for idx_symbol in _symbolls: my_url_params = URL_Params(_endPoint,idx_symbol,_startDate,_endDate,_apiKey) response = requests.get(my_url_params.createURL()) my_data[idx_symbol] = json.loads(response.text) #Prepare Data my_raw_data_df_xrpbtc = pd.DataFrame(my_data['XRPBTC']['dataset']['data'], columns= my_data['XRPBTC']['dataset']['column_names']) #Set Index to Date Column and Sort my_raw_data_df_xrpbtc['Date'] = pd.to_datetime(my_raw_data_df_xrpbtc['Date']) my_raw_data_df_xrpbtc.index = my_raw_data_df_xrpbtc['Date'] my_raw_data_df_xrpbtc = my_raw_data_df_xrpbtc.sort_index() #Drop unrelated columns my_raw_data_df_xrpbtc.drop(['Date'], axis=1, inplace=True) my_raw_data_df_xrpbtc.drop(['Ask'], axis=1, inplace=True) my_raw_data_df_xrpbtc.drop(['Bid'], axis=1, inplace=True) my_raw_data_df_xrpbtc.drop(['Low'], axis=1, inplace=True) my_raw_data_df_xrpbtc.drop(['High'], axis=1, inplace=True) my_raw_data_df_xrpbtc.drop(['Mid'], axis=1, inplace=True) #Calculate SMA values to create buy-sell signal my_raw_data_df_xrpbtc['SMA21'] = my_raw_data_df_xrpbtc['Last'].rolling(21).mean() my_raw_data_df_xrpbtc['SMA55'] = my_raw_data_df_xrpbtc['Last'].rolling(55).mean() my_raw_data_df_xrpbtc['SMA200'] = my_raw_data_df_xrpbtc['Last'].rolling(200).mean() #Check for each day if buy signal holds BUY if sell signal holds SELL for idx,kitem in my_raw_data_df_xrpbtc.iterrows(): if (kitem['SMA21'] >= kitem['Last']) is True: #buy signal print("buy0") if my_portfolio['BTC'] > 0 is True: print("buy1") if (kitem['Last'] * my_portfolio['XRP']) >= (my_portfolio['BTC'] * 1.05) is True: #sell signal print("sell0") if my_portfolio['XRP'] > 0 is True: print("sell1") I know that there are lots of rows that holds true but my code never enters this path of code so it does not print out what I expect. Could anyone please help/comment what might be wrong?
The reason is that your comparison is wrong. The result of kitem['SMA21'] >= kitem['Last'] will be a numpy.bool_. When you use is to compare it to True this will fail as it is not the same object. If you change the comparison to == it will work as expected: if (kitem['SMA21'] >= kitem['Last']) == True:
Check the 1st row of a column against all rows of another in Python (pandas)
I have two text files. First file contains those cols, ['Start time', 'End Time', 'Activity'] second file contains ['Start time', 'End Time', 'Location', 'Type', 'Place']. I want to check for example Start_time1[0] with all rows of Start_time2. The code is # -*- coding: utf-8 -*- """ Created on Fri Jun 09 15:04:08 2017 #author: Owner """ import pandas as pd #import matplotlib.pyplot as plt import time import datetime import numpy as np df = pd.read_csv("OrdonezA_ADLs.txt", header = None, delimiter=' *\t+ *', engine='python') df.columns=['Start time', 'End Time', 'Activity'] df2 = pd.read_csv("OrdonezA_Sensors.txt", header = None, delimiter=' *\t+ *', engine='python') df2.columns=['Start time', 'End Time', 'Location', 'Type', 'Place'] #print df2.head() print df['Start time'][0] print df2['Location'][0] df['Start time'] = df.apply(lambda row: time.mktime(datetime.datetime.strptime(row['Start time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1) df['End Time'] = df.apply(lambda row: time.mktime(datetime.datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1) #print df df2['Start time'] = df2.apply(lambda row: time.mktime(datetime.datetime.strptime(row['Start time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1) df2['End Time'] = df2.apply(lambda row: time.mktime(datetime.datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1) #print df2 p = np.zeros(shape=(len(df),12)) print p for x in xrange(len(df)): for y in xrange(len(df2)): if (df['Start time'][x] == df2['Start time'][y] and df['End Time'][x] <= df2['End Time'][y]): if df2['Location'][x] == 'Shower': print "Shower on" # p[x,0]=1 here i want to add 1 in first col of p,in x row of p elif df2['Location'][x]=='Basin': print "Basin on" # p[x,1]=1 elif df2['Location'][x]=='Door Kitchen': print "Door Kitchen on" # p[x,2]=1 elif df2['Location'][x]=='Door Bathroom': print "Door Bathroom on" # p[x,3]=1 elif df2['Location'][x]=='Maindoor': print "Maindoor on" # p[x,4]=1 elif 
df2['Location'][x]=='Fridge': print "Fridge on" # p[x,5]=1 elif df2['Location'][x]=='Cupboard': print "Cupboard on" # p[x,6]=1 elif df2['Location'][x]=='Toilet': print "Toilet on" # p[x,7]=1 elif df2['Location'][x]=='Seat': print "Seat on" # p[x,8]=1 elif df2['Location'][x]=='Bed': print "Bed on" # p[x,9]=1 elif df2['Location'][x]=='Microwave': print "Microwave on" # p[x,10]=1 elif df2['Location'][x]=='Door Bedroom': print "Door Bedroom on" # p[x,11]=1 else: print ("not") Can anyone help me please? Thanks a lot.
I see that you transform the date string to datetime and then to timestamp. Use merge with inneroption to extract only the matching cases for both columns that you are looking for. The code will be something like: import pandas as pd df1 = pd.DataFrame({'Start_time': [1,2,3], 'End_time': [1,2,3], 'Activity': [4, 5, 9]}) df2 = pd.DataFrame({'Start_time': [1,2,3], 'End_time': [3,2,1], 'Location': ['x','y','z'], 'Type': [7,8,9], 'Place': ['a','b','c']}) df = pd.merge(df1, df2, how='inner', left_on=['Start_time','End_time'], right_on=['Start_time','End_time'], left_index=False, right_index=False, sort=False) for i in df['Location']: if(i=='y'): print 'Ok' else: print 'Error' Where only one row has 'Start_time' and 'End_time' in common.