drop a DataFrame column in python - python

I desperately need help here. I am trying to get the dimensions of a DataFrame, but I always get 31 columns instead of 30: "Value should be 30, found 31". I tried reset_index(drop=True), but I still get the same error. Any help is appreciated. Stay safe.
def _move_column_to_front(frame, column):
    """Move ``column`` to position 0 of ``frame``, modifying ``frame`` in place."""
    values = frame[column]
    frame.drop(labels=[column], axis=1, inplace=True)
    frame.insert(0, column, values)


def read_data(dataset_id):
    """Load one of the supported datasets into a pandas DataFrame.

    The data files are read by hard-coded name relative to the current
    working directory.  For every dataset except 'breast_cancer' the class
    label column is explicitly moved to position 0; for 'breast_cancer' the
    'diagnosis' column is already first once 'ptid' has been dropped.  The
    first 20 rows of the result are printed before returning.

    Args:
        dataset_id: One of 'breast_cancer', 'hyperthyroidism',
            'cervical_cancer' or 'liver_cancer'.

    Returns:
        The loaded DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``dataset_id`` is not one of the supported names.
    """
    data = None
    # Begin CODE
    if dataset_id == 'breast_cancer':
        datafile = 'wdbc.data'
        # NOTE(review): 'mean_symmetry ', 'std_err_symmetry ' and
        # 'worst_symmetry ' carry a trailing space.  They are kept as-is
        # because callers may index by these exact names -- confirm.
        bc_columns = [
            'ptid', 'diagnosis', 'mean_radius', 'mean_texture',
            'mean_perimeter', 'mean_area',
            'mean_smoothness', 'mean_compactness', 'mean_concavity',
            'mean_concave_pts', 'mean_symmetry ',
            'mean_fractal_dim', 'std_err_radius', 'std_err_texture',
            'std_err_perimeter', 'std_err_area',
            'std_err_smoothness', 'std_err_compactness',
            'std_err_concavity', 'std_err_concave_pts',
            'std_err_symmetry ', 'std_err_fractal_dim', 'worst_radius',
            'worst_texture', 'worst_perimeter',
            'worst_area', 'worst_smoothness', 'worst_compactness',
            'worst_concavity', 'worst_concave_pts',
            'worst_symmetry ', 'worst_fractal_dim',
        ]
        data = pd.read_csv(datafile, skipinitialspace=True, names=bc_columns)
        # Only the patient id is dropped, so the frame keeps 31 columns:
        # 'diagnosis' plus the 30 measurements.
        data.drop(labels=['ptid'], axis=1, inplace=True)
        # The return value is not used here; the call is kept in case the
        # helper (defined elsewhere) has side effects -- TODO confirm.
        get_class_list_dict(data['diagnosis'])
    elif dataset_id == 'hyperthyroidism':
        datafile1 = 'allhyper.data'  # tab delimited, no header
        datafile2 = 'allhyper.test'  # comma delimited, no header
        ht_columns = [
            'age', 'Gender', 'on thyroxine', 'query on thyroxine',
            'on antithyroid medication', 'sick',
            'pregnant', 'thyroid surgery', 'I131 treatment',
            'query hypothyroid', 'query hyperthyroid',
            'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
            'TSH measured', 'TSH', 'T3 measured',
            'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
            'FTI measured', 'FTI', 'TBG measured', 'TBG',
            'referral source', 'diag_class',
        ]
        data1 = pd.read_csv(datafile1, sep='\t', skipinitialspace=True,
                            names=ht_columns)
        data2 = pd.read_csv(datafile2, skipinitialspace=True,
                            names=ht_columns)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the two files.
        data = pd.concat([data1, data2], ignore_index=True)
        data = data.replace(to_replace='?', value=float('nan'))
        # The last field is split on the literal ".|" into the class label
        # and a second part stored as 'ptid'.  A raw string with an escaped
        # dot avoids the invalid '\|' escape and matches only a real dot.
        data[['diag_class', 'ptid']] = data['diag_class'].str.split(
            pat=r'\.\|', expand=True)
        data.drop(labels=['ptid'], axis=1, inplace=True)
        _move_column_to_front(data, 'diag_class')
        ht_numeric = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
        data[ht_numeric] = data[ht_numeric].apply(pd.to_numeric)
    elif dataset_id == 'cervical_cancer':
        datafile = 'risk_factors_cervical_cancer.csv'
        cc_columns = (
            'Age', 'Num_sex_partners', 'First_sex_intercourse',
            'Num_pregnancies',
            'Smokes', 'Smokes_years', 'Smokes_packs_year',
            'Hormonal_Contraceps',
            'Hormonal_Contraceps_years', 'IUD', 'IUD_years', 'STD',
            'STD_number',
            'STD_condylomatosis', 'STDscervical_condylomatosis',
            'STD_vaginal_condylomatosis',
            'STD_vulvo_perin_condylomatosis', 'STD_syphilis',
            'STD_pelvic_inflam_disease',
            'STD_genital_herpes', 'STD_molluscum_contagiosum',
            'STD_AIDS', 'STD_HIV', 'STD_HepB',
            'STD_HPV', 'STD_Num_diagnosis',
            'STD_Time_since_first_diag', 'STDs_Time_since_last_diag',
            'Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann', 'Schiller',
            'Citology', 'Biopsy',
        )
        # The file's own header row is read and then replaced by the
        # shorter names above.
        data = pd.read_csv(datafile, skipinitialspace=True)
        data.columns = cc_columns
        data = data.replace(to_replace='?', value=float('nan'))
        # 'Biopsy' is kept as the label; the other trailing columns are
        # discarded.
        data.drop(labels=['Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx',
                          'Hinselmann', 'Schiller', 'Citology'],
                  axis=1, inplace=True)
        _move_column_to_front(data, 'Biopsy')
        cc_numeric = [
            'Num_sex_partners', 'First_sex_intercourse', 'Num_pregnancies',
            'Smokes_years', 'Smokes_packs_year',
            'Hormonal_Contraceps_years', 'IUD_years',
            'STD_number', 'STD_Time_since_first_diag',
            'STDs_Time_since_last_diag',
        ]
        data[cc_numeric] = data[cc_numeric].apply(pd.to_numeric)
    elif dataset_id == 'liver_cancer':
        # comma delimited, no header
        datafile = 'Indian Liver Patient Dataset (ILPD).csv'
        ld_columns = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot',
                      'TP', 'ALB', 'A/G Ratio', 'Selector']
        data = pd.read_csv(datafile, skipinitialspace=True, names=ld_columns)
        # Abbreviate the gender labels.
        data.loc[data['Gender'] == 'Male', 'Gender'] = 'M'
        data.loc[data['Gender'] == 'Female', 'Gender'] = 'F'
        _move_column_to_front(data, 'Selector')
    else:
        # Previously an unknown id fell through with data = None and failed
        # later with an opaque AttributeError.
        raise ValueError('unknown dataset_id: {!r}'.format(dataset_id))
    data.reset_index(drop=True, inplace=True)
    # End CODE
    print(data.head(20))
    return data
def dimensions(dataset_id, dataset):
    """Return the dimensions of a dataset as ``(rows, columns)``.

    Every column of the frame is counted, including a class-label column
    if the frame has one.

    Args:
        dataset_id: Identifier of the dataset.  Not used by the
            computation; kept so the signature stays unchanged.
        dataset: A pandas DataFrame.

    Returns:
        A tuple ``(num_inst, num_feat)``: the number of rows and the number
        of columns of ``dataset``.
    """
    # DataFrame.shape gives both counts directly and, unlike inspecting the
    # first row with iloc[0], also works when the frame has no rows.
    num_inst, num_feat = dataset.shape
    return (num_inst, num_feat)

To drop a column from a DataFrame, use DataFrame.drop with axis=1. It returns a new DataFrame, so assign the result back (or pass inplace=True).
To drop a single column:
df.drop(['column_name'], axis = 1)
If you want to drop multiple columns:
df.drop(['Column1', 'Column2'], axis = 1)
If you want to drop columns based on some other condition instead of the column name, leave a comment below and I'll update the answer accordingly. Hope it helps!

Related

Removing utc info from yfinance dataframe

How can I remove the UTC portion of a DataFrame created from yfinance? Every example and approach I have seen has failed.
eg:
df = yf.download('2022-01-01', '2023-01-06', interval = '60m' )
pd.to_datetime(df['Datetime'])
error: 3806 #If we have a listlike key, _check_indexing_error will raise
KeyError: 'Datetime'
As well as the following approaches:
df = df.reset_index()
df = pd.DataFrame(df, columns = ['Datetime', "Close"])
df.rename(columns = {'Date': 'ds'}, inplace = True)
df.rename(columns = {'Close':'y'}, inplace = True)
#df['ds'] = df['ds'].dt.date
#df['ds'] = datetime.fromtimestamp(df['ds'], tz = None)
#df['ds'] = df['ds'].dt.floor("Min")
#df['ds'] = pd.to_datetime(df['ds'].dt.tz_convert(None))
#df['ds'] = pd.to_datetime['ds']
#pd.to_datetime(df['ds'])
df['ds'].dt.tz_localize(None)
print(df)
All of these fail with similar errors. Any help or pointers will be greatly appreciated; I have spent the entire morning on this.
Thanks in advance
BTT
Your code interprets '2022-01-01' as the first and required argument tickers.
This date is not a valid ticker, so yf.download() does not return any price and volume data.
Try:
df = yf.download(tickers='AAPL', start='2022-01-01', end='2023-01-06', interval = '60m' )
df.index = df.index.tz_localize(None)

Deleting empty columns with a few columns between data

I'm fetching data from a Google sheet:
values1 = pd.DataFrame(values)
aux = values1.head(1)
values1.drop(index={0}, inplace=True)
senal1 = (values1[2] == "SEÑAL")
senal = values1[senal1]
senal.dropna(axis=1, inplace=True)
print(senal)
This is my result after running the code:

How can I convert .append to .concat pandas python

I have this data entry:
[{'id': 2269396, 'from': 1647086100, 'at': 1647086160000000000, 'to': 1647086160, 'open': 1.072652, 'close': 1.072691, 'min': 1.072641, 'max': 1.072701, 'volume': 0},..]
Applying this pandas indexing:
current = self.getAllCandles(self.active_id,start_candle)
main = pd.DataFrame()
useful_frame = pd.DataFrame()
for candle in current:
useful_frame = pd.DataFrame(list(candle.values()),index = list(candle.keys())).T.drop(columns = ['at'])
useful_frame = useful_frame.set_index(useful_frame['from']).drop(columns = ['id'])
main = main.append(useful_frame)
main.drop_duplicates()
final_data = main.drop(columns = {'to'})
final_data = final_data.loc[~final_data.index.duplicated(keep = 'first')]
return final_data
After that I have the following result:
from open close min max volume
from
1.647086e+09 1.647086e+09 1.072652 1.072691 1.072641 1.072701 0.0
... ... ... ... ... ... ...
Since DataFrame.append() is deprecated (and removed in pandas 2.0), I'm trying to execute the same instructions using pd.concat(), but I can't get it to work. How should I change the code?
Thank you all. I made a small modification to the code suggested by our friend Stuart Berg (@stuart-berg), and it worked perfectly:
current = self.getAllCandles(self.active_id, start_candle)
frames = []
useful_frame = pd.DataFrame.from_dict(current, orient='columns')
useful_frame = useful_frame.set_index('from')
useful_frame = useful_frame.drop(columns=['at', 'id'])
frames.append(useful_frame)
main = pd.concat(frames).drop_duplicates()
final_data = main.drop(columns='to')
final_data = final_data.loc[~final_data.index.duplicated()]
return final_data
I think this is what you're looking for:
current = self.getAllCandles(self.active_id, start_candle)
frames = []
for candle in current:
useful_frame = pd.DataFrame.from_dict(candle, orient='columns')
#useful_frame['from'] = datetime.datetime.fromtimestamp(int(useful_frame['from'])).strftime('%Y-%m-%d %H:%M:%S')
useful_frame = useful_frame.set_index('from')
useful_frame = useful_frame.drop(columns=['at', 'id'])
frames.append(useful_frame)
main = pd.concat(frames).drop_duplicates()
final_data = main.drop(columns='to')
final_data = final_data.loc[~final_data.index.duplicated()]
Create an empty python list and then append all the series to the list. Finally call pandas' concat on that list, this will give you that dataframe.

Panda DataFrame Row Items IF Comparison doesnt return correct result

I retrieve data from quandl and load it to a pandas DF object.
Afterwards I calculate SMA values (SMA21, SMA55) based on "Last Price".
Then I add those SMA values as columns to my DF object.
I iterate through the DF to catch a buy signal.
I know the buy condition holds true for some dates, but my code does not print anything out. I am expecting it to print the buy condition at the very least.
as below you can see the following condition:
kitem['SMA21'] >= kitem['Last']
My code:
import requests
import pandas as pd
import json
class URL_Params:
    """Hold the components of a dataset request and build its URL.

    Attributes are stored exactly as passed to the constructor.  All five
    are expected to be strings, because ``createURL`` joins them with ``+``.
    """

    def __init__(self, endPoint, symboll, startDate, endDate, apiKey):
        """Store the URL components.

        Args:
            endPoint: Base URL that the symbol is appended to.
            symboll: Symbol appended directly after ``endPoint``.
            startDate: Value of the ``start_date`` query parameter.
            endDate: Value of the ``end_date`` query parameter.
            apiKey: Value of the ``api_key`` query parameter.
        """
        self.endPoint = endPoint
        self.symboll = symboll
        self.startDate = startDate
        self.endDate = endDate
        self.apiKey = apiKey
        # add_url referenced this attribute but nothing ever created it,
        # so the method always raised AttributeError.
        self.url_list = []

    def createURL(self):
        """Return the request URL assembled from the stored components."""
        return (self.endPoint + self.symboll
                + '?start_date=' + self.startDate
                + '&end_date=' + self.endDate
                + '&api_key=' + self.apiKey)

    def add_url(self, _url):
        """Append ``_url`` to ``self.url_list``."""
        self.url_list.append(_url)
# Script: download price history for several markets, build a frame for
# XRPBTC with simple moving averages of 'Last', then scan the rows for
# buy/sell signals.
my_portfolio = {'BTC':1.0, 'XRP':0, 'DSH':0, 'XMR':0, 'TotalBTCValue':1.0}
_endPoint = 'https://www.quandl.com/api/v3/datasets/BITFINEX/'
_symbolls = ['BTCEUR','XRPBTC','DSHBTC','IOTBTC','XMRBTC']
_startDate = '2017-01-01'
_endDate = '2019-03-01'
_apiKey = '' #needs to be set for quandl
my_data = {}
# my_conns, my_col_names and orderbook are not used anywhere in this snippet.
my_conns = {}
my_col_names = ['Date', 'High', 'Low', 'Mid', 'Last', 'Bid', 'Ask', 'Volume']
orderbook = []
#create connection and load data for each pair/market.
#load them in a dict for later use
for idx_symbol in _symbolls:
my_url_params = URL_Params(_endPoint,idx_symbol,_startDate,_endDate,_apiKey)
response = requests.get(my_url_params.createURL())
# The response body is parsed as JSON and stored under the symbol name.
my_data[idx_symbol] = json.loads(response.text)
#Prepare Data
# Rows come from ['dataset']['data'], column labels from ['dataset']['column_names'].
my_raw_data_df_xrpbtc = pd.DataFrame(my_data['XRPBTC']['dataset']['data'], columns= my_data['XRPBTC']['dataset']['column_names'])
#Set Index to Date Column and Sort
my_raw_data_df_xrpbtc['Date'] = pd.to_datetime(my_raw_data_df_xrpbtc['Date'])
my_raw_data_df_xrpbtc.index = my_raw_data_df_xrpbtc['Date']
my_raw_data_df_xrpbtc = my_raw_data_df_xrpbtc.sort_index()
#Drop unrelated columns
# 'Date' is dropped as a column only; it remains available as the index.
my_raw_data_df_xrpbtc.drop(['Date'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Ask'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Bid'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Low'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['High'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Mid'], axis=1, inplace=True)
#Calculate SMA values to create buy-sell signal
# Rolling means over 21, 55 and 200 rows of 'Last'; a window that is not yet
# full yields NaN, and any comparison against NaN is False.
my_raw_data_df_xrpbtc['SMA21'] = my_raw_data_df_xrpbtc['Last'].rolling(21).mean()
my_raw_data_df_xrpbtc['SMA55'] = my_raw_data_df_xrpbtc['Last'].rolling(55).mean()
my_raw_data_df_xrpbtc['SMA200'] = my_raw_data_df_xrpbtc['Last'].rolling(200).mean()
#Check for each day if buy signal holds BUY if sell signal holds SELL
# NOTE(review): `is True` is an identity test.  The comparison below presumably
# yields a numpy.bool_ rather than the built-in True, so the test is False even
# when the comparison holds -- this is why nothing is printed.
for idx,kitem in my_raw_data_df_xrpbtc.iterrows():
if (kitem['SMA21'] >= kitem['Last']) is True: #buy signal
print("buy0")
# NOTE(review): without parentheses this is a chained comparison,
# (my_portfolio['BTC'] > 0) and (0 is True), which is always False.
if my_portfolio['BTC'] > 0 is True:
print("buy1")
# NOTE(review): also chained: (lhs >= rhs) and (rhs is True); `rhs is True` is
# never true for a number, so this branch cannot be taken.
if (kitem['Last'] * my_portfolio['XRP']) >= (my_portfolio['BTC'] * 1.05) is True: #sell signal
print("sell0")
# NOTE(review): same chained-comparison problem as the 'BTC' check above.
if my_portfolio['XRP'] > 0 is True:
print("sell1")
I know that there are lots of rows that holds true but my code never enters this path of code so it does not print out what I expect.
Could anyone please help/comment what might be wrong?
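The reason is that your comparison is wrong. The result of kitem['SMA21'] >= kitem['Last'] is a numpy.bool_, not the built-in True object, so an identity test with `is True` fails even when the value is truthy.
If you change the comparison to == (or simply drop the comparison to True and write `if kitem['SMA21'] >= kitem['Last']:`) it will work as expected. Also note that the unparenthesized checks such as my_portfolio['BTC'] > 0 is True are chained comparisons, evaluated as (my_portfolio['BTC'] > 0) and (0 is True), so they are always False; remove the `is True` there as well: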
if (kitem['SMA21'] >= kitem['Last']) == True:

Check the 1st row of a column against all rows of another in Python (pandas)

I have two text files. The first file contains these columns:
['Start time', 'End Time', 'Activity']
second file contains
['Start time', 'End Time', 'Location', 'Type', 'Place'].
I want to check for example
Start_time1[0] with all rows of Start_time2.
The code is
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 09 15:04:08 2017
#author: Owner
"""
# Python 2 script (print statements, xrange): compares every activity row of
# the ADLs file with every sensor row of the Sensors file and reports which
# sensor location is active for matching rows.
import pandas as pd
#import matplotlib.pyplot as plt
import time
import datetime
import numpy as np
# Fields are separated by one or more tabs, optionally surrounded by spaces;
# a regex delimiter requires the python parsing engine.
df = pd.read_csv("OrdonezA_ADLs.txt", header = None, delimiter=' *\t+ *', engine='python')
df.columns=['Start time', 'End Time', 'Activity']
df2 = pd.read_csv("OrdonezA_Sensors.txt", header = None, delimiter=' *\t+ *', engine='python')
df2.columns=['Start time', 'End Time', 'Location', 'Type', 'Place']
#print df2.head()
print df['Start time'][0]
print df2['Location'][0]
# Convert the "%Y-%m-%d %H:%M:%S" strings of both frames to epoch seconds
# (time.mktime interprets the time tuple as local time).
df['Start time'] = df.apply(lambda row: time.mktime(datetime.datetime.strptime(row['Start time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1)
df['End Time'] = df.apply(lambda row: time.mktime(datetime.datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1)
#print df
df2['Start time'] = df2.apply(lambda row: time.mktime(datetime.datetime.strptime(row['Start time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1)
df2['End Time'] = df2.apply(lambda row: time.mktime(datetime.datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple()), axis=1)
#print df2
# One row per activity and 12 columns, one per sensor location tested below
# (see the commented-out p[x, k] assignments).
p = np.zeros(shape=(len(df),12))
print p
# Nested scan: x indexes the activity rows (df), y the sensor rows (df2).
for x in xrange(len(df)):
for y in xrange(len(df2)):
# A pair matches when the start times are equal and the activity ends no
# later than the sensor event.
if (df['Start time'][x] == df2['Start time'][y] and df['End Time'][x] <= df2['End Time'][y]):
# NOTE(review): 'Location' is read with x (the activity index) in every
# branch below, although the matched sensor row is y -- presumably
# df2['Location'][y] was intended; confirm.
if df2['Location'][x] == 'Shower':
print "Shower on"
# p[x,0]=1 here i want to add 1 in first col of p,in x row of p
elif df2['Location'][x]=='Basin':
print "Basin on"
# p[x,1]=1
elif df2['Location'][x]=='Door Kitchen':
print "Door Kitchen on"
# p[x,2]=1
elif df2['Location'][x]=='Door Bathroom':
print "Door Bathroom on"
# p[x,3]=1
elif df2['Location'][x]=='Maindoor':
print "Maindoor on"
# p[x,4]=1
elif df2['Location'][x]=='Fridge':
print "Fridge on"
# p[x,5]=1
elif df2['Location'][x]=='Cupboard':
print "Cupboard on"
# p[x,6]=1
elif df2['Location'][x]=='Toilet':
print "Toilet on"
# p[x,7]=1
elif df2['Location'][x]=='Seat':
print "Seat on"
# p[x,8]=1
elif df2['Location'][x]=='Bed':
print "Bed on"
# p[x,9]=1
elif df2['Location'][x]=='Microwave':
print "Microwave on"
# p[x,10]=1
elif df2['Location'][x]=='Door Bedroom':
print "Door Bedroom on"
# p[x,11]=1
else:
print ("not")
Can anyone help me please? Thanks a lot.
I see that you transform the date strings to datetime and then to timestamps.
Use merge with the inner option (how='inner') to extract only the rows whose values match in both of the columns you are comparing.
The code will be something like:
# Python 2 example: an inner merge on two key columns keeps only the rows
# whose keys match in both frames.
import pandas as pd
# Toy frames: the (Start_time, End_time) pairs are (1,1), (2,2), (3,3) in df1
# and (1,3), (2,2), (3,1) in df2, so only (2,2) is common to both.
df1 = pd.DataFrame({'Start_time': [1,2,3], 'End_time': [1,2,3], 'Activity': [4, 5, 9]})
df2 = pd.DataFrame({'Start_time': [1,2,3], 'End_time': [3,2,1], 'Location': ['x','y','z'], 'Type': [7,8,9], 'Place': ['a','b','c']})
# how='inner' discards rows without a partner; both frames use the same key
# column names for left_on and right_on.
df = pd.merge(df1, df2, how='inner', left_on=['Start_time','End_time'], right_on=['Start_time','End_time'], left_index=False, right_index=False, sort=False)
# Check the 'Location' of each surviving row; with the data above the single
# merged row has Location 'y'.
for i in df['Location']:
if(i=='y'):
print 'Ok'
else:
print 'Error'
Where only one row has 'Start_time' and 'End_time' in common.

Categories

Resources