In the following code I am generating a new column which has list of names of those columns which are >90 and <10. I have another similar time series dataframe and I want to have those values from second dataframe df_1, which are there in the form list in the first dataframe columns named as df['Top_90'] and df['Below'].
Thanks in advance!
import pandas as pd
from datetime import datetime
import numpy as np
date_rng = pd.date_range(start='1/1/2018', end='1/08/2018', freq='H')
df = pd.DataFrame(date_rng, columns=['Date'])
df['Data_1'] = np.random.randint(0,100,size=(len(date_rng)))
df['Data_2'] = np.random.randint(0,100,size=(len(date_rng)))
df['Data_3'] = np.random.randint(0,100,size=(len(date_rng)))
df['Data_4'] = np.random.randint(0,100,size=(len(date_rng)))
df['Top_90'] = list(map(str, df.apply(lambda x: ','.join(x.index[x > 80]), axis=1)))
df['Below_10'] = list(map(str, df.drop('Top_90', axis=1).apply(lambda x: ','.join(x.index[x > 10]), axis=1)))
date_rng_1 = pd.date_range(start='1/1/2018', end='1/08/2018', freq='H')
df_1 = pd.DataFrame(date_rng_1, columns=['Date'])
df_1['Data_1'] = np.random.randint(0,1000,size=(len(date_rng)))
df_1['Data_2'] = np.random.randint(0,1000,size=(len(date_rng)))
df_1['Data_3'] = np.random.randint(0,1000,size=(len(date_rng)))
df_1['Data_4'] = np.random.randint(0,1000,size=(len(date_rng)))
df_1 = df.set_index('Date')
for index in df_1.index:
print(df_1)
for col in df['Top_90']:
print(df_1._get_value(index, col))
I have a question about performance improvement of the following code:
df["range_column"] = list(zip(df.START, df1.END))
df["range_col"] = df["range_col"].swifter.apply(lambda x: pd.date_range(x[0], x[1], freq="60min"))
Explanation: I have two datetime columns. Based on these columns I create a tuple and a data range of 60 minutes.
For larger datasets, it takes quite a long time to run this code.
Below I have created some sample data to run the code.
Does anyone perhaps know of an alternative that produces the same result but is faster?
import faker
import pandas as pd
from faker import Faker
import swifter
# create some fake date data
fake = Faker()
Faker.seed(0)
df = []
for _ in range(5):
df.append(fake.date("%Y-%m-%d_%H_%M_%S"))
df1 = []
for _ in range(5):
df1.append(fake.date("%Y-%m-%d_%H_%M_%S"))
# create df
df = pd.DataFrame(df)
df["START"] = df
df = pd.DataFrame(df["START"])
df["START"] = pd.to_datetime(df["START"], format="%Y-%m-%d_%H_%M_%S")
# create df
df1 = pd.DataFrame(df1)
df1["END"] = df1
df1 = pd.DataFrame(df1["END"])
df1["END"] = pd.to_datetime(df1["END"], format="%Y-%m-%d_%H_%M_%S")
# merge
df2 = pd.concat([df, df1], axis = 1)
# create tuple
df2["range_col"] = list(zip(df2.START, df2.END))
# create date range
df2["range__col1"] = df2["range_col"].swifter.apply(lambda x: pd.date_range(x[0], x[1], freq="60min"))
I need to add seconds in YYYY-MM-DD-HH-MM-SS. My code works perfectly for one data point but not for the whole set. The data.txt consists of 7 columns and around 200 rows.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
df = pd.read_csv('data.txt',sep='\t',header=None)
a = np.array(list(df[0]))
b = np.array(list(df[1]))
c = np.array(list(df[2]))
d = np.array(list(df[3]))
e = np.array(list(df[4]))
f = np.array(list(df[5]))
g = np.array(list(df[6]))
t1=datetime(year=a, month=b, day=c, hour=d, minute=e, second=f)
t = t1 + timedelta(seconds=g)
print(t)
You can pass parameter names to read_csv for new columns names in first step and then convert first 5 columns to datetimes by to_datetime and add seconds converted to timedeltas by to_timedelta:
names = ["year","month","day","hour","minute","second","new"]
df = pd.read_csv('data.txt',sep='\t',names=names)
df['out'] = pd.to_datetime(df[names]) + pd.to_timedelta(df["new"], unit='s')
use apply with axis=1 to apply a function to every row of the dataframe.
df.apply(lambda x: datetime(year=x[0],
month=x[1],
day=x[2],
hour=x[3],
minute=x[4],
second=x[5]) + timedelta(seconds=int(x[6])) , axis=1)
generating dataset
simple to do as pandas series
s = 20
df = pd.DataFrame(np.array([np.random.randint(2015,2020,s),np.random.randint(1,12,s),np.random.randint(1,28,s),
np.random.randint(0,23,s), np.random.randint(0,59,s), np.random.randint(0,59,s),
np.random.randint(0,200,s)]).T,
columns=["year","month","day","hour","minute","second","add"])
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + df["add"].apply(lambda s: pd.Timedelta(seconds=s))
without using apply()
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + pd.to_timedelta(df["add"], unit="s")
I was wondering whether somebody could please give me some assistance with the Pandas iterrows package.
I'm currently using an iterative function which works but I was wondering whether using iterrows would make it more efficient to avoid a for loop?
import pandas as pd
import numpy as np
dataframe_1 = pd.read_csv("D\data\2018_19.csv")
def append_date_column(df):
df = df.copy()
df['date_column'] = np.nan
date_range = pd.date_range(start = '01/01/2001', periods = 207, freq = 'M').values
for row in range(df.shape[0]):
date_number = df.loc[row, "income2"]
if (not pd.isna(date_number)) and date_number < 207:
date = date_range[int(date_number) -1]
df.loc[row, 'date_column'] = date
df_with_date_column = df
return df_with_date_column
Thanks!
I am trying to combine a series of stock tick data based on the dates.
But it wont work. Please help.
import pandas as pd
import tushare as ts
def get_all_tick(stockID):
dates=pd.date_range('2016-01-01',periods=5,freq='D')
append_data=[]
for i in dates:
stock_tick=pd.DataFrame(ts.get_tick_data(stockID,date=i))
stock_tick.sort('volume',inplace=True, ascending=False)
stock_tick=stock_tick[:10]
stock_tick.sort('time',inplace=True, ascending=False)
append_data.append(stock_tick.iterrows())
get_all_tick('300243')
I figure it out myself.
def get_all_tick(stockID):
.........
df = pd.DataFrame()
for i in get_date:
stock_tick = ts.get_tick_data(stockID, date=i)
stock_tick['Date']=i
stock_tick.sort('volume', inplace=True, ascending=False)
stock_tick = stock_tick[:10]
stock_tick.sort('time', inplace=True, ascending=False)
df = df.append(stock_tick)
df.to_excel('tick.xlsx',sheet_name='Sheet1')
get_all_tick('300243')