I am trying to create a set of day-of-week boxplots for a time series (e.g. 5-minute temperature observations).
My code:
# ts is our timeseries
ts = df.SomeColumn
dow_map = {}
days = ['MON','TUE','WED','THU','FRI','SAT','SUN']
dow_idx = ts.index.dayofweek
i = 0
for d in days:
    dow_map[d] = ts[dow_idx == i]
    i = i + 1
df = pd.DataFrame(dow_map)
df.boxplot()
results in:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-898-6070c45e4c4b> in <module>()
41 i = i + 1
42
---> 43 df = pd.DataFrame(dow_map)
44 df.boxplot()
...
Exception: Reindexing only valid with uniquely valued Index objects
I did find success by creating DataFrames for each day-of-week and then concat-ing them into a final DataFrame, but this seems inefficient...
First, create the data frame and use the weekday attribute of the index to get the day of week:
import pandas as pd
import numpy.random as random
n=1000
df = pd.DataFrame(random.randn(n), pd.date_range('2010-01-01', periods=n), columns=["data"])
df['Dates'] = df.index
df['week_days'] = df.index.weekday
df
Now pivot that table so that the week_days values become columns (you could also map the weekday numbers to day-name strings, but I'm leaving that for you):
x = df.pivot(index='Dates', columns='week_days', values='data')
x.boxplot()
If you want localized day names as the column labels:
import locale, calendar
# for example, Polish day abbreviations (pl_PL)
locale.setlocale(locale.LC_ALL, 'pl_PL.UTF-8')
x = x.rename(columns=lambda c: calendar.day_abbr[c].capitalize())
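Putting the pieces together, here is a minimal self-contained sketch (random data standing in for the 5-minute temperature observations, and English day abbreviations instead of a locale):
import calendar
import numpy as np
import pandas as pd
n = 1000
df = pd.DataFrame(np.random.randn(n), pd.date_range('2010-01-01', periods=n, freq='5min'), columns=['data'])
df['Dates'] = df.index
df['week_days'] = df.index.weekday
# one column per weekday, relabel 0-6 as Mon..Sun, then draw the boxplots
x = df.pivot(index='Dates', columns='week_days', values='data')
x = x.rename(columns=lambda c: calendar.day_abbr[c])
x.boxplot()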
I need to add seconds to timestamps in YYYY-MM-DD-HH-MM-SS format. My code works perfectly for one data point but not for the whole set. The data.txt file consists of 7 columns and around 200 rows.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
df = pd.read_csv('data.txt',sep='\t',header=None)
a = np.array(list(df[0]))
b = np.array(list(df[1]))
c = np.array(list(df[2]))
d = np.array(list(df[3]))
e = np.array(list(df[4]))
f = np.array(list(df[5]))
g = np.array(list(df[6]))
t1=datetime(year=a, month=b, day=c, hour=d, minute=e, second=f)
t = t1 + timedelta(seconds=g)
print(t)
You can pass parameter names to read_csv for the new column names in the first step, then convert the first six columns to datetimes with to_datetime and add the seconds converted to timedeltas with to_timedelta:
names = ["year","month","day","hour","minute","second","new"]
df = pd.read_csv('data.txt',sep='\t',names=names)
df['out'] = pd.to_datetime(df[names[:6]]) + pd.to_timedelta(df["new"], unit='s')
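For example, with a couple of made-up rows (the tab-separated sample below is only an assumption about the layout of data.txt) the approach gives:
import io
import pandas as pd
names = ["year","month","day","hour","minute","second","new"]
sample = "2017\t3\t14\t9\t30\t15\t90\n2017\t3\t14\t9\t35\t15\t45\n"  # hypothetical rows
df = pd.read_csv(io.StringIO(sample), sep='\t', names=names)
# assemble the first six columns into a datetime, then add the extra seconds
df['out'] = pd.to_datetime(df[names[:6]]) + pd.to_timedelta(df["new"], unit='s')
print(df['out'])
# 0   2017-03-14 09:31:45
# 1   2017-03-14 09:36:00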
Use apply with axis=1 to apply a function to every row of the dataframe:
df.apply(lambda x: datetime(year=x[0],
                            month=x[1],
                            day=x[2],
                            hour=x[3],
                            minute=x[4],
                            second=x[5]) + timedelta(seconds=int(x[6])), axis=1)
Generating a dataset, then solving it simply with pandas Series operations:
s = 20
df = pd.DataFrame(np.array([np.random.randint(2015,2020,s),np.random.randint(1,12,s),np.random.randint(1,28,s),
np.random.randint(0,23,s), np.random.randint(0,59,s), np.random.randint(0,59,s),
np.random.randint(0,200,s)]).T,
columns=["year","month","day","hour","minute","second","add"])
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + df["add"].apply(lambda s: pd.Timedelta(seconds=s))
Without using apply():
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + pd.to_timedelta(df["add"], unit="s")
I am trying to loop over a dataframe and apply a custom function; however, my date column keeps either corrupting or gaining brackets around each element. Does anyone know what I am doing wrong?
import datetime
import random
import string
import numpy as np
import pandas as pd
# This is the custom function I use
def summarise_dummy(x):
    d = {}
    date_index = x['groups_2'] == max(x['groups_2'])
    d['date'] = x['date'][date_index]  # do something with date
    d['y'] = x['y'][date_index]  # do something with y
    return pd.Series(d, index=['date', 'y'])  # return a series
# Generate some dummy data
todays_date = datetime.datetime.now().date()
date = pd.date_range(todays_date-datetime.timedelta(10), periods=10, freq='D')
columns = ['y']
data = [random.randint(0,10) for i in range(0,10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
random.choice(string.ascii_letters)
df['date'] = pd.to_datetime(df['date'])
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
# ***
#df.loc[:,'date'] = df.loc[:,'date'].dt.strftime('%Y-%m-%d')
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
# Upon inspecting the date column, they are all NaT. However if you uncomment *** (above)
# and re-run, dates are returned?
summarised['date']
# But when I finally run with *** un-commented and convert my output to a json, date has []'s in its series
summarised_json = summarised.to_json(orient='records')
What final output are you looking to get?
Does it work if you change pd.Series to pd.DataFrame within def summarise_dummy(x), setting the date and y along the columns?
import numpy as np
import string
import random
import pandas as pd
import datetime
# This is the custom function I use
def summarise_dummy(x):
    d = {}
    date_index = x['groups_2'] == max(x['groups_2'])
    d['date'] = x['date'][date_index]  # do something with date
    d['y'] = x['y'][date_index]  # do something with y
    return pd.DataFrame(d, columns=['date', 'y'])  # return a DataFrame
# Generate some dummy data
date = pd.date_range(datetime.datetime.now().date() - datetime.timedelta(10), periods=10, freq='D')
print(date)
columns = ['y']
data = [random.randint(0, 10) for i in range(0, 10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
random.choice(string.ascii_letters)
# df['date'] = pd.to_datetime(df['date'])
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
print(df)
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
print(summarised)
summarised_json = summarised.to_json(orient='records')
print(summarised_json)
After apply:
                 date  y
groups_1
a        9 2018-08-21  0
b        6 2018-08-18  7
c        4 2018-08-16  0
         7 2018-08-19  5
         8 2018-08-20  1
d        1 2018-08-13  6
         3 2018-08-15  8
e        5 2018-08-17  1
After to_json:
[{"date":"2018-08-21","y":0},{"date":"2018-08-18","y":7},{"date":"2018-08-16","y":0},{"date":"2018-08-19","y":5},{"date":"2018-08-20","y":1},{"date":"2018-08-13","y":6},{"date":"2018-08-15","y":8},{"date":"2018-08-17","y":1}]
Additionally, you can configure the JSON format with the orient parameter.
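For instance, a quick sketch of how different orient values shape the same frame (the column names here are just illustrative):
import pandas as pd
out = pd.DataFrame({"date": ["2018-08-21", "2018-08-18"], "y": [0, 7]})
print(out.to_json(orient='records'))  # [{"date":"2018-08-21","y":0},{"date":"2018-08-18","y":7}]
print(out.to_json(orient='index'))    # {"0":{"date":"2018-08-21","y":0},"1":{"date":"2018-08-18","y":7}}
print(out.to_json(orient='columns'))  # {"date":{"0":"2018-08-21","1":"2018-08-18"},"y":{"0":0,"1":7}}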
I have a dataset like the one shown below.
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_metering_1;Sub_metering_2;Sub_metering_3
16/12/2006;17:24:00;4.216;0.418;234.840;18.400;0.000;1.000;17.000
16/12/2006;17:25:00;5.360;0.436;233.630;23.000;0.000;1.000;16.000
16/12/2006;17:26:00;5.374;0.498;233.290;23.000;0.000;2.000;17.000
16/12/2006;17:27:00;5.388;0.502;233.740;23.000;0.000;1.000;17.000
16/12/2006;17:28:00;3.666;0.528;235.680;15.800;0.000;1.000;17.000
16/12/2006;17:29:00;3.520;0.522;235.020;15.000;0.000;2.000;17.000
16/12/2006;17:30:00;3.702;0.520;235.090;15.800;0.000;1.000;17.000
16/12/2006;17:31:00;3.700;0.520;235.220;15.800;0.000;1.000;17.000
16/12/2006;17:32:00;3.668;0.510;233.990;15.800;0.000;1.000;17.000
I've used pandas to get the data into a DataFrame. The dataset has data for multiple days with an interval of 1 min for each row in the dataset.
I want to plot separate graphs of the voltage with respect to the time (shown in column 2) for each day (shown in column 1) using Python. How can I do that?
txt = '''Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_metering_1;Sub_metering_2;Sub_metering_3
16/12/2006;17:24:00;4.216;0.418;234.840;18.400;0.000;1.000;17.000
16/12/2006;17:25:00;5.360;0.436;233.630;23.000;0.000;1.000;16.000
16/12/2006;17:26:00;5.374;0.498;233.290;23.000;0.000;2.000;17.000
16/12/2006;17:27:00;5.388;0.502;233.740;23.000;0.000;1.000;17.000
16/12/2006;17:28:00;3.666;0.528;235.680;15.800;0.000;1.000;17.000
16/12/2006;17:29:00;3.520;0.522;235.020;15.000;0.000;2.000;17.000
16/12/2006;17:30:00;3.702;0.520;235.090;15.800;0.000;1.000;17.000
16/12/2006;17:31:00;3.700;0.520;235.220;15.800;0.000;1.000;17.000
16/12/2006;17:32:00;3.668;0.510;233.990;15.800;0.000;1.000;17.000'''
from io import StringIO
import pandas as pd
import matplotlib.pyplot as plt
f = StringIO(txt)
df = pd.read_table(f, sep=';')
plt.plot(df['Time'],df['Voltage'])
plt.show()
gives this output:
I believe this will do the trick (I edited the dates so we have two dates)
import pandas as pd
import matplotlib.pyplot as plt
# If you use Jupyter Notebook:
%matplotlib inline
df = pd.read_csv('test.csv', sep=';', usecols=['Date','Time','Voltage'])
unique_dates = df.Date.unique()
for date in unique_dates:
    print('Date: ' + date)
    df.loc[df.Date == date].plot.line('Time', 'Voltage')
    plt.show()
You will get this:
X = df.Date.unique()
for i in X:  # iterate over unique days
    temp_df = df[df.Date == i]  # get df for specific day
    temp_df.plot(x='Time', y='Voltage')  # plot
If you want to change the x values you can use
x = np.arange(1, len(temp_df.Time) + 1)
Group by hour and minute after creating a DateTime column to handle multiple days; you can then filter for a specific day.
txt = '''Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_metering_1;Sub_metering_2;Sub_metering_3
16/12/2006;17:24:00;4.216;0.418;234.840;18.400;0.000;1.000;17.000
16/12/2006;17:25:00;5.360;0.436;233.630;23.000;0.000;1.000;16.000
16/12/2006;17:26:00;5.374;0.498;233.290;23.000;0.000;2.000;17.000
16/12/2006;17:27:00;5.388;0.502;233.740;23.000;0.000;1.000;17.000
16/12/2006;17:28:00;3.666;0.528;235.680;15.800;0.000;1.000;17.000
16/12/2006;17:29:00;3.520;0.522;235.020;15.000;0.000;2.000;17.000
16/12/2006;17:30:00;3.702;0.520;235.090;15.800;0.000;1.000;17.000
16/12/2006;17:31:00;3.700;0.520;235.220;15.800;0.000;1.000;17.000
16/12/2006;17:32:00;3.668;0.510;233.990;15.800;0.000;1.000;17.000'''
from io import StringIO
import pandas as pd
import matplotlib.pyplot as plt
f = StringIO(txt)
df = pd.read_table(f, sep=';')
df['DateTime'] = pd.to_datetime(df['Date'] + " " + df['Time'], format='%d/%m/%Y %H:%M:%S')
df.set_index('DateTime', inplace=True)
# keep one day, then average Voltage for each (hour, minute) of that day
day_df = df[df['Date'] == '16/12/2006']
grouped = day_df.groupby([day_df.index.hour, day_df.index.minute])['Voltage'].mean()
grouped.plot()
plt.show()
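If you want a separate figure for every day rather than a single filtered one, a small sketch reusing the same df (grouping first by the Date column, then averaging per hour and minute within each day):
for day, day_df in df.groupby('Date'):
    day_df.groupby([day_df.index.hour, day_df.index.minute])['Voltage'].mean().plot(title=day)
    plt.show()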
My process is this:
Import csv of data containing dates, activations, and cancellations
Subset the data by activated or cancelled
Pivot the data with aggfunc 'sum'
Convert back to data frames
Now, I need to merge the 2 data frames together, but there are dates that exist in one data frame and not the other. Both data frames start Jan 1, 2017 and end Dec 31, 2017. Preferably, any date that is missing from one data frame should get a corresponding value of 0 in the merged output.
Here's the .head() from both data frames:
For reference, here's the code up to this point:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
%matplotlib inline
#import data
directory1 = r"C:\python\Contracts"
directory_source = os.path.join(directory1, "Contract_Data.csv")
df_source = pd.read_csv(directory_source)
#format date ranges as times
#df_source["Activation_Month"] = pd.to_datetime(df_source["Activation_Month"])
#df_source["Cancellation_Month"] = pd.to_datetime(df_source["Cancellation_Month"])
df_source["Activation_Day"] = pd.to_datetime(df_source["Activation_Day"])
df_source["Cancellation_Day"] = pd.to_datetime(df_source["Cancellation_Day"])
#subset the data based on status
df_active = df_source[df_source["Order Status"]=="Active"]
df_active = pd.DataFrame(df_active[["Activation_Day", "Event_Value"]].copy())
df_cancelled = df_source[df_source["Order Status"]=="Cancelled"]
df_cancelled = pd.DataFrame(df_cancelled[["Cancellation_Day", "Event_Value"]].copy())
#remove activations outside 2017 and cancellations outside 2017
df_cancelled = df_cancelled[(df_cancelled['Cancellation_Day'] > '2016-12-31') &
(df_cancelled['Cancellation_Day'] <= '2017-12-31')]
df_active = df_active[(df_active['Activation_Day'] > '2016-12-31') &
(df_active['Activation_Day'] <= '2017-12-31')]
#pivot the data to aggregate by day
df_active_aggregated = df_active.pivot_table(index='Activation_Day',
values='Event_Value',
aggfunc='sum')
df_cancelled_aggregated = df_cancelled.pivot_table(index='Cancellation_Day',
values='Event_Value',
aggfunc='sum')
#convert pivot tables back to useable dataframes
activations_aggregated = pd.DataFrame(df_active_aggregated.to_records())
cancellations_aggregated = pd.DataFrame(df_cancelled_aggregated.to_records())
#rename the time columns so they can be referenced when merging into one DF
activations_aggregated.columns = ["index_month", "Activations"]
#activations_aggregated = activations_aggregated.set_index(pd.DatetimeIndex(activations_aggregated["index_month"]))
cancellations_aggregated.columns = ["index_month", "Cancellations"]
#cancellations_aggregated = cancellations_aggregated.set_index(pd.DatetimeIndex(cancellations_aggregated["index_month"]))
I'm aware there are many posts that address issues similar to this but I haven't been able to find anything that has helped. Thanks to anyone that can give me a hand with this!
You can try:
activations_aggregated.merge(cancellations_aggregated, how='outer', on='index_month').fillna(0)
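Since an outer merge only keeps dates that appear in at least one frame, one further step (a sketch, assuming index_month holds daily datetime values) is to reindex against the full 2017 calendar so every day is present and missing values become 0:
all_2017_days = pd.date_range('2017-01-01', '2017-12-31', freq='D')
merged = (activations_aggregated
          .merge(cancellations_aggregated, how='outer', on='index_month')
          .set_index('index_month')
          .reindex(all_2017_days, fill_value=0)  # add any 2017 day missing from both frames
          .fillna(0)                             # fill days present in only one frame
          .rename_axis('index_month')
          .reset_index())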
I need to use my own data for a zipline project. I keep getting this error whenever I try:
/Library/Python/2.7/site-packages/zipline/sources/data_source.pyc in <dictcomp>((target, (mapping_func, source_key)))
47 """
48 row = {target: mapping_func(raw_row[source_key])
---> 49 for target, (mapping_func, source_key)
50 in self.mapping.items()}
51 row.update({'source_id': self.get_hash()})
ValueError: cannot convert float NaN to integer
Here is the trading algorithm I am running:
from zipline.algorithm import TradingAlgorithm
from zipline.api import order_target, order, record, symbol, history, add_history
import numpy as np
from pandas import Series, DataFrame, Panel
import pandas as pd
# Define algorithm
def initialize(context):
    context.dateIndex = 0

def handle_data(context, data):
    today = data.major_axis[context.dateIndex]
    if today > data.US9663871021[data.US9663871021.close.notnull()].index[0] and today < data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order(symbol('US9663871021'), 10)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today > data.US7954351067[data.US7954351067.close.notnull()].index[0] and today < data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order(symbol('US7954351067'), 10)
        record(US7954351067=data[symbol('US7954351067')].price)
    if today == data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order_target(symbol('US9663871021'), 0)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today == data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order_target(symbol('US7954351067'), 0)
        record(US9663871021=data[symbol('US7954351067')].price)
    context.dateIndex = context.dateIndex + 1
def prepDf(fileName):
    df = pd.io.parsers.read_csv(fileName, index_col=[0], parse_dates=[0], na_values=["#N/A N/A"],
                                names=["date", "open", "high", "low", "close", "volume", "mkt_cap"])
    df["price"] = df.close
    df.index = df.index.tz_localize('UTC')
    df = df[df.close.notnull()]
    return df
fileName = #fill in file name
fileName2 = #fill in file name
dictionaryOfDfs = {"US9663871021" : prepDf(fileName), "US7954351067": prepDf(fileName2)}
data = Panel(dictionaryOfDfs)
algo_obj = TradingAlgorithm(initialize=initialize,
handle_data=handle_data)
# Run algorithm
perf_manual = algo_obj.run(data)
The idea is that I'm buying when the data should be non-NaN and selling the position before the end of the series. There should be no need for the data beyond that, yet zipline insists that NaN causes an error even when the value shouldn't be used.
After researching, I believe the solution is to re-index the underlying DataFrames:
df1 = df1.reindex(index=data.major_axis, fill_value=0)
df2 = df2.reindex(index=data.major_axis, fill_value=0)
where data is the pandas Panel
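In the context of the code above, that would look roughly like this (a sketch; it builds the Panel once to obtain the shared major_axis, then rebuilds it from the reindexed frames):
df1 = prepDf(fileName)
df2 = prepDf(fileName2)
data = Panel({"US9663871021": df1, "US7954351067": df2})
# align both frames to the Panel's full calendar so zipline never hits missing rows
df1 = df1.reindex(index=data.major_axis, fill_value=0)
df2 = df2.reindex(index=data.major_axis, fill_value=0)
data = Panel({"US9663871021": df1, "US7954351067": df2})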