Do you know how, with this code, I can get weekly data? The output is only daily, and I did not find the answer...
from datetime import datetime
import pandas as pd
import pandas_datareader.data as web

start = datetime(2018, 1, 1)
end = datetime(2022, 1, 16)
stocks = ['DUK']

def ImportDataClose(name, start, end):
    n = len(name)
    ptf = pd.DataFrame()
    for i in range(n):
        tmp = pd.DataFrame(web.DataReader(name[i], 'yahoo', start, end)["Close"])
        ptf = pd.concat([ptf, tmp], axis=1)
    ptf.columns = name
    return ptf

portfolio = ImportDataClose(stocks, start, end)
portfolio
Regards!
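One way (a sketch, not from the original post): keep the daily download and resample it afterwards; this assumes portfolio keeps the DatetimeIndex that DataReader returns. Some pandas-datareader versions also accept an interval argument on get_data_yahoo, but resampling is version-independent.

# weekly bars from the daily closes, e.g. each week's last close
weekly = portfolio.resample('W').last()
# or a weekly mean:
weekly_mean = portfolio.resample('W').mean()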
Is there a more efficient way of writing the following? I currently have this set up to calculate with a for loop, and at this pace it will take a few days to run.
I am forecasting demand over a period of 6 years on a weekly basis (52 weeks) broken down by product type (586 types) and zip code (892 unique ZIPs). The rand arrays are the parameter demand shares for each year drawn from a normal distribution and have dimensions [#weeks/#types/#zips x 6]. The demand growth array is the annual demand for each year.
I ultimately need to produce a data frame that has the following:
Year | Week of the Year | Product | Zip Code | Qty
This is what I currently have
demand_growth = [10, 15, 20, 23, 26, 30]
rand_week_total = np.random.rand(52, 6)
rand_product_total = np.random.rand(586, 6)
rand_zipcode_total = np.random.rand(892, 6)
forecast_year = []
forecast_week = []
forecast_product = []
forecast_ZIP = []
forecast_qty = []
# years, week, product and zipcode are the label sequences (their definitions are not shown in the post)
for i in range(len(years)):
    for j in range(len(week)):
        for k in range(len(product)):
            for l in range(len(zipcode)):
                a = np.rint(demand_growth[i] * rand_week_total[j, i] * rand_product_total[k, i] * rand_zipcode_total[l, i])
                if a != 0:
                    forecast_year.append(years[i])
                    forecast_week.append(week[j])
                    forecast_product.append(product[k])
                    forecast_ZIP.append(zipcode[l])
                    forecast_qty.append(a)
Edited: included examples of the arrays being multiplied
Any recommendations would be greatly appreciated!
I think you can do better than that by studying how to use arrays and/or threading. For now, the best I got was 3x faster. I used smaller array sizes so as not to spend the night on this.
import numpy as np
import timeit

def f1():
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)
    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []
    years = np.array(range(2015, 2020))
    weeks = np.array(range(0, 52))
    product = np.array(range(0, 23))
    zipcode = np.array(range(0, 43))
    for i in range(len(years)):
        for j in range(len(weeks)):
            for k in range(len(product)):
                for l in range(len(zipcode)):
                    a = np.rint(demand_growth[i] * rand_week_total[j, i] * rand_product_total[k, i] * rand_zipcode_total[l, i])
                    if a != 0:
                        forecast_year.append(years[i])
                        forecast_week.append(weeks[j])
                        forecast_product.append(product[k])
                        forecast_ZIP.append(zipcode[l])
                        forecast_qty.append(a)

def f2():
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)
    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []
    years = np.array(range(2015, 2020))
    weeks = np.array(range(0, 52))
    product = np.array(range(0, 23))
    zipcode = np.array(range(0, 43))
    for i in range(len(years)):
        for j in range(len(weeks)):
            # hoist the factors that do not depend on the inner loops
            temp_ij = demand_growth[i] * rand_week_total[j, i]
            for k in range(len(product)):
                temp_ikj = temp_ij * rand_product_total[k, i]
                for l in range(len(zipcode)):
                    a = np.rint(temp_ikj * rand_zipcode_total[l, i])
                    if a != 0:
                        forecast_year.append(years[i])
                        forecast_week.append(weeks[j])
                        forecast_product.append(product[k])
                        forecast_ZIP.append(zipcode[l])
                        forecast_qty.append(a)

def f3():
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)
    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []
    years = np.array(range(2015, 2020))
    weeks = np.array(range(0, 52))
    product = np.array(range(0, 23))
    zipcode = np.array(range(0, 43))
    for j in range(len(weeks)):
        # vectorize over the year axis instead of looping over it
        temp_j = demand_growth * rand_week_total[j, :]
        for k in range(len(product)):
            temp_jk = temp_j * rand_product_total[k, :]
            for l in range(len(zipcode)):
                a = np.rint(temp_jk * rand_zipcode_total[l, :])
                for i in range(len(years)):
                    if a[i] != 0:
                        forecast_year.append(years[i])
                        forecast_week.append(weeks[j])
                        forecast_product.append(product[k])
                        forecast_ZIP.append(zipcode[l])
                        forecast_qty.append(a[i])

print(timeit.Timer(f1).timeit(5))
print(timeit.Timer(f2).timeit(5))
print(timeit.Timer(f3).timeit(5))
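For reference, a fully vectorized sketch using broadcasting (an addition, not part of the original answer): it builds the whole year x week x product x zip array in one step and keeps only the nonzero entries. Note it assumes six years so that every column of demand_growth is used, whereas f1-f3 iterate over five.

import numpy as np
import pandas as pd

demand_growth = np.array([10, 15, 20, 23, 26, 30])
rand_week_total = np.random.rand(52, 6)
rand_product_total = np.random.rand(23, 6)
rand_zipcode_total = np.random.rand(43, 6)
years = np.arange(2015, 2021)  # six years to match demand_growth

# broadcast to shape (6, 52, 23, 43): year x week x product x zip
qty = np.rint(
    demand_growth[:, None, None, None]
    * rand_week_total.T[:, :, None, None]
    * rand_product_total.T[:, None, :, None]
    * rand_zipcode_total.T[:, None, None, :]
)
i, j, k, l = np.nonzero(qty)  # indices of the nonzero forecasts
forecast = pd.DataFrame({
    'Year': years[i],
    'Week': j,
    'Product': k,
    'ZipCode': l,
    'Qty': qty[i, j, k, l],
})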
I have this script I'm running to try to create a dataframe to summarize some statistics:
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in month:
    avg_age.append(i[i['Age'] != 0]['Age'].mean())
    avg_use.append(i[i['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean())
    avg_kwh.append(i[i['AverageKWH'] != 0]['AverageKWH'].mean())
    avg_coll.append(i[i['Total Collected'] != 0]['Total Collected'].mean())
    avg_cred.append(i[(i['credit_score'] != 0) & (i['credit_score'] != 99999)]['credit_score'].mean())
pd.DataFrame(data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred], columns=month_str, index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'])
It returns exactly what I want to see. But when I place it inside a function I get the following error:
AssertionError: 5 columns passed, passed data had 1 columns
Here is the code inside the function:
def get_nums():
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for i in months:
        avg_age.append(i[i['Age'] != 0]['Age'].mean())
        avg_use.append(i[i['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean())
        avg_kwh.append(i[i['AverageKWH'] != 0]['AverageKWH'].mean())
        avg_coll.append(i[i['Total Collected'] != 0]['Total Collected'].mean())
        avg_cred.append(i[(i['credit_score'] != 0) & (i['credit_score'] != 99999)]['credit_score'].mean())
        this_df = pd.DataFrame(data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred], columns=month_str, index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'])
        return this_df
The problem is the last two lines of the function: this_df is being built (and returned) inside the for loop, so on the first iteration each list holds a single value, and pandas sees one column of data where five column names were passed, hence the AssertionError.
The corrected code is below.
def get_nums():
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for i in months:
        avg_age.append(i[i['Age'] != 0]['Age'].mean())
        avg_use.append(i[i['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean())
        avg_kwh.append(i[i['AverageKWH'] != 0]['AverageKWH'].mean())
        avg_coll.append(i[i['Total Collected'] != 0]['Total Collected'].mean())
        avg_cred.append(i[(i['credit_score'] != 0) & (i['credit_score'] != 99999)]['credit_score'].mean())
    this_df = pd.DataFrame(data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred], columns=month_str, index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'])
    return this_df
Based on my understanding, you do not need the for loop here: concatenating the monthly frames with keys gives you an index level you can group on directly.
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
df = pd.concat(month, keys=month_str)
df = df.mask((df == 0) | (df == 99999))  # parentheses matter: | binds more tightly than ==
df.groupby(level=0).mean().T
I'm working on a Sentiment Analysis project using Twitter Data, and I've encountered a small problem regarding Dates. The code itself runs fine, but I don't know how to build custom time blocks for grouping my final data. Right now, it is defaulting to grouping them by the second, which is not very useful. I want to be able to group them in half-hour, hour, and day segments...
Feel free to skip to the bottom of the code to see where the issue lies!
Here is the code:
import tweepy
API_KEY = "XXXXX"
API_SECRET = "XXXXXX"
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)
import sklearn as sk
import pandas as pd
import got3
#"Get Old Tweets" to find older data
tweetCriteria = got3.manager.TweetCriteria()
tweetCriteria.setQuerySearch("Kentucky Derby")
tweetCriteria.setSince("2016-05-07")
tweetCriteria.setUntil("2016-05-08")
tweetCriteria.setMaxTweets(1000)
TweetCriteria = got3.manager.TweetCriteria()
KYDerby_tweets = got3.manager.TweetManager.getTweets(tweetCriteria)
from afinn import Afinn
afinn = Afinn()
#getting afinn library to use for sentiment polarity analysis
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    print(Text)
AllText = []
AllRetweets = []
AllFavorites = []
AllDates = []
AllIDs = []
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id  # was missing, so every row reused the last Id from the loop above
    AllText.append(Text)
    AllRetweets.append(Retweets)
    AllFavorites.append(Favorites)
    AllDates.append(Date)
    AllIDs.append(Id)
data_set = [[x.id, x.date, x.text, x.retweets, x.favorites]
            for x in KYDerby_tweets]
# column names follow the order of the values above
df = pd.DataFrame(data=data_set, columns=["Id", "Date", "Text", "Retweets", "Favorites"])
#I now have a DataFrame with my basic info in it
pscore = []
for x in KYDerby_tweets:
    pscore.append(afinn.score(x.text))
df['P Score'] = pscore
#I now have the pscores for each Tweet in the DataFrame
nrc = pd.read_csv('C:\\users\\andrew.smith\\downloads\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep="\t", names=["word", "emotion", "association"], skiprows=45)
#import NRC emotion lexicon
nrc = nrc[nrc["association"]==1]
nrc = nrc[nrc["emotion"].isin(["positive", "negative"]) == False]
#cleaned it up a bit
from nltk import TweetTokenizer
tt = TweetTokenizer()
# a tokenized = tt.tokenize(...) step appears to be missing here; see makeEmoVector below
tokenized = [x.lower() for x in tokenized]
#built my Tweet-specific, NRC-ready tokenizer
emotions = list(set(nrc["emotion"]))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
cv = [0] * len(emotions)
#built indices showing locations of emotions
for token in tokenized:
    sub = nrc[nrc['word'] == token]
    token_emotions = sub['emotion']
    for e in token_emotions:
        position_index = emotion2index[e]
        cv[position_index] += 1
emotions = list(set(nrc['emotion']))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
def makeEmoVector(tweettext):
    cv = [0] * len(emotions)
    tokenized = tt.tokenize(tweettext)
    tokenized = [x.lower() for x in tokenized]
    for token in tokenized:
        sub = nrc[nrc['word'] == token]
        token_emotions = sub['emotion']
        for e in token_emotions:
            position_index = emotion2index[e]
            cv[position_index] += 1
    return cv
tweettext = df.iloc[14,:]['Text']
emotion_vectors = []
for text in df['Text']:
    emotion_vector = makeEmoVector(text)
    emotion_vectors.append(emotion_vector)
ev = pd.DataFrame(emotion_vectors, index=df.index, columns=emotions)
#Now I have a DataFrame with all of the emotion counts for each tweet
Date_Group = df.groupby("Date")
Date_Group[emotions].agg("sum")
#Finally, we arrive at the problem! When I run this, I end up with tweets grouped by the second. What I want is to be able to group them: a) by the half-hour, b) by the hour, and c) by the day
The default date format for tweets with the Tweepy API is "2017-04-14 18:41:56", so to get tweets grouped by hour you can do something as simple as this:
# This will get the time parameter
time = [item.split(" ")[1] for item in df['date'].values]
# This will get the hour parameter
hour = [item.split(":")[0] for item in time]
df['time'] = hour
grouped_tweets = df[['time', 'number_tweets']].groupby('time')
tweet_growth_hour = grouped_tweets.sum()
tweet_growth_hour['time'] = tweet_growth_hour.index
print(tweet_growth_hour)
To group by date, you can do something similar:
days = [item.split(" ")[0] for item in df['date'].values]
df['days'] = days
grouped_tweets = df[['days', 'number_tweets']].groupby('days')
tweet_growth_days = grouped_tweets.sum()
tweet_growth_days['days'] = tweet_growth_days.index
print(tweet_growth_days)
Sorry for the unsophisticated question title, but I need help desperately:
My objective at work is to create a script that pulls all the records from the ExactTarget Salesforce Marketing Cloud API. I have successfully set up the API calls and successfully imported the data into DataFrames.
The problem I am running into is two-fold: I need to keep pulling records until "Results_Message" in my code stops reading "MoreDataAvailable", and I need to set up logic that lets me control the date range either from within the API call or by filtering the DataFrame.
My code is getting stuck at line 44, where "print Results_Message" keeps looping on the string "MoreDataAvailable".
Here is my code so far; on lines 94 and 95 you will see my attempt at parsing the date directly from the DataFrame (no luck), and likewise on line 32 where I have specified the date:
import ET_Client
import pandas as pd

AggreateDF = pd.DataFrame()
Data_Aggregator = pd.DataFrame()
#Start_Date = "2016-02-20"
#End_Date = "2016-02-25"
#retrieveDate = '2016-07-25T13:00:00.000'
Export_Dir = 'C:/temp/'

try:
    debug = False
    stubObj = ET_Client.ET_Client(False, debug)

    print '>>>BounceEvents'
    getBounceEvent = ET_Client.ET_BounceEvent()
    getBounceEvent.auth_stub = stubObj
    getBounceEvent.search_filter = {'Property': 'EventDate', 'SimpleOperator': 'greaterThan', 'Value': '2016-02-22T13:00:00.000'}
    getResponse1 = getBounceEvent.get()
    ResponseResultsBounces = getResponse1.results
    Results_Message = getResponse1.message
    print(Results_Message)
    #EventDate = "2016-05-09"
    print "This is original " + str(Results_Message)
    #print ResponseResultsBounces

    i = 1
    while (Results_Message == 'MoreDataAvailable'):
        #if i > 5: break
        print Results_Message
        results1 = getResponse1.results
        #print(results1)
        i = i + 1
        ClientIDBounces = []
        partner_keys1 = []
        created_dates1 = []
        modified_date1 = []
        ID1 = []
        ObjectID1 = []
        SendID1 = []
        SubscriberKey1 = []
        EventDate1 = []
        EventType1 = []
        TriggeredSendDefinitionObjectID1 = []
        BatchID1 = []
        SMTPCode = []
        BounceCategory = []
        SMTPReason = []
        BounceType = []
        for BounceEvent in ResponseResultsBounces:
            ClientIDBounces.append(str(BounceEvent['Client']['ID']))
            partner_keys1.append(BounceEvent['PartnerKey'])
            created_dates1.append(BounceEvent['CreatedDate'])
            modified_date1.append(BounceEvent['ModifiedDate'])
            ID1.append(BounceEvent['ID'])
            ObjectID1.append(BounceEvent['ObjectID'])
            SendID1.append(BounceEvent['SendID'])
            SubscriberKey1.append(BounceEvent['SubscriberKey'])
            EventDate1.append(BounceEvent['EventDate'])
            EventType1.append(BounceEvent['EventType'])
            TriggeredSendDefinitionObjectID1.append(BounceEvent['TriggeredSendDefinitionObjectID'])
            BatchID1.append(BounceEvent['BatchID'])
            SMTPCode.append(BounceEvent['SMTPCode'])
            BounceCategory.append(BounceEvent['BounceCategory'])
            SMTPReason.append(BounceEvent['SMTPReason'])
            BounceType.append(BounceEvent['BounceType'])
        df1 = pd.DataFrame({'ClientID': ClientIDBounces, 'PartnerKey': partner_keys1,
                            'CreatedDate': created_dates1, 'ModifiedDate': modified_date1,
                            'ID': ID1, 'ObjectID': ObjectID1, 'SendID': SendID1, 'SubscriberKey': SubscriberKey1,
                            'EventDate': EventDate1, 'EventType': EventType1, 'TriggeredSendDefinitionObjectID': TriggeredSendDefinitionObjectID1,
                            'BatchID': BatchID1, 'SMTPCode': SMTPCode, 'BounceCategory': BounceCategory, 'SMTPReason': SMTPReason, 'BounceType': BounceType})
        #print df1
        #df1 = df1[(df1.EventDate > "2016-02-20") & (df1.EventDate < "2016-02-25")]
        #AggreateDF = AggreateDF[(AggreateDF.EventDate > Start_Date) and (AggreateDF.EventDate < End_Date)]
        print(df1['ID'].max())
        AggreateDF = AggreateDF.append(df1)
        print(AggreateDF.shape)

    #df1 = df1[(df1.EventDate > "2016-02-20") and (df1.EventDate < "2016-03-25")]
    #AggreateDF = AggreateDF[(AggreateDF.EventDate > Start_Date) and (AggreateDF.EventDate < End_Date)]
    print("Final Aggregate DF is: " + str(AggreateDF.shape))
    #EXPORT TO CSV
    AggreateDF.to_csv(Export_Dir + 'DataTest1.csv')
    #with pd.option_context('display.max_rows',10000):
    #    print (df_masked1.shape)
    #    print df_masked1
except Exception as e:
    print 'Caught exception: ' + str(e.message)
    print e
Before my code parses the data, the original format I get is a SOAP response, which looks like this (below). Is it possible to parse records directly based on EventDate from the SOAP response?
}, (BounceEvent){
Client =
(ClientID){
ID = 1111111
}
PartnerKey = None
CreatedDate = 2016-05-12 07:32:20.000937
ModifiedDate = 2016-05-12 07:32:20.000937
ID = 1111111
ObjectID = "1111111"
SendID = 1111111
SubscriberKey = "aaa#aaaa.com"
EventDate = 2016-05-12 07:32:20.000937
EventType = "HardBounce"
TriggeredSendDefinitionObjectID = "aa111aaa"
BatchID = 1111111
SMTPCode = "1111111"
BounceCategory = "Hard bounce - User Unknown"
SMTPReason = "aaaa"
BounceType = "immediate"
Hope this makes sense; this is my desperate plea for help.
Thank you in advance!
You don't seem to be updating Results_Message inside your loop, so it will always keep the value it was given at line 29 (Results_Message = getResponse1.message), unless there's code involved that you didn't share.
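A minimal sketch of how the loop might advance the cursor and bound the dates; it assumes FuelSDK's get objects expose getMoreResults(), as its ET_GetSupport objects do, and that search_filter accepts the LeftOperand/LogicalOperator/RightOperand complex-filter form. Verify both against your client version:

# date-bounded filter instead of a single greaterThan condition
getBounceEvent.search_filter = {
    'LeftOperand': {'Property': 'EventDate', 'SimpleOperator': 'greaterThan',
                    'Value': '2016-02-20T00:00:00.000'},
    'LogicalOperator': 'AND',
    'RightOperand': {'Property': 'EventDate', 'SimpleOperator': 'lessThan',
                     'Value': '2016-02-25T00:00:00.000'},
}
getResponse = getBounceEvent.get()
while True:
    for BounceEvent in getResponse.results:
        pass  # build the row lists exactly as in the question
    if getResponse.message != 'MoreDataAvailable':
        break
    getResponse = getBounceEvent.getMoreResults()  # fetch the next page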
I have a problem listing DataFrame rows. The function below returns only one row (if the return is indented it returns the first row; if not, the last one). Does anyone know where the problem is?
def ols_regression(formula, framedict):
    for yp in framedict.keys():
        ols_model = ols(formula, framedict[str(yp)]).fit()
        year = int(yp[:-5])
        params = ols_model.params
        d = dict(yp=yp, year=year, formula=formula, R_squared=ols_model.rsquared,
                 intercept=params.values[0], DP1=params.values[1], I=params.values[2], P=params.values[3],
                 p_intercept=ols_model.pvalues[0], p_DP1=ols_model.pvalues[1], p_I=ols_model.pvalues[2],
                 p_P=ols_model.pvalues[3])
    return pd.DataFrame(d, index=[0])
I solved the problem by appending each iteration's dictionary to a list:
def ols_regression(formula, framedict):
    arr = []
    for yp in framedict.keys():
        ols_model = ols(formula, framedict[str(yp)]).fit()
        year = int(yp[:-5])
        params = ols_model.params
        arr.append(dict(yp=yp, year=year, formula=formula, R_squared=ols_model.rsquared,
                        intercept=params.values[0], DP1=params.values[1], I=params.values[2], P=params.values[3],
                        p_intercept=ols_model.pvalues[0], p_DP1=ols_model.pvalues[1], p_I=ols_model.pvalues[2],
                        p_P=ols_model.pvalues[3]))
    return pd.DataFrame(arr)
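As a minimal illustration (with hypothetical values) of why the list-of-dicts pattern yields one row per key:

import pandas as pd

rows = [dict(year=2000, R_squared=0.91), dict(year=2001, R_squared=0.87)]
print(pd.DataFrame(rows))
#    year  R_squared
# 0  2000       0.91
# 1  2001       0.87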