all_data = {}
for ticker in ['TWTR', 'SNAP', 'FB']:
all_data[ticker] = np.array(pd.read_csv('https://www.google.com/finance/getprices?i=60&p=10d&f=d,o,h,l,c,v&df=cpct&q={}'.format(ticker, skiprows=7, header=None))
date = []
for i in np.arange(0, len(all_data['SNAP'])):
if all_data['SNAP'][i][0][0] == 'a':
t = datetime.datetime.fromtimestamp(int(all_data['SNAP'][i][0].replace('a','')))
date.append(t)
else:
date.append(t+ datetime.timedelta(minutes= int(all_data['SNAP'][i][0])))
Hi, what this code does is to create a dictionary(all_data) and then put intraday data for twitter, snapchat, facebook into the dictionary from the url. The dates are in epoch time format and so the second for did a second for loop.
I was only able to do so for one of the tickers (SNAP) and i was wondering if anyone knew how to create iterate all the data to do the same
With pandas, you normally convert a timestamp to datetime using:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit="s")
Note:
Your script seems to contain other errors, which are outside the scope of the question.
Related
As you can tell I'm new to python, and struggling with the correct way/syntax to iterate over date ranges in my Google Analytics API call. The master function iterates over specific Website View IDs already (I have it loop through the 'client_id', but I'm wondering what the correct way to have it iterate over date ranges would be/where I need to put it in my code.
Right now I am running it using the GA Date Dimension to get daily results, but I also need to run it for MTD, 7 Day Rolling, and 30 Day rolling iterations. Here is a sample the code I am using.
def APmain(client_id):
base_df = pd.DataFrame()
columns = traffic_columns
start_date = datetime.strptime(master_startdate , '%Y-%m-%d').date()
end_date = datetime.strptime(master_enddate , '%Y-%m-%d').date()
filters = 'ga:screenResolution!=0x0'
print('Running Google Analytics Report.')
#change based the type and number of dimensions pulled. These are set
dimensions = 'ga:'+(',ga:'.join(columns[:3]))
metrics = 'ga:'+(',ga:'.join(columns[3:]))
results = analytics_get_report(client_id, str(start_date), str(end_date), dimensions, metrics, filters)
df = pd.DataFrame(results, columns=columns)
df['view_id'] = client_id
df['filter_type'] = 'New'
df['dimension'] = 'SourceMedium'
df['DateType'] = 'Daily'
base_df = base_df.append(df)
base_df = base_df.applymap(str)
base_df = base_df[base_df.date != '(other)'] #remove if date column has '(other)' as a value
base_df['date'] = pd.to_datetime(base_df['date'], format='%Y%m%d') #format date
# Powershell output
print(base_df.head())
#bigquery output
base_df.to_gbq(
'', #dataset name + table name
'', #project name
chunksize=10000,
reauth=False,
if_exists='append',
credentials=credentials
)
I have a variables names 'master_startdate' and 'master_enddate' where I define the date range I have it pull data for currently. I've messed around with something like this but can't get it to work correctly, but I feel like I'm on the right track and/or thinking about it correctly.
import calendar
from datetime import datetime, timedelta, date
import calendar
cal= calendar.Calendar()
wholeyear= range(0,365)
master_start_date= '2019-01-01'
start_date= datetime.strptime(master_start_date , '%Y-%m-%d').date()
end_date2= datetime.strptime(end_date , '%Y-%m-%d').date()
for x in wholeyear:
end_date = start_date + (timedelta(days=1)*x)
print(end_date)
pass
start_date2= datetime.date.today()
start_date2= datetime.date(end_date2.year(),end_date2.month(),1)
print(start_date2)
Any help for a beginner here would be greatly appreciated!
I am trying to pull out multiple ticker data from the yfinance API and save it to a csv file (in total I have 1000 tickers I need to get the data for, that data being the entire table of date, open, high, low, close, volume, etc etc), so far I am able to successfully get data for 1 ticker by using the following Python code:
import yfinance as yf
def yfinance(ticker_symbol):
ticker_data = yf.Ticker(ticker_symbol)
tickerDF = ticker_data.history(period='1d', start='2020-09-30', end='2020-10-31')
print(tickerDF)
yfinance('000001.SS')
However if I try on multiple tickers this doesn't work. Following the yfinance docs which say for multiple tickers use:
tickers = yf.Tickers('msft aapl goog')
# ^ returns a named tuple of Ticker objects
# access each ticker using (example)
tickers.tickers.MSFT.info
tickers.tickers.AAPL.history(period="1mo")
tickers.tickers.GOOG.actions
I have a couple of issue here, the docs use a string such as 'aapl' my tickers are all of digit format like '000001.SS', the ".SS" part is proving to be an issue when passing it into the code:
tickers.tickers.000001.SS.history(period="1mo")
# Clearly this wont for for a start
The next issue I am having is, even if I pass in for example 3 tickers to my function like so:
yfinance('000001.SS 000050.KS 00006.KS')
# similar to yfinance docs of tickers = yf.Tickers('msft aapl goog')
I get errors like:
AttributeError: 'Tickers' object has no attribute '000001.SS'
(I have also tried to run these into a for loop and pass each on to the Tickers object but get the same error.)
Im stuck now, I dont know how to pass in multiple tickers to yfinance and get back data that I want and the docs aren't very helpful.
Is anyone able to help me with this?
Could you not just store them in an array specifying the type as dtype object then use that pull the data from.
import yfinance as yf
import numpy as np
tickers = ['msft', 'aapl', 'goog']
totalPortfolio = np.empty([len(tickers)], dtype=object)
num = 0
for ticker in tickers:
totalPortfolio[num] = yf.download(ticker, start='2020-09-30', end='2020-10-31', interval="1d")
num = num + 1
Take a look at the code below:
test = yf.Tickers("A B C")
# creates test as a yf.tickers object
test_dict = test.tickers
# creates a dict object containing the individual tickers. Can be checked with type()
You are trying to use "tickers.tickers.MSFT.info" to retrieve the ticker data from your dictionary "tickers.tickers" but like your error message says, a dict object has no attributes named after your specific ticker names. This is in general not how you access elements in a dictionary.
Instead you should use the code as below (like with all dict objects):
#old code from above
test = yf.Tickers("A B C")
test_dict = test.tickers
#new code accessing the dict correctly
a_data = test_dict["A"]
a_data = test.tickers["A"] #does the same as the line above
b_data = test.tickers["B"] #and so on for the other tickers
In a loop this could look something like this:
ticker_list = ["A", "B", "C"] #add tickers as needed
tickers_data = {}
tickers_history = {}
for ticker in ticker_list:
tickers_data[ticker] = yf.Ticker(ticker)
tickers_history = tickers_data[ticker].history(period='1d', start='2020-09-30', end='2020-10-31')
#access the dicts as needed using tickers_data[" your ticker name "]
alternatively you can also use the "yf.Tickers" function to retrieve multiple tickers at once, but because you save the history seperately I don't think this will necessarily improve your code much.
You should pay attention however, that "yf.Ticker()" and "yf.Tickers()" are different functions from each other with differing syntax and are not interchangeable.
You did mix that up when you tried accessing multiple tickers with your custom "yfinance()" function, that has been previously defined with the "yf.Ticker()" function and thus only accepts one symbol at a time.
I am fairly new to python and coding in general.
I have a big data file that provides daily data for the period 2011-2018 for a number of stock tickers (300~).
The data is a .csv file with circa 150k rows and looks as follows (short example):
Date,Symbol,ShortExemptVolume,ShortVolume,TotalVolume
20110103,AAWW,0.0,28369,78113.0
20110103,AMD,0.0,3183556,8095093.0
20110103,AMRS,0.0,14196,18811.0
20110103,ARAY,0.0,31685,77976.0
20110103,ARCC,0.0,177208,423768.0
20110103,ASCMA,0.0,3930,26527.0
20110103,ATI,0.0,193772,301287.0
20110103,ATSG,0.0,23659,72965.0
20110103,AVID,0.0,7211,18896.0
20110103,BMRN,0.0,21740,213974.0
20110103,CAMP,0.0,2000,11401.0
20110103,CIEN,0.0,625165,1309490.0
20110103,COWN,0.0,3195,24293.0
20110103,CSV,0.0,6133,25394.0
I have a function that allows me to filter for a specific symbol and get 10 observations before and after a specified date (could be any date between 2011 and 2018).
import pandas as pd
from datetime import datetime
import urllib
import datetime
def get_data(issue_date, stock_ticker):
df = pd.read_csv (r'D:\Project\Data\Short_Interest\exampledata.csv')
df['Date'] = pd.to_datetime(df['Date'], format="%Y%m%d")
d = df
df = pd.DataFrame(d)
short = df.loc[df.Symbol.eq(stock_ticker)]
# get the index of the row of interest
ix = short[short.Date.eq(issue_date)].index[0]
# get the item row for that row's index
iloc_ix = short.index.get_loc(ix)
# get the +/-1 iloc rows (+2 because that is how slices work), basically +1 and -1 trading days
short_data = short.iloc[iloc_ix-10: iloc_ix+11]
return [short_data]
I want to create a script that iterates a list of 'issue_dates' and 'stock_tickers'. The list (a .csv) looks as following:
ARAY,07/08/2017
ARAY,24/04/2014
ACETQ,16/11/2015
ACETQ,16/11/2015
NVLNA,15/08/2014
ATSG,29/09/2017
ATI,24/05/2016
MDRX,18/06/2013
MDRX,18/06/2013
AMAGX,10/05/2017
AMAGX,14/02/2014
AMD,14/09/2016
To break down my problem and question I would like to know how to do the following:
First, how do I load the inputs?
Second, how do I call the function on each input?
And last, how do I accumulate all the function returns in one dataframe?
To load the inputs and call the function for each row; iterate over the csv file and pass each row's values to the function and accumulate the resulting Seriesin a list.
I modified your function a bit: removed the DataFrame creation so it is only done once and added a try/except block to account for missing dates or tickers (your example data didn't match up too well). The dates in the second csv look like they are day/month/year so I converted them for that format.
import pandas as pd
import datetime, csv
def get_data(df, issue_date, stock_ticker):
'''Return a Series for the ticker centered on the issue date.
'''
short = df.loc[df.Symbol.eq(stock_ticker)]
# get the index of the row of interest
try:
ix = short[short.Date.eq(issue_date)].index[0]
# get the item row for that row's index
iloc_ix = short.index.get_loc(ix)
# get the +/-1 iloc rows (+2 because that is how slices work), basically +1 and -1 trading days
short_data = short.iloc[iloc_ix-10: iloc_ix+11]
except IndexError:
msg = f'no data for {stock_ticker} on {issue_date}'
#log.info(msg)
print(msg)
short_data = None
return short_data
df = pd.read_csv (datafile)
df['Date'] = pd.to_datetime(df['Date'], format="%Y%m%d")
results = []
with open('issues.csv') as issues:
for ticker,date in csv.reader(issues):
day,month,year = map(int,date.split('/'))
# dt = datetime.datetime.strptime(date, r'%d/%m/%Y')
date = datetime.date(year,month,day)
s = get_data(df,date,ticker)
results.append(s)
# print(s)
Creating a single DataFrame or table for all that info may be problematic especially since the date ranges are all different. Probably should ask a separate question regarding that. Its mcve should probably just include a few minimal Pandas Series with a couple of different date ranges and tickers.
I have a 3 years dataset. I have split my dataset in days. now, I want to store each month's data in a separate list/variable.
SDD2=Restaurant[Restaurant.Item == ' Soft Drink '].groupby(pd.Grouper(key='Date',freq='D')).sum()
print(SDD2)
This a data which I get from above code now I want to store each month data in separate variable/list
You should store data into json format or csv format of each of month into file so it easily accessible from your python script.
For more information check python's module JSON and CSV.
You can just do df.groupby(pd.Grouper(key="Date", freq="M")) and then query on the groups to get your data with get_group('date') or optionally you could convert the grouped data to dict of lists with either .apply(list).to_dict() or dict(list(groups)).
Example:
import pandas as pd
import numpy as np
# create some random dates
start = pd.to_datetime('2018-01-01')
end = pd.to_datetime('2019-12-31')
start_u = start.value//10**9
end_u = end.value//10**9
date_range = pd.to_datetime(np.random.randint(start_u, end_u, 30), unit='s')
# convert to DF
df = pd.DataFrame(date_range, columns=["Date"])
# Add random data
df['Data'] = np.random.randint(0, 100, size=(len(date_range)))
# Format to y-m-d
df['Date'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d'))
print(df)
# group by month
grouped_df = df.groupby(pd.Grouper(key="Date", freq="M"))
# query the groups
print("\n\ngrouped data for feb 2018\n")
#print(grouped_df.get_group('2018-02-28'))
dict_of_list = dict(list(grouped_df))
feb_2018 = pd.Timestamp('2018-02-28')
if feb_2018 in dict_of_list:
print(dict_of_list[feb_2018])
I have a csv file with data every ~minute over 2 years, and am wanting to run code to calculate 24-hour averages. Ideally I'd like the code to iterate over the data, calculate averages and standard deviations, and R^2 between dataA and dataB, for every 24hr period and then output this new data into a new csv file (with datestamp and calculated data for each 24hr period).
The data has an unusual timestamp which I think might be tripping me up slightly. I've been trying different For Loops to iterate over the data, but I'm not sure how to specify that I want the averages,etc for each 24hr period.
This is the code I have so far, but I'm not sure how to complete the For Loop to achieve what I'm wanting. If anyone can help that would be great!
import math
import pandas as pd
import os
import numpy as np
from datetime import timedelta, date
# read the file in csv
data = pd.read_csv("Jacaranda_data_HST.csv")
# Extract the data columns from the csv
data_date = data.iloc[:,1]
dataA = data.iloc[:,2]
dataB = data.iloc[:,3]
# set the start and end dates of the data
start_date = data_date.iloc[0]
end_date = data_date.iloc[-1:]
# for loop to run over every 24 hours of data
day_count = (end_date - start_date).days + 1
for single_date in [d for d in (start_date + timedelta(n) for n in
range(day_count)) if d <= end_date]:
print np.mean(dataA), np.mean(dataB), np.std(dataA), np.std(dataB)
# output new csv file - **unsure how to call the data**
csvfile = "Jacaranda_new.csv"
outdf = pd.DataFrame()
#outdf['dataA_mean'] = ??
#outdf['dataB_mean'] = ??
#outdf['dataA_stdev'] = ??
#outdf['dataB_stdev'] = ??
outdf.to_csv(csvfile, index=False)
A simplified aproach could be to group by calendar day in a dict. I don't have much experience with pandas time management in DataFrames, so this could be an alternative.
You could create a dict where the keys are the dates of the data (without the time part), so you can later calculate the mean of all the data points that are under each key.
data_date = data.iloc[:,1]
data_a = data.iloc[:,2]
data_b = data.iloc[:,3]
import collections
dd_a = collections.defaultdict(list)
dd_b = collections.defaultdict(list)
for date_str, data_point_a, data_point_b in zip(data_date, data_a, data_b):
# we split the string by the first space, so we get only the date part
date_part, _ = date_str.split(' ', maxsplit=1)
dd_a[date_part].append(data_point_a)
dd_b[date_part].append(data_point_b)
Now you can calculate the averages:
for date, v_list in dd_a.items():
if len(v_list) > 0:
print(date, 'mean:', sum(v_list) / len(v_list))
for date, v_list in dd_b.items():
if len(v_list) > 0:
print(date, 'mean:', sum(v_list) / len(v_list))