I have a pandas dataframe where I want to loop over its rows, run and save the output, and if any error occurs, ignore it and move to the next row.
import pandas as pd
from nsepy import get_history  # can be installed with "pip install nsepy"
from datetime import date
data = {'script': ['SBIN = get_history(symbol="SBIN", start=date(1985,1,1), end=date(2022,1,31))',
                   'SAIL = get_history(symbol="SAIL", start=date(1985,1,1), end=date(2022,1,31))',
                   '20MICRONS = get_history(symbol="20MICRONS", start=date(1985,1,1), end=date(2022,1,31))',
                   'RELIANCE = get_history(symbol="RELIANCE", start=date(1985,1,1), end=date(2022,1,31))']}
df = pd.DataFrame(data)
Now I want to run each line one by one. I can do it like this:
#run each row
#1
SBIN = get_history(symbol="SBIN", start=date(1985,1,1), end=date(2022,1,31))
SBIN.to_csv('SBIN', sep="\t")
#2
SAIL = get_history(symbol="SAIL", start=date(1985,1,1), end=date(2022,1,31))
SAIL.to_csv('SAIL', sep="\t")
#3 (note: 20MICRONS is not a valid Python identifier, so this one fails regardless)
20MICRONS = get_history(symbol="20MICRONS", start=date(1985,1,1), end=date(2022,1,31))
20MICRONS.to_csv('20MICRONS', sep="\t")
#4
RELIANCE = get_history(symbol="RELIANCE", start=date(1985,1,1), end=date(2022,1,31))
RELIANCE.to_csv('RELIANCE', sep="\t")
But this is going to take a huge amount of time, so how can it be done with a for loop or a while loop?
Please note that I would like to run each row and save the output under the name extracted before the = sign of that row, for example "SBIN" for the first row. If there is an error on any line, ignore the error and move on to the next line (line 3 is going to return an error due to the unavailability of data).
As your process is I/O-bound, you can use threading to increase the speed.
You can try this:
import pandas as pd
from nsepy import get_history
from datetime import date
import concurrent.futures
history = {
    "SBIN": {"start": date(2021, 1, 1), "end": date(2022, 1, 31)},
    "SAIL": {"start": date(2021, 1, 1), "end": date(2022, 1, 31)},
    "20MICRONS": {"start": date(2021, 1, 1), "end": date(2022, 1, 31)},
    "RELIANCE": {"start": date(2021, 1, 1), "end": date(2022, 1, 31)},
}

def get_historical_data(symbol, /, **kwds):
    # fetch, save, and return the history of one symbol
    print(symbol)
    df = get_history(symbol, **kwds)
    df.to_csv(f'{symbol}.csv', sep='\t')
    return df

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    future_history = [
        executor.submit(get_historical_data, symbol, **kwds)
        for symbol, kwds in history.items()
    ]

data = []
for future in concurrent.futures.as_completed(future_history):
    data.append(future.result())

df = pd.concat(data)
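Note that the original question also asked to ignore symbols that raise an error (like 20MICRONS). A minimal sketch of the collection loop with that behaviour, assuming you simply want to log and skip failures:

data = []
for future in concurrent.futures.as_completed(future_history):
    try:
        data.append(future.result())
    except Exception as exc:
        # e.g. nsepy raising because no data is available -- skip and move on
        print(f"skipped: {exc}")

df = pd.concat(data)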
Related
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime
query = "elonmusk"
limit = 10000
start_date = datetime.datetime(2023, 1, 27)
end_date = datetime.datetime(2023, 1, 28)
tweets = sntwitter.TwitterSearchScraper(query).get_items()
index = 0
df = pd.DataFrame(columns=['Date','Username' ,'Tweet'])
for tweet in tweets:
    # filter by date
    if start_date.date() <= tweet.date.date() <= end_date.date():
        # hit the limit, so quit
        if index == limit:
            break
        df2 = {'Date': tweet.date, 'Username': tweet.user.username, 'Tweet': tweet.rawContent}
        df = pd.concat([df, pd.DataFrame.from_records([df2])])
        index = index + 1
    # past the start date, so quit
    elif start_date.date() > tweet.date.date():
        break

# Converting the time zone from UTC (note: 'Etc/GMT+8' is actually UTC-8
# because of POSIX sign inversion; use 'Etc/GMT-8' for GMT+8)
df['Date'] = df['Date'].dt.tz_convert('Etc/GMT+8')
print(df)
When I use snscrape it doesn't work. I have checked that the version is current (snscrape 0.5.0.20230113), but it still raises an error. Here is the error information:
Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=elonmusk&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=elonmusk&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.
I am trying to fetch Glue job executions that failed the previous day using the 'get_job_runs' function available through boto3's Glue client.
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_job_runs.
The request syntax does not have an option to filter executions or job runs by date/status:
response = client.get_job_runs(
    JobName='string',
    NextToken='string',
    MaxResults=123
)
The response I receive looks something like this:
{
    "JobRuns": [
        {
            "Id": "jr_89bfa55b544f7eec4f6ea574dfb0345345uhi4df65e59869e93c5d8f5efef989",
            "Attempt": 0,
            "JobName": "GlueJobName",
            "StartedOn": datetime.datetime(2021, 1, 27, 4, 32, 47, 718000, tzinfo=tzlocal()),
            "LastModifiedOn": datetime.datetime(2021, 1, 27, 4, 36, 14, 975000, tzinfo=tzlocal()),
            "CompletedOn": datetime.datetime(2021, 1, 27, 4, 36, 14, 975000, tzinfo=tzlocal()),
            "JobRunState": "FAILED",
            "Arguments": {
                "--additional-python-modules": "awswrangler",
                "--conf": "spark.executor.memory=40g",
                "--conf ": "spark.driver.memory=40g",
                "--enable-spark-ui": "true",
                "--extra-py-files": "s3://GlueJobName/lttb.py",
                "--job-bookmark-option": "job-bookmark-disable",
                "--spark-event-logs-path": "s3://GlueJobName/glue-script/spark-event-logs"
            },
            "ErrorMessage": "MemoryError: Unable to allocate xxxxx",
            "PredecessorRuns": [],
            "AllocatedCapacity": 8,
            "ExecutionTime": 199,
            "Timeout": 2880,
            "MaxCapacity": 8.0,
            "WorkerType": "G.2X",
            "NumberOfWorkers": 4,
            "LogGroupName": "/aws-glue/jobs",
            "GlueVersion": "2.0"
        }
    ],
    "NextToken": "string"
}
So, what I am doing now is looping through the response object to check whether the "CompletedOn" date matches yesterday's date (prev_day, calculated using datetime and timedelta), and I am doing this in a while loop to fetch the last 10000 executions, as a single 'get_job_runs' call is insufficient.
import logging
import boto3
from datetime import datetime, timedelta

logger = logging.getLogger()
logger.setLevel(logging.INFO)

glue_client = boto3.client("glue")

def filter_failed_exec_prev_day(executions, prev_day) -> list:
    filtered_resp = []
    for execution in executions['JobRuns']:
        if execution['JobRunState'] == 'FAILED' and execution['CompletedOn'].date() == prev_day:
            filtered_resp.append(execution)
    return filtered_resp

def get_final_executions() -> list:
    final_job_runs_list = []
    MAX_EXEC_SEARCH_CNT = 10000
    prev_day = (datetime.utcnow() - timedelta(days=1)).date()
    buff_exec_cnt = 0
    l_job = 'GlueJobName'
    response = glue_client.get_job_runs(
        JobName=l_job
    )
    resp_count = len(response['JobRuns'])
    if resp_count > 0:
        buff_exec_cnt += resp_count
        filtered_resp = filter_failed_exec_prev_day(response, prev_day)
        final_job_runs_list.extend(filtered_resp)
    while buff_exec_cnt <= MAX_EXEC_SEARCH_CNT:
        if 'NextToken' in response:
            response = glue_client.get_job_runs(
                JobName=l_job
            )
            buff_exec_cnt += len(response['JobRuns'])
            filtered_resp = filter_failed_exec_prev_day(response, prev_day)
            final_job_runs_list.extend(filtered_resp)
        else:
            logger.info(f"{l_job} executions list: {final_job_runs_list}")
            break
    return final_job_runs_list
Here, I am using a while loop to stop after hitting 10K executions; this is triple the number of executions we see on the job each day.
Now, I am hoping to break the while loop once I encounter an execution that belongs to prev_day - 1. Is it possible to search the response dict for prev_day - 1, to make sure all of the previous day's executions are covered, given the datetime.datetime object boto3 returns for the CompletedOn attribute?
Appreciate you reading through. Thank you.
I looked at your code, and I think it might always return the same result, as you're not iterating through the result set correctly.
here:
while buff_exec_cnt <= MAX_EXEC_SEARCH_CNT:
    if 'NextToken' in response:
        response = glue_client.get_job_runs(
            JobName=l_job
        )
you need to pass the NextToken value to the get_job_runs method, like this:
response = glue_client.get_job_runs(
    JobName=l_job, NextToken=response['NextToken']
)
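Putting it together, here is a minimal sketch of the corrected loop that also answers the follow-up about stopping early. It assumes get_job_runs returns runs newest-first, which is worth verifying against your jobs, and that runs still in progress may not have a CompletedOn value yet:

response = glue_client.get_job_runs(JobName=l_job)
while True:
    for run in response['JobRuns']:
        completed = run.get('CompletedOn')  # may be absent for runs still in progress
        if completed is not None and run['JobRunState'] == 'FAILED' and completed.date() == prev_day:
            final_job_runs_list.append(run)
    # newest-first ordering (assumption): once a page ends before prev_day, we are done
    oldest = response['JobRuns'][-1].get('CompletedOn') if response['JobRuns'] else None
    if oldest is not None and oldest.date() < prev_day:
        break
    if 'NextToken' not in response:
        break
    response = glue_client.get_job_runs(JobName=l_job, NextToken=response['NextToken'])

Alternatively, boto3 generally exposes a paginator for this call (glue_client.get_paginator('get_job_runs')), which handles NextToken for you.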
So I am trying to get multiple stock prices using pandas and pandas-datareader. If I only try to import one ticker it runs fine, but if I use more than one, an error arises. The code is:
import pandas as pd
import pandas_datareader as web
import datetime as dt
stocks = ['BA', 'AMD']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
However, I get the error:
ValueError: Wrong number of items passed 2, placement implies 1
So how do I get around it only allowing one stock to be passed?
So far I have tried using quandl and google instead, which don't work either. I have also tried pdr.get_data_yahoo, but I get the same result, and yf.download() gives the same issue too. Does anyone have any ideas to get around this? Thank you.
EDIT: Full code:
import pandas as pd
import pandas_datareader as web
import datetime as dt
import yfinance as yf
import numpy as np
stocks = ['BA', 'AMD', 'AAPL']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
d['sma50'] = np.round(d['Close'].rolling(window=2).mean(), decimals=2)
d['sma200'] = np.round(d['Close'].rolling(window=14).mean(), decimals=2)
d['200-50'] = d['sma200'] - d['sma50']
_buy = -2
d['Crossover_Long'] = np.where(d['200-50'] < _buy, 1, 0)
d['Crossover_Long_Change']=d.Crossover_Long.diff()
d['buy'] = np.where(d['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d['sell'] = np.where(d['Crossover_Long_Change'] == -1, 'sell', 'n/a')
pd.set_option('display.max_rows', 5093)
d.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d.dropna(inplace=True)
#make 2 dataframe
d.set_index(d['Adj Close'], inplace=True)
buy_price = d.index[d['Crossover_Long_Change']==1]
sell_price = d.index[d['Crossover_Long_Change']==-1]
d['Crossover_Long_Change'].value_counts()
profit_loss = (sell_price - buy_price)*10
commision = buy_price*.01
position_value = (buy_price + commision)*10
percent_return = (profit_loss/position_value)*100
percent_rounded = np.round(percent_return, decimals=2)
prices = {
"Buy Price" : buy_price,
"Sell Price" : sell_price,
"P/L" : profit_loss,
"Return": percent_rounded
}
df = pd.DataFrame(prices)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return'].sum(), decimals=2),
np.round(df['P/L'].sum(), decimals=2)))
d
I tried 3 stocks in your code and it returned data for all 3, so I'm not sure I understood the problem you're facing.
import pandas as pd
import pandas_datareader as web
import datetime as dt
stocks = ['BA', 'AMD', 'AAPL']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
print(d)
Output:
Attributes Adj Close Close ... Open Volume
Symbols BA AMD AAPL BA AMD AAPL ... BA AMD AAPL BA AMD AAPL
Date ...
2018-01-02 282.886383 10.980000 166.353714 296.839996 10.980000 172.259995 ... 295.750000 10.420000 170.160004 2978900.0 44146300.0 25555900.0
2018-01-03 283.801239 11.550000 166.324722 297.799988 11.550000 172.229996 ... 295.940002 11.610000 172.529999 3211200.0 154066700.0 29517900.0
2018-01-04 282.724396 12.120000 167.097290 296.670013 12.120000 173.029999 ... 297.940002 12.100000 172.539993 4171700.0 109503000.0 22434600.0
2018-01-05 294.322296 11.880000 168.999741 308.839996 11.880000 175.000000 ... 296.769989 12.190000 173.440002 6177700.0 63808900.0 23660000.0
2018-01-08 295.570740 12.280000 168.372040 310.149994 12.280000 174.350006 ... 308.660004 12.010000 174.350006 4124900.0 63346000.0 20567800.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2019-12-24 331.030457 46.540001 282.831299 333.000000 46.540001 284.269989 ... 339.510010 46.099998 284.690002 4120100.0 44432200.0 12119700.0
2019-12-26 327.968689 46.630001 288.442780 329.920013 46.630001 289.910004 ... 332.700012 46.990002 284.820007 4593400.0 57562800.0 23280300.0
2019-12-27 328.187408 46.180000 288.333313 330.140015 46.180000 289.799988 ... 330.200012 46.849998 291.119995 4124000.0 36581300.0 36566500.0
2019-12-30 324.469513 45.520000 290.044617 326.399994 45.520000 291.519989 ... 330.500000 46.139999 289.459991 4525500.0 41149700.0 36028600.0
2019-12-31 323.833313 45.860001 292.163818 325.760010 45.860001 293.649994 ... 325.410004 45.070000 289.929993 4958800.0 31673200.0 25201400.0
I think the error comes from your moving average, specifically the line
d['sma50'] = np.round(d['Close'].rolling(window=2).mean(), decimals=2)
because d represents 3 stocks: I think you have to separate each stock and compute the moving average separately.
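For example, since d['Close'] holds one column per symbol here, one option is to compute each moving average per ticker and assign it back under its own MultiIndex column. A rough sketch along those lines, keeping the window sizes from the question (untested against your exact pandas version):

for ticker in stocks:
    close = d[('Close', ticker)]
    d[('sma50', ticker)] = np.round(close.rolling(window=2).mean(), decimals=2)
    d[('sma200', ticker)] = np.round(close.rolling(window=14).mean(), decimals=2)
    d[('200-50', ticker)] = d[('sma200', ticker)] - d[('sma50', ticker)]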
EDIT: I tried something for two stocks only (BA and AMD), but it is not the best solution because I'm repeating myself for every line.
I'm just a beginner in Python, but maybe this will help you find a solution to your problem.
PS: The last part (the printing of the P&L and Return) doesn't work very well.
"
import pandas as pd
import pandas_datareader as web
import numpy as np
import datetime as dt

stock1 = ['BA']
stock2 = ['AMD']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d1 = web.DataReader(stock1, 'yahoo', start, end)
d2 = web.DataReader(stock2, 'yahoo', start, end)
d1['sma50'] = np.round(d1['Close'].rolling(window=2).mean(), decimals=2)
d2['sma50'] = np.round(d2['Close'].rolling(window=2).mean(), decimals=2)
d1['sma200'] = np.round(d1['Close'].rolling(window=14).mean(), decimals=2)
d2['sma200'] = np.round(d2['Close'].rolling(window=14).mean(), decimals=2)
d1['200-50'] = d1['sma200'] - d1['sma50']
d2['200-50'] = d2['sma200'] - d2['sma50']
_buy = -2
d1['Crossover_Long'] = np.where(d1['200-50'] < _buy, 1, 0)
d2['Crossover_Long'] = np.where(d2['200-50'] < _buy, 1, 0)
d1['Crossover_Long_Change']=d1.Crossover_Long.diff()
d2['Crossover_Long_Change']=d2.Crossover_Long.diff()
d1['buy'] = np.where(d1['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d2['buy'] = np.where(d2['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d1['sell_BA'] = np.where(d1['Crossover_Long_Change'] == -1, 'sell', 'n/a')
d2['sell_AMD'] = np.where(d2['Crossover_Long_Change'] == -1, 'sell', 'n/a')
pd.set_option('display.max_rows', 5093)
d1.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d2.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d2.dropna(inplace=True)
d1.dropna(inplace=True)
d1.set_index("Adj Close",inplace=True)
d2.set_index("Adj Close",inplace=True)
buy_price_BA = np.array(d1.index[d1['Crossover_Long_Change']==1])
buy_price_AMD = np.array(d2.index[d2['Crossover_Long_Change']==1])
sell_price_BA = np.array(d1.index[d1['Crossover_Long_Change']==-1])
sell_price_AMD = np.array(d2.index[d2['Crossover_Long_Change']==-1])
d1['Crossover_Long_Change'].value_counts()
d2['Crossover_Long_Change'].value_counts()
profit_loss_BA = (sell_price_BA - buy_price_BA)*10
profit_loss_AMD = (sell_price_AMD - buy_price_AMD)*10
commision_BA = buy_price_BA*.01
commision_AMD = buy_price_AMD*.01
position_value_BA = (buy_price_BA + commision_BA)*10
position_value_AMD = (buy_price_AMD + commision_AMD)*10
percent_return_BA = np.round(((profit_loss_BA/position_value_BA)*100),decimals=2)
percent_return_AMD = np.round(((profit_loss_AMD/position_value_AMD)*100),decimals=2)
prices_BA = {
"Buy Price BA" : [buy_price_BA],
"Sell Price BA" : [sell_price_BA],
"P/L BA" : [profit_loss_BA],
"Return BA": [percent_return_BA]}
df = pd.DataFrame(prices_BA)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return BA'].sum(), decimals=2),
np.round(df['P/L BA'].sum(), decimals=2)))
prices_AMD = {
"Buy Price AMD" : [buy_price_AMD],
"Sell Price AMD" : [sell_price_AMD],
"P/L AMD" : [profit_loss_AMD],
"Return AMD": [percent_return_AMD]}
df = pd.DataFrame(prices_AMD)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return AMD'].sum(), decimals=2),
np.round(df['P/L AMD'].sum(), decimals=2)))
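To avoid repeating every line per ticker, the same logic could be wrapped in a function and called once per symbol. A minimal sketch of that idea (analyse is a hypothetical helper name; the symbol is passed as a plain string so the columns stay flat):

def analyse(symbol, start, end, _buy=-2):
    d = web.DataReader(symbol, 'yahoo', start, end)  # single string -> flat columns
    d['sma50'] = np.round(d['Close'].rolling(window=2).mean(), decimals=2)
    d['sma200'] = np.round(d['Close'].rolling(window=14).mean(), decimals=2)
    d['200-50'] = d['sma200'] - d['sma50']
    d['Crossover_Long'] = np.where(d['200-50'] < _buy, 1, 0)
    d['Crossover_Long_Change'] = d.Crossover_Long.diff()
    return d

results = {symbol: analyse(symbol, start, end) for symbol in ['BA', 'AMD']}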
It seems like there's a bug in the pandas data reader. I work around it by initialising with one symbol and then setting the symbols property on the instantiated object. After doing that, it works fine to call read() on tmp below.
import pandas_datareader as pdr
all_symbols = ['ibb', 'xly', 'fb', 'exx1.de']
tmp = pdr.yahoo.daily.YahooDailyReader(symbols=all_symbols[0])
# this is a work-around, pdr is broken...
tmp.symbols = all_symbols
data = tmp.read()
I am extracting data from a REST API that I need to write to a SQL table.
My approach is to add the JSON data to a dictionary, pass the dictionary to a dataframe, and write the dataframe to SQL.
I get the following error when passing the balances() generator to the dataframe:
ValueError: If using all scalar values, you must pass an index
What am I doing wrong? Also feel free to provide feedback on the structure of my code, I feel like there are easier ways of extracting the data with less code.
def balances():
    for b in get_balances["balances"]:
        result = {}
        result["employeeID"] = int(b.get("employeeID"))
        result["resourceID"] = int(b.get("resourceID"))
        result["resourceType"] = int(b.get("resourceType"))
        if b.get("startDate") is None:
            pass
        else:
            result["startDate"] = b.get("startDate").split("#")[0]
        if b.get("endDate") is None:
            pass
        else:
            result["endDate"] = b.get("endDate").split("#")[0]
        result["minutesLeft"] = b.get("minutesLeft")
        result["minutestoTake"] = b.get("minutestoTake")
        result["minutesTaken"] = b.get("minutesTaken")
        result["minutesTakenPast"] = b.get("minutesTakenPast")
        result["minutestakenFuture"] = b.get("minutesTakenFuture")
        result["periodMinutesToTake"] = b.get("periodMinutesToTake")
        result["periodMinutesTaken"] = b.get("periodMinutesTaken")
        for h in b.get("history"):
            if h.get("planningDate") is None:
                pass
            else:
                result["planningDate"] = h.get("planningDate").split("#")[0]
            result["resourceTypeHistory"] = h.get("resourceType")
            result["resourceIDHistory"] = h.get("resourceID")
            result["minutes"] = h.get("minutes")
            result["balanceMinutes"] = h.get("balanceMinutes")
            result["remark"] = h.get("remark")
        yield result
print(pd.DataFrame(balances()))
#ValueError: If using all scalar values, you must pass an index
Sample output data of 2 rows:
{'employeeID': 569, 'resourceID': 230, 'resourceType': 144, 'startDate': '2020-01-01', 'endDate': '2020-12-31', 'minutesLeft': 11281, 'minutestoTake': None, 'minutesTaken': 960, 'minutesTakenPast': 0, 'minutestakenFuture': -960, 'periodMinutesToTake': 0, 'periodMinutesTaken': 0, 'planningDate': '2020-01-01', 'resourceTypeHistory': 15, 'resourceIDHistory': 3, 'minutes': 12000, 'balanceMinutes': 12000, 'remark': ''}
{'employeeID': 877, 'resourceID': 33, 'resourceType': 125, 'startDate': '2020-01-01', 'endDate': '2020-12-31', 'minutesLeft': 11281, 'minutestoTake': None, 'minutesTaken': 960, 'minutesTakenPast': 0, 'minutestakenFuture': -960, 'periodMinutesToTake': 0, 'periodMinutesTaken': 0, 'planningDate': '2020-06-05', 'resourceTypeHistory': 2, 'resourceIDHistory': 3, 'minutes': -480, 'balanceMinutes': 11281, 'remark': ''}
Works using json_normalize for your sample JSON:
import pandas as pd
import json
with open('1.json', 'r+') as f:
    data = json.load(f)

df = pd.json_normalize(data)
print(df)
employeeID resourceID resourceType startDate endDate ... resourceTypeHistory resourceIDHistory minutes balanceMinutes remark
0 569 230 144 2020-01-01 2020-12-31 ... 15 3 12000 12000
1 877 33 125 2020-01-01 2020-12-31 ... 2 3 -480 11281
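If your raw API response still contains the nested "history" list (rather than the pre-flattened rows above), json_normalize can expand it directly: one output row per history entry, with the parent fields carried along. A sketch assuming the payload shape described in the question:

df = pd.json_normalize(
    get_balances["balances"],      # the list of balance dicts from the API
    record_path="history",         # one row per history entry
    meta=["employeeID", "resourceID", "resourceType", "startDate", "endDate"],
    record_prefix="history.",
    errors="ignore",               # tolerate balances missing startDate/endDate
)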
I have two columns of data (sample data), and I want to calculate the total users for each weekday.
For instance, I'd want my output like this (dict/list, anything will do):
Monday: 25,
Tuesday: 30,
Wednesday:45,
Thursday: 50,
Friday:24,
Saturday:22,
Sunday:21
Here's my attempt:
def rider_ship(filename):
    with open('./data/Washington-2016-Summary.csv', 'r') as f_in:
        Sdict = []
        Cdict = []
        reader = csv.DictReader(f_in)
        for row in reader:
            if row['user_type'] == "Subscriber":
                if row['day_of_week'] in Sdict:
                    Sdict[row['day_of_week']] += 1
                else:
                    Sdict[row['day_of_week']] = row['day_of_week']
            else:
                if row['day_of_week'] in Cdict:
                    Cdict[row['day_of_week']] += 1
                else:
                    Cdict[row['day_of_week']] = row['day_of_week']
        return Sdict, Cdict
        print(Sdict)
        print(Cdict)

t = rider_ship('./data/Washington-2016-Summary.csv')
print(t)
TypeError: list indices must be integers or slices, not str
How about using pandas?
Let's first create a file-like object with the io library:
import io
s = u"""day_of_week,user_type
Monday,subscriber
Tuesday,customer
Tuesday,subscriber
Tuesday,subscriber"""
file = io.StringIO(s)
Now to the actual code:
import pandas as pd
df = pd.read_csv(file) # "path/to/file.csv"
Sdict = df[df["user_type"] == "subscriber"]["day_of_week"].value_counts().to_dict()
Cdict = df[df["user_type"] == "customer"]["day_of_week"].value_counts().to_dict()
Now we have:
Sdict = {'Tuesday': 2, 'Monday': 1}
Cdict = {'Tuesday': 1}
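For reference, the TypeError in the original attempt comes from initialising Sdict and Cdict as lists ([]) rather than dicts, so row['day_of_week'] gets used as a list index. A sketch of the csv-only version using collections.Counter, assuming the file layout from the question:

import csv
from collections import Counter

def rider_ship(filename):
    subscribers, customers = Counter(), Counter()
    with open(filename, 'r') as f_in:
        for row in csv.DictReader(f_in):
            if row['user_type'] == 'Subscriber':
                subscribers[row['day_of_week']] += 1
            else:
                customers[row['day_of_week']] += 1
    return subscribers, customers

Sdict, Cdict = rider_ship('./data/Washington-2016-Summary.csv')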