Currently I have a function which returns the stock ticker with the highest error for the entire data set. What I actually want is to return the stock ticker with the highest error for the current day.
Here is the current function:
@main.route('/api/highest/error')
def get_highest_error():
    """
    API which returns the highest stock error for the current day.
    :return: ticker of the stock matching the query.
    """
    sub = db.session.query(db.func.max(Stock.error).label('max_error')).subquery()
    stock = db.session.query(Stock).join(sub, sub.c.max_error == Stock.error).first()
    return stock.ticker
Here is what I attempted:
todays_stock = db.session.query(db.func.date(Stock.time_stamp) == date.today())
stock = todays_stock.filter(db.func.max(Stock.error))
return stock.ticker
Unfortunately this is operating on a BaseQuery which is not what I expected.
I also tried:
stock = Stock.query.filter(db.func.date(Stock.time_stamp) == date.today()).filter(db.func.max(Stock.error)).first()
But this generated an error with the message: aggregate functions are not allowed in WHERE
The error is pretty self-explanatory. You cannot use aggregate functions in the WHERE clause. If you have to eliminate group rows based on aggregates, use HAVING. But that's not what you need: to fetch the row with the greatest error, order by error in descending order and pick the first row:
stock = Stock.query.\
    filter(db.func.date(Stock.time_stamp) == date.today()).\
    order_by(Stock.error.desc().nullslast()).\
    first()
Unless you have a ridiculous amount of Stock per day, the sorting should be plenty fast. Note that db.func.date(Stock.time_stamp) == date.today() is not very index friendly, unless your DB supports functional indexes. Instead you could filter on a half open range:
today = date.today()
...
    filter(Stock.time_stamp >= today,
           Stock.time_stamp < today + timedelta(days=1)).\
I have an original function from pandas which worked perfectly for my use case, however it only worked on a small dataset. My current dataset is 50+MM rows, which Pandas is unable to handle.
As stated in the title, the goal of the function is to iterate through each row in a PySpark DataFrame and return the distinct count of users operating within +/-5 minutes of the timeframe identified in each row. E.g. if a user completes a task at 2021-02-04 12:44:33, I want to know the number of distinct individuals that also completed the same task within plus or minus 5 minutes of this user completing the task. This would need to be checked for every row in the dataframe.
My original Python code for pandas was as follows:
def workers(s):
    # get unique count of employee id working on process/function where
    # start_time is after amended_start and end_time is before amended_end.
    if s.tracking_type == 'indirect':
        balancedate = s.balancedate
        process = s.process_name
        function = s.function_name
        start_time = s.start_date_utc
        end_time = s.end_date_utc
        amended_start = s.start_date_utc - datetime.timedelta(minutes=5)
        amended_end = s.end_date_utc + datetime.timedelta(minutes=5)
        t = df2_mer[(df2_mer['balancedate'] == balancedate) &
                    (df2_mer['function_name'] == function) &
                    (df2_mer['process_name'] == process) &
                    (df2_mer['start_date_utc'] >= amended_start) &
                    (df2_mer['end_date_utc'] <= amended_end) &
                    (df2_mer['tracking_type'] == 'direct')].employee_id.nunique()
        return t
My Attempted PySpark Modification:
def workers(s):
    # get unique count of employee id working on process/function where
    # start_time is after amended_start and end_time is before amended_end.
    balancedate = s.balancedate
    process = s.process_name
    function = s.function_name
    amended_start = s.start_date_convert - f.expr('INTERVAL 5 MINUTES')
    amended_end = s.end_date_convert + f.expr('INTERVAL 5 MINUTES')
    t = df3.filter(f.col('balancedate') == balancedate)\
        .filter((f.col('function_name') == function) &
                (f.col('start_time_convert') > amended_start) &
                (f.col('end_time_convert') <= amended_end))\
        .select(countDistinct('employee_id'))
    return t
However, upon attempting to run the same/similar code in PySpark (edited for the filters), I am hit with a Could not serialize object error as a result of including a DataFrame in the function.
I am not sure how to proceed to achieve my objective, whether I can use the same methodology as in pandas or whether I need a completely different methodology.
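One way around the serialization error is to avoid calling a function that closes over the DataFrame and instead express the per-row lookup as a self-join plus aggregation. A rough sketch of that idea, reusing the column names from the attempt above (the exact join keys are assumptions):
from pyspark.sql import functions as f

# "left" holds the indirect rows to annotate, "right" the direct rows to count.
left = df3.filter(f.col('tracking_type') == 'indirect').alias('l')
right = df3.filter(f.col('tracking_type') == 'direct').alias('r')

joined = left.join(
    right,
    on=[
        f.col('l.balancedate') == f.col('r.balancedate'),
        f.col('l.process_name') == f.col('r.process_name'),
        f.col('l.function_name') == f.col('r.function_name'),
        f.col('r.start_date_convert') >= f.col('l.start_date_convert') - f.expr('INTERVAL 5 MINUTES'),
        f.col('r.end_date_convert') <= f.col('l.end_date_convert') + f.expr('INTERVAL 5 MINUTES'),
    ],
    how='left',
)

# One row per original indirect task, with the distinct worker count alongside it.
result = joined.groupBy(f.col('l.balancedate'), f.col('l.process_name'),
                        f.col('l.function_name'), f.col('l.start_date_convert'),
                        f.col('l.end_date_convert')) \
               .agg(f.countDistinct(f.col('r.employee_id')).alias('workers'))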
I am using this dataset for a project.
I am trying to find the total yield for each inverter for the 34-day duration of the dataset (basically use the final and initial value available for each inverter). I have been able to get the list of inverters using pd.unique() (there are 22 inverters for each solar power plant).
I am having trouble querying the total_yield data for each inverter.
Here is what I have tried:
def get_yields(arr: np.ndarray, df: pd.core.frame.DataFrame) -> np.ndarray:
    delta = np.zeros(len(arr))
    index = 0
    for i in arr:
        initial = df.loc[df["DATE_TIME"] == "15-05-2020 02:00"]
        initial = initial.loc[initial["INVERTER_ID"] == i]
        initial.reset_index(inplace=True, drop=True)
        initial = initial.at[0, "TOTAL_YIELD"]
        final = df.loc[df["DATE_TIME"] == "17-06-2020 23:45"]
        final = final.loc[final["INVERTER_ID"] == i]
        final.reset_index(inplace=True, drop=True)
        final = final.at[0, "TOTAL_YIELD"]
        delta[index] = final - initial
        index = index + 1
    return delta
Reference: arr is the array of inverters, listed below. df is the generation dataframe for each plant.
The problem is that not every inverter has a data point for each interval. This makes this function only work for the inverters at the first plant, not the second one.
My second approach was to filter by the inverter first, then take the first and last data points. But I get an error: 'Series' objects are mutable, thus they cannot be hashed
Here is the code for that so far:
def get_yields2(arr: np.ndarray, df: pd.core.frame.DataFrame) -> np.ndarry:
    delta = np.zeros(len(arr))
    index = 0
    for i in arr:
        initial = df.loc(df["INVERTER_ID"] == i)
        index += 1
        break
    return delta
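That second approach can work once .loc uses square brackets (calling it with parentheses is what raises the 'Series' objects are mutable error). A rough sketch, assuming DATE_TIME has been converted with pd.to_datetime so sorting is chronological:
import numpy as np
import pandas as pd

def get_yields2(arr: np.ndarray, df: pd.DataFrame) -> np.ndarray:
    delta = np.zeros(len(arr))
    for index, inverter_id in enumerate(arr):
        # Square brackets index .loc; parentheses call it and raise the error above.
        inverter = df.loc[df["INVERTER_ID"] == inverter_id].sort_values("DATE_TIME")
        # Difference between the last and first available readings for this inverter.
        delta[index] = inverter["TOTAL_YIELD"].iloc[-1] - inverter["TOTAL_YIELD"].iloc[0]
    return delta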
List of inverters at plant 1 for reference(labeled as SOURCE_KEY):
['1BY6WEcLGh8j5v7' '1IF53ai7Xc0U56Y' '3PZuoBAID5Wc2HD' '7JYdWkrLSPkdwr4'
'McdE0feGgRqW7Ca' 'VHMLBKoKgIrUVDU' 'WRmjgnKYAwPKWDb' 'ZnxXDlPa8U1GXgE'
'ZoEaEvLYb1n2sOq' 'adLQvlD726eNBSB' 'bvBOhCH3iADSZry' 'iCRJl6heRkivqQ3'
'ih0vzX44oOqAx2f' 'pkci93gMrogZuBj' 'rGa61gmuvPhdLxV' 'sjndEbLyjtCKgGv'
'uHbuxQJl8lW7ozc' 'wCURE6d3bPkepu2' 'z9Y9gH1T5YWrNuG' 'zBIq5rxdHJRwDNY'
'zVJPv84UY57bAof' 'YxYtjZvoooNbGkE']
List of inverters at plant 2:
['4UPUqMRk7TRMgml' '81aHJ1q11NBPMrL' '9kRcWv60rDACzjR' 'Et9kgGMDl729KT4'
'IQ2d7wF4YD8zU1Q' 'LYwnQax7tkwH5Cb' 'LlT2YUhhzqhg5Sw' 'Mx2yZCDsyf6DPfv'
'NgDl19wMapZy17u' 'PeE6FRyGXUgsRhN' 'Qf4GUc1pJu5T6c6' 'Quc1TzYxW2pYoWX'
'V94E5Ben1TlhnDV' 'WcxssY2VbP4hApt' 'mqwcsP2rE7J0TFp' 'oZ35aAeoifZaQzV'
'oZZkBaNadn6DNKz' 'q49J1IKaHRwDQnt' 'rrq4fwE8jgrTyWY' 'vOuJvMaM2sgwLmb'
'xMbIugepa2P7lBB' 'xoJJ8DcxJEcupym']
Thank you very much.
I can't download the dataset to test this; I'm getting a "Too Many Requests" error.
However, you should be able to do this with a groupby.
import pandas as pd
result = df.groupby('INVERTER_ID')['TOTAL_YIELD'].agg(['max','min'])
result['delta'] = result['max']-result['min']
print(result[['delta']])
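If TOTAL_YIELD can ever reset (so max/min and last/first would differ), a variant of the same groupby that takes the literal first and last readings per inverter; this assumes the day-first DATE_TIME strings parse cleanly:
import pandas as pd

# Parse the day-first timestamps so the sort is chronological, then take first/last per inverter.
df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], dayfirst=True)
ordered = df.sort_values('DATE_TIME')
result = ordered.groupby('INVERTER_ID')['TOTAL_YIELD'].agg(['first', 'last'])
result['delta'] = result['last'] - result['first']
print(result[['delta']])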
So if I'm understanding this right, what you want is the change in TOTAL_YIELD for each inverter between the start of the period, 15-05-2020 02:00, and the end, 17-06-2020 23:45. Try this:
# enumerate lets you have an index value along with iterating through the array
for i, code in enumerate(arr):
    # filter the info to between the two dates, but not necessarily assuming that
    # each inverter's data starts and ends at each date
    inverter_df = df.loc[df['DATE_TIME'] >= pd.to_datetime('15-05-2020 02:00:00')]
    inverter_df = inverter_df.loc[inverter_df['DATE_TIME'] <= pd.to_datetime('17-06-2020 23:45:00')]
    inverter_df = inverter_df.loc[inverter_df["INVERTER_ID"] == code]
    # sort by date
    inverter_df = inverter_df.sort_values(by='DATE_TIME')
    # grab TOTAL_YIELD at the first available date
    initial = inverter_df['TOTAL_YIELD'].iloc[0]
    # grab TOTAL_YIELD at the last available date
    final = inverter_df['TOTAL_YIELD'].iloc[-1]
    delta[i] = final - initial
I'm attempting to determine which loans in a loan portfolio exceed the FHFA County Loan Limit to project impact of upcoming law changes for a study. I've had versions of the code work with a small (14k loans) sample set, but when importing the full portfolio (5.6m) the code does not work. I'm definitely pretty new to Python, my experience is limited to SAS and R, and that's admittedly rusty.
As I don't have access to live data, I'm importing the data with a chunksize of 5k, which has alleviated the memory issues, and I've imported the loan limit data from the FHFA website and created a dictionary keyed by year, state, and county code.
I also used pd.to_datetime() and a .notnull() in an attempt to remove nulls from the data and county fields.
def loan_calculation_new(row):
    year = row['PROCESSED_DATE'].year
    if row['PROCESSED_DATE'].month > 9:
        year += 1
    state_dict = year_dict[year]
    if row['FIPS_STATE_CODE'] not in state_dict:
        print("No State Code")
        return None
    county_dict = state_dict[row['FIPS_STATE_CODE']]
    if row['FIPS_COUNTY_CODE'] not in county_dict:
        limit = 485300
        return
    limit = county_dict[row['FIPS_COUNTY_CODE']]
    limit > row['MTGE_LOAN_AMOUNT'].astype(int)
I keep getting this error when trying to run the calculation:
AttributeError: ("'str' object has no attribute 'year'", 'occurred at index 0')
I'm wondering if the issue is with my data being pipe delimited and not being interpreted as a date. The sample was a .csv and seemed to work.
It seems the column PROCESSED_DATE is a string, so you need to convert it to datetime.
If the rows come from a DataFrame, you can do it for the whole column:
df['PROCESSED_DATE'] = pd.to_datetime(df['PROCESSED_DATE'])
from datetime import datetime

def loan_calculation_new(row):
    processed_date = datetime.strptime(row['PROCESSED_DATE'], "<EXPECTED FORMAT>")
    year = processed_date.year
    if processed_date.month > 9:
        year += 1
    ...
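Alternatively, since the full file is pipe delimited and read in chunks, the conversion can happen once at import time. A sketch, with the file name and the over_limit usage as placeholders:
import pandas as pd

# 'portfolio.txt' is a placeholder; parse_dates converts PROCESSED_DATE while reading.
chunks = pd.read_csv('portfolio.txt', sep='|', chunksize=5000,
                     parse_dates=['PROCESSED_DATE'])
for chunk in chunks:
    chunk = chunk[chunk['PROCESSED_DATE'].notnull()]
    chunk['over_limit'] = chunk.apply(loan_calculation_new, axis=1)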
I'm currently creating a simple calendar for one of my Django projects. The calendar will display the current month and its days. Any day which has an item will be highlighted in red so that the user knows there are items for that day. The number of items, or which items they are, doesn't matter; all we care about is whether a day has items.
Let's say I have the following model.
class Items(models.Model):
    name = models.CharField(max_length=140)
    datetime = models.DateTimeField(auto_now_add=False)

    def save(self, *args, **kwargs):
        if self.datetime is None:
            self.datetime = datetime.now()
        super(Items, self).save(*args, **kwargs)
Here is my current logic for finding which days have items:
from calendar import monthrange

# Find number of days for June 2015
num_days = monthrange(2015, 6)[1]
days_with_items = []

'''
Increase num_days by 1 so that the last day of the month
is included in the range
'''
num_days += 1

for day in range(1, num_days):
    has_items = Items.objects.filter(datetime__day=day,
                                     datetime__month=6,
                                     datetime__year=2015).exists()
    if has_items:
        days_with_items.append(day)
return days_with_items
This returns me a list with all the days that have items. This works, however I'm looking for a more efficient way of doing it, since Django makes a separate trip to the DB for each .exists() call.
Any suggestions?
I see two possible options. The first one is to count at the DB level, the second is an efficient loop over the available data at the Python level. Depending on data size and DB efficiency you can choose which suits you best.
Counting in the database is explained here:
Django ORM, group by day
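On newer Django versions (ExtractDay is available from 1.10 onwards), that DB-level counting boils down to something like this sketch:
from django.db.models import Count
from django.db.models.functions import ExtractDay

# One row per day that has items, grouped and counted in the database.
day_counts = (Items.objects
              .filter(datetime__month=6, datetime__year=2015)
              .annotate(day=ExtractDay('datetime'))
              .values('day')
              .annotate(n=Count('id')))
days_with_items = [row['day'] for row in day_counts]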
Or solution two, a simple script (not so elegant, but just as an example):
days_with_items_hash = {}
items = Items.objects.filter(
    datetime__month=6,
    datetime__year=2015
)
for item in items:
    days_with_items_hash[item.datetime.day] = True
days_with_items = days_with_items_hash.keys()
I would stick with the database solution because it can be optimised (sql views, extra column with just the day, etc)
First, let's get all the items for the required month.
items = Items.objects.filter(datetime__month=6, datetime__year=2015)
days = set([item.datetime.day for item in items]) # unique days
If you want to make a partial query, specify values you need, here's the concept:
items = Item.objects.filter(
    date_added__month=6, date_added__year=2015
).values('date_added')
days = set([item['date_added'].day for item in items])
It will result in the following SQL query:
QUERY = u'SELECT "main_item"."date_added" FROM "main_item" WHERE
(django_datetime_extract(\'month\', "main_item"."date_added", %s) = %s
AND "main_item"."date_added" BETWEEN %s AND %s)'
PARAMS = (u"'UTC'", u'6', u'datetime.datetime(2015, 1, 1, 0, 0, tzinfo=<UTC>)',
u'datetime.datetime(2015, 12, 31, 23, 59, 59, 999999, tzinfo=<UTC>)')
If you are dealing with a big amount of Items, you can break your query into parts (< 15 and >= 15, for example). This will result in an extra database hit, but the memory usage peak will be smaller. You can also consider different methods.
Please also note:
that datetime is not the best name for a field. Name it meaningfully, like: "date_added", "date_created" or something like that
if self.datetime is None is 'almost' equal to if not self.datetime
Use the dates method.
items = Item.objects.filter(date_added__month=6, date_added__year=2015)
dates = items.dates('date_added', 'day') # returns a queryset of datetimes
days = [d.day for d in dates] # convert to a list of days
Though this may be elementary, for me it's proving beyond my level. My thanks go out beforehand.
What I want to achieve: I query a row of data, and when the relevant field in the LiveRoutes table is inactive I calculate the distance between two sets of longitude and latitude values using the haversine formula. What I am wary of is that this calculation may take time, so for a larger number of rows in the table I will not be able to display the data on time. So I thought I would save the result of the calculation in another table and fetch the data to display from that table.
I will perform the calculation when a value in the table becomes inactive.
Here is the Django code which tells whether the row is active or inactive for a route in the LiveRoutes model class.
class LiveRoutes(models.Model):
    user = models.ForeignKey(User)
    route = models.ForeignKey(UserRoutes)
    status = models.ForeignKey(LiveRoutesStatus)
    traveller = models.ManyToManyField(LiveRouteTravellers)
    datetime = models.DateTimeField()

    def __unicode__(self):
        return self.route.__unicode__()

    def isActive(self):
        utc = pytz.utc
        os.environ['TZ'] = 'UTC'
        local = pytz.timezone("Asia/Calcutta")
        now = utc.localize(datetime.datetime.today())
        now = now.astimezone(local)
        time_delta = (local.localize(self.datetime.replace(tzinfo=None)) +
                      datetime.timedelta(minutes=self.route.journey_time_day)) - now
        if time_delta.days == -1 and (24 - (time_delta.seconds / 3600)) <= 2:
            return True
        elif time_delta.days >= 0:
            return True
        else:
            return False
Based on this value from the isActive function, I wanted to perform the calculation as follows:
def carbonFootPrint(request):
    if request.method != "GET":
        raise Http404
    routes = LiveRoutes.objects.all()
    routeDetailArr = []
    for lroute in routes:
        routeDetail = dict()
        if lroute.isActive() == False:
            # Now I need to find out the start location and end location for the
            # journey and the number of travellers.
            routeDetail['travellers'] = lroute.traveller.all().count()
            routeDetail['start_loc_lat'] = lroute.route.start_location.latitude
            routeDetail['start_loc_long'] = lroute.route.start_location.longitude
            routeDetail['end_loc_lat'] = lroute.route.end_location.latitude
            routeDetail['end_loc_long'] = lroute.route.end_location.longitude
            routeDetail['distance'] = haversine(routeDetail['start_loc_lat'],
                                                routeDetail['start_loc_long'],
                                                routeDetail['end_loc_lat'],
                                                routeDetail['end_loc_long'])
            routeDetailArr.append(routeDetail)
My problem is how to insert all this data into another table, so that later on I can fetch those values.
Thanks, any advice will be highly appreciated.
To store the values in the session, it would be like this:
request.session['travellers'] = lroute.traveller.all().count()
# other session values here
To get and use the data, it must be:
travellers = request.session.get('travellers')
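For the original goal of saving the computed values into another table, one option is a small results model written to inside the loop; the model and field names below are invented for illustration:
# Hypothetical model; adjust the fields to what you actually need to display later.
class RouteFootprint(models.Model):
    route = models.ForeignKey(LiveRoutes)
    travellers = models.IntegerField()
    distance = models.FloatField()

# Inside carbonFootPrint, after routeDetail is filled in:
RouteFootprint.objects.create(
    route=lroute,
    travellers=routeDetail['travellers'],
    distance=routeDetail['distance'],
)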