The following code plots two separate scatterplots (Scattermapbox traces) using Plotly. I want to combine the points from both subplots into a single legend. However, if I plot the figure as is, there are duplicate entries; on the other hand, if I hide the legend for one of the subplots, not all entries are displayed.
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df = pd.DataFrame({'Type' : ['1','1','1','1','1','2','2','2','2','2'],
'Category' : ['A','D','D','D','F','B','D','A','D','E']
})
df['Color'] = df['Category'].map(dict(zip(df['Category'].unique(),
px.colors.qualitative.Dark24[:len(df['Category'].unique())])))
df = pd.concat([df]*10, ignore_index = True)
df['Lat'] = np.random.randint(0, 20, 100)
df['Lon'] = np.random.randint(0, 20, 100)
Color = df['Color'].unique()
Category = df['Category'].unique()
cats = dict(zip(Color, Category))
df_type_1 = df[df['Type'] == '1'].copy()
df_type_2 = df[df['Type'] == '2'].copy()
fig = make_subplots(
rows = 1,
cols = 2,
specs = [[{"type": "scattermapbox"}, {"type": "scattermapbox"}]],
vertical_spacing = 0.05,
horizontal_spacing = 0.05
)
for c in df_type_1['Color'].unique():
df_color = df_type_1[df_type_1['Color'] == c]
fig.add_trace(go.Scattermapbox(
lat = df_color['Lat'],
lon = df_color['Lon'],
mode = 'markers',
name = cats[c],
marker = dict(color = c),
opacity = 0.8,
#legendgroup = 'group2',
#showlegend = True,
),
row = 1,
col = 1
)
for c in df_type_2['Color'].unique():
df_color = df_type_2[df_type_2['Color'] == c]
fig.add_trace(go.Scattermapbox(
lat = df_color['Lat'],
lon = df_color['Lon'],
mode = 'markers',
name = cats[c],
marker = dict(color = c),
opacity = 0.8,
#legendgroup = 'group2',
#showlegend = False,
),
row = 1,
col = 2
)
fig.update_layout(height = 600, width = 800, margin = dict(l = 10, r = 10, t = 30, b = 10));
fig.update_layout(mapbox1 = dict(zoom = 2, style = 'carto-positron'),
mapbox2 = dict(zoom = 2, style = 'carto-positron'),
)
fig.show()
output: duplicate entries
If I use showlegend = False on one of the subplots, the legend no longer shows all applicable entries.
output: (subplot 2 showlegend = False)
The most practical way to remove duplicate legend entries at the moment is to collect the names already shown in a set() and hide the legend for any later trace with the same name; I keep this as a snippet, taken from this answer. I have also changed the code to use the color information stored in the columns, and restructured it into a single loop so that no extra data frames are created.
fig = make_subplots(
rows = 1,
cols = 2,
specs = [[{"type": "scattermapbox"}, {"type": "scattermapbox"}]],
vertical_spacing = 0.05,
horizontal_spacing = 0.05
)
for t in df['Type'].unique():
    dff = df.query('Type == @t')
for c in dff['Category'].unique():
        dffc = dff.query('Category == @c')
fig.add_trace(go.Scattermapbox(
lat = dffc['Lat'],
lon = dffc['Lon'],
mode = 'markers',
name = c,
marker = dict(color = dffc['Color']),
opacity = 0.8,
),
row = 1,
col = int(t)
)
fig.update_layout(height = 600, width = 800, margin = dict(l = 10, r = 10, t = 30, b = 10));
fig.update_layout(mapbox1 = dict(zoom = 2, style = 'carto-positron'),
mapbox2 = dict(zoom = 2, style = 'carto-positron'),
)
names = set()
fig.for_each_trace(
lambda trace:
trace.update(showlegend=False)
if (trace.name in names) else names.add(trace.name))
fig.show()
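If you need this in more than one figure, the same pass wraps naturally into a small helper (just a convenience sketch around the snippet above, not part of the original answer):
def hide_duplicate_legend_entries(fig):
    #hide every trace whose name has already been seen once
    seen = set()
    fig.for_each_trace(
        lambda trace: trace.update(showlegend = False)
        if (trace.name in seen) else seen.add(trace.name))
    return fig
Call it right before fig.show().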
I have written code to create a data table and save it as an image. Here is the code:
import datetime
import six
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_avg = data_rtm_market.groupby('date').mean()
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(1,8)]
dtr = [x.strftime("%d-%m-%Y") for x in date_list]
df_avg.reset_index(inplace=True)
last_7_day = df_avg[df_avg['date'].isin(dtr)]
data_rtm_market.date = pd.to_datetime(data_rtm_market.date,format="%d-%m-%Y")
dam_market.date = pd.to_datetime(dam_market.date,format="%d-%m-%Y")
final_ = pd.merge(data_rtm_market,dam_market,how='inner', on=['date','block'])
df = final_
df[['total_purchase_bid','total_sell_bid','total_cleared_vol']] = df.groupby('date')[['purchase_bid', 'sell_bid', 'cleared_volume']].transform('sum')
df[['max_mcp_rtm', 'max_mcp_dam']] = final_.groupby('date')[['mcp_x','mcp_y']].transform('max')
df[['avg_mcp_rtm','avg_mcp_dam']] = final_.groupby('date')[['mcp_x','mcp_y']].transform('mean')
df[['min_mcp_rtm','min_mcp_dam']] = final_.groupby('date')[['mcp_x','mcp_y']].transform('min')
summary = df[['date','total_purchase_bid',
'total_sell_bid', 'total_cleared_vol', 'max_mcp_rtm', 'max_mcp_dam',
'avg_mcp_rtm', 'avg_mcp_dam', 'min_mcp_rtm', 'min_mcp_dam']]
table = summary.drop_duplicates(keep='first')
table1 = table.tail(8)
table1 = table1.iloc[:-1]
col3 = table1[['total_purchase_bid',
'total_sell_bid', 'total_cleared_vol']].apply(lambda x:round((x/4000),2))
col4 = table1[['max_mcp_rtm', 'max_mcp_dam',
'avg_mcp_rtm', 'avg_mcp_dam', 'min_mcp_rtm', 'min_mcp_dam']].apply(lambda x:round(x,2))
final_temp = pd.concat([table1['date'],col3, col4], axis=1, sort=False)
final_temp['date'] = final_temp['date'].dt.strftime('%d-%m-%Y')
final_temp = final_temp.set_index('date').T
final_temp.reset_index(inplace=True,drop=True)
final_temp.insert(0,'1', ["Volume(MUs)","",""," Price(Rs/kWh)","","Price(Rs/kWh)","","Price(Rs/kWh)",""])
final_temp.insert(1,'2', ["Buy ","Sell ","Cleared","Max RTM","Max DAM","Avg RTM","Avg DAM","Min RTM","Min DAM"])
def render_mpl_table(data, col_width=3.0, row_height=0.825, font_size=26,
header_color='#5DADE2', row_colors=['#f1f1f2', 'w'], edge_color='black',
bbox=[0, 0, 1, 1], header_columns=0,
ax=None, **kwargs):
if ax is None:
size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
fig, ax = plt.subplots(figsize=size)
ax.axis('off')
mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)
mpl_table.auto_set_font_size(False)
mpl_table.set_fontsize(font_size)
for k, cell in six.iteritems(mpl_table._cells):
cell.set_edgecolor(edge_color)
if k[0] == 0 or k[1] < header_columns:
cell.set_text_props(weight='bold', color='white')
cell.set_facecolor(header_color)
else:
cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
return fig
final_temp.columns.values[0] = ""
final_temp.columns.values[1] = ""
fig = render_mpl_table(final_temp, header_columns=0, col_width=4)
fig.savefig("/content/assets/summary_table.jpeg",bbox_inches='tight')
This gives me the data table shown below.
I want to use conditional formatting in the cells such that, for example, if Max RTM is greater than Max DAM, the cell background or cell text turns green, and if Max RTM is less than Max DAM, it turns red. Is there any way of doing this?
Also, how can I merge the text in the first column?
You can do the following if you transpose your table:
import numpy as np
import pandas as pd
#Initialize random dataframe
df_test = pd.DataFrame(np.random.normal(size = (10,2)),
columns=['A', 'B'])
#Style functions
def compare_columns(data, column1, column2, color1, color2):
attr = 'background-color: {}'
if data[column1] > data[column2]:
return [attr.format(color1) for s in data]
elif data[column1] <= data[column2]:
return [attr.format(color2) for s in data]
df_test.style.apply(compare_columns, column1 = 'A',
column2 = 'B', color1 = 'red',
color2 = 'green', axis=1)
Output:
See answer here (Coloring Cells in Pandas) and pandas docs (https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html) for more details.
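To color only the compared cell rather than the whole row, and to target the actual columns from your table, a variant like the following should work (a sketch; the column names max_mcp_rtm and max_mcp_dam are taken from your summary frame):
def highlight_max_rtm(row):
    #one style string per column; only the compared cell gets a color
    styles = ['' for _ in row.index]
    color = 'green' if row['max_mcp_rtm'] > row['max_mcp_dam'] else 'red'
    styles[row.index.get_loc('max_mcp_rtm')] = 'color: {}'.format(color)
    return styles

styled = summary.style.apply(highlight_max_rtm, axis=1)
Note that Styler formatting only shows up in HTML output; if you keep the matplotlib image from render_mpl_table, you would port the same comparison into its cell loop using cell.set_text_props or cell.set_facecolor.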
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('D:\ history/segment.csv')
data = pd.DataFrame(data)
data = data.sort_values(['Prob_score'], ascending=[False])
one = len(data)
actualpaid_overall = len(data.loc[data['paidstatus'] == 1])
data_split = np.array_split(data, 10)
data1 = data_split[0]
actualpaid_ten = len(data1.loc[data1['paidstatus'] == 1])
percent_ten = actualpaid_ten/actualpaid_overall
data2 = data_split[1]
actualpaid_twenty = len(data2.loc[data2['paidstatus'] == 1])
percent_twenty = (actualpaid_twenty/actualpaid_overall) + percent_ten
data3 = data_split[2]
actualpaid_thirty = len(data3.loc[data3['paidstatus'] == 1])
percent_thirty = (actualpaid_thirty/actualpaid_overall) + percent_twenty
data4 = data_split[3]
actualpaid_forty = len(data4.loc[data4['paidstatus'] == 1])
percent_forty = (actualpaid_forty/actualpaid_overall) + percent_thirty
data5 = data_split[4]
actualpaid_fifty = len(data5.loc[data5['paidstatus'] == 1])
percent_fifty = (actualpaid_fifty/actualpaid_overall) + percent_forty
data6 = data_split[5]
actualpaid_sixty = len(data6.loc[data6['paidstatus'] == 1])
percent_sixty = (actualpaid_sixty/actualpaid_overall) + percent_fifty
data7 = data_split[6]
actualpaid_seventy = len(data7.loc[data7['paidstatus'] == 1])
percent_seventy = (actualpaid_seventy/actualpaid_overall) + percent_sixty
data8 = data_split[7]
actualpaid_eighty = len(data8.loc[data8['paidstatus'] == 1])
percent_eighty = (actualpaid_eighty/actualpaid_overall) + percent_seventy
data9 = data_split[8]
actualpaid_ninenty = len(data9.loc[data9['paidstatus'] == 1])
percent_ninenty = (actualpaid_ninenty/actualpaid_overall) + percent_eighty
data10 = data_split[9]
actualpaid_hundred = len(data10.loc[data10['paidstatus'] == 1])
percent_hundred = (actualpaid_hundred/actualpaid_overall) + percent_ninenty
array_x = [10,20,30,40,50,60,70,80,90,100]
array_y = [ percent_ten, percent_twenty, percent_thirty, percent_forty,percent_fifty, percent_sixty, percent_seventy, percent_eighty, percent_ninenty, percent_hundred]
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
The above is my Python code. I have split my dataframe into 10 equal sections and plotted the graph, but I'm not satisfied with it and have two concerns:
array_x = [10,20,30,40,50,60,70,80,90,100] - in this line I entered the x values manually. Since I used np.array_split(data, 10), is there a way to produce these 10 values automatically?
As you can see, the same logic is repeated for data1, data2, ..., data10. Is there a way to write this as a function or loop?
Any help with code will be appreciated. Thanks.
I believe you need list comprehensions. For the count there is a simpler way - the sum of a boolean mask, since True values are counted as 1. Then convert the lists to numpy arrays and use numpy.cumsum:
data = pd.read_csv('D:\ history/segment.csv')
data = data.sort_values('Prob_score', ascending=False)
one = len(data)
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
Sample:
np.random.seed(2019)
N = 1000
data = pd.DataFrame({'paidstatus':np.random.randint(3, size=N),
'Prob_score':np.random.randint(100, size=N)})
#print (data)
data = data.sort_values(['Prob_score'], ascending=[False])
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
print (array_x)
[ 100 200 300 400 500 600 700 800 900 1000]
print (array_y)
[0.09118541 0.18844985 0.27963526 0.38601824 0.49848024 0.61702128
0.72036474 0.81155015 0.9331307 1. ]
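If you would rather keep the x-axis as the percentage buckets 10, 20, ..., 100 from the question instead of cumulative row counts, you can rescale the cumulative counts (a small addition on top of the answer):
array_x_pct = 100 * array_x / array_x[-1]
#with the sample above this gives [10. 20. 30. 40. 50. 60. 70. 80. 90. 100.]
plt.plot(array_x_pct, array_y)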
I am new to Python, but I hope I can explain the issue.
dfrow is a dictionary holding a single regression summary.
results is an empty dataframe with the same columns as dfrow.
I would like to save the regression results for each observation in the outer loop, while keeping the column order consistent in the inner loop. I get a result for the first observation but cannot move further; the error says:
Traceback (most recent call last):
File "<stdin>", line 109, in <module>
TypeError: 'numpy.int64' object is not iterable
when I run this code
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.stats import stattools as st
import statsmodels.api as sm
import collections
import datetime
import warnings
import scipy.stats
df_rent = import_rents()
df_return = import_ee_rets()
mostrecent = df_return.iloc[len(df_return) - 1]
mostrecentYYYY = mostrecent['Year']
mostrecentQ = mostrecent['Quarter']
mostrecentperiod = str(mostrecentYYYY) + "-Q" + str(mostrecentQ)
rentcols = df_rent.columns.values
colnames = []
#loop through the columns in df_rent until the column == the most recent period for which we have ee return data
for colname in rentcols:
if colname != mostrecentperiod:
colnames.append(colname)
else:
colnames.append(colname)
break
rentcols = colnames
#subset df_rent to only include columns that also have ee return data
df_rent = df_rent[rentcols]
#change dtype of metro_code / metro columns to string for matching later
df_rent['metro_code'] = df_rent['metro_code'].apply(str)
df_return['Metro'] = df_return['Metro'].apply(str)
df = pd.read_csv('//x/Project/_data/raw_data/rent_change.csv')
metros = list(np.unique(df['metro_code']))
regress_result_names = [
'metro',
'num_lag',
'num_ma',
'num_AR',
'beta_x1_retmov',
'x1_se',
'x1_tstat',
'x1_pval',
'r-squared',
'reg_fstat',
'fstat_pvalue',
'durbin-watson',
'resid_var']
regress_result_names = pd.Series(regress_result_names)
results = pd.DataFrame(columns=regress_result_names)
row = 0
for metro in metros:
for nlag in range(0, 5):
for nma in range(1, 11):
for AR in range(1, 5):
y = df_rent[df_rent['metro_code'] == str(metro)]
y = y.values.tolist()
y = y[0]
# delete first two columns of df_rent (they don't contain numeric data)
y.pop(0)
y.pop(0)
#y = rent time series data for specific metro
y = pd.Series(y)
#x1 = lagged moving average data for given params
df_return1 = df_return[df_return['Metro'] == str(metro)]
df_return1 = df_return1.reset_index(drop = True)
x1 = lagged_moving_avg(df = df_return1, metro_code = metro, nlag = nlag, nma = nma)
#y and x1 dataframe
y_label = 'y_Rent'
x_lagMA_label = 'x1_LaggedMA'
df1 = pd.DataFrame()
df1[y_label] = y
df1[x_lagMA_label] = x1
if mostrecentQ == 1:
currmonth = "01"
elif mostrecentQ == 2:
currmonth = "04"
elif mostrecentQ == 3:
currmonth = "07"
else:
currmonth = "10"
#convert index to datetime to run the regressions
currpd = pd.to_datetime((str(mostrecentYYYY) + currmonth), format='%Y%m')
df1.index = pd.date_range(*(pd.to_datetime(['1990-01', currpd]) + pd.offsets.QuarterEnd()), freq='Q')
#drop any rows that have missing observations
df1 = df1.dropna()
#df1.to_csv('//Nisfile01/x/Project - Real Estate Database/real_estate/odil/XandY.csv', index=True)
reg = ARIMA(endog = df1[y_label], order = (AR, 0,0)).fit(trend = 'nc', disp = 0, tol=1e-20)
resid_reg = reg.resid
reg2 = sm.OLS(resid_reg, df1[x_lagMA_label]).fit()
resid_reg2 = reg2.resid
dfrow = {
'metro': metro,
'num_lag': nlag,
'num_ma': nma,
'num_AR': AR,
'beta_x1_retmov': reg2.params[0],
'x1_se': reg2.bse[0],
'x1_tstat': reg2.tvalues[0],
'x1_pval': reg2.pvalues[0],
'r-squared': reg2.rsquared,
'reg_fstat':reg2.fvalue,
'fstat_pvalue': reg2.f_pvalue,
'durbin-watson': st.durbin_watson(reg2.resid),
'resid_var': resid_reg2.var(),
}
#create df for output called results
for key in dfrow.keys():
results.loc[row, key] = list(dfrow[key])
row = row + 1
Any help is very much appreciated.
P.S. Sorry for the messy code
The offending line is results.loc[row, key] = list(dfrow[key]).
You are trying to convert a single value, in this case a numpy.int64 object, to a list, which is why Python complains that it is not iterable. I assume that what you're trying to do, and correct me if I am wrong, is create a singleton list with the int64 inside it. If that's what you want, you should use:
results.loc[row, key] = [dfrow[key]]
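As an aside, since every value in dfrow is a scalar here, another common pattern is to collect the row dicts in a list and build the dataframe once after the loops, avoiding per-cell .loc writes entirely (a sketch, with names taken from your code):
rows = []
#... inside the innermost loop, instead of the per-key assignment:
rows.append(dfrow)
#... after all loops have finished:
results = pd.DataFrame(rows, columns=regress_result_names)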
I am trying to create new columns in a Spark SQL dataframe that compare two columns within the dataframe, and return True if they are equal and False otherwise. I have to do this for a dataset with thousands of columns. As a sample problem I've included all of my code here, but the important part is the second for loop at the end of the code block.
from pyspark.sql import SQLContext
from pyspark.sql.types import *
data = sc.parallelize([[1, None, 'BND'], [2, None, 'RMP'], [3, None, 'SWP'], [4, None, "IRS"], [5, None, "SWP"], [6, None, "IRS"]])
match = sc.parallelize([[1, 2, 100], [3, 5, 101], [4, 6, 102]])
trade_schema_string = 'trade_id,match_id,product'
trade_fields = [StructField(field_name, StringType(), True) for field_name in trade_schema_string.split(',')]
trade_fields[0].dataType = IntegerType()
trade_fields[1].dataType = IntegerType()
trade_schema = StructType(trade_fields)
match_schema_string = "pri_netting_id,sec_netting_id,match_id"
match_fields = [StructField(field_name, IntegerType(), True) for field_name in match_schema_string.split(',')]
match_schema = StructType(match_fields)
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(data, trade_schema)
odf = sqlContext.createDataFrame(match, match_schema)
df.registerTempTable("trade")
odf.registerTempTable("match")
# Get match_ids so you can match up front office and back office records
# Change column names for fo and bo dataframes so that they say "bo_product" and "fo_product", etc.
fo = sqlContext.sql("SELECT t.trade_id,t.product,m.match_id FROM trade t INNER JOIN match m WHERE t.trade_id = m.pri_netting_id")
bo = sqlContext.sql("SELECT t.trade_id,t.product,m.match_id FROM trade t INNER JOIN match m WHERE t.trade_id = m.sec_netting_id")
col_names = fo.columns
for x in range(0, len(col_names)):
col_name = col_names[x]
fo = fo.withColumnRenamed(col_name, "fo_" + col_name)
bo = bo.withColumnRenamed(col_name, "bo_" + col_name)
fo.registerTempTable("front_office")
bo.registerTempTable("back_office")
fobo = sqlContext.sql("SELECT f.fo_trade_id,f.fo_product,b.bo_trade_id,b.bo_product FROM front_office f INNER JOIN back_office b WHERE f.fo_match_id = b.bo_match_id")
fobo = fobo.repartition(5)
# How to create diff columns
num_cols = len(fobo.columns)
fobo_names = fobo.columns
first = fobo.first()
for x in range(0, num_cols / 2):
new_name = "\'diff_" + fobo_names[x][3:] + "\'"
old_column_fo = "fobo." + fobo_names[x]
old_column_bo = "fobo." + fobo_names[x + (num_cols / 2)]
fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
The error I get is:
Traceback (most recent call last):
File "", line 8, in
File "/opt/cloudera/parcels/CDH-5.4.0-1.cdh5.4.0.p0.27/lib/spark/python/pyspark/sql/dataframe.py", line 695, in withColumn
return self.select('*', col.alias(colName))
AttributeError: 'bool' object has no attribute 'alias'
So, the strange thing is that if I execute the following by hand:
fobo = fobo.withColumn("diff_product", fobo.fo_product == fobo.bo_product)
and
fobo = fobo.withColumn("diff_trade_id", fobo.fo_trade_id == fobo.bo_trade_id)
The whole thing works perfectly. However, this isn't practical for my true use case, which has many columns.
old_column_fo = "fobo." + fobo_names[x]
old_column_bo = "fobo." + fobo_names[x + (num_cols / 2)]
fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
old_column_fo and old_column_bo will be strings that merely look like the attribute names you're trying to access, but they won't be the actual attributes. Try using getattr instead.
old_column_fo = getattr(fobo, fobo_names[x])
old_column_bo = getattr(fobo, fobo_names[x + (num_cols / 2)])
fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
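An equivalent, and slightly more robust, option (attribute access fails for column names that are not valid Python identifiers) is to build Column expressions with pyspark.sql.functions.col; a sketch of the same loop:
from pyspark.sql.functions import col

half = num_cols // 2   #integer division, so the loop also works on Python 3
for x in range(half):
    new_name = "diff_" + fobo_names[x][3:]
    fobo = fobo.withColumn(new_name, col(fobo_names[x]) == col(fobo_names[x + half]))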