Create a combination of "relative" and "grouped" chart in Python

I need to create a combination of a "relative" and a "grouped" bar chart in plotly.
I figured out how to create a stacked and grouped chart using this code:
from plotly import graph_objects as go
import plotly
pyplt = plotly.offline.plot
data = {
    "Sports_19": [15, 23, 32, 10, 23, 22, 32, 24],
    "Casual_19": [4, 12, 11, 14, 15, 12, 22, 14],
    "Yoga_19": [4, 8, 18, 6, 12, 11, 10, 4],
    "Sports_20": [11, 18, 18, 0, 20, 12, 12, 11],
    "Casual_20": [20, 10, 9, 6, 10, 11, 17, 22],
    "Yoga_20": [11, 18, 18, 0, 20, 12, 12, 11],
    "labels": ["January", "February", "March", "April", "May", "June", "July", "August"],
}
fig = go.Figure()
fig.add_trace(go.Bar(name="Sports",x=data["labels"],y=data["Sports_19"],offsetgroup=19,marker_color='lightsalmon',text=data["Sports_19"],textposition='auto'))
fig.add_trace(go.Bar(name="Casual",x=data['labels'],y=data['Casual_19'],offsetgroup=19,base=data['Sports_19'],marker_color='crimson',text=data["Casual_19"],textposition='auto'))
fig.add_trace(go.Bar(name="Yoga",x=data['labels'],y=data['Yoga_19'],marker_color='indianred',text=data["Yoga_19"],textposition='auto',offsetgroup=19,base=[val1 + val2 for val1, val2 in zip(data["Sports_19"], data["Casual_19"])]))
fig.add_trace(go.Bar(name="Sports_20",x=data["labels"],y=data["Sports_20"],offsetgroup=20,marker_color='lightsalmon',showlegend=False,text=data["Sports_20"],textposition='auto'))
fig.add_trace(go.Bar(name="Casual_20",x=data['labels'],y=data['Casual_20'],offsetgroup=20,base=data['Sports_20'],marker_color='crimson',showlegend=False,text=data["Casual_20"],textposition='auto'))
fig.add_trace(go.Bar(name="Yoga_20", x=data['labels'], y=data['Yoga_20'], marker_color='indianred', text=data["Yoga_20"], showlegend=False, textposition='auto', offsetgroup=20, base=[val1 + val2 for val1, val2 in zip(data["Sports_20"], data["Casual_20"])]))
fig.update_layout(title="2019 vs 2020 Sales by Category",yaxis_title="Sales amount in US$")
fig.show()
pyplt(fig, auto_open=True)
The output looks like this (chart image omitted): two stacked bars per month, one for 2019 and one for 2020.
Is there any way I can convert this graph into a combination of "relative" and "grouped"? Maybe not with plotly, but with matplotlib or other tools?
P.S. Here is an example of a "relative" chart (image omitted; the bars are normalized to 100%, but it is not grouped):

Probably the most straightforward way is to create two new dataframes, df_perc_19 and df_perc_20, to store your data normalized to relative percentages for each month in each year, rounding to two decimal places with .round(2), since a long decimal causes the default orientation of the bar text to change; feel free to adjust this however you like.
Then access the values in these new dataframes for your traces. Although it's a bit ugly, you can display percentages for the text parameter with something like: text=[str(x)+"%" for x in df_perc_19["Casual_19"]]
import pandas as pd
import plotly
from plotly import graph_objects as go
# pyplt = plotly.offline.plot
data = {
    "Sports_19": [15, 23, 32, 10, 23, 22, 32, 24],
    "Casual_19": [4, 12, 11, 14, 15, 12, 22, 14],
    "Yoga_19": [4, 8, 18, 6, 12, 11, 10, 4],
    "Sports_20": [11, 18, 18, 0, 20, 12, 12, 11],
    "Casual_20": [20, 10, 9, 6, 10, 11, 17, 22],
    "Yoga_20": [11, 18, 18, 0, 20, 12, 12, 11],
    # "labels": ["January", "February", "March", "April", "May", "June", "July", "August"]
}
labels = ["January", "February", "March", "April", "May", 'June', 'July', "August"]
df = pd.DataFrame(data=data,index=labels)
## normalize data for the months of 2019, and the months of 2020
df_perc_19 = df.apply(lambda x: 100*x[["Sports_19","Casual_19","Yoga_19"]] / x[["Sports_19","Casual_19","Yoga_19"]].sum(),axis=1).round(2)
df_perc_20 = df.apply(lambda x: 100*x[["Sports_20","Casual_20","Yoga_20"]] / x[["Sports_20","Casual_20","Yoga_20"]].sum(),axis=1).round(2)
fig = go.Figure()
## traces for 2019
fig.add_trace(go.Bar(name="Sports",x=labels,y=df_perc_19["Sports_19"],offsetgroup=19,marker_color='lightsalmon',text=[str(x)+"%" for x in df_perc_19["Sports_19"]],textposition='auto'))
fig.add_trace(go.Bar(name="Casual",x=labels,y=df_perc_19['Casual_19'],offsetgroup=19,base=df_perc_19['Sports_19'],marker_color='crimson',text=[str(x)+"%" for x in df_perc_19["Casual_19"]],textposition='auto'))
fig.add_trace(go.Bar(name="Yoga",x=labels,y=df_perc_19['Yoga_19'],marker_color='indianred',text=[str(x)+"%" for x in df_perc_19["Yoga_19"]],textposition='auto',offsetgroup=19,base=[val1 + val2 for val1, val2 in zip(df_perc_19["Sports_19"], df_perc_19["Casual_19"])]))
## traces for 2020
fig.add_trace(go.Bar(name="Sports_20",x=labels,y=df_perc_20["Sports_20"],offsetgroup=20,marker_color='lightsalmon',showlegend=False,text=[str(x)+"%" for x in df_perc_20["Sports_20"]] ,textposition='auto'))
fig.add_trace(go.Bar(name="Casual_20",x=labels,y=df_perc_20['Casual_20'],offsetgroup=20,base=df_perc_20['Sports_20'],marker_color='crimson',showlegend=False,text=[str(x)+"%" for x in df_perc_20["Casual_20"]],textposition='auto'))
fig.add_trace(go.Bar(name="Yoga_20", x=labels, y=df_perc_20['Yoga_20'], marker_color='indianred', text=[str(x)+"%" for x in df_perc_20["Yoga_20"]], showlegend=False, textposition='auto', offsetgroup=20, base=[val1 + val2 for val1, val2 in zip(df_perc_20["Sports_20"], df_perc_20["Casual_20"])]))
fig.update_layout(title="2019 vs 2020 Sales by Category",yaxis_title="Sales amount in US$ (percentage)")
fig.show()
# pyplt(fig, auto_open=True)
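As an aside, plotly can also do the normalization itself. Here is a minimal sketch (my own, not from the thread, reusing the data dict and labels list above): a multicategory x-axis addresses each bar by a (month, year) pair, so barmode='stack' plus barnorm='percent' produces the grouped relative chart without any manual base/offsetgroup arithmetic.
from plotly import graph_objects as go

# multicategory x: each bar is addressed by a (month, year) pair
x_19 = [labels, ["2019"] * len(labels)]
x_20 = [labels, ["2020"] * len(labels)]

fig = go.Figure()
for cat, color in [("Sports", "lightsalmon"), ("Casual", "crimson"), ("Yoga", "indianred")]:
    fig.add_trace(go.Bar(name=cat, x=x_19, y=data[f"{cat}_19"], marker_color=color))
    fig.add_trace(go.Bar(name=cat, x=x_20, y=data[f"{cat}_20"], marker_color=color, showlegend=False))

# bars sharing a (month, year) position stack; barnorm rescales each stack to 100%
fig.update_layout(barmode="stack", barnorm="percent",
                  title="2019 vs 2020 Sales by Category (monthly share)")
fig.show()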

Related

Check which value from my list is not in my dataframe column

I need to check whether any of the values in my list are missing from my df column. I used this:
data_xls['date'].isin([datetime(2015, 7, 20, 11, 7), datetime(2015, 7, 20, 11, 13),
                       datetime(2015, 7, 20, 11, 14), datetime(2015, 7, 20, 11, 16)])
But I also want to know which ones from my list are missing. How can I do that?
You need the ~ symbol to index the dates that are not in that list:
lst = [datetime(2015, 7, 20, 11,7),datetime(2015, 7, 20, 11,13),datetime(2015, 7, 20, 11,14),datetime(2015, 7, 20, 11,16)]
data_xls['date'][~data_xls['date'].isin(lst)]
But since you want the dates from your list that are missing from data_xls, you can find them with:
set(lst).difference(data_xls['date'])
If you need the difference between the dates list and the data_xls['date'] column, use:
import pandas as pd
from datetime import datetime

data_xls = pd.DataFrame({'date': pd.date_range(datetime(2015, 7, 20, 11, 11),
                                               freq='1Min', periods=5)})
print(data_xls)
date
0 2015-07-20 11:11:00
1 2015-07-20 11:12:00
2 2015-07-20 11:13:00
3 2015-07-20 11:14:00
4 2015-07-20 11:15:00
dates = [datetime(2015, 7, 20, 11,7),datetime(2015, 7, 20, 11,13),
datetime(2015, 7, 20, 11,14),datetime(2015, 7, 20, 11,16)]
missing = [x for x in dates if x not in set(data_xls['date'])]
print (missing)
[datetime.datetime(2015, 7, 20, 11, 7), datetime.datetime(2015, 7, 20, 11, 16)]
missing = list(set(dates) - set(data_xls['date']))
print (missing)
[datetime.datetime(2015, 7, 20, 11, 7), datetime.datetime(2015, 7, 20, 11, 16)]
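As a vectorized footnote (a sketch of my own, assuming the dates list and data_xls frame above): pandas' Index.difference returns the same missing values as a sorted DatetimeIndex.
# which entries of `dates` do not appear in the column
missing = pd.Index(dates).difference(data_xls['date'])
print(missing)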

Find indices of line intersection in shapely

I would like to find a better solution for what I am proposing below. I am trying to find the indices associated with a line intersection when using the shapely library. Solutions from other libraries are welcome.
Right now I am iterating through the location coordinates and storing the index where an intersection is observed. I would like to do away with the loop and create a more streamlined function.
The code below results in a single intersection/crossing.
import numpy as np
from shapely.geometry import LineString

line_crossings = []
latitude = [10, 11, 12, 13, 14, 15, 16, 17, 18]
longitude = [7, 9, 11, 13, 17, 19, 23, 25, 29]
location = np.column_stack((latitude, longitude))
C = (14.5, 14.5)
D = (12.3, 12.5)
line2 = LineString([C, D])
for idx in range(0, len(location)-1):
    A = (latitude[idx], longitude[idx])
    B = (latitude[idx+1], longitude[idx+1])
    line1 = LineString([A, B])
    int_pt = line2.intersection(line1)
    if int_pt.type == 'Point':
        print(int_pt)
        line_crossings.append(idx)
Update
It would seem the quickest way to get the coordinates of the crossings is as follows:
latitude = [10, 11, 12, 13, 14, 15, 16, 17, 16, 15, 14, 13, 12, 11, 10]
longitude = [7, 9, 11, 13, 17, 19, 23, 25, 29, 25, 23, 13, 13, 13, 11]
location = LineString([i for i in zip(latitude,longitude)])
C = (14.5, 14.5)
D = (12.3, 12.5)
gate = LineString([C, D])
[[i.x, i.y] for i in location.intersection(gate)]
But I need to be able to get the index in the location variable where the intersection occurs. Is it possible to get this using the list comprehension?
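One possible way to recover the segment indices (a sketch of my own, not from the thread, reusing latitude, longitude, location, and gate from the update above): LineString.project measures how far along the line each intersection point falls, and np.searchsorted against the cumulative vertex distances converts that into a segment index.
import numpy as np

coords = np.column_stack((latitude, longitude))
# cumulative distance of each vertex along `location`
seg_lengths = np.hypot(*np.diff(coords, axis=0).T)
cum_dist = np.concatenate([[0], np.cumsum(seg_lengths)])

crossings = location.intersection(gate)
points = getattr(crossings, "geoms", [crossings])  # handle Point vs MultiPoint results
for pt in points:
    d = location.project(pt)                      # distance along the line to the crossing
    idx = int(np.searchsorted(cum_dist, d) - 1)   # index of the segment containing it
    print(idx, (pt.x, pt.y))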

Python f-string and append()

I hope y'all doing fine!
So I want to make 5 groups of 6 people, randomly chosen from a list, and then append the 6 chosen names to the corresponding group.
Example: if a, b, c, d, e, f are the first six chosen names, append those names to group1;
once group1 contains 6 names, append the next 6 names to group2, and so on until I have 5 groups of 6 people.
I hope you understand me and that you can help :)
My code:
import random
names = [30 names i dont wanna share]
group1 = list()
group2 = list()
group3 = list()
group4 = list()
group5 = list()
def choosegroup():
    def chooserandom():
        return(random.choice(names))
    def creategroup():
        for i in range(1, 7):
            chosed = chooserandom()
            names.remove(chosed)
            #while (chosed in group1):
            #    print('Ups')
            #    print(chosed + ' already chosed')
            #    chosed = chooserandom()
            #print(chosed)
            group1.append(chosed)
        #print('Group 1:' + '\n' + str(group1) + '\n')
    createdgroup = creategroup()
    print(group1)

for i in range(1, 6):
    print(f'Group {i}')
    choosegroup()
    group1.clear()
The simplest fix: shuffle the whole list once, then slice it into chunks of six:
random.shuffle(names)
groups = [names[i:i+6] for i in range(0, len(names), 6)]
Now groups[0], groups[1], etc. are your 6-person groups.
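For example, with hypothetical stand-in names (since the real list isn't shared):
import random

names = [f"person{i}" for i in range(30)]  # placeholder names
random.shuffle(names)
groups = [names[i:i+6] for i in range(0, len(names), 6)]
print([len(g) for g in groups])  # [6, 6, 6, 6, 6]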
Once you have your list of names, to split them into random groups, I would instead use numpy:
import numpy as np
groups = np.array(names)
np.random.shuffle(groups)
groups = np.reshape(groups, (5, 6))
As an example with numbers instead of names
>>> names = np.arange(30)
>>> names
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
>>> np.random.shuffle(names)
>>> names
array([ 8, 18, 23, 7, 25, 14, 11, 20, 13, 24, 15, 26, 19, 21, 12, 17, 0,
6, 3, 10, 29, 9, 16, 28, 22, 5, 1, 4, 27, 2])
>>> np.reshape(names, (5,6))
array([[ 8, 18, 23, 7, 25, 14],
[11, 20, 13, 24, 15, 26],
[19, 21, 12, 17, 0, 6],
[ 3, 10, 29, 9, 16, 28],
[22, 5, 1, 4, 27, 2]])
You can access them from globals as such:
globals()[f"group{i}"]
though storing and retrieving them from a dictionary is preferable.
You can rewrite your code as follows:
import random
from collections import defaultdict

names = [30 names i dont wanna share]
groups = defaultdict(list)

def choosegroup(group_name):
    def chooserandom():
        return random.choice(names)
    def creategroup(group_name):
        for i in range(1, 7):
            chosed = chooserandom()
            names.remove(chosed)
            groups[group_name].append(chosed)
    creategroup(group_name)  # the original called creategroup() without the argument
    print(group_name, "\n", groups[group_name])  # groups, not group

for i in range(1, 6):
    group_name = f"group{i}"
    print(f'Group {i}')
    choosegroup(group_name)
    # no .clear() here: each group keeps its own key in the dict
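Each group then stays available after the loop, e.g. as groups["group1"] or groups[f"group{i}"], instead of five separate global lists.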

how to do mean of redundant data in list-of-list of python?

I am new to programming and Python. My list-of-list data looks like this (columns: year, month, date_of_month, day_of_week, births):
combine_list = [[2003, 12, 29, 1, 13125],
                [2003, 12, 30, 2, 14700],
                [2003, 12, 31, 3, 12540],
                [2003, 12, 30, 2, 14700],
                [2003, 12, 30, 2, 14438],
                .......................]]
As you can see, the second, fourth, and fifth entries have the same values for all fields except births. I have written a function to find the mean of entries with redundant data:
def distinct(file) :
    distinct_data = file
    for i in range(len(file)) :
        for j in range(i+1,len(file)) :
            if (i[0]==j[0] and i[1]==j[1] and i[2]==j[2] and i[3]==j[3]) :
                disctinct_data = (i+j)/2
    return (distinct_data)

distinctdata = distinct(combine_list)
But the interpreter throws this error:
TypeErrorTraceback (most recent call last)
<ipython-input-19-70e5f515fb64> in <module>()
7 return (distinct_data)
8
----> 9 distinctdata = distinct(combine_list)
10
11
<ipython-input-19-70e5f515fb64> in distinct(file)
3 for i in range(len(file)) :
4 for j in range(i+1,len(file)) :
----> 5 if (i[0]==j[0] and i[1]==j[1] and i[2]==j[2] and i[3]==j[3]) :
6 disctinct_data = (i+j)/2
7 return (distinct_data)
TypeError: 'int' object is not subscriptable
Please, tell me where my function is wrong and what I can do to correct it.
You should use pandas for such operations: http://pandas.pydata.org/pandas-docs/stable/10min.html
With pandas you can find the mean using:
import pandas as pd

df = pd.DataFrame([[2003, 12, 29, 1, 13125],
                   [2003, 12, 30, 2, 14700],
                   [2003, 12, 31, 3, 12540],
                   [2003, 12, 30, 2, 14700],
                   [2003, 12, 30, 2, 14438]],
                  columns=['year', 'month', 'date_of_month', 'day_of_week', 'births'])
df[df.duplicated(subset=['year', 'month', 'date_of_month', 'day_of_week'])].mean()
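Note that duplicated() selects only the repeated rows. If what you actually want is one averaged births value per (year, month, date_of_month, day_of_week) key, a groupby sketch like this may be closer to the goal:
# average births for each distinct date key, duplicates collapsed into one row
means = df.groupby(['year', 'month', 'date_of_month', 'day_of_week'],
                   as_index=False)['births'].mean()
print(means)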
I fixed the error you were referring to: you were indexing your loop counters i and j (which are ints) as if they were rows, instead of indexing into file. Comparing the four key fields as a slice fixes it, and the average should be taken over the births column:
combine_list = [[2003, 12, 29, 1, 13125],
                [2003, 12, 30, 2, 14700],
                [2003, 12, 31, 3, 12540],
                [2003, 12, 30, 2, 14700],
                [2003, 12, 30, 2, 14438]]

def distinct(file):
    distinct_data = file
    for i in range(len(file)):
        for j in range(i+1, len(file)):
            if file[i][:4] == file[j][:4]:  # compare all four key fields
                distinct_data = (file[i][4] + file[j][4]) / 2  # average the births values
    return distinct_data

distinctdata = distinct(combine_list)

python numpy polyfit function

I'm new to Python and haven't found an answer on this site so far.
I'm using numpy.polyfit in a loop and getting the error below, which I don't understand, because when I step through the code in debug everything works fine and the lengths of the arrays going into the function are the same:
Runtime exception: TypeError: expected x and y to have same length
My code is below:
import numpy as np
from collections import defaultdict
bb = [ 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10 ]
i = 0
b = -3
bb_gradient = defaultdict(dict)
while ( b <= 0 ):
    print(i)
    print(len(range(3)))
    print(len(bb[b-3:b]))
    bb_gradient[i][0], _ = np.polyfit( range(3), weekly_bb_lower[b-3:b], 1 )
    i += 1
    b += 1
What am I doing wrong?
Thanks in anticipation.
I am assuming bb is weekly_bb_lower. Change while ( b <= 0 ) to while ( b < 0 ), because when b becomes 0, weekly_bb_lower[-3:0] returns an empty list: a slice of the form list[-n:0] is always empty.
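A quick demonstration of that slicing edge case:
>>> bb = [10, 11, 12, 22]
>>> bb[-3:-1]
[11, 12]
>>> bb[-3:0]
[]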
You can avoid referencing an empty list by moving the last three elements to the start of your list:
import numpy as np
from collections import defaultdict
bb = [ 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10, 11, 12, 22, 10, 11, 12, 11, 10 ]
bb = bb[-3:] + bb[:-3] # moves the last three elements of the list to the start prior to looping
bb_gradient = defaultdict(dict)
for i in range(3):
    bb_gradient[i][0], _ = np.polyfit( range(3), bb[i:i+3], 1 )
Prashanth's explanation is correct.
