My goal is to update the histogram shown on the right side of figure 1 based on the selection of points on the left side.
Initially the plot looks fine; however, once a selection is made the histogram is not redrawn (Altair 3.2.0).
Does anybody know how to do this?
Below is the code to recreate the example:
import altair as alt
import pandas as pd
import numpy as np
from random import choice
# Build a small demo frame: one row per day with a noisy value and a
# random categorical label.
dates = pd.date_range("1.1.2019", "2.28.2019")
np.random.seed(999)
Y = np.random.normal(0.5, 0.1, len(dates))
# Draw the labels from NumPy's RNG too: ``random.choice`` is not affected by
# ``np.random.seed``, so the labels used to change on every run.
features = np.random.choice(["a", "b", "c"], size=len(dates))
df = pd.DataFrame({"dates": dates, "Y": Y, "features": features})

base = alt.Chart(df)
area_args = {"opacity": 0.3, "interpolate": "step"}

# Interval (brush) selection restricted to the x encoding.
pts = alt.selection(type="interval", encodings=["x"])

# Left panel: scatter plot that owns the selection.
points = (
    base.mark_circle()
    .encode(alt.X("dates:T"), alt.Y("Y:Q"), color="features:N")
    .add_selection(pts)
)

# Right panel: per-feature histogram of Y, filtered by the selection.
yscale = alt.Scale(domain=(0, 1))
right_hist = (
    base.mark_area(**area_args)
    .encode(
        alt.Y(
            "Y:Q", bin=alt.Bin(maxbins=20, extent=yscale.domain), stack=None, title=""
        ),
        alt.X("count()", stack=None, title=""),
        alt.Color("features:N"),
    )
    .transform_filter(pts)
)

(points | right_hist)
edit1: another image to clarify my point #jvp
Solved in the comments: the problem was with the OP's setup and how the plots were rendered on their end.
Related
I want to make a treemap following a predefined ordering, but I don't know how to manipulate the hierarchy defined by plotly treemap.
Here is the treemap I've created:
Here I want the O to be above the U box and the UR to be a unique column, without dividing its space with NA.
Here is my code:
import plotly.graph_objects as go
# One (label, parent, colour) row per treemap node so that each node's
# attributes stay together; the empty parent marks the root.
nodes = [
    ("DR", "", "lightgrey"),
    ("1,2", "DR", "lightblue"),
    ("D 1", "1,2", "cornflowerblue"),
    ("D 2", "1,2", "cornflowerblue"),
    ("O", "1,2", "goldenrod"),
    ("U", "1,2", "green"),
    ("3,4,5", "DR", "lightblue"),
    ("P", "3,4,5", "lightgray"),
    ("B", "3,4,5", "chocolate"),
    ("Pe", "3,4,5", "cadetblue"),
    ("6,7,8", "DR", "lightblue"),
    ("F", "6,7,8", "grey"),
    ("C", "6,7,8", "chocolate"),
    ("Pl", "6,7,8", "saddlebrown"),
    ("9", "DR", "salmon"),
    ("UR", "9", "salmon"),
    ("10", "DR", "burlywood"),
    ("NA", "10", "burlywood"),
]
# Transpose the rows back into the three parallel lists plotly expects.
labels, parents, colors = (list(column) for column in zip(*nodes))

fig = go.Figure(
    go.Treemap(
        labels=labels,
        parents=parents,
        marker_colors=colors,
        branchvalues="total",
    )
)
fig.update_layout(margin=dict(t=10, l=60, r=25, b=30))
fig.show()
Much appreciated!
I've tried creating values for each rectangle, but it didn't help and didn't work as expected.
You need the argument tiling. I think this is what you're asking for.
import plotly.graph_objects as go
# Node labels, listed branch by branch (root first).
node_labels = [
    "DR",
    "1,2", "D 1", "D 2", "O", "U",
    "3,4,5", "P", "B", "Pe",
    "6,7,8", "F", "C", "Pl",
    "9", "UR",
    "10", "NA",
]
# Parent of each label above, position for position.
node_parents = [
    "",
    "DR", "1,2", "1,2", "1,2", "1,2",
    "DR", "3,4,5", "3,4,5", "3,4,5",
    "DR", "6,7,8", "6,7,8", "6,7,8",
    "DR", "9",
    "DR", "10",
]
# Fill colour of each label above, position for position.
node_colors = [
    "lightgrey",
    "lightblue", "cornflowerblue", "cornflowerblue", "goldenrod", "green",
    "lightblue", "lightgray", "chocolate", "cadetblue",
    "lightblue", "grey", "chocolate", "saddlebrown",
    "salmon", "salmon",
    "burlywood", "burlywood",
]

treemap = go.Treemap(
    labels=node_labels,
    parents=node_parents,
    marker={"colors": node_colors},
    branchvalues="total",
    # The tiling/packing option is the only change from the question's code.
    tiling={"packing": "slice-dice"},
)
fig = go.Figure(treemap)
The following code produces a column chart in which the y axis grows in the wrong direction.
# Line chart of the summed ``has_kw`` values per month of ``pub_date``.
month_axis = alt.X('pub_date', timeUnit='month')
total_axis = alt.Y('sum(has_kw)')
alt.Chart(df).mark_line().encode(x=month_axis, y=total_axis)
I wanted to correct it as suggested by https://stackoverflow.com/a/58326269, and changed my code to
# Same chart as before, with a descending sort keyed on a field named 'y'
# attached to the y encoding (this is the attempt the question asks about).
y_sort = alt.EncodingSortField('y', order='descending')
alt.Chart(df).mark_line().encode(
    x=alt.X('pub_date', timeUnit='month'),
    y=alt.Y('sum(has_kw)', sort=y_sort),
)
But now Altair produces a strange diagram, see figure 2.
That is, sum(has_kw) appears to be calculated incorrectly. Why is this, and how can I correct it?
It is hard to know exactly without seeing a sample of your data, but you could try one of the following (based on the example you linked). This first approach is similar to what you have already tried:
import altair as alt
import numpy as np
import pandas as pd
# Evaluate z = x^2 + y^2 on a 3x3 integer grid.
x, y = np.meshgrid(range(0, 3), range(0, 3))
z = x ** 2 + y ** 2

# Flatten each grid into one column: Altair expects one row per cell.
source = pd.DataFrame(
    {name: grid.ravel() for name, grid in (('x', x), ('y', y), ('z', z))}
)

# Heatmap whose ordinal y axis is sorted in descending order.
heatmap = alt.Chart(source).mark_rect()
heatmap.encode(
    x='x:O',
    y=alt.Y('y:O', sort='descending'),
    color='z:Q',
)
This second approach simply reverses the axis without sorting it and might be more compatible with your data:
# Flip the y axis through its scale instead of through a sort order.
flipped_y = alt.Y('y:O', scale=alt.Scale(reverse=True))
alt.Chart(source).mark_rect().encode(x='x:O', y=flipped_y, color='z:Q')
I have a shapefile of points, each defined by X and Y coordinates and an ID attribute.
Each ID number is shared by at least 3 different points.
For each ID, I would like to create a shapefile containing the circle that circumscribes its points.
How can this be done in Python?
There is a library that does this: https://pypi.org/project/miniball/
It is straightforward to integrate into the standard pandas groupby pattern: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
The solution really reduces to this:
def circle(points):
    """Return a buffered point covering the bounding ball of *points*.

    ``points`` must expose ``.x`` and ``.y`` coordinate sequences; they are
    stacked into an (n, 2) array and handed to ``miniball``.
    """
    coords = np.array([points.x, points.y]).T
    center, r_squared = miniball.get_bounding_ball(coords)
    # NOTE(review): the second return value is presumably the squared radius,
    # since its square root is used as the buffer distance -- confirm against
    # the miniball documentation.
    return shapely.geometry.Point(center).buffer(math.sqrt(r_squared))


col = "group"
# generate circles around groups of points
grouped = cities.groupby(col, as_index=False)
gdf_c = grouped.agg(geometry=("geometry", circle))
With the sample data and visualisation below, the circles do become distorted due to the limitations of the EPSG:4326 projection.
full working example
import geopandas as gpd
import numpy as np
import shapely
import miniball
import math
import pandas as pd
cities = gpd.read_file(gpd.datasets.get_path("naturalearth_cities"))
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# a semi-synthetic grouping of cities: split each continent's countries into
# two population bins labelled "a" and "b".
# ``group_keys=False`` keeps the result indexed like ``world``. Without it,
# pandas >= 2.0 prepends the continent to the index and the column
# assignment fails.
world["size"] = world.groupby("continent", group_keys=False)["pop_est"].apply(
    lambda d: pd.cut(d, 2, labels=list("ab"), duplicates="drop").astype(str)
)
cities = cities.sjoin(world.loc[:, ["continent", "iso_a3", "size", "geometry"]])
cities["group"] = cities["continent"] + cities["size"]


def circle(points):
    """Return a buffered point covering the bounding ball of *points*.

    ``points`` must expose ``.x`` and ``.y`` coordinate sequences; they are
    stacked into an (n, 2) array and handed to ``miniball``.
    """
    p, r = miniball.get_bounding_ball(np.array([points.x, points.y]).T)
    # NOTE(review): ``r`` is presumably the squared radius, hence the sqrt --
    # confirm against the miniball documentation.
    return shapely.geometry.Point(p).buffer(math.sqrt(r))


col = "group"
# generate circles around groups of points
gdf_c = cities.groupby(col, as_index=False).agg(geometry=("geometry", circle))

# visualize it: cities first, then the non-empty circles on the same map
m = cities.explore(column=col, height=300, width=600, legend=False)
gdf_c.loc[~gdf_c["geometry"].is_empty].explore(
    m=m, column=col, marker_kwds={"radius": 20}, legend=False
)
output
Hello I am using the following code at the bottom to extract countries from coordinates. Please see the following url which provides a more detailed explanation of the code: Extracting countries from NetCDF data using geopandas.
My main variable/value is the monthly mean pdsi value from: https://psl.noaa.gov/data/gridded/data.pdsi.html. The image below represents a portion of the visualization created by the code below. The shaded squares represent the spatial regions of pdsi values, which is overlapping a shapefile of the world.
From the image of Belgium, you can see that the 4 squares that touch the land area of Belgium also touch other countries. If I attribute the raw values of these squares to Belgium, I believe this overestimates the mean pdsi value. In particular, the bottom two squares barely touch Belgium, so their weight in the mean should be significantly lower. Thus, is there a way to compute a weighted average in which the area of each square that lies within a country is used as the weight for its pdsi value? Additionally, I would like to standardize this process not only for Belgium, but for all countries.
Any help would be greatly appreciated!
import geopandas as gpd
import numpy as np
import plotly.express as px
import requests
from pathlib import Path
from zipfile import ZipFile
import urllib
import shapely.geometry
import xarray as xr
# download NetCDF data...
# ``import urllib`` alone does not guarantee that the ``parse`` submodule is
# loaded, so import it explicitly before using ``urllib.parse``.
import urllib.parse

# fmt: off
url = "https://psl.noaa.gov/repository/entry/get/pdsi.mon.mean.selfcalibrated.nc?entryid=synth%3Ae570c8f9-ec09-4e89-93b4-babd5651e7a9%3AL2RhaV9wZHNpL3Bkc2kubW9uLm1lYW4uc2VsZmNhbGlicmF0ZWQubmM%3D"
f = Path.cwd().joinpath(Path(urllib.parse.urlparse(url).path).name)
# fmt: on
if not f.exists():
    # Download to a temporary ".part" file and rename only on success, so an
    # interrupted or failed download is never mistaken for a cached file.
    part = f.with_name(f.name + ".part")
    with requests.get(url, stream=True, headers={"User-Agent": "XY"}) as r:
        # Fail loudly on HTTP errors instead of saving an error page.
        r.raise_for_status()
        with open(part, "wb") as fd:
            # 64 KiB chunks: far fewer write calls than the former 128 bytes.
            for chunk in r.iter_content(chunk_size=1 << 16):
                fd.write(chunk)
    part.replace(f)
ds = xr.open_dataset(f)
# Flatten to one row per grid point and time; rows with missing values are
# dropped (don't care about places in oceans...).
pdsi = ds.to_dataframe().reset_index().dropna()

# use subset for testing... last 5 times...
last_times = pdsi.groupby("time").size().index[-5:]
pdsim = pdsi.loc[pdsi["time"].isin(last_times)]

# create geopandas dataframe with one point per (lon, lat) row
cell_points = pdsim.loc[:, ["lon", "lat"]].apply(shapely.geometry.Point, axis=1)
gdf = gpd.GeoDataFrame(pdsim, geometry=cell_points)


def _modal_step(coord):
    """Return the mode(s) of the non-zero row-to-row differences of *coord*."""
    steps = coord.diff()
    return steps.loc[steps.ne(0)].mode()


lat_step = _modal_step(gdf["lat"])
lon_step = _modal_step(gdf["lon"])
# make sure that data supports using a buffer: the latitude and longitude
# steps must agree for a square buffer to match the grid.
assert (lat_step == lon_step).all()

# how big should the square buffer be around the point?? half a grid step
buffer = lat_step.values[0] / 2
gdf["geometry"] = gdf["geometry"].buffer(buffer, cap_style=3)
# Import shapefile from geopandas
path_to_data = gpd.datasets.get_path("naturalearth_lowres")
world_shp = gpd.read_file(path_to_data)

# the solution... spatial join buffered polygons to countries, then collapse
# to one comma separated string of country names per grid cell
countries_per_cell = (
    world_shp.sjoin(gdf.set_crs("EPSG:4326"))
    .groupby("index_right")["name"]
    .agg(",".join)
)
gdf = gdf.join(countries_per_cell)
gdf["time_a"] = gdf["time"].dt.strftime("%Y-%b-%d")

# simplest way to test is visualise...
map_layout = {
    "mapbox": {"style": "carto-positron", "zoom": 1},
    "margin": {"l": 0, "r": 0, "t": 0, "b": 0},
}
px.choropleth_mapbox(
    gdf,
    geojson=gdf.geometry,
    locations=gdf.index,
    color="pdsi",
    hover_data=["name"],
    animation_frame="time_a",
    opacity=.3,
).update_layout(**map_layout)
Using https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.intersection.html you can get the part of each grid cell that intersects a country polygon.
Using its area, you can calculate the proportion of overlap.
From this I have generated two visualisations:
one that shows which countries each grid cell overlaps and by how much;
one that aggregates to countries using a weighted average and also calculates other measures that can be used for transparency.
I do not know whether it is mathematically / scientifically sound to aggregate PDSI in this way (either means or weighted averages). This does demonstrate how to get the results your question requests.
# the solution... spatial join buffered polygons to countries
cells_by_country = world_shp.sjoin(gdf.set_crs("EPSG:4326"))

# Attach each grid cell's own polygon next to the country polygon it hit.
paired = cells_by_country.merge(
    gdf.loc[:, "geometry"],
    left_on="index_right",
    right_index=True,
    suffixes=("", "_pdsi"),
)

# plus work out overlap between PDSI grid and country. Area of each grid is
# constant, (buffer * 2) ** 2, so the ratio is the share of the cell that
# falls inside the country.
cell_polygons = gpd.GeoSeries(paired["geometry_pdsi"], crs="EPSG:4326")
shared_area = paired["geometry"].intersection(cell_polygons).area
gdf_c = paired.assign(overlap=(shared_area / (buffer * 2) ** 2).round(3))

# comma separate associated countries and a list of overlaps
per_cell = gdf_c.groupby("index_right").agg({"name": ",".join, "overlap": list})
gdf_pdsi = gdf.loc[:, ["geometry", "time", "pdsi"]].join(per_cell)
gdf_pdsi["time_a"] = gdf_pdsi["time"].dt.strftime("%Y-%b-%d")

# simplest way to test is visualise...
fig_buf = px.choropleth_mapbox(
    gdf_pdsi,
    geojson=gdf_pdsi.geometry,
    locations=gdf_pdsi.index,
    color="pdsi",
    hover_data=["name", "overlap"],
    animation_frame="time_a",
    opacity=0.3,
)
fig_buf = fig_buf.update_layout(
    mapbox={"style": "carto-positron", "zoom": 1},
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig_buf
import pandas as pd
def _country_stats(d):
    """Summarise the PDSI rows of one (country, time) group as a Series."""
    values = d["pdsi"]
    weights = d["overlap"]
    return pd.Series(
        {
            "weighted_pdsi": (values * weights).sum() / weights.sum(),
            "unweighted_pdsi": values.mean(),
            "min_pdsi": values.min(),
            "max_pdsi": values.max(),
            "min_overlap": weights.min(),
            "max_overlap": weights.max(),
            "size_pdsi": len(values),
            # "pdsi_list":[round(v,2) for v in d["pdsi"]]
        }
    )


# prepare data to plot by country
df_pdsi = gdf_c.groupby(["name", "time"]).apply(_country_stats).reset_index()
df_pdsi["time_a"] = df_pdsi["time"].dt.strftime("%Y-%b-%d")

# Country outlines keyed by name, matching the ``locations`` column below.
country_shapes = world_shp.set_index("name").loc[:, "geometry"]
fig = px.choropleth_mapbox(
    df_pdsi,
    geojson=country_shapes,
    locations="name",
    color="weighted_pdsi",
    hover_data=df_pdsi.columns,
    animation_frame="time_a",
    opacity=0.3,
)
fig = fig.update_layout(
    mapbox={"style": "carto-positron", "zoom": 1},
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig
I'm adding the rSquared to a chart using the method outlined in this answer:
# Text mark at pixel position (20, 20) showing the regression's rSquared
# parameter formatted to four decimals.
r2_label = alt.Text('rSquared:N', format='.4f')
r2 = (
    alt.Chart(df)
    .transform_regression('x', 'y', params=True)
    .mark_text()
    .encode(x=alt.value(20), y=alt.value(20), text=r2_label)
)
But I want to prepend "rSquared = " to the final text.
I've seen this answer involving an f string and a value calculated outside the chart, but I'm not clever enough to figure out how to apply that solution to this scenario.
I've tried, e.g., format='rSquared = .4f', but adding any additional text breaks the output, which I'm sure is the system working as intended.
One possible solution using the posts you linked to would be to extract the value of the parameter using altair_transform and then add the value to the plot. This is not the most elegant solution but should achieve what you want.
# pip install git+https://github.com/altair-viz/altair-transform.git
import altair as alt
import pandas as pd
import numpy as np
import altair_transform
# Synthetic data: a noisy straight line.
np.random.seed(42)
x = np.linspace(0, 10)
y = x - 5 + np.random.randn(len(x))
df = pd.DataFrame({'x': x, 'y': y})

# Scatter plot plus the fitted regression line.
chart = alt.Chart(df).mark_point().encode(x='x', y='y')
line = chart.transform_regression('x', 'y').mark_line()

# Evaluate the regression parameters in Python to read rSquared back out.
params = chart.transform_regression('x', 'y', params=True).mark_line()
R2 = altair_transform.extract_data(params)['rSquared'][0]

# Single-row dummy dataset so that exactly one text mark is drawn.
label = f"rSquared = {R2:.4f}"
text = (
    alt.Chart({'values': [{}]})
    .mark_text(align="left", baseline="top")
    .encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value(label),
    )
)

chart + line + text