dictionary or sub df from df - python

I am totally new in programming in general, so please explain.
The general aim: I am dealing with x, y, z data. I want to reduce the number of points in each cell (cells could have variable sizes depending on the project) to, let's say, 50 without affecting the mean value.
The problem: I have a df with x, y, z, binnumber and I want to produce either a dictionary (e.g. binnumber: [x,y,z], [x,y,z], ... for all the points inside this bin), or somehow sub-datasets that I can work with as df.
what I did:
`# import the data
import pandas as pd
import numpy as np
from scipy.stats import binned_statistic_2d
inputpath=input("write the file path:")
Data = pd.read_csv(inputpath, index_col=False, header= None, names =
['X','Y', 'Z'],skip_blank_lines=True) # file name , index =False means
without index , names are the columns names
Data = pd.DataFrame(Data)
# creating the grid cells
min_x = int(min(Data['X']))
max_x = int(max(Data['X'])+1)
min_y = int(min(Data['Y']))
max_y = int(max(Data['Y'])+1)
bin_size = float(input('write the cell size:'))
bx= int(((max_x-min_x)//bin_size)+1)
by=int(((max_y-min_y)//bin_size)+1)
xedges = np.linspace(min_x, max_x, bx, dtype=int)
yedges = np.linspace(min_y, max_y, by, dtype=int)
# assign the data to the cells
count, x_edge,y_edge,binnumber= binned_statistic_2d(Data['X'], Data['Y'],
Data['Z'],bins=(xedges, yedges))
Data['binnumber']= binnumber
# sub sets
subsets = dict(Data.groupby('binnumber'))
print (subsets)
this did not work...
Another solution was to deal with the cells itself but it did not work also.
cells= {}
for i in xedges:
for j in yedges:
cells[str(i),str(j)]=[]
print(cells.keys())
for x in Data.X:
for y in Data.Y:
for z in Data.Z:
for k,v in cells.keys():
if x>= int(k[0]) and x < int(k[0]) +1 and y>= int(k[1]) and y
< int(k[1]) +1:
k=(x,y,z)
else:
cells=('0')
print(cells)
Thanks for any try to help.

# import the data
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import binned_statistic_2d

inputpath = input("write the file path:")
# index_col=False means no column is used as the index;
# names are the column names given to the three headerless columns
Data = pd.read_csv(inputpath, index_col=False, header=None,
                   names=['X', 'Y', 'Z'], skip_blank_lines=True)
# creating the grid cells
min_x = int(min(Data['X']))
max_x = int(max(Data['X']) + 1)
min_y = int(min(Data['Y']))
max_y = int(max(Data['Y']) + 1)
bin_size = float(input('write the cell size:'))
bx = int(((max_x - min_x) // bin_size) + 1)
by = int(((max_y - min_y) // bin_size) + 1)
xedges = np.linspace(min_x, max_x, bx, dtype=int)
yedges = np.linspace(min_y, max_y, by, dtype=int)
# assign the data to the cells
count, x_edge, y_edge, binnumber = binned_statistic_2d(
    Data['X'], Data['Y'], Data['Z'], bins=(xedges, yedges))
Data['binnumber'] = binnumber
# making dictionary with >>> binnumber: all associated points......
Data['value'] = list(zip(Data['X'], Data['Y'], Data['Z']))
d = defaultdict(list)
# walk the two columns in lockstep instead of building a Series per row
for bin_id, point in zip(Data['binnumber'], Data['value']):
    d[bin_id].append(point)

Related

Python's `.loc` is really slow on selecting subsets of Data

I'm having a large multindexed (y,t) single valued DataFrame df. Currently, I'm selecting a subset via df.loc[(Y,T), :] and create a dictionary out of it. The following MWE works, but the selection is very slow for large subsets.
import numpy as np
import pandas as pd
# Full DataFrame
y_max = 50
Y_max = range(1, y_max+1)
t_max = 100
T_max = range(1, t_max+1)
idx_max = tuple((y,t) for y in Y_max for t in T_max)
df = pd.DataFrame(np.random.sample(y_max*t_max), index=idx_max, columns=['Value'])
# Create Dictionary of Subset of Data
y1 = 4
yN = 10
Y = range(y1, yN+1)
t1 = 5
tN = 9
T = range(t1, tN+1)
idx_sub = tuple((y,t) for y in Y for t in T)
data_sub = df.loc[(Y,T), :] #This is really slow
dict_sub = dict(zip(idx_sub, data_sub['Value']))
# result, e.g. (y,t) = (5,7)
dict_sub[5,7] == df.loc[(5,7), 'Value']
I was thinking of using df.loc[(y1,t1),(yN,tN), :], but it does not work properly, as the second index is only bounded in the final year yN.
One idea is use Index.isin with itertools.product in boolean indexing:
from itertools import product
idx_sub = tuple(product(Y, T))
dict_sub = df.loc[df.index.isin(idx_sub),'Value'].to_dict()
print (dict_sub)

Having some problem to understand the x_bin in regplot of Seaborn

I used seaborn.regplot to plot data, but I do not quite understand how the error bars in regplot are calculated. I have compared the results with the mean and standard deviation derived from a manual calculation. Here is my testing script.
import numpy as np
import pandas as pd
import seaborn as sn
def get_data_XYE(p):
    """Recover the plotted error bars from an axes-like object.

    Every entry of ``p.lines`` is read as one vertical error bar: the first
    x value is the bar position, and the first two y values are its lower
    and upper ends.

    Returns a tuple ``(x, y, y_error)`` of numpy arrays, where ``y`` is the
    midpoint of each bar and ``y_error`` is half of its height.
    """
    x = np.asarray([line.get_xdata()[0] for line in p.lines])
    lower = np.asarray([line.get_ydata()[0] for line in p.lines])
    upper = np.asarray([line.get_ydata()[1] for line in p.lines])
    y = 0.5 * (lower + upper)
    y_error = upper - y
    return x, y, y_error
x = [37.3448,36.6026,42.7795,34.7072,75.4027,226.2615,192.7984,140.8045,242.9952,458.451,640.6542,726.1024,231.7347,107.5605,200.2254,190.0006,314.1349,146.8131,152.4497,175.9096,284.9926,116.9681,118.2953,312.3787,815.8389,458.0146,409.5797,595.5373,188.9955,15.7716,36.1839,244.8689,57.4579,94.8717,112.2237,87.0687,72.79,22.3457,24.1728,29.505,80.8765,252.7454,280.6002,252.9573,348.246,112.705,98.7545,317.0541,300.9573,402.8411,406.6884,56.1286,30.1385,32.9909,497.556,19.3606,20.8409,95.2324,108.6074,15.7753,54.5511,45.5623,64.564,101.1934,81.8459,88.286,58.2642,56.1225,51.2943,38.0649,63.5882,63.6847,120.495,102.4097,49.3255,111.3309,171.6028,58.9526,28.7698,144.6884,180.0661,116.6028,146.2594,199.8702,128.9378,423.2363,119.8537,124.6508,518.8625,306.3023,79.5213,121.0309,116.9346,170.8863,930.361,48.9983,55.039,47.1092,72.0548,75.4045,103.521,83.4134,142.3253,146.6215,121.4467,101.4252,68.4812,291.4275,143.9475,142.647,78.9826,47.094,204.2196,89.0208,82.792,27.1346,142.4764,83.7874,67.3216,112.9531,138.2549,133.3446,86.2659,45.3464,56.1604,43.5882,54.3623,86.296,115.7272,96.5498,111.8081,36.1756,40.2947,34.2532,89.1452,53.9062,36.458,113.9297,176.9962,77.3125,77.8891,64.807,64.1515,127.7242,119.6876,976.2324,322.8454,434.2883,168.6923,250.0284,234.7329,131.0793,152.335,118.8838,243.1772,24.1776,168.6327,170.7541,167.8444,75.9315,110.1045,113.4417,60.5464,66.8956,79.7606,71.6659,72.5251,77.513,207.8019,21.8592,35.2787,169.7698,146.5012,412.9934,248.0708,318.5489,104.1278,184.7592,108.0581,175.2646,169.7698,340.3732,570.3396,23.9853,69.0405,66.7391,67.9435,294.6085,68.0537,77.6344,433.2713,104.3178,229.4615,187.8587,78.1399,121.4737,122.5451,384.5935,38.5232,117.6835,50.3308,318.2513,103.6695,20.7181,321.9601,510.3248,13.4754,16.1188,44.8082,37.7291,733.4587,446.6241,21.1822,287.9603,327.2367,274.1109,195.4713,158.2114,64.4537,26.9857,172.8503]
y = [37,40,30,29,24,23,27,12,21,20,29,28,27,32,23,29,28,22,28,23,24,29,32,18,22,12,12,14,29,31,34,31,22,40,25,36,27,27,29,35,33,25,25,27,27,19,35,26,18,24,25,37,52,47,34,39,40,48,41,44,35,36,53,46,38,44,23,26,26,28,27,21,25,21,20,27,35,24,46,34,22,30,30,30,31,26,25,28,21,31,24,27,33,21,31,33,29,33,32,21,25,22,39,31,34,26,23,18,20,18,34,25,20,12,23,25,21,21,25,31,17,27,28,29,25,24,25,21,24,27,23,22,23,22,22,26,22,19,26,35,33,35,29,26,26,30,22,32,33,33,28,32,26,29,36,37,37,28,24,30,25,20,29,24,33,35,30,32,31,33,40,35,37,24,34,29,27,24,36,26,26,26,27,27,20,17,28,34,18,20,20,18,19,23,20,22,25,32,44,41,39,41,40,44,36,42,31,32,26,29,23,29,29,28,31,22,29,24,28,28,25]
xbreaks = [13.4754, 27.1346, 43.5882, 58.9526, 72.79, 89.1452, 110.1045, 131.0793, 158.2114, 180.0661, 207.8019, 234.7329, 252.9573, 300.9573, 327.2367, 348.246, 412.9934, 434.2883, 458.451, 518.8625, 595.5373, 640.6542, 733.4587, 815.8389, 930.361, 976.2324]
df = pd.DataFrame([x,y]).T
df.columns = ['x','y']
# Check the bin average and std using agge
bins = pd.cut(df.x,xbreaks,right=False)
t = df[['x','y']].groupby(bins).agg({"x": "mean", "y": ["mean","std"]})
t.reset_index(inplace=True)
t.columns = ['range_cut','x_avg_cut','y_avg_cut','y_std_cut']
t.index.name ='id'
# Get the bin average from regplot
# `seed` was never defined in the script; a fixed integer is passed so that
# repeated runs use the same random state for regplot's resampling.
seed = 0
# seaborn is imported as `sn` above, so the call must go through that alias
g = sn.regplot(x='x',y='y',data=df,fit_reg=False,x_bins=xbreaks,seed=seed)
xye = pd.DataFrame(get_data_XYE(g)).T
xye.columns = ['x_regplot','y_regplot','e_regplot']
xye.index.name = 'id'
t2 = xye.merge(t,on='id',how='left')
t2
You can see that the y and e from the two ways are different. I understand that the default x_ci or x_estimator may affect the result of regplot, but I still cannot reproduce these values in Excel by removing some of the lowest and/or highest values in each bin.
In seaborn.regplot, the x_bins are the center of each bin, and the original x values are assigned to the nearest bin value. Whereas in pandas.cut, the breaks define the bin edges.

Understanding problems in scikit learn in Python

Below is Youtuber Sentdex's machine learning code, and I couldn't understand some parts.
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)
df.drop(['body', 'name'], 1, inplace=True)
df.fillna(0, inplace=True)
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer ids.

    Columns whose dtype is ``int64`` or ``float64`` are left untouched. In
    every other column each distinct value gets an id 0, 1, 2, ... in order
    of first appearance, so the encoding is the same on every run (iterating
    a ``set`` of strings, as the original did, is not ordered reproducibly).

    The frame is modified in place and also returned.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            continue
        values = df[column].values.tolist()
        # creating dict that contains new id per unique value
        text_digit_vals = {}
        for value in values:
            text_digit_vals.setdefault(value, len(text_digit_vals))
        df[column] = [text_digit_vals[value] for value in values]
    return df
df = handle_non_numerical_data(df)
df.drop(['ticket', 'home.dest'], 1, inplace=True)
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])
clf = MeanShift()
clf.fit(X)
labels= clf.labels_ ###Can't understand###
cluster_centers = clf.cluster_centers_
original_df['cluster_group'] = np.nan
for i in range(len(X)):
original_df['cluster_group'].iloc[i] = labels[i]
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
temp_df = original_df[(original_df['cluster_group'] == float(i))]
# print(temp_df.head())
survival_cluster = temp_df[(temp_df['survived'] == 1)]
survival_rate = len(survival_cluster) / len(temp_df)
# print(i,survival_rate)
survival_rates[i] = survival_rate
print(survival_rates)
Supposedly in "labels = clf.labels_", labels are [0 : 5] (when I ran program and I got those numbers). But here's the question. Where do those numbers come from? and why 0,1,2? why not bigger number?
scikitlearn's documentation on Meanshift provides an explanation of the labels_ attribute that you seem confused about. Taken directly from the documentation
labels_ :
Labels of each point.
If you're more confused about what these labels represent, a brief explanation would be that the number refers to what bin that specific point was clustered into. So all the points with a value of 0 would all belong to the same cluster, and all the points with a value of 1 would all belong to the same cluster, and so on. What the value of these labels are doesn't really matter, they're just here to be able to identify which cluster the data point belongs to.
I'd recommend reading more about clustering if you're still confused about why you would want to label the data.

Pandas subsetting returning different results to numpy

I am trying to subset a pandas dataframe using two conditions. However, I am not getting the same results as when done with numpy. What am I doing wrong?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(20,120,101)
y = np.linspace(-45,25,101)
xs,ys = np.meshgrid(x,y)
idx = (xs >=100) & (ys >= 0)
plt.scatter(xs,ys,s=2,c='b')
plt.scatter(xs[idx],ys[idx],s=2,c='r')
I need to remove the red block from my dataset, which I can do with numpy by using:
plt.scatter(xs[~idx],ys[~idx],s=2,c='b')
How do I replicate this with a pandas dataframe?
I've tried using the same logic as I used above:
data = {'x':x,'y':y}
df = pd.DataFrame(data)
mask = (df.x >=100) & (df.y >= 0)
df2 = df[~mask]
I've also tried using loc:
df.loc[(df.x >=100) & (df.y >= 0),['x','y']] = np.nan
Both of these methods give the following result:
How do I replicate the results from numpy?
Many thanks.
You don't obtain the same result because you didn't create all the pairs of coordinates before passing them to pandas. Here is a quick solution:
data = {'x':xs.flatten(),'y':ys.flatten()}
df = pd.DataFrame(data)
mask = (df.x >=100) & (df.y >= 0)
df2 = df[~mask]
plt.scatter(df2.x,df2.y,s=2,c='b')
flatten() reshapes your arrays to have only one dimension, so that they can be used to construct a DataFrame containing pairs of coordinates rather than lists.
Output:
Edit: Same result but with dataframe containing x and y
Split the df in chunks
data_x = np.linspace(20,120,101)
data_y = np.linspace(-45,25,101)
dataframe = pd.DataFrame({'x':data_x,'y':data_y})
chunk_size = 25
dfs = [dataframe[i:i+chunk_size] for i in range(0,dataframe.shape[0],chunk_size)]
Define the function that will give you the points you are interested in. Two loops because you need to get every configuration of x and y values
def generatorPoints(dfs):
    """Yield the grid points that lie outside the block x >= 100 and y >= 0.

    ``dfs`` is a sequence of DataFrame chunks, each with ``x`` and ``y``
    columns. The ``x`` values of every chunk are combined with the ``y``
    values of every chunk (including itself), so all x/y combinations are
    covered one chunk pair at a time.

    Yields one ``(xs, ys)`` pair of 1-D arrays per chunk pair, holding the
    coordinates that remain after the block is masked out.
    """
    for x_chunk in dfs:
        x = x_chunk.x
        for y_chunk in dfs:
            xs, ys = np.meshgrid(x, y_chunk.y)
            idx = (xs >= 100) & (ys >= 0)
            yield xs[~idx], ys[~idx]
x, y = [], []
for xs, ys in generatorPoints(dfs):
x.extend(xs), y.extend(ys)
plt.scatter(x,y,s=2,c='b')
This gives the same result as the previous code. There is certainly place to make some optimization but this is a start for your request :).

python: increase performance of finding the best timeshift for a correlation between each X column and y

I have a dataframe X with several columns and a dataframe y with only one column (series). The rows in X represent timesteps and I want to find the interval I need to shift each column of X to obtain the highest correlation with y. I wrote a function that loops over all columns and then loops over all timesteps and correlates the X column with y. If the R² is better than before I store the timestep. However, with over 300 columns this routine is really taking some time and I need to increase the performance. Is there a nice way to simplify this code?
(In the example I used the iris data set which is of course not a timeseries...)
from sklearn import datasets
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from copy import deepcopy
def get_best_shift(dfX, dfy, ti=60, maxt=1440):
    """
    Find, for every column of dfX, the shift that correlates best with dfy.

    Shifts of 0, 1, ..., int(maxt / ti) - 1 rows are tried. For a shift s,
    row i of the column is paired with row i + s of dfy, pairs containing
    NaN are dropped, and the squared Pearson correlation is computed. The
    first shift with the highest R^2 wins.

    Parameters:
        dfX:  DataFrame whose columns are the candidate variables.
        dfy:  target values with as many rows as dfX; a one-column
              DataFrame (only its first column is used) or a Series.
        ti:   length of one time step.
        maxt: longest look-back, in the same unit as ti.

    Returns a copy of dfX in which each column is moved down by its best
    shift, padded with zeros at the top, and renamed '<column>_t-<shift>'.
    dfX itself is not modified.
    """
    n_rows = dfX.shape[0]
    # pull the target out once instead of rebuilding it for every shift
    target = np.asarray(dfy, dtype=float)
    if target.ndim > 1:
        target = target[:, 0]
    # a shift of n_rows or more leaves no overlapping rows to correlate
    n_shifts = min(int(maxt / ti), n_rows)

    df_out = dfX.copy()
    for xcol in dfX:
        column = dfX[xcol].to_numpy(dtype=float)
        bestshift = 0
        Rmax = 0
        for ishift in range(0, n_shifts):
            xvals = column[:n_rows - ishift]
            yvals = target[ishift:n_rows]
            selector = ~np.isnan(xvals * yvals)
            if selector.sum() < 2:
                # a correlation needs at least two complete pairs
                continue
            R = np.corrcoef(xvals[selector], yvals[selector])[0][1]
            # a NaN R (constant input) compares False and is skipped
            if R ** 2 > Rmax:
                Rmax = R ** 2
                bestshift = ishift
        df_out[xcol] = dfX[xcol].shift(bestshift, fill_value=0).to_numpy()
        df_out = df_out.rename(columns={xcol: ''.join([str(xcol), '_t-', str(bestshift)])})
    return df_out
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
y = pd.DataFrame(iris.target)
df = get_best_shift(X,y)

Categories

Resources