I have a dataframe df.
I would like to partition df into sub-dataframes and apply the function find_root to each of them. My function only takes the columns id and parent_id as input.
Then I would like to concatenate the resulting dataframes. Because my dataframe is huge (over 4 million rows), I would like to use Dask. However, I get the following error:
ValueError: The columns in the computed data do not match the columns in the provided metadata
Extra: []
Missing: [2]
Could you please elaborate on how to solve this error?
import pandas as pd
import networkx as nx
from dask.distributed import Client
import dask.dataframe as dd
client = Client(n_workers=2, threads_per_worker=1, processes=False, memory_limit='4GB')
def find_root(df):
    # build a directed graph of parent_id -> id edges for this group
    g = nx.from_pandas_edgelist(df, source='parent_id', target='id', create_using=nx.DiGraph())
    roots = {n for n, d in g.in_degree() if d == 0}
    tmp = {}
    for r in roots:
        tree = nx.dfs_tree(g, r)
        tmp[r] = list(tree.nodes)
    tmp = pd.DataFrame.from_dict(tmp, orient='index').T
    tmp = tmp.melt(value_name='node', var_name='root').dropna()
    return tmp
path = 'https://raw.githubusercontent.com/leanhdung1994/WebMining/main/sample_df.csv'
df = dd.read_csv(path, header = 0)
df = df[['id', 'created_utc', 'ups', 'link_id', 'author', 'body', 'parent_id']]
df['parent_id'] = df['parent_id'].str.split('_', expand = True, n = 2)[1]
df['link_id'] = df['link_id'].str.split('_', expand = True, n = 2)[1]
result = df.groupby('link_id').apply(find_root, meta = object)
computed_result = result.compute()
Update: I added dtype to dd.read_csv
df = dd.read_csv(path, header = 0, dtype = {'id': 'str', 'parent_id': 'str', 'link_id': 'str'})
but the problem persists.
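For reference, this kind of mismatch is usually about the meta argument not describing the frame the applied function actually returns: find_root returns a DataFrame with root and node columns, so meta should describe those. A minimal sketch of passing an explicit meta, assuming find_root stays as above (I cannot verify it against your data):
result = df.groupby('link_id').apply(
    find_root,
    # describe the frame find_root actually returns
    meta={'root': 'object', 'node': 'object'},
)
computed_result = result.compute()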
I'm fairly new to Python and would like to loop through multiple XML files. I'm currently using the following code to pull in the sample2.xml file:
import xml.etree.ElementTree as ET
import pandas as pd
import os
tree=ET.parse("sample2.xml")
root = tree.getroot()
qty=root.iterfind(".//Qty")
pri=root.iterfind(".//PriceAmount")
cor=root.iterfind(".//AuctionIdentification")
data =[]
for x, y, z in zip(qty, pri, cor):
    #print(x.get("v"), y.get("v"))
    a = x.get("v"), y.get("v"), z.get("v")
    data.append(a)
df = pd.DataFrame(data, columns=["Qty", "Price" , "Border"])
df['Qty'] = df['Qty'].astype(float)
df['Price'] = df['Price'].astype(float)
#print(df)
total = df['Qty'].sum()
price = df['Price'].mean()
border = df.loc[0,'Border']
df2 = pd.DataFrame(columns=["Qty", "Price" , "Border"])
df2['Qty'] = [total]
df2['Price'] = [price]
df2['Border'] = [str(border)[0:12]]
I tried adding soup.xml to the line of code below, but this didn't work:
tree=ET.parse("sample2.xml , "soup xml")
root = tree.getroot()
Consider turning your code into a function and calling it for the various files you need:
import xml.etree.ElementTree as ET
import pandas as pd
import os
def my_xml_processor(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
    return df2
You can then call it for your files:
my_xml_processor("sample2.xml")
my_xml_processor("soup.xml")
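If you want the per-file summaries in a single dataframe, you could also collect the returned frames and concatenate them, for example (using the two file names above):
results = pd.concat(
    [my_xml_processor(f) for f in ["sample2.xml", "soup.xml"]],
    ignore_index=True,
)
print(results)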
EDIT: these are some minor code changes that I'd recommend:
import xml.etree.ElementTree as ET
import pandas as pd
import os
def my_xml_processor(filename: str) -> pd.DataFrame:  # <- Add type hints
    root = ET.parse(filename).getroot()  # <- tree is not used
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = [  # <- This could be a list comprehension
        (x.get('v'), y.get('v'), z.get('v'))
        for x, y, z in zip(qty, pri, cor)
    ]
    df = (pd
          .DataFrame(data, columns=["Qty", "Price", "Border"])
          .astype({
              'Qty': float,
              'Price': float,
          })
          )
    df2 = df.agg({
        'Qty': 'sum',
        'Price': 'mean',
        'Border': lambda x: str(x[0])[:12]
    }).to_frame().T
    return df2
You could use your existing code, but run it in a loop for each of the filenames you have, something like:
import xml.etree.ElementTree as ET
import pandas as pd
import os
files = ['sample2.xml', 'sample3.xml', 'sample4.xml']
for file in files:  # read each filename from the above list
    tree = ET.parse(file)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
I'd like to import the output of:
kubectl get pods -o json
into a Python pandas dataframe. It should also contain all containers and their resource requests and limits.
My code starts as follows:
import json
import numpy as np
import pandas as pd
import os
pods_raw = os.popen('kubectl get pods -o json').read()
pods_json = json.loads(pods_raw)['items']
From here on I struggle to get the data into a dataframe in the right shape; in particular, 'spec.containers' should be split up when multiple containers exist.
Here is an example of how you can extract the data of interest into a dataframe. The output is only an example (as you didn't specify the required output in the question):
import json
import pandas as pd
# open the Json data from file (or use os.popen):
with open("data.json", "r") as f_in:
    data = json.load(f_in)
df = pd.DataFrame(data["items"])
# metadata:
df = pd.concat(
[df, df.pop("metadata").apply(pd.Series).add_prefix("meta_")], axis=1
)
# spec:
df = pd.concat(
[df, df.pop("spec").apply(pd.Series).add_prefix("spec_")], axis=1
)
# status:
df = pd.concat(
[df, df.pop("status").apply(pd.Series).add_prefix("status_")], axis=1
)
# keep only columns of interests:
df = df[["meta_name", "meta_namespace", "status_phase", "spec_containers"]]
# explode spec_containers column
df = df.explode("spec_containers")
df = pd.concat(
[
df,
df.pop("spec_containers")
.apply(pd.Series)
.add_prefix("spec_")[["spec_image", "spec_name"]],
],
axis=1,
)
print(df)
Prints:
meta_name meta_namespace status_phase spec_image spec_name
0 apache-lb-648c5cb8cb-mw5zh default Running httpd apache
0 apache-lb-648c5cb8cb-mw5zh default Running index.docker.io/istio/proxyv2:1.13.4 istio-proxy
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/sig-storage/csi-node-driver-registrar:v2.0.1 driver-registrar
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/cephcsi/cephcsi:v3.3.1 csi-cephfsplugin
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/cephcsi/cephcsi:v3.3.1 liveness-prometheus
...and so on.
Currently I have the following code to solve this:
#!/usr/bin/env python
import json
import pandas as pd
import os
kb = 1024
mb = kb * kb
gb = mb * kb
tb = gb * kb
def main():
    pods_raw = os.popen('kubectl get pods -A -o json').read()
    pods_json = json.loads(pods_raw)['items']
    first_split = ['status', 'metadata', 'spec']
    second_split = ['spec.containers', 'spec.containers.resources',
                    'spec.containers.resources.limits', 'spec.containers.resources.requests']
    df_pods = pd.DataFrame.from_dict(pods_json)
    df_pods = concat_data(df_pods, first_split)
    df_pods = expand_data(df_pods, ['spec.containers'])
    df_pods = concat_data(df_pods, second_split)
    df_pods.index.name = 'index'
    col_to_normalize = ['spec.containers.resources.limits.cpu',
                        'spec.containers.resources.limits.memory',
                        'spec.containers.resources.requests.cpu',
                        'spec.containers.resources.requests.memory']
    for col_name in col_to_normalize:
        df_pods[col_name] = df_pods[col_name].map(normalize_values)
    df_pods[col_to_normalize] = df_pods.groupby('index')[col_to_normalize].sum()
    df_pods = df_pods.drop_duplicates(['metadata.name'])
    df_pods[df_pods['status.phase'] == 'Running']
    print(df_pods)

def concat_data(df: pd.DataFrame, expands: list) -> pd.DataFrame:
    for expantion in expands:
        df = pd.concat([df, df.pop(expantion).apply(pd.Series).add_prefix(f"{expantion}.")], axis=1)
    return df

def expand_data(df: pd.DataFrame, expands: list) -> pd.DataFrame:
    for expantion in expands:
        s = df[expantion].apply(pd.Series).stack()
        s.index = s.index.droplevel(-1)
        df.index = [x for x in df.index]
        del df[expantion]
        s.name = expantion
        df = df.join(s)
    return df

def normalize_values(val: str) -> int:
    try:
        if val[-1] == 'm':
            return int(val[:-1]) / 1000
        if val[-2].lower() == "k":
            return int(val[:-2]) * kb
        if val[-2].lower() == "m":
            return int(val[:-2]) * mb
        if val[-2].lower() == "g":
            return int(val[:-2]) * gb
        if val[-2].lower() == "t":
            return int(val[:-2]) * tb
        return int(val)
    except:
        return 0

if __name__ == '__main__':
    main()
This works fine except for the following FutureWarning, which I don't yet know how to solve:
./resources.py:43: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
df = pd.concat( [df, df.pop(expantion).apply(pd.Series).add_prefix(f"{expantion}.")], axis=1)
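The warning comes from rows where the nested value is empty, so apply(pd.Series) has to build an empty Series without a dtype. Since Series.apply forwards extra keyword arguments to the function it calls, one way to silence it (a sketch, not tested against your cluster output) is to pass dtype=object explicitly:
def concat_data(df: pd.DataFrame, expands: list) -> pd.DataFrame:
    for expantion in expands:
        # dtype=object is forwarded to pd.Series, so empty/missing entries
        # no longer trigger the "default dtype for empty Series" FutureWarning
        expanded = df.pop(expantion).apply(pd.Series, dtype=object).add_prefix(f"{expantion}.")
        df = pd.concat([df, expanded], axis=1)
    return df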
I am trying to fetch data from an API for 50 parcels, and I want the results in a single data frame. While running this loop, the data frame only keeps the last parcel that satisfies the loop condition. Is there any way to also store all the previous outputs in the same dataframe?
For example, running this code only returns the data frame for foreign id = 50; I want the dataframe for all ids 1-50.
import requests
import pandas as pd
foreign=1
while (foreign <= 50):
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    dfinal_p6
    foreign = foreign + 1  # move to the next parcel id
The issue was resolved by first creating an empty data frame and then appending the outputs to it within the loop.
The updated code is as follows:
column_names = ["parcel_foreign_id_x", "s1product_end_time", "s1product_ron","cohvh_avg", "cohvv_avg", "vhvv_avg","parcel_foreign_id_y", "s2product_start_time", "s2product_ron", "ndvi_avg" ]
df = pd.DataFrame(columns = column_names)
foreign=1
while (foreign <= 50):
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    df = pd.concat([dfinal_p6, df], ignore_index=True)
    foreign = foreign + 1
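As an aside, replacing the parcel id by slicing the URL at a fixed position is fragile; building the URL with an f-string and collecting the per-parcel frames in a list (concatenating once at the end, so the growing dataframe is not re-copied on every iteration) does the same job more robustly. A rough sketch using the same endpoints:
import requests
import pandas as pd

frames = []
for foreign in range(1, 51):
    s1_url = ('https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1'
              f'?limit_to_rasters=true&parcel_foreign_id={foreign}'
              '&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron'
              '%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg')
    s2_url = ('https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2'
              f'?limit_to_rasters=true&parcel_foreign_id={foreign}'
              '&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg')
    df_s1 = pd.DataFrame(requests.get(s1_url).json()['s1_time_series'])
    df_s2 = pd.DataFrame(requests.get(s2_url).json()['s2_time_series'])
    df_s1.s1product_end_time = df_s1.s1product_end_time.str[0:11]
    df_s2.s2product_start_time = df_s2.s2product_start_time.str[0:11]
    merged = df_s1.merge(df_s2, how='inner',
                         left_on='s1product_end_time', right_on='s2product_start_time')
    frames.append(merged)

df = pd.concat(frames, ignore_index=True)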
I'm getting the following error when calling a function from another function:
TypeError: 'GLMResultsWrapper' object is not callable
I get the error at the coeffs = model_results(model_results) line below.
model_results is another function that runs error-free outside of the table_to_graph function; it takes the summary output from a statsmodels model and puts it into a data frame.
The table_to_graph function joins that dataframe to another table, which is the df input.
The table_to_graph function is the following:
# Add into table generation table
def table_to_graph(model_results, df):
    '''
    #function that combines rating tables and model summary
    '''
    coeffs = model_results(model_results)
    try:
        df['key'] = df['variable'] + "_" + df['level']
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df[factor])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df[error_up])
        df['error_down'] = np.exp(df[error_down])
        #title2 = title1
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu',
                 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        return df
        #df1 = df1.append(df)
    except:
        #df['level'] = df['level'].astype('str')
        df['key'] = df['variable'] + "_" + df['level'].astype('str')
        df['level'] = df['level'].astype('int')
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df[factor])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df[error_up])
        df['error_down'] = np.exp(df[error_down])
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu',
                 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        #df1 = df1.append(df)
        return df
The model_results function is below:
def model_results(model_results):
    '''
    function that puts model parameters into a data frame
    '''
    df = pd.DataFrame(model_results.params, columns=['factor'])
    df['error_down'] = model_results.conf_int()[0]
    df['error_up'] = model_results.conf_int()[1]
    df['standard_error'] = model_results.bse
    df['pvalues'] = round(model_results.pvalues, 3)
    df.reset_index(inplace=True)
    return df
The problem is that inside table_to_graph the parameter model_results shadows the function of the same name, so coeffs = model_results(model_results) tries to call the GLM results object on itself. That is why you get the error that the object is not callable.
Rename either the function or the model_results data so that Python can distinguish between the two and do what you want: call the function on the results data.
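For example, renaming the helper (results_to_frame is just an illustrative name) removes the shadowing:
def results_to_frame(model_results):
    '''
    function that puts model parameters into a data frame
    '''
    df = pd.DataFrame(model_results.params, columns=['factor'])
    df['error_down'] = model_results.conf_int()[0]
    df['error_up'] = model_results.conf_int()[1]
    df['standard_error'] = model_results.bse
    df['pvalues'] = round(model_results.pvalues, 3)
    df.reset_index(inplace=True)
    return df

# inside table_to_graph, the call then becomes:
# coeffs = results_to_frame(model_results)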
I am doing a mean-variance portfolio optimization exercise and I am stuck with the error mentioned in the title. The goal of my optimization is to maximize the Sharpe ratio of a portfolio of 10 assets, subject to an additional tracking-error constraint.
For the tracking-error portfolio, the benchmark is an equal-weighted portfolio of the 10 funds (in each sheet). The tracking error is defined as the standard deviation of the difference between the returns of the portfolio and the benchmark. In this project, we require the annual tracking error to be no greater than 5%. See the picture of the constraint formula:
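In symbols (a reconstruction from this description and from constraint2 in the code below, since the picture itself is not reproduced here):

\sqrt{12}\,\sqrt{(w - w_b)^{\top}\,\Sigma\,(w - w_b)} \le 0.05

where w is the vector of portfolio weights, w_b is the equal-weight benchmark (0.1 in each of the 10 funds), and \Sigma is the monthly covariance matrix of the fund returns.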
Here is my code:
import numpy as np
from scipy.optimize import minimize
import pandas as pd
# importing the data
data_ff = pd.read_excel(r'path\Project_2.xlsx', 'Fama-French Factor', parse_dates=True, index_col="Date")
data_mf = pd.read_excel(r'path\Project_2.xlsx', 'Mutual Fund', parse_dates=True, index_col="Date")
data_sb = pd.read_excel(r'path\Project_2.xlsx', 'SmartBeta', parse_dates=True, index_col="Date")
data_hf = pd.read_excel(r'path\Project_2.xlsx', 'Hedge Fund Index', parse_dates=True, index_col="Date")
data_ff.rename(columns={'dateff':'Date'}, inplace = True)
# in-sample period
data_ff = data_ff[data_ff.index>=200101].dropna()
data_ff = data_ff[data_ff.index<=201212].dropna()
data_mf = data_mf[data_mf.index>=200101].dropna()
data_mf = data_mf[data_mf.index<=201212].dropna()
data_sb = data_sb[data_sb.index>=200101].dropna()
data_sb = data_sb[data_sb.index<=201212].dropna()
data_hf = data_hf[data_hf.index>=200101].dropna()
data_hf = data_hf[data_hf.index<=201212].dropna()
mf_cov_mat = data_mf.cov()*(12)
mf_mu = data_mf.mean()*12
sb_cov_mat = data_sb.cov()*(12)
sb_mu = data_sb.mean()*12
hf_cov_mat = data_hf.cov()*(12)
hf_mu = data_hf.mean()*12
weight_MVP_mf = pd.DataFrame(index = mf_mu.index)
weight_MVP_sb = pd.DataFrame(index = sb_mu.index)
weight_MVP_hf = pd.DataFrame(index = hf_mu.index)
results_MVP_mf = pd.DataFrame(np.zeros((3,4)), index = ('sharpe','return','volatility'),
columns = ('long-only','long-short','error-tracking','factor-neutral'))
results_MVP_sb = pd.DataFrame(np.zeros((3,4)), index = ('sharpe','return','volatility'),
columns = ('long-only','long-short','error-tracking','factor-neutral'))
results_MVP_hf = pd.DataFrame(np.zeros((3,4)), index = ('sharpe','return','volatility'),
columns = ('long-only','long-short','error-tracking','factor-neutral'))
for df in (data_mf, data_sb, data_hf):
    # return vector and var-cov matrix
    df_mu = pd.Series.to_frame(df.mean()*12)
    df_cov = df.cov()
    eq_weight = pd.DataFrame(np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]))

    # sharpe ratio
    def objective_df(x):
        risk_free = data_ff['rf'].iloc[-1]
        df_port_mu = np.dot(df_mu.T, x)
        df_port_stdev = np.sqrt(np.dot(np.dot(x.T, df_cov), x))*np.sqrt(12)
        return -df_port_mu/df_port_stdev

    # total weight = 100%
    def constraint1(x):
        return x.sum() - 1

    # tracking error
    def constraint2(x):
        y = x - eq_weight
        z = float(np.sqrt(np.dot(np.dot(y.T, df_cov), y))*np.sqrt(12) - 0.05)
        return z

    x0 = eq_weight
    b = (0, 1)
    bnds = (b, b, b, b, b, b, b, b, b, b)
    cons1 = {'type': 'eq', 'fun': constraint1}
    cons2 = {'type': 'ineq', 'fun': constraint2}
    cons = [cons1, cons2]
    sol = minimize(objective_df, x0, method='SLSQP', bounds=bnds, constraints=cons)
    print(sol.fun)
The error message I get seems to be coming from constraint 2. Can anyone help me with this?