I am trying to compute a value for each row of a dataframe in parallel using the following code, but I get errors whether I pass the individual input ranges or their combination:
#!pip install pyblaze
import itertools
import pyblaze
import pyblaze.multiprocessing as xmp
import pandas as pd

inputs = [range(2), range(2), range(3)]
inputs_list = list(itertools.product(*inputs))
Index = pd.MultiIndex.from_tuples(inputs_list, names=["a", "b", "c"])  # a list, not a set: sets have no order
df = pd.DataFrame(index=Index)
df['Output'] = 0
print(df)

def Addition(A, B, C):
    df.loc[(A, B, C), 'Output'] = A + B + C
    return df

def parallel(inputs_list):
    tokenizer = xmp.Vectorizer(Addition, num_workers=8)
    return tokenizer.process(inputs_list)

parallel(inputs_list)
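A likely pair of culprits: Vectorizer.process most likely hands each (a, b, c) tuple to Addition as a single argument, and each worker process mutates its own copy of df, so the parent's dataframe never changes. A minimal sketch of the usual pattern (swapping in the stdlib multiprocessing for pyblaze, as an assumption about the goal): compute the values in the workers and assign the whole column once in the parent.

import itertools
import multiprocessing as mp
import pandas as pd

def addition(abc):
    # Unpack one (a, b, c) tuple and return the value instead of mutating df.
    a, b, c = abc
    return a + b + c

if __name__ == "__main__":
    inputs = [range(2), range(2), range(3)]
    inputs_list = list(itertools.product(*inputs))
    index = pd.MultiIndex.from_tuples(inputs_list, names=["a", "b", "c"])
    df = pd.DataFrame(index=index)

    with mp.Pool(processes=8) as pool:
        results = pool.map(addition, inputs_list)

    # Assign all results at once in the parent process.
    df["Output"] = results
    print(df)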
In my code, I run a t-test which sometimes yields nan when comparing two all-zero groups. I have tried making new data frames, tried replacing with .replace, and also tried fillna(), but nothing was successful. I also get errors when trying to define a new dataframe or re-read the file after adding new calculations.
How do I replace the nulls and nan values in these files: "significant_report2.xls" and "quant_report2.xls"?
import json
import os, sys
import numpy as np
import pandas as pd
import scipy.stats

output_report = "quant_report2.xls"
significant_report = "significant_report2.xls"
output_report_writer = open(output_report, "w")
significant_writer = open(significant_report, "w")

# set up samples grouped by control and treatment
# (control_indices, treatment_indices, quant_columns and
#  quantitative_data_frame are defined earlier, not shown)
header = []
for idx in control_indices:
    header.append(quant_columns[idx])
for idx in treatment_indices:
    header.append(quant_columns[idx])
output_report_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n" % "\t".join(header))
significant_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n" % "\t".join(header))

feature_list = list(quantitative_data_frame.index)
for feature_idx in range(len(feature_list)):
    feature_name = feature_list[feature_idx]
    control_values = quantitative_data_frame.iloc[feature_idx, control_indices]
    treatment_values = quantitative_data_frame.iloc[feature_idx, treatment_indices]
    ttest_stat, ttest_pvalue = scipy.stats.ttest_ind(control_values, treatment_values, equal_var=False)
    ctrl_means = np.mean(control_values, axis=0)    # scipy.mean is removed in recent scipy
    ctrl_stdDev = scipy.stats.tstd(control_values)
    tx_means = np.mean(treatment_values, axis=0)
    tx_stdDev = scipy.stats.tstd(treatment_values)  # was tx_stdDev1, a NameError below
    output_report_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))
    significant_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))
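A two-sample t-test on two all-zero groups has zero variance in both samples, so scipy returns nan for the statistic and the p-value; that is why replacing values in the frame afterwards never helped. A minimal sketch of two ways to handle it (the sentinel value 1.0 is my assumption; pick whatever suits the report):

import numpy as np
import pandas as pd

# Option 1: guard the statistic before writing the row.
if np.isnan(ttest_pvalue):
    ttest_pvalue = 1.0  # sentinel meaning "no evidence of a difference"

# Option 2: clean the finished report afterwards (the files are
# tab-separated text despite the .xls extension).
report = pd.read_csv("quant_report2.xls", sep="\t")
report = report.fillna(0)
report.to_csv("quant_report2.xls", sep="\t", index=False)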
I am using investpy to get historical stock data for two stocks (TRP_pb and TRP_pc):
import investpy
import pandas as pd
import numpy as np

TRP_pb = investpy.get_stock_historical_data(stock='TRP_pb',
                                            country='canada',
                                            from_date='01/01/2022',
                                            to_date='01/04/2022')
print(TRP_pb.head())

TRP_pc = investpy.get_stock_historical_data(stock='TRP_pc',
                                            country='canada',
                                            from_date='01/01/2022',
                                            to_date='01/04/2022')
print(TRP_pc.head())
I can append the two tables using the append method:
appendedtable = TRP_pb.append(TRP_pc, ignore_index=False)
What I am trying to do is use a loop to combine these two tables. Here is what I have tried so far:
preferredlist = ['TRP_pb', 'TRP_pc']
for i in preferredlist:
    new = investpy.get_stock_historical_data(stock=i,
                                             country='canada',
                                             from_date='01/01/2022',
                                             to_date='01/04/2022')
    new.append(new, ignore_index=True)
However, this doesn't work. I would appreciate any help.
Since get_stock_historical_data returns a DataFrame, you can create an empty DataFrame before the loop and concat inside it. (Note that the loop above appends new to itself and discards the result: DataFrame.append returns a new frame rather than modifying in place, and it was removed in pandas 2.0, so pd.concat is the way to go.)
preferredlist = ['TRP_pb', 'TRP_pc']
final_list = pd.DataFrame()
for i in preferredlist:
    new = investpy.get_stock_historical_data(stock=i,
                                             country='canada',
                                             from_date='01/01/2022',
                                             to_date='01/04/2022')
    final_list = pd.concat([final_list, new])
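A variant of the same idea that avoids re-copying the growing frame on every iteration: collect the pieces in a list and concatenate once at the end. The optional keys argument labels which rows came from which stock:

preferredlist = ['TRP_pb', 'TRP_pc']
frames = []
for i in preferredlist:
    frames.append(investpy.get_stock_historical_data(stock=i,
                                                     country='canada',
                                                     from_date='01/01/2022',
                                                     to_date='01/04/2022'))
combined = pd.concat(frames, keys=preferredlist)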
I have a dataframe and am trying to find, for each row, its closest matches using the Mahalanobis distance across three columns, like:
from io import StringIO
from sklearn import metrics
import pandas as pd
stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")
stats = ['ratio1','pct1','rsp']
df = pd.read_csv(stringdata)
d = metrics.pairwise.pairwise_distances(df[stats].to_numpy(),  # as_matrix() was removed in pandas 1.0
                                        metric='mahalanobis')
print(df)
print(d)
Where that pid column is a unique identifier.
What I need to do is take the ndarray returned by the pairwise_distances call and update the original dataframe so each row gets some kind of list of its closest N matches (so pid 0 might have an ordered-by-distance list like 2, 1, 5, 3, 4, or whatever it actually is), but I'm totally stumped as to how this is done in Python.
from io import StringIO
import numpy as np
import pandas as pd
from sklearn import metrics

stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")
stats = ['ratio1', 'pct1', 'rsp']
df = pd.read_csv(stringdata)

dist = metrics.pairwise.pairwise_distances(df[stats].to_numpy(),
                                           metric='mahalanobis')
dist = pd.DataFrame(dist)

# argsort each row: the columns hold pids ordered from nearest to farthest
ranks = pd.DataFrame(np.argsort(dist.values, axis=1))
df["rankcol"] = ranks.apply(lambda row: ','.join(map(str, row)), axis=1)
df
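Note that column 0 of each argsorted row is the point itself (its distance to itself is 0). Continuing from the snippet above, a small sketch of keeping only the N closest other points (N=3 is an arbitrary choice here):

N = 3
# Drop the self-match in column 0, keep the next N pids per row.
nearest = np.argsort(dist.values, axis=1)[:, 1:N + 1]
df["top%d" % N] = [','.join(map(str, row)) for row in nearest]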
I have a function which I apply to the rows of a dataframe. The function returns a list of variable length, depending on a parameter.
For now I use the following example code:
import pandas as pd

df = pd.read_csv("data.csv")

def add_columns(x, amount):
    return range(amount)

df["L1"], df["L2"], df["L3"] = zip(*df.apply(lambda x: add_columns(x, 3), axis=1))
Is there a way to add the labels automatically?
I'm not sure I understand correctly what you want to populate the columns with, but this should work:
import pandas as pd
import numpy as np

def add_columns(x, *args):
    # populate each named column with the row's value
    col_names = args[0]
    return pd.Series({i: x for i in col_names})

def add_range(x, *args):
    # populate the named columns with 0..amount-1
    amount, col_names = args
    return pd.Series({k: v for k, v in zip(col_names, range(amount))})

df = pd.DataFrame(np.random.uniform(size=(10, 2)), columns=["A", "B"])
labels = ["L1", "L2", "L3"]

# This populates with values from the "A" column
df.merge(df["A"].apply(add_columns, args=(labels,)), left_index=True, right_index=True)

# This populates with values from range(number_passed)
df.merge(df["A"].apply(add_range, args=(3, labels)), left_index=True, right_index=True)
I need to use my own data for a zipline project, and I keep getting this error whenever I try:
/Library/Python/2.7/site-packages/zipline/sources/data_source.pyc in <dictcomp>((target, (mapping_func, source_key)))
47 """
48 row = {target: mapping_func(raw_row[source_key])
---> 49 for target, (mapping_func, source_key)
50 in self.mapping.items()}
51 row.update({'source_id': self.get_hash()})
ValueError: cannot convert float NaN to integer
Here is the trading algorithm I am running:
from zipline.algorithm import TradingAlgorithm
from zipline.api import order_target, order, record, symbol, history, add_history
import numpy as np
from pandas import Series, DataFrame, Panel
import pandas as pd

# Define algorithm
def initialize(context):
    context.dateIndex = 0

def handle_data(context, data):
    today = data.major_axis[context.dateIndex]
    if today > data.US9663871021[data.US9663871021.close.notnull()].index[0] and today < data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order(symbol('US9663871021'), 10)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today > data.US7954351067[data.US7954351067.close.notnull()].index[0] and today < data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order(symbol('US7954351067'), 10)
        record(US7954351067=data[symbol('US7954351067')].price)
    if today == data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order_target(symbol('US9663871021'), 0)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today == data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order_target(symbol('US7954351067'), 0)
        record(US7954351067=data[symbol('US7954351067')].price)
    context.dateIndex = context.dateIndex + 1

def prepDf(fileName):
    df = pd.io.parsers.read_csv(fileName, index_col=[0], parse_dates=[0], na_values=["#N/A N/A"],
                                names=["date", "open", "high", "low", "close", "volume", "mkt_cap"])
    df["price"] = df.close
    df.index = df.index.tz_localize('UTC')
    df = df[df.close.notnull()]
    return df

fileName = #fill in file name
fileName2 = #fill in file name
dictionaryOfDfs = {"US9663871021": prepDf(fileName), "US7954351067": prepDf(fileName2)}
data = Panel(dictionaryOfDfs)
algo_obj = TradingAlgorithm(initialize=initialize,
                            handle_data=handle_data)

# Run algorithm
perf_manual = algo_obj.run(data)
The idea is that I only buy while the data is non-NaN and I sell the position before the end of the series, so the NaN values should never actually be used; yet zipline raises the error anyway.
After researching, I believe the solution is to re-index the underlying DataFrames. The "cannot convert float NaN to integer" most likely comes from zipline casting each row's volume to int while it iterates over every date in the Panel, so dates where one frame has no data blow up even if the algorithm never trades them:
df1 = df1.reindex(index=data.major_axis, fill_value=0)
df2 = df2.reindex(index=data.major_axis, fill_value=0)
where data is the pandas Panel.
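Continuing that thought, a hypothetical sketch of where the re-indexing would slot into the script above: build the union of both date indexes, re-index every frame against it, and only then construct the Panel, so zipline never sees a NaN row:

# Re-index each frame to the union of all dates; fill_value=0 keeps the
# volume column integer-convertible instead of NaN on missing dates.
all_dates = dictionaryOfDfs["US9663871021"].index.union(dictionaryOfDfs["US7954351067"].index)
dictionaryOfDfs = {name: frame.reindex(index=all_dates, fill_value=0)
                   for name, frame in dictionaryOfDfs.items()}
data = Panel(dictionaryOfDfs)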