getting different threads to alter different parts of a pandas dataframe - python

I am new to multithreading in python so am not sure how to set this up. I am trying to produce a large output dataframe populated with calculations based on another input dataframe. The output dataframe is like an adjacency matrix of the columns of the input dataframe.
The following non-multithreaded version works perfectly:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import json
import os
import time
def build_adjacency_matrix(DATA_MATRIX, OUT):
# READS DATA: data must be a csv with a header and an index column
my_data = pd.read_csv(DATA_MATRIX, index_col=0)
# INITIALIZE EMPTY DF WITH COLSNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
AM = pd.DataFrame(columns=my_data.columns, index = my_data.columns)
y=0
w=2
for c1 in my_data.columns:
print (c1)
y+=1
if y > w:
time.sleep(1) # GIVE THE PROCESSER A REST AFTER EACH 10 COLUMNS
print(y) #KEEP TRACK OF HOW MANY COLS HAVE BEEN PROCESSED
w+=10
for c2 in my_data.columns:
if c1==c2: AM.loc[c1,c2]=0; continue
sample_df = pd.DataFrame(my_data, columns=[c1,c2])
# KEEP ONLY ROWS WITH 1s and 0s
sample_df = sample_df[sample_df[c1] != 0.5]
sample_df = sample_df[sample_df[c2] != 0.5]
sample_df = sample_df.dropna()
# CALCULATE ChiX
# Contingency table.
contingency = pd.crosstab(sample_df[c1], sample_df[c2])
# Chi-square test of independence.
try:
chi2, p, ddof, expected = chi2_contingency(contingency)
AM.loc[c1,c2] = p
except:
ValueError;
# ASSIGN AS NOT SIGNIFICANT IF THERE IS A PROBLEM
AM.loc[c1,c2] = 1
AM.to_csv(OUT, sep=',')
return
# FILES
data_matrix='input_test.csv'
out='output_mt_test.csv'
# FUNCTION CALL
build_adjacency_matrix(data_matrix, out)
Here is the top few rows of the input file:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
SAMPLE1,1,0,0.5,1,1,0.5,0.5,1,0.5,0.5,0.5,0.5,0,0.5,0,0.5,0,0.5,0.5
SAMPLE2,0.5,0.5,0.5,1,1,0.5,0.5,1,0.5,0.5,0,1,0,0.5,0,0.5,0.5,0.5,0.5
SAMPLE3,0.5,0,0.5,1,1,0.5,0.5,1,0.5,0.5,1,0.5,0.5,0.5,0,1,0,0.5,0.5
SAMPLE4,1,0.5,0.5,1,1,0.5,0.5,0,0.5,0.5,0.5,0.5,0.5,0.5,1,1,0.5,0.5,1
And here is the top few rows of the output file:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,0.920167313802
VAR2,0.00326965769624,0,0.67328997966,0.714393037634,0.714393037634,0.829638099719,1.0,0.881545828869,1.0,1.0,0.504985075094,0.665005542102,0.672603817442,0.75946286538,0.365088814029,1.0,0.478520976544,0.698535358303,0.700311372937
VAR3,0.67328997966,0.67328997966,0,1.0,1.0,0.665005542102,1.0,0.672603817442,1.0,1.0,1.0,1.0,0.819476976778,1.0,0.324126587758,1.0,1.0,0.665005542102,0.608407800233
The code works well and produces the expected output for the test file, however the real input file (exactly the same file structure but with 100s rows and 1000s of cols) is considerably larger and takes ~48 hours to run so I need to make it faster.
I tried the following attempt to implement multithreading:
import pandas as pd
from scipy.stats import chi2_contingency
from threading import Thread
def build_adjacency_matrix(DATA_MATRIX, OUT, THREADS):
# READS DATA: data must be a csv with a header and an index column
my_data = pd.read_csv(DATA_MATRIX, index_col=0)
# INITIALIZE EMPTY DF WITH COLSNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
AM = pd.DataFrame(columns=my_data.columns, index = my_data.columns)
print(len(my_data.columns))
print(len(my_data.index))
# BUILD THREAD GROUPS
thread_groups={}
chunk=int(len(AM.columns)/THREADS)
i=0; j=chunk
for t in range(THREADS): thread_groups[t]=list(range(i,j)); i+=chunk; j+=chunk;
# DELEGATE REMAINING COLS TO THE LAST THREAD
if thread_groups[THREADS-1][-1] != len(AM.columns):
thread_groups[THREADS-1] = thread_groups[THREADS-1] + \
list(range((thread_groups[THREADS-1][-1]),len(AM.columns)))
print(thread_groups)
def populate_DF(section):
for c1 in AM.columns[section]:
for c2 in AM.columns:
if c1==c2: AM.loc[c1,c2]=0; continue
sample_df = pd.DataFrame(my_data, columns=[c1,c2])
# KEEP ONLY ROWS WITH 1s and 0s
sample_df = sample_df[sample_df[c1] != 0.5]
sample_df = sample_df[sample_df[c2] != 0.5]
sample_df = sample_df.dropna()
# CALCULATE ChiX
# Contingency table.
contingency = pd.crosstab(sample_df[c1], sample_df[c2])
#Chi-square test of independence.
try:
# POPULATE AM WITH CHI-SQ p-value
chi2, p, ddof, expected = chi2_contingency(contingency)
AM.loc[c1,c2] = p
except:
# ASSIGN A p-value OF 1.0 IF THERE IS A PROBLEM
ValueError;
AM.loc[c1,c2] = 1
for tg in thread_groups:
t = Thread(target=populate_DF, args=(thread_groups[tg],))
print(tg)
print(thread_groups[tg])
t.start()
AM.to_csv(OUT, sep=',')
return
data_matrix='input_test.csv'
out='output_mt_test.csv'
build_adjacency_matrix(data_matrix, out, 4)
I'm not sure if I should be making the output dataframe a global variable? Or how to do it? The aim of the section on 'building thread groups' is to delegate groups of columns from the input file to be delegated to separate threads and each of the outputs added to the final dataframe. I have up to 16 cores available so thought a multithreading solution would help here. The code as it is produces an unexpected, partially complete output:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,
VAR2,,,,,,,,,,,,,,,,,,,
VAR3,,,,,,,,,,,,,,,,,,,
VAR4,,,,,,,,,,,,,,,,,,,
VAR5,0.573642138098,0.714393037634,1.0,5.61531250139e-06,0,1.0,1.0,0.859350808026,0.819476976778,0.819476976778,1.0,1.0,0.805020272634,,,,,,
VAR6,,,,,,,,,,,,,,,,,,,
VAR7,,,,,,,,,,,,,,,,,,,
VAR8,,,,,,,,,,,,,,,,,,,
VAR9,1.0,1.0,1.0,0.819476976778,,,,,,,,,,,,,,,
VAR10,,,,,,,,,,,,,,,,,,,
VAR11,,,,,,,,,,,,,,,,,,,
VAR12,,,,,,,,,,,,,,,,,,,
VAR13,0.914326585297,,,,,,,,,,,,,,,,,,
VAR14,,,,,,,,,,,,,,,,,,,
VAR15,,,,,,,,,,,,,,,,,,,
VAR16,,,,,,,,,,,,,,,,,,,
VAR17,,,,,,,,,,,,,,,,,,,
VAR18,,,,,,,,,,,,,,,,,,,
VAR19,,,,,,,,,,,,,,,,,,,
i'm not sure if this is to do with an issue with the multithreads trying to output to the same variable or if this is a problem with how I have spread the workload. I would really appreciate any help with how to fix this, or any other ways to optimize the code? Thanks in advance!

Related

Why can't I replace null values in this excel sheet?

In my code, I run a t-test which sometimes yields "NaN" or "nan" when running a test on two zero value groups. I have tried making new data frames, tried replacing using .replace and also tried fillna() but nothing was successful. I get errors when also trying to define a new dataframe or read the file again after adding new calculations.
How do I replace the nulls and "nan" in these files: "significant_report2.xls" or "quant_report2.xls"
import json
import os, sys
import numpy as np
import pandas as pd
import scipy.stats
output_report = "quant_report2.xls"
significant_report = "significant_report2.xls"
output_report_writer = open(output_report, "w")
significant_writer = open(significant_report, "w")
# setup samples grouped by control and treatment
header = []
for idx in control_indices:
header.append(quant_columns[idx])
for idx in treatment_indices:
header.append(quant_columns[idx])
output_report_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n"%"\t".join(header))
significant_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n"%"\t".join(header))
feature_list = list(quantitative_data_frame.index)
for feature_idx in range(len(feature_list)):
feature_name = feature_list[feature_idx]
control_values = quantitative_data_frame.iloc[feature_idx, control_indices]
treatment_values = quantitative_data_frame.iloc[feature_idx, treatment_indices]
ttest_stat, ttest_pvalue = scipy.stats.ttest_ind(control_values, treatment_values, equal_var=False)
ctrl_means = scipy.mean(control_values,0)
ctrl_stdDev = scipy.stats.tstd(control_values)
tx_means= scipy.mean(treatment_values,axis=0)
tx_stdDev1 = scipy.stats.tstd(treatment_values)
output_report_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(feature_name,
"\t".join([str(x) for x in list(control_values)]),
"\t".join([str(x) for x in list(treatment_values)]), ttest_pvalue, ctrl_means,ctrl_stdDev,tx_means,tx_stdDev))
significant_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(feature_name,"\t".join([str(x) for x in list(control_values)]), "\t".join([str(x) for x in list(treatment_values)]),ttest_pvalue,ctrl_means,ctrl_stdDev,tx_means,tx_stdDev))

Creating Cartesian Product DataFrame without maxing Memory

I have several dataframes, from which I'm creating a cartesian product (on purpose!)
After this, I'm exporting the result to disk.
I believe the size of the resulting dataframe could exceed my memory footprint, so I'm wondering is there a way that I can chunk this so that the dataframe doesn't need to all be in memory at the same time?
Example Code:
import pandas as pd
def create_list_from_range(r1,r2):
if (r1 == r2):
return r1
else:
res = []
while(r1 < r2+1 ):
res.append(r1)
r1 += 1
return res
# make a list of options
color_opt = ['red','blue','green','orange']
dow_opt = create_list_from_range(1,7)
hod_opt = create_list_from_range(0,23)
# turn each list into a dataframe
df_color = pd.DataFrame({'color': color_opt})
df_day = pd.DataFrame({'day_of_week': dow_opt})
df_hour = pd.DataFrame({'hour_of_day': hod_opt})
# add a dummy columns to everything so I can easily do a cartesian product
df_color['dummy']=1
df_day['dummy']=1
df_hour['dummy']=1
# now cartesian product... cascading
merge1 = pd.merge(df_day, df_hour, on='dummy')
FINAL = pd.merge(merge1, df_color, on='dummy')
FINAL.to_csv('FINAL_OUTPUT.csv', index=False)
You could try building up individual rows using itertools.product. In your example, you could do this as follows:
from itertools import product
prod = product(color_opt, dow_opt, hod_opt)
You can then get a number of rows and append them to an existing csv file using
df.to_csv("file", mode="a")

pandas - how to add row with index and compare the difference of the data?

I have brought the data from excecl(csv) and it is about 300000 rows * 1 column and I plotted by db.plot() ----- it is a time series data
I am trying to delete (drop) the data that is higher than 0.006 and, after that
I want to compare the difference between the data(step by step : the one next to each other) and if the difference is bigger 0.00001, I want to drop thos data also.
then, I will be only left with data with very very low difference (almost 0, flat slope).
I am a very beginner in python and I tried my best but I don't know what is wrong with my code:
import pandas as pd
excel_df = pd.read_csv('data.csv', header=None)
excel_df.plot()
bool_idx = excel_df < 0.006
valid_data = excel_df[bool_idx]
true_data = valid_data.dropna()
# print(true_data)
# print(valid_data)
ax1 = valid_data.plot()
ax1.set_ylim(-0.005, 0.045)
ax1.plot()
print(true_data)
al2 = true_data.diff()
# print(al2)
number = 0
for true_data in ture data:
number = number + 1
if true_data.diff() < 0.00001:
true_data.drop()
print(true_data)
Try running this on your dataset.
#!/usr/bin/env python3
# coding: utf-8
# In[1]:
import pandas as pd
excel_df = pd.read_csv('data.csv', header=None)
x=excel_df.plot()
# x
bool_idx = excel_df < 0.006
# bool_idx
valid_data = excel_df[bool_idx]
# valid_data
true_data = valid_data.dropna()
# true_data
ax1 = valid_data.plot()
ax1.set_ylim(-0.005, 0.045)
# ax1
al2 = true_data.diff()
# al2
number = 0
for (true_data_diff_val,rid) in zip(true_data.diff()[0],true_data.diff()[0].index):
# print(number,true_data_diff_val)
# print(rid)
if true_data_diff_val < 0.00001 and rid != 0:
true_data=true_data.drop(int(rid),0)
print(rid)
print(true_data)
Your query to my understanding is to get the row/tuple ID w.r.t. the if condition within loop in order to drop it from another dataframe. The simplest method I know of uses zip function to get it iterated parallel with the data.
Also you need to save the dataframe when you drop a row/column in order to observe changes!
I checked for rid!=0 because diff() gives first element as NaN, you can apply any appropriate condition.

Tracking Error on a number of benchmarks

I'm trying to calculate tracking error for a number of different benchmarks versus a fund that I'm looking at (tracking error is defined as the standard deviation of the percent difference between the fund and benchmark). The time series for the fund and all the benchmarks are all in a data frame that I'm reading from an excel on file and what I have so far is this (with the idea that arg1 represents all the benchmarks and is then applied using applymap), but it's returning a KeyError, any suggestions?
import pandas as pd
import numpy as np
data = pd.read_excel('File_Path.xlsx')
def index_analytics(arg1):
tracking_err = np.std((data['Fund'] - data[arg1]) / data[arg1])
return tracking_err
data.applymap(index_analytics)
There are a few things that need fixed. First,applymap passes each individual value for all the columns to your calling function (index_analytics). So arg1 is the individual scalar value for all the values in your dataframe. data[arg1] is always going to return a key error unless all your values are also column names.
You also shouldn't need to use apply to do this. Assuming your benchmarks are in the same dataframe then you should be able to do something like this for each benchmark. Next time include a sample of your dataframe.
df['Benchmark1_result'] = (df['Fund'] - data['Benchmark1']) / data['Benchmark1']
And if you want to calculate all the standard deviations for all the benchmarks you can do this
# assume you have a dataframe with a list of all the benchmark columns
benchmark_columns = [list, of, benchmark, columns]
np.std((df['Fund'].values - df[benchmark_columns].values) / df['Fund'].values, axis=1)
Assuming you're following the definition of Tracking Error below:
import pandas as pd
import numpy as np
# Example DataFrame
df = pd.DataFrame({'Portfolio_Returns': [5.00, 1.67], 'Bench_Returns': [2.89, .759]})
df['Active_Return'] = df['Portfolio_Returns'] - df['Bench_Returns']
print(df.head())
list_ = df['Active_Return']
temp_ = []
for val in list_:
x = val**2
temp_.append(x)
tracking_error = np.sqrt(sum(temp_))
print(f"Tracking Error is: {tracking_error}")
Or if you want it more compact (because apparently the cool kids do it):
df = pd.DataFrame({'Portfolio_Returns': [5.00, 1.67], 'Bench_Returns': [2.89, .759]})
tracking_error = np.sqrt(sum([val**2 for val in df['Portfolio_Returns'] - df['Bench_Returns']]))
print(f"Tracking Error is: {tracking_error}")

How to create a pivot table on extremely large dataframes in Pandas

I need to create a pivot table of 2000 columns by around 30-50 million rows from a dataset of around 60 million rows. I've tried pivoting in chunks of 100,000 rows, and that works, but when I try to recombine the DataFrames by doing a .append() followed by .groupby('someKey').sum(), all my memory is taken up and python eventually crashes.
How can I do a pivot on data this large with a limited ammount of RAM?
EDIT: adding sample code
The following code includes various test outputs along the way, but the last print is what we're really interested in. Note that if we change segMax to 3, instead of 4, the code will produce a false positive for correct output. The main issue is that if a shipmentid entry is not in each and every chunk that sum(wawa) looks at, it doesn't show up in the output.
import pandas as pd
import numpy as np
import random
from pandas.io.pytables import *
import os
pd.set_option('io.hdf.default_format','table')
# create a small dataframe to simulate the real data.
def loadFrame():
frame = pd.DataFrame()
frame['shipmentid']=[1,2,3,1,2,3,1,2,3] #evenly distributing shipmentid values for testing purposes
frame['qty']= np.random.randint(1,5,9) #random quantity is ok for this test
frame['catid'] = np.random.randint(1,5,9) #random category is ok for this test
return frame
def pivotSegment(segmentNumber,passedFrame):
segmentSize = 3 #take 3 rows at a time
frame = passedFrame[(segmentNumber*segmentSize):(segmentNumber*segmentSize + segmentSize)] #slice the input DF
# ensure that all chunks are identically formatted after the pivot by appending a dummy DF with all possible category values
span = pd.DataFrame()
span['catid'] = range(1,5+1)
span['shipmentid']=1
span['qty']=0
frame = frame.append(span)
return frame.pivot_table(['qty'],index=['shipmentid'],columns='catid', \
aggfunc='sum',fill_value=0).reset_index()
def createStore():
store = pd.HDFStore('testdata.h5')
return store
segMin = 0
segMax = 4
store = createStore()
frame = loadFrame()
print('Printing Frame')
print(frame)
print(frame.info())
for i in range(segMin,segMax):
segment = pivotSegment(i,frame)
store.append('data',frame[(i*3):(i*3 + 3)])
store.append('pivotedData',segment)
print('\nPrinting Store')
print(store)
print('\nPrinting Store: data')
print(store['data'])
print('\nPrinting Store: pivotedData')
print(store['pivotedData'])
print('**************')
print(store['pivotedData'].set_index('shipmentid').groupby('shipmentid',level=0).sum())
print('**************')
print('$$$')
for df in store.select('pivotedData',chunksize=3):
print(df.set_index('shipmentid').groupby('shipmentid',level=0).sum())
print('$$$')
store['pivotedAndSummed'] = sum((df.set_index('shipmentid').groupby('shipmentid',level=0).sum() for df in store.select('pivotedData',chunksize=3)))
print('\nPrinting Store: pivotedAndSummed')
print(store['pivotedAndSummed'])
store.close()
os.remove('testdata.h5')
print('closed')
You could do the appending with HDF5/pytables. This keeps it out of RAM.
Use the table format:
store = pd.HDFStore('store.h5')
for ...:
...
chunk # the chunk of the DataFrame (which you want to append)
store.append('df', chunk)
Now you can read it in as a DataFrame in one go (assuming this DataFrame can fit in memory!):
df = store['df']
You can also query, to get only subsections of the DataFrame.
Aside: You should also buy more RAM, it's cheap.
Edit: you can groupby/sum from the store iteratively since this "map-reduces" over the chunks:
# note: this doesn't work, see below
sum(df.groupby().sum() for df in store.select('df', chunksize=50000))
# equivalent to (but doesn't read in the entire frame)
store['df'].groupby().sum()
Edit2: Using sum as above doesn't actually work in pandas 0.16 (I thought it did in 0.15.2), instead you can use reduce with add:
reduce(lambda x, y: x.add(y, fill_value=0),
(df.groupby().sum() for df in store.select('df', chunksize=50000)))
In python 3 you must import reduce from functools.
Perhaps it's more pythonic/readable to write this as:
chunks = (df.groupby().sum() for df in store.select('df', chunksize=50000))
res = next(chunks) # will raise if there are no chunks!
for c in chunks:
res = res.add(c, fill_value=0)
If performance is poor / if there are a large number of new groups then it may be preferable to start the res as zero of the correct size (by getting the unique group keys e.g. by looping through the chunks), and then add in place.

Categories

Resources