Why can't I replace null values in this excel sheet? - python

In my code, I run a t-test that sometimes yields "NaN" or "nan" when the two groups being compared both contain only zero values. I have tried making new data frames, and tried replacing the values with .replace() and fillna(), but nothing was successful. I also get errors when trying to define a new DataFrame or read the file back in after adding new calculations.
How do I replace the nulls and "nan" in these files: "significant_report2.xls" or "quant_report2.xls"?
import json
import os, sys
import numpy as np
import pandas as pd
import scipy.stats

# (control_indices, treatment_indices, quant_columns and quantitative_data_frame
#  are defined earlier in the full script)
output_report = "quant_report2.xls"
significant_report = "significant_report2.xls"
output_report_writer = open(output_report, "w")
significant_writer = open(significant_report, "w")

# set up samples grouped by control and treatment
header = []
for idx in control_indices:
    header.append(quant_columns[idx])
for idx in treatment_indices:
    header.append(quant_columns[idx])
output_report_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n" % "\t".join(header))
significant_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n" % "\t".join(header))

feature_list = list(quantitative_data_frame.index)
for feature_idx in range(len(feature_list)):
    feature_name = feature_list[feature_idx]
    control_values = quantitative_data_frame.iloc[feature_idx, control_indices]
    treatment_values = quantitative_data_frame.iloc[feature_idx, treatment_indices]
    # Welch's t-test; returns nan when both groups have zero variance
    ttest_stat, ttest_pvalue = scipy.stats.ttest_ind(control_values, treatment_values, equal_var=False)
    ctrl_means = np.mean(control_values, axis=0)
    ctrl_stdDev = scipy.stats.tstd(control_values)
    tx_means = np.mean(treatment_values, axis=0)
    tx_stdDev = scipy.stats.tstd(treatment_values)
    output_report_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
        feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))
    significant_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
        feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))
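One way to keep "nan" out of the report files is to replace NaN statistics with a placeholder before writing each row; alternatively, because the reports are tab-delimited text despite the .xls extension, they can be re-read with pandas afterwards and the nulls filled there. The sketch below is a minimal illustration of both ideas, reusing the variables from the loop above; the placeholder values (p = 1.0, standard deviation 0.0, fill value 0) are my own assumptions, not something stated in the question.

import numpy as np
import pandas as pd

def clean(value, placeholder):
    # ttest_ind returns nan when both groups have zero variance (e.g. all zeros)
    return placeholder if isinstance(value, float) and np.isnan(value) else value

# Inside the loop, before the write() calls (placeholder choices are assumptions):
ttest_pvalue = clean(ttest_pvalue, 1.0)
ctrl_stdDev = clean(ctrl_stdDev, 0.0)
tx_stdDev = clean(tx_stdDev, 0.0)

# Or, after the writers are closed, fill the nulls in the finished reports:
for path in ("quant_report2.xls", "significant_report2.xls"):
    report = pd.read_csv(path, sep="\t")
    report = report.fillna(0)            # any placeholder you prefer
    report.to_csv(path, sep="\t", index=False)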


ValueError: '_index' is a reserved name for dataframe columns

I am trying to save a file in h5ad format and it is giving this error: ValueError: '_index' is a reserved name for dataframe columns.
import pandas as pd
import scanpy as sc
import numpy as np
data = sc.read_h5ad('f.h5ad')
annotation = pd.read_csv('n.tsv', sep='\t')
annotation_dict = {item['barcodes']:item['celltype'] for item in annotation.to_dict('records')}
data.obs['barcodes'] = data.obs.index
data.obs['celltype'] = data.obs['barcodes'].map(annotation_dict)
sc.pp.filter_genes(data,min_cells=686)
sc.pp.filter_cells(data,min_genes=10)
sc.pp.normalize_per_cell(data,20000)
sc.pp.log1p(data)
sc.pp.highly_variable_genes(data,n_top_genes=1000)
data.X = np.exp(data.X.toarray())-1
data=data[:,data.var['highly_variable']]
sc.pp.normalize_per_cell(data,3800)
clustered = sc.read_h5ad('f.h5ad')
sc.pp.filter_cells(data,min_genes=10)
sc.pp.recipe_zheng17(clustered)
sc.tl.pca(clustered, n_comps=50)
sc.pp.neighbors(clustered, n_pcs=50)
sc.tl.louvain(clustered, resolution=0.15)
clustered.obs.groupby('louvain').count()
data.obs['louvain'] = list(clustered.obs['louvain'])
split = pd.DataFrame(data.obs['barcodes'])
test = split.sample(frac=0.2)
d_split = {item:'test' for item in test['barcodes']}
data.obs['split'] = data.obs['barcodes'].map(d_split).fillna('train')
data.write_h5ad('e.h5ad')
This is probably related to a known issue with the AnnData .raw object.
Two workarounds (From here):
#1
data.__dict__['_raw'].__dict__['_var'] = data.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
#2, deleting the backed up raw information
del data.raw
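If neither workaround applies, the same error can also be raised when an .obs or .var column is literally named '_index'. The snippet below is a minimal sketch of that check, reusing the data object built above; the replacement column name 'index_orig' is my own choice.

# Hedged sketch: rename any column literally called '_index' before writing.
for frame in (data.obs, data.var):
    if '_index' in frame.columns:
        frame.rename(columns={'_index': 'index_orig'}, inplace=True)

data.write_h5ad('e.h5ad')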

Filling out a dataframe column using parallel processing in Python

I am trying to compute a value for each row of a dataframe in parallel using the following code, but I get errors whether I pass the individual input ranges or their combination:
#!pip install pyblaze
import itertools
import pyblaze
import pyblaze.multiprocessing as xmp
import pandas as pd

inputs = [range(2), range(2), range(3)]
inputs_list = list(itertools.product(*inputs))
Index = pd.MultiIndex.from_tuples(inputs_list, names={"a", "b", "c"})
df = pd.DataFrame(index=Index)
df['Output'] = 0
print(df)

def Addition(A, B, C):
    df.loc[A, B, C]['Output'] = A + B + C
    return df

def parallel(inputs_list):
    tokenizer = xmp.Vectorizer(Addition, num_workers=8)
    return tokenizer.process(inputs_list)

parallel(inputs_list)
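No answer is attached to this question in the dump, but one common pattern is to compute the per-row values in worker processes and assign them back in the parent, since workers cannot mutate the parent's DataFrame directly. The sketch below uses the standard library's multiprocessing.Pool rather than pyblaze; it illustrates that pattern under my own assumptions and is not the asker's intended pyblaze solution.

import itertools
from multiprocessing import Pool

import pandas as pd

def addition(abc):
    # each work item is one (a, b, c) tuple from the product of the input ranges
    a, b, c = abc
    return a + b + c

if __name__ == "__main__":
    inputs = [range(2), range(2), range(3)]
    inputs_list = list(itertools.product(*inputs))
    index = pd.MultiIndex.from_tuples(inputs_list, names=["a", "b", "c"])
    df = pd.DataFrame(index=index)

    with Pool(processes=8) as pool:
        results = pool.map(addition, inputs_list)   # order matches inputs_list

    df["Output"] = results                          # assign once, in the parent
    print(df)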

Normalize columns in a numpy array- results in typeerror

I want to do a simple normalization of the data in a numpy ndarray, specifically (X - mu) / sigma. I tried the exact code I found in earlier questions and kept getting TypeError: cannot perform reduce with flexible type. I gave up and tried a simpler normalization, (X - X.min) / X.ptp, and got the same error.
import csv
import numpy as np
import urllib.request

# Import comma-separated data from the UCI repository
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
urllib.request.urlretrieve(url, 'F:/Python/Wine Dataset/wine_data')

# open file
filename = 'F:/Python/Wine Dataset/wine_data'
raw_data = open(filename, 'rt')

# Put raw_data into a numpy.ndarray
reader = csv.reader(raw_data)
x = list(reader)
data = np.array(x)

# First column is classification, other columns are features
y = data[:, 0]
X_raw = data[:, 1:13]

# Attempt at normalizing the data - really wanted (X - mu) / sigma, gave up;
# even this simplified version doesn't work.
# Latest error is: TypeError: cannot perform reduce with flexible type
X = (X_raw - X_raw.min(0)) / X_raw.ptp(0)
print(X)
Finally figured it out. The line data = np.array(x) returned an array containing string data.
It was:
data = np.array(x)
and I changed it to:
data = np.array(x).astype(float)
After that everything worked. A simple issue cost me hours.
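For completeness, here is a minimal sketch of the (X - mu) / sigma standardization the question originally wanted, assuming the csv-reading code above has already produced the list x; this is my own illustration, not part of the original answer.

import numpy as np

data = np.array(x).astype(float)      # strings -> floats, as in the fix above
y = data[:, 0]                        # first column: class label
X_raw = data[:, 1:]                   # remaining columns: features

mu = X_raw.mean(axis=0)               # column means
sigma = X_raw.std(axis=0)             # column standard deviations
X = (X_raw - mu) / sigma              # column-wise z-score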

getting different threads to alter different parts of a pandas dataframe

I am new to multithreading in Python, so I am not sure how to set this up. I am trying to produce a large output dataframe populated with calculations based on another input dataframe. The output dataframe is like an adjacency matrix of the columns of the input dataframe.
The following non-multithreaded version works perfectly:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import json
import os
import time
def build_adjacency_matrix(DATA_MATRIX, OUT):
    # READ DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)
    # INITIALIZE EMPTY DF WITH COLNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index=my_data.columns)
    y = 0
    w = 2
    for c1 in my_data.columns:
        print(c1)
        y += 1
        if y > w:
            time.sleep(1)  # GIVE THE PROCESSOR A REST AFTER EVERY 10 COLUMNS
            print(y)       # KEEP TRACK OF HOW MANY COLS HAVE BEEN PROCESSED
            w += 10
        for c2 in my_data.columns:
            if c1 == c2:
                AM.loc[c1, c2] = 0
                continue
            sample_df = pd.DataFrame(my_data, columns=[c1, c2])
            # KEEP ONLY ROWS WITH 1s AND 0s
            sample_df = sample_df[sample_df[c1] != 0.5]
            sample_df = sample_df[sample_df[c2] != 0.5]
            sample_df = sample_df.dropna()
            # CALCULATE ChiX
            # Contingency table.
            contingency = pd.crosstab(sample_df[c1], sample_df[c2])
            # Chi-square test of independence.
            try:
                chi2, p, ddof, expected = chi2_contingency(contingency)
                AM.loc[c1, c2] = p
            except ValueError:
                # ASSIGN AS NOT SIGNIFICANT IF THERE IS A PROBLEM
                AM.loc[c1, c2] = 1
    AM.to_csv(OUT, sep=',')
    return
# FILES
data_matrix='input_test.csv'
out='output_mt_test.csv'
# FUNCTION CALL
build_adjacency_matrix(data_matrix, out)
Here are the first few rows of the input file:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
SAMPLE1,1,0,0.5,1,1,0.5,0.5,1,0.5,0.5,0.5,0.5,0,0.5,0,0.5,0,0.5,0.5
SAMPLE2,0.5,0.5,0.5,1,1,0.5,0.5,1,0.5,0.5,0,1,0,0.5,0,0.5,0.5,0.5,0.5
SAMPLE3,0.5,0,0.5,1,1,0.5,0.5,1,0.5,0.5,1,0.5,0.5,0.5,0,1,0,0.5,0.5
SAMPLE4,1,0.5,0.5,1,1,0.5,0.5,0,0.5,0.5,0.5,0.5,0.5,0.5,1,1,0.5,0.5,1
And here are the first few rows of the output file:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,0.920167313802
VAR2,0.00326965769624,0,0.67328997966,0.714393037634,0.714393037634,0.829638099719,1.0,0.881545828869,1.0,1.0,0.504985075094,0.665005542102,0.672603817442,0.75946286538,0.365088814029,1.0,0.478520976544,0.698535358303,0.700311372937
VAR3,0.67328997966,0.67328997966,0,1.0,1.0,0.665005542102,1.0,0.672603817442,1.0,1.0,1.0,1.0,0.819476976778,1.0,0.324126587758,1.0,1.0,0.665005542102,0.608407800233
The code works well and produces the expected output for the test file. However, the real input file (exactly the same structure, but with hundreds of rows and thousands of columns) is considerably larger and takes ~48 hours to run, so I need to make it faster.
Here is my attempt at implementing multithreading:
import pandas as pd
from scipy.stats import chi2_contingency
from threading import Thread
def build_adjacency_matrix(DATA_MATRIX, OUT, THREADS):
    # READ DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)
    # INITIALIZE EMPTY DF WITH COLNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index=my_data.columns)
    print(len(my_data.columns))
    print(len(my_data.index))
    # BUILD THREAD GROUPS
    thread_groups = {}
    chunk = int(len(AM.columns) / THREADS)
    i = 0
    j = chunk
    for t in range(THREADS):
        thread_groups[t] = list(range(i, j))
        i += chunk
        j += chunk
    # DELEGATE REMAINING COLS TO THE LAST THREAD
    if thread_groups[THREADS - 1][-1] != len(AM.columns):
        thread_groups[THREADS - 1] = thread_groups[THREADS - 1] + \
            list(range((thread_groups[THREADS - 1][-1]), len(AM.columns)))
    print(thread_groups)

    def populate_DF(section):
        for c1 in AM.columns[section]:
            for c2 in AM.columns:
                if c1 == c2:
                    AM.loc[c1, c2] = 0
                    continue
                sample_df = pd.DataFrame(my_data, columns=[c1, c2])
                # KEEP ONLY ROWS WITH 1s AND 0s
                sample_df = sample_df[sample_df[c1] != 0.5]
                sample_df = sample_df[sample_df[c2] != 0.5]
                sample_df = sample_df.dropna()
                # CALCULATE ChiX
                # Contingency table.
                contingency = pd.crosstab(sample_df[c1], sample_df[c2])
                # Chi-square test of independence.
                try:
                    # POPULATE AM WITH CHI-SQ p-value
                    chi2, p, ddof, expected = chi2_contingency(contingency)
                    AM.loc[c1, c2] = p
                except ValueError:
                    # ASSIGN A p-value OF 1.0 IF THERE IS A PROBLEM
                    AM.loc[c1, c2] = 1

    for tg in thread_groups:
        t = Thread(target=populate_DF, args=(thread_groups[tg],))
        print(tg)
        print(thread_groups[tg])
        t.start()
    AM.to_csv(OUT, sep=',')
    return
data_matrix='input_test.csv'
out='output_mt_test.csv'
build_adjacency_matrix(data_matrix, out, 4)
I'm not sure if I should be making the output dataframe a global variable, or how to do that. The aim of the 'build thread groups' section is to delegate groups of columns from the input file to separate threads and have each thread add its output to the final dataframe. I have up to 16 cores available, so I thought a multithreading solution would help here. The code as it is produces an unexpected, partially complete output:
,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,
VAR2,,,,,,,,,,,,,,,,,,,
VAR3,,,,,,,,,,,,,,,,,,,
VAR4,,,,,,,,,,,,,,,,,,,
VAR5,0.573642138098,0.714393037634,1.0,5.61531250139e-06,0,1.0,1.0,0.859350808026,0.819476976778,0.819476976778,1.0,1.0,0.805020272634,,,,,,
VAR6,,,,,,,,,,,,,,,,,,,
VAR7,,,,,,,,,,,,,,,,,,,
VAR8,,,,,,,,,,,,,,,,,,,
VAR9,1.0,1.0,1.0,0.819476976778,,,,,,,,,,,,,,,
VAR10,,,,,,,,,,,,,,,,,,,
VAR11,,,,,,,,,,,,,,,,,,,
VAR12,,,,,,,,,,,,,,,,,,,
VAR13,0.914326585297,,,,,,,,,,,,,,,,,,
VAR14,,,,,,,,,,,,,,,,,,,
VAR15,,,,,,,,,,,,,,,,,,,
VAR16,,,,,,,,,,,,,,,,,,,
VAR17,,,,,,,,,,,,,,,,,,,
VAR18,,,,,,,,,,,,,,,,,,,
VAR19,,,,,,,,,,,,,,,,,,,
I'm not sure if this is due to the multiple threads trying to write to the same variable, or a problem with how I have spread the workload. I would really appreciate any help with how to fix this, or any other ways to optimize the code. Thanks in advance!
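No answer is attached to this question in the dump. One likely cause of the truncated output is that AM.to_csv() runs as soon as the threads have been started, before any of them has finished; joining the threads first fixes that. The sketch below shows how the final loop inside build_adjacency_matrix could be rewritten under that assumption, reusing populate_DF and thread_groups from the question. Note also that chi2_contingency is CPU-bound, so under CPython's GIL threads give little speedup; a process-based approach (multiprocessing or joblib) would make better use of the 16 cores.

from threading import Thread

# Start every worker, keep a handle to each, and wait for all of them
# before writing the (now fully populated) adjacency matrix.
threads = []
for tg in thread_groups:
    t = Thread(target=populate_DF, args=(thread_groups[tg],))
    t.start()
    threads.append(t)

for t in threads:
    t.join()          # block until this worker has filled its columns

AM.to_csv(OUT, sep=',')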

python/pandas/sklearn: getting closest matches from pairwise_distances

I have a dataframe and am trying to get the closest matches using Mahalanobis distance across three numeric columns, like:
from io import StringIO
from sklearn import metrics
import pandas as pd
stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")
stats = ['ratio1','pct1','rsp']
df = pd.read_csv(stringdata)
d = metrics.pairwise.pairwise_distances(df[stats].to_numpy(),
                                        metric='mahalanobis')
print(df)
print(d)
Where that pid column is a unique identifier.
What I need to do is take the ndarray returned by the pairwise_distances call and update the original dataframe so that each row has some kind of ordered list of its closest N matches (so pid 0 might have a list, ordered by distance, like 2, 1, 5, 3, 4, or whatever it actually is), but I'm totally stumped as to how this is done in Python.
from io import StringIO

import numpy as np
import pandas as pd
from sklearn import metrics

stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")

stats = ['ratio1', 'pct1', 'rsp']
df = pd.read_csv(stringdata)

dist = metrics.pairwise.pairwise_distances(df[stats].to_numpy(),
                                           metric='mahalanobis')
dist = pd.DataFrame(dist)

# column positions ordered by increasing distance; the first entry in each
# row is the row itself (distance 0)
ranks = pd.DataFrame(np.argsort(dist.to_numpy(), axis=1), index=dist.index)
df["rankcol"] = ranks.apply(lambda row: ','.join(map(str, row)), axis=1)
df
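As a small follow-up sketch (my addition, not part of the original answer), the self-match can be dropped and only the N closest pids kept; N = 3 below is an arbitrary choice.

N = 3  # how many nearest neighbours to keep (arbitrary)

def top_n(row):
    # skip the row's own position, then keep the N nearest pids
    others = [i for i in row if i != row.name]
    return ','.join(str(df.loc[i, 'pid']) for i in others[:N])

df['closest'] = ranks.apply(top_n, axis=1)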
