Why can't I replace null values in this excel sheet? - python
In my code I run a t-test that sometimes yields "NaN" or "nan" when both groups contain only zero values. I have tried building new data frames, replacing the values with .replace(), and also fillna(), but nothing was successful. I also get errors when I try to define a new dataframe or re-read the file after adding new calculations.
How do I replace the nulls and "nan" values in these files: "significant_report2.xls" and "quant_report2.xls"?
import json
import os, sys
import numpy as np
import pandas as pd
import scipy.stats
output_report = "quant_report2.xls"
significant_report = "significant_report2.xls"
output_report_writer = open(output_report, "w")
significant_writer = open(significant_report, "w")
# setup samples grouped by control and treatment
# (control_indices, treatment_indices, quant_columns and quantitative_data_frame
#  are defined earlier in the script and are not shown here)
header = []
for idx in control_indices:
    header.append(quant_columns[idx])
for idx in treatment_indices:
    header.append(quant_columns[idx])
output_report_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n"%"\t".join(header))
significant_writer.write("Feature\t%s\tP-value\tctrl_means\tctrl_stdDev\ttx_means\ttx_stdDev\n"%"\t".join(header))
feature_list = list(quantitative_data_frame.index)
for feature_idx in range(len(feature_list)):
    feature_name = feature_list[feature_idx]
    control_values = quantitative_data_frame.iloc[feature_idx, control_indices]
    treatment_values = quantitative_data_frame.iloc[feature_idx, treatment_indices]
    # Welch's t-test; the p-value comes back as nan when both groups are all zeros
    ttest_stat, ttest_pvalue = scipy.stats.ttest_ind(control_values, treatment_values, equal_var=False)
    # group means and standard deviations
    ctrl_means = np.mean(control_values, axis=0)
    ctrl_stdDev = scipy.stats.tstd(control_values)
    tx_means = np.mean(treatment_values, axis=0)
    tx_stdDev = scipy.stats.tstd(treatment_values)
    output_report_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))
    significant_writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (feature_name,
        "\t".join([str(x) for x in list(control_values)]),
        "\t".join([str(x) for x in list(treatment_values)]),
        ttest_pvalue, ctrl_means, ctrl_stdDev, tx_means, tx_stdDev))

# flush and close the reports so the last rows reach the files on disk
output_report_writer.close()
significant_writer.close()
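For reference, a minimal sketch (not the original code) of two ways the nan values could be replaced, assuming the loop above; the placeholder value 1.0 is only an example:

import numpy as np
import pandas as pd

# Option 1: inside the loop, swap a nan p-value (two all-zero groups)
# for a placeholder before writing it out
if np.isnan(ttest_pvalue):
    ttest_pvalue = 1.0

# Option 2: clean a report that has already been written; the files are
# tab-separated text despite the .xls extension
report = pd.read_csv("quant_report2.xls", sep="\t")
report = report.fillna(1.0)              # .replace(np.nan, 1.0) behaves the same here
report.to_csv("quant_report2.xls", sep="\t", index=False)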
Related
ValueError: '_index' is a reserved name for dataframe columns
I am trying to save a file in h5ad format and it is giving this value error: ValueError: '_index' is a reserved name for dataframe columns.

import pandas as pd
import scanpy as sc
import numpy as np

data = sc.read_h5ad('f.h5ad')
annotation = pd.read_csv('n.tsv', sep='\t')
annotation_dict = {item['barcodes']: item['celltype'] for item in annotation.to_dict('records')}
data.obs['barcodes'] = data.obs.index
data.obs['celltype'] = data.obs['barcodes'].map(annotation_dict)
sc.pp.filter_genes(data, min_cells=686)
sc.pp.filter_cells(data, min_genes=10)
sc.pp.normalize_per_cell(data, 20000)
sc.pp.log1p(data)
sc.pp.highly_variable_genes(data, n_top_genes=1000)
data.X = np.exp(data.X.toarray()) - 1
data = data[:, data.var['highly_variable']]
sc.pp.normalize_per_cell(data, 3800)
clustered = sc.read_h5ad('f.h5ad')
sc.pp.filter_cells(data, min_genes=10)
sc.pp.recipe_zheng17(clustered)
sc.tl.pca(clustered, n_comps=50)
sc.pp.neighbors(clustered, n_pcs=50)
sc.tl.louvain(clustered, resolution=0.15)
clustered.obs.groupby('louvain').count()
data.obs['louvain'] = list(clustered.obs['louvain'])
split = pd.DataFrame(data.obs['barcodes'])
test = split.sample(frac=0.2)
d_split = {item: 'test' for item in test['barcodes']}
data.obs['split'] = data.obs['barcodes'].map(d_split).fillna('train')
data.write_h5ad('e.h5ad')
This is probably related to a known issue with the AnnData .raw object. Two workarounds (from here):

#1 rename the offending column in the raw var dataframe:
data.__dict__['_raw'].__dict__['_var'] = data.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})

#2 delete the backed-up raw information:
del data.raw
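A minimal usage sketch, assuming data is the AnnData object from the question and applying workaround #2 right before saving:

# drop the backed-up raw matrix so its var dataframe (which holds the reserved
# '_index' column) is not serialized, then write the file as before
del data.raw
data.write_h5ad('e.h5ad')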
Filling out a dataframe column using parallel processing in Python
Trying to compute a value for each row of a dataframe in parallel using the following code, but getting errors either when I pass individual input ranges or the combination:

#!pip install pyblaze
import itertools
import pyblaze
import pyblaze.multiprocessing as xmp
import pandas as pd

inputs = [range(2), range(2), range(3)]
inputs_list = list(itertools.product(*inputs))
Index = pd.MultiIndex.from_tuples(inputs_list, names={"a", "b", "c"})
df = pd.DataFrame(index=Index)
df['Output'] = 0
print(df)

def Addition(A, B, C):
    df.loc[A, B, C]['Output'] = A + B + C
    return df

def parallel(inputs_list):
    tokenizer = xmp.Vectorizer(Addition, num_workers=8)
    return tokenizer.process(inputs_list)

parallel(inputs_list)
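For comparison, a minimal sketch under different assumptions: it uses the standard library's multiprocessing.Pool instead of pyblaze, and turns Addition into a pure function so each worker returns a value rather than mutating the shared dataframe:

from itertools import product
from multiprocessing import Pool
import pandas as pd

def addition(args):
    # each worker receives one (a, b, c) tuple and returns the computed value
    a, b, c = args
    return a + b + c

if __name__ == "__main__":
    inputs_list = list(product(range(2), range(2), range(3)))
    index = pd.MultiIndex.from_tuples(inputs_list, names=["a", "b", "c"])
    df = pd.DataFrame(index=index)
    with Pool(processes=4) as pool:
        # results come back in the same order as inputs_list, which matches the index
        df["Output"] = pool.map(addition, inputs_list)
    print(df)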
Normalize columns in a numpy array - results in TypeError
I want to do a simple normalization of the data in a numpy ndarray, specifically (X - mu) / sigma. I tried using the exact code that I found in earlier questions, but kept getting the error TypeError: cannot perform reduce with flexible type. I gave up and tried a simpler normalization, (X - X.min) / X.ptp, and got the same error.

import csv
import numpy as np
from numpy import *
import urllib.request

# Import comma separated data from the UCI repository
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
urllib.request.urlretrieve(url, 'F:/Python/Wine Dataset/wine_data')

# open file
filename = 'F:/Python/Wine Dataset/wine_data'
raw_data = open(filename, 'rt')

# Put raw_data into a numpy.ndarray
reader = csv.reader(raw_data)
x = list(reader)
data = np.array(x)

# First column is classification, other columns are features
y = data[:, 0]
X_raw = data[:, 1:13]

# Attempt at normalizing data - really wanted (X - mu) / sigma, gave up;
# even this simplified version doesn't work
# latest error is TypeError: cannot perform reduce with flexible type
X = (X_raw - X_raw.min(0)) / X_raw.ptp(0)
print(X)
Finally figured it out. The line data = np.array(x) returned an array containing string data.

was: data = np.array(x)
changed to: data = np.array(x).astype(np.float)

After that everything worked. A simple issue that cost me hours.
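A minimal end-to-end sketch of the same fix (np.float is deprecated in recent NumPy, so plain float is used here; the local wine_data path is the one from the question):

import csv
import numpy as np

# read the already-downloaded file, skip blank lines, convert strings to floats
with open('F:/Python/Wine Dataset/wine_data', 'rt') as raw_data:
    rows = [row for row in csv.reader(raw_data) if row]
data = np.array(rows).astype(float)

y = data[:, 0]          # first column: class label
X_raw = data[:, 1:]     # remaining 13 columns: features

# the originally intended z-score normalization, (X - mu) / sigma
X = (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)
print(X[:3])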
getting different threads to alter different parts of a pandas dataframe
I am new to multithreading in python so am not sure how to set this up. I am trying to produce a large output dataframe populated with calculations based on another input dataframe. The output dataframe is like an adjacency matrix of the columns of the input dataframe. The following non-multithreaded version works perfectly:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import json
import os
import time

def build_adjacency_matrix(DATA_MATRIX, OUT):
    # READS DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)
    # INITIALIZE EMPTY DF WITH COLNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index=my_data.columns)
    y = 0
    w = 2
    for c1 in my_data.columns:
        print(c1)
        y += 1
        if y > w:
            time.sleep(1)  # GIVE THE PROCESSOR A REST AFTER EACH 10 COLUMNS
            print(y)       # KEEP TRACK OF HOW MANY COLS HAVE BEEN PROCESSED
            w += 10
        for c2 in my_data.columns:
            if c1 == c2:
                AM.loc[c1, c2] = 0
                continue
            sample_df = pd.DataFrame(my_data, columns=[c1, c2])
            # KEEP ONLY ROWS WITH 1s and 0s
            sample_df = sample_df[sample_df[c1] != 0.5]
            sample_df = sample_df[sample_df[c2] != 0.5]
            sample_df = sample_df.dropna()
            # CALCULATE ChiX
            # Contingency table.
            contingency = pd.crosstab(sample_df[c1], sample_df[c2])
            # Chi-square test of independence.
            try:
                chi2, p, ddof, expected = chi2_contingency(contingency)
                AM.loc[c1, c2] = p
            except ValueError:
                # ASSIGN AS NOT SIGNIFICANT IF THERE IS A PROBLEM
                AM.loc[c1, c2] = 1
    AM.to_csv(OUT, sep=',')
    return

# FILES
data_matrix = 'input_test.csv'
out = 'output_mt_test.csv'

# FUNCTION CALL
build_adjacency_matrix(data_matrix, out)

Here are the top few rows of the input file:

,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
SAMPLE1,1,0,0.5,1,1,0.5,0.5,1,0.5,0.5,0.5,0.5,0,0.5,0,0.5,0,0.5,0.5
SAMPLE2,0.5,0.5,0.5,1,1,0.5,0.5,1,0.5,0.5,0,1,0,0.5,0,0.5,0.5,0.5,0.5
SAMPLE3,0.5,0,0.5,1,1,0.5,0.5,1,0.5,0.5,1,0.5,0.5,0.5,0,1,0,0.5,0.5
SAMPLE4,1,0.5,0.5,1,1,0.5,0.5,0,0.5,0.5,0.5,0.5,0.5,0.5,1,1,0.5,0.5,1

And here are the top few rows of the output file:

,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,0.920167313802
VAR2,0.00326965769624,0,0.67328997966,0.714393037634,0.714393037634,0.829638099719,1.0,0.881545828869,1.0,1.0,0.504985075094,0.665005542102,0.672603817442,0.75946286538,0.365088814029,1.0,0.478520976544,0.698535358303,0.700311372937
VAR3,0.67328997966,0.67328997966,0,1.0,1.0,0.665005542102,1.0,0.672603817442,1.0,1.0,1.0,1.0,0.819476976778,1.0,0.324126587758,1.0,1.0,0.665005542102,0.608407800233

The code works well and produces the expected output for the test file; however, the real input file (exactly the same structure, but with 100s of rows and 1000s of columns) is considerably larger and takes ~48 hours to run, so I need to make it faster.
I tried the following attempt at implementing multithreading:

import pandas as pd
from scipy.stats import chi2_contingency
from threading import Thread

def build_adjacency_matrix(DATA_MATRIX, OUT, THREADS):
    # READS DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)
    # INITIALIZE EMPTY DF WITH COLNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index=my_data.columns)
    print(len(my_data.columns))
    print(len(my_data.index))

    # BUILD THREAD GROUPS
    thread_groups = {}
    chunk = int(len(AM.columns) / THREADS)
    i = 0
    j = chunk
    for t in range(THREADS):
        thread_groups[t] = list(range(i, j))
        i += chunk
        j += chunk
    # DELEGATE REMAINING COLS TO THE LAST THREAD
    if thread_groups[THREADS-1][-1] != len(AM.columns):
        thread_groups[THREADS-1] = thread_groups[THREADS-1] + \
            list(range((thread_groups[THREADS-1][-1]), len(AM.columns)))
    print(thread_groups)

    def populate_DF(section):
        for c1 in AM.columns[section]:
            for c2 in AM.columns:
                if c1 == c2:
                    AM.loc[c1, c2] = 0
                    continue
                sample_df = pd.DataFrame(my_data, columns=[c1, c2])
                # KEEP ONLY ROWS WITH 1s and 0s
                sample_df = sample_df[sample_df[c1] != 0.5]
                sample_df = sample_df[sample_df[c2] != 0.5]
                sample_df = sample_df.dropna()
                # CALCULATE ChiX
                # Contingency table.
                contingency = pd.crosstab(sample_df[c1], sample_df[c2])
                # Chi-square test of independence.
                try:
                    # POPULATE AM WITH CHI-SQ p-value
                    chi2, p, ddof, expected = chi2_contingency(contingency)
                    AM.loc[c1, c2] = p
                except ValueError:
                    # ASSIGN A p-value OF 1.0 IF THERE IS A PROBLEM
                    AM.loc[c1, c2] = 1

    for tg in thread_groups:
        t = Thread(target=populate_DF, args=(thread_groups[tg],))
        print(tg)
        print(thread_groups[tg])
        t.start()

    AM.to_csv(OUT, sep=',')
    return

data_matrix = 'input_test.csv'
out = 'output_mt_test.csv'
build_adjacency_matrix(data_matrix, out, 4)

I'm not sure if I should be making the output dataframe a global variable, or how to do that. The aim of the 'build thread groups' section is to delegate groups of columns from the input file to separate threads, with each of the outputs added to the final dataframe. I have up to 16 cores available, so I thought a multithreading solution would help here. The code as it is produces an unexpected, partially complete output:

,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,
VAR2,,,,,,,,,,,,,,,,,,,
VAR3,,,,,,,,,,,,,,,,,,,
VAR4,,,,,,,,,,,,,,,,,,,
VAR5,0.573642138098,0.714393037634,1.0,5.61531250139e-06,0,1.0,1.0,0.859350808026,0.819476976778,0.819476976778,1.0,1.0,0.805020272634,,,,,,
VAR6,,,,,,,,,,,,,,,,,,,
VAR7,,,,,,,,,,,,,,,,,,,
VAR8,,,,,,,,,,,,,,,,,,,
VAR9,1.0,1.0,1.0,0.819476976778,,,,,,,,,,,,,,,
VAR10,,,,,,,,,,,,,,,,,,,
VAR11,,,,,,,,,,,,,,,,,,,
VAR12,,,,,,,,,,,,,,,,,,,
VAR13,0.914326585297,,,,,,,,,,,,,,,,,,
VAR14,,,,,,,,,,,,,,,,,,,
VAR15,,,,,,,,,,,,,,,,,,,
VAR16,,,,,,,,,,,,,,,,,,,
VAR17,,,,,,,,,,,,,,,,,,,
VAR18,,,,,,,,,,,,,,,,,,,
VAR19,,,,,,,,,,,,,,,,,,,

I'm not sure if this is an issue with the multiple threads trying to write to the same variable, or a problem with how I have spread the workload. I would really appreciate any help with how to fix this, or any other ways to optimize the code. Thanks in advance!
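One likely cause, shown as a minimal sketch against the build_adjacency_matrix above: AM.to_csv runs as soon as the threads are started, so the half-filled matrix is written before the workers finish. Keeping the Thread objects and joining them before writing makes the output complete:

# keep a handle on each worker and wait for all of them before writing the CSV
threads = []
for tg in thread_groups:
    t = Thread(target=populate_DF, args=(thread_groups[tg],))
    t.start()
    threads.append(t)
for t in threads:
    t.join()    # block until every slice of AM has been filled in
AM.to_csv(OUT, sep=',')

Note that, because of the GIL, threads will not spread this CPU-bound chi-square loop across the 16 cores; a multiprocessing-based approach would be needed for a real speed-up.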
python/pandas/sklearn: getting closest matches from pairwise_distances
I have a dataframe and am trying to get the closest matches using mahalanobis distance across three categories, like:

from io import StringIO
from sklearn import metrics
import pandas as pd

stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")

stats = ['ratio1', 'pct1', 'rsp']
df = pd.read_csv(stringdata)
d = metrics.pairwise.pairwise_distances(df[stats].as_matrix(), metric='mahalanobis')
print(df)
print(d)

That pid column is a unique identifier. What I need to do is take the ndarray returned by the pairwise_distances call and update the original dataframe so each row carries some kind of list of its closest N matches (so pid 0 might have a distance-ordered list like 2, 1, 5, 3, 4, or whatever it actually is), but I'm totally stumped about how this is done in python.
from io import StringIO
from sklearn import metrics
import numpy as np
import pandas as pd

stringdata = StringIO(u"""pid,ratio1,pct1,rsp
0,2.9,26.7,95.073615
1,11.6,29.6,96.963660
2,0.7,37.9,97.750412
3,2.7,27.9,102.750412
4,1.2,19.9,93.750412
5,0.2,22.1,96.750412
""")

stats = ['ratio1', 'pct1', 'rsp']
df = pd.read_csv(stringdata)

dist = metrics.pairwise.pairwise_distances(df[stats].as_matrix(), metric='mahalanobis')
dist = pd.DataFrame(dist)

# argsort each row of the distance matrix: each row then lists the pids
# ordered from closest to farthest
ranks = np.argsort(dist, axis=1)
df["rankcol"] = ranks.apply(lambda row: ','.join(map(str, row)), axis=1)
df
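A small follow-up sketch, assuming df and dist from the answer above and that pid equals the row position (as in the sample data): the self-match always sorts first with distance 0, so it can be dropped to keep only each row's N closest other pids:

# keep only the 3 nearest other rows for each pid
n = 3
df["closest"] = [
    ",".join(str(pid) for pid in row.argsort()[1:n + 1])
    for row in dist.values
]
print(df[["pid", "closest"]])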