How to change the attribute values in h5ad file? - python

An h5ad file has an attribute called var_names. I converted the values of var_names to lowercase. Now I want to save the new values back to the var_names attribute in the h5ad file. How can I do that?
#ipynb file
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
# Read the two .h5ad files into file1 and file2.
file1 = sc.read_h5ad('/Users/nitish/Downloads/human_all.h5ad')
file2 = sc.read_h5ad('/Users/nitish/Downloads/mouse_all.h5ad')
# The results of these two calls are not assigned to anything.
file1.var_names.str.lower()
file2.var_names.str.lower()
# Rebind var_names on the in-memory objects to the lowercased values.
# NOTE(review): nothing in this snippet writes back to disk; presumably the
# .h5ad files stay unchanged until the objects are saved explicitly - confirm.
file1.var_names = file1.var_names.str.lower()
file2.var_names = file2.var_names.str.lower()
How do I save the new file1.var_names to the existing h5ad file?

Related

Iterate through df and update based on prediction

I am not a Python programmer, so I am struggling with the following:
# Loads a pickled model and, for the rows of df, predicts from the columns
# 'abc', 'def', 'ghi' and 'jkl'; returns df.
# NOTE(review): the function body has lost its indentation in this listing.
def py_model(df):
import pickle
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
filename = 'C:/aaaTENNIS-DATA/votingC.pkl'
# The file object passed to pickle.load is never closed explicitly.
loaded_model = pickle.load(open(filename,'rb'))
for index, row in df.iterrows():
ab = row[['abc','def','ghi','jkl']]
# NOTE: `input` shadows the built-in input() function.
input = np.array(ab)
# Assigns to the whole 'Prediction' column (assuming this line sits inside
# the loop, it does so on every iteration), not only to the current row.
df['Prediction'] =pd.DataFrame(loaded_model.predict([input]))
# Placeholder from the question: `??` is not valid Python.
df['AccScore'] =??
return df
For each row of the dataframe, I wish to get a prediction and put it in df['Prediction'] and also get the model score and put it in another field.
You don't need to iterate
import pickle

filename = 'C:/aaaTENNIS-DATA/votingC.pkl'
# Open the pickle inside a `with` block so the file handle is closed as soon
# as the model has been loaded.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict for all rows in one call by passing the four feature columns at
# once, instead of looping over the rows.
df['Prediction'] = loaded_model.predict(df[['abc', 'def', 'ghi', 'jkl']])
Tip #1: don't use input as a variable name; it's a built-in function in Python: https://docs.python.org/3/library/functions.html#input
Tip #2: don't put import statements inside a function; put them all at the beginning of your file.

Iterating through directory for all csv files and creating kml for each individual(csv) file and save using file name

import csv
import simplekml
import pandas as pd
import glob
# `frame` is created here but not used in the rest of this snippet.
frame = pd.DataFrame()
filelist=glob.glob('/Users/germanportes/Documents/Status_Report/Telework_training/Anomaly_6/files/*.csv')
# A single Kml object is created once, before the loop over the files.
kml = simplekml.Kml()
for file in filelist:
a6 =pd.read_csv(file)
# `row` is never used in the call below; whole columns of a6 are passed instead.
for row in a6:
kml.newpoint(name=a6['idfa'], description = a6['device_os'],coords = [(a6['longitude'], a6['latitude'])])
# str(a6) is the string form of the DataFrame, not the CSV's filename, and
# the suffix written here is '.csv'.
kml.save('/Users/germanportes/Documents/Status_Report/Telework_training/Anomaly_6/files/kml/'+str(a6)+'.csv')
I would like to save each individual CSV as its own KML file, named after the CSV's filename.
Here you're iterating over the columns instead of the rows, and you then pass whole columns (pandas.Series objects) to the kml.newpoint arguments instead of single values. Use DataFrame.apply() to iterate over the dataframe rows and add one point per row to your kml object, as follows:
from os.path import join
from glob import iglob
from pathlib import Path
import simplekml
import pandas as pd
csv_dir = 'path/to/csv/directory'
kml_dir = 'path/to/kml/directory'

for file in iglob(join(csv_dir, '*.csv')):
    # read the csv file
    df = pd.read_csv(file)
    # make an empty kml object for this csv file
    kml = simplekml.Kml()
    # iterate over the rows and add one new point per row to the kml object
    df.apply(
        lambda x: kml.newpoint(
            name=x['idfa'],
            description=x['device_os'],
            coords=[(x['longitude'], x['latitude'])],
        ),
        axis=1,
    )
    # save it as kml, reusing the csv's base filename (without its extension)
    kml.save(join(kml_dir, '{}.kml'.format(Path(file).stem)))

Appending dataframes from json files in a for loop

I am trying to iterate through json files in a folder and append them all into one pandas dataframe.
If I say
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'
# `directory` is not used in the rest of this snippet.
directory = os.fsencode(directory_in_str)
df_all = pd.DataFrame()
# Read one file and flatten its 'rooms' entry into a DataFrame.
with open("building_data/rooms.json") as file:
data = json.load(file)
df = json_normalize(data['rooms'])
# NOTE(review): `df_y` is not defined in this snippet (df_all was probably
# intended), and the return value of append() is discarded.
df_y.append(df, ignore_index=True)
I get a dataframe with the data from the one file. If I turn this thinking into a for loop, I have tried
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'
directory = os.fsencode(directory_in_str)
df_all = pd.DataFrame()
# The loop variable is `file`, but the open() call below reads `filename`,
# which is not defined anywhere in this snippet.
for file in os.listdir(directory):
# `as file` rebinds the loop variable to the opened file handle.
with open(directory_in_str+'/'+filename) as file:
data = json.load(file)
df = json_normalize(data['rooms'])
# NOTE(review): the result of append() is discarded. Presumably append()
# returns a new DataFrame rather than modifying df_all in place, which would
# explain the empty output - confirm.
df_all.append(df, ignore_index=True)
print(df_all)
This returns an empty dataframe. Does anyone know why this is happening? If I print df before appending it, it prints the correct values, so I am not sure why it is not appending.
Thank you!
Instead of appending each new DataFrame, I would try joining them like this:
# Start from the first frame; join every later frame onto the accumulated one.
df_all = df if df_all.empty else df_all.join(df)
When joining DataFrames, you can specify on what they should be joined - on index or on specific (key) column, as well as how (default option is similar to appending - 'left').
Here's docs about pandas.DataFrame.join.
In these instances I load everything from json into a list by appending each file's returned dict onto that list. Then I pass the list to pandas.DataFrame.from_records (docs)
In this case the source would become something like...
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'

# Collect the flattened rows of every file first, then build one DataFrame
# from all of them at the end.
json_data = []
# List the directory by its str path so the entries come back as str and can
# be joined onto it; sorted() makes the row order reproducible.
for filename in sorted(os.listdir(directory_in_str)):
    with open(os.path.join(directory_in_str, filename)) as file:
        data = json.load(file)
    # from_records expects a sequence of records, not DataFrames, so turn each
    # normalized frame into a list of row dicts before collecting it.
    json_data.extend(json_normalize(data['rooms']).to_dict('records'))

df_all = pd.DataFrame.from_records(json_data)
print(df_all)

How to add one more column to my file (modifiedFlights.csv) from another file (original.csv)

I want to add a names column to my file (modifiedFlights.csv) from another file (original.csv), which has a names column. The goal is to add the names to modifiedFlights.csv by matching on the hashes column, which is present in both files. But I am not able to do so.
import os
import glob
from pathlib import Path
import pandas as pd
import pandas
import csv
import numpy as np
from pandas import DataFrame
import sys, argparse, csv
# Read single columns by position: columns 0 and 1 of original.csv and
# column 4 of modifiedFlights.csv.
hashes=pd.read_csv(r'C:\Users\Sajid\Desktop\original.csv', usecols=[0]) #hashes in original.csv
names=pd.read_csv(r'C:\Users\Sajid\Desktop\original.csv', usecols=[1])
this=pd.read_csv(r'C:\Users\Sajid\Desktop\csv files\modifiedFlights.csv', usecols=[4])# hashes in modifiedFlights.csv
# NOTE(review): `hashes` and `this` are DataFrames; iterating a DataFrame
# presumably yields column labels, not row values - confirm that this loop
# compares the intended hashes.
for i in hashes:
for y in this:
if i == y:
# NOTE(review): `userows` is presumably not a pandas.read_csv argument - verify.
# The result is stored under the name `results_row`.
results_row=pd.read_csv(r'C:\Users\Sajid\Desktop\original.csv', usecols=[1], userows=[i])
with open(r'C:\Users\Sajid\Desktop\csv files\modifiedFlights.csv','r') as csvinput:
with open(r'C:\Users\Sajid\Desktop\out.csv', 'w') as csvoutput:
writer = csv.writer(csvoutput)
for row in csv.reader(csvinput):
# `result_row` is not defined in this snippet; the name assigned above is
# `results_row`.
writer.writerow(row+[result_row])

how to name the output dataframe generated by python-ply

How can I name the data frame generated by the following code?
import re
import os
import csv
import codecs
import numpy as np
import matplotlib as p
import pdb
import pandas as pd
from pandas_ply import install_ply, X, sym_call
# NOTE(review): presumably this adds the ply_* methods (e.g. ply_select, used
# below) to pandas objects - confirm against the pandas_ply documentation.
install_ply(pd)
# The result of this expression is not assigned to any variable.
(data_merged
.groupby('index')
.ply_select(
count = X.index.count(),
p_avg = X.item_price.mean()
))
Looking at your example, I assume that you mean to name the output variable of the dataframe, which would be data_out in the following:
# Bind the grouped summary to the name data_out so it can be referred to later.
data_out = data_merged.groupby('index').ply_select(
    count=X.index.count(),
    p_avg=X.item_price.mean(),
)
Note that this does not actually give the DataFrame a name; it just binds the DataFrame to the variable data_out. You could create another variable with a different name that refers to the same DataFrame. This works because Python variables are references to objects: assignment binds a name to the object and does not copy it.
Series are named and its name is stored in the name attribute. DataFrames are not named, but their columns are, since they are Series.

Categories

Resources