Is there a way to reduce this repetitive code? - python

def frame(dt_type, start_year, end_year, columns_req):
frame = pd.DataFrame()
for i in range (start_year, end_year):
file_name = f"{dt_type} {i}"
dataframe = pd.read_csv(BytesIO(uploaded["%s.csv"%file_name]))
if len(columns_req) == 1:
df = pd.DataFrame(data, columns= [columns_req[0])
if len(columns_req) == 2:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1]])
if len(columns_req) == 3:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2])
if len(columns_req) == 4:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2], columns_req[3]])
frame = frame.append(dataframe, ignore_index=True)
return (frame)
As you can see, the chain of if statements is repetitive and feels odd. I am new to programming. Is there any way to reduce that whole bunch of code?

you could do this
# Pass the whole list of requested column names at once; no branching on its length is needed.
df = pd.DataFrame(data, columns=columns_req)
instead of all those if - conditions

Related

Dictionary creation inside a function

Let's say I have the following dataframe:
import pandas as pd
data = {'Flag':['a', 'b', 'a', 'b'],
'Item':['ball', 'car', 'pen', 'candy'],
'Char1':[0, 0, 0, 0],
'Char2':[23, 21, 19, 13],
'Char3':[40, 43, 60, 70]}
df = pd.DataFrame(data)
Now, let's perform some calculation:
df['Char1_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.mean(), axis=1)
df['Char1_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.std(), axis=1)
df['Char2_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.mean(), axis=1)
df['Char2_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.std(), axis=1)
df['Char3_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.mean(), axis=1)
df['Char3_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.std(), axis=1)
Finally let's create the following dictionary:
Flag_list = ['a','b']
sum_dict = {'Flag':Flag_list,
'Char1_average':df['Char1_avg'].head(2).tolist(),
'Char1_std':df['Char1_std'].head(2).tolist(),
'Char2_average':df['Char2_avg'].head(2).tolist(),
'Char2_std':df['Char2_std'].head(2).tolist(),
'Char3_average':df['Char3_avg'].head(2).tolist(),
'Char3_std':df['Char3_std'].head(2).tolist()}
In this way all works fine,
correct dictionary
but I need to define a function that performs the same things, so I have written the following code:
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
for param in param_list:
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
sum_dict = {'Flag':Flag_list,
f'{param}_average':dref[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dref[f'{param}_StDev'].head(2).tolist()}
ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration:
wrong dictionary
How can I get the same dictionary as in the previous case?
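The dictionary is rebuilt from scratch on every pass of the loop, so only the last iteration survives. Instead, merge each iteration's entries into one dictionary that lives outside the loop, so that it ends up holding the values from every iteration.
Here is the solution to your query: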
def fnctn(dataf):
    """Summarise Char1, Char2 and Char3 per value of the 'Flag' column.

    For each of the three parameters, two columns are added to ``dataf`` in
    place: ``<param>_avg`` and ``<param>_StDev``, holding the mean and the
    sample standard deviation of that parameter over all rows sharing the
    row's flag.

    Returns a new DataFrame with one row per distinct flag (in order of first
    appearance) and the columns ``Flag``, ``<param>_average`` and
    ``<param>_std`` for every parameter.
    """
    param_list = ["Char1", "Char2", "Char3"]
    # Take the flags from the data itself instead of relying on a global list
    # and on the first two rows happening to be one row per flag.
    flags = dataf["Flag"].drop_duplicates().tolist()
    dictie = {"Flag": flags}
    for param in param_list:
        by_flag = dataf.groupby("Flag")[param]
        # transform() broadcasts each group's statistic back onto its rows,
        # replacing the row-by-row apply that re-filtered the frame each time.
        dataf[f"{param}_avg"] = by_flag.transform("mean")
        dataf[f"{param}_StDev"] = by_flag.transform("std")
        # Merge this parameter's entries into the dictionary created outside
        # the loop, so earlier parameters are kept rather than overwritten.
        dictie.update({
            f"{param}_average": by_flag.mean().reindex(flags).tolist(),
            f"{param}_std": by_flag.std().reindex(flags).tolist(),
        })
    return pd.DataFrame(dictie)
dataf = df.copy()
fnctn(dataf)
And the answer is as below:

How to select column if string is in column name

I have a dict of dataframes with many columns. I want to select all the columns that have the string 'important' in their name.
Some of the frames may have important_0 or important_9_0 as a column name. How can I select those columns and put them into a new dictionary of their own, together with all the values each column contains?
import pandas as pd
# Sample frame: two ordinary columns plus one whose name carries the prefix.
df = pd.DataFrame(columns=['a', 'b', 'important_c'])
# Names of the columns that begin with 'important_'.
selected_cols = list(filter(lambda name: name.startswith('important_'), df.columns))
print(selected_cols)
# ['important_c']
# A dict of three identically shaped frames, keyed 0..2.
dict_df = {}
for key in range(3):
    dict_df[key] = pd.DataFrame(columns=['a', 'b', 'important_c'])
# Same keys, but each frame is narrowed to its 'important_' columns.
new_dict = {}
for key, frame in dict_df.items():
    wanted = [name for name in frame.columns if name.startswith('important_')]
    new_dict[key] = frame[wanted]
# Alternative: match 'important' anywhere in the column name.
important_columns = [name for name in df.columns if name.find('important') != -1]
# Narrow the dataframe down to just the columns you need.
df = df.loc[:, important_columns]

comparing cells iteration using pandas

I'm trying to compare cells within a data frame using pandas.
the data looks like that:
seqnames, start, end, width, strand, s1, s2, s3, sn
1, Ha412HOChr01, 1, 220000, 220000, CN2, CN10, CN2, CN2
2, Ha412HOChr01, 1, 220000, 220000, CN2, CN2, CN2, CN2
3, Ha412HOChr01, 1, 220000, 220000, CN2, CN4, CN2, CN2
n, Ha412HOChr01, 1, 220000, 220000, CN2, CN2, CN2, CN6
I was able to make individual comparisons with the following code
import pandas as pd
df = pd.read_csv("test.csv")
if df.iloc[0,5] != df.iloc[0,6]:
print("yay!")
else:
print("not intersting...")
I would like to repeat this comparison between s1 and all the other s columns, line by line, either in a loop or with any more efficient method.
When I tried the following code:
df = pd.read_csv("test.csv")
df.columns
#make sure to change in future analysis
ref = df[' Sunflower_14_S8']
all_the_rest = df.drop(['seqnames', ' start', ' end', ' width', ' strand'], axis=1)
#all_the_rest.columns
OP = ref.eq(all_the_rest)
OP.to_csv("OP.csv")
I got a weird output:
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False444,False,False,False,False,False,False,False,False,False,False,False,False,False
It seems like it compares all the characters instead of the strings.
I'm new to programming and I'm stuck; I'd appreciate your help!
Does this help?
import pandas as pd
# define a list of columns you want to compare
cols = ['s1', 's2', 's3']
# some sample data
df = pd.DataFrame(columns=cols)
df['s1'] = ['CN2', 'CN10', 'CN2', 'CN2']
df['s2'] = ['CN2', 'CN2', 'CN2', 'CN2']
df['s3'] = ['CN2', 'CN2', 'CN2', 'CN6']
# remove 's1' from the list of columns
cols_except_s1 = [x for x in cols if x != 's1']
# create a blank dataframe to hold our comparisons
df_comparison = pd.DataFrame(columns=cols_except_s1)
# iterate through each other column, comparing it against 's1'
for x in cols_except_s1:
    # whole-string, element-wise equality: one Boolean per row
    df_comparison[x] = df['s1'] == df[x]
# the result is a dataframe that has columns of Boolean values
print(df_comparison)
outputs
s2 s3
0 True True
1 False False
2 True True
3 True False
Well, 9 hours later I have found a way that works on plain Python lists instead of pandas comparisons...
df = pd.read_csv("test.csv")
# Convert the data frame to a list of rows (one plain Python list per line).
# Named `rows` rather than `list` so the builtin is not shadowed.
rows = df.values.tolist()
for line in rows:
    # Everything from position 5 onwards is averaged; position 5 itself is
    # the reference value.
    # NOTE(review): assumes these cells are numeric - confirm against the real data.
    samples = line[5:]
    lineAVG = sum(samples) / len(samples)
    ref = line[5]
    # Report only the lines whose average exceeds the reference by more than 0.15.
    if lineAVG - ref > 0.15:
        output = line
        print(output)

Joining dataframe in for loop

My code won't work... it gives me ValueError: columns overlap but no suffix specified
import pandas as pd
import pickle
list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
"ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
maindf = pd.DataFrame()
for x in list:
df = pd.read_csv(x)
if x == "ZILLOW2.csv":
maindf = pd.DataFrame(df)
else:
maindf = maindf.join(df)
print(maindf)
Use concat:
import pandas as pd
import pickle
# Named `csv_files` rather than `list` so the builtin is not shadowed.
csv_files = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
             "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
# Read every file and place the frames side by side (axis=1) in a single
# concat call. Unlike join, concat does not complain about overlapping column
# names, and no special case for the first file is needed.
maindf = pd.concat([pd.read_csv(path) for path in csv_files], axis=1)
print(maindf)

searching in a pandas df that contains ranges

I have a pandas df that contains 2 columns 'start' and 'end' (both are integers). I would like an efficient method to search for rows such that the range that is represented by the row [start,end] contains a specific value.
Two additional notes:
It is possible to assume that ranges don't overlap
The solution should support a batch mode - that given a list of inputs, the output will be a mapping (dictionary or whatever) to the row indices that contain the matching range.
For example:
start end
0 7216 7342
1 7343 7343
2 7344 7471
3 7472 8239
4 8240 8495
and the query of
[7215,7217,7344]
will result in
{7217: 0, 7344: 2}
Thanks!
Brute force solution, could use lots of improvements though.
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]})
search = [7215, 7217, 7344]
# Maps each searched value to the index of the row whose [start, end] contains it;
# values that fall in no range are simply left out.
res = {}
for i in search:
    # One full scan of the frame per searched value.
    mask = (df.start <= i) & (df.end >= i)
    idx = df[mask].index.values
    if len(idx):
        # Ranges are assumed not to overlap, so the first match is the only one.
        # int() turns the numpy scalar into a plain Python int for clean output.
        res[i] = int(idx[0])
print(res)
Yields
{7344: 2, 7217: 0}
Selected solution
This new solution could perform better. But there is a limitation: it only works if there are no gaps between the ranges, as in the example provided.
# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start','end'])
query = [7215,7217,7344]
# Reshaping the original DataFrame: one row per range boundary, indexed by the
# boundary value, with the originating row number in the 'index' column
df = pd.concat([df['start'], df['end']]).reset_index()
df = df.set_index(0).sort_index()
# A one-value range (start == end) contributes the same boundary twice; keep a
# single copy so the index stays unique for the join and the lookup below
df = df[~df.index.duplicated()]
# Creating a DataFrame with a continuous index
max_range = df.index.max() + 1
min_range = df.index.min()
s = pd.DataFrame(index=range(min_range, max_range))
# Joining them
s = s.join(df)
# Filling the gaps: every value inside a range takes the row number found at
# the next boundary
s = s.bfill()
# Then a simple selection gives the result. reindex (unlike .loc) tolerates
# query values outside the covered span; they come back as NaN and are dropped
result = s.reindex(query).dropna().to_dict()['index']
# Result
{7217: 0.0, 7344: 2.0}
Previous proposal
import numpy as np

# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start','end'])
# The query numbers
query = [7215,7217,7344]
# Making the test: compare every range with every query value in one broadcast.
# hits[r, q] is True when the range in row r contains query[q].
values = np.asarray(query)
starts = df['start'].to_numpy()[:, None]
ends = df['end'].to_numpy()[:, None]
hits = (starts <= values) & (values <= ends)
# Keeping only values found, each paired with the row that actually contains it
# (so several query values falling in the same row are all reported)
row_pos, query_pos = np.nonzero(hits)
# Extracting to the output format
result = {query[q]: df.index[r] for r, q in zip(row_pos, query_pos)}
# The result
{7217: 0, 7344: 2}

Categories

Resources