df1 = pd.read_excel(mxln) # Loads master xlsx for comparison
df2 = pd.read_excel(sfcn) # Loads student xlsx for comparison
difference = df2[df2 != df1] # Scans for differences
Wherever there is a difference, I want to store those cell locations in a list. It needs to be in the format 'A1' (not something like [1, 1]) so I can pass it through this:
redFill = PatternFill(start_color='FFEE1111', end_color='FFEE1111', fill_type='solid')
lsws['A1'].fill = redFill  # lsws is my openpyxl worksheet
lsfh.save(sfcn)            # lsfh is my openpyxl workbook
I've looked at solutions like this, but I couldn't get it to work/don't understand it. For example, the following doesn't work:
def highlight_cells():
    df1 = pd.read_excel(mxln)  # Loads master xlsx for comparison
    df2 = pd.read_excel(sfcn)  # Loads student xlsx for comparison
    difference = df2[df2 != df1]  # Scans for differences
    return ['background-color: yellow']

df2.style.apply(highlight_cells)
To get the cells that differ between two pandas DataFrames as Excel coordinates, you can do:
Code:
import numpy as np

def diff_cell_indices(dataframe1, dataframe2):
    from openpyxl.utils import get_column_letter as column_letter
    x_ofs = dataframe1.columns.nlevels + 1
    y_ofs = dataframe1.index.nlevels + 1
    return [column_letter(x + x_ofs) + str(y + y_ofs)
            for y, x in zip(*np.where(dataframe1 != dataframe2))]
Test Code:
import pandas as pd
df1 = pd.read_excel('test.xlsx')
print(df1)
df2 = df1.copy()
df2.loc['R2', 'C'] = 1
print(df2)
print(diff_cell_indices(df1, df2))
Results:
    B  C
R2  2  3
R3  4  5

    B  C
R2  2  1
R3  4  5

['C2']
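To tie that back to the original goal, here is a minimal sketch (assuming the asker's mxln/sfcn paths and that the compared data sits on the active sheet) that feeds those coordinates straight into the red fill:

import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

df1 = pd.read_excel(mxln)  # master workbook
df2 = pd.read_excel(sfcn)  # student workbook

wb = load_workbook(sfcn)
ws = wb.active  # assumption: the compared data is on the active sheet
redFill = PatternFill(start_color='FFEE1111', end_color='FFEE1111', fill_type='solid')
for coord in diff_cell_indices(df1, df2):
    ws[coord].fill = redFill
wb.save(sfcn)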
I have a data frame
cat input.csv
dwelling,wall,weather,occ,height,temp
5,2,Ldn,Pen,154.7,23.4
5,4,Ldn,Pen,172.4,28.7
3,4,Ldn,Pen,183.5,21.2
3,4,Ldn,Pen,190.2,30.3
To which I'm trying to apply the following function:
input_df = pd.read_csv('input.csv')
def folder_column(row):
    if row['dwelling'] == 5 and row['wall'] == 2:
        return 'folder1'
    elif row['dwelling'] == 3 and row['wall'] == 4:
        return 'folder2'
    else:
        return 0
I want to run the function on the input dataset and store the output in a separate data frame using something like this:
temp_df = pd.DataFrame()
temp_df = input_df['archetype_folder'] = input_df.apply(folder_column, axis=1)
But when I do this I only get the newly created 'archetype_folder' column in temp_df, when I would like all the original columns from input_df as well. Can anyone help? Note that I don't want to add the new 'archetype_folder' column to the original input_df. I've also tried this:
temp_df = input_df
temp_df['archetype_folder'] = temp_df.apply(folder_column, axis=1)
But when I run the second command, both input_df and temp_df end up with the new column.
Any help is appreciated!
Use DataFrame.copy:
temp_df = input_df.copy()
temp_df['archetype_folder'] = temp_df.apply(folder_column, axis=1)
You need to create a copy of the original DataFrame, then assign the return values of your function to it. Consider the following simple example:

import pandas as pd

def is_odd(row):
    return row.value % 2 == 1

df1 = pd.DataFrame({"value": [1, 2, 3], "name": ["uno", "dos", "tres"]})
df2 = df1.copy()
df2["odd"] = df1.apply(is_odd, axis=1)
print(df1)
print("=====")
print(df2)
gives output
   value  name
0      1   uno
1      2   dos
2      3  tres
=====
   value  name    odd
0      1   uno   True
1      2   dos  False
2      3  tres   True
You don't need apply. Use .loc to be more efficient.
temp_df = input_df.copy()
m1 = (input_df['dwelling'] == 5) & (input_df['wall'] == 2)
m2 = (input_df['dwelling'] == 3) & (input_df['wall'] == 4)
temp_df.loc[m1, 'archetype_folder'] = 'folder1'
temp_df.loc[m2, 'archetype_folder'] = 'folder2'
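If you also need the 0 default from your original else branch (the .loc version above leaves NaN where neither mask matches), numpy.select covers all three cases in one vectorized call. A sketch, assuming numpy is available:

import numpy as np

temp_df = input_df.copy()
conditions = [
    (input_df['dwelling'] == 5) & (input_df['wall'] == 2),
    (input_df['dwelling'] == 3) & (input_df['wall'] == 4),
]
# np.select takes the first matching choice per row, else the default
temp_df['archetype_folder'] = np.select(conditions, ['folder1', 'folder2'], default=0)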
I have the following file names in a folder C:\Downloads -
Mango001-003.csv
Mango004-006.csv
Mango007-100.csv
Applefruit.csv
Banana001-003.csv
Banana004-006.csv
How do I import the fruit files separately and then join the files for the same fruit into a single file?
What's expected is one output for Mango, one for Apple, and one for Banana.
import os
import re
import pandas as pd

data_files = os.listdir(r'C:\Downloads')

def load_files(filenames):
    # Pre-compile regex for code readability
    regex = re.compile(r'Mango.*?.csv')
    # Map filenames to match objects, filter out non-matching names
    matches = [m for m in map(regex.match, filenames) if m is not None]
    li = []
    for match in matches:
        df = pd.read_csv(match, index_col=None, header=0, dtype=object)
        li.append(df)
    # Concatenate the data
    frame = pd.concat(li, axis=0, ignore_index=True)
    return frame

df = load_files(data_files)
print(df.shape)
df.head(2)
I am getting errors. Besides, it shouldn't be this complex, so I must be doing something wrong.
I think the easiest way to do this is to use glob.glob to get a list of all files that start with a particular fruit name (here I used mango) and concatenate them all together using pd.concat.
import glob
import os
import pandas as pd

data_files = r"path\to\folder\containing\csv"
df_mango = pd.concat(map(pd.read_csv, glob.glob(os.path.join(data_files, 'mango*.csv'))),
                     ignore_index=True)
df_mango.to_csv('mango.csv')
Here is the example I tried:
mango0110.csv
   A  B  C
0  1  2  3

mango01220.csv
   A  B  C
0  4  5  6

To get:
   A  B  C
0  1  2  3
1  4  5  6
Perhaps not the greatest way to do it, but for the file names given...
Try:
import pandas as pd
import glob
import re

path = r'./files'  # use your path
all_files = glob.glob(path + "/*.csv")

fruits = []
# For each file in the folder, get the fruit name.
# This could be where things go wrong if the regex does not
# account for all filename types; the pattern may need tweaking.
# Example: https://regex101.com/r/E69LWa/1
for file in all_files:
    cleanFile = file.replace('fruit', '')
    match = re.match(r'^.*/([A-Za-z]+)', cleanFile)
    fruits.append(match.group(1))

# There will be one output for Mango, one for Apple & one for Banana, hence three lists...
dfs_man = []
dfs_ban = []
dfs_app = []

# For each file, create a df and append it to the list holding other dfs of the same fruit
for i, file in enumerate(all_files):
    df = pd.read_csv(file)
    if fruits[i] == 'Mango':
        dfs_man.append(df)
    elif fruits[i] == 'Banana':
        dfs_ban.append(df)
    elif fruits[i] == 'Apple':
        dfs_app.append(df)

# Concatenate if more than one df in the list, else just take the df out of the list
if len(dfs_man) > 1:
    df_mango = pd.concat(dfs_man, ignore_index=True)
elif len(dfs_man) == 1:
    df_mango = dfs_man[0]

if len(dfs_ban) > 1:
    df_banana = pd.concat(dfs_ban, ignore_index=True)
elif len(dfs_ban) == 1:
    df_banana = dfs_ban[0]

if len(dfs_app) > 1:
    df_apple = pd.concat(dfs_app, ignore_index=True)
elif len(dfs_app) == 1:
    df_apple = dfs_app[0]

print(df_mango.shape, df_banana.shape, df_apple.shape)
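For what it's worth, the three hand-maintained lists can be collapsed into a dict of lists keyed by fruit name; this is a sketch of the same idea, not the code above:

from collections import defaultdict

grouped = defaultdict(list)
for fruit, file in zip(fruits, all_files):
    grouped[fruit].append(pd.read_csv(file))

# pd.concat happily accepts a single-element list, so no length checks are needed
frames = {fruit: pd.concat(dfs, ignore_index=True) for fruit, dfs in grouped.items()}
print({fruit: df.shape for fruit, df in frames.items()})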
Thank you @Vidya Ganesh
import glob
import os
import pandas as pd

data_files = r'C:\Downloads'
list_file_names = ['Mango', 'Apple', 'Banana']

for name in list_file_names:
    df = pd.concat(map(pd.read_csv, glob.glob(os.path.join(data_files, name + '*.csv'))),
                   ignore_index=True)
    df = df.loc[:1000, :]  # label-based slice: keeps rows 0-1000
    print(name)
    print(df.shape)
    df.to_csv(name + ".csv")
I would like to use a function that produces multiple outputs to create multiple new columns in an existing pandas dataframe.
For example, say I have this test function which outputs 2 things:
def testfunc(TranspoId, LogId):
    thing1 = TranspoId + LogId
    thing2 = LogId - TranspoId
    return thing1, thing2
I can give those returned outputs to 2 different variables like so:
Thing1, Thing2 = testfunc(4, 28)
print(Thing1)
print(Thing2)
I tried to do this with a dataframe in the following way:
data = {'Name':['Picard','Data','Guinan'],'TranspoId':[1,2,3],'LogId':[12,14,23]}
df = pd.DataFrame(data, columns = ['Name','TranspoId','LogId'])
print(df)
df['thing1','thing2'] = df.apply(lambda row: testfunc(row.TranspoId, row.LogId), axis=1)
print(df)
What I want is something that looks like this:
data = {'Name':['Picard','Data','Guinan'],'TranspoId':[1,2,3],'LogId':[12,14,23], 'Thing1':[13,16,26], 'Thing2':[11,12,20]}
df = pd.DataFrame(data, columns=['Name','TranspoId','LogId','Thing1','Thing2'])
print(df)
In the real world that function is doing a lot of heavy lifting, and I can't afford to run it twice, once for each new variable being added to the df.
I've been banging my head against this for a few hours. Any insights would be greatly appreciated.
I believe the best way is to change the order and make a function that works with Series.
import pandas as pd
# Create function that deals with series
def testfunc(Series1, Series2):
    Thing1 = Series1 + Series2
    Thing2 = Series2 - Series1  # LogId - TranspoId, matching the desired output
    return Thing1, Thing2
# Create df
data = {'Name':['Picard','Data','Guinan'],'TranspoId':[1,2,3],'LogId':[12,14,23]}
df = pd.DataFrame(data, columns = ['Name','TranspoId','LogId'])
# Apply function
Thing1,Thing2 = testfunc(df['TranspoId'],df['LogId'])
print(Thing1)
print(Thing2)
# Assign new columns
df = df.assign(Thing1 = Thing1)
df = df.assign(Thing2 = Thing2)
# print df
print(df)
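Note that assign accepts several columns at once, so the two assign calls above can be collapsed into one:

df = df.assign(Thing1=Thing1, Thing2=Thing2)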
Your function should return a series that calculates the new columns in one pass. Then you can use pandas.apply() to add the new fields.
import pandas as pd
df = pd.DataFrame( {'TranspoId':[1,2,3], 'LogId':[4,5,6]})
def testfunc(row):
    new_cols = pd.Series([
        row['TranspoId'] + row['LogId'],
        row['LogId'] - row['TranspoId']])
    return new_cols

df[['thing1', 'thing2']] = df.apply(testfunc, axis=1)
print(df)
print(df)
Output:
   TranspoId  LogId  thing1  thing2
0          1      4       5       3
1          2      5       7       3
2          3      6       9       3
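On recent pandas versions (0.23+), apply can also expand a plain tuple for you via result_type='expand', which saves building the Series by hand; a sketch of the same computation:

def testfunc(row):
    return row['TranspoId'] + row['LogId'], row['LogId'] - row['TranspoId']

df[['thing1', 'thing2']] = df.apply(testfunc, axis=1, result_type='expand')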
I have a text file with data that repeats every 3 rows. Let's say it is hash, directory, subdirectory. The data looks like the following:
a3s2d1f32a1sdf321asdf
Dir_321321
Dir2_asdf
s21a3s21d3f21as32d1f
Dir_65465
Dir2_werq
asd21231asdfa3s21d
Dir_76541
Dir2_wbzxc
....
I have created a Python script that takes the data and turns every 3 rows into columns:
import pandas as pd
df1 = pd.read_csv('RogTest/RogTest.txt', delimiter = "\t", header=None)
df2 = df1[df1.index % 3 == 0]
df2 = df2.reset_index(drop=True)
df3 = df1[df1.index % 3 == 1]
df3 = df3.reset_index(drop=True)
df4 = df1[df1.index % 3 == 2]
df4 = df4.reset_index(drop=True)
df5 = pd.concat([df2, df3], axis=1)
df6 = pd.concat([df5, df4], axis=1)
# Rename columns
df6.columns = ['Hash', 'Dir_1', 'Dir_2']
# Write to csv
df6.to_csv('RogTest/RogTest.csv', index=False, header=True)
This works fine, but I am curious whether there is a more efficient way to do this, i.e. less code?
You can use:
import numpy as np

df_final = pd.DataFrame(np.reshape(df1.values, (df1.shape[0] // 3, 3)))
df_final.columns = ['Hash', 'Dir_1', 'Dir_2']
Output:
                    Hash       Dir_1       Dir_2
0  a3s2d1f32a1sdf321asdf  Dir_321321   Dir2_asdf
1   s21a3s21d3f21as32d1f   Dir_65465   Dir2_werq
2     asd21231asdfa3s21d   Dir_76541  Dir2_wbzxc
Morning,
I have 3 Excel files that I have imported with read_excel. I am trying to create a DataFrame which takes the name ('Ticker') column from each import, adds the title of the Excel file ('Sector'), and appends them to each other to create a new DataFrame. This new DataFrame will then be exported to Excel.
AA = ['Aero&Def', 'REITs', 'Auto&Parts']

File = 'FTSEASX_' + AA[0] + '_Price.xlsx'
xlsx = pd.ExcelFile('C:/Users/Ben/' + File)
df = pd.read_excel(xlsx, 'Price_Data')
df = df[df.Identifier.notnull()]
df.fillna(0)

a = []
b = []
for i in df['Ticker']:
    a.append(i)
    b.append(AA[0])
raw_data = {'Ticker': a, 'Sector': b}
df2 = pd.DataFrame(raw_data, columns=['Ticker', 'Sector'])

del AA[0]

for j in AA:
    File = 'FTSEASX_' + j + '_Price.xlsx'
    xlsx = pd.ExcelFile('C:/Users/Ben/' + File)
    df3 = pd.read_excel(xlsx, 'Price_Data')
    df3 = df3[df3.Identifier.notnull()]
    df3.fillna(0)
    a = []
    b = []
    for i in df3['Ticker']:
        a.append(i)
        b.append(j)
    raw_data = {'Ticker': a, 'Sector': b}
    df4 = pd.DataFrame(raw_data, columns=['Ticker', 'Sector'])
    df5 = df2.append(df4)
I am currently getting the output below, but obviously the 2nd import, titled 'REITs', is not getting captured.
     Ticker      Sector
0   AVON-GB    Aero&Def
1     BA-GB    Aero&Def
2    COB-GB    Aero&Def
3   MGGT-GB    Aero&Def
4    SNR-GB    Aero&Def
5    ULE-GB    Aero&Def
6     QQ-GB    Aero&Def
7     RR-GB    Aero&Def
8    CHG-GB    Aero&Def
0    GKN-GB  Auto&Parts
How would I go about achieving this? Or is there a better, more Pythonic way of achieving it?
I would do it this way:
import pandas as pd

AA = ['Aero&Def', 'REITs', 'Auto&Parts']

# assuming that the ['Ticker', 'Sector', 'Identifier'] columns are in
# Excel columns 'B,D,E'
xl_cols = 'B,D,E'

dfs = [pd.read_excel('FTSEASX_{0}_Price.xlsx'.format(f),
                     'Price_Data',
                     parse_cols=xl_cols,  # on newer pandas this argument is usecols
                     ).query('Identifier == Identifier')
       for f in AA]

df = pd.concat(dfs, ignore_index=True)
print(df[['Ticker', 'Sector']])
Explanation:
.query('Identifier == Identifier') gives you only those rows where Identifier is NOT NULL, using the fact that NaN == NaN always evaluates to False.
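A quick demonstration of that NaN property:

import numpy as np
import pandas as pd

print(np.nan == np.nan)   # False
s = pd.Series([1.0, np.nan])
print((s == s).tolist())  # [True, False] -> the NaN row fails the query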
PS You don't want to loop through your data frames when working with Pandas...