Comparing two DataFrames and getting differences - Python

I want to compare two DataFrames and print out the differences in a selective way. Here is what I want to accomplish, in pictures:
DataFrame 1: [image]
DataFrame 2: [image]
Desired output - DataFrame 3: [image]
What I have tried so far:
import pandas as pd
import numpy as np

df1 = pd.read_excel("01.xlsx")
df2 = pd.read_excel("02.xlsx")

def diff_pd(df1, df2):
    """Identify differences between two pandas DataFrames"""
    assert (df1.columns == df2.columns).all(), \
        "DataFrame column names are different"
    if any(df1.dtypes != df2.dtypes):
        print("Data types are different, trying to convert")
        df2 = df2.astype(df1.dtypes)
    if df1.equals(df2):
        return None
    else:
        # need to account for np.nan != np.nan returning True
        diff_mask = (df1 != df2) & ~(df1.isnull() & df2.isnull())
        # stack into a Series indexed by (row, column) and keep only the True entries
        ne_stacked = diff_mask.stack()
        changed = ne_stacked[ne_stacked]
        changed.index.names = ['id', 'Naziv usluge']
        # look up the old and new values at each differing location
        difference_locations = np.where(diff_mask)
        changed_from = df1.values[difference_locations]
        changed_to = df2.values[difference_locations]
        return pd.DataFrame({'Service Previous': changed_from, 'Service Current': changed_to},
                            index=changed.index)

df3 = diff_pd(df1, df2)
df3 = df3.fillna(0)
df3 = df3.reset_index()
print(df3)
To be fair, I found that code in another thread, and it does get the job done, but I still have some issues:
My dataframes are not equal; what do I do?
I don't fully understand the code I provided.
Thank you!

How about something easier to start with ...
Try this
import pandas as pd

data1 = {'Name': ['Tom', 'Bob', 'Mary'], 'Age': [20, 30, 40], 'Pay': [10, 10, 20]}
data2 = {'Name': ['Tom', 'Bob', 'Mary'], 'Age': [40, 30, 20]}

df1 = pd.DataFrame.from_records(data1)
df2 = pd.DataFrame.from_records(data2)

# Checking Columns
for col in df1.columns:
    if col not in df2.columns:
        print(f"DF2 Missing Col {col}")

# Check Col Values
for col in df1.columns:
    if col in df2.columns:
        # Ok we have the same column
        if list(df1[col]) == list(df2[col]):
            print(f"Columns {col} are the same")
        else:
            print(f"Columns {col} have differences")
It should output:

DF2 Missing Col Pay
Columns Name are the same
Columns Age have differences

Python 3.6+ is needed for the f-strings; otherwise change the string formatting.
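For what it's worth, newer pandas (1.1+) also has a built-in for exactly this kind of diff: DataFrame.compare. A minimal sketch with the toy frames above (compare requires identically-labeled frames, so it is restricted to the shared columns here):

import pandas as pd  # requires pandas >= 1.1 for DataFrame.compare

data1 = {'Name': ['Tom', 'Bob', 'Mary'], 'Age': [20, 30, 40], 'Pay': [10, 10, 20]}
data2 = {'Name': ['Tom', 'Bob', 'Mary'], 'Age': [40, 30, 20]}
df1 = pd.DataFrame.from_records(data1)
df2 = pd.DataFrame.from_records(data2)

# compare() needs identically-labeled frames, so restrict to the shared columns;
# by default only differing rows/columns are kept, as 'self' vs 'other'
shared = df1.columns.intersection(df2.columns)
print(df1[shared].compare(df2[shared]))
# roughly:
#   Age
#  self other
# 0   20    40
# 2   40    20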

Related

Pandas Conditional formatting by comparing the column values of dataframe

import io
import pandas as pd

csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''

df = pd.read_csv(io.StringIO(csv_data), sep=",")
html_table = df.to_html()
Is there a way to compare the values of columns in a dataframe and use that in conditional formatting? I want to compare whether the 'prod', 'pre-prod' and 'stage' values are mismatching; if yes, then the row's bg-color should be red. I have tried the following methods present in pandas, but none of them worked.
df.style.apply()
df.style.apply_index()
df.style.applymap()
Current output: [image]
Desired output: [image]
You can add style conditionally by applying style to a subset of your dataframe like:
import io
import pandas as pd

csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''

def add_color(row):
    return ['background-color: red'] * len(row)

df = pd.read_csv(io.StringIO(csv_data), sep=",")

# keep only the rows where the three environments do NOT all match, and colour them
df.loc[~((df["pre-prod"] == df["prod"]) & (df["prod"] == df["stage"]))].style.apply(add_color, axis=1)
import io
import pandas as pd

csv_data = '''
App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
'''

df = pd.read_csv(io.StringIO(csv_data), sep=",")

def match_checker(row):
    # no styling when all three environments agree, red otherwise
    if row['prod'] == row['pre-prod'] == row['stage']:
        return [''] * len(row)
    else:
        return ['background-color: red'] * len(row)

df = df.style.apply(match_checker, axis=1)
html_table = df.to_html()

with open('testpandas.html', 'w+') as html_file:
    html_file.write(html_table)  # the with block closes the file automatically
Updated @PeterSmith's answer.
It's also possible to style the entire DataFrame in one go by passing axis=None to apply.
We can identify rows which have differing values in the specified columns by comparing the first column (column 0) with the remaining columns (columns 1-2), identifying unequal values using ne on axis=0.
df[['prod', 'stage']].ne(df['pre-prod'], axis=0)
#     prod  stage
# 0  False  False
# 1  False   True
Then we can check across rows for any rows which have any True values (meaning there is something that's not equal in the row).
df[['prod', 'stage']].ne(df['pre-prod'], axis=0).any(axis=1)
# 0    False
# 1     True
# dtype: bool
We can then simply apply the styles anywhere there's a True value in the resulting Series.
Altogether this could look something like:
def colour_rows_that_dont_match(df_: pd.DataFrame, comparison_cols: List[str]):
    # Sanity check that comparison_cols is what we expect
    assert isinstance(comparison_cols, list) and len(comparison_cols) > 1, \
        'Must be a list and provide at least 2 columns to compare'
    # Create an empty DataFrame to hold styles, of the same shape as the original df
    styles_df = pd.DataFrame('', index=df_.index, columns=df_.columns)
    # Compare the first comparison column's values to the remaining columns.
    # Find rows where any values are not equal (ne)
    rows_that_dont_match = df_[comparison_cols[1:]].ne(df_[comparison_cols[0]], axis=0).any(axis=1)
    # Apply styles to rows which meet the above criteria
    styles_df.loc[rows_that_dont_match, :] = 'background-color: red'
    return styles_df
df.style.apply(
    colour_rows_that_dont_match,
    # This gets passed to the function
    comparison_cols=['pre-prod', 'prod', 'stage'],
    # Apply to the entire DataFrame at once
    axis=None
).to_html(buf='test_df.html')
Which produces the following: [image]
Setup, version, and imports:
from typing import List

import pandas as pd  # version 1.5.2

df = pd.DataFrame({
    'App_name': ['matching-image', 'mismatching-image'],
    'pre-prod': ['nginx', 'nginx'],
    'prod': ['nginx', 'nginx'],
    'stage': ['nginx', 'nginx:1.23.3-alpine']
})

If duplicate row, update rows to 0 in PySpark

I need to update the values in the DF.EMAIL column to 0 if there are duplicate values in that column.
Generate the DF:

data = [('2345', 'leo#gmai.com'),
        ('2398', 'leo#hotmai.com'),
        ('2398', 'leo#hotmai.com'),
        ('2328', 'leo#yahoo.con'),
        ('3983', 'leo#yahoo.com.ar')]

Serialize the DF:

df = sc.parallelize(data).toDF(['ID', 'EMAIL'])

# show DF
df.show()
Partial Solution
from pyspark.sql.functions import count, when

# create a column with value 0 if there are no duplicates,
# 1 if there are duplicates
df_join = df.join(
    df.groupBy(df.columns).agg((count("*") > 1).cast("int").alias("duplicate_indicator")),
    on=df.columns,
    how="inner"
)

# blank out EMAIL where duplicates were flagged
df1 = df_join.withColumn(
    "EMAIL",
    when(df_join.duplicate_indicator == 1, "")
    .otherwise(df_join.EMAIL)
)
Syntax-wise, this looks more compact, but yours might perform better.

from pyspark.sql import Window
from pyspark.sql.functions import col, count, when

df = (df.withColumn('count', count('*').over(Window.partitionBy('ID')))
        .withColumn('EMAIL', when(col('count') > 1, '').otherwise(col('EMAIL'))))
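For reference, with the sample data above both rows for ID 2398 get blanked, and the helper count column can then be dropped; a quick check (row order in show() may differ) could look like:

df.drop('count').show()
# +----+----------------+
# |  ID|           EMAIL|
# +----+----------------+
# |2345|    leo#gmai.com|
# |2398|                |
# |2398|                |
# |2328|   leo#yahoo.con|
# |3983|leo#yahoo.com.ar|
# +----+----------------+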

TypeError: 'float' object is not subscriptable

I haven't found anything similar, so... I have 2 DataFrames with the same Gene names but different p-values, for example: [image]
I am trying to run over CombinedB's values in the "pvalue" column (numeric) and, if they are >= 0.05, to continue to CombinedA's values in the "pvalue" column (numeric) which are <= 0.00005. I mustn't concat them.
EDITED:
df = pd.read_csv("CombinedA.csv")
df = df['pvalue']
df1 = pd.read_csv("CombinedB.csv")
df1= df1['pvalue']
for i in df1:
if i >= 0.05:
while True:
for i in df:
if i <= 0.00005:
print(i)
Now it just runs non-stop. I think it prints only the "df" part.
Here you are reading the table. You then overwrite df1 and get a Series of the values.
df1 = pd.read_csv("CombinedB.csv")
df1= df1['pvalue']
Here you are iterating over the Series of your values. These values are of type float.
for i in df1:
You are treating your float value as a dictionary. This is throwing the error.
if i['df1'] in df1 >= 0.05:
You probably meant to write:
if i >= 0.05:
You are repeating the same mistake a couple more times.
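As for the program running non-stop: the while True loop never breaks, so it repeats the inner for loop forever. A minimal sketch of the nested loop as presumably intended, simply dropping it:

for i in df1:
    if i >= 0.05:
        for j in df:  # separate loop variable so the outer i isn't clobbered
            if j <= 0.00005:
                print(j)

That said, the vectorised filtering below avoids the explicit loops entirely.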
df = pd.read_csv("Combined.csv", index_col = ["Gene"])
df = df['pvalue']
df.where(df <= 0.005, inplace = True)
df = df.replace(r'', np.NaN).dropna()
# Filter CombinedA
dfA = pd.read_csv("CombinedA.csv", index_col = ["Gene"])
dfA = dfA['pvalue']
dfA.where(dfA >= 0.05, inplace = True)
dfA = dfA.replace(r'', np.NaN).dropna()
df = df[df.index.isin(dfA.index)]
df.to_csv("CombinedRest.csv")
print(df)
This one is working.

Test if Pandas column is datetime type

I'm trying to fillna per column with a suitable variable. My goal is to try find the column type at the highest level of generality: basically, at the moment it is either numeric (int/float), string, or pandas Timestamp. I understand that I can detect numeric or string using numpy.issubdtype and the type hierarchy, but I haven't found a way to detect Timestamp. My solution uses iloc[0] and isinstance, but is there something better? Here is my code, roughly:
for col in df:
    if np.issubdtype(df[col].dtype, np.number):
        df[col] = df[col].fillna(-1)
    elif isinstance(df[col].iloc[0], pd.datetime):
        df[col] = df[col].fillna(pd.to_datetime('1900-01-01'))
    else:
        df[col] = df[col].fillna('NaN')
(Note that I can't use df.loc[0, col] because my index doesn't always contain 0.)
For me, np.issubdtype(df[col].dtype, np.datetime64) does what you want.
So taking everything together, we have:
def df_fillna(df):
    for col in df:
        if np.issubdtype(df[col].dtype, np.number):
            df[col] = df[col].fillna(-1)
        elif np.issubdtype(df[col].dtype, np.datetime64):
            df[col] = df[col].fillna(pd.to_datetime('1900-01-01'))
        else:
            df[col] = df[col].fillna('NaN')
    return df
An example. Input:
df_test = pd.DataFrame()
df_test['dates'] = [pd.to_datetime("2009-7-23"), pd.to_datetime("2011-7-7"), pd.NaT]
df_test = df_fillna(df_test)
Output:
       dates
0 2009-07-23
1 2011-07-07
2 1900-01-01
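As an aside, pandas also ships a helper for this kind of dtype check; unlike a plain np.datetime64 test, pd.api.types.is_datetime64_any_dtype also covers timezone-aware columns. A minimal sketch:

import pandas as pd

s = pd.Series(pd.to_datetime(['2009-07-23', '2011-07-07', None]))
print(pd.api.types.is_datetime64_any_dtype(s))                        # True
print(pd.api.types.is_datetime64_any_dtype(s.dt.tz_localize('UTC')))  # True (tz-aware too)
print(pd.api.types.is_datetime64_any_dtype(pd.Series([1.0, 2.0])))    # False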

Rename unnamed multiindex columns in Pandas DataFrame

I created this dataframe:
import pandas as pd
columns = pd.MultiIndex.from_tuples([("x", "", ""), ("values", "a", "a.b"), ("values", "c", "")])
df0 = pd.DataFrame([(0,10,20),(1,100,200)], columns=columns)
df0
I write df0 to an Excel file:
df0.to_excel("test.xlsx")
and load it again:
df1 = pd.read_excel("test.xlsx", header=[0,1,2])
df1
And I get "Unnamed: ..." column names.
To make df1 look like the initial df0, I run:
def rename_unnamed(df, label=""):
    for i, columns in enumerate(df.columns.levels):
        columns = columns.tolist()
        for j, row in enumerate(columns):
            if "Unnamed: " in row:
                columns[j] = ""
        df.columns.set_levels(columns, level=i, inplace=True)
    return df

rename_unnamed(df1)
Well done. But is there any out-of-the-box pandas way to do this?
Since pandas 0.21.0, the code should be like this:
def rename_unnamed(df):
    """Rename unnamed column names for a pandas DataFrame.

    See https://stackoverflow.com/questions/41221079/rename-multiindex-columns-in-pandas

    Parameters
    ----------
    df : pd.DataFrame object
        Input dataframe

    Returns
    -------
    pd.DataFrame
        Output dataframe
    """
    for i, columns in enumerate(df.columns.levels):
        columns_new = columns.tolist()
        for j, row in enumerate(columns_new):
            if "Unnamed: " in row:
                columns_new[j] = ""
        if pd.__version__ < "0.21.0":  # https://stackoverflow.com/a/48186976/716469
            df.columns.set_levels(columns_new, level=i, inplace=True)
        else:
            df = df.rename(columns=dict(zip(columns.tolist(), columns_new)),
                           level=i)
    return df
Mixing the answers from @jezrael and @dinya, and limited to pandas above 0.21.0 (after 2017), an option to solve this would be:
import numpy as np

for i, columns_old in enumerate(df.columns.levels):
    columns_new = np.where(columns_old.str.contains('Unnamed'), '-', columns_old)
    df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
You can use numpy.where with a condition from str.contains:
for i, col in enumerate(df1.columns.levels):
    columns = np.where(col.str.contains('Unnamed'), '', col)
    df1.columns.set_levels(columns, level=i, inplace=True)

print(df1)
   x values     
          a    c
        a.b     
0  0     10   20
1  1    100  200
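Note that set_levels(..., inplace=True) was deprecated and later removed, so on pandas 2.x the loop above no longer works as written. A sketch of an alternative for pandas 2.x that rebuilds the column MultiIndex directly (assuming string level values):

import pandas as pd

# blank out every 'Unnamed: ...' placeholder by rebuilding the MultiIndex
df1.columns = pd.MultiIndex.from_tuples(
    [tuple('' if str(level).startswith('Unnamed:') else level for level in col)
     for col in df1.columns]
)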
