I am using merge to join two data frames together. The two data frames are data from a database table taken on two different dates. I need to work out what changed. The number of rows will differ, but I just want to join the newer data set to the older data set with an inner join and see what changed.
At the moment, I am taking advantage of the _x and _y suffixes that merge adds to the column names, together with the .columns data, to compare the fields for differences after the merge.
There must be an easier way to do this. I did try the new compare() method in 1.1.0, but it doesn't seem to like frames with a different shape (i.e. different row counts in my case), rendering it useless to me.
def get_changed_records(df_old, df_new, file_info):
    join_keys = file_info["compare_col"].split(",")
    old_file_name = file_info["old_file_name"]
    new_file_name = file_info["new_file_name"]
    print("Changed Records: JOIN DATA FRAMES ON COLUMNS: previous file", old_file_name, "new file", new_file_name)
    columns = df_new.columns
    df_merged = df_new.merge(df_old, how='inner', on=join_keys, indicator=True)
    changed_records = []
    for idx, row in df_merged.iterrows():
        changes = []
        for col in columns:
            if col not in join_keys:
                after_col = col + '_x'
                before_col = col + '_y'
            else:
                after_col = col
                before_col = col
            after_val = row[after_col]
            before_val = row[before_col]
            changed = False
            if pd.isnull(before_val) or pd.isnull(after_val):
                # a null appearing or disappearing is a change; null -> null is not
                changed = pd.isnull(before_val) != pd.isnull(after_val)
            elif after_val != before_val:
                print("COLUMN_CHANGE: ", col, " before ", before_val, " after ", after_val)
                changed = True
            if changed:
                print('-' * 50)
                print('-Adding changes to result...')
                changes.append(['COLUMN_CHANGE', col, before_val, after_val, row, join_keys])
                print(changes)
        if len(changes) > 0:
            changed_records.append(changes)
    print("changed records ", len(changed_records))
    print(changed_records)
    return changed_records
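For reference, here is a minimal sketch of the same _x/_y comparison done with vectorised column operations instead of iterrows(). The suffixes and the shape of the returned change records are assumptions for illustration, not a drop-in replacement for the function above:

import pandas as pd

def diff_after_merge(df_old, df_new, join_keys):
    # inner join on the keys; explicit suffixes instead of the default _x/_y
    merged = df_new.merge(df_old, how="inner", on=join_keys, suffixes=("_new", "_old"))
    changes = []
    for col in df_new.columns:
        if col in join_keys:
            continue
        new_vals = merged[col + "_new"]
        old_vals = merged[col + "_old"]
        # a change is a differing value, or a null appearing/disappearing; null -> null is no change
        changed_mask = ~((new_vals == old_vals) | (new_vals.isna() & old_vals.isna()))
        for idx in merged[changed_mask].index:
            changes.append(["COLUMN_CHANGE", col, old_vals[idx], new_vals[idx],
                            merged.loc[idx, join_keys].tolist()])
    return changes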
How can a non-parallelised version of this code run much faster than a parallel version, when I am using a Manager object to share a list across processes? I did this to avoid any serialisation, and I don't need to edit the list.
I return an 800,000-row data set from Oracle, convert it to a list and store it in shared memory using Manager.list().
I am iterating over each column in the query results, in parallel, to obtain some statistics (I know I could do it in SQL).
Main code:
import cx_Oracle
import csv
import os
import glob
import datetime
import multiprocessing as mp
import get_column_stats as gs
import pandas as pd
import pandas.io.sql as psql

def get_data():
    print("Starting Job: " + str(datetime.datetime.now()))

    manager = mp.Manager()

    # Step 1: Init multiprocessing.Pool()
    pool = mp.Pool(mp.cpu_count())
    print("CPU Count: " + str(mp.cpu_count()))

    dsn_tns = cx_Oracle.makedsn('myserver.net', '1521', service_name='PARIELGX')
    con = cx_Oracle.connect(user='fred', password='password123', dsn=dsn_tns)

    stats_results = [["OWNER", "TABLE", "COLUMN_NAME", "RECORD_COUNT", "DISTINCT_VALUES",
                      "MIN_LENGTH", "MAX_LENGTH", "MIN_VAL", "MAX_VAL"]]

    sql = "SELECT * FROM ARIEL.DIM_REGISTRATION_SET"

    cur = con.cursor()
    print("Start Executing SQL: " + str(datetime.datetime.now()))
    cur.execute(sql)
    print("End SQL Execution: " + str(datetime.datetime.now()))

    print("Start SQL Fetch: " + str(datetime.datetime.now()))
    rs = cur.fetchall()
    print("End SQL Fetch: " + str(datetime.datetime.now()))

    print("Start Creation of Shared Memory List: " + str(datetime.datetime.now()))
    lrs = manager.list(list(rs))  # shared memory list
    print("End Creation of Shared Memory List: " + str(datetime.datetime.now()))

    # we go through every field to collect the column names
    col_names = []
    for field in cur.description:
        col_names.append(field[0])

    print("Start In-Memory Iteration of Dataset: " + str(datetime.datetime.now()))

    # iterate through each column, to gather stats from each column using parallelisation
    pool_results = pool.map_async(gs.get_column_stats_rs,
                                  [(lrs, col_name, col_names) for col_name in col_names]).get()

    for i in pool_results:
        stats_results.append(i)

    # Step 3: Don't forget to close
    pool.close()

    print("End In-Memory Iteration of Dataset: " + str(datetime.datetime.now()))

    cur.close()

    outfile = open(r'C:\jupyter\Experiment\stats_dim_registration_set.csv', 'w')
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writerows(stats_results)
    outfile.close()

    print("Ending Job: " + str(datetime.datetime.now()))

get_data()
Code being called in parallel:
import sys

def get_column_stats_rs(args):
    # rs is a list recordset of the results
    rs, col_name, col_names = args

    col_index = col_names.index(col_name)

    sys.stdout = open("col_" + col_name + ".out", "a")

    print("Starting Iteration of Column: " + col_name)
    max_length = 0
    min_length = 100000  # arbitrarily large number!!
    max_value = ""
    min_value = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"  # arbitrarily "large" string!!
    distinct_value_count = 0

    has_values = False  # does the column have any non-null values
    has_null_values = False
    row_count = 0

    # create a dictionary into which we can add the individual items present in each row of data
    # a dictionary will not let us add the same value more than once, so we can simply count the
    # dictionary values at the end
    distinct_values = {}

    row_index = 0

    # go through every row, for the current column being processed, to gather the stats
    for val in rs:
        row_value = val[col_index]
        row_count += 1

        if row_value is None:
            value_length = 0
        else:
            value_length = len(str(row_value))

        if value_length > max_length:
            max_length = value_length

        if value_length < min_length:
            if value_length > 0:
                min_length = value_length

        if row_value is not None:
            if str(row_value) > max_value:
                max_value = str(row_value)

            if str(row_value) < min_value:
                min_value = str(row_value)

        # capture distinct values
        if row_value is None:
            row_value = "Null"
            has_null_values = True
        else:
            has_values = True
            distinct_values[row_value] = 1

        row_index += 1
    # end row for

    distinct_value_count = len(distinct_values)

    if has_values == False:
        distinct_value_count = None
        min_length = None
        max_length = None
        min_value = None
        max_value = None
    elif has_null_values == True and distinct_value_count > 0:
        distinct_value_count -= 1

    if min_length == 0 and max_length > 0 and has_values == True:
        min_length = max_length

    print("Ending Iteration of Column: " + col_name)

    return ["ARIEL", "DIM_REGISTRATION_SET", col_name, row_count, distinct_value_count, min_length, max_length,
            strip_crlf(str(min_value)), strip_crlf(str(max_value))]
Helper function:
def strip_crlf(value):
    return value.replace('\n', ' ').replace('\r', '')
I am using a Manager.list() object to share state across the processes:
lrs = manager.list(list(rs)) # shared memory list
And I am passing the list to the map_async() method:
pool_results = pool.map_async(gs.get_column_stats_rs, [(lrs, col_name, col_names) for col_name in col_names]).get()
The Manager overhead is adding to your runtime. Also, you're not actually using shared memory here: you're only using a multiprocessing Manager, which is known to be slower than shared memory or a single-threaded implementation. If you don't need synchronisation in your code, meaning that you're not modifying the shared data, just skip the Manager and use shared memory objects directly.
https://docs.python.org/3.7/library/multiprocessing.html
Server process managers are more flexible than using shared memory
objects because they can be made to support arbitrary object types.
Also, a single manager can be shared by processes on different
computers over a network. They are, however, slower than using shared
memory.
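For example, here is a minimal sketch of that suggestion with made-up data (not the asker's code): hand the fetched rows to each worker once through a Pool initializer, so the workers iterate a plain in-process list instead of going through a Manager proxy for every element access.

import multiprocessing as mp

_rows = None  # module-level; populated in each worker by the initializer

def _init_worker(rows):
    # on fork the rows are inherited; on spawn they are pickled once per worker,
    # rather than once per element access as with a Manager proxy
    global _rows
    _rows = rows

def column_stats(args):
    col_name, col_names = args
    col_index = col_names.index(col_name)
    # iterate the plain list directly -- no proxy round trips per row
    values = [row[col_index] for row in _rows]
    return col_name, len(values)

if __name__ == "__main__":
    rs = [("a", 1), ("b", 2)]            # stand-in for cur.fetchall()
    col_names = ["NAME", "VALUE"]
    with mp.Pool(mp.cpu_count(), initializer=_init_worker, initargs=(rs,)) as pool:
        results = pool.map(column_stats, [(c, col_names) for c in col_names])
    print(results)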
I have a few records coming out in my pandas data frame as NaT,
i.e. Date_Refused_Final_Something_of_ICP=NaT,
and it seems to disrupt my script. I would simply like to skip the few records found as NaT and continue with the rest of the records/script.
Below is the complete relevant code:
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    excel_data = pandas.read_excel(self.find_file())
    columns = pandas.DataFrame(columns=excel_data.columns.tolist())
    excel_data = pandas.concat([excel_data, columns])
    excel_data.columns = excel_data.columns.str.strip()
    excel_data.columns = excel_data.columns.str.replace("/", "_")
    excel_data.columns = excel_data.columns.str.replace(" ", "_")
    num_valid_records = 0
    for row in excel_data.itertuples():
        mrn = row.MRN
        if mrn in ("", " ", "N/A", None) or math.isnan(mrn):
            print(f"Invalid record: {row}")
            excel_data = excel_data.drop(excel_data.index[row.Index])
        else:
            num_valid_records += 1
    print(f"Processing #{num_valid_records} records")
    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
I'm not sure where you're encountering this, but you can simply put a continue when it finds one:
if mrn in ("", " ", "N/A", None) or math.isnan(mrn):
    print(f"Invalid record: {row}")
    continue  # <--- if it encounters any of the conditions above, it skips to the next row in your excel_data
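If the goal is just to drop the NaT rows entirely, a hedged alternative is to do it in one pass after the dates are coerced; the tiny frame and column names below are made up for illustration:

import pandas

# a tiny stand-in frame; the real one comes from read_excel in the question
data_frame = pandas.DataFrame({
    "MRN": ["1", "2", "3"],
    "Date_Refused": ["2020-01-01", "not a date", None],
})
date_cols = [c for c in data_frame.columns if "date" in c.lower()]
for col in date_cols:
    data_frame[col] = pandas.to_datetime(data_frame[col], errors="coerce")
# rows with NaT in any date column fall out here instead of tripping up later processing
data_frame = data_frame.dropna(subset=date_cols)
print(data_frame)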
New to both python and openpyxl.
Writing a py script to glom through a ton of Excel workbooks/sheets, and need to find certain cells identified by their border formatting.
I see several examples online of how to set cell borders, but I need to read them.
Specifically, I wish to identify table boundaries when the data within the table is inconsistent, but the table borders are always present. So, I need to identify the cells with:
* top / left borders
* top / right borders
* bottom / left borders
* bottom / right borders
(thin borders). There is only one such table per worksheet.
Could some kind maven point me to a code sample? I would provide my code thus far, but honestly I have no idea how to begin. My code for looping through each worksheet is:
for row in range(1, ws.max_row + 1):
    for col in range(1, ws.max_column + 1):
        tmp = NumToAlpha(col)
        ref = str(tmp) + str(row)

        hasTopBorder = ws[ref].??????     # <=== how do I get a boolean here?
        hasLeftBorder = ws[ref].??????    # <=== how do I get a boolean here?
        hasRightBorder = ws[ref].??????   # <=== how do I get a boolean here?
        hasBottomBorder = ws[ref].??????  # <=== how do I get a boolean here?

        if hasTopBorder == True and hasLeftBorder == True and hasRightBorder == False and hasBottomBorder == False:
            tableTopLeftCell = tmp + str(row)
        elif hasTopBorder == True and hasLeftBorder == False and hasRightBorder == True and hasBottomBorder == False:
            tableTopRightCell = tmp + str(row)
        elif hasTopBorder == False and hasLeftBorder == True and hasRightBorder == False and hasBottomBorder == True:
            tableBottomLeftCell = tmp + str(row)
        elif hasTopBorder == False and hasLeftBorder == False and hasRightBorder == True and hasBottomBorder == True:
            tableBottomRightCell = tmp + str(row)

        if tableTopLeftCell != "" and tableTopRightCell != "" and tableBottomLeftCell != "" and tableBottomRightCell != "":
            break
    if tableTopLeftCell != "" and tableTopRightCell != "" and tableBottomLeftCell != "" and tableBottomRightCell != "":
        break
Comments/suggestions for streamlining this novice code are welcome and gratefully received.
Update:
By querying a cell like this:
tst = sheet['Q17'].border
I see that I get a Border object back - but how do I use it, or convert it into the desired boolean?
Here's one way.
I used is not None because the border style could be thin, double, etc.
refs = {}
nowInmyTable = False

for row in range(1, ws.max_row + 1):
    for col in range(1, ws.max_column + 1):
        tmp = NumToAlpha(col)
        cellRef = str(tmp) + str(row)
        cellBorders = getCellBorders(ws, cellRef)

        if ('T' in cellBorders) or ('L' in cellBorders) or ('R' in cellBorders) or ('B' in cellBorders):
            if 'myTableTopLeftCell' not in refs:
                if ('T' in cellBorders) and ('L' in cellBorders):
                    refs['myTableTopLeftCell'] = (row, col)
                    nowInmyTable = True
            if (nowInmyTable == True) and ('L' not in cellBorders):
                if 'myTableBottomLeftCell' not in refs:
                    refs['myTableBottomLeftCell'] = (row - 1, col)
def getCellBorders(ws, cellRef):
    tmp = ws[cellRef].border
    brdrs = ''
    if tmp.top.style is not None: brdrs += 'T'
    if tmp.left.style is not None: brdrs += 'L'
    if tmp.right.style is not None: brdrs += 'R'
    if tmp.bottom.style is not None: brdrs += 'B'
    return brdrs
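As a quick self-contained check of getCellBorders (the workbook and the cell styling here are made up for illustration):

from openpyxl import Workbook
from openpyxl.styles import Border, Side

wb = Workbook()
ws = wb.active
thin = Side(style="thin")
ws["B2"].border = Border(top=thin, left=thin)   # give one cell a top/left border
print(getCellBorders(ws, "B2"))                 # prints 'TL'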
To identify whether cell "Q17" has a border:
from openpyxl.styles.borders import Border, Side

if sheet['Q17'].border.left.style == "thin":
    print("Left side of cell Q17 has a thin border")
I found a way around using the border object, by converting it into JSON and getting the value of the border style:
import json

t = sheet.cell(1, 1).border
f = json.dumps(t, default=lambda x: x.__dict__)
r = json.loads(f)
s = r['left']['style']
print(s)  # prints the value for the left border style
if s == 'thin':
    # do specific action
    pass
I have more than 200 columns in a CSV file and want to define a model class in Django.
But I really can't find helpful instructions for creating a Django model from a CSV file (the first row indicates the columns).
Is there any way to define a Django model from a CSV file or a pandas dataframe?
I generate the model field text based on the code below:
import numpy as np

for col in speech_data.columns:
    arr = speech_data[col]
    size = 0
    value = ""
    for ar in arr:
        if len(str(ar)) > size:
            size = len(str(ar))
            value = ar
    print(col, size, type(speech_data.iloc[0][col]), ar)
    if type(speech_data.iloc[0][col]) == np.int64:
        print(col + " = " + "models.IntegerField()")
    elif type(speech_data.iloc[0][col]) == str:
        print(col + " = " + "models.CharField(max_length=" + str(size + 3) + ")")
    elif type(speech_data.iloc[0][col]) == np.float64:
        print(col + " = " + "models.FloatField(null=True, blank=True, default=None)")
    elif type(speech_data.iloc[0][col]) == np.bool_:
        print(col + " = " + "models.BooleanField()")
I am trying to create an excel spreadsheet from a pandas.pivot_table.
I don't want to use to_excel as it:
* Renders at half the speed
* Doesn't allow cell formatting
* Doesn't compute cell widths
* Doesn't allow me to create other cells with formulas
* etc.
Instead, I:
* create a dataframe from a list of lists
* convert to a pivot table with pivot_table
* convert to records with to_records
Now I can create a worksheet, but I need the column headers. The index names I can save, since they were specified when creating the pivot table, but how do I get the inferred column names deduced from the distinct values?
import pandas
import datetime
import logging
import math
import random

from dateutil.relativedelta import relativedelta

import xlsxwriter

logging.basicConfig(level=logging.INFO)

def get_sales_data(*, num_records, num_custs, num_products, date_from, number_of_months):
    print("number of months %s" % number_of_months)
    sales = {}
    rows = []
    for cust_nbr in range(0, num_custs):
        cust_name = "cust " + "{0:0>3}".format(cust_nbr)
        for month_delta in range(0, number_of_months):
            ship_date = date_from + relativedelta(months=month_delta)
            for product_nbr in range(0, num_products):
                product = "product " + str(product_nbr)
                qty = random.randint(0, 20)
                if qty > 0:
                    key = (cust_name, product, ship_date)
                    shipment = (cust_name, product, ship_date, qty)
                    sales[key] = shipment
    for shipment in sales.values():
        rows.append(shipment)
    return rows

def to_excel(workbook, sheetname, dataframe):
    worksheet = workbook.add_worksheet(sheetname)
    row_index = 1
    max_widths = [None] * len(dataframe[0])
    for row in dataframe:
        col_index = 0
        for datum in row:
            if datum is not None:
                if isinstance(datum, float):
                    if not math.isnan(datum):
                        worksheet.write(row_index, col_index, datum)
                else:
                    worksheet.write(row_index, col_index, datum)
                if max_widths[col_index] is None or len(str(datum)) > max_widths[col_index]:
                    max_widths[col_index] = len(str(datum))
            col_index += 1
        row_index += 1
    col_index = 0
    for width in max_widths:
        worksheet.set_column(col_index, col_index, width + 1)
        col_index += 1

# Get a list of lists
from_date = datetime.date(2015, 1, 1)
to_date = datetime.date(2017, 7, 1)
matrix = get_sales_data(num_records=3000, num_products=3, date_from=from_date,
                        number_of_months=30, num_custs=1000)

# Get a dataframe
labels = ["cust_name", "product", "ship_date", "qty"]
dataframe = pandas.DataFrame.from_records(matrix, columns=labels)
print(dataframe)

# Get the pivot
pivot_table = pandas.pivot_table(dataframe, columns='ship_date', values='qty', index=['cust_name', 'product'])
print(pivot_table)

# Convert back to records
records = pivot_table.to_records()

# Get the Excel output
output = open("/tmp/crosstab.xlsx", "wb")
workbook = xlsxwriter.Workbook(output)
to_excel(workbook, "Cust Product By Month", records)
workbook.close()
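Continuing the script above, one way to answer the header question: the index names and the inferred month columns are both available as standard pandas attributes on the pivot, and to_records() carries the same labels as field names.

# the keys given to pivot_table: ['cust_name', 'product']
index_names = list(pivot_table.index.names)
# the inferred columns: one distinct ship_date per month present in the data
month_cols = list(pivot_table.columns)
header = [str(name) for name in index_names + month_cols]
print(header)
# to_records() exposes the same labels as the recarray's field names
print(records.dtype.names)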