I have extracted data from a CSV file and want to use it to update values in a spreadsheet. The dataframe has a column 'ticker' of values. I want to check the existing values in the spreadsheet, and if the CSV has a new value, add it to the spreadsheet.
if df_xls.empty:
    df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[0]]]}), ignore_index=True)
wtw = 1
print(reduced.columns.values)
print(df_xls.columns.values)
for csv_row in reduced:
    for xls_row in df_xls:
        if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
            wtw = 0
            break
        else:
            next(xls_row)
    if wtw == 1:
        df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]}), ignore_index=True)
    next(csv_row)
I am getting a "KeyError: 'ticker'" on the line "if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:". I don't understand the error, given that the column names are correct. The print statements above output:
['ticker' '2021-02-01 shares' '2021-02-01 value']
['ticker']
Thanks in advance.
Edit --
I do not have the code available at a URL, but here is the entirety of the script:
import numpy as np
import pandas as pd

filename = "2021-02-01-FULLREPORT.csv"

##load new information from CSV into dataframe##
df_csv = pd.read_csv(filename)
prefix = filename[0:10]
ticker = df_csv.ticker
shares = df_csv.shares
value = df_csv["market value($)"]
reduced = pd.DataFrame({
    "ticker": ticker,
    prefix + " shares": shares,
    prefix + " value": value
})
##end load new information from CSV into dataframe##

##load excel
from pandas import ExcelWriter
from pandas import ExcelFile
df_xls = pd.read_excel('file.xlsx')

##update ticker list with information saved in reduced##
if df_xls.empty:
    df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[0]]]}), ignore_index=True)
wtw = 1
print(reduced.columns.values)
print(df_xls.columns.values)
for csv_row in reduced:
    for xls_row in df_xls:
        if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
            wtw = 0
            break
        else:
            next(xls_row)
    if wtw == 1:
        df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]}), ignore_index=True)
    next(csv_row)
print(df_xls)
This resolved the KeyError:
##load excel
from pandas import ExcelWriter
from pandas import ExcelFile
df_xls = pd.read_excel('file.xlsx')

##update ticker list saved in reduced##
for csv_row in reduced.index:
    wtw = 1
    print("testing " + reduced.ticker[csv_row])
    for xls_row in df_xls.index:
        print("comparing " + reduced.ticker[csv_row] + "with ")
        print(df_xls.ticker[xls_row])
        if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
            print("match found")
            wtw = 0
            break
    if wtw == 1:
        df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]}), ignore_index=True)
print(df_xls)
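For reference, a vectorized alternative to the nested loops, using isin. This is a minimal sketch assuming both frames have a 'ticker' column; pd.concat is used because DataFrame.append is deprecated in recent pandas:

import pandas as pd

# rows of `reduced` whose ticker is not already present in df_xls
new_rows = reduced.loc[~reduced["ticker"].isin(df_xls["ticker"]), ["ticker"]]

# append all new tickers in one operation
df_xls = pd.concat([df_xls, new_rows], ignore_index=True)
print(df_xls)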
Related
Hey, I wanted a quick way to slice my source XLSX file to collect data from the cell at column index 11, but my script seems to be working very slowly.
The expected output is the collected items from the column index 11 cell whenever the column index 16 cell value is None. The script checks every row from the beginning for whether the column index 16 value is None, but my file has thousands of rows before the script starts collecting data.
Can I speed up this process or find a faster way?
XLUtils.py:
import openpyxl

def getRowCount(file, sheetName):
    workbook = openpyxl.load_workbook(file)
    sheet = workbook.get_sheet_by_name(sheetName)
    return sheet.max_row

def getColumnCount(file, sheetName):
    workbook = openpyxl.load_workbook(file)
    sheet = workbook.get_sheet_by_name(sheetName)
    return sheet.max_column

def readData(file, sheetName, rownum, columnno):
    workbook = openpyxl.load_workbook(file)
    sheet = workbook.get_sheet_by_name(sheetName)
    return sheet.cell(row=rownum, column=columnno).value

def writeData(file, sheetName, rownum, columno, data):
    workbook = openpyxl.load_workbook(file)
    sheet = workbook.get_sheet_by_name(sheetName)
    sheet.cell(row=rownum, column=columno).value = data
    workbook.save(file)
My Script:
import pandas as pd
import XLUtils
from openpyxl import Workbook
from datetime import datetime

def LISTING_GENERATOR():
    # This function creates the product list file for the current day
    i = 1
    i_range = 50
    r = 1
    x = 1
    rows = 50
    LISTED_DATE = XLUtils.readData(PRODUCT_RESEARCH, 'Sheet1', r, 16)
    if LISTED_DATE == None:
        ASIN = XLUtils.readData(PRODUCT_RESEARCH, 'Sheet1', r, 11)
        print(ASIN)
    wb.save('Product_' + TODAY_DATE + '.xlsx')
    print('File has been created: ', FILE)
    for r in range(2, rows + 1):
        CHECK_ASIN = XLUtils.readData(PRODUCT_RESEARCH, 'Sheet1', r, 11)
        if CHECK_ASIN == None:
            print('No more ASIN available')
            break
        FE = XLUtils.readData(PRODUCT_RESEARCH, 'Sheet1', r, 16)
        if FE != None:
            print('Product last added: ', FE)
        if FE == None:
            ASIN = XLUtils.readData(PRODUCT_RESEARCH, 'Sheet1', r, 11)
            print(f'ASIN nr. {i}: {ASIN}')
            LIST.append(ASIN)
            XLUtils.writeData(FILE, 'Sheet', x, 1, ASIN)
            XLUtils.writeData(PRODUCT_RESEARCH, 'Sheet1', r, 16, TODAY_DATE_XLSX)
            x += 1
            i += 1
            if i >= i_range:
                print(f'List of {i_range} items has been created.')
                break
        else:
            print('Error: product already on the list')
    XLUtils.writeData(FILE, 'Sheet', x, 1, ' ')
    print('Created list:\n', LIST)
    print('__________________________________')
    ALL_ITEMS = (TODAY_DATE + '.xlsx')
    print('CSV file has been named: ', ALL_ITEMS)
    DATA_XLS = pd.read_excel(FILE, 'Sheet', dtype=str, index_col=None)
    DATA_XLS.to_csv(PRODUCT_NAME + '.csv', encoding='utf-8', index=False)

if __name__ == '__main__':
    # --- Product_list_generator --- variables --
    wb = Workbook()
    LIST = []
    TODAY_DATE = datetime.today().strftime('%d_%m_%Y')
    TODAY_DATE_XLSX = datetime.today().strftime('%d/%m/%Y')
    PRODUCT_RESEARCH = 'Product_Research_copy2.xlsx'  # <--- xlsx file
    FILE = 'Product_' + TODAY_DATE + '.xlsx'
    PRODUCT_NAME = 'Product_' + TODAY_DATE
    LISTING_GENERATOR()
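The main cost here is that every XLUtils call reloads the whole workbook from disk, and writeData also saves it on every call. A minimal sketch of the load-once pattern, assuming 'Sheet1' has at least 16 columns and using a placeholder date string in place of TODAY_DATE_XLSX:

import openpyxl

# load the workbook once, not once per cell access
research_wb = openpyxl.load_workbook('Product_Research_copy2.xlsx')
research_ws = research_wb['Sheet1']

asins = []
for row in research_ws.iter_rows(min_row=2):
    asin = row[10].value         # column 11 (1-based) holds the ASIN
    listed_date = row[15].value  # column 16 (1-based) holds the listed date
    if asin is None:
        break                    # no more ASINs
    if listed_date is None:
        asins.append(asin)
        row[15].value = '06/05/2021'  # placeholder; use your TODAY_DATE_XLSX here
        if len(asins) >= 50:
            break

research_wb.save('Product_Research_copy2.xlsx')  # one save at the end
print(asins)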
I have a couple of Excel files with different sheet names.
I was wondering if it is possible to iterate through all of the sheet names without giving a variable "sheetname".
Right now it is done through input selection. I tried to search before posting but haven't found or figured out how to make this work; help is appreciated.
import logging
import pandas as pd

logger = logging.getLogger(__name__)  # assumed: a logger configured elsewhere

# Look for sheet names
file = file_name  # file_name is assumed to be defined earlier
df = pd.ExcelFile(file).sheet_names

# Filter sheets
counter = 0
sheets = [""]
for sheet in df:
    if sheet[0] == "Δ" or sheet == "Log Data":
        pass
    else:
        counter += 1
        sheets.append(sheet)
        print(f"{sheets[counter]} - {counter}")

# Sheet selection
try:
    x = int(input("Select Sheet Number: "))
    assert x in range(1, counter + 1), "Select a value from list"
except ValueError as err:
    logger.error(err)
    raise
else:
    df = pd.read_excel(f"{file}", f"{sheets[x]}")
finally:
    print(f"{sheets[x]} Selected")
I am unable to extract MCC details from a PDF. I am able to extract other data with my code.
import tabula.io as tb
from tabula.io import read_pdf

pdf_path = "IR21_SVNMT_Telekom Slovenije d.d._20210506142456.pdf"
df_list = read_pdf(pdf_path, pages='all')  # assumed: df_list is built from the PDF's tables

for df in df_list:
    if 'MSRN Number Range(s)' in df.columns:
        df = df.drop(df.index[0])
        df.columns = df.columns.str.replace('\r', '')
        df.columns = df.columns.str.replace(' ', '')
        df.columns = df.columns.str.replace('Unnamed:0', 'CountryCode(CC)')
        df.columns = df.columns.str.replace('Unnamed:1', 'NationalDestinationCode(NDC)')
        df.columns = df.columns.str.replace('Unnamed:2', 'SNRangeStart')
        df.columns = df.columns.str.replace('Unnamed:3', 'SNRangeStop')
        break

msrn_table = df[['CountryCode(CC)', 'NationalDestinationCode(NDC)', 'SNRangeStart', 'SNRangeStop']]
print(msrn_table)
I am trying to retrieve the "Mobile Country Code (MCC)" details with the same logic, but the pandas data frame shows different data from what is in the PDF.
for df in df_list:
    if 'Mobile Country Code (MCC)' in df.columns:
        break
print(df)
The pandas output does not match the actual content in the PDF file.
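If the MCC table has ruled cell borders, tabula's lattice mode sometimes parses such tables more faithfully. A hedged sketch (lattice=True is a standard tabula-py option, but whether it helps depends on this particular PDF):

from tabula.io import read_pdf

# lattice mode follows ruled table borders instead of whitespace
mcc_tables = read_pdf(pdf_path, pages='all', lattice=True)
for df in mcc_tables:
    if 'Mobile Country Code (MCC)' in df.columns:
        print(df)
        break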
This code works:
import pdfplumber
import re

pattern = re.compile(r'Mobile Network Code \(MNC\)[\r\n]+([^\r\n]+)')
#pattern = re.compile(r'Mobile\sNetwork\sCode\s\(MNC\)')

pdf = pdfplumber.open(pdf_path)
n = len(pdf.pages)
final = ""
for page in range(n):
    data = pdf.pages[page].extract_text()
    final = final + "\n" + data

mcc_mnc = " "
matches = pattern.findall(final)
mcc_mnc = mcc_mnc.join(matches)
mcc = mcc_mnc.split(" ")
actual_mcc = mcc[0]
actual_mnc = mcc[1]
print(actual_mcc)
print(actual_mnc)
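A small variant of the same approach, as a hedged sketch: pdfplumber can be used as a context manager so the file is closed automatically, and extract_text() can return None on empty pages, which is worth guarding against:

import pdfplumber
import re

pattern = re.compile(r'Mobile Network Code \(MNC\)[\r\n]+([^\r\n]+)')

text = ""
with pdfplumber.open(pdf_path) as pdf:  # closes the file handle automatically
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:                   # extract_text can return None
            text += "\n" + page_text

matches = pattern.findall(text)
if matches:
    parts = matches[0].split(" ")
    print(parts[0])  # MCC
    print(parts[1])  # MNC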
Dataset:
https://github.com/Bene939/newsheadlinedatasets
With my program I am labeling my dataset of news headlines. It worked fine until today.
For some reason it won't write the CSV file anymore, although as far as I can see the data frame does get updated.
At around row 4469 of my CSV it started failing to overwrite the file. Then it did overwrite, and then it didn't again, until it stopped overwriting completely at row 4474. It worked fine until now, and if I create a new CSV it will overwrite that one.
I am using Jupyter Notebook. Is there some kind of limit to this? The labeled dataset is around 300 KB.
!pip install pandas
!pip install pathlib

import pandas as pd
from pathlib import Path

# takes data frame and file name & appends it to given csv
def append_df(df, file_name):
    my_file = Path(file_name)
    if my_file.exists():
        print("Appending to existing file named " + file_name)
        orig_df = pd.read_csv(file_name)
        print("Old Data Frame: ")
        print(orig_df)
        new_df = pd.concat([orig_df, df], ignore_index=True).drop_duplicates()
        print("New Data Frame: ")
        print(new_df)
        new_df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')
    else:
        print("Creating new file named " + file_name)
        news_sentiment_df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')

# takes data frame and file name & overwrites given csv
def update_csv(df, file_name):
    print("Overwriting " + file_name)
    df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')

# shows sentence by sentence, labels it according to input and saves it in a new csv file
print("WARNING: EDITING CSV FILE WITH EXCEL MAY CORRUPT FILE\n")
file_name = "news_headlines.csv"
new_file = "news_headlines_sentiment.csv"
news_sentiment_df = pd.DataFrame(columns=["news", "sentiment"])
my_file = Path(file_name)
if my_file.exists():
    df = pd.read_csv(file_name, encoding='utf-8-sig', error_bad_lines=False)
    print("Loaded " + file_name)
    for index, row in df.iterrows():
        user_input = -1
        range = [0, 1, 2]
        while user_input not in range:
            print("####################################################################")
            print(row["news"])
            try:
                user_input = int(input("Negative: 0\nNeutral: 1\nPositive: 2\n"))
            except ValueError as err:
                print("\nPlease enter an Integer!\n")
                pass
        new_element = 0
        # label sentiment according to input
        if user_input == 0:
            new_element = [row["news"], 0]
        elif user_input == 1:
            new_element = [row["news"], 1]
        elif user_input == 2:
            new_element = [row["news"], 2]
        # save labeled sentence to new file
        news_sentiment_df.loc[len(news_sentiment_df)] = new_element
        append_df(news_sentiment_df, new_file)
        # delete data point from original data frame
        index_name = df[df["news"] == row["news"]].index
        df.drop(index_name, inplace=True)
        # update old csv file
        update_csv(df, file_name)
else:
    print("File not Found")
It turned out I was re-adding duplicates while using the drop_duplicates function without noticing it.
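That explains the symptom: once a labeled headline already exists in the CSV, drop_duplicates() removes the re-appended copy, so the file on disk stops changing. A minimal sketch of the effect:

import pandas as pd

orig = pd.DataFrame({"news": ["headline A"], "sentiment": [1]})
again = pd.DataFrame({"news": ["headline A"], "sentiment": [1]})

# the duplicate row is silently dropped, so the merged frame
# (and the CSV written from it) is identical to the old one
merged = pd.concat([orig, again], ignore_index=True).drop_duplicates()
print(len(merged))  # 1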
I have a table I want to print out to a CSV file using pandas. This table was extracted from a different Excel file. The problem is that the table length is unknown. How should I print this table so that all of it shows, and not just on one line?
for x in ABC:
    print()
    print(f"{x}:")
    try:
        df = pd.read_csv(x + "/File.csv")
        df_filter = df[['A', 'B', 'C', "D", "E"]]
        if df_filter['D'].str.contains('Fail').any():
            noSC = df_filter[df_filter.DUTId != 'SC_INFO']
            finalTable = noSC[noSC.D == 'Fail']
            if finalTable.empty:
                print("Did not complete")
                sheet1['A16'] = finalTable
            else:
                filterTable = finalTable[['A', 'B', 'C', "E"]]
                fullfinalTable = filterTable.to_string()
                print(fullfinalTable)
        else:
            print("Run Successful")
    except FileNotFoundError:
        print("File does not exist")
I know that sheet1['A16'] = finalTable is wrong, but I am unsure what to do instead. It does output the table, but only in A16, so it is one long line. Is there any way to have the unknown-length table formatted into the new Excel file?
Try this instead.
from pathlib import Path
import pandas as pd

dir_path = Path(r"yourFolderPath")  # Path object, so .glob() works
files_list = [str(p) for p in dir_path.glob("**/*.csv")]

if files_list:
    source_dfs = [pd.read_csv(file_) for file_ in files_list]
    df = pd.concat(source_dfs, ignore_index=True)
    df = df[['A', 'B', 'C', "D", "E", "DUTId"]]
    if df['D'].str.contains('Fail').any():
        df = df[df.DUTId != 'SC_INFO']
        finalTable = df[df.D == 'Fail']
        if finalTable.empty:
            print("Did not complete. Dataframe is empty.")
        else:
            print("Dataframe written to .csv")
            finalTable = finalTable[['A', 'B', 'C', 'E']]
            finalTable.to_csv(dir_path / "finaltable.csv")
else:
    print(f"No .csv files in {dir_path}")