Problem setting values in an Excel file with PyPDF2 and openpyxl - python

I have the following code:
import PyPDF2
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.worksheet.cell_range import CellRange
# Define the Excel file
file = 'C:\\Users\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx'
wb = openpyxl.load_workbook(file)# open the workbook
ws = wb['Hoja1']# select the sheet named 'Hoja1'
# Highest row number holding a non-empty cell in column A and in column B
max_row_for_a = max((a.row for a in ws['A'] if a.value is not None))
max_row_for_b = max((b.row for b in ws['B'] if b.value is not None))
# Loop over column A from row 2 down; each cell value is used as a PDF path
for row in ws.iter_rows(min_row=2, max_col=1, max_row=max_row_for_a):
pdf= row[0].value
print (pdf)
# Loop over column B from row 2 down
for row2 in ws.iter_rows(min_row=2, max_col=2, max_row=max_row_for_b, min_col=2):
# NOTE(review): this value is replaced by extractText() a few lines below before it is used
extracto = row2[0].value
pdfselect=open(pdf,"rb")
leer = PyPDF2.PdfFileReader(pdfselect)
pagina = leer.getPage(0)
# NOTE(review): the extracted text is only bound to a local name; no cell is assigned, so nothing reaches the sheet
extracto = pagina.extractText()
print(extracto)
wb.save("PRUEBA_PDF2.xlsx")
# NOTE(review): no call parentheses -- this line only references the method, it does not call it
wb.close
The idea is to read the PDF file names from column A of the Excel sheet and write the text extracted from each PDF into column B. But when I run the code nothing is written, and no error is shown either. I tried a minimal case like the one below and had no problems:
#pdfselect=open("ejemplo.pdf","rb")
#leer = PyPDF2.PdfFileReader(pdfselect)
#pagina = leer.getPage(0)
#sheet = wb.active
#ws['B2'] = pagina.extractText()
#wb.save("PRUEBA_PDF2.xlsx")
#wb.close
What am I doing wrong?
Greetings!

I found the problem. The correct code should look like this:
import PyPDF2
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.worksheet.cell_range import CellRange
# Define the Excel file
file = 'C:\\Users\\lorrego\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx'
wb = openpyxl.load_workbook(file)  # open the workbook
ws = wb['Hoja1']  # select sheet 1
# Last row of column A that holds a PDF path. default=1 keeps an empty
# column from raising ValueError; the loop below then has nothing to do.
max_row_for_a = max((a.row for a in ws['A'] if a.value is not None), default=1)
# Each row is (column A cell, column B cell); row 1 is skipped as the header.
for row in ws.iter_rows(min_row=2, max_col=2, max_row=max_row_for_a):
    pdf = row[0].value
    if pdf is None:
        # Blank cell inside the used range of column A: there is no file to open.
        continue
    # The context manager closes the PDF as soon as its text has been read,
    # so long path lists do not exhaust file handles.
    with open(pdf, "rb") as pdfselected:
        leer = PyPDF2.PdfFileReader(pdfselected)
        pagina = leer.getPage(0)
        # Write the first page's text into column B of the same row.
        row[1].value = pagina.extractText()
# NOTE(review): the workbook is loaded from `file` but saved to a different
# literal path (no user folder) -- confirm which location is intended.
wb.save("C:\\Users\\Desktop\\PYTHON_PDF\\PRUEBA_PDF.xlsx")
wb.close()

Related

openpyxl error "There is no item named '[Content_Types].xml' in the archive" [duplicate]

This question already has an answer here:
openpyxl problem Keyerror Content_Types.xml
(1 answer)
Closed last year.
I have a problem with openpyxl: when I start the script I get this error. It worked until yesterday and now it no longer does. I tried uninstalling the module, but the problem persists. I deleted the Excel files and they are not open anywhere. Any ideas?
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.styles import Border, Side, PatternFill, Font, GradientFill, Alignment
from openpyxl.styles import colors
from openpyxl.cell import Cell
from termcolor import colored, cprint
from openpyxl.styles import numbers
from os import mkdir
# NOTE(review): '\E' is not a recognised escape sequence, so the backslash is kept as-is; a raw string would make that explicit
myPath = '.\Erstellte Datein' # folder used to build the workbook path below
def excel():
# Write to an Excel file
filename = (f"{myPath}/Monatsplan openpytesst.xlsx")
dienstorinfo = 'texttest'
emptycell = ' '
x = len(dienstorinfo)
if x == 0:
# NOTE(review): tagesinfo2 is not defined anywhere in this snippet
dienstorinfo = tagesinfo2
# Open the existing workbook; create a new one only when the file is missing
try:
wb = load_workbook(filename)
ws = wb.worksheets[0] # select first worksheet
# NOTE(review): only a missing file is handled; an existing but empty or damaged file presumably fails inside load_workbook with a different exception -- confirm
except FileNotFoundError:
headers_row = ['Datum','Dienst','Funktion','Von','Bis','Schichtdauer','Bezahlte Zeit','Überzeit','Sonnats Zulage','Nachtdienst']
wb = Workbook()
ws = wb.active
wb.save(filename)
ws.append(['1','2','2','4','5'])
wb.close()
# Put a thin border around the last cell of every column that has a value
for cols in ws.iter_cols( ):
if cols[-1].value:
cols[-1].border = Border(left=Side(style='thin'),right=Side(style='thin'),top=Side(style='thin'),bottom=Side(style='thin'))
ws.column_dimensions['A'].width = 11
# NOTE(review): the key is the string '1'; confirm whether row_dimensions expects an integer row index
ws.row_dimensions['1'].height = 25
ws.column_dimensions['B'].width = 60
ws.column_dimensions['C'].width = 2
ws.column_dimensions['D'].width = 3
ws.column_dimensions['E'].width = 3
ws.column_dimensions['F'].width = 3
ws.column_dimensions['H'].width = 3
ws.column_dimensions['I'].width = 2
ws.column_dimensions['L'].width = 2
wb.save(filename)
wb.close()
excel()
Either your .xlsx file is corrupt or you are referencing the wrong file.
Get a new .xlsx file, or find a copy of the file that is not corrupt, and work with that; this worked for me.

How to highlight an Excel row when there is a string value in a cell on a row using Openpyxl?

I'm trying to highlight a row in a spreadsheet (generated from my split_values dataframe) when there is a value in the cell under the "New Record ID" column. The absolute reference for that column is 'J'.
Below is my most recent attempt at doing this:
# Group entries by client name and create team spreadsheet
split_values = submitted_and_revised['Client Name'].unique()
print(split_values)
for value in split_values:
teams = submitted_and_revised[submitted_and_revised['Client Name'] == value]
output_file_name = "Team_" + str(value) + ".xlsx"
# NOTE(review): these imports sit after the `for` line above, so they appear to be re-executed on every iteration
from openpyxl import Workbook
from openpyxl.styles import numbers, PatternFill, colors
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment
from openpyxl.formatting.rule import Rule
from openpyxl.styles.differential import DifferentialStyle
workbook = Workbook()
sheet = workbook.active
############################################################
# This section works in this location, but I can't figure out how to make it conditional on the New Record ID in rule.formula. I'm also not sure what sheet.conditional_formatting.add("A1:O100", rule) means...
red_background = PatternFill(bgColor=colors.RED)
diff_style = DifferentialStyle(fill=red_background)
rule = Rule(type="expression", dxf=diff_style)
# NOTE(review): the formula text contains a Python expression (working_revised[...]); it is stored as a plain string and never evaluated by Python
rule.formula = ["$J1==working_revised['New Record ID']"]
sheet.conditional_formatting.add("A1:O100", rule)
############################################################
#Format Column Widths
#sheet.column_dimensions['C'].auto_size = True
sheet.column_dimensions['B'].width = float(18)
# Column C is assigned twice; the second value (25.25) is the one that remains
sheet.column_dimensions['C'].width = float(5)
sheet.column_dimensions['C'].width = float(25.25)
sheet.column_dimensions['D'].width = float(20)
sheet.column_dimensions['E'].width = float(6)
sheet.column_dimensions['G'].width = float(65)
sheet.column_dimensions['H'].width = float(20)
sheet.column_dimensions['I'].width = float(14)
sheet.column_dimensions['J'].width = float(14)
sheet.column_dimensions['K'].width = float(50)
# Append the header and data rows of this client's dataframe to the sheet
for row in dataframe_to_rows(teams, index=False, header=True):
sheet.append(row)
# Vertically centre and wrap every cell, header row included
for rows in sheet.iter_rows(min_row=1, max_row=None, min_col=None, max_col=None):
for cell in rows:
cell.alignment = Alignment(vertical='center',wrapText=True)
workbook.save(output_file_name)
Thanks in advance for any help you can offer!
Thank you Dror Av. for all the support you offered on this question. I wasn't able to get your answer to work. I'm still extremely new to coding in general, and I'm sure I was missing something simple. I played with the code and finally came upon the below solution.
It doesn't work perfectly because in addition to the "New Record ID" rows that are highlighted, it also highlights the header row. Despite trying to fix that piece, I haven't been able to. For what it's worth, here is the solution I came up with:
# Import needed modules
from openpyxl import Workbook
from openpyxl.formatting.rule import Rule
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.styles import Font, PatternFill, colors, Alignment
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
# One workbook per distinct client name
split_values = submitted_and_revised['Client Name'].unique()
print(split_values)
for value in split_values:
    teams = submitted_and_revised[submitted_and_revised['Client Name'] == value]
    output_file_name = "Team_" + str(value) + ".xlsx"
    workbook = Workbook()
    sheet = workbook.active

    # Column widths, keyed by column letter
    for letter, width in (
        ('A', 3), ('B', 18), ('C', 25), ('D', 20), ('E', 10),
        ('G', 50), ('H', 15), ('I', 32), ('J', 25), ('K', 50),
    ):
        sheet.column_dimensions[letter].width = float(width)

    # Red background for every row of A1:Y100 whose column J cell is not blank.
    # The range starts at row 1, so a header row with text in J matches as well.
    red_background = PatternFill(bgColor=colors.RED)
    diff_style = DifferentialStyle(fill=red_background)
    rule = Rule(type="expression", dxf=diff_style)
    rule.formula = ["NOT(ISBLANK($J1))"]
    sheet.conditional_formatting.add("A1:Y100", rule)

    # Header row first, then the data rows of this client's dataframe
    for data_row in dataframe_to_rows(teams, index=False, header=True):
        sheet.append(data_row)

    # Centre vertically and wrap text in every populated cell
    for sheet_row in sheet.iter_rows(min_row=1):
        for cell in sheet_row:
            cell.alignment = Alignment(vertical='center', wrapText=True)

    workbook.save(output_file_name)
First, don't import in a loop, or you will have a bad day. Imports go on the top of your code, see the PEP8 guidelines for more info on that and other guidelines on styling Python code.
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment, PatternFill, colors
from openpyxl.formatting.rule import FormulaRule
# Distinct client names; one output file name is derived from each below
split_values = submitted_and_revised['Client Name'].unique()
print(split_values)
# In this excerpt the workbook is created once, before the loop
workbook = Workbook()
sheet = workbook.active
for value in split_values:
# Rows belonging to the current client
teams = submitted_and_revised[submitted_and_revised['Client Name'] == value]
output_file_name = "Team_" + str(value) + ".xlsx"
Second, you do not have to create the rule on your own, as you are using a simple FormulaRule. Now you just need to apply the rule to all the cells in the row:
# Append the header and data rows of the current team's dataframe.
for row in dataframe_to_rows(teams, index=False, header=True):
    sheet.append(row)
# Row 1 is the header, so styling starts at row 2.
for rows in sheet.iter_rows(min_row=2):
    for cell in rows:
        cell.alignment = Alignment(vertical='center', wrapText=True)
if sheet.max_row >= 2:
    # One rule for the whole data range instead of one rule per cell.
    # The formula is written for the first row of the range: "$J" pins the
    # column and the row number is relative, so every row tests its own J cell.
    # (A plain string with "{cell.row}" in it is never interpolated, which is
    # why the rule is spelled out against row 2 here.)
    last_cell = sheet.cell(row=sheet.max_row, column=sheet.max_column).coordinate
    sheet.conditional_formatting.add(
        f"A2:{last_cell}",
        FormulaRule(formula=["NOT(ISBLANK($J2))"], fill=red_background),
    )
At the end it should look like this: whenever the cell J{row} is not blank (that is, the row has a "New Record ID"), that specific row will have a red background. Do note that you are creating a new workbook with each iteration of the loop; if that isn't your intention, move:
workbook = Workbook()
sheet = workbook.active
out of the loop.
Final code:
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment, PatternFill, colors
from openpyxl.formatting.rule import FormulaRule
# Write one "Team_<client>.xlsx" workbook per client and give a red background
# to every data row whose column J ("New Record ID") is not blank.
split_values = submitted_and_revised['Client Name'].unique()
print(split_values)
red_background = PatternFill(bgColor=colors.RED)
centered_wrapped = Alignment(vertical='center', wrapText=True)
# Column widths applied to every generated sheet, keyed by column letter.
column_widths = {
    'B': 18, 'C': 25.25, 'D': 20, 'E': 6, 'G': 65,
    'H': 20, 'I': 14, 'J': 14, 'K': 50,
}
for value in split_values:
    teams = submitted_and_revised[submitted_and_revised['Client Name'] == value]
    output_file_name = "Team_" + str(value) + ".xlsx"
    # A fresh workbook per client, so rows from one team never end up in
    # another team's file.
    workbook = Workbook()
    sheet = workbook.active
    for letter, width in column_widths.items():
        sheet.column_dimensions[letter].width = float(width)
    for row in dataframe_to_rows(teams, index=False, header=True):
        sheet.append(row)
    # Row 1 is the header, so styling starts at row 2.
    for rows in sheet.iter_rows(min_row=2):
        for cell in rows:
            cell.alignment = centered_wrapped
    if sheet.max_row >= 2:
        # One rule for the whole data range instead of one rule per cell.
        # The formula is written for the first row of the range: "$J" pins
        # the column and the row number is relative, so every row tests its
        # own J cell. A plain (non-f) string containing "{cell.row}" would be
        # written to the file literally and never match.
        last_cell = sheet.cell(row=sheet.max_row, column=sheet.max_column).coordinate
        sheet.conditional_formatting.add(
            f"A2:{last_cell}",
            FormulaRule(formula=["NOT(ISBLANK($J2))"], fill=red_background),
        )
    workbook.save(output_file_name)
Output should look something like this (ignore the random values):

How to paste to a specific column with python excel

I want to copy and paste data from a CSV into an Excel workbook so I can later filter that table. I have done all these steps in VBA, but I've noticed that VBA can be buggy, so I want to migrate to Python.
I have converted the CSV to an xlsx file and I have successfully copied the converted xlsx file into the Excel document.
My question is: how do I copy and paste to a specific starting column? I have other data that I need to copy starting at cell AN1.
I have tried the below. I am able to write to one specific cell, but I want to paste all of the data...
# Walk every cell of the source sheet
for row in ws1:
for cell in row:
# NOTE(review): this expression only reads K1's value; nothing is assigned to ws2
ws2['K1'].value
#ws2[cell.coordinate].value = cell.value
wb2.save(path2)
Entirety...
## csv to xlsx
from openpyxl import Workbook
import csv
wb = Workbook()
ws = wb.active
# Append every CSV record as one worksheet row
with open('C:/B.csv', 'r') as f:
for row in csv.reader(f):
ws.append(row)
wb.save('C:/B.xlsx')
###### COPY FROM B to existing E workbook
import openpyxl as xl
path1 = 'C:/B.xlsx'
path2 = 'C:/E.xlsx'
# First worksheet of the source and of the destination workbook
wb1 = xl.load_workbook(filename=path1)
ws1 = wb1.worksheets[0]
wb2 = xl.load_workbook(filename=path2)
ws2 = wb2.worksheets[0]
#ws2 = wb2.create_sheet(ws1.title)
#cell.value = ['A2']
for row in ws1:
for cell in row:
# NOTE(review): row=1, column=1 is constant, so every source value is written to the same destination cell
ws2.cell(row=1, column=1).value = cell.value
wb2.save(path2)
Copying columns between two different workbooks using openpyxl could be done as follows:
import openpyxl
# Copy column B of B.xlsx into column AN of E.xlsx, row for row.
wb1 = openpyxl.load_workbook('B.xlsx')
ws1 = wb1.active
wb2 = openpyxl.load_workbook('E.xlsx')
ws2 = wb2.active
# The loop is driven by the source column only. Pairing it with ws2['AN:AN']
# through zip() would stop at the destination sheet's last used row and
# silently drop the remaining source rows.
for (src,) in ws1.iter_rows(min_col=2, max_col=2):
    ws2[f'AN{src.row}'].value = src.value
wb2.save('E.xlsx')
For a range of columns, the following would work:
import openpyxl
# Copy columns A:I of B.xlsx into columns AN:AV of E.xlsx, row for row.
wb1 = openpyxl.load_workbook('B.xlsx')
ws1 = wb1.active
wb2 = openpyxl.load_workbook('E.xlsx')
ws2 = wb2.active
# Column AN is the 40th column; source column A lands there, B in AO, and so on.
first_target_column = 40
# The loop is driven by the source range only. Pairing it with ws2['AN:AV']
# through zip() would stop at the destination sheet's last used row and
# silently drop the remaining source rows.
for row in ws1.iter_rows(min_col=1, max_col=9):
    for offset, cell_src in enumerate(row):
        target = ws2.cell(row=cell_src.row, column=first_target_column + offset)
        target.value = cell_src.value
wb2.save('E.xlsx')
# Iterate over the row numbers 1..max_row of the source sheet
for row in range(1, ws1.max_row + 1):
#for cell in row:
# Group columns A through D and mark the group as hidden
ws1.column_dimensions.group('A', 'D', hidden=True)
# NOTE(review): sheet, i, k and val are not defined anywhere in this snippet
sheet.cell(row=i + 2, column=k + 1).value = val
wb2.save(path2)
Should do it
Unfortunately, the solutions provided did not work for me, and VBA is off the table. I am using openpyxl and the code above raised an error. Ideally I would like to copy to a new column, but that is beyond my skill. Instead, use the code below and then use Excel formulas to get the data where you want it. I will have to spend about 4 hours redesigning my Excel workbook, but I suppose it is worth it, as I am unable to find a workaround.
## csv to xlsx
from openpyxl import Workbook
import csv
# Convert C:/B.csv into C:/B.xlsx, one worksheet row per CSV record.
wb = Workbook()
ws = wb.active
# The drive-letter colon is required ('C/B.csv' would be a relative path).
# newline='' is how the csv module expects its input file to be opened, so
# line breaks embedded in quoted fields are parsed correctly.
with open('C:/B.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        ws.append(row)
wb.save('C:/B.xlsx')
###### COPY FROM B to existing E workbook
import openpyxl as xl
# Mirror every cell of the first sheet of B.xlsx into the same coordinate of
# the first sheet of E.xlsx, then save E.xlsx in place.
path1 = 'C:/B.xlsx'
path2 = 'C:/E.xlsx'
wb1 = xl.load_workbook(filename=path1)
wb2 = xl.load_workbook(filename=path2)
ws1 = wb1.worksheets[0]
ws2 = wb2.worksheets[0]
for source_row in ws1:
    for source_cell in source_row:
        target_cell = ws2[source_cell.coordinate]
        target_cell.value = source_cell.value
wb2.save(path2)

Data validation using openpyxl isn't writing to file - code enclosed

The code to actually write each file runs great. The problem I'm having is that the data validation piece doesn't appear to be doing anything. No drop downs are being created in the range I'm referencing.
Thanks in advance for any and all assistance!
%%time
import pandas as pd
import xlsxwriter as ew
import csv as csv
import os
import glob
import openpyxl
# Remove existing files from the output directory
# NOTE(review): the "#filename" placeholders below stand in for redacted paths; as pasted they comment out the rest of their lines
files = glob.glob(#filename)
for f in files:
os.remove(f)
pendpath = #filename
df = pd.read_sas(pendpath)
allusers = df.UserID_NB.unique()
listuserpath = #filename
listusers = pd.read_csv(listuserpath)
# User ids as stripped strings
listusers = listusers['USER_ID'].apply(lambda x: str(x).strip())
# One workbook per user id, holding only that user's rows
for id in listusers:
x = df.loc[df['UserID_NB']==id]
path = #filename
x.to_excel(path, sheet_name = str(id), index = False)
# Re-open the file just written so a validation can be added with openpyxl
from openpyxl import load_workbook
wb = openpyxl.load_workbook(filename = path)
sheet = wb.get_sheet_by_name(str(id))
maxrow = sheet.max_row
from openpyxl.worksheet.datavalidation import DataValidation
# NOTE(review): showDropDown=True reportedly hides the in-cell arrow rather than showing it -- confirm against the openpyxl validation docs
dv = DataValidation(type="list", formula1='"Yes,No"', allow_blank=False, showDropDown = True)
rangevar = 'R1:T'+ str(maxrow)
dv.ranges.append(rangevar)
# NOTE(review): dv is never attached to `sheet` before the workbook is saved
wb.save(path)
# Python 2 print statement
print str(id), rangevar
Code for Basic Sheet
import openpyxl
# Minimal reproduction: a new workbook with a Yes/No list validation for A1
wb = openpyxl.Workbook()
ws = wb.active
# NOTE(review): `sheet` is not defined in this snippet; the worksheet variable above is `ws`
sheet.title = 'testsheet'
path = '#filepath'
from openpyxl.worksheet.datavalidation import DataValidation
dv = DataValidation(type="list", formula1='"Yes,No"', allow_blank=False, showDropDown = True)
dv.ranges.append('A1')
# NOTE(review): dv is never attached to the worksheet before the workbook is saved
wb.save(path)
You forgot to add the dv to the worksheet.
>>> # Add the data-validation object to the worksheet
>>> ws.add_data_validation(dv)
Read the docs about validation

append rows in Excel using XLWT in Python

How do I find the total number of rows using XLWT or XLRD in Python? I have an Excel file (accounts.xls) and would like to append rows to it.
I am getting this error: AttributeError: 'Sheet' object has no attribute 'write'
from xlrd import open_workbook
from xlwt import Workbook
def saveWorkSpace(fields,r):
# Open the existing workbook with xlrd and take its first sheet
wb = open_workbook('accounts.xls')
ws = wb.sheet_by_index(0)
# The r argument is overwritten here with the sheet's row count, then incremented
r = ws.nrows
r += 1
# wb is rebound to a new xlwt Workbook; ws still refers to the xlrd sheet obtained above
wb = Workbook()
# Write the name, phone and email fields into columns 0-2 of row r
ws.write(r,0,fields['name'])
ws.write(r,1,fields['phone'])
ws.write(r,2,fields['email'])
wb.save('accounts.xls')
# Python 2 print statement
print 'Wrote accounts.xls'
Here is the solution to the above question:
import xlrd
import xlwt
from xlutils.copy import copy
def saveWorkSpace(fields, path='accounts.xls'):
    """Append one account record below the last row of the first sheet.

    The workbook at *path* is read with xlrd, copied into a writable
    workbook with xlutils, extended by one row and saved back to *path*.

    Args:
        fields: mapping with the keys 'name', 'phone' and 'email'.
        path: workbook to update; defaults to 'accounts.xls'.

    Raises:
        KeyError: if one of the three keys is missing from *fields*.
    """
    rb = xlrd.open_workbook(path, formatting_info=True)
    # Rows are written with 0-based indices, so the current row count is
    # already the index of the first free row.
    r = rb.sheet_by_index(0).nrows
    wb = copy(rb)
    sheet = wb.get_sheet(0)
    for column, key in enumerate(('name', 'phone', 'email')):
        sheet.write(r, column, fields[key])
    wb.save(path)
    # Call syntax with a single argument prints the same text under
    # Python 2 and Python 3.
    print('Wrote %s' % path)
Python program to add values after the last data row of an Excel sheet.
from xlwt import Workbook
from xlrd import open_workbook
import openpyxl
# Function to get the last RowCount in the Excel sheet , change the index of the sheet accordingly to get desired sheet.
def getDataColumn():
    """Find the row after the last one in the first sheet and write to it.

    Prints the computed row number, then calls writedata() with that row
    and column 1.
    """
    first_sheet = open_workbook('C:\\Temp\\exp\\data.xlsx').sheet_by_index(0)
    # One past the sheet's row count
    next_row = first_sheet.nrows + 1
    print(next_row)
    writedata(next_row, 1)
#Data to specified cells.
def writedata(rowNumber, columnNumber):
    """Write 'Appended Data' into one cell of 'Sheet1' and save the workbook.

    Args:
        rowNumber: 1-based row index of the target cell.
        columnNumber: 1-based column index of the target cell.

    Raises:
        KeyError: if the workbook has no sheet named 'Sheet1'.
    """
    # Single definition of the path so load and save cannot drift apart.
    path = 'C:\\Temp\\exp\\data.xlsx'
    book = openpyxl.load_workbook(path)
    # Indexing the workbook by sheet name replaces the deprecated
    # get_sheet_by_name().
    sheet = book['Sheet1']
    sheet.cell(row=rowNumber, column=columnNumber).value = 'Appended Data'
    book.save(path)
    print('saved')
getDataColumn()
exit()

Categories

Resources