openpyxl - error importing in Python script - python

I have a python script at work (that I didn't write) which cycles through a folder of SQL scripts, condenses the SQL queries into one line each and adds them to an Excel document (along with other columns). This script always worked fine until this week when my work computer died. I got a new one, installed miniconda and then installed openpyxl by opening Anaconda prompt and:
pip install openpyxl
(pip3 install didn't work).
Most of my Python scripts work fine but this one does not. It is throwing an error:
ImportError: cannot import name 'range' from 'openpyxl.compat' (C:\Users\xxx\AppData\Local\Continuum\miniconda3\lib\site-packages\openpyxl\compat\__init__.py)
I tried to drill down into the libraries/site-packages to see the details, and I don't see 'range' listed anymore in the init file for that path. Maybe they got rid of it? Does anyone know what the suitable alternative is for that? To be honest I can't even tell where it is being used in the script, but when i take out that import statement it runs but it results in a blank Excel output file. So clearly it is used somewhere
import sqlparse
import glob
import sys
import regex as re
import os
import openpyxl
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.compat import range
def main():
path = "C:/my_path\sql_files*"
fileList = glob.glob(path)
TABLE_ID = 0
# TABLE_NM = ''
# STEP_ID = 0
STEP_TYPE = ''
# workFile = open('C:/Dev\DataWarehouseTesting\workfile.txt', 'w')
# wb = openpyxl.load_workbook('workfile.xlsx')
wb = Workbook()
ws1 = wb.active
ws1.title = "auto_tests"
# Adding the headers for the Excel sheet.
ws1['A1'] = 'TABLE_ID'
ws1['B1'] = 'TABLE_NM'
ws1['C1'] = 'TEST_TABLE_NM'
ws1['D1'] = 'STEP_ID'
ws1['E1'] = 'STEP_TYPE'
ws1['F1'] = 'SQL_SCRIPT'
def createSQL(pTABLE_ID, pTABLE_NM, pTEST_TABLE_NM, pSTEP_ID, pSTEP_TYPE, pSqlStr):
pSqlStr = pSqlStr.replace('\'','"')
sliceLeft = pSqlStr[:6]
if sliceLeft == 'SELECT':
pSTEP_TYPE = 'T'
elif sliceLeft != 'SELECT':
pSTEP_TYPE = 'P'
max_row = ws1.max_row
nextRow = str(max_row + 1)
ws1['A' + nextRow] = str(pTABLE_ID)
ws1['B' + nextRow] = pTABLE_NM
ws1['C' + nextRow] = pTEST_TABLE_NM
ws1['D' + nextRow] = str(pSTEP_ID)
ws1['E' + nextRow] = pSTEP_TYPE
ws1['F' + nextRow] = pSqlStr
def createTableTestSQL(mFile, TABLE_ID, TABLE_NM,STEP_ID, STEP_TYPE):
mText = mFile.read()
mSqls = sqlparse.split(mText)
for mSql in mSqls:
STEP_ID += 1
sqlStr = str(mSql.replace('\n',' '))
sqlStr = re.sub('--([^\s]+)',' ',sqlStr)
sqlStr = sqlparse.format(sqlStr, strip_comments=True)
if STEP_ID == 1:
TEST_TABLE_NM = sqlStr.replace("DROP TABLE ", "").replace(";", "")
createSQL(TABLE_ID, TABLE_NM, TEST_TABLE_NM, STEP_ID, STEP_TYPE, sqlStr)
for filename in fileList:
if filename.endswith('.sql'):
mFile = open(filename,'r')
TABLE_ID += 1
TABLE_NM = os.path.split(filename)[1].replace('.sql','')
#TEST_TABLE_NM = ''
STEP_ID = 0
createTableTestSQL(mFile, TABLE_ID, TABLE_NM, STEP_ID, STEP_TYPE)
mFile.close()
wb.save(filename='test_data_TR.xlsx')
if __name__ == "__main__":
main()

Related

How to remove sheets based on array with python openpyxl

I need to remove sheets that names are in array. Unfortunetely this: tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex]) , and this:
del tempWb[wsToRemoveNameArray[wsToRemoveIndex]] dont want to work with my code:
Anyone know how to deal with it?
def splitExcelFiles(InputPath, OutputPath, fileNameArray):
for file in range(0, len(fileNameArray)):
tempFile = InputPath + '\\' +fileNameArray[file]
tempWb = load_workbook(tempFile)
wsToRemoveNameArray = []
if(len(tempWb.sheetnames)==1):
#new wb
tempWb.save(str(OutputPath) + '\\' + str(tempWb.sheetnames) + '.xlsx')
else:
for ws in range (0,len(tempWb.sheetnames)):
newName = tempWb.sheetnames[ws]
wsToRemoveNameArray = []
#copyWs = tempWb.copy_worksheet[ws]
# #This section will save the names to remove other sheets from ws
for wsToRemoveName in range (0,len(tempWb.sheetnames)):
if newName != tempWb.sheetnames[wsToRemoveName]:
#print(tempWb.sheetnames[wsToRemoveName])
wsToRemoveNameArray.append(str(tempWb.sheetnames[wsToRemoveName]))
for wsToRemoveIndex in range (0, len(wsToRemoveNameArray)):
# tem
#tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex])
#del tempWb[wsToRemoveNameArray[wsToRemoveIndex]]
# tempWb.
# print(len(wsToRemoveNameArray))
tempWb.save(str(OutputPath) + '\\' + newName + '.xlsx')
First things first, some general tips:
Use the pathlib library whenever you deal with Paths. It makes things a lot easier. There is never a good reason to include the path delimiter in your code.
In python, it's common to write variables and functions with an underscore: save_path instead of savePath
Now that we have that out of the way, here is a tested example. Just change the directories to match yours.
from openpyxl import load_workbook, Workbook
from pathlib import Path
import shutil
def make_absolute_path(path, name, suffix=".xlsx"):
input_file_path = path / name
return input_file_path.with_suffix(suffix)
def remove_all_sheets_except_filename(input_path, output_path, filenames):
output_files_path = []
for i in range(0, len(filenames)):
input_file_path = make_absolute_path(input_path, filenames[i])
output_file_path = make_absolute_path(output_path, filenames[i])
if not Path.is_file(input_file_path):
print(f"Skipping {input_file_path}: Not valid file " f"path. ")
continue
shutil.copyfile(input_file_path, output_file_path)
wb_source: Workbook = load_workbook(filename=output_file_path)
sheets = wb_source.worksheets
if len(sheets) == 1:
save_path = make_absolute_path(output_path, str(wb_source.sheetnames[0]))
wb_source.save(save_path)
output_files_path.append(str(save_path))
else:
for sheet in wb_source.sheetnames:
if not sheet == input_file_path.stem:
wb_source.remove(wb_source[sheet])
if len(wb_source.worksheets) == 1:
save_path = make_absolute_path(
output_path, str(wb_source.sheetnames[0])
)
wb_source.save(save_path)
output_files_path.append(str(save_path))
else:
print(
f"Failed to process {input_file_path} with following "
f"sheets: {','.join(wb_source.worksheets)}."
)
raise ValueError("")
return output_files_path
def main():
# Adjust to where you have the xlsx files and where you want them
input_directory = Path("path/to/your/xlsx/files")
output_directory = Path("path/to/the/output/directory")
file_names = ["input", "foo", "bar"]
paths = remove_all_sheets_except_filename(
input_directory, output_directory, file_names
)
if __name__ == "__main__":
main()
``

openpyxl error "There is no item named '[Content_Types].xml' in the archive" [duplicate]

This question already has an answer here:
openpyxl problem Keyerror Content_Types.xml
(1 answer)
Closed last year.
I have a problem with openpyxl, when I want to start the script I get this error, until yesterday it worked and now no more, I tried to uninstall the module, but the problem persists, the excel files I deleted it and nowhere is it open. any ideas?
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.styles import Border, Side, PatternFill, Font, GradientFill, Alignment
from openpyxl.styles import colors
from openpyxl.cell import Cell
from termcolor import colored, cprint
from openpyxl.styles import numbers
from os import mkdir
myPath = '.\Erstellte Datein' # initialize the Chrome driver
def excel():
# Writing on a EXCEL FILE
filename = (f"{myPath}/Monatsplan openpytesst.xlsx")
dienstorinfo = 'texttest'
emptycell = ' '
x = len(dienstorinfo)
if x == 0:
dienstorinfo = tagesinfo2
try:
wb = load_workbook(filename)
ws = wb.worksheets[0] # select first worksheet
except FileNotFoundError:
headers_row = ['Datum','Dienst','Funktion','Von','Bis','Schichtdauer','Bezahlte Zeit','Überzeit','Sonnats Zulage','Nachtdienst']
wb = Workbook()
ws = wb.active
wb.save(filename)
ws.append(['1','2','2','4','5'])
wb.close()
for cols in ws.iter_cols( ):
if cols[-1].value:
cols[-1].border = Border(left=Side(style='thin'),right=Side(style='thin'),top=Side(style='thin'),bottom=Side(style='thin'))
ws.column_dimensions['A'].width = 11
ws.row_dimensions['1'].height = 25
ws.column_dimensions['B'].width = 60
ws.column_dimensions['C'].width = 2
ws.column_dimensions['D'].width = 3
ws.column_dimensions['E'].width = 3
ws.column_dimensions['F'].width = 3
ws.column_dimensions['H'].width = 3
ws.column_dimensions['I'].width = 2
ws.column_dimensions['L'].width = 2
wb.save(filename)
wb.close()
excel()
It's either your .xlsx file is corrupt or you are referencing the wrong file.
Get a new .xlsx file or look for the file that is not corrupt and start working, this worked for me.

Using Textract, how do you extract tables from a pdf file and output it into a csv file via .py script?

I want to use textract (via aws cli) to extract tables from a pdf file (located in an s3 location) and export it into a csv file. I have tried writing a .py script but am struggling to read from the file.
Any suggestions for writing the .py script is welcome.
This is my current script. I run into the error:
File "extract-table.py", line 63, in get_table_csv_results
bash: File: command not found
blocks=response['Blocks']
KeyError: 'Blocks'
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
rows = {}
for relationship in table_result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
cell = blocks_map[child_id]
if cell['BlockType'] == 'CELL':
row_index = cell['RowIndex']
col_index = cell['ColumnIndex']
if row_index not in rows:
# create new row
rows[row_index] = {}
# get the text value
rows[row_index][col_index] = get_text(cell, blocks_map)
return rows
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
for relationship in result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
word = blocks_map[child_id]
if word['BlockType'] == 'WORD':
text += word['Text'] + ' '
if word['BlockType'] == 'SELECTION_ELEMENT':
if word['SelectionStatus'] =='SELECTED':
text += 'X '
def get_table_csv_results(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
# get the results
client = boto3.client('textract')
#Response
response = client.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': s3BucketName,
'Name': documentName
}
})
# Get the text blocks
blocks=response['Blocks']
pprint(blocks)
blocks_map = {}
table_blocks = []
for block in blocks:
blocks_map[block['Id']] = block
if block['BlockType'] == "TABLE":
table_blocks.append(block)
if len(table_blocks) <= 0:
return "<b> NO Table FOUND </b>"
csv = ''
for index, table in enumerate(table_blocks):
csv += generate_table_csv(table, blocks_map, index +1)
csv += '\n\n'
return csv
def generate_table_csv(table_result, blocks_map, table_index):
rows = get_rows_columns_map(table_result, blocks_map)
table_id = 'Table_' + str(table_index)
# get cells.
csv = 'Table: {0}\n\n'.format(table_id)
for row_index, cols in rows.items():
for col_index, text in cols.items():
csv += '{}'.format(text) + ","
csv += '\n'
csv += '\n\n\n'
return csv
def main(file_name):
table_csv = get_table_csv_results(file_name)
output_file = 'output.csv'
# replace content
with open(output_file, "wt") as fout:
fout.write(table_csv)
# show the results
print('CSV OUTPUT FILE: ', output_file)
# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"
if __name__ == "__main__":
file_name = sys.argv[1]
main(file_name)
There is a much simpler way using the Amazon Textractor Textractor library. pip install amazon-textract-textractor
This will create a csv per table in your pdf document. e.g output_p0_t0.csv
from textractor import Textractor
def extract_tables(s3_file_path, output_directory, s3_output_path):
extractor = Textractor(profile_name="default")
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
I had to make slight changes to #Thomas answer by importing profile `
extractor = Textractor(profile_name="default") right after importing Textractor as shown below to avoid getting this error -> NameError: name 'textractor' is not defined.
from textractor import Textractor
extractor = Textractor(profile_name="default")
def extract_tables(s3_file_path, output_directory, s3_output_path):
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
Hope it helps someone out there.

TypeError: 'DetectedFace' object is not subscriptable AZURE COGNITIVE FACE

ERROR:
File "identify.py", line 57, in <module>
faceIds.append(face['faceId'])
TypeError: 'DetectedFace' object is not subscriptable
CODE (Below is the code block)
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.face.models import TrainingStatusType, Person, SnapshotObjectType, OperationStatusType
import global_variables as global_var
import os, urllib
import sqlite3
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter, column_index_from_string
from openpyxl.cell import Cell
import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
#get current date
currentDate = time.strftime("%d_%m_%y")
wb = load_workbook(filename = "reports.xlsx")
sheet = wb['Cse15']
def getDateColumn():
for i in range(1, len(list(sheet.rows)[0]) + 1):
col = get_column_letter(i)
if sheet['%s%s'% (col,'1')].value == currentDate:
return col
Key = global_var.key
ENDPOINT = 'https://centralindia.api.cognitive.microsoft.com'
face_client = FaceClient(ENDPOINT,CognitiveServicesCredentials(Key))
connect = sqlite3.connect("Face-DataBase")
attend = [0 for i in range(60)]
currentDir = os.path.dirname(os.path.abspath(__file__))
directory = os.path.join(currentDir, 'Cropped_faces')
for filename in os.listdir(directory):
if filename.endswith(".jpg"):
print(filename)
img_data = open(os.path.join(directory,filename), "rb")
res = face_client.face.detect_with_stream(img_data)
print("Res = {}".format(res))
if len(res) < 1:
print("No face detected.")
continue
faceIds = []
for face in res:
faceIds.append(face['faceId']) #error occuring line
res = face_client.face.identify(faceIds, global_var.personGroupId)
print(filename)
print("res = {}".format(res))
for face in res:
if not face['candidates']:
print("Unknown")
else:
personId = face['candidates'][0]['personId']
print("personid = {}".format(personId))
#cmd = + personId
cur = connect.execute("SELECT * FROM Students WHERE personID = (?)", (personId,))
#print("cur = {}".format(cur))
for row in cur:
print("aya")
print("row = {}".format(row))
attend[int(row[0])] += 1
print("---------- " + row[1] + " recognized ----------")
time.sleep(6)
for row in range(2, len(list(sheet.columns)[0]) + 1):
rn = sheet.cell(row = row, column =1).value
if rn is not None:
print("rn = {}".format(rn))
rn = rn[-2:]
if attend[int(rn)] != 0:
col = getDateColumn()
print("col = {}".format(col))
sheet['%s%s' % (col, str(row))] = 0
wb.save(filename = "reports.xlsx")
This is an automatic attendance system that captures multiple faces and updates it in excel sheet. Azure Cognitive face API is used for face detection.
I hope objects have changed for new API. Thid code contains commends for older version of azure but it has changed now.If anyone know how to solve the error do help.

XLSXWriter refuses to create a second Excel File

I'm working on a program to split excel files into sections of 1000. I can't seem to get it to create a second excel file, as xlsxwriter doesn't create the second file.
from os.path import join, dirname, abspath
from xlrd.sheet import ctype_text
import csv
import os
import sys
import xlrd
import xlsxwriter
import xlwt
file_paths = sys.argv[1:]
draganddrop = ''.join(file_paths)
beginGrab = 0
counting = 0
endGrab = 1000
thousands = 0
if draganddrop == "":
fileName = raw_input("\nInput the file with extension\n>")
else:
fileName = draganddrop
stopPoint = fileName.index('.')
prepRev = fileName[stopPoint:]
preName = fileName[:stopPoint]
if prepRev == ".csv":
excelFile = xlsxwriter.Workbook(preName + '.xlsx')
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
content = csv.reader(f)
for index_col, data_in_col in enumerate(content):
for index_row, data_in_cell in enumerate(data_in_col):
worksheet.write(index_col,index_row,data_in_cell)
excelFile.close()
fileName = (preName + '.xlsx')
delMe = 1
print("Temporary Convert to xlsx done.\n")
stopPoint = fileName.index('.')
prepRev = fileName[0:stopPoint]
fname = join(dirname(abspath(__file__)), fileName)
xl_workbook = xlrd.open_workbook(fname)
sheet_names = xl_workbook.sheet_names()
xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
book = xlwt.Workbook(encoding="utf-8")
worksheet = book.add_sheet("Results", cell_overwrite_ok=True)
workbook = xlrd.open_workbook(fileName)
for sheet in workbook.sheets():
for row in range(sheet.nrows):
row = int(row)
if(int(row)>1000):
subDivide = int(row) / 1000
while(thousands != subDivide + 1):
thousands = thousands + 1
counting = 0
totalName = preName + "_" + str(thousands) + ".xlsx"
print(totalName)
excelFile = xlsxwriter.Workbook(str(totalName))
worksheet = excelFile.add_worksheet()
with open(totalName,'rb') as f:
col = xl_sheet.col_slice(0,1,10101010)
for idx, cell_obj in enumerate(col, start=beginGrab):
counting = counting + 1
if(counting == 1000):
break
cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
cell_obj_str = str(cell_obj)
telePhone = (cell_obj_str[7:19])
worksheet.write(idx+1, 0, "1" + telePhone)
worksheet.write(0,0, "Telephone Number")
beginGrab = thousands * 1000
endGrab = beginGrab + 1000
excelFile.close()
excelFile = None
else:
print("Mate, this is Tiny!")
print ("Ding! Job Done!")
I've been rubber ducking this and I can't find where I'm at fault.
EDIT:
SOLVED!!
By creating a sheet and then closing it, the program can then grasp it. I will probably make a git issue about this.
if prepRev == ".csv":
totalName = preName + '.xlsx'
excelFile = xlsxwriter.Workbook(totalName)
excelFile.close()
Closing it lets open see it while it still contains the same info.
excelFile = xlsxwriter.Workbook(totalName)
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
Doesn't the save/close line need to be within the while loop? Otherwise it looks like it will only save either the first/last item:
while(thousands != subDivide + 1):
# write file
excelFile.close()
that line is probably the reason why you cannot read back your file and your script crashes:
fname = join(dirname(abspath('__file__')), '%s' % fileName)
'__file__' shouldn't have quotes. I'd do:
fname = join(dirname(abspath(__file__)), fileName)

Categories

Resources