Python-docx delete table code not working as expected - python

I am trying to do the following tasks:
Open a DOCX file using python-docx library
Count the # of tables in the DOCX: table_count = len(document.tables)
The function read_docx_table() extracts the table, creates a dataframe.
My objective here is as following:
Extract ALL tables from the DOCX
Find the table that is empty
Delete the empty table
Save the DOCX
My code is as follows:
import pandas as pd
from docx import Document
import numpy as np

# Load the document to clean up and record how many tables it contains.
document = Document('tmp.docx')
table_count = len(document.tables)
table_num= table_count  # NOTE(review): assigned but never read again below
print(f"Number of tables in the Document is: {table_count}")
nheader=1  # number of header rows expected in each table
i=0  # index of the table currently being examined by the loop further down
def read_docx_table(document, table_num=1, nheader=1):
    """Extract table number `table_num` (1-based) of `document` as a DataFrame.

    nheader chooses how many leading rows become column labels:
    1 -> plain header, 2 -> two-level MultiIndex, >2 -> unsupported
    (prints a message and yields an empty frame).
    """
    tbl = document.tables[table_num-1]
    rows = [[c.text for c in r.cells] for r in tbl.rows]
    frame = pd.DataFrame(rows)
    if nheader == 1:
        # promote the first row to the column labels
        frame = frame.rename(columns=frame.iloc[0]).drop(frame.index[0]).reset_index(drop=True)
    elif nheader == 2:
        top, bottom = frame.iloc[0], frame.iloc[1]
        multi = pd.MultiIndex.from_tuples(list(zip(top, bottom)))
        frame = pd.DataFrame(rows, columns=multi).drop(frame.index[[0,1]]).reset_index(drop=True)
    elif nheader > 2:
        print("More than two headers. Not Supported in Current version.")
        frame = pd.DataFrame()
    return frame
def Delete_table(table):
    """Best-effort removal of table number `table` from the module-level `document`.

    Any failure is printed rather than raised.
    """
    print(f" We are deleting table now. Table index is {table}")
    print(f"Type of index before casting is {type(table)}")
    idx = int(table)
    print(f"Type of index is {type(idx)}")
    try:
        print("Deleting started...")
        # detach the table's XML element from its parent node
        element = document.tables[idx]._element
        element.getparent().remove(element)
    except Exception as e:
        print(e)
# Scan every table, delete the ones whose data rows are all blank, then save.
while (i < table_count):
    print(f"Dataframe table number is {i} ")
    # NOTE(review): read_docx_table() subtracts 1 from its table_num argument,
    # but i starts at 0 here — the first iteration reads document.tables[-1],
    # so each df is offset by one from the table deleted below.
    df = read_docx_table(document,i,nheader)
    df = df.replace('', np.nan)  # blank cells -> NaN so dropna() can see them
    print(df)
    if (df.dropna().empty):  # True when every data row has at least one blank cell
        print(f'Empty DataFrame. Table Index = {i}')
        print('Deleting Empty table...')
        #Delete_table(i)
        try:
            # detach the table's XML element from the document body
            document.tables[i]._element.getparent().remove(document.tables[i]._element)
            print("Table deleted...")
        except Exception as e:
            print(e)
    else:
        print("DF is not empty...")
        print(df.size)
    # NOTE(review): i is incremented even after a deletion, while the remaining
    # tables shift down one slot — the table following a deleted one is skipped.
    i+=1
document.save('OUT.docx')
My INPUT docx has 3 tables:
But, my CODE gives me the following Output:
It is keeping the empty table and deleting the non-empty table.
Is there something I am missing? I am doubting my logic to check the Table is empty using if (df.dropna().empty):

The df.dropna().empty check treats a table as empty when every non-header row contains at least one blank cell. Is that the intent? If so, then it seems okay to me.
Two points:
The docx library does not necessarily return the tables in the order they exist in the document.
When you delete a table with your code, you immediately skip the next one to be returned (which might not be the next one in the document due to the above) because you increment your counter after deleting. That results in not processing all the tables.
As I worked through the logic, I did some rearrangements to understand it. I think you might also have been getting some exceptions being emitted on indexing into the tables after deleting? I included my edits below - hopefully they help.
import pandas as pd
from docx import Document
import numpy as np
def read_docx_table(document, table_num, header_rows):
    """Return table `table_num` (0-based) of `document` as a DataFrame.

    header_rows: 1 -> first row becomes the columns; 2 -> first two rows
    become a MultiIndex; anything else is unsupported and yields an empty
    DataFrame.
    """
    raw = [[cell.text for cell in row.cells]
           for row in document.tables[table_num].rows]
    frame = pd.DataFrame(raw)
    if header_rows == 1:
        frame = frame.rename(columns=frame.iloc[0]).drop(frame.index[0]).reset_index(drop=True)
    elif header_rows == 2:
        top, sub = frame.iloc[0], frame.iloc[1]
        columns = pd.MultiIndex.from_tuples(list(zip(top, sub)))
        frame = pd.DataFrame(raw, columns=columns).drop(frame.index[[0,1]]).reset_index(drop=True)
    else: # header_rows not 1 or 2
        print("More than two headers. Not Supported in Current version.")
        frame = pd.DataFrame()
    return frame
def table_is_empty(document, table_num, header_rows):
    """True when every data row of the table contains at least one blank cell."""
    frame = read_docx_table(document, table_num, header_rows).replace('', np.nan)
    return frame.dropna().empty
def delete_table(document, table_num):
    """Remove table `table_num` from `document` by detaching its XML element."""
    element = document.tables[table_num]._element
    element.getparent().remove(element)
document = Document('tmp.docx')  # input file to clean up
header_rows = 1  # every table in this document carries one header row
table_count = len(document.tables)
print(f"Number of tables in the input document is: {table_count}")
table_num = 0
# Only advance the index when nothing was deleted: a deletion shifts the
# remaining tables down one slot, so the same index must be re-checked.
while table_num < table_count:
    if table_is_empty(document, table_num, header_rows):
        delete_table(document, table_num)
        table_count = len(document.tables)  # refresh the bound after removal
    else:
        table_num += 1
print(f"Number of tables in the output document is: {table_count}")
document.save('OUT.docx')

Related

every time i run my code in python it removes the column heading and switch it with number also it adds columns on the left

This is my code below:
import pandas as pd
import numpy as np


def _forward_fill_column(df, col):
    """Propagate each non-null value in column `col` down to the next non-null row.

    Rows above the first non-null value are left untouched; `df` is mutated
    in place.  (The original script pasted this loop three times verbatim.)
    """
    splitter = np.where(~df[col].isnull())[0]
    for i in range(len(splitter)):
        try:
            end = splitter[i + 1]
        except IndexError:  # last segment runs to the end of the frame
            end = len(df)
        val = df[col].iloc[splitter[i]]
        df.iloc[splitter[i]:end, col] = val


# header=None + skiprows=1 discards the header row and numbers the columns.
df = pd.read_csv('DAQ1.csv', header=None, skiprows=1)
print(df)

# Fill columns 8, 7 and 10 in the same order as the original three loops.
for column in (8, 7, 10):
    _forward_fill_column(df, column)
print(df)

df['furnace_power'] = df[8] * df[7] * 0.52    # current * voltage * factor
df['heat_pump_power'] = df[10] * 230 * 0.62   # fixed 230 V supply
addy1 = df['furnace_power'].sum()
addy2 = df['heat_pump_power'].sum()
print(df)
print("Sum of furnace power: ", addy1)
print("Sum of heat pump power: ", addy2)
df.to_csv('DAQ1.csv') #(saving the file)
this is the original file
this is the file after I run the code
If you use header=None then pandas treats the first row (which holds the headers) as an ordinary data row, so you get numbers instead of headers — which is also why you then have to skip the first row.
Simply skip arguments
df = pd.read_csv('DAQ1.csv')
If you want to save without the index column then disable it.
df.to_csv('DAQ1.csv', index=False)
Doc: read_csv(), to_csv()
EDIT:
Minimal working example.
I use io.StringIO only to create example file at start.
import pandas as pd
# create example file in memory
import io

text = '''A,B,C
1,2,3
4,5,6
7,8,9
'''
file_object = io.StringIO(text)

# read with headers
df = pd.read_csv(file_object)
print(df)

# save without index.  NOTE: to_csv returns None, so do NOT rebind df to its
# result (the original `df = df.to_csv(...)` left df holding None).
df.to_csv('DAQ1.csv', index=False)

# read again to check index
df = pd.read_csv('DAQ1.csv')
print(df)

table extraction: adding column with file name and path of origin file to df

i need to extract the same table out of multiple docx report documents.
In the list 'targets_in_dir' I have stored all the file names with paths in the format
'C:\directory\subdirectory\filename1.docx'
The code below perfectly grabs the table out of the document and correctly allocates the keys to the columns.
import pandas as pd
import docx
from docx.api import Document

# Pull the third table out of one report and tag every row with its source file.
document = Document(targets_in_dir[1])
table = document.tables[2]

data = []
keys = None
for index, row in enumerate(table.rows):
    cell_texts = (cell.text for cell in row.cells)
    if index == 0:
        keys = tuple(cell_texts)  # header row supplies the dict keys
    else:
        data.append(dict(zip(keys, cell_texts)))

df = pd.DataFrame(data)
df['report'] = targets_in_dir[1]  # provenance column: originating file path
print (targets_in_dir[1])
My question: For tracking purpose I want to add a column to the final df where in each line the filename where the row was pulled is added. I tried to do it with the line
df['report'] = targets_in_dir[1]
but strangely it only adds the data from 'data_1' instead of the filename and path!
report
data_1
C:\directory\subdirectory\filename1.docx
Cumarin
C:\directory\subdirectory\filename1.docx
Piperacin
Meanwhile I found a solution myself with the following line of code. i just add str
df['report'] = str(targets_in_dir[1])

How to store data in new rows automatically using python's pandas and numpy library

I want to store data in the 2nd row of the CSV file. but it's overriding the 1st row each time. How to do it? I used a loop for storing data in the 2nd row. but it's not happening. Is there anything I am missing?
import pandas as pd
import numpy as np
a=1
def table_driven_agent(percept):
    """Look up `percept` in the hard-coded table and return the agent's action."""
    table = [['A', 'B'], ['clean', 'dirty']]  # [locations, statuses]
    percepts = [percept]  # percept history, kept only for bookkeeping
    action = tableDrivenVaccum(percept, table)
    # a 'dirty' lookup result is refined into a movement instruction
    return dirty(percept[1]) if action == 'dirty' else action
def tableDrivenVaccum(percept, table):
    """Return the second entry of the first row of `table` whose first entry
    equals percept[0]; None when nothing matches."""
    matches = (entry[1] for entry in table if entry[0] == percept[0])
    return next(matches, None)
def dirty(percept):
    """Map a location ('A' or 'B') to the instruction to move to the other one."""
    moves = {'A': 'Go to B', 'B': 'Go to A'}
    return moves.get(percept)
# Read percepts from stdin and write each resulting action to index.csv.
while( a<10):
    percept = list(map(str,input("Enter the status and position:").split(',')))
    action = table_driven_agent(percept)
    print(action)
    dict = {'action': action, 'percept': percept}  # NOTE(review): shadows the builtin `dict`
    df= pd.DataFrame(dict)
    # mode defaults to 'w', so the file is rewritten from scratch each pass —
    # this is the overwrite the question asks about (fix: mode='a').
    df.to_csv(r'index.csv', index =False, header=True)
    # NOTE(review): `a` is reset to 1 rather than incremented, so `a < 10`
    # stays true and the loop never terminates on its own.
    a=1
Writing to csv creates a new one and replace current csv. That's why it writes into the first row: it actually delete previous and writes into blank file.
What you are looking for is to append df into current csv. That can be done by:
df.to_csv('index.csv', index=False, mode='a', header=True)
Notice I added mode='a'. It means mode=append. Default mode is 'w', means "write" (overwrite).
That way df is being added to the last row of current csv.
If your csv has more then 2 rows and you want to change only second row, You should load all your csv to a dataframe first. Then you can change only second row (df.loc[1,:]=...) and then save all of it to csv (df.to_csv(r'index.csv', index =False, header=True)
Ok, I solved it myself by reading the old CSV file, merging in the new data, and writing it back to the CSV file. You have to create a CSV file index.csv like mine, with the column names percept and action, for this to work; without that CSV file in your code's location it will give an error. It will automatically merge the latest input into the CSV file without deleting any row.
NOTE: index.csv, percept, and action are just examples. You can use your own names as well.
import pandas as pd
import numpy as np
i = 1
def table_driven_agent(percept):
    """Return the vacuum agent's action for a single percept."""
    lookup_table = [['A', 'B'], ['clean', 'dirty']]  # [locations, statuses]
    history = [percept]  # percept log, retained only for bookkeeping
    action = tableDrivenVaccum(percept, lookup_table)
    if action != 'dirty':
        return action
    # refine a 'dirty' result into a movement command
    return dirty(percept[1])
def tableDrivenVaccum(percept, table):
    """Scan `table` for a row whose first entry equals percept[0] and return
    that row's second entry; None when no row matches."""
    for candidate in table:
        if candidate[0] == percept[0]:
            return candidate[1]
    return None
def dirty(percept):
    """Translate a location into the instruction to visit the other square."""
    if percept == 'B':
        return 'Go to A'
    if percept == 'A':
        return 'Go to B'
# Same agent loop, but each new record is merged with the existing CSV so
# earlier rows survive (requires index.csv to already exist with matching columns).
while i < 6:
    percept = list(map(str,input("Enter the status and position:").split(',')))
    action = table_driven_agent(percept)
    print(action)
    df_old =pd.read_csv('index.csv')  # previously saved rows
    newData=[[percept,action]]  # one new [percept, action] record
    colNames =df_old.columns  # reuse the existing header
    df_new = pd.DataFrame(data=newData, columns=colNames)
    df_complete = pd.concat([df_old,df_new],axis=0)
    df_complete.to_csv('index.csv',index=False)
    #dict = {'percept': percept, 'action': action}
    #df= pd.DataFrame(dict)
    #df.to_csv(r'index.csv', index =False, header=True,mode='a')
    # NOTE(review): i is reset to 1 each pass, so `i < 6` never becomes false
    # and the loop does not terminate on its own.
    i =1

Write tables from Word (.docx) to Excel (.xlsx) using xlsxwriter

I am trying to parse a word (.docx) for tables, then copy these tables over to excel using xlsxwriter.
This is my code:
from docx.api import Document
import xlsxwriter

# Copy every table of the Word document into one worksheet.
document = Document('/Users/xxx/Documents/xxx/Clauses Sample - Copy v1 - for merge.docx')
tables = document.tables
wb = xlsxwriter.Workbook('C:/Users/xxx/Documents/xxx/test clause retrieval.xlsx')
Sheet1 = wb.add_worksheet("Compliance")
index_row = 0  # next worksheet row to write
print(len(tables))
for table in document.tables:
    data = []
    keys = None
    for i, row in enumerate(table.rows):
        text = (cell.text for cell in row.cells)
        if i == 0:
            keys = tuple(text)  # first row supplies the dict keys
            continue
        row_data = dict(zip(keys, text))
        data.append(row_data)
        #print (data)
        #big_data.append(data)
    # NOTE(review): this write sits outside the row loop, so only the LAST
    # row_data of each table reaches the sheet, serialized as one dict string —
    # which matches the wrong output described below; the fix writes
    # cell-by-cell inside the row loop instead.
    Sheet1.write(index_row,0, str(row_data))
    index_row = index_row + 1
    print(row_data)
wb.close()
This is my desired output:
However, here is my actual output:
I am aware that my current output produces a list of string instead.
Is there anyway that I can get my desired output using xlsxwriter? Any help is greatly appreciated
I would go using pandas package, instead of xlsxwriter, as follows:
from docx.api import Document
import pandas as pd

document = Document("D:/tmp/test.docx")
tables = document.tables

# Collect every row of every table, then build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
rows = []
for table in document.tables:
    for row in table.rows:
        rows.append([cell.text for cell in row.cells])

df = pd.DataFrame(rows, columns=["Column1", "Column2"])
df.to_excel("D:/tmp/test.xlsx")
print(df)  # `print df` was Python 2 syntax and fails on Python 3
Which outputs the following that is inserted in the excel:
>>>
Column1 Column2
0 Hello TEST
1 Est Ting
2 Gg ff
This is the portion of my code update that allowed me to get the output I want:
# Fragment of the asker's final fix: write each cell into its own column
# inside the row loop, so every row (not just the last) lands in the sheet.
# `block` is the docx table being copied; `Sheet1` and `index_row` are the
# worksheet and row cursor from the script above — TODO confirm against the
# full script.
for row in block.rows:
    for x, cell in enumerate(row.cells):
        print(cell.text)
        Sheet1.write(index_row, x, cell.text)  # one worksheet cell per docx cell
    index_row += 1
Output:

User Input pd.read_excel gives "ValueError: Invalid file path or buffer object type" - Pandas

I have list of excel files that are read into pandas dataframes. However, some files (dataframes) have different headers in different rows. Therefore, I would like to have a user input, which will help me to set dataframe headers for each DataFrame.
Lets say my first (Excel file) dataframe looks like this,
0 245 867
1 Reddit Facebook
2 ColumnNeeded ColumnNeeded
3 RedditInsight FacbookInsights
4 RedditText FacbookText
Now, I want the user to look at this and then input row 2 (index 1) as the number; then my output dataframe will look like this,
Reddit Facebook
0 ColumnNeeded ColumnNeeded
1 RedditInsight FacbookInsights
2 RedditText FacbookText
This way, I can create headers for each dataframe.
This is how I have,
excel_file_dfs = []
for file in glob.glob(r'path\*.xlsx'):
    df = pd.read_excel(file)
    # Show the frame so the user can decide which row holds the real header.
    print(df.head(10))
    ask_user = input("What row do you want to make it header? ")
    # input() returns a string; pandas' `header` parameter needs an integer
    # row index, so convert before re-reading the file.
    header_number = int(ask_user)
    df = pd.read_excel(file, header=header_number)
    excel_file_dfs.append(df)
I am getting this error:
ValueError: Invalid file path or buffer object type:
from line df = pd.read_excel(each_file, header=[ask_user]).
I know I am reading pd.read_excel() two times, which might be using a lot of memory and processing.
Anyhow, I want the user to see each DataFrame and then input the row number to select the header. How can I do it in pandas?
How many rows down can the header be? Let us assume it is within the first 5:
Would this approach make sense?
import io

import pandas as pd

data = '''\
245 867
Reddit Facebook
ColumnNeeded ColumnNeeded
RedditInsight FacbookInsights
RedditText FacbookText
'''
# pd.compat.StringIO was removed from pandas' public API; io.StringIO is the
# standard-library equivalent.
fileobj = io.StringIO(data)
df = pd.read_csv(fileobj, sep=r'\s+', header=None)  # raw string: \s is a regex
print(df.head(5))

inp = input('Which row is header?')
n = int(inp)
df.columns = df.loc[n].values  # promote the chosen row to the header
df = df.loc[n+1:]              # keep only the rows below it
print(df)
Or define a function with a loop:
def change_header(df, i=5):
    """Interactively choose which row of `df` becomes the header.

    Shows `i` rows at a time; the user types a row number, 'n' to page
    forward, or 'r' to restart from the top.  Returns a new frame whose
    columns are the chosen row and whose data starts just below it.
    """
    n = 0  # index of the first row currently displayed
    while True:
        print(df.loc[n:n+i])
        inp = input('Which row is header? (number or (n)ext or (r)estart)')
        if inp.isdigit():
            n = int(inp)
            if n < len(df):
                break  # valid header row chosen
            else:
                n = 0
                print('error')  # out-of-range number: restart paging
                continue
        elif inp.lower().startswith('r'):
            n = 0
            continue
        elif inp.lower().startswith('n'):
            if (n+i) < len(df):
                n += i  # page forward only while more rows remain
                continue
        # NOTE(review): indentation was lost in the original paste; this
        # `else` is read as the final branch of the input dispatch — confirm.
        else:
            print('Try something else')
    df.columns = df.loc[n].values  # promote the chosen row to the header
    df = df.loc[n+1:]              # keep only the rows below it
    return df
You can use os library and call the files like this:
import os
import pandas as pd

excel_file_dfs = []
directory = 'C:/your_directory_here'
for filename in os.listdir(directory):
    if filename.endswith('.xlsx'):
        # BUG FIX: print() returns None, so the original
        # `header_number = print(...)` crashed on int(None) — input() is what
        # actually asks the user for the row number.
        header_number = input('Enter row number you want to make header: ')
        # os.listdir yields bare names; join them back onto the directory so
        # the files resolve outside the current working directory.
        df = pd.read_excel(os.path.join(directory, filename), header=int(header_number))
        excel_file_dfs.append(df)
final_df = pd.concat(excel_file_dfs)
final_df
This way you can first ask for the header row for each file, then use os to walk the directory and read all the Excel sheets.
Hope it cleared your question. :)

Categories

Resources