Converting docx to csv using python-docx: issue with merged cells - python

I have a .docx file (screenshot attached) containing a table that I need to convert into a .csv table. I am using python-docx for this with the code below.
Everything works fine except the last column (G), which is a merged cell. My code ignores G1 and only reports column G2 (screenshot attached). How can I edit the code so that the .csv file has both the G1 and G2 columns?
Thanks!
import glob
import os
import pandas as pd
from docx.api import Document
# Convert the first table of every .docx file in the working directory
# into a .csv file that shares the document's base name.
files = glob.glob('*.docx')
for name in files:
    document = Document(name)
    table = document.tables[0]
    keys = None
    data = []
    for position, row in enumerate(table.rows):
        texts = [cell.text for cell in row.cells]
        if position == 0:
            # The first row supplies the column names.
            keys = tuple(texts)
        else:
            # NOTE(review): dict keys are unique, so when two header cells
            # carry the same text only the last of those columns survives
            # here. python-docx appears to report a horizontally merged
            # cell once per spanned column -- confirm; that would explain
            # why a merged header loses a column.
            data.append(dict(zip(keys, texts)))
    df = pd.DataFrame(data)
    stem = os.path.splitext(name)[0]
    print(stem)
    df.to_csv(stem + '.csv')

Related

Extract data from a docx and insert into an xlsx or csv file in python

I'm trying to extract data from a table that is in a docx file and insert this data into an xlsx or csv file. I chose to do it in Python because of how easy it makes handling data.
Attached is the formatting of the data in the table
This is the script I'm using to generate the tables in xlsx. But the data from table 1 and table 2 are joined.
Name, id, year, subject, start and observations.
So the ideal would be to create a spreadsheet with this data instead of creating 2 tables.
I got other settings in the code but to no avail.
That's the question, I'm not able to join these 2 tables
import pandas as pd
from docx import Document
# Dump every table of test.docx into its own Excel workbook.
document = Document(r"test.docx")
tables = []
for index, table in enumerate(document.tables):
    row_count = len(table.rows)
    col_count = len(table.columns)
    # Pre-sized grid of empty strings, filled cell by cell below.
    grid = [[''] * col_count for _ in range(row_count)]
    for r, row in enumerate(table.rows):
        for c, cell in enumerate(row.cells):
            grid[r][c] = cell.text
    pd.DataFrame(grid).to_excel(f"Tabela__ {index}.xlsx")
print("Done")
Expected configuration
So if anyone here has experience with this kind of data transformation, I would be very grateful.
Test file 71433068_data.docx can be downloaded from here
Possible solution is the following:
# pip install python-docx
# pip install pandas
import pandas as pd
from docx import Document
document = Document("71433068_data.docx")
# Tables come in pairs (two consecutive tables describe one subject),
# so slice the table list into chunks of two.
tbl_list_grp = [
    document.tables[start:start + 2]
    for start in range(0, len(document.tables), 2)
]
all_data = []
for subject in tbl_list_grp:
    key_value_table = subject[0]
    column_table = subject[1]
    record = {}
    # Table 1 (NAME, ID): the key is in the first column, the value in the
    # second. The value is repeated once per data row of table 2 so that
    # all columns of the record end up with the same length.
    repeat = len(column_table.rows) - 1
    for row in key_value_table.rows:
        texts = tuple(cell.text for cell in row.cells)
        record[texts[0]] = [texts[1]] * repeat
    # Table 2 (YEAR, ..., OBSERVATION): the key is the first cell of each
    # column, the remaining cells are that column's values.
    for column in column_table.columns:
        column_texts = [cell.text for cell in column.cells]
        record[column_texts[0]] = column_texts[1:]
    all_data.append(record)
# One DataFrame per subject, stacked into a single frame with a fresh index.
frames = [pd.DataFrame(record) for record in all_data]
df = pd.concat(frames).reset_index(drop=True)
df
Returns
# write dataframe data to excel
# (the former `encoding='urf-8'` argument was a typo for 'utf-8', and
# DataFrame.to_excel no longer accepts `encoding` in pandas 2.x, so it is dropped)
df.to_excel('71433068_data.xlsx', index=False)

How to concat docx file in python?

Below is my code:
# Read every .xlsx file found under `paths` and place their contents side
# by side, one block of columns per file.
v_excel = []
for root, dirs, files in os.walk(paths):
    for t in files:
        if t.endswith('.xlsx'):
            # os.walk reports file names relative to `root` (the directory
            # currently being visited), so join with `root`, not `paths`;
            # otherwise files in sub-directories are looked up in the
            # wrong place.
            df = pd.read_excel(os.path.join(root, t), header=None, index_col=False)
            v_excel.append(df)
conc = pd.concat(v_excel, axis=1, ignore_index=True)
conc output:
#after appending two excel files i can successively concat the files and put it in
#seperate column
column1 column2
data1 data1
data2 data2
data3 data3
data3 data4
#column 1 is from excel file 1 and column2 from excel file 2
How to do this for docx as i did for excel ?
if t.endswith('.docx'):
#for c,z in enumerate(t):
v_doc.append(Document(t)) # <-----how to put this in df and concat according to
# docx file as i have done with excel ?
docx contains:
#docx contains dummy text's !!!
#docx1 contains:
data1
data2
data3
data4
#docx2 contains:
data5
data6
data7
data8
i want to save the content of docx files to columns of excel. docx 1 content to column 1 of excel and docx 2 to column 2 of same excel.
Hope i get some response. Thank you in advance.
Solution #1: Aggregating multiple .docx documents to single output docx document.
If you want to copy the text and style from a collection of docx documents into a single output docx, you can use the python-docx module.
from docx import Document
import os
# Append the paragraphs of every .docx file in the current directory to a
# new document, carrying over basic run formatting, then save it.
master = Document()
docx_names = [entry for entry in os.listdir('.') if entry.endswith('.docx')]
for docx_name in docx_names:
    source = Document(docx_name)
    for paragraph in source.paragraphs:
        merged_paragraph = master.add_paragraph()
        for src_run in paragraph.runs:
            new_run = merged_paragraph.add_run(src_run.text)
            # copy style from old to new
            new_run.bold = src_run.bold
            new_run.italic = src_run.italic
            new_run.underline = src_run.underline
            new_run.font.color.rgb = src_run.font.color.rgb
            # NOTE(review): this assigns to the `name` attribute of the
            # style object attached to the new run rather than assigning
            # a style to the run itself -- confirm this is intended.
            new_run.style.name = src_run.style.name
master.save('out.docx')
Solution #2: Aggregating table content from multiple .docx documents to single output excel document.
In your comments, you want to create an excel sheet from a set of word documents with tables of text.
Here is Python code to copy cells in tables of Word documents to a target Excel document.
import pandas as pd
from docx import Document
import os
# Collect every table row from all .docx files in ./data and write them to
# a single Excel sheet, one sheet row per table row.
rows = []
for f in os.listdir('data'):
    if f.endswith('.docx'):
        # os.listdir returns bare file names, so build the path relative
        # to the 'data' directory (the original referenced an undefined
        # name `file` here).
        doc = Document(os.path.join('data', f))
        for table in doc.tables:
            for row in table.rows:
                rows.append([cell.text for cell in row.cells])
# Build the frame once from the collected rows: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
df = pd.DataFrame(rows)
# Keep the 1-based integer column labels of the original output.
df.columns = range(1, df.shape[1] + 1)
df.to_excel("output.xlsx")
Solution #3: Aggregating custom table content from multiple .docx documents to single output excel document with a 2-column table.
In your particular sample data, the table is structured with either 3 or 9 columns, so you need to concatenate the text of the other columns into a single value if you want to keep 2 columns in the output.
# Collect table rows from all .docx files in ./data as two-column records:
# the first cell's text, and the remaining cells' text joined by spaces.
records = []
for f in os.listdir('data'):
    if f.endswith('.docx'):
        # os.listdir returns bare file names, so build the path relative
        # to the 'data' directory (the original referenced an undefined
        # name `file` here).
        doc = Document(os.path.join('data', f))
        # iterate over all the tables
        for table in doc.tables:
            for row in table.rows:
                cells = row.cells
                # only rows with a non-empty first cell and at least one
                # further cell are kept
                if len(cells) > 1 and cells[0].text:
                    # concatenate text of the remaining cells to a single
                    # value; a separator is added only once some text has
                    # been accumulated, so leading empty cells add no space
                    text = ''
                    for cell in cells[1:]:
                        if text:
                            text += ' '
                        text += cell.text
                    records.append([cells[0].text, text])
# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every call.
df = pd.DataFrame(records, columns=['column1', 'column2'])
# save output
df.to_excel("output.xlsx")
You can use docxcompose to concatenate docx files in Python. You can read more on docxcompose's official PyPI page.

table extraction: adding column with file name and path of origin file to df

i need to extract the same table out of multiple docx report documents.
In the list 'targets_in_dir' I have stored all the file names with paths in the format
'C:\directory\subdirectory\filename1.docx'
The code below perfectly grabs the table out of the document and correctly allocates the keys to the columns.
import pandas as pd
import docx
from docx.api import Document
# Load the third table of the second report and turn it into a DataFrame
# whose columns are named after the table's first row.
document = Document(targets_in_dir[1])
table = document.tables[2]
keys = None
data = []
for position, row in enumerate(table.rows):
    texts = [cell.text for cell in row.cells]
    if position == 0:
        # The header row provides the keys for all following rows.
        keys = tuple(texts)
    else:
        data.append(dict(zip(keys, texts)))
df = pd.DataFrame(data)
# Tag every row with the document it came from.
df['report'] = targets_in_dir[1]
print(targets_in_dir[1])
My question: For tracking purpose I want to add a column to the final df where in each line the filename where the row was pulled is added. I tried to do it with the line
df['report'] = targets_in_dir[1]
but strangely it only adds the data from 'data_1' instead of the filename and path!
report
data_1
C:\directory\subdirectory\filename1.docx
Cumarin
C:\directory\subdirectory\filename1.docx
Piperacin
Meanwhile I found a solution myself with the following line of code. i just add str
df['report'] = str(targets_in_dir[1])

Extracting tables from a word doc

Is there any tool to extract all tables from a Word document and convert them to a csv file (or any Excel file format) using Python or VBA?
Note that the Word file contains both text and tables.
You can use pandas with python-docx. Per this answer you can extract all tables from a document and put them in a list:
from docx import Document
import pandas as pd
# Turn every table in test.docx into a DataFrame and collect them in `tables`.
document = Document('test.docx')
tables = []
for table in document.tables:
    row_count = len(table.rows)
    col_count = len(table.columns)
    # Grid of empty strings; only cells that contain text overwrite it.
    grid = [[''] * col_count for _ in range(row_count)]
    for r, row in enumerate(table.rows):
        for c, cell in enumerate(row.cells):
            content = cell.text
            if content:
                grid[r][c] = content
    tables.append(pd.DataFrame(grid))
You can then save the tables to csv files by looping through the list:
# Write each collected DataFrame to its own numbered csv file.
for nr, frame in enumerate(tables):
    frame.to_csv(f"table_{nr}.csv")

Write tables from Word (.docx) to Excel (.xlsx) using xlsxwriter

I am trying to parse a word (.docx) for tables, then copy these tables over to excel using xlsxwriter.
This is my code:
from docx.api import Document
import xlsxwriter
# Read the tables of a Word document and write their rows into an
# xlsxwriter worksheet.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below (in particular whether the Sheet1.write call sits
# inside the row loop or the table loop) cannot be determined from here.
document = Document('/Users/xxx/Documents/xxx/Clauses Sample - Copy v1 - for merge.docx')
tables = document.tables
# Output workbook with a single worksheet named "Compliance".
wb = xlsxwriter.Workbook('C:/Users/xxx/Documents/xxx/test clause retrieval.xlsx')
Sheet1 = wb.add_worksheet("Compliance")
# Worksheet row that the next write goes to.
index_row = 0
print(len(tables))
for table in document.tables:
data = []
keys = None
for i, row in enumerate(table.rows):
text = (cell.text for cell in row.cells)
# The first row of each table supplies the dictionary keys.
if i == 0:
keys = tuple(text)
continue
row_data = dict(zip(keys, text))
data.append(row_data)
#print (data)
#big_data.append(data)
# The whole row dict is converted to one string and written to column 0.
Sheet1.write(index_row,0, str(row_data))
index_row = index_row + 1
print(row_data)
wb.close()
This is my desired output:
However, here is my actual output:
I am aware that my current output produces a list of strings instead.
Is there any way that I can get my desired output using xlsxwriter? Any help is greatly appreciated.
I would go using pandas package, instead of xlsxwriter, as follows:
from docx.api import Document
import pandas as pd
# Copy every row of every table in the document into one DataFrame and
# save it as an Excel file.
document = Document("D:/tmp/test.docx")
tables = document.tables
# Gather one list of cell texts per table row. Building the frame once
# replaces DataFrame.append, which was removed in pandas 2.0.
rows = []
for table in tables:
    rows.extend([cell.text for cell in row.cells] for row in table.rows)
# As before, the two fixed column names mean the tables are expected to
# have exactly two columns.
df = pd.DataFrame(rows, columns=["Column1", "Column2"])
df.to_excel("D:/tmp/test.xlsx")
# print is a function in Python 3 (the original used the Python 2 statement)
print(df)
Which outputs the following that is inserted in the excel:
>>>
Column1 Column2
0 Hello TEST
1 Est Ting
2 Gg ff
This is the portion of my code update that allowed me to get the output I want:
# Write each table cell into its own worksheet cell: one worksheet row per
# table row, one worksheet column per table cell.
for row in block.rows:
    for col, cell in enumerate(row.cells):
        content = cell.text
        print(content)
        Sheet1.write(index_row, col, content)
    index_row += 1
Output:

Categories

Resources