As an example, I have a generic script that outputs the default table styles using python-docx (this code runs fine):
import docx

d = docx.Document()
type_of_table = docx.enum.style.WD_STYLE_TYPE.TABLE
list_table = [['header1', 'header2'], ['cell1', 'cell2'], ['cell3', 'cell4']]
numcols = max(map(len, list_table))
numrows = len(list_table)
styles = (s for s in d.styles if s.type == type_of_table)

for stylenum, style in enumerate(styles, start=1):
    label = d.add_paragraph('{}) {}'.format(stylenum, style.name))
    label.paragraph_format.keep_with_next = True
    label.paragraph_format.space_before = docx.shared.Pt(18)
    label.paragraph_format.space_after = docx.shared.Pt(0)
    table = d.add_table(numrows, numcols)
    table.style = style
    for r, row in enumerate(list_table):
        for c, cell in enumerate(row):
            table.row_cells(r)[c].text = cell

d.save('tablestyles.docx')
Next, I opened the document, highlighted a split table and under paragraph format, selected "Keep with next," which successfully prevented the table from being split across a page:
Here is the XML code of the non-broken table:
You can see the highlighted line shows the paragraph property that should be keeping the table together. So I wrote this function and stuck it in the code above the d.save('tablestyles.docx') line:
def no_table_break(document):
    tags = document.element.xpath('//w:p')
    for tag in tags:
        ppr = tag.get_or_add_pPr()
        ppr.keepNext_val = True

no_table_break(d)
When I inspect the XML code the paragraph property tag is set properly and when I open the Word document, the "Keep with next" box is checked for all tables, yet the table is still split across pages. Am I missing an XML tag or something that's preventing this from working properly?
Ok, I also needed this. I think we were all making the incorrect assumption that the setting in Word's table properties (or the equivalent in python-docx) was about keeping the table from being split across pages. It's not; it simply controls whether an individual table row can be split across pages.
Given that we know how to do this successfully in python-docx, we can prevent tables from being split across pages by putting each table inside a row of a larger master table. The code below does exactly that. I'm using Python 3.6 and python-docx 0.8.6.
import docx
from docx.oxml.shared import OxmlElement
import os
import sys
def prevent_document_break(document):
    """https://github.com/python-openxml/python-docx/issues/245#event-621236139
    Globally prevent table rows from splitting across pages.
    """
    tags = document.element.xpath('//w:tr')
    for tag in tags:  # each <w:tr> (table row) element
        child = OxmlElement('w:cantSplit')  # create the <w:cantSplit/> element
        tag.append(child)  # append it to the row
d = docx.Document()
type_of_table = docx.enum.style.WD_STYLE_TYPE.TABLE
list_table = [['header1', 'header2'], ['cell1', 'cell2'], ['cell3', 'cell4']]
numcols = max(map(len, list_table))
numrows = len(list_table)
styles = (s for s in d.styles if s.type == type_of_table)
big_table = d.add_table(1, 1)
big_table.autofit = True
for stylenum, style in enumerate(styles, start=1):
    cells = big_table.add_row().cells
    label = cells[0].add_paragraph('{}) {}'.format(stylenum, style.name))
    label.paragraph_format.keep_with_next = True
    label.paragraph_format.space_before = docx.shared.Pt(18)
    label.paragraph_format.space_after = docx.shared.Pt(0)
    table = cells[0].add_table(numrows, numcols)
    table.style = style
    for r, row in enumerate(list_table):
        for c, cell in enumerate(row):
            table.row_cells(r)[c].text = cell

prevent_document_break(d)
d.save('tablestyles.docx')
# because I'm lazy...
openers = {'linux': 'libreoffice tablestyles.docx',
'linux2': 'libreoffice tablestyles.docx',
'darwin': 'open tablestyles.docx',
'win32': 'start tablestyles.docx'}
os.system(openers[sys.platform])
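One caveat worth flagging: strictly speaking, the OOXML schema expects <w:cantSplit/> inside a <w:trPr> child of the row rather than directly under <w:tr>. A schema-conforming variant of the helper would look something like this (a sketch, built with the same OxmlElement approach; the function name is mine):

from docx.oxml.ns import qn
from docx.oxml.shared import OxmlElement

def prevent_document_break_strict(document):
    """Sketch: add <w:trPr><w:cantSplit/></w:trPr> to every table row."""
    for tr in document.element.xpath('//w:tr'):
        trPr = tr.find(qn('w:trPr'))
        if trPr is None:
            trPr = OxmlElement('w:trPr')
            tr.insert(0, trPr)  # <w:trPr> must be the first child of <w:tr>
        trPr.append(OxmlElement('w:cantSplit'))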
I had been struggling with the problem for some hours and finally found a solution that worked for me. I just changed the XPath in the topic starter's code, so now it looks like this:
def keep_table_on_one_page(doc):
    tags = doc.element.xpath('//w:tr[position() < last()]/w:tc/w:p')
    for tag in tags:
        ppr = tag.get_or_add_pPr()
        ppr.keepNext_val = True
The key part is this selector:
[position() < last()]
We want all but the last row in each table to keep with the next one.
Would have left this as a comment under @DeadAd's answer, but I had low rep.
In case anyone is looking to stop a specific table from breaking, rather than all tables in a doc, change the xpath to the following:
tags = table._element.xpath('./w:tr[position() < last()]/w:tc/w:p')
where table refers to the instance of <class 'docx.table.Table'> which you want to keep together.
"//" will select all nodes that match the xpath (regardless of relative location), "./" will start selection from current node
Related
I have the following code to parse an XML file into a pandas DataFrame. The XML file looks like this:
<?xml version="1.0" encoding="UTF-8"?>
<Entries>
  <EntrySynopsisDetail_1_0>
    <EntryID>262148</EntryID>
    <EntryTitle>Establishment of the Graduate Internship Program</EntryTitle>
    <CategoryOfEntry>ENG</CategoryOfEntry>
  </EntrySynopsisDetail_1_0>
  <EntrySynopsisDetail_1_0>
    <EntryID>2667654</EntryID>
    <EntryTitle>Call for Mobility Program</EntryTitle>
    <CategoryOfEntry>ENG</CategoryOfEntry>
    <CategoryOfEntry>MAT</CategoryOfEntry>
  </EntrySynopsisDetail_1_0>
</Entries>
And my code is below:
from bs4 import BeautifulSoup
import pandas as pd

fd = open("file_120123.xml", 'r')
data = fd.read()
Bs_data = BeautifulSoup(data, 'xml')

ID = Bs_data.find_all('EntryID')
Title = Bs_data.find_all('EntryTitle')
try:
    Cat = Bs_data.find_all('CategoryOfEntry')
except IndexError:
    Cat = ''

CatDict = {
    "ENG": "English",
    "MAT": "Mathematics"
}

dataDf = []
for i in range(0, len(ID)):
    if (Cat[i] == CatDict):
        Cat[i] == CatDict.get(Cat[i])
    rows = [ID[i].get_text(), Title[i].get_text(), Cat[i]]
    dataDf.append(rows)

df = pd.DataFrame(dataDf, columns=['ID', 'Title', 'Category'], dtype=float)
df.to_csv('120123.csv')
As you can see, the code reads an XML file called 'file_120123.xml' using the BeautifulSoup library and collects each of the elements present in the file. One of the elements is a key, and I have created a dictionary listing all possible keys. Not every parent has that element. I want to compare the extracted key against the dictionary and replace it with the value corresponding to that key.
With this code, I get IndexError: list index out of range on the if (Cat[i] == CatDict): line. Any insights on how to resolve this?
If you just want to avoid raising the error, add a conditional break:
for i in range(0, len(ID)):
    if not i < len(Cat): break  ## <-- break loop if length of Cat is exceeded
    if (Cat[i] == CatDict):
        Cat[i] == CatDict.get(Cat[i])
    rows = [ID[i].get_text(), Title[i].get_text(), Cat[i]]
    dataDf.append(rows)
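Under the same assumption (entries beyond the shortest list are dropped), zip gives you the same truncation without the index bookkeeping, and lets you apply the dictionary lookup the question seems to intend; a sketch:

# zip() stops at the shortest input, which matches the conditional break above.
for id_tag, title_tag, cat_tag in zip(ID, Title, Cat):
    code = cat_tag.get_text()
    category = CatDict.get(code, code)  # fall back to the raw code if unmapped
    dataDf.append([id_tag.get_text(), title_tag.get_text(), category])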
First, as to why lxml is better than BeautifulSoup for XML, the answer is simple: the best way to query XML is with XPath. lxml supports XPath (though only version 1.0; for more complex XML and queries you will need XPath 2.0 to 3.1 and a library like elementpath). BS doesn't support XPath, though it does have excellent support for CSS selectors, which work better with HTML.
Having said all that, in your particular case you probably don't need lxml either, just pandas and a one-liner! Though you haven't shown your expected output, my guess is you expect the output below. Note that there is probably an error in your sample XML: the 2nd <EntrySynopsisDetail_1_0> has <CategoryOfEntry> twice, so I removed one:
entries = """<Entries>
  <EntrySynopsisDetail_1_0>
    <EntryID>262148</EntryID>
    <EntryTitle>Establishment of the Graduate Internship Program</EntryTitle>
    <CategoryOfEntry>ENG</CategoryOfEntry>
  </EntrySynopsisDetail_1_0>
  <EntrySynopsisDetail_1_0>
    <EntryID>2667654</EntryID>
    <EntryTitle>Call for Mobility Program</EntryTitle>
    <CategoryOfEntry>MAT</CategoryOfEntry>
  </EntrySynopsisDetail_1_0>
</Entries>"""
import pandas as pd

pd.read_xml(entries, xpath="//EntrySynopsisDetail_1_0")
Output:
EntryID EntryTitle CategoryOfEntry
0 262148 Establishment of the Graduate Internship Program ENG
1 2667654 Call for Mobility Program MAT
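From there, mapping the category codes to full names with the question's dictionary is one more line; a sketch, assuming the DataFrame above is bound to df:

df = pd.read_xml(entries, xpath="//EntrySynopsisDetail_1_0")
# Map category codes to names; codes missing from the dict become NaN.
CatDict = {"ENG": "English", "MAT": "Mathematics"}
df["CategoryOfEntry"] = df["CategoryOfEntry"].map(CatDict)
df.to_csv("120123.csv", index=False)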
I am trying to display elements from an XML file in tkinter Entry and Text widgets.
Example XML:
<notes>
  <note id="5/10/22/14:20">
    <index>5/10/22, 14:20 > Mr Anderson</index>
    <date>5/10/22</date>
    <time>14:20</time>
    <from>Agt Smith</from>
    <to>Mr Anderson</to>
    <subject>App Test 3</subject>
    <body>Do you hear that, Mr. Anderson? That is the sound of inevitability.</body>
  </note>
</notes>
I was going to try getchildren(), but I am told that is deprecated. I want to use the note id to pull up that element and its children, and insert the various children into the corresponding Entry and Text widgets. However, I cannot seem to get the correct values into variables. It feels like it should be simple, but my instincts are leading me astray. What I am getting from this code is all of the entries displaying, rather than the one selected by the note id. This is my code:
def displayNote(self):
    # Clear the boxes first
    clearBoxes()
    # Get the index value from the listbox selection
    for i in noteList.curselection():
        noteIndex = noteList.get(i)
    # Process the index value to get the id
    subStrip = ' >.*'
    noteStrip = noteIndex.replace(", ", "/").replace(":", "")
    noteID = re.sub(subStrip, "", noteStrip)
    # Parse the xml for box display
    fileForDisplay = ET.parse(notefile)
    rootd = fileForDisplay.getroot()
    # and pull the info for that note id into variables
    for note in rootd.findall('note'):
        dateInfo = note.find("date").text
        toInfo = note.find("to").text
        fromInfo = note.find("from").text
        subjectInfo = note.find("subject").text
        bodyInfo = note.find("body").text
        timeInfo = note.find("time").text
        bodyDisplay = timeInfo + "\n" + bodyInfo
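For reference, ElementTree's find() accepts an attribute predicate, which looks like the missing filtering step here; a minimal sketch using the noteID computed above:

# Select only the <note> whose id attribute matches the recovered noteID.
note = rootd.find(f"note[@id='{noteID}']")
if note is not None:
    dateInfo = note.find("date").text
    bodyInfo = note.find("body").text
    timeInfo = note.find("time").text
    bodyDisplay = timeInfo + "\n" + bodyInfo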
I have a PDF that contains text and tables. I want to extract both, but when I use the extract_text function it also extracts the content inside the tables. I want to extract only the text outside the tables, since the tables can be extracted separately with the extract_tables function.
I have tested with a PDF that contains only tables, and extract_text still extracts the table contents that I want to get via extract_tables.
You can try the following code:
import pdfplumber

# Import the PDF.
pdf = pdfplumber.open("file.pdf")

# Load the first page.
p = pdf.pages[0]

# Table settings.
ts = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
}

# Get the bounding boxes of the tables on the page.
bboxes = [table.bbox for table in p.find_tables(table_settings=ts)]

def not_within_bboxes(obj):
    """Check if the object is in any of the table's bbox."""
    def obj_in_bbox(_bbox):
        """See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404"""
        v_mid = (obj["top"] + obj["bottom"]) / 2
        h_mid = (obj["x0"] + obj["x1"]) / 2
        x0, top, x1, bottom = _bbox
        return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
    return not any(obj_in_bbox(__bbox) for __bbox in bboxes)

print("Text outside the tables:")
print(p.filter(not_within_bboxes).extract_text())
I am using the .filter() method provided by pdfplumber to drop any objects that fall inside the bounding box of any of the tables, creating a filtered version of the page, and then extracting the text from it.
Since you haven't shared the PDF, the table settings I have used may not work but you can change them to suit your needs.
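Since the tables themselves are meant to come from extract_tables, the same settings can be reused for that step; a short sketch under the same assumptions:

# Extract the tables with the same settings used to locate their bboxes.
tables = p.extract_tables(table_settings=ts)
for t in tables:
    print(t)  # each table is a list of rows; each row is a list of cell strings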
Camelot is a fantastic Python library to extract the tables from a pdf file as a data frame. However, I'm looking for a solution that also returns the table description text written right above the table.
The code I'm using for extracting tables from pdf is this:
import camelot
tables = camelot.read_pdf('test.pdf', pages='all',lattice=True, suppress_stdout = True)
I'd like to extract the text written above the table, i.e. THE PARTICULARS, as shown in the image below.
What would be the best approach to do this? I appreciate any help, thank you.
You can create the Lattice parser directly:
from camelot.parsers import Lattice

parser = Lattice(**kwargs)
for p in pages:
    t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
                              layout_kwargs=layout_kwargs)
    tables.extend(t)
Then you have access to parser.layout which contains all the components in the page. These components all have bbox (x0, y0, x1, y1) and the extracted tables also have a bbox object. You can find the closest component to the table on top of it and extract the text.
Here's my hilariously bad implementation just so that someone can laugh and get inspired to do a better one and contribute to the great camelot package :)
Caveats:
- Will only work for non-rotated tables
- It's a heuristic
- The code is bad
import math
import os

import camelot
import camelot.utils
from camelot.handlers import PDFHandler

# Helper methods for _bbox
def top_mid(bbox):
    return ((bbox[0] + bbox[2]) / 2, bbox[3])

def bottom_mid(bbox):
    return ((bbox[0] + bbox[2]) / 2, bbox[1])

def distance(p1, p2):
    return math.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)

def get_closest_text(table, htext_objs):
    min_distance = 999  # Cause 9's are big :)
    best_guess = None
    table_mid = top_mid(table._bbox)  # Middle of the TOP of the table
    for obj in htext_objs:
        text_mid = bottom_mid(obj.bbox)  # Middle of the BOTTOM of the text
        d = distance(text_mid, table_mid)
        if d < min_distance:
            best_guess = obj.get_text().strip()
            min_distance = d
    return best_guess

def get_tables_and_titles(pdf_filename):
    """Here's my hacky code for grabbing tables and guessing at their titles"""
    my_handler = PDFHandler(pdf_filename)
    tables = camelot.read_pdf(pdf_filename, pages='2,3,4')
    print('Extracting {:d} tables...'.format(tables.n))
    titles = []
    with camelot.utils.TemporaryDirectory() as tempdir:
        for table in tables:
            my_handler._save_page(pdf_filename, table.page, tempdir)
            tmp_file_path = os.path.join(tempdir, f'page-{table.page}.pdf')
            layout, dim = camelot.utils.get_page_layout(tmp_file_path)
            htext_objs = camelot.utils.get_text_objects(layout, ltype="horizontal_text")
            titles.append(get_closest_text(table, htext_objs))  # Might be None
    return titles, tables
See: https://github.com/atlanhq/camelot/issues/395
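A hypothetical usage of the helper above (the filename is an assumption; the pages are hard-coded to '2,3,4' inside the function):

titles, tables = get_tables_and_titles('test.pdf')
for title, table in zip(titles, tables):
    print(title)            # the guessed caption text; may be None
    print(table.df.head())  # camelot exposes each table as a pandas DataFrame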
I have a Google Spreadsheet which I'm populating with values using a Python script and the gdata library. If I run the script more than once, it appends new rows to the worksheet; I'd like the script to first clear all the data from the rows before populating them, so that I have a fresh set of data every time I run the script. I've tried using:
UpdateCell(row, col, value, spreadsheet_key, worksheet_id)
but short of running two for loops like this, is there a cleaner way? Also, this loop seems horrendously slow:
for x in range(2, 45):
    for i in range(1, 5):
        self.GetGDataClient().UpdateCell(x, i, '',
                                         self.spreadsheet_key,
                                         self.worksheet_id)
Not sure if you got this sorted out or not, but regarding speeding up the clearing of the current data, try using a batch request. For instance, to clear out every single cell in the sheet, you could do:
cells = client.GetCellsFeed(key, wks_id)
batch_request = gdata.spreadsheet.SpreadsheetsCellsFeed()

# Iterate through every cell in the CellsFeed, replacing each one with ''
# Note that this does not make any calls yet - it all happens locally
for i, entry in enumerate(cells.entry):
    entry.cell.inputValue = ''
    batch_request.AddUpdate(cells.entry[i])

# Now send the entire batchRequest as a single HTTP request
updated = client.ExecuteBatch(batch_request, cells.GetBatchLink().href)
If you want to do things like save the column headers (assuming they are in the first row), you can use a CellQuery:
# Set up a query that starts at row 2
query = gdata.spreadsheet.service.CellQuery()
query.min_row = '2'

# Pull just those cells
no_headers = client.GetCellsFeed(key, wks_id, query=query)
batch_request = gdata.spreadsheet.SpreadsheetsCellsFeed()

# Iterate through every cell in the CellsFeed, replacing each one with ''
# Note that this does not make any calls yet - it all happens locally
for i, entry in enumerate(no_headers.entry):
    entry.cell.inputValue = ''
    batch_request.AddUpdate(no_headers.entry[i])

# Now send the entire batchRequest as a single HTTP request
updated = client.ExecuteBatch(batch_request, no_headers.GetBatchLink().href)
Alternatively, you could use this to update your cells as well (perhaps more in line with what you want). The link to the documentation provides a basic way to do that, which is (copied from the docs in case the link ever changes):
import gdata.spreadsheet
import gdata.spreadsheet.service
client = gdata.spreadsheet.service.SpreadsheetsService()
# Authenticate ...
cells = client.GetCellsFeed('your_spreadsheet_key', wksht_id='your_worksheet_id')
batchRequest = gdata.spreadsheet.SpreadsheetsCellsFeed()
cells.entry[0].cell.inputValue = 'x'
batchRequest.AddUpdate(cells.entry[0])
cells.entry[1].cell.inputValue = 'y'
batchRequest.AddUpdate(cells.entry[1])
cells.entry[2].cell.inputValue = 'z'
batchRequest.AddUpdate(cells.entry[2])
cells.entry[3].cell.inputValue = '=sum(3,5)'
batchRequest.AddUpdate(cells.entry[3])
updated = client.ExecuteBatch(batchRequest, cells.GetBatchLink().href)