How to fetch the total page count in python-docx?

I am learning python-docx and I am using this solution to add page numbers, as given on Stack Overflow by Utkarsh Dalal:
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

def create_element(name):
    return OxmlElement(name)

def create_attribute(element, name, value):
    element.set(qn(name), value)

def add_page_number(paragraph):
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # Literal text "Page "
    page_run = paragraph.add_run()
    t1 = create_element('w:t')
    create_attribute(t1, 'xml:space', 'preserve')
    t1.text = 'Page '
    page_run._r.append(t1)

    # PAGE field (current page number)
    page_num_run = paragraph.add_run()
    fldChar1 = create_element('w:fldChar')
    create_attribute(fldChar1, 'w:fldCharType', 'begin')
    instrText = create_element('w:instrText')
    create_attribute(instrText, 'xml:space', 'preserve')
    instrText.text = "PAGE"
    fldChar2 = create_element('w:fldChar')
    create_attribute(fldChar2, 'w:fldCharType', 'end')
    page_num_run._r.append(fldChar1)
    page_num_run._r.append(instrText)
    page_num_run._r.append(fldChar2)

    # Literal text " of "
    of_run = paragraph.add_run()
    t2 = create_element('w:t')
    create_attribute(t2, 'xml:space', 'preserve')
    t2.text = ' of '
    of_run._r.append(t2)

    # NUMPAGES field (total page count)
    fldChar3 = create_element('w:fldChar')
    create_attribute(fldChar3, 'w:fldCharType', 'begin')
    instrText2 = create_element('w:instrText')
    create_attribute(instrText2, 'xml:space', 'preserve')
    instrText2.text = "NUMPAGES"
    fldChar4 = create_element('w:fldChar')
    create_attribute(fldChar4, 'w:fldCharType', 'end')
    num_pages_run = paragraph.add_run()
    num_pages_run._r.append(fldChar3)
    num_pages_run._r.append(instrText2)
    num_pages_run._r.append(fldChar4)

    for pg in num_pages_run._r:
        print(pg.text)

doc = Document()
add_page_number(doc.sections[0].footer.paragraphs[0])
doc.save("your_doc.docx")
I get the page number in the footer in the format "Page x of y".
But when I try to access the value of 'y', the total number of pages, I cannot do it. I have tried reading the text attribute of the children of num_pages_run as pg.text, but all I get is NUMPAGES as output instead of the number of pages.
I am looking for this because I would like to perform some actions whenever a new page is added to the document.
Is there a way to get the total number of pages from python-docx, or any other alternative?
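For context on the 'y' value: PAGE and NUMPAGES are Word field codes that are only evaluated when Word (or another renderer) lays the document out; python-docx only edits the underlying XML and never paginates, so the saved .docx does not contain a page count to read. A workaround that is often used, sketched here under the assumption that LibreOffice is installed and that pypdf is available (the count_docx_pages helper name is illustrative, not an existing API), is to render the document to PDF and count the PDF's pages:

import subprocess
import tempfile
from pathlib import Path

from pypdf import PdfReader  # pip install pypdf

def count_docx_pages(docx_path: str) -> int:
    """Render the .docx with headless LibreOffice and count the resulting PDF pages."""
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpdir, docx_path],
            check=True,
        )
        pdf_path = Path(tmpdir) / (Path(docx_path).stem + ".pdf")
        return len(PdfReader(str(pdf_path)).pages)

print(count_docx_pages("your_doc.docx"))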

Related

Use fitz to merge span texts and coordinates into rows

I'm trying to merge every span into a row using PyMuPDF.
import fitz

with fitz.open("0003001v1.pdf") as doc:
    page = doc[0]
    dict = page.get_text("dict")
    if "blocks" in dict:
        blocks = dict["blocks"]
        fixed_blocks = dict["blocks"]
        for block in blocks:
            print("--------------------------".strip())
            print("block: ", str(block["bbox"]).replace("(", "[").replace(")", "]"))
            print("")
            if "lines" in block.keys():
                lines = block["lines"]
                for line in lines:
                    if "spans" in line.keys():
                        spans = line["spans"]
                        for span in spans:
                            fixed_line_bbox = []
                            fixed_line_text = []
                            line_text = span["text"]
                            line_bbox = span["bbox"]
                            line_bbox_x_0 = line_bbox[0]
                            line_bbox_y_0 = line_bbox[1]
                            line_bbox_x_1 = line_bbox[2]
                            line_bbox_y_1 = line_bbox[3]
                            print("row:" + str(line_bbox).replace("(", "[").replace(")", "]") + "\t" + line_text)
the output will be:
block: [71.99899291992188, 630.993408203125, 502.38116455078125, 700.308837890625]
row:[71.99905395507812, 630.993408203125, 502.36865234375, 642.9486083984375] and look for the explicit form of the function Φ from the experimental data on the
row:[71.99905395507812, 645.2735595703125, 107.62599182128906, 657.228759765625] system
row:[107.62599182128906, 645.2735595703125, 119.32400512695312, 657.228759765625] S
row:[120.1189956665039, 645.2735595703125, 502.3509826660156, 657.228759765625] . However, the function Φ may depend on time, it means that there are
row:[71.99899291992188, 659.673583984375, 344.1631774902344, 671.6287841796875] some hidden parameters, which control the system
row:[344.1631774902344, 659.673583984375, 356.683837890625, 671.6287841796875] S
row:[356.683837890625, 659.673583984375, 502.38116455078125, 671.6287841796875] and its evolution is of the
row:[71.99899291992188, 673.95361328125, 96.2470474243164, 685.9088134765625] form
row:[257.9989929199219, 688.3536376953125, 261.3225402832031, 700.308837890625] ˙
row:[254.6388397216797, 688.1612548828125, 262.4575500488281, 700.116455078125] ϕ
row:[262.4575500488281, 688.3536376953125, 291.689697265625, 700.308837890625] = Φ(
row:[291.71893310546875, 688.1612548828125, 311.758056640625, 700.116455078125] ϕ, u
row:[311.75872802734375, 688.3536376953125, 316.4093017578125, 700.308837890625] )
row:[316.4388122558594, 688.1612548828125, 319.7623596191406, 700.116455078125] ,
How can I merge the span texts and coordinates that belong to a single line, and get the merged line's coordinates and text?
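A minimal sketch of one way to do the merging, assuming the goal is simply to join each line's span texts and take the union of their span bounding boxes (variable names are illustrative):

import fitz  # PyMuPDF

with fitz.open("0003001v1.pdf") as doc:
    page = doc[0]
    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            spans = line.get("spans", [])
            if not spans:
                continue
            # Join the span texts in reading order.
            merged_text = "".join(span["text"] for span in spans)
            # Union of the span rectangles: min of the top-left corner, max of the bottom-right.
            x0 = min(span["bbox"][0] for span in spans)
            y0 = min(span["bbox"][1] for span in spans)
            x1 = max(span["bbox"][2] for span in spans)
            y1 = max(span["bbox"][3] for span in spans)
            print("row:", [x0, y0, x1, y1], merged_text)

Note that the "dict" output also carries a ready-made rectangle for each line under line["bbox"], which should match the union computed above.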

Why does this Python if/while end in a dead loop?

order = 2
selected = 0
while selected < 21:  # because I can only select 20 rows at most at a time
    current_tr = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/div/div[3]/table/tbody/tr[%d]' % order)  # from row 1, below the table's header
    if current_tr.get_attribute("bgcolor") is None:  # no bgcolor means not yet reviewed
        driver.find_element_by_xpath("//td[2]/div/a").click()  # open the onclick content
        div_content = driver.find_element_by_xpath("//td[2]/div/div").text  # fetch the onclick content
        driver.find_element_by_xpath("//td[2]/div/div/a").click()  # close the onclick content
        print(div_content)
        if "car" in div_content:  # check if a certain string exists in the onclick content
            list_content = div_content.split("【car】")
            car_close = list_content[1].strip()  # fetch the content
            list_car = car_close.split(" ")
            car = list_doi[0]
            print(car)
            orderminus = order - 1
            driver.find_element_by_xpath('//*[@id="%d"]/td[6]/a' % orderminus).click()  # pick this row
            time.sleep(1)
            selected = selected + 1
            order = order + 0  # if this row is picked, the row will disappear, so the order won't change
        else:  # problem is here: the else branch seems to never be executed? Otherwise the if always stands? No, not possible; there are entries without "car". The problem occurs at the first instance of div_content without "car"
            order = order + 1  # if "car" is not in it
            time.sleep(1)
    else:  # if already reviewed, order + 1
        order = order + 1
Above is my code, using Selenium to navigate a webpage containing a table.
First judgement: has the current row been reviewed?
Not yet reviewed? OK, print the info.
Already reviewed? Skip it.
Then a further judgement: is the certain string "car" in the info?
No? Skip it.
Yes? Click it, and the row disappears.
But currently when I run this, the actual behaviour is: during the second judgement, when the string "car" is not in the info, it keeps printing the info. It seems the else branch is never taken, and it keeps executing lines 6-9 of this snippet, always, a dead loop.
Why? Can anybody give me a clue?
To make things clear, I have simplified my code as below:
list = []
list.append("ff122")
list.append("carff")
list.append("ff3232")
list.append("ffcar")
list.append("3232")
order = 0
selected = 0
while selected < 6:
    current_tr = list[order]
    print("round %d %s" % (order, current_tr))
    if "ff" in current_tr:
        print("ff is in current_tr")
        if "car" in current_tr:
            print("car")
            selected = selected + 1
            order = order + 0
        else:
            order = order + 1
            print("order is %d" % order)
    else:  # if already reviewed, order + 1
        order = order + 1
        print("order is %d" % order)
Everybody can run this. What I need to do is first filter for "ff"; if "ff" exists, then filter for "car". When both conditions are TRUE, selected + 1, until selected reaches a certain number. In the real instance, don't doubt that the list is long enough.
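Purely to illustrate the intended flow described above, here is a minimal sketch (not the original code). It assumes that a selected item is removed from the list, mirroring how a picked row disappears from the real table; under that assumption, order can stay the same after a hit without reprocessing the same element:

items = ["ff122", "carff", "ff3232", "ffcar", "3232"]

order = 0
selected = 0
target = 2  # hypothetical stop condition for this small list

while selected < target and order < len(items):
    current = items[order]
    if "ff" in current:           # first filter
        if "car" in current:      # second filter: both conditions hold
            selected += 1
            items.pop(order)      # the "row" disappears, so order stays put
        else:
            order += 1
    else:
        order += 1

print("selected %d, remaining: %s" % (selected, items))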

How can I get the page number for editable fields using pdfminer six

I followed the example from this answer to get the editable field values from a PDF document:
How to extract PDF fields from a filled out form in Python?
For each field I get a data structure that looks like the one below. But the list includes all the fields from all the pages. How can I determine which page each field was on? In the debugger I tried looking into the 'AP' and 'P' items, which are PDFObjRefs, but that didn't lead me anywhere.
'AP' = {dict: 1} {'N': <PDFObjRef:1947>}
'DA' = {bytes: 23} b'0 0 0 rg /ArialMT 10 Tf'
'F' = {int} 4
'FT' = {PSLiteral} /'Tx'
'M' = {bytes: 23} b"D:20200129121854-06'00'"
'MK' = {dict: 0} {}
'P' = {PDFObjRef} <PDFObjRef:1887>
'Rect' = {list: 4} [36.3844, 28.5617, 254.605, 55.1097]
'StructParent' = {int} 213
'Subtype' = {PSLiteral} /'Widget'
'T' = {bytes: 12} b'CustomerName'
'TU' = {bytes: 13} b'Customer Name'
'Type' = {PSLiteral} /'Annot'
'V' = {bytes: 21} b'Ball-Mart Stores, Inc.'
TIA
Same problem; it took me 2 hours until I found the idea of page.annots by reviewing the PDF.
It works with PyPDF2. doc is initialised earlier by doc = open('sample.pdf')
idtopg = {}
pge = 0
for page in PDFPage.create_pages(doc):
    if page.annots:
        for annot in page.annots:
            por = PDFObjRef.resolve(annot)
            aid = por['T'].decode("utf-8")
            idtopg[aid] = pge
    pge += 1
Now look at your 'T's. The dict produced here gives you the page for each 'T':
myfieldid = thenameofyourfield['T'].decode('utf-8')
print("The field id {0} is on page {1}".format(myfieldid, idtopg[myfieldid]))
I was able to get the page number for the fields by doing the following:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

fp = open(PdfUtility.resource_path(filename), 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
kids = resolve1(doc.catalog['Pages'])['Kids']

page = 0
field_list = []
for kid in kids:
    page += 1
    kid_fields = resolve1(resolve1(kid)['Annots'])
    for i in kid_fields:
        field_dict = {}
        field = resolve1(i)
        name, position = field.get('T'), field.get('Rect')
        if name:
            field_dict['name'] = name.decode('utf-8')
            field_dict['page'] = page
            field_dict['position'] = position
            print(field_dict)
            field_list.append(field_dict)
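As a further hedged sketch building on the 'P' entry mentioned in the question: pdfminer exposes each page object's id as PDFPage.pageid, and a PDFObjRef carries an objid, so the two can be matched directly (this assumes a flat AcroForm field list with no nested Kids):

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import resolve1

with open('sample.pdf', 'rb') as fp:
    doc = PDFDocument(PDFParser(fp))
    # Map each page object id to its 1-based page number.
    pageno_by_objid = {
        page.pageid: idx
        for idx, page in enumerate(PDFPage.create_pages(doc), start=1)
    }
    fields = resolve1(resolve1(doc.catalog['AcroForm'])['Fields'])
    for ref in fields:
        field = resolve1(ref)
        name = field.get('T')
        page_ref = field.get('P')  # optional reference to the owning page
        if name and page_ref is not None:
            print(name.decode('utf-8'), '-> page', pageno_by_objid.get(page_ref.objid))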

Scrapy for loop returns only the first element every time

I've tried to crawl a page with many "article" elements, but the loop returns only the first element every time (as many times as there are article elements).
for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    product['link'] = offer.xpath("//a[@class='cept-dealBtn boxAlign-jc--all-c space--h-3 width--all-12 btn btn--mode-primary']/@href").get()
    product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title']/a[@class='thread-title--list']/text()").get()
    #product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/div[@class='overflow--fade']/a[@class='cept-merchant-link text--color-greyShade size--all-s']/span[@class='cept-merchant-link-term overflow--wrap-off']/span[@class='text--b text--color-brandPrimary cept-merchant-name']/text()").get()
    product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    #product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product
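One Scrapy behaviour worth knowing in this situation: an XPath that starts with // searches the whole document even when it is called on a sub-selector, so every iteration can end up matching the first article on the page. A minimal sketch of the relative form, with the class predicates simplified for illustration:

for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    # ".//" restricts the search to the current <article> node instead of the whole page.
    product['link'] = offer.xpath(".//a[contains(@class, 'cept-dealBtn')]/@href").get()
    product['offer'] = offer.xpath(".//strong[@class='thread-title']/a/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product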

Speed up parser: HTML into Database

I need to insert all HTML tags and their attributes into a database.
from bs4 import BeautifulSoup

el.driver.get(url_page)
txthtml = el.driver.page_source
soup = BeautifulSoup(txthtml, "html.parser")
body = soup.find('html')
html_parse(body, el, url_page_id, 0, 0, 0, url_page)
def html_parse(html, el, url_page_id, level, i, parent_id, url_page):
    txt = ""
    if len(html.text) > 0:
        txt = html.text.replace("\n", "").replace("\t", "").replace("\r", "")
    ta = tag_list()
    ta.p_id = el.id
    ta.page_id = url_page_id
    ta.level = level
    ta.number = i
    ta.txt = txt
    ta.name = html.name
    ta.parent_id = parent_id
    ta.html = str(html)
    ta.save()
    insert_attr(html, el.id, url_page_id, ta.id, url_page)
    children = list(html.children)
    j = 0
    for child in children:
        if child.name is None:
            continue
        j = j + 1
        html_parse(child, el, url_page_id, level + 1, j, ta.id, url_page)
html_parse is a recursive function, where:
html - the current HTML object
el - the driver class
url_page_id - the id of the page
level - the level in the DOM
i - the child number
parent_id - the id of the parent
url_page - the current URL
tag_list - the model used to insert the current tag
insert_attr - inserts the tag's attributes into the database
Each individual html_parse call runs fast, but parsing a full, large HTML page takes about 4-5 minutes.
How can I speed up the code?
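One approach that usually helps with this shape of problem, sketched under the assumption that ta.save() issues a separate database round-trip for every tag: walk the tree once, collect the rows in memory, and write them in a single batched statement inside one transaction. The sqlite3 table and the collect_tags/insert_tags helpers below are illustrative, not part of the original code:

import sqlite3

def collect_tags(node, url_page_id, level, number, parent_key, rows, counter):
    """Walk the soup tree once, appending one row tuple per tag instead of saving immediately."""
    key = counter[0]
    counter[0] += 1
    txt = node.text.replace("\n", "").replace("\t", "").replace("\r", "")
    rows.append((key, url_page_id, level, number, node.name, txt, str(node), parent_key))
    j = 0
    for child in node.children:
        if child.name is None:
            continue
        j += 1
        collect_tags(child, url_page_id, level + 1, j, key, rows, counter)

def insert_tags(conn, rows):
    # One executemany call inside one transaction replaces thousands of single INSERTs.
    with conn:
        conn.executemany(
            "INSERT INTO tag_list (id, page_id, level, number, name, txt, html, parent_id) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            rows,
        )

rows = []
collect_tags(body, url_page_id, 0, 0, 0, rows, counter=[1])
insert_tags(sqlite3.connect("tags.db"), rows)

Note also that str(html) stores the full HTML of every subtree at every nesting level, which grows roughly quadratically with page size; storing it only for the root node, or skipping it, is often where most of the time is saved.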
