I have a Microsoft .docx file which has a few words in red.
Now I want to read that file through Python code and extract those red words.
But I cannot find the APIs that should be used for it. I tried to iterate over para to access individual words, but it says para is not iterable.
I'm also not sure how to check the color of a word.
Can you please help with it?
import docx
def readtxt(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        print(para.text)
readtxt('C:\\Users\\X\\some.docx')
Regards
Try this; the function will return a list of all contiguous parts of the document which are in red.
import docx
from docx.shared import RGBColor
def readtxt(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        for run in para.runs:
            if run.font.color.rgb == RGBColor(255, 0, 0):
                fullText.append(run.text)
    return fullText
fullText = readtxt('filepath.docx')
Also, please check that you're passing the filepath correctly.
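For example, calling it with the path from the first snippet and printing what comes back:
red_parts = readtxt('C:\\Users\\X\\some.docx')
for part in red_parts:
    print(part)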
I have populated some mail merge fields in a .docx file and now I want my script to convert the saved .docx file to a .dotx file. I am using Python 3.6.
from __future__ import print_function
from mailmerge import MailMerge
from datetime import date
from docx import Document
from docx.opc.constants import CONTENT_TYPE as CT
import csv
import sys
import os
import numpy as np
import pandas as pd
# . . .
for i in range(0, numTemplates):
    theTemplateName = templateNameCol[i]
    theTemplateFileLocation = templateFileLocationCol[i]

    template = theTemplateFileLocation
    document = MailMerge(template)
    print(document.get_merge_fields())

    theOffice = officeCol[i]
    theAddress = addressCol[i]
    theSuite = suiteCol[i]
    theCity = cityCol[i]
    theState = stateCol[i]
    theZip = zipCol[i]
    thePhoneNum = phoneNumCol[i]
    theFaxNum = faxNumCol[i]

    document.merge(
        Address=theAddress
    )

    document.write(r'\Users\me\mailmergeproject\test-output' + str(i) + r'.docx')

    # do conversion here
Here at the bottom is where I want to do the conversion. As you can see, I've written a file and it's just sitting in a folder right now.
Here is the code snippet for converting the .docx file to .dotx.
You have to change the content type when saving the document:
pip install python-docx
import docx

document = docx.Document('foo.docx')
document_part = document.part
# switch the main document part's content type to the Word template type
document_part._content_type = (
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml'
)
document.save('bar.dotx')
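Tying that back to the "# do conversion here" spot in the loop above, a rough sketch of that step (not tested; indent it to sit inside the loop). It just re-opens the file that was written and saves it under a .dotx name with the same content-type string:
out_docx = r'\Users\me\mailmergeproject\test-output' + str(i) + '.docx'
out_dotx = r'\Users\me\mailmergeproject\test-output' + str(i) + '.dotx'

converted = docx.Document(out_docx)
# mark the main document part as a Word template instead of a regular document
converted.part._content_type = (
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml'
)
converted.save(out_dotx)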
I have grabbed a pdf from the web using for example
import requests
pdf = requests.get("http://www.scala-lang.org/docu/files/ScalaByExample.pdf")
I would like to modify this code to display it
from gi.repository import Poppler, Gtk
def draw(widget, surface):
    page.render(surface)
document = Poppler.Document.new_from_file("file:///home/me/some.pdf", None)
page = document.get_page(0)
window = Gtk.Window(title="Hello World")
window.connect("delete-event", Gtk.main_quit)
window.connect("draw", draw)
window.set_app_paintable(True)
window.show_all()
Gtk.main()
How do I modify the document = line to use the variable pdf that contains the pdf?
(I don't mind using popplerqt4 or anything else if that makes it easier.)
It all depends on the OS you're using. One of these usually helps:
import os
os.system('my_pdf.pdf')
or
os.startfile('path_to_pdf.pdf')
or
import webbrowser
webbrowser.open(r'file:///my_pdf.pdf')
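Since the PDF in the question only exists as a requests response, you'd first have to write it out to disk; a minimal sketch (the file name is arbitrary):
import os
import webbrowser
import requests

pdf = requests.get("http://www.scala-lang.org/docu/files/ScalaByExample.pdf")

# save the downloaded bytes to a file, since these helpers open files rather than in-memory data
with open('ScalaByExample.pdf', 'wb') as f:
    f.write(pdf.content)

# hand the saved file to whatever the system uses for PDFs
webbrowser.open('file://' + os.path.realpath('ScalaByExample.pdf'))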
How about using a temporary file?
import tempfile
import urllib
import urlparse
import requests
from gi.repository import Poppler, Gtk
pdf = requests.get("http://www.scala-lang.org/docu/files/ScalaByExample.pdf")
with tempfile.NamedTemporaryFile() as pdf_contents:
    pdf_contents.file.write(pdf.content)  # write the downloaded bytes, not the Response object
    pdf_contents.file.flush()             # make sure the data is on disk before Poppler opens it
    file_url = urlparse.urljoin(
        'file:', urllib.pathname2url(pdf_contents.name))
    document = Poppler.Document.new_from_file(file_url, None)
Try this and tell me if it works:
document = Poppler.Document.new_from_data(str(pdf.content),len(repr(pdf.content)),None)
If you want to open the PDF using Acrobat Reader, then the code below should work:
import subprocess
process = subprocess.Popen(['<here path to acrobat.exe>', '/A', 'page=1', '<here path to pdf>'], shell=False, stdout=subprocess.PIPE)
process.wait()
Since there is a library named pyPdf, you should be able to load the PDF file using that.
If you have any further questions, send me a message.
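A rough sketch of loading the downloaded data with pyPdf (note it only parses the PDF, e.g. page count and text, it does not render it on screen):
from StringIO import StringIO
from pyPdf import PdfFileReader
import requests

pdf = requests.get("http://www.scala-lang.org/docu/files/ScalaByExample.pdf")

# PdfFileReader accepts any file-like object, so wrap the downloaded bytes in a buffer
reader = PdfFileReader(StringIO(pdf.content))
print(reader.getNumPages())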
August 2015: on a fresh installation in Windows 7, the problem is still the same:
Poppler.Document.new_from_data(data, len(data), None)
returns: Type error: must be strings, not bytes.
Poppler.Document.new_from_data(str(data), len(data), None)
returns: PDF document is damaged (4).
I have been unable to use this function.
I tried to use a NamedTemporaryFile instead of a file on disk, but for an unknown reason it returns an unknown error.
So I am using a temporary file. Not the prettiest way, but it works.
Here is the test code for Python 3.4, if anyone has an idea:
from gi.repository import Poppler
import tempfile
import urllib.parse
import urllib.request
testfile = "d:/Mes Documents/en cours/PdfBooklet3/tempfiles/preview.pdf"
document = Poppler.Document.new_from_file("file:///" + testfile, None) # Works fine
page = document.get_page(0)
print(page) # OK
f1 = open(testfile, "rb")
data1 = f1.read()
f1.close()
data2 = "".join(map(chr, data1)) # converts bytes to string
print(len(data1))
document = Poppler.Document.new_from_data(data2, len(data2), None)
page = document.get_page(0) # returns None
print(page)
pdftempfile = tempfile.NamedTemporaryFile()
pdftempfile.write(data1)
file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(pdftempfile.name))
print( file_url)
pdftempfile.seek(0)
document = Poppler.Document.new_from_file(file_url, None) # unknown error
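For what it's worth, one likely reason the NamedTemporaryFile attempt fails is that the written bytes are still sitting in Python's buffer when Poppler opens the file by name; flushing first should put them on disk. A sketch reusing the names from the test above:
pdftempfile = tempfile.NamedTemporaryFile()
pdftempfile.write(data1)
pdftempfile.flush()  # force the PDF bytes onto disk before Poppler reads the file
file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(pdftempfile.name))
document = Poppler.Document.new_from_file(file_url, None)
print(document.get_page(0))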
I'm writing a script which captures data from a web site and saves it into a DB. Some of the data comes merged and I need to split it. I have something like this:
Endokrynologia (bez st.),Położnictwo i ginekologia (II st.)
So I need to get:
Endokrynologia (bez st.)
Położnictwo i ginekologia (II st.)
So I wrote some code in Python:
#!/usr/bin/env python
# -*- encoding: utf-8
import MySQLdb as mdb
from lxml import html, etree
import urllib
import sys
import re
Nr = 17268
Link = "http://rpwdl.csioz.gov.pl/rpz/druk/wyswietlKsiegaServletPub?idKsiega="
sock = urllib.urlopen(Link+str(Nr))
htmlSource = sock.read()
sock.close()
root = etree.HTML(htmlSource)
result = etree.tostring(root, pretty_print=True, method="html")
Spec = etree.XPath("string(//html/body/div/table[2]/tr[18]/td[2]/text())")
Specjalizacja = Spec(root)
if re.search(r'(,)\b', Specjalizacja):
    text = Specjalizacja.split()
    print text[0]
    print text[1]
and I get:
Endokrynologia
(bez
What am I doing wrong?
You could try replacing
text = Specjalizacja.split()
with
text = Specjalizacja.split(',')
I don't know whether that would fix your problem.
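With no argument, split() breaks the string on whitespace, which is why you get "Endokrynologia" and "(bez". Splitting on the comma keeps each specialty together:
Specjalizacja = u"Endokrynologia (bez st.),Położnictwo i ginekologia (II st.)"
text = Specjalizacja.split(',')
print text[0]  # Endokrynologia (bez st.)
print text[1]  # Położnictwo i ginekologia (II st.)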
How can I import data, for example from the field A1?
When I use etree.parse() I get an error, because I don't have an XML file.
It's a zip file:
import zipfile
from lxml import etree
z = zipfile.ZipFile('mydocument.ods')
data = z.read('content.xml')
data = etree.XML(data)
etree.dump(data)
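To pull out a specific cell such as A1, you have to walk the OpenDocument table structure inside content.xml. A minimal sketch using the standard ODF namespaces (it ignores repeated-row/cell markers, which doesn't matter for the very first cell):
import zipfile
from lxml import etree

NS = {
    'office': 'urn:oasis:names:tc:opendocument:xmlns:office:1.0',
    'table': 'urn:oasis:names:tc:opendocument:xmlns:table:1.0',
    'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
}

z = zipfile.ZipFile('mydocument.ods')
root = etree.XML(z.read('content.xml'))

# A1 is the first cell of the first row of the first table in the spreadsheet body
a1 = root.find('office:body/office:spreadsheet/table:table/'
               'table:table-row/table:table-cell', namespaces=NS)
print(a1.findtext('text:p', default='', namespaces=NS))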