Store, manipulate and retrieve content of docx files, retaining formatting - python

So I need a way to retrieve the content of docx files (text, images, formatting), store it, and then generate a new docx with the content of some of the files stitched together.
My current approach is to extract the <body> from the underlying document.xml, store that in a Pandas DF, and modify the content of a template docx with data from that DF before generating a new docx.
Storing the body of the files in a Pandas DF seems easy enough:
def get_word_xml(docx_filename):
    with open(docx_filename, 'rb') as f:
        zip = zipfile.ZipFile(f)
        xml_content = zip.read('word/document.xml')
    return zip, xml_content

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

df = pd.DataFrame(columns=['Name', 'Text'])
for root, dirs, files in os.walk("./docs", topdown=False):
    for name in files:
        zip, wordxml = get_word_xml(os.path.join(root, name).replace("\\", "/"))
        wordxml = get_xml_tree(wordxml)
        wordxml = etree.tostring(wordxml, pretty_print=True)
        body = re.search("(?<=<w:body>)(.*)(?=</w:body>)", str(wordxml)).group(1)
        df = df.append({'Name': name.split('.')[0], 'Text': body}, ignore_index=True)
The actual problem I'm facing, however, is that generating a docx file leads to a corrupted file. I tried opening a file, extracting the contents (without even manipulating the data at this point), and generating a new file with the same content (basically a copy):
with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)
    etree.fromstring(xml_content)

with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
    xmlstr = str(xml_content)
    f.write(str(xmlstr))

filenames = zip.namelist()
zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)

shutil.rmtree(tmp_dir)
I'm not even sure if this is the right approach for this task, but I used this as a reference.

There are several problems with your code:
etree.fromstring(xml_content)
This doesn't assign the XML Element created from xml_content to anything.
xmlstr = str(xml_content)
f.write(str(xmlstr))
First, you have an extra str conversion. Second, calling str() on the raw bytes produces a "b'...'" literal rather than the decoded XML; the correct way to convert the XML back to a string is via the etree tostring() method.
Try the following code - on my (Linux) system, the generated output.docx opens in LibreOffice Writer without problems. (BTW, next time please include complete code, including imports.)
#!/usr/bin/python3
import zipfile
import tempfile
import xml.etree.ElementTree as etree
import os.path
import shutil

with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)
    xml = etree.fromstring(xml_content)

with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
    xmlstr = etree.tostring(xml, encoding="unicode", xml_declaration=True)
    f.write(xmlstr)

filenames = zip.namelist()
zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)

shutil.rmtree(tmp_dir)

Related

Getting "Xref table not zero-indexed. ID numbers for objects will be corrected" warning

I have the following code (comments explain what is occurring):
import os
from PyPDF2 import PdfFileReader

# Path to the directory containing the PDF files
pdf_dir = '/path/to/pdf/files'

# Iterate over the files in the directory
for filename in os.listdir(pdf_dir):
    # Check if the file is a PDF file
    if filename.endswith('.pdf'):
        # Construct the full path to the file
        filepath = os.path.join(pdf_dir, filename)
        # Open the PDF file and read its contents
        with open(filepath, 'rb') as f:
            pdf = PdfFileReader(f)
            # Extract the text from the PDF file
            text = ''
            for page in pdf.pages:
                text += page.extractText()
        # Construct the name of the output text file
        txt_filename = filename[:-4] + '.txt'
        # Write the text to the output file
        with open(txt_filename, 'w') as f:
            f.write(text)
When I run the code, it produces an "Xref table not zero-indexed. ID numbers for objects will be corrected" warning. It is not a hard error, but it makes me wonder if there's a different way I should be doing this.
Thanks for any suggestions.
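For what it's worth, here is a minimal sketch of the same loop using the maintained pypdf package instead of PyPDF2 (PdfReader and extract_text() are pypdf's documented replacements for the deprecated names; this assumes pypdf is installed):

import os
from pypdf import PdfReader

pdf_dir = '/path/to/pdf/files'

for filename in os.listdir(pdf_dir):
    if filename.endswith('.pdf'):
        # PdfReader accepts a file path directly, no explicit open() needed
        reader = PdfReader(os.path.join(pdf_dir, filename))
        # extract_text() is the successor of the deprecated extractText()
        text = ''.join(page.extract_text() for page in reader.pages)
        with open(filename[:-4] + '.txt', 'w') as f:
            f.write(text)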

I need an epub to text solution in Python

I need to get text from an epub
from epub_conversion.utils import open_book, convert_epub_to_lines
f = open("demofile.txt", "a")
book = open_book("razvansividra.epub")
lines = convert_epub_to_lines(book)
I use this, but print(lines) prints only one line. And the library is six years old. Do you guys know a good way?
What about https://github.com/aerkalov/ebooklib
EbookLib is a Python library for managing EPUB2/EPUB3 and Kindle
files. It's capable of reading and writing EPUB files programmatically
(Kindle support is under development).
The API is designed to be as simple as possible, while at the same
time making complex things possible too. It has support for covers,
table of contents, spine, guide, metadata, etc.
import ebooklib
from ebooklib import epub

book = epub.read_epub('test.epub')

for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    print(doc)
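The items are (X)HTML documents, so to get plain text you still need to strip the markup. A sketch of one way to do that with BeautifulSoup (get_content() is ebooklib's accessor for an item's raw HTML; BeautifulSoup is an extra dependency assumed here):

from bs4 import BeautifulSoup

for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    # get_content() returns the item's raw (X)HTML as bytes
    soup = BeautifulSoup(doc.get_content(), 'html.parser')
    # get_text() collapses the document to its text nodes
    print(soup.get_text())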
convert_epub_to_lines returns an iterator over the lines, which you need to consume one by one.
Alternatively, you can get all lines at once with convert; see the documentation of the library:
https://pypi.org/project/epub-conversion/
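For example, sticking to the question's own snippet, iterating the generator might look like this (a sketch; whether the yielded lines carry trailing newlines isn't stated here, hence the explicit newline):

from epub_conversion.utils import open_book, convert_epub_to_lines

book = open_book("razvansividra.epub")
with open("demofile.txt", "a") as f:
    # the iterator yields lines lazily, so loop over it instead of printing it
    for line in convert_epub_to_lines(book):
        f.write(line + "\n")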
EbookLib has the drawback of modifying your EPUB metadata, so if you want the original file with maybe only a few things changed, you can simply unpack the EPUB into a directory and parse it with BeautifulSoup:
from os import path, listdir
from zipfile import ZipFile
from bs4 import BeautifulSoup

with ZipFile(FILE_NAME, "r") as zip_ref:
    zip_ref.extractall(extract_dir)

for filename in listdir(extract_dir):
    if filename.endswith(".xhtml"):
        print(filename)
        with open(path.join(extract_dir, filename), "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "lxml")
            for text_object in soup.find_all(text=True):
                print(text_object)
Here is a sloppy script that extracts the text from an .epub in the right order. Improvements could be made.
Quick explanation:
Takes input (epub) and output (txt) file paths as first and second arguments
Extracts the epub content into a temporary directory
Parses the 'content.opf' file for the xhtml content and its order
Extracts text from each xhtml
Dependency: lxml
#!/usr/bin/python3
import shutil, os, sys, zipfile, tempfile
from lxml import etree

if len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <input.epub> <output.txt>")
    exit(1)

inputFilePath = sys.argv[1]
outputFilePath = sys.argv[2]

print(f"Input: {inputFilePath}")
print(f"Output: {outputFilePath}")

with tempfile.TemporaryDirectory() as tmpDir:
    print(f"Extracting input to temp directory '{tmpDir}'.")
    with zipfile.ZipFile(inputFilePath, 'r') as zip_ref:
        zip_ref.extractall(tmpDir)
    with open(outputFilePath, "w") as outFile:
        print("Parsing 'container.xml' file.")
        containerFilePath = f"{tmpDir}/META-INF/container.xml"
        tree = etree.parse(containerFilePath)
        for rootFilePath in tree.xpath("//*[local-name()='container']"
                                       "/*[local-name()='rootfiles']"
                                       "/*[local-name()='rootfile']"
                                       "/@full-path"):
            print(f"Parsing '{rootFilePath}' file.")
            contentFilePath = f"{tmpDir}/{rootFilePath}"
            contentFileDirPath = os.path.dirname(contentFilePath)
            tree = etree.parse(contentFilePath)
            for idref in tree.xpath("//*[local-name()='package']"
                                    "/*[local-name()='spine']"
                                    "/*[local-name()='itemref']"
                                    "/@idref"):
                for href in tree.xpath(f"//*[local-name()='package']"
                                       f"/*[local-name()='manifest']"
                                       f"/*[local-name()='item'][@id='{idref}']"
                                       f"/@href"):
                    outFile.write("\n")
                    xhtmlFilePath = f"{contentFileDirPath}/{href}"
                    subtree = etree.parse(xhtmlFilePath, etree.HTMLParser())
                    for ptag in subtree.xpath("//html/body/*"):
                        for text in ptag.itertext():
                            outFile.write(f"{text}")
                        outFile.write("\n")

print(f"Text written to '{outputFilePath}'.")

Converting all XML files to JSON files in a folder in python?

I have a folder with thousands of XML files. I would like to read each XML file and create a JSON file for it without deleting the XML file.
For example: if I have a file named abc.xml, I want it converted to JSON and stored as abc.json, so the folder contains both abc.xml and abc.json; the same should happen for every file.
Currently, I am using the code chunk below to convert the XML to JSON, but the issue is that it does not create a new JSON file.
for filename in os.listdir(path):
    if not filename.endswith('.xml'): continue
    fullname = os.path.join(path, filename)
    with open(fullname, 'r') as f:
        xmlString = f.read()
    jsonString = json.dumps(xmltodict.parse(xmlString), indent=4)
    with open(fullname, 'w') as x:
        x.write(jsonString)
Any kind of relevant suggestions would be really appreciated.
fullname is the name of your input XML file. You want to change it to the same name but with a .json extension, otherwise you'll create a file with an .xml extension but JSON data (destroying your original XML file; not that I care about XML much, but...):
with open(fullname, 'r') as f:
    xmlString = f.read()

json_output = os.path.splitext(fullname)[0] + ".json"
Now write your file directly (no need to create a string first):
with open(json_output, 'w') as f:
    json.dump(xmltodict.parse(xmlString), f, indent=4)
Note that you could use glob.glob directly, like this, so you get the full, XML-filtered path:
for fullname in glob.glob(os.path.join(path, "*.xml")):
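Put together, the whole loop might look like this (a sketch under the question's own assumptions: xmltodict installed and path pointing at the folder of XML files):

import glob
import json
import os
import xmltodict

for fullname in glob.glob(os.path.join(path, "*.xml")):
    with open(fullname, 'r') as f:
        xmlString = f.read()
    # write abc.json next to abc.xml, leaving the XML file untouched
    json_output = os.path.splitext(fullname)[0] + ".json"
    with open(json_output, 'w') as f:
        json.dump(xmltodict.parse(xmlString), f, indent=4)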
This should work.
import json
import os
import xmltodict

for filename in os.listdir(path):
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)
    with open(fullname, 'r') as f:
        xmlString = f.read()
    jsonString = json.dumps(xmltodict.parse(xmlString), indent=4)
    with open(fullname[:-4] + ".json", 'w') as f:
        f.write(jsonString)

Extract zip to memory, parse contents

I want to read the contents of a zip file into memory rather than extracting them to disc, find a particular file in the archive, open the file and extract a line from it.
Can a StringIO instance be opened and parsed? Suggestions? Thanks in advance.
zfile = ZipFile('name.zip', 'r')
for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*_readme.xml'):
        name = StringIO.StringIO()
        print name  # prints StringIO instances
        open(name, 'r')  # IOError: No such file or directory...
I found a few similar posts, but none that seem to address this issue: Extracting a zipfile to memory?
IMO just using read is enough:
zfile = ZipFile('name.zip', 'r')
files = []
for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*_readme.xml'):
        files.append(zfile.read(name))
This will make a list with the contents of the files that match the pattern. You can then parse the contents afterwards by iterating through the list:
for file in files:
    print(file[0:min(35, len(file))].decode())  # "parsing"
Or better, use a functor:
import zipfile as zip
import os
import fnmatch

zip_name = os.sys.argv[1]
zfile = zip.ZipFile(zip_name, 'r')

def parse(contents, member_name=""):
    if len(member_name) > 0:
        print("Parsed `{}`:".format(member_name))
    print(contents[0:min(35, len(contents))].decode())  # "parsing"

for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*.cpp'):
        parse(zfile.read(name), name)
This way no data is kept in memory for no reason and the memory footprint is smaller, which might be important if the files are big.
Don't overthink it. It Just Works:
import zipfile

# 1) I want to read the contents of a zip file ...
with zipfile.ZipFile('A-Zip-File.zip') as zipper:
    # 2) ... find a particular file in the archive, open the file ...
    with zipper.open('A-Particular-File.txt') as fp:
        # 3) ... and extract a line from it.
        first_line = fp.readline()
        print(first_line)
The question you link shows that you need to read the file. Depending on your use case, that may already be enough. In your code you replace the loop variable holding a filename with an empty string buffer. Try something like this:
zfile = ZipFile('name.zip', 'r')
for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*_readme.xml'):
        ex_file = zfile.open(name)  # this is a file-like object
        content = ex_file.read()    # now the file contents are a single string
If you really want a buffer that you can manipulate, then simply instantiate it with the contents:
buf = StringIO(zfile.open(name).read())
You may also want to look at BytesIO and note that there are differences between Python 2 and 3.
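For instance, on Python 3 the archive members come back as bytes, so a byte-oriented buffer is the natural fit (a minimal sketch reusing the question's archive name and pattern):

import fnmatch
import io
from zipfile import ZipFile

zfile = ZipFile('name.zip', 'r')
for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*_readme.xml'):
        # zfile.read() returns bytes, so wrap them in BytesIO rather than StringIO
        buf = io.BytesIO(zfile.read(name))
        first_line = buf.readline()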
Thank you to everyone who contributed solutions. This is what ended up working for me:
zfile = ZipFile('name.zip', 'r')
for name in zfile.namelist():
    if fnmatch.fnmatch(name, '*_readme.xml'):
        zopen = zfile.open(name)
        for line in zopen:
            # the archive yields bytes in Python 3, hence the bytes pattern
            if re.match(b'(.*)<foo>(.*)</foo>(.*)', line):
                print(line)

EOF marker not found - How to fix in PyPDF and PyPDF2?

I'm attempting to combine a few PDF files into a single PDF file using Python. I've tried both PyPDF and PyPDF2 - on some files, they both throw this same error:
PdfReadError: EOF marker not found
Here's my code (the list holds the PDF file paths to combine):
from PyPDF2 import PdfReader, PdfWriter

writer = PdfWriter()
for path in ["example1.pdf", "example2.pdf"]:
    reader = PdfReader(path)
    for page in reader.pages:
        writer.add_page(page)

with open("out.pdf", "wb") as fp:
    writer.write(fp)
I've read a few StackOverflow threads on the topic, but none contain a solution that works. If you've successfully combined PDF files using Python, I'd love to hear how.
You were running into an issue in PyPDF2 which was solved with PR #321. The fix was released in PyPDF2==1.27.8 (on 2022-04-21).
In case someone is still looking to merge a "list" of PDFs:
Note:
Use glob to get the correct file list. <- this will really save your day ^^
Check this out: glob module reference
from PyPDF2 import PdfFileMerger
import glob

class MergeAllPDF:
    def __init__(self):
        self.mergelist = []

    def create(self, filepath, outpath, outfilename):
        self.outfilename = outfilename
        self.filepath = filepath
        self.outpath = outpath
        self.pdfs = glob.glob(self.filepath)
        self.myrange = len(self.pdfs)

        for _ in range(self.myrange):
            if self.pdfs:
                self.mergelist.append(self.pdfs.pop(0))
        self.merge()

    def merge(self):
        if self.mergelist:
            self.merger = PdfFileMerger()
            for pdf in self.mergelist:
                self.merger.append(open(pdf, 'rb'))
            self.merger.write(self.outpath + "%s.pdf" % (self.outfilename))
            self.merger.close()
            self.mergelist = []
        else:
            print("mergelist is empty, please check your input path")

# example how to use
# update your paths here:
inpath = r"C:\Users\Fabian\Desktop\mergeallpdfs\scan\*.pdf"  # here your single-page pdfs are stored
outpath = r"C:\Users\Fabian\Desktop\mergeallpdfs\output\\"   # here your merged pdf will be stored

b = MergeAllPDF()
b.create(inpath, outpath, "mergedpdf")
