I need an EPUB-to-text solution in Python

I need to get text from an epub
from epub_conversion.utils import open_book, convert_epub_to_lines
f = open("demofile.txt", "a")
book = open_book("razvansividra.epub")
lines = convert_epub_to_lines(book)
I use this, but print(lines) only prints one line, and the library is six years old. Does anyone know a good way to do this?

What about https://github.com/aerkalov/ebooklib
EbookLib is a Python library for managing EPUB2/EPUB3 and Kindle
files. It's capable of reading and writing EPUB files programmatically
(Kindle support is under development).
The API is designed to be as simple as possible, while at the same
time making complex things possible too. It has support for covers,
table of contents, spine, guide, metadata, and more.
import ebooklib
from ebooklib import epub
book = epub.read_epub('test.epub')
for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    print(doc)
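If what you need is the plain text rather than the item objects, a rough sketch of stripping the markup with BeautifulSoup might look like this (BeautifulSoup is an extra dependency I am assuming here; get_content() returns each item's XHTML as bytes):
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

book = epub.read_epub('test.epub')
for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    # get_content() returns the item's XHTML as bytes; strip the tags to get plain text
    soup = BeautifulSoup(doc.get_content(), 'html.parser')
    print(soup.get_text())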

convert_epub_to_lines returns an iterator over the lines, so you need to iterate over it to get them one by one.
Alternatively, you can get all the lines at once with convert; see the library's documentation:
https://pypi.org/project/epub-conversion/
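For example, a minimal sketch based on the snippet in the question, writing every line to the text file by looping over the iterator (I am assuming each yielded line is a string):
from epub_conversion.utils import open_book, convert_epub_to_lines

book = open_book("razvansividra.epub")
with open("demofile.txt", "a") as f:
    # convert_epub_to_lines yields lines lazily, so iterate instead of printing the iterator itself
    for line in convert_epub_to_lines(book):
        f.write(line + "\n")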

EbookLib has the problem of modifying your EPUB metadata, so if you want to keep the original file (with maybe only a few things changed), you can simply unpack the EPUB, which is a ZIP archive, into a directory and parse it with BeautifulSoup:
from os import path, listdir
from zipfile import ZipFile
from bs4 import BeautifulSoup

with ZipFile(FILE_NAME, "r") as zip_ref:
    zip_ref.extractall(extract_dir)

for filename in listdir(extract_dir):
    if filename.endswith(".xhtml"):
        print(filename)
        with open(path.join(extract_dir, filename), "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "lxml")
            for text_object in soup.find_all(text=True):
                print(text_object)

Here is a quick-and-dirty script that extracts the text from an .epub in the right order. Improvements could be made.
Quick explanation:
Takes the input (.epub) and output (.txt) file paths as the first and second arguments
Extracts the EPUB content into a temporary directory
Parses the 'content.opf' file for the XHTML content files and their order
Extracts the text from each XHTML file
Dependency: lxml
#!/usr/bin/python3
import shutil, os, sys, zipfile, tempfile
from lxml import etree

if len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <input.epub> <output.txt>")
    exit(1)

inputFilePath = sys.argv[1]
outputFilePath = sys.argv[2]

print(f"Input: {inputFilePath}")
print(f"Output: {outputFilePath}")

with tempfile.TemporaryDirectory() as tmpDir:
    print(f"Extracting input to temp directory '{tmpDir}'.")
    with zipfile.ZipFile(inputFilePath, 'r') as zip_ref:
        zip_ref.extractall(tmpDir)

    with open(outputFilePath, "w") as outFile:
        print("Parsing 'container.xml' file.")
        containerFilePath = f"{tmpDir}/META-INF/container.xml"
        tree = etree.parse(containerFilePath)
        # container.xml points at one or more OPF root files
        for rootFilePath in tree.xpath("//*[local-name()='container']"
                                       "/*[local-name()='rootfiles']"
                                       "/*[local-name()='rootfile']"
                                       "/@full-path"):
            print(f"Parsing '{rootFilePath}' file.")
            contentFilePath = f"{tmpDir}/{rootFilePath}"
            contentFileDirPath = os.path.dirname(contentFilePath)
            tree = etree.parse(contentFilePath)
            # the spine gives the reading order; each itemref points into the manifest
            for idref in tree.xpath("//*[local-name()='package']"
                                    "/*[local-name()='spine']"
                                    "/*[local-name()='itemref']"
                                    "/@idref"):
                for href in tree.xpath("//*[local-name()='package']"
                                       "/*[local-name()='manifest']"
                                       f"/*[local-name()='item'][@id='{idref}']"
                                       "/@href"):
                    outFile.write("\n")
                    xhtmlFilePath = f"{contentFileDirPath}/{href}"
                    subtree = etree.parse(xhtmlFilePath, etree.HTMLParser())
                    for ptag in subtree.xpath("//html/body/*"):
                        for text in ptag.itertext():
                            outFile.write(f"{text}")
                        outFile.write("\n")

print(f"Text written to '{outputFilePath}'.")

Related

Store, manipulate and retrieve content of docx files, retaining formatting

So I need a way to retrieve the content of docx files (text, images, formatting), store it, and then generate a new docx with the content of some of the files stitched together.
My current approach is that I extract the <body> from the underlying document.xml, store that in a Pandas DataFrame, and modify the content of a template docx with data from that DataFrame before generating a new docx.
Storing the body of the files in a Pandas DF seems easy enough:
def get_word_xml(docx_filename):
    with open(docx_filename, 'rb') as f:
        zip = zipfile.ZipFile(f)
        xml_content = zip.read('word/document.xml')
    return zip, tmp_dir, xml_content

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

df = pd.DataFrame(columns=['Name', 'Text'])
for root, dirs, files in os.walk("./docs", topdown=False):
    for name in files:
        zip, tmp_dir, wordxml = get_word_xml(os.path.join(root, name).replace("\\", "/"))
        wordxml = get_xml_tree(wordxml)
        wordxml = etree.tostring(wordxml, pretty_print=True)
        body = re.search("(?<=<w:body>)(.*)(?=<\/w:body>)", str(wordxml)).group(1)
        df = df.append({'Name': name.split('.')[0], 'Text': body}, ignore_index=True)
The actual problem I'm facing, however, is that generating a docx file leads to a corrupted file. I tried opening a file, extracting the contents (not even manipulating the data at this point) and generating a new file with the same content (basically a copy):
with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)

etree.fromstring(xml_content)

with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
    xmlstr = str(xml_content)
    f.write(str(xmlstr))

filenames = zip.namelist()
zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)

shutil.rmtree(tmp_dir)
I'm not even sure if this is the right approach for this task, but I used this as reference.
There are several problems with your code:
etree.fromstring(xml_content)
This doesn't assign the XML Element created from xml_content to anything.
xmlstr = str(xml_content)
f.write(str(xmlstr))
First, you have an extra str conversion. Second, the correct way to convert the XML back to string is via the etree tostring() method.
Try the following code - on my (linux) system, the generated output.docx opens in LibreOffice Writer without problem. (BTW, next time please include complete code, including imports.)
#! /usr/bin/python3
import zipfile
import tempfile
import xml.etree.ElementTree as etree
import os.path
import shutil

with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)

xml = etree.fromstring(xml_content)

with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
    xmlstr = etree.tostring(xml, encoding="unicode", xml_declaration=True)
    f.write(xmlstr)

filenames = zip.namelist()
zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)

shutil.rmtree(tmp_dir)

Convert PDF to .docx with Python

I'm trying very hard to find a way to convert a PDF file to a .docx file with Python.
I have seen other posts related to this, but none of them seem to work correctly in my case.
Specifically, I'm using:
import os
import subprocess

for top, dirs, files in os.walk('/my/pdf/folder'):
    for filename in files:
        if filename.endswith('.pdf'):
            abspath = os.path.join(top, filename)
            subprocess.call('lowriter --invisible --convert-to doc "{}"'
                            .format(abspath), shell=True)
This gives me an output of [1], but then I can't find any .docx document in my folder.
I have LibreOffice 5.3 installed.
Any clues about it?
Thank you in advance!
I am not aware of a way to convert a PDF file into a Word file using LibreOffice.
However, you can convert a PDF to HTML and then convert the HTML to a docx.
Firstly, get the commands running on the command line. (The following is on Linux, so you may have to fill in the path to the soffice binary and use a full path for the input file on your OS.)
soffice --convert-to html ./my_pdf_file.pdf
then
soffice --convert-to docx:'MS Word 2007 XML' ./my_pdf_file.html
You should end up with:
my_pdf_file.pdf
my_pdf_file.html
my_pdf_file.docx
Now wrap the commands in your subprocess code.
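For instance, a rough sketch of the two-step conversion wrapped in subprocess (the file names and output directory here are assumptions; adjust them to your setup):
import subprocess

pdf_file = '/my/pdf/folder/my_pdf_file.pdf'
out_dir = '/my/pdf/folder'

# Step 1: PDF -> HTML
subprocess.call(['soffice', '--convert-to', 'html', '--outdir', out_dir, pdf_file])

# Step 2: HTML -> DOCX (soffice names the intermediate file after the input)
subprocess.call(['soffice', '--convert-to', 'docx:MS Word 2007 XML',
                 '--outdir', out_dir, '/my/pdf/folder/my_pdf_file.html'])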
I use this for multiple files:
from pdf2docx import Converter
import os

# # # dir_path for input reading and output files & a for loop # # #
path_input = '/pdftodocx/input/'
path_output = '/pdftodocx/output/'

for file in os.listdir(path_input):
    cv = Converter(path_input + file)
    cv.convert(path_output + file + '.docx', start=0, end=None)
    cv.close()
    print(file)
The code below worked for me.
import win32com.client
word = win32com.client.Dispatch("Word.Application")
word.visible = 1
pdfdoc = 'NewDoc.pdf'
todocx = 'NewDoc.docx'
wb1 = word.Documents.Open(pdfdoc)
wb1.SaveAs(todocx, FileFormat=16) # file format for docx
wb1.Close()
word.Quit()
My approach does not follow the same methodology of calling an external converter. However, it does the job of reading through all the pages of a PDF document and moving their text into a docx file. Note: it only works with text; images and other objects are usually ignored.
# Description: this Python script will allow you to fetch text information from a pdf file

# import libraries
import PyPDF2
import os
import docx

mydoc = docx.Document()  # target Word document
pdfFileObj = open('pdf/filename.pdf', 'rb')  # pdf file location
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # define pdf reader object

# Loop through all the pages (page numbers are zero-indexed)
for pageNum in range(pdfReader.numPages):
    pageObj = pdfReader.getPage(pageNum)
    pdfContent = pageObj.extractText()  # extracts the text content from the page
    print(pdfContent)  # print statement to test output in the terminal; optional
    mydoc.add_paragraph(pdfContent)  # add the content to the Word document

mydoc.save("pdf/filename.docx")  # give a name to your output file
I have successfully done this with pdf2docx:
from pdf2docx import parse
pdf_file = "test.pdf"
word_file = "test.docx"
parse(pdf_file, word_file, start=0, end=None)

Parsing the file name from a list of URL links

OK, so I am using a script that downloads files from URLs listed in urls.txt.
import urllib.request

with open("urls.txt", "r") as file:
    linkList = file.readlines()

for link in linkList:
    urllib.request.urlretrieve(link)
Unfortunately, they are saved as temporary files due to the lack of a second argument in my urllib.request.urlretrieve call. As there are thousands of links in my text file, naming them one by one is not an option. The thing is that the name of the file is contained in those links, i.e. /DocumentXML2XLSDownload.vm?firsttime=true&repengback=true&documentId=XXXXXX&xslFileName=rher2xml.xsl&outputFileName=XXXX_2017_06_25_4.xls where the name of the file comes after outputFileName=
Is there an easy way to parse the file names and then use them as the second argument to urllib.request.urlretrieve? I was thinking of extracting those names in Excel and placing them in another text file that would be read in a similar fashion to urls.txt, but I'm not sure how to implement it in Python. Or is there a way to do it exclusively in Python, without using Excel?
You could parse the links as you go.
Example using a regular expression:
import re
import urllib.request

with open("urls.txt", "r") as file:
    linkList = file.readlines()

for link in linkList:
    regexp = '((?<=\?outputFileName=)|(?<=\&outputFileName=))[^&]+'
    match = re.search(regexp, link.rstrip())
    if match is None:
        # Make the user aware that something went wrong, e.g. raise an exception
        # and/or just print something
        print("WARNING: Couldn't find file name in link [" + link + "]. Skipping...")
    else:
        file_name = match.group(0)
        urllib.request.urlretrieve(link, file_name)
You can use urlparse and parse_qs to get the query string:
from urllib.parse import urlparse, parse_qs

parse = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html?name=Python&version=2')
print(parse_qs(parse.query)['name'][0])  # prints Python
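Applied to the links from the question, a minimal sketch might look like this (it assumes every link actually carries an outputFileName parameter):
import urllib.request
from urllib.parse import urlparse, parse_qs

with open("urls.txt", "r") as file:
    for link in file:
        link = link.strip()
        # pull the outputFileName value out of the query string
        query = parse_qs(urlparse(link).query)
        file_name = query['outputFileName'][0]
        urllib.request.urlretrieve(link, file_name)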

Issue locating files due to foreign characters

It's my first time writing here, so I hope I'm doing everything right.
I'm using Python 3.5 on Windows 10, and I'm trying to "sync" music from iTunes to my Android device. Basically, I'm reading the iTunes Library XML file and getting all the file locations (so I can copy/paste them onto my phone), but I have problems with songs containing foreign characters.
import getpass
import re
import os
from urllib.parse import unquote

user = getpass.getuser()
ITUNES_LIB_PATH = "C:\\Users\\%s\\Music\\Itunes\\iTunes Music Library.xml" % user
ITUNES_SONGS_FILE = "ya.txt"

def write(file, what, newline=True):
    with open(file, 'a', encoding="utf8") as f:
        if not os.path.isfile(what):
            print("Issue locating file %s\n" % what)
        if newline:
            what+"\n"
        f.write(what)

def get_songs(file=ITUNES_LIB_PATH):
    with open(file, 'r', encoding="utf8") as f:
        f = f.read()
        songs_location = re.findall("<key>Location</key><string>file://localhost/(.*?)</string>", f)
        for song in songs_location:
            song = unquote(song.replace("/", '\\'))
            write(ITUNES_SONGS_FILE, song)

get_songs()
Output:
Issue locating file C:\Users\Dymy\Desktop\Media\Norin &amp; Rad - Bird Is The Word.mp3
How should I handle that "&" in the file name?
There are a couple of related issues in your code, e.g., unescaped XML character references and hardcoded character encodings, caused by using regular expressions to parse XML. To fix them, use an XML parser such as xml.etree.ElementTree, or use a more specific library such as pyitunes (I haven't tried it).
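As a rough sketch of the parser-based route: the iTunes library file is a plist, so the standard-library plistlib (which sits on top of an XML parser and decodes entities such as &amp; for you) can read it directly. This assumes the usual layout with a top-level 'Tracks' dictionary and reuses ITUNES_LIB_PATH from the question:
import plistlib
from urllib.parse import unquote, urlparse

with open(ITUNES_LIB_PATH, 'rb') as f:
    library = plistlib.load(f)

for track in library.get('Tracks', {}).values():
    location = track.get('Location')  # e.g. file://localhost/C:/Users/Dymy/...
    if location:
        # decode the percent-escapes and turn the URL path into a Windows path
        path = unquote(urlparse(location).path).lstrip('/')
        print(path.replace('/', '\\'))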

Parse each file in a directory with BeautifulSoup/Python, save out as new file

New to Python & BeautifulSoup. I have a Python program that opens a file called "example.html", runs a BeautifulSoup action on it, then runs a Bleach action on it, then saves the result as file "example-cleaned.html". So far it is working for all contents of "example.html".
I need to modify it so that it opens each file in folder "/posts/", runs the program on it, then saves it out as "/posts-cleaned/X-cleaned.html" where X is the original filename.
Here's my code, minimised:
from bs4 import BeautifulSoup
import bleach
import re
text = BeautifulSoup(open("posts/example.html"))
text.encode("utf-8")
tag_black_list = ['iframe', 'script']
tag_white_list = ['p','div']
attr_white_list = {'*': ['title']}
# Step one, with BeautifulSoup: Remove tags in tag_black_list, destroy contents.
[s.decompose() for s in text(tag_black_list)]
pretty = (text.prettify())
# Step two, with Bleach: Remove tags and attributes not in whitelists, leave tag contents.
cleaned = bleach.clean(pretty, strip="TRUE", attributes=attr_white_list, tags=tag_white_list)
fout = open("posts/example-cleaned.html", "w")
fout.write(cleaned.encode("utf-8"))
fout.close()
print "Done"
Assistance & pointers to existing solutions gladly received!
You can use os.listdir() to get a list of all files in a directory. If you want to recurse all the way down the directory tree, you'll need os.walk().
I would move all of this code that handles a single file into a function, and then write a second function to handle the whole directory. Something like this:
def clean_dir(directory):
    os.chdir(directory)
    for filename in os.listdir(directory):
        clean_file(filename)

def clean_file(filename):
    tag_black_list = ['iframe', 'script']
    tag_white_list = ['p','div']
    attr_white_list = {'*': ['title']}
    with open(filename, 'r') as fhandle:
        text = BeautifulSoup(fhandle)
        text.encode("utf-8")
        # Step one, with BeautifulSoup: Remove tags in tag_black_list, destroy contents.
        [s.decompose() for s in text(tag_black_list)]
        pretty = (text.prettify())
        # Step two, with Bleach: Remove tags and attributes not in whitelists, leave tag contents.
        cleaned = bleach.clean(pretty, strip="TRUE", attributes=attr_white_list, tags=tag_white_list)
        # this appends -cleaned to the file;
        # relies on the file having a '.'
        dot_pos = filename.rfind('.')
        cleaned_filename = '{0}-cleaned{1}'.format(filename[:dot_pos], filename[dot_pos:])
        with open(cleaned_filename, 'w') as fout:
            fout.write(cleaned.encode("utf-8"))
    print "Done"
Then you just call clean_dir('/posts') or what not.
I'm appending "-cleaned" to the files, but I think I like your idea of using a whole new directory better. That way you won't have to handle conflicts if -cleaned already exists for some file, etc.
I'm also using the with statement to open files here as it closes them and handles exceptions automatically.
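If you do need to recurse into subdirectories, as mentioned at the top of this answer, a small os.walk variant of clean_dir could look like this (just a sketch reusing the clean_file function above; the .html filter is my assumption):
def clean_tree(directory):
    # os.walk visits every subdirectory, while os.listdir only sees the top level
    for dirpath, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.html'):
                clean_file(os.path.join(dirpath, filename))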
Answer to my own question, for others who might find the Python docs for os.listdir a bit unhelpful:
from bs4 import BeautifulSoup
import bleach
import re
import os, os.path
tag_black_list = ['iframe', 'script']
tag_white_list = ['p','div']
attr_white_list = {'*': ['title']}
postlist = os.listdir("posts/")
for post in postlist:
    # HERE: you need to specify the directory again, the value of "post" is just the filename:
    text = BeautifulSoup(open("posts/"+post))
    text.encode("utf-8")
    # Step one, with BeautifulSoup: Remove tags in tag_black_list, destroy contents.
    [s.decompose() for s in text(tag_black_list)]
    pretty = (text.prettify())
    # Step two, with Bleach: Remove tags and attributes not in whitelists, leave tag contents.
    cleaned = bleach.clean(pretty, strip="TRUE", attributes=attr_white_list, tags=tag_white_list)
    fout = open("posts-cleaned/"+post, "w")
    fout.write(cleaned.encode("utf-8"))
    fout.close()
I cheated and made a separate folder called "posts-cleaned/", because saving files there was easier than splitting the filename, adding "-cleaned", and re-joining it, although if anyone wants to show me a good way to do that, that would be even better.
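For the record, a small sketch of the splitting approach with os.path.splitext (this is my suggestion, not something from the original post):
import os.path

post = "example.html"
name, ext = os.path.splitext(post)                   # ('example', '.html')
cleaned_name = "{0}-cleaned{1}".format(name, ext)    # 'example-cleaned.html'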
