Converting multiple HTML files into a PDF using pdfkit in Python

I am trying to convert multiple HTML files into a single PDF using pdfkit. This is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/jaypratappandey/')
time.sleep(40)
soup = BeautifulSoup(driver.page_source, 'lxml')

with open('tophtmlfile.html', 'w') as top:
    for name in soup.select('.pv-top-card-section__body'):
        top.write("%s" % name)

with open('htmlfile.html', 'w') as f:
    for item in soup.select('.pv-oc.ember-view'):
        f.write("%s" % item)

pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'jayprofile.pdf')
driver.quit()
This code gives the following error:
Traceback (most recent call last):
  File "lkdndata.py", line 23, in <module>
    pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'ankurprofile.pdf')
  File "/usr/local/lib/python3.5/dist-packages/pdfkit/api.py", line 49, in from_file
    return r.to_pdf(output_path)
  File "/usr/local/lib/python3.5/dist-packages/pdfkit/pdfkit.py", line 156, in to_pdf
    raise IOError('wkhtmltopdf reported an error:\n' + stderr)
OSError: wkhtmltopdf reported an error:
Error: This version of wkhtmltopdf is build against an unpatched version of QT, and does not support more then one input document.
Exit with code 1, due to unknown error.

The solution I found was to first merge the HTML files into one and then convert the merged file with pdfkit. In your case, save tophtmlfile.html and htmlfile.html in the same directory and point the path below at that directory.
import pdfkit
import os

# path to folder containing html files
path = "/home/ec2-user/data-science-processes/src/results/"

def multiple_html_to_pdf(path):
    """Converts multiple HTML files to a single PDF.

    Args:
        path: directory containing the HTML files.
    """
    empty_html = '<html><head></head><body></body></html>'
    for file in os.listdir(path):
        if file.endswith(".html"):
            print(file)
            # append each file's markup just before the closing body tag
            with open(os.path.join(path, file), 'r') as f:
                html = f.read()
            empty_html = empty_html.replace('</body></html>', html + '</body></html>')

    # save the merged html, then convert it
    with open('merged.html', 'w') as f:
        f.write(empty_html)
    pdfkit.from_file('merged.html', 'Report.pdf')

multiple_html_to_pdf(path)
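One caveat, which is my own note rather than part of the original answer: os.listdir returns entries in arbitrary order, so if the page order in the PDF matters, iterate over sorted(os.listdir(path)) instead.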

I had the same error. The error you are getting is probably due to an inconsistent Qt installation: no build of wkhtmltopdf linked against a compatible (patched) Qt is available.
Try running
wkhtmltopdf
on your terminal and see whether the output mentions "Reduced Functionality".
If it does, my assumption is correct, and your safest bet is to compile wkhtmltopdf from source.
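If you want to check this from Python before attempting a multi-document conversion, here is a minimal sketch. It assumes the wkhtmltopdf binary is on your PATH and that, as on the builds I have seen, the unpatched-Qt build mentions "Reduced Functionality" in its extended help:

import subprocess

# Ask wkhtmltopdf for its extended help and look for the reduced-functionality notice.
output = subprocess.check_output(['wkhtmltopdf', '--extended-help']).decode('utf-8', 'replace')
if 'Reduced Functionality' in output:
    print('Unpatched-Qt build: multiple input documents are not supported.')
else:
    print('Patched-Qt build: multiple input documents should work.')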

Related

python : wget module downloading file without any extension

I am writing a small Python script to download a file from a follow link and retrieve the original filename and extension. But I have come across one follow link for which Python downloads the file without any extension, whereas the file has a .txt extension when downloaded through a browser.
Below is the code I am trying:
from urllib.request import urlopen
from urllib.parse import unquote
import wget

filePath = 'D:\\folder_path'
followLink = 'http://example.com/Reports/Download/c4feb46c-8758-4266-bec6-12358'
response = urlopen(followLink)
if response.code == 200:
    print('Follow Link(response url) :' + response.url)
    print('\n')
    unquote_url = unquote(response.url)
    file_name = wget.detect_filename(response.url).replace('|', '_')
    print('file_name - ' + file_name)
    wget.download(response.url, filePath)
The file_name variable in the above code just gives 'c4feb46c-8758-4266-bec6-12358' as the filename, whereas I want to download it as c4feb46c-8758-4266-bec6-12358.txt.
I have also tried to read the filename from the headers, i.e. response.info(), but am not getting a proper filename there either.
Can anyone please help me with this? I am stuck. Thanks in advance.
Wget gets the filename from the URL itself. For example, if your URL was https://someurl.com/filename.pdf, it is saved as filename.pdf. If it was https://someurl.com/filename, it is saved as filename. Since wget.download returns the filename of the downloaded file, you can rename it to any extension you want with os.rename(filename, filename+'.<extension>').
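A short sketch of that idea, combined with guessing the extension from the response's Content-Type header; the follow link is the one from the question, and whether the server sends a useful Content-Type for it is an assumption on my part:

import mimetypes
import os
from urllib.request import urlopen

import wget

followLink = 'http://example.com/Reports/Download/c4feb46c-8758-4266-bec6-12358'
response = urlopen(followLink)

# e.g. 'text/plain' maps to '.txt'; fall back to no extension if the type is unknown
ext = mimetypes.guess_extension(response.headers.get_content_type()) or ''

filename = wget.download(response.url)   # saved without an extension, as in the question
if ext and not filename.endswith(ext):
    os.rename(filename, filename + ext)  # e.g. adds '.txt' to the bare name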

ElementTree errors, html files will not parse using Python/Sublime

I am trying to parse a few thousand HTML files and dump the variables into a CSV file (Excel spreadsheet). I've come up against several roadblocks, but the first one is this: I cannot get it to properly parse the files. Below is a brief explanation, the Python code, and the traceback info.
Using Python & Sublime to parse HTML files, I am getting several errors. What IS working: it runs fine up until if '.html' in file:, but it does not execute that loop. It will iterate through print allFiles just fine. It also creates the CSV file and the headers (though not in separate columns, but I can ask about that later).
It seems that the problem is in the tree = ET.parse(HTML_PATH + "/" + file) piece. I've written this several different ways (without "/" and/or file, for example); so far I have yet to resolve the problem.
If I can provide more information or if anyone can direct me to other documentation, it would be greatly appreciated. So far I have yet to find anything that addresses this issue.
Many thanks for your thoughts.
//C
# Parses out data from crawled html files under "html files"
# and places the output in output.csv.

import xml.etree.ElementTree as ET
import csv, codecs, os
from cStringIO import StringIO
# Note: you need to download and install this..
import unicodecsv

# TODO: make into command line params (instead of constant)
CSV_FILE = 'output.csv'
HTML_PATH = '/Users/C/data/Folder_NS'

f = open(CSV_FILE, 'wb')
w = unicodecsv.writer(f, encoding='utf-8', delimiter=';')
w.writerow(['file', 'category', 'about', 'title', 'subtitle', 'date', 'bodyarticle'])

# redundant declarations:
category = ''
about = ''
title = ''
subtitle = ''
date = ''
bodyarticle = ''
print "headers created"

allFiles = os.listdir(HTML_PATH)
#with open(CSV_FILE, 'wb') as csvfile:
print "all defined"

for file in allFiles:
    #print allFiles
    if '.html' in file:
        print "in html loop"
        tree = ET.parse(HTML_PATH + "/" + file)
        print '===================='
        print 'Parsing file: ' + file
        print '===================='
        for node in tree.iter():
            print "tbody"
            # The tbody attribute spells it all (or does it):
            name = node.attrib.get('/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/table/tbody/tr[1]/td[1]/font')
            # Check common header stuff
            if name == '/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/table/tbody/tr[1]/td[1]/font':
                #print ' ------------------'
                #print ' Category:'
                category = node.text
                print "category"

f.close()
Traceback:
  File "/Users/C/data/Folder_NS/data_parse.py", line 34, in <module>
    tree = ET.parse(HTML_PATH+"/"+file)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 1182, in parse
    tree.parse(source, parser)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 656, in parse
    parser.feed(data)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 1642, in feed
    self._raiseerror(v)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 1506, in _raiseerror
    raise err
xml.etree.ElementTree.ParseError: mismatched tag: line 63, column 2
You are trying to parse HTML with an XML parser, and valid HTML is not always valid XML. You would be better off using the HTML parsing library in the lxml package.
import xml.etree.ElementTree as ET
# ...
tree = ET.parse(HTML_PATH + '/' + file)
would be changed to
import lxml.html
# ...
tree = lxml.html.parse(HTML_PATH + '/' + file)
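The object lxml.html.parse returns mirrors the ElementTree API, so the rest of your script can stay close to what you have, and it also supports XPath queries directly. A small sketch (the file name and XPath here are hypothetical):

import lxml.html

# hypothetical file name; lxml.html.parse mirrors the ElementTree API
tree = lxml.html.parse('example.html')

# walk every element, just as tree.iter() does with ElementTree
tags = [node.tag for node in tree.iter()]

# or query directly with XPath instead of walking every node
fonts = tree.xpath('//table//font')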

"File Does Not Exist" when dynamically creating files for PDF download with Requests in Python 2.7

I'm trying to dynamically download PDFs from a website. I am sure I'm listing them correctly, but I am not sure I'm doing the actual file I/O correctly. I get the following error:
File "download.py", line 22, in <module>
with open("'"+url+"'", "wb") as pdf:
IOError: [Errno 2] No such file or directory: "'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf'"
Here is my code:
import requests
import re
from bs4 import BeautifulSoup

origin = requests.get("http://freehaven.net/anonbib")
soup = BeautifulSoup(origin.text)
results = soup.find_all(href=re.compile("(http).*(pdf)"))
for link in results:
    url = (link.get('href'))
    r = requests.get(url)
    with open("'"+url+"'", "wb") as pdf:
        try:
            pdf.write(r.content)
        finally:
            pdf.close
If url is set to 'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf', your code fails because it is trying to open a file with that name on your filesystem.
Instead, try something like this:
fileForUrl = '/tmp/' + url.split('/')[-1]
with open(fileForUrl, 'wb') as pdf:
    # Rest of the code as before
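A slightly more defensive sketch of the same fix, reusing requests and the results list from the question's code; the urlparse handling and the fallback name are my additions:

import os
from urlparse import urlparse  # Python 2.7, as in the question

for link in results:
    url = link.get('href')
    r = requests.get(url)
    # keep only the path's final component, e.g. 'MIT-LCS-TR-179.pdf'
    name = os.path.basename(urlparse(url).path) or 'unnamed.pdf'
    with open(os.path.join('/tmp', name), 'wb') as pdf:
        pdf.write(r.content)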

How to save "complete webpage" not just basic html using Python

I am using the following code to save a webpage using Python:
import urllib
import sys
from bs4 import BeautifulSoup
url = 'http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html'
f = urllib.urlretrieve(url,'test.html')
Problem: this code saves the HTML as basic HTML, without JavaScript, images, etc. I want to save the webpage as complete (like the "Save Page As... Complete" option in a browser).
Update:
I am now using the following code to save all the JS/image/CSS files of the webpage so that it can be saved as a complete webpage, but the output HTML is still saved as basic HTML:
import pycurl
import StringIO
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
html = b.getvalue()
#print html
fh = open("file.html", "w")
fh.write(html)
fh.close()
Try emulating your browser with Selenium. This script will pop up the Save As dialog for the webpage. You will still have to figure out how to emulate pressing Enter for the download to start, as the file dialog is out of Selenium's reach (how you do it is also OS-dependent).
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

br = webdriver.Firefox()
br.get('http://www.google.com/')
save_me = ActionChains(br).key_down(Keys.CONTROL)\
                          .key_down('s').key_up(Keys.CONTROL).key_up('s')
save_me.perform()
Also, I think following #Amber's suggestion of grabbing the linked resources may be simpler, and thus a better solution. Still, using selenium is a good starting point, as br.page_source will get you the entire DOM along with the dynamic content generated by JavaScript.
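As a minimal sketch of that starting point, you can simply dump br.page_source to a file once the page has rendered; the output file name here is a placeholder:

import io

from selenium import webdriver

br = webdriver.Firefox()
br.get('http://www.google.com/')

# page_source is the rendered DOM, including JavaScript-generated content
with io.open('page.html', 'w', encoding='utf-8') as f:
    f.write(br.page_source)

br.quit()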
You can easily do that with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working completely like the original site.
To get the script above by #rajatomar788 to run, I first had to install the following packages:
pip install pywebcopy
pip install pyquery
pip install w3lib
pip install parse
pip install lxml
After that it worked with a few errors, but I did get the folder filled with the files that make up the webpage.
webpage - INFO - Starting save_assets Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
webpage - Level 100 - Queueing download of <89> asset files.
Exception in thread <Element(LinkTag, file:///++resource++images/favicon2.ico)>:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 312, in run
    super(LinkTag, self).run()
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 58, in run
    self.download_file()
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 107, in download_file
    req = SESSION.get(url, stream=True)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\configs.py", line 244, in get
    return super(AccessAwareSession, self).get(url, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 546, in get
    return self.request('GET', url, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 640, in send
    adapter = self.get_adapter(url=request.url)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 731, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'file:///++resource++images/favicon2.ico'
webpage - INFO - Starting save_html Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
Try saveFullHtmlPage below, or adapt it.
It saves a modified *.html along with the JavaScript, CSS, and images referenced by the script, link, and img tags (the tags_inner dict keys) into a companion _files folder.
import os, sys, re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def saveFullHtmlPage(url, pagepath='page', session=requests.Session(), html=None):
    """Save web page html and supported contents

    * pagepath : path-to-page
    It will create a file `'path-to-page'.html` and a folder `'path-to-page'_files`
    """
    def savenRename(soup, pagefolder, session, url, tag, inner):
        if not os.path.exists(pagefolder):  # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag):  # images, css, etc..
            if res.has_attr(inner):  # check the inner attribute (file reference) exists
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner]))  # get name and extension
                    filename = re.sub(r'\W+', '', filename) + ext  # clean special chars from name
                    fileurl = urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    # rename html ref so the html and its folder of files can move anywhere
                    res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                    if not os.path.isfile(filepath):  # was not downloaded yet
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)

    if not html:
        html = session.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    path, _ = os.path.splitext(pagepath)
    pagefolder = path + '_files'  # page contents folder
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src'}  # tag & inner attribute to grab
    for tag, inner in tags_inner.items():  # saves resource files and renames refs
        savenRename(soup, pagefolder, session, url, tag, inner)
    with open(path + '.html', 'wb') as file:  # saves modified html doc
        file.write(soup.prettify('utf-8'))
Example: saving google.com as google.html, with its contents in a google_files folder (in the current folder):
saveFullHtmlPage('https://www.google.com', 'google')
