How can I print a webpage as a PDF with headers and footers so it looks like it has been printed from Google Chrome?
This is what I have tried so far using PhantomJS, but it doesn't produce the headers and footers.
from selenium import webdriver
import selenium

def execute(script, args):
    driver.execute('executePhantomScript', {'script': script, 'args': args})

driver = webdriver.PhantomJS('phantomjs')

# hack while the python interface lags
driver.command_executor._commands['executePhantomScript'] = ('POST', '/session/$sessionId/phantom/execute')

driver.get('http://stackoverflow.com')

# set page format
# inside the execution script, webpage is "this"
pageFormat = '''this.paperSize = {format: "A4", orientation: "portrait" };'''
execute(pageFormat, [])

# render current page
render = '''this.render("test.pdf")'''
execute(render, [])
This can be done with Selenium v4.1.3 by following the pyppeteer examples.
You will need to send commands to Chromium and call the Page.printToPDF DevTools API, as shown in the following snippet:
# ...
result = send_cmd(driver, "Page.printToPDF", params={
    'landscape': True,
    'margin': {'top': '1cm', 'right': '1cm', 'bottom': '1cm', 'left': '1cm'},
    'format': 'A4',
    'displayHeaderFooter': True,
    'headerTemplate': '<span style="font-size:8px; margin-left: 5px">Page 1 of 1</span>',
    'footerTemplate': f'<span style="font-size:8px; margin-left: 5px">Generated on {datetime.now().strftime("%-m/%-d/%Y at %H:%M")}</span>',
    'scale': 1.65,
    'pageRanges': '1'
})

with open(out_path_full, 'wb') as file:
    file.write(base64.b64decode(result['data']))
# ...

def send_cmd(driver, cmd, params={}):
    response = driver.command_executor._request(
        'POST',
        f"{driver.command_executor._url}/session/{driver.session_id}/chromium/send_command_and_get_result",
        json.dumps({'cmd': cmd, 'params': params}))
    if response.get('status'):
        raise Exception(response.get('value'))
    return response.get('value')
I've included a full example in my GitHub Repo.
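Separately, if you are driving Chrome/Chromium with Selenium 4, the same DevTools call can usually be made without the raw command_executor plumbing by using execute_cdp_cmd. A minimal sketch (the output name page.pdf, the target URL, and the header/footer templates are placeholders, not the settings from the snippet above):

import base64
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://stackoverflow.com')

# Page.printToPDF accepts CDP parameters such as landscape, displayHeaderFooter,
# headerTemplate/footerTemplate, margins (in inches), scale and pageRanges.
result = driver.execute_cdp_cmd('Page.printToPDF', {
    'displayHeaderFooter': True,
    'headerTemplate': '<span class="title" style="font-size:8px"></span>',
    'footerTemplate': '<span style="font-size:8px"><span class="pageNumber"></span> / <span class="totalPages"></span></span>',
    'marginTop': 0.5,     # inches
    'marginBottom': 0.5,  # inches
    'printBackground': True,
})

# The result contains the PDF as base64-encoded data.
with open('page.pdf', 'wb') as f:
    f.write(base64.b64decode(result['data']))

driver.quit()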
pdfkit may help you out. It renders a PDF from an HTML page by wrapping the wkhtmltopdf command-line tool.
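A minimal sketch of what that could look like (it assumes wkhtmltopdf is installed and on PATH; the URL, output name, and header/footer text are placeholders, and the option names are standard wkhtmltopdf flags without the leading dashes):

import pdfkit

# Options map directly to wkhtmltopdf command-line flags.
options = {
    'page-size': 'A4',
    'margin-top': '1cm',
    'margin-bottom': '1cm',
    'header-center': 'My header text',
    'footer-right': 'Page [page] of [topage]',  # wkhtmltopdf substitutes the page numbers
}

pdfkit.from_url('https://stackoverflow.com', 'out.pdf', options=options)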
I am using Playwright for test automation.
Every test run creates a new instance of Chromium.
When I pass --auto-open-devtools-for-tabs, it opens DevTools as expected.
But I need to go one step further and have the Preserve log checkbox enabled.
Tests are fast, and I need to see the requests made before a redirect.
Based on this answer, one trick is to launch the browser with a persistent context, close it, and then edit the Preferences file to set the Preserve log value.
user_data_dir = './prefs'
pref_file_path = user_data_dir + '/Default/Preferences'

browser = playwright.chromium.launch_persistent_context(user_data_dir, headless=False, args=['--auto-open-devtools-for-tabs'])
browser.close()

with open(pref_file_path, 'r') as pref_file:
    data = json.load(pref_file)

data['devtools'] = {
    'preferences': {
        'network_log.preserve-log': '"true"'
    }
}

with open(pref_file_path, 'w') as pref_file:
    json.dump(data, pref_file)

browser = playwright.chromium.launch_persistent_context(user_data_dir, headless=False, args=['--auto-open-devtools-for-tabs'])
page = browser.new_page()
page.goto('https://stackoverflow.com/questions/63661366/puppeteer-launch-chromium-with-preserve-log-enabled')
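Note that assigning data['devtools'] wholesale discards any other DevTools preferences already saved in that profile; a safer sketch (same idea, just merging the single key) would be:

# Merge only the Preserve log key instead of replacing the whole 'devtools' section.
prefs = data.setdefault('devtools', {}).setdefault('preferences', {})
prefs['network_log.preserve-log'] = '"true"'  # the value is a JSON-encoded string, as above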
I'm new to Scrapy and Splash, and I need to scrape data from single-page and regular web apps.
A caveat, though, is that I'm mostly scraping data from internal tools and applications, so some require authentication and all of them need at least a couple of seconds of loading time before the page fully renders.
I naively tried a Python time.sleep(seconds) and it didn't work. It seems like SplashRequest and scrapy.Request both run and yield results right away. I then learned about Lua scripts as arguments to these requests, and attempted a Lua script with various forms of wait(), but it looks like the requests never actually run the Lua scripts. They finish right away and my HTML selectors don't find anything I'm looking for.
I'm following the directions from https://github.com/scrapy-plugins/scrapy-splash, have their Docker instance running on localhost:8050, and have created a settings.py.
Anyone with experience here know what I might be missing?
Thanks!
spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy_splash import SplashRequest
import logging
import base64
import time
# from selenium import webdriver

# lua_script = """
# function main(splash)
#     splash:set_user_agent(splash.args.ua)
#     assert(splash:go(splash.args.url))
#     splash:wait(5)
#     -- requires Splash 2.3
#     -- while not splash:select('#user-form') do
#     --     splash:wait(5)
#     -- end
#     repeat
#         splash:wait(5))
#     until( splash:select('#user-form') ~= nil )
#     return {html=splash:html()}
# end
# """

load_page_script = """
function main(splash)
    splash:set_user_agent(splash.args.ua)
    assert(splash:go(splash.args.url))
    splash:wait(5)

    function wait_for(splash, condition)
        while not condition() do
            splash:wait(0.5)
        end
    end

    local result, error = splash:wait_for_resume([[
        function main(splash) {
            setTimeout(function () {
                splash.resume();
            }, 5000);
        }
    ]])

    wait_for(splash, function()
        return splash:evaljs("document.querySelector('#user-form') != null")
    end)

    -- repeat
    --     splash:wait(5))
    -- until( splash:select('#user-form') ~= nil )

    return {html=splash:html()}
end
"""


class HelpSpider(scrapy.Spider):
    name = "help"
    allowed_domains = ["secet_internal_url.com"]
    start_urls = ['https://secet_internal_url.com']
    # http_user = 'splash-user'
    # http_pass = 'splash-password'

    def start_requests(self):
        logger = logging.getLogger()
        login_page = 'https://secet_internal_url.com/#/auth'

        splash_args = {
            'html': 1,
            'png': 1,
            'width': 600,
            'render_all': 1,
            'lua_source': load_page_script
        }
        # splash_args = {
        #     'html': 1,
        #     'png': 1,
        #     'width': 600,
        #     'render_all': 1,
        #     'lua_source': lua_script
        # }
        yield SplashRequest(login_page, self.parse, endpoint='execute', magic_response=True, args=splash_args)

    def parse(self, response):
        # time.sleep(10)
        logger = logging.getLogger()
        html = response._body.decode("utf-8")
        # Looking for a form with the ID 'user-form'
        form = response.css('#user-form')
        logger.info("####################")
        logger.info(form)
        logger.info("####################")
I figured it out!
Short Answer
My Spider class was configured incorrectly for using Splash with Scrapy.
Long Answer
Part of running Splash with Scrapy is, in my case, running a local Docker instance that my requests are loaded into so it can run the Lua scripts. An important caveat is that the Splash settings described on the GitHub page must be a property of the spider class itself, so I added this code to my spider:
custom_settings = {
    'SPLASH_URL': 'http://localhost:8050',
    # if installed Docker Toolbox:
    # 'SPLASH_URL': 'http://192.168.99.100:8050',
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    },
    'SPIDER_MIDDLEWARES': {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    },
    'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
}
Then I noticed my Lua code running, and the Docker container logs showed the interactions. After fixing errors with splash:select(), my login script worked, as did my waits:
splash:wait( seconds_to_wait )
Lastly, I created a Lua script to handle logging in, redirecting, and gathering links and text from pages. My application is an AngularJS app, so I can't gather links or visit them except by clicking. This script let me run through every link, click it, and gather the content.
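A rough sketch of what such a script can look like (the selectors, field names, and credentials below are placeholders, and it assumes the Splash 2.3+ element API; pass it as lua_source the same way as load_page_script above):

login_and_click_script = """
function main(splash)
    splash:set_user_agent(splash.args.ua)
    assert(splash:go(splash.args.url))
    splash:wait(5)

    -- log in (placeholder form and field names)
    local form = splash:select('#user-form')
    assert(form:fill({ username = 'my-user', password = 'my-pass' }))
    assert(form:submit())
    splash:wait(5)

    -- the app is AngularJS, so navigate by clicking instead of following hrefs
    local link = splash:select('a.some-nav-link')
    link:mouse_click()
    splash:wait(3)

    return {html = splash:html()}
end
"""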
I suppose an alternative solution would have been to use end-to-end testing tools such as Selenium/WebDriver or Cypress, but I prefer to use scrapy to scrape and testing tools to test. To each their own (Python or NodeJS tools), I suppose.
Neat Trick
Another thing that's really helpful for debugging: when you're running the Docker instance for Scrapy-Splash, you can visit its URL in your browser, where there's an interactive "request tester" that lets you try out Lua scripts and see the rendered HTML results (for example, to verify a login or a page visit). For me, this URL was http://0.0.0.0:8050; it's the same URL you set in your settings, and it should match your Docker container.
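You can also hit Splash's HTTP API directly without writing a spider, which is handy for the same kind of debugging; a quick sketch using requests against the render.html endpoint (the target URL and wait time are placeholders):

import requests

# render.html returns the page's HTML after Splash has executed its JavaScript.
resp = requests.get('http://localhost:8050/render.html',
                    params={'url': 'https://example.com', 'wait': 5})
print(resp.text[:500])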
Cheers!
I've written this code to iterate over downloads from a webpage that has multiple download links. Once a download link is clicked, the webpage shows a web form that has to be filled in and submitted for the download to start. When I run the code I get an issue on the 'try'/'except' block ("Too broad exception clause"), and towards the end an issue with 'submit' ("method submit may be static"); both of these are accompanied by a 'SyntaxError: invalid syntax'. Any suggestions/help will be much appreciated. Thank you.
import os
from selenium import webdriver

fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList", 2)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.dir", os.getcwd())
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-msdos-program")

driver = webdriver.Firefox(firefox_profile=fp)
driver.get('http://def.com/catalog/attribute')

# This is to find the download links in the webpage one by one
i = 0
while i < 1:
    try:
        driver.find_element_by_xpath('//*[@title="xml (Open in a new window)"]').click()
    except:
        i = 1

# Once the download link is clicked this has to fill the form, whose submission will download the file
class FormPage(object):
    def fill_form(self, data):
        driver.find_element_by_xpath('//input[@type = "radio" and @value = "Non-commercial"]').click()
        driver.find_element_by_xpath('//input[@type = "checkbox" and @value = "R&D"]').click()
        driver.find_element_by_xpath('//input[@name = "name_d"]').send_keys(data['name_d'])
        driver.find_element_by_xpath('//input[@name = "mail_d"]').send_keys(data['mail_d'])
        return self

    def submit(self):
        driver.find_element_by_xpath('//input[@value = "Submit"]').click()

data = {
    'name_d': 'abc',
    'mail_d': 'xyz@gmail.com',
}

FormPage().fill_form(data).submit()
driver.quit()
Actually, you have two warnings and one error:
1 - "Too broad exception clause": this is a warning telling you that you should catch specific errors, not all of them. Your "except" line should be something like except [TheExceptionYouAreHandling]:; an example would be except ValueError:. However, this will not stop your code from running.
2 - "method submit may be static": this is a warning telling you that the method submit could be a static method (basically, a method that doesn't use the self attribute). To suppress this warning you can use the @staticmethod decorator, like this:
@staticmethod
def submit():
    ...
3 - "SyntaxError: invalid syntax": this is what is stopping your code from running. It is an error telling you that something is written incorrectly in your code. I think it may be the indentation of your class. Try this:
i = 0
while i < 1:
    try:
        driver.find_element_by_xpath('//*[@title="xml (Open in a new window)"]').click()
    except:
        i = 1

# Once the download link is clicked this has to fill the form, whose submission will download the file
class FormPage(object):
    def fill_form(self, data):
        driver.find_element_by_xpath('//input[@type = "radio" and @value = "Non-commercial"]').click()
        driver.find_element_by_xpath('//input[@type = "checkbox" and @value = "R&D"]').click()
        driver.find_element_by_xpath('//input[@name = "name_d"]').send_keys(data['name_d'])
        driver.find_element_by_xpath('//input[@name = "mail_d"]').send_keys(data['mail_d'])
        return self

    def submit(self):
        driver.find_element_by_xpath('//input[@value = "Submit"]').click()

data = {
    'name_d': 'abc',
    'mail_d': 'xyz@gmail.com',
}

FormPage().fill_form(data).submit()
driver.quit()
One more thing: these are really simple errors and warnings, and you should be able to fix them yourself by carefully reading what the message says. I also recommend reading about exceptions.
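For example, a minimal sketch of the download loop from the question catching Selenium's specific NoSuchElementException instead of using a bare except:

from selenium.common.exceptions import NoSuchElementException

# Keep clicking download links until none are left, catching only the
# exception Selenium raises when the element cannot be found.
while True:
    try:
        driver.find_element_by_xpath('//*[@title="xml (Open in a new window)"]').click()
    except NoSuchElementException:
        break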
I am writing a parsing script and need to access many web pages like this one.
Whenever I try to get such a page with urlopen and then read(), I get redirected to this other page.
When I open the same links from Google Chrome, the redirect happens only rarely, mostly when I open a URL directly rather than by clicking it from the site menus.
Is there a way to dodge that redirect, or to simulate jumping to the URL from the website menus, with Python 3?
Example code:
import re
from urllib.request import urlopen

def getItemsFromPage(url):
    with urlopen(url) as page:
        html_doc = str(page.read())
    return re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', html_doc)

url = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'
items_urls = getItemsFromPage(url)

with urlopen(items_urls[0]) as item_page:
    print(item_page.read().decode('utf-8'))  # Here I get search.advanced instead of the item page
In fact, it's a weird problem with ampersands in the raw HTML data. When you visit the webpage and click a link, the escaped ampersands (&amp;) are interpreted by the browser as "&" and it works. However, Python reads the data as it is, that is, raw data. So:
import urllib.request as net
from html.parser import HTMLParser
import re

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0",
}

def unescape(items):
    html = HTMLParser()
    unescaped = []
    for i in items:
        unescaped.append(html.unescape(i))
    return unescaped

def getItemsFromPage(url):
    request = net.Request(url, headers=headers)
    response = str(net.urlopen(request).read())
    # --------------------------
    # FIX AMPERSANDS - unescape
    # --------------------------
    links = re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', response)
    unescaped_links = unescape(links)
    return unescaped_links

url = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'
item_urls = getItemsFromPage(url)

request = net.Request(item_urls[0], headers=headers)
print(item_urls)
response = net.urlopen(request)

# DEBUG RESPONSE
print(response.url)
print(80 * '-')
print("<title>Charity Navigator Rating - 10,000 Degrees</title>" in (response.read().decode('utf-8')))
Your problem is that you are not replacing &amp; with & in the URL string. I rewrote your code using urllib3 as below and got the expected webpages.
import re
import urllib3

def getItemsFromPage(url):
    # create connection pool object (urllib3-specific)
    localpool = urllib3.PoolManager()
    with localpool.request('get', url) as page:
        html_doc = page.data.decode('utf-8')
    return re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', html_doc)

# the master webpage
url_master = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'

# name and store the downloaded contents for testing purposes.
file_folder = "R:"
file_mainname = "test"

# parse the master webpage
items_urls = getItemsFromPage(url_master)

# create pool
mypool = urllib3.PoolManager()

i = 0
for url in items_urls:
    # file name to be saved
    file_name = file_folder + "\\" + file_mainname + str(i) + ".htm"
    # replace '&amp;' with '&'
    url_OK = re.sub(r'&amp;', r'&', url)
    # print revised url
    print(url_OK)
    ### the urllib3-pythonic way of web page retrieval ###
    with mypool.request('get', url_OK) as page, open(file_name, 'w') as f:
        f.write(page.data.decode('utf-8'))
    i += 1
(verified on python 3.4 eclipse PyDev win7 x64)
I've written a bit of code in an attempt to pull photos from a website. I want it to find the photos, then download them so I can tweet them:
import urllib2
from lxml.html import fromstring
import sys
import time

url = "http://www.phillyhistory.org/PhotoArchive/Search.aspx"
response = urllib2.urlopen(url)
html = response.read()
dom = fromstring(html)

sels = dom.xpath('//*[(@id = "large_media")]')
for pic in sels[:1]:
    output = open("file01.jpg", "w")
    output.write(pic.read())
    output.close()

# twapi = tweepy.API(auth)
# twapi.update_with_media(imagefilename, status=xxx)
I'm new at this sort of thing, so I'm not really sure why this isn't working. No file is created, and no 'sels' are found.
Your problem is that the image search (Search.aspx) doesn't just return an HTML page with all the content in it; instead it delivers a JavaScript application that then makes several subsequent requests (see AJAX) to fetch raw information about assets, and then builds an HTML page dynamically that contains all those search results.
You can observe this behavior by looking at the HTTP requests your browser makes when you load the page. Use the Firebug extension for Firefox or the builtin Chrome developer tools and open the Network tab. Look for requests that happen after the initial page load, particularly POST requests.
In this case the interesting requests are the ones to Thumbnails.ashx, Details.ashx and finally MediaStream.ashx. Once you identify those requests, look at what headers and form data your browser sends, and emulate that behavior with plain HTTP requests from Python.
The response from Thumbnails.ashx is actually JSON, so it's much easier to parse than HTML.
In this example I use the requests module because it's much, much better and easier to use than urllib(2). If you don't have it, install it with pip install requests.
Try this:
import requests
import urllib

BASE_URL = 'http://www.phillyhistory.org/PhotoArchive/'
QUERY_URL = BASE_URL + 'Thumbnails.ashx'
DETAILS_URL = BASE_URL + 'Details.ashx'


def get_media_url(asset_id):
    response = requests.post(DETAILS_URL, data={'assetId': asset_id})
    image_details = response.json()
    media_id = image_details['assets'][0]['medialist'][0]['mediaId']
    return '{}/MediaStream.ashx?mediaId={}'.format(BASE_URL, media_id)


def save_image(asset_id):
    filename = '{}.jpg'.format(asset_id)
    url = get_media_url(asset_id)
    with open(filename, 'wb') as f:
        response = requests.get(url)
        f.write(response.content)
    return filename


urlqs = {
    'maxx': '-8321310.550067',
    'maxy': '4912533.794965',
    'minx': '-8413034.983992',
    'miny': '4805521.955385',
    'onlyWithoutLoc': 'false',
    'sortOrderM': 'DISTANCE',
    'sortOrderP': 'DISTANCE',
    'type': 'area',
    'updateDays': '0',
    'withoutLoc': 'false',
    'withoutMedia': 'false'
}

data = {
    'start': 0,
    'limit': 12,
    'noStore': 'false',
    'request': 'Images',
    'urlqs': urllib.urlencode(urlqs)
}

response = requests.post(QUERY_URL, data=data)
result = response.json()

print '{} images found'.format(result['totalImages'])

for image in result['images']:
    asset_id = image['assetId']
    print 'Name: {}'.format(image['name'])
    print 'Asset ID: {}'.format(asset_id)
    filename = save_image(asset_id)
    print "Saved image to '{}'.\n".format(filename)
Note: I didn't check what http://www.phillyhistory.org/'s Terms of Service have to say about automated crawling. You need to check yourself and make sure you're not in violation of their ToS with whatever you're doing.