I have to implement a web scraping tool and I choose to do with electron js with react and python.
I could able to integrate python and react using python shell in electron as follow,
React Code
import React from 'react';
var path = require("path")
const {PythonShell} = require("python-shell");
const city = 'XYZ';
var options = {
scriptPath : path.join(__dirname, '../../python/'),
args : [city]
}
class App extends React.Component {
constructor(props) {
super(props);
}
componentDidMount() {
var shell = new PythonShell('main.py', options); //executes python script on python3
shell.on('message', function(message) {
console.log('message', message)
})
}
render (){
return (
<div className="header">
<h1>Hello, World, {this.state.test}</h1>
</div>
)
}
}
export default App;
Python Code
import sys
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
city = sys.argv[1]
class MultiThreadScraper:
def __init__(self, base_url):
self.base_url = base_url
self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
self.pool = ThreadPoolExecutor(max_workers=5)
self.scraped_pages = set([])
self.to_crawl = Queue()
self.to_crawl.put(self.base_url)
def parse_links(self, html):
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', href=True)
for link in links:
url = link['href']
if url.startswith('/') or url.startswith(self.root_url):
url = urljoin(self.root_url, url)
if url not in self.scraped_pages:
self.to_crawl.put(url)
def scrape_info(self, html):
return
def post_scrape_callback(self, res):
result = res.result()
if result and result.status_code == 200:
self.parse_links(result.text)
self.scrape_info(result.text)
def scrape_page(self, url):
try:
res = requests.get(url, timeout=(3, 30))
return res
except requests.RequestException:
return
def run_scraper(self):
while True:
try:
target_url = self.to_crawl.get(timeout=60)
if target_url not in self.scraped_pages:
print("Scraping URL: {}".format(target_url))
self.scraped_pages.add(target_url)
job = self.pool.submit(self.scrape_page, target_url)
job.add_done_callback(self.post_scrape_callback)
except Empty:
return
except Exception as e:
print(e)
continue
if __name__ == '__main__':
s = MultiThreadScraper("http://websosite.com")
s.run_scraper()
I could get all scraping URLs after executing the python shell in React but I want all URLs in real time in React front end.
Following React code execute the python code and give the final result
var shell = new PythonShell('main.py', options); //executes python script on python3
This React code is used to receives a message from the python script with a simple 'print' statement.
pyshell.on('message', function (message) {
console.log(message);
});
Is there any way to get the result in real time while executing the python code?
Use sys.stdout.flush() after print statement in python.
Ref:usage of sys.stdout.flush()
What's a good way for me to output (AMP compliant) HTML using a template and JSON for the data? I have a nice little python script:
import requests
import json
from bs4 import BeautifulSoup
url = requests.get('https://www.perfectimprints.com/custom-promos/20492/Beach-Balls.html')
source = BeautifulSoup(url.text, 'html.parser')
products = source.find_all('div', class_="product_wrapper")
infos = source.find_all('div', class_="categories_wrapper")
def get_category_information(category):
category_name = category.find('h1', class_="category_head_name").text
return {
"category_name": category_name.strip()
}
category_information = [get_category_information(info) for info in infos]
with open("category_info.json", "w") as write_file:
json.dump(category_information, write_file)
def get_product_details(product):
product_name = product.find('div', class_="product_name").a.text
sku = product.find('div', class_="product_sku").text
product_link = product.find('div', class_="product_image_wrapper").find("a")["href"]
src = product.find('div', class_="product_image_wrapper").find('a').find("img")["src"]
return {
"title": product_name,
"link": product_link.strip(),
"sku": sku.strip(),
"src": src.strip()
}
all_products = [get_product_details(product) for product in products]
with open("products.json", "w") as write_file:
json.dump({'items': all_products}, write_file)
print("Success")
Which generates the JSON files I need. However, I now need to use those JSON files and input it into my template (gist) everywhere it says {{ JSON DATA HERE }}.
I'm not even sure where to start. I'm most comfortable with JavaScript so I'd like to use that if possible. I figure something involving Node.js.
Here's how you can render HTML with a template engine by itself and use it to return pure HTML:
from jinja2 import Template
me = Template('<h1>Hello {{x}}<h1>')
html = me.render({'x': 'world'})
return html
html is your rendered HTML string. Alternatively, you can render from a file:
from jinja2 import Template
with open('your_template.html') as file_:
template = Template(file_.read())
html = template.render(your_dict)
return html
Since you're going to generate HTML one way or another, using a template engine will save you much time. You can also do {% for item in list %} and such thing will greatly simplify your task.
I'm facing trouble trying to match my string W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl with my RegEX ^([A-Za-z0-9]{32})$.
According to various online RegEx tools it should match, but not according to my Python script:
pattern = re.compile("^([A-Za-z0-9]{32})$")
print(line)
if pattern.match(line):
return line
else:
return None
I've attempted using strip() to check if there are any unseen whitespaces, but can't find anything.
Here is the entire script:
import requests, binascii, base64, re
from requests.auth import HTTPBasicAuth
def pattern_lookup(line):
"""
Will iterate through lines
to find a matching string that
is 32 characters long and only
holds alphanumerical characters.
-----
:param lines: The lines to be iterated.
:return: The line holding the matched string,
or None if not found
"""
pattern = re.compile("^([A-Za-z0-9]{32})$")
print(line)
if pattern.match(line):
return line
else:
return None
def get_secret(host, credentials):
"""
Grabs the hint(flag) from the
host by splitting the response on
semicolon (;) then performing
pattern matching using regex.
----
:param host: The host we are sending
requests to.
:param credentials: The credentials required
to sign into the host.
:return: The hex encoded secret.
"""
try:
response = requests.get(host, auth=HTTPBasicAuth(*credentials))
response_lines = response.content.decode('ascii').replace('"', '').split(';')
return next((line
for line in response_lines
if pattern_lookup(line)),
None)
except requests.RequestException as e:
print(e)
def prepare_payload(secret):
decoded_secret = base64.b64decode(binascii.unhexlify(secret)[::-1])
payload = {'secret': decoded_secret, 'submit': 'placeholder'}
return payload
def get_password(host, credentials, secret):
"""
Uses a post-request injected with the
reverse engineered secret to get access
to the password to natas9.
:param host: The host that holds the
password.
:param credentials:
:param decoded_hint:
:return: The password to Natas9
"""
payload = prepare_payload(secret)
try:
response = requests.post(host, auth=HTTPBasicAuth(*credentials), data=payload)
response_lines = response.content.decode('utf-8').split(' ')
return next((line
for line in response_lines
if pattern_lookup(line.strip())),
None)
except requests.RequestException as e:
print(e)
def main():
host = 'http://natas8.natas.labs.overthewire.org/index-source.html'
credentials = ['natas8', 'DBfUBfqQG69KvJvJ1iAbMoIpwSNQ9bWe']
secret = get_secret(host, credentials)
print(get_password(host.split('index')[0], credentials, secret))
if __name__ == '__main__':
main()
EDIT:
I should mention that the initial test in get_secret works absolutely flawlessly and all my previous modules that use this work fine...
EDIT2:
Output:
<link
rel="stylesheet"
type="text/css"
href="http://natas.labs.overthewire.org/css/level.css">
<link
rel="stylesheet"
href="http://natas.labs.overthewire.org/css/jquery-ui.css"
/>
<link
rel="stylesheet"
href="http://natas.labs.overthewire.org/css/wechall.css"
/>
<script
src="http://natas.labs.overthewire.org/js/jquery-1.9.1.js"></script>
<script
src="http://natas.labs.overthewire.org/js/jquery-ui.js"></script>
<script
src=http://natas.labs.overthewire.org/js/wechall-data.js></script><script
src="http://natas.labs.overthewire.org/js/wechall.js"></script>
<script>var
wechallinfo
=
{
"level":
"natas8",
"pass":
"DBfUBfqQG69KvJvJ1iAbMoIpwSNQ9bWe"
};</script></head>
<body>
<h1>natas8</h1>
<div
id="content">
Access
granted.
The
password
for
natas9
is
W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl <-- here it is
<form
method=post>
Input
secret:
<input
name=secret><br>
<input
type=submit
name=submit>
</form>
<div
id="viewsource"><a
href="index-source.html">View
sourcecode</a></div>
</div>
</body>
</html>
None
I made a demo code based on your regex, it works fine.
import re
line = 'W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl'
pattern = re.compile("^([A-Za-z0-9]{32})$")
print(line)
if pattern.match(line):
print ("matched")
else:
print ("No")
Demo
That means the line which you are reading from response_lines is not of the same format which regex is expecting. Try to print the line and see what's missing.
Edit: After your edit, I can you see have multiline data. Use the below:
pattern = re.compile("^([A-Za-z0-9]{32})$", re.MULTILINE)
if pattern.finditer(line):
print ("matched")
else:
print ("No")
Full Demo
Your text is multiline. Have you tried with:
re.compile("^([A-Za-z0-9]{32})$", re.MULTILINE)
.jsp
<%# page import = "java.io.File"%>
<%# page import = "java.io.*"%>
<%
String dir = "/var/lib/tomcat7/webapps/Venu"
Process p = Runtime.getRuntime().exec("ImageConversion.py download.jpg", null, dir);
p.waitFor();
BufferedReader reader=new BufferedReader(new InputStreamReader(p.getInp$
String line=reader.readLine();
while(line!=null)
{
System.out.println(line);
line=reader.readLine();
}
%>
ImageConversion.py
<%# page import = "java.io.File"%>
<%# page import = "java.io.*"%>
<%
String dir = "/var/lib/tomcat7/webapps/Venu"
Process p = Runtime.getRuntime().exec("download.jpg", null, dir);
p.waitFor();
BufferedReader reader=new BufferedReader(new InputStreamReader(p.getInp$
String line=reader.readLine();
while(line!=null)
{
System.out.println(line);
line=reader.readLine();
}
%>
I am trying to invoke python script using JSP but the jsp code is showing error and how can we call system command from jsp.
Thanks in advance.
I've been searching the web for examples of people using the pdfkit (python wrapper) in implementing headers and footers and could not find any examples.
Would anyone be able to show some examples of how to implement the options in wkhtmltopdf using the pdfkit python wrapper?
I'm using it only with headers but I think that it will work the same with footers.
You need to have separate html file for the header.
header.html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
Code of your header goes here.
</body>
</html>
Then you can use it like that in Python
import pdfkit
pdfkit.from_file('path/to/your/file.html', 'out.pdf', {
'--header-html': 'path/to/header.html'
})
The tricky part if you use some backend like Django and want to use templates is that you can't pass the header html as rendered string. You need to have a file.
This is what I do to render PDFs with Django.
import os
import tempfile
import pdfkit
from django.template.loader import render_to_string
def render_pdf(template, context, output, header_template=None):
"""
Simple function for easy printing of pdfs from django templates
Header template can also be set
"""
html = render_to_string(template, context)
options = {
'--load-error-handling': 'skip',
}
try:
if header_template:
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as header_html:
options['header-html'] = header_html.name
header_html.write(render_to_string(header_template, context).encode('utf-8'))
return pdfkit.from_string(html, output, options=options)
finally:
# Ensure temporary file is deleted after finishing work
if header_template:
os.remove(options['header-html'])
In my example I create temporary file where I put rendered content. Important part is that temporary file need to end with .html and to be deleted manually.
Improving #V Stoykov answer as it helped me using Flask, the render function with custom header in Flask will be as follows:
import os
import tempfile
import pdfkit
from flask import render_template, make_response
#app.route('/generate_pdf')
def render_pdf_custom_header(foo, bar):
main_content = render_template('main_pdf.html', foo=foo)
options = {
'--encoding': "utf-8"
}
add_pdf_header(options, bar)
add_pdf_footer(options)
try:
pdf = pdfkit.from_string(main_content, False, options=options)
finally:
os.remove(options['--header-html'])
os.remove(options['--footer-html'])
response = build_response(pdf)
return response
def add_pdf_header(options, bar):
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as header:
options['--header-html'] = header.name
header.write(
render_template('header.html', bar=bar).encode('utf-8')
)
return
def add_pdf_footer(options):
# same behaviour as add_pdf_header but without passing any variable
return
def build_response(pdf):
response = make_response(pdf)
response.headers['Content-Type'] = 'application/pdf'
filename = 'pdf-from-html.pdf'
response.headers['Content-Disposition'] = ('attachment; filename=' + filename)
return response
Notice that I used the '--header-html' and '--footer-html' notation as it matches the wkhtmltopdf options format.
options = {
'page-size': 'Letter',
'margin-top': '0.9in',
'margin-right': '0.9in',
'margin-bottom': '0.9in',
'margin-left': '0.9in',
'encoding': "UTF-8",
'header-center': 'YOUR HEADER',
'custom-header' : [
('Accept-Encoding', 'gzip')
],
'no-outline':None
}
you can add the header, which you need in the value for header-center
This question and its answers are quite old and not working for me.
wkhtmltopdf version: $ wkhtmltopdf --version
wkhtmltopdf 0.12.6 (with patched qt)
python 3.8
For wkhtmltopdf,
header-html and footer-html can only be URI, e.g. html url or file path, CANNOT be string. so the idea is to keep each html file on cloud or create a temp file in local as footer and header for reference.
Request:
content as url or html
header as html url or html string
footer as html url or html string
Example:
import logging
import os
import tempfile
import pdfkit
from flask import Flask, Response, make_response
from flask_restx import Resource, Api, fields
Request = api.model('Request', {
'url': fields.String(
required=False,
description='url',
example='https://www.w3schools.com/html/html5_svg.asp',
),
'html': fields.String(
required=False,
description='content html string',
example=example_content_html
),
'header_html': fields.String(
required=False,
description='pdf header html string',
example=example_header_html
),
'footer_html': fields.String(
required=False,
description='pdf footer html string',
example=example_footer_html
),
})
#api.route("/convert_html_to_pdf", endpoint = 'html2pdf')
#api.representation('application/octet-stream')
class PdfConverter(Resource):
#api.doc(body=Request)
def post(self):
logging.info(request.json)
url = request.json.get('url')
html = request.json.get('html')
header_html = request.json.get('header_html')
footer_html = request.json.get('footer_html')
header_uri = 'https://xxxxx/header.html' # default header
footer_uri = 'https://xxxxx/footer.html' # default footer
if header_html:
fph = tempfile.NamedTemporaryFile(suffix='.html')
fph.write(header_html.encode('utf-8'))
fph.flush()
header_uri = fph.name
if footer_html:
fpf = tempfile.NamedTemporaryFile(suffix='.html')
fpf.write(footer_html.encode('utf-8'))
fpf.flush()
footer_uri = fpf.name
options = {
'page-size': 'A4',
'margin-top': '32mm',
'header-spacing': 6,
'footer-spacing': 6,
'header-html': header_uri,
'footer-html': footer_uri,
'margin-right': '0',
'margin-bottom': '16mm',
'margin-left': '0',
'encoding': "UTF-8",
'cookie': [
('cookie-empty-value', '""'),
('cookie-name1', 'cookie-value1'),
('cookie-name2', 'cookie-value2'),
],
'no-outline': None
}
logging.info(options)
if url:
# api.payload['requestedBlobUrl'] = url
# return api.payload
pdf = pdfkit.from_url(url, options=options)
else:
pdf = pdfkit.from_string(html, options=options)
if header_html:
fph.close()
if footer_html:
fpf.close() # close will delete the temp file
response = make_response(pdf)
response.headers['Content-Type'] = 'application/pdf'
response.headers['Content-Disposition'] = 'attachment;filename=report.pdf'
return response
Python pdfkit wrapper only supports html files as header and footer. To be able to support string, all you need to do is generate a temporary file and delete it on close. Here is my code sample.
Using tempfile with delete=True and suffix='.html' arguments will generate a deletable file on temp.close()
import tempfile
temp = tempfile.NamedTemporaryFile(delete=True,suffix='.html')
with open(temp.name, 'w') as f:
f.write("""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
Code of your header goes here.
</body>
</html>
""")
options = {
'page-size': 'A4',
'margin-top': '1in',
'margin-bottom': '0.75in',
'margin-right': '0.75in',
'margin-left': '0.75in',
'encoding': "UTF-8",
'header-html': temp.name,
'footer-center': "Page [page] of [topage]",
'footer-font-size': "9",
'custom-header': [
('Accept-Encoding', 'gzip')
],
'enable-local-file-access': False,
'no-outline': None
}
pdf = pdfkit.from_string(html_string, options=options)
temp.close()