I have configured xampp on windows to work with python 2.7 and Pygments. My php code is highlighted properly in Pygments on the website. The code has colors, span elements, classes.
That is how it looks:
But I cannot get line numbers.
According to the tutorials I have read, this depends on the linenos value in the Python script. The value should be either table, inline, 1 or True.
But it does not work for me. It still gives the same final code:
<!doctype html>
<html lang="pl">
<head>
<meta charset="UTF-8">
<title>Document</title>
<link rel="stylesheet" href="gh.css">
</head>
<body>
<div class="highlight highlight-php"><pre><code><span class="nv">$name</span> <span class="o">=</span> <span class="s2">"Jaś"</span><span class="p">;</span>
<span class="k">echo</span> <span class="s2">"Zażółć gęślą jaźń, "</span> <span class="o">.</span> <span class="nv">$name</span> <span class="o">.</span> <span class="s1">'.'</span><span class="p">;</span>
<span class="k">echo</span> <span class="s2">"hehehe#jo.io"</span><span class="p">;</span>
</code></pre></div>
</html>
How to add line numbers? I put two files of the website below:
index.py
import sys
from pygments import highlight
from pygments.formatters import HtmlFormatter
# If there isn't only 2 args something weird is going on
expecting = 2;
if ( len(sys.argv) != expecting + 1 ):
exit(128)
# Get the code
language = (sys.argv[1]).lower()
filename = sys.argv[2]
f = open(filename, 'rb')
code = f.read()
f.close()
# PHP
if language == 'php':
from pygments.lexers import PhpLexer
lexer = PhpLexer(startinline=True)
# GUESS
elif language == 'guess':
from pygments.lexers import guess_lexer
lexer = guess_lexer( code )
# GET BY NAME
else:
from pygments.lexers import get_lexer_by_name
lexer = get_lexer_by_name( language )
# OUTPUT
formatter = HtmlFormatter(linenos='table', encoding='utf-8', nowrap=True)
highlighted = highlight(code, lexer, formatter)
print highlighted
index.php
<?php
define('MB_WPP_BASE', dirname(__FILE__));
/**
 * Highlight one code snippet by running it through the Pygments helper script.
 *
 * The snippet is written to a temporary file, index.py is executed on it and
 * the captured output is wrapped in a <div class="highlight ..."> element.
 *
 * @param array $matches Element [2] holds the language (class name) and
 *                       element [3] the source code to highlight.
 * @return string The wrapper markup containing the helper's output, or the
 *                input code as received when the helper exits with a
 *                non-zero status.
 */
function mb_pygments_convert_code($matches)
{
    $pygments_build = MB_WPP_BASE . '/index.py';
    $source_code = isset($matches[3]) ? $matches[3] : '';
    $class_name = isset($matches[2]) ? $matches[2] : '';

    // Creates a temporary filename
    $temp_file = tempnam(sys_get_temp_dir(), 'MB_Pygments_');

    // Populate temporary file
    $filehandle = fopen($temp_file, "w");
    fwrite($filehandle, html_entity_decode($source_code, ENT_COMPAT, 'UTF-8'));
    fclose($filehandle);

    // Creates pygments command. Every argument is shell-escaped: the language
    // comes from the class attribute of the processed markup and the paths
    // may contain spaces.
    $language = $class_name ? $class_name : 'guess';
    $command = sprintf(
        'C:\Python27/python %s %s %s',
        escapeshellarg($pygments_build),
        escapeshellarg($language),
        escapeshellarg($temp_file)
    );

    // Executes the command
    $output = array();
    $retVal = -1;
    exec($command, $output, $retVal);
    unlink($temp_file);

    // Returns Source Code
    $format = '<div class="highlight highlight-%s"><pre><code>%s</code></pre></div>';
    if ($retVal == 0) {
        $source_code = implode("\n", $output);
    }

    // The language ends up inside an attribute value, so escape it for HTML.
    $highlighted_code = sprintf($format, htmlspecialchars($language, ENT_QUOTES, 'UTF-8'), $source_code);
    return $highlighted_code;
}
// This prevent throwing error
libxml_use_internal_errors(true);
// Get all pre from post content
$dom = new DOMDocument();
$dom->loadHTML(mb_convert_encoding('
<pre class="php">
<code>
$name = "Jaś";
echo "Zażółć gęślą jaźń, " . $name . \'.\';
echo "<address>hehehe#jo.io</address>";
</code>
</pre>', 'HTML-ENTITIES', "UTF-8"), LIBXML_HTML_NODEFDTD);
$pres = $dom->getElementsByTagName('pre');
foreach ($pres as $pre) {
$class = $pre->attributes->getNamedItem('class')->nodeValue;
$code = $pre->nodeValue;
$args = array(
2 => $class, // Element at position [2] is the class
3 => $code // And element at position [3] is the code
);
// convert the code
$new_code = mb_pygments_convert_code($args);
// Replace the actual pre with the new one.
$new_pre = $dom->createDocumentFragment();
$new_pre->appendXML($new_code);
$pre->parentNode->replaceChild($new_pre, $pre);
}
// Save the HTML of the new code.
$newHtml = "";
foreach ($dom->getElementsByTagName('body')->item(0)->childNodes as $child) {
$newHtml .= $dom->saveHTML($child);
}
?>
<!doctype html>
<html lang="pl">
<head>
<meta charset="UTF-8">
<title>Document</title>
<link rel="stylesheet" href="gh.css">
</head>
<body>
<?= $newHtml ?>
</body>
</html>
Thank you
While reading the file try readlines:
f = open(filename, 'rb')
code = f.readlines()
f.close()
Done this way, the following will get multiple lines:
formatter = HtmlFormatter(linenos='table', encoding='utf-8', nowrap=True)
Suggestion:
More pythonic way of opening files is :
with open(filename, 'rb') as f:
code = f.readlines()
That's it — Python's context manager closes the file for you.
Solved!
nowrap
If set to True, don’t wrap the tokens at all, not even inside a tag. This disables most other options (default: False).
http://pygments.org/docs/formatters/#HtmlFormatter
Related
I'm trying to parse the testObj in the html into JSON, but it includes so much formatting.
I already tried to remove the non-ascii characters in the object, but json.loads() and yaml still can't parse the string into an object.
How can I parse the string into an object?
html
<!DOCTYPE html>
<html lang="en">
<head>
<title>Sample Document</title>
</head>
<body></body>
<script>
const testObj = {
a: 1,
b: 2,
c: 3,
};
</script>
</html>
Python Script
import lxml.html
import urllib.request
import os
import json
import yaml
def removeNonAscii(str):
    """Return *str* with everything outside printable ASCII removed.

    Characters with code points 32 (space) through 126 ('~') inclusive are
    kept. Tabs, newlines and other control characters are dropped along with
    all non-ASCII characters.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    # existing keyword callers keep working.
    # The upper bound is inclusive: the previous ``ord(i) < 126`` test also
    # threw away the printable '~'.
    return ''.join(ch for ch in str if 32 <= ord(ch) <= 126)
with urllib.request.urlopen('file:///'+os.path.abspath('./test.html')) as url:
page = url.read()
tree = lxml.html.fromstring(page)
x = tree.xpath("//script")[0].text_content()
json_str = x.strip().split('testObj = ')[1][:-1]
str = removeNonAscii(json_str)
print(str)
# >>> {a: 1,b: 2,c: 3,}
# Attempt 1 - This doesn't work as object doesn't originally have double quotes
# data = json.loads(str)
# >>> json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes
# Attempt 2 - Not sure how to detect or get rid of formatting
# data = yaml.load(str, yaml.SafeLoader)
# >>> ScannerError: While scanning for the next token found character '\t' that cannot start any token
print(data.a)
# >>> Should return 1
Edit: In my actual use case, the JSON object is very large and I cannot recreate the string. I need to remove the formatting and/or add double quotes to make it proper JSON so it can be parsed, but I am not sure how to do it. I have come close, getting it to {a: 1,b: 2,c: 3,}, but it still doesn't want to parse.
If it is as shown (not minified) then you can use the following regex to extract the string then hjson to add the quoted keys
import hjson, re
html = '''
<!DOCTYPE html>
<html lang="en">
<head>
<title>Sample Document</title>
</head>
<body></body>
<script>
const testObj = {
a: 1,
b: 2,
c: 3,
};
</script>
</html>'''
s = re.search(r'const testObj = ([\s\S]+?);', html).group(1)
res = hjson.loads(s)
print(res)
Regex:
I have this simple code:
html_string = '''<html lang="en-US">
'<head>
<title>My Python articles</title>
</head>
<body>'''
for i in range(2):
html_string += '''
<p>
<span style="white-space: pre-line">$''' + str(i) + '''</span>
</p>'''
html_string += '''</body>
</html>'''
html_template = Template(html_string)
output_dir = "./html/"
output_path = os.path.join(output_dir, 'my_page.html')
with io.open(output_path, 'w+', encoding='UTF-8', errors='replace') as html_output:
for i in range(2):
html_output.write(html_template.safe_substitute(i="Hallo"))
html_output.truncate()
It looks like the i in the html_output.write(html_template.safe_substitute(i="Hello")) doesn't correspond to the i in the for loop and all I get is:
$0
$1
$0
$1
$0 and $1 need to exist only once and each of them have to be replaced with the word Hello. Later I'll be replacing $0 and $1 each with a different input.
The docs for template strings have this to say about substitution identifiers:
By default, "identifier" is restricted to any case-insensitive ASCII alphanumeric string (including underscores) that starts with an underscore or ASCII letter.
Identifiers like "$0" and "$1" don't satisfy this condition, because they start with an ASCII digit.
Inserting a letter between the "$" and the digit like this ought to work:
html_string = '''<html lang="en-US">
'<head>
<title>My Python articles</title>
</head>
<body>'''
# Make substitution identifiers like "$Ti"
for i in range(2):
html_string += '''
<p>
<span style="white-space: pre-line">$T''' + str(i) + '''</span>
</p>'''
html_string += '''</body>
</html>'''
html_template = Template(html_string)
# Map identifiers to values
mapping = {'T' + str(i): 'Hello' for i in range(2)}
output_dir = "./html/"
output_path = os.path.join(output_dir, 'my_page.html')
with open(output_path, 'w+', encoding='UTF-8', errors='replace') as html_output:
html_output.write(html_template.safe_substitute(mapping))
html_output.truncate()
I know this is asked before also, but I am a newbie in scraping and python. Please help me and it would be very much helpful in my learning path.
I am scraping a news site using python with packages such as Beautiful Soup and etc.
I am having difficulty getting the value of a JavaScript variable which is declared in a script tag and is also updated there.
Here is the part of HTML page which I am scraping:(containing only script part)
<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>
From the above part, I want to get the value of min_news_id in python.
I should also get the value of same variable if updated from line 2.
Here is how I am doing it:
self.pattern = re.compile('var min_news_id = (.+?);') // or self.pattern = re.compile('min_news_id = (.+?);')
page = bs(htmlPage, "html.parser")
//find all the scripts tag
scripts = page.find_all("script")
for script in scripts:
for line in script:
scriptString = str(line)
if "min_news_id" in scriptString:
scriptString.replace('"', '\\"')
print(scriptString)
if(self.pattern.match(str(scriptString))):
print("matched")
data = self.pattern.match(scriptString)
jsVariable = json.loads(data.groups()[0])
InShortsScraper.newsOffset = jsVariable
print(InShortsScraper.newsOffset)
But I never get the value of the variable. Is the problem with my regular expression or with something else? Please help me.
Thank You in advance.
html = '''<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>'''
finder = re.findall(r'min_news_id = .*;', html)
print(finder)
Output:
['min_news_id = "d7zlgjdu-1";', 'min_news_id = data.min_news_id||min_news_id;']
#2 OR YOU CAN USE
print(finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip())
Output:
d7zlgjdu-1
#3 OR YOU CAN USE
finder = re.findall(r'[a-z0-9]{8}-[0-9]', html)
print(finder)
Output:
['d7zlgjdu-1']
You can't monitor a JavaScript variable change using BeautifulSoup. Here is how to get the next page of news using a while loop, re and json:
from bs4 import BeautifulSoup
import requests, re
page_url = 'https://inshorts.com/en/read/politics'
ajax_url = 'https://inshorts.com/en/ajax/more_news'
htmlPage = requests.get(page_url).text
# BeautifulSoup extract article summary
# page = BeautifulSoup(htmlPage, "html.parser")
# ...
# get current min_news_id
min_news_id = re.search('min_news_id\s+=\s+"([^"]+)', htmlPage).group(1) # result: d7zlgjdu-1
customHead = {'X-Requested-With': 'XMLHttpRequest', 'Referer': page_url}
while min_news_id:
# change "politics" if in different category
reqBody = {'category' : 'politics', 'news_offset' : min_news_id }
# get Ajax next page
ajax_response = requests.post(ajax_url, headers=customHead, data=reqBody).json() # parse string to json
# again, do extract article summary
page = BeautifulSoup(ajax_response["html"], "html.parser")
# ....
# ....
# new min_news_id
min_news_id = ajax_response["min_news_id"]
# remove this to loop all page (thousand?)
break
Thank you for the response. I finally solved it using the requests package after reading its documentation;
here is my code :
if InShortsScraper.firstLoad == True:
self.pattern = re.compile('var min_news_id = (.+?);')
else:
self.pattern = re.compile('min_news_id = (.+?);')
page = None
# print("Pattern: " + str(self.pattern))
if news_offset == None:
htmlPage = urlopen(url)
page = bs(htmlPage, "html.parser")
else:
self.loadMore['news_offset'] = InShortsScraper.newsOffset
# print("payload : " + str(self.loadMore))
try:
r = myRequest.post(
url = url,
data = self.loadMore
)
except TypeError:
print("Error in loading")
InShortsScraper.newsOffset = r.json()["min_news_id"]
page = bs(r.json()["html"], "html.parser")
#print(page)
if InShortsScraper.newsOffset == None:
scripts = page.find_all("script")
for script in scripts:
for line in script:
scriptString = str(line)
if "min_news_id" in scriptString:
finder = re.findall(self.pattern, scriptString)
InShortsScraper.newsOffset = finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip()
I currently have an HTML file and a python file. The python file uses YELP's API and returns JSON data. How do I display that data onto my webpage through HTML? Is there a function like document.getElementById("id").innerHTML = JSONDATA from JavaScript?
Please let me know if you need any more details; this is my first time posting and first time using an API/making a website. I understand the JSON data is not going to look nice but I will put it into a dictionary and sort it later, basically right now I am just wondering how to display data from a Python file into a HTML file. Also, feel free to link any helpful tutorials.
Found the following Node.js code as it was suggested to use Javascript instead, where in this would I put my tokens/secrets? And then how would I call it in my html file... Thank you.
/* require the modules needed */
var oauthSignature = require('oauth-signature');
var n = require('nonce')();
var request = require('request');
var qs = require('querystring');
var _ = require('lodash');
/* Function for yelp call
* ------------------------
* set_parameters: object with params to search
* callback: callback(error, response, body)
*/
var request_yelp = function(set_parameters, callback) {
/* The type of request */
var httpMethod = 'GET';
/* The url we are using for the request */
var url = 'http://api.yelp.com/v2/search';
/* We can setup default parameters here */
var default_parameters = {
location: 'San+Francisco',
sort: '2'
};
/* We set the require parameters here */
var required_parameters = {
oauth_consumer_key : process.env.oauth_consumer_key,
oauth_token : process.env.oauth_token,
oauth_nonce : n(),
oauth_timestamp : n().toString().substr(0,10),
oauth_signature_method : 'HMAC-SHA1',
oauth_version : '1.0'
};
/* We combine all the parameters in order of importance */
var parameters = _.assign(default_parameters, set_parameters, required_parameters);
/* We set our secrets here */
var consumerSecret = process.env.consumerSecret;
var tokenSecret = process.env.tokenSecret;
/* Then we call Yelp's Oauth 1.0a server, and it returns a signature */
/* Note: This signature is only good for 300 seconds after the oauth_timestamp */
var signature = oauthSignature.generate(httpMethod, url, parameters, consumerSecret, tokenSecret, { encodeSignature: false});
/* We add the signature to the list of paramters */
parameters.oauth_signature = signature;
/* Then we turn the paramters object, to a query string */
var paramURL = qs.stringify(parameters);
/* Add the query string to the url */
var apiURL = url+'?'+paramURL;
/* Then we use request to send make the API Request */
request(apiURL, function(error, response, body){
return callback(error, response, body);
});
};
I had a similar situation. I had to show the IAM users of AWS account in a HTML page. I used AWS boto3 Python client to grab all IAM users and write a JSON file. Then from HTML file I read that JSON file and showed all users in a table.
Here is the Python code IAM.PY:
import boto3
import os
import subprocess
import json
iam_client = boto3.client('iam')
def list_user_cli():
    """Run ``aws iam list-users`` through the shell and return its output.

    Returns:
        The command's standard output, decoded from ASCII.
    """
    raw_bytes = subprocess.check_output("aws iam list-users", shell=True)
    return str(raw_bytes.decode('ascii'))
def write_json_file(filename, data):
    """Write *data* to *filename* and report the outcome on stdout.

    Any exception raised while opening or writing the file is caught and its
    message is printed instead of being propagated.
    """
    try:
        with open(filename, "w") as out_file:
            out_file.writelines(data)
        print(filename + " has been created.")
    except Exception as err:
        print(str(err))
if __name__ == "__main__":
    # Dump the IAM user listing into iam.json.
    write_json_file("iam.json", list_user_cli())
Here is the HTML file IAM.HTML:
<!DOCTYPE html>
<html>
<head>
<!-- Latest compiled and minified CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
<title>IAM User List</title>
<style type="text/css">
body{
margin: 20px;
}
</style>
</head>
<body>
<div class="container">
<table class="table table-responsive table-hover table-bordered">
<thead>
<tr>
<th>User ID</th>
<th>User Name</th>
<th>Path</th>
<th>Create Date</th>
<th>Arn</th>
</tr>
</thead>
<tbody id="iam_tbody">
</tbody>
</table>
</div>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.2.0/jquery.min.js"></script>
<script type="text/javascript">
$(document).ready(function(){
    // Fields to render, in the same order as the <th> cells of the table.
    var columns = ["UserId", "UserName", "Path", "CreateDate", "Arn"];

    $.ajax({
        method: "GET",
        url: "http://localhost/iam/iam.json",
    }).done(function(response){
        // Locals are declared with var so they no longer leak into the
        // global scope; a missing Users key renders an empty table.
        var user_list = response.Users || [];
        var $tbody = $("#iam_tbody");

        for (var i = 0; i < user_list.length; i++) {
            // Building the row as elements guarantees it is properly closed
            // (the string version ended each row with "<tr>" instead of "</tr>").
            var $tr = $("<tr>");
            for (var c = 0; c < columns.length; c++) {
                // .text() escapes the value, so markup inside a field cannot
                // inject HTML. String() keeps the setter form of .text() even
                // when the field is undefined.
                $("<td>").text(String(user_list[i][columns[c]])).appendTo($tr);
            }
            $tbody.append($tr);
        }
    });
});
</script>
</body>
</html>
Output
You can use Jquery Ajax to call your API, include Jquery in your html file.
$.ajax({
method: "GET",
url: "api_url",
}).done(function( response ) {
$('#divId').append(response);
});
In Your Html File
<div id="divId"></div>
Jquery Ajax Documentation
I've built a simple web server that gets a request and send a response. So when the server gets an invalid request, like "localhost/not-a-page", the server will send a response with the content of the HTML file "404.html" the webpage should display an image. So far, so good.
But when the 404 page loads up, the page can't find the image. The HTML part is correct and works offline. I've tried moving the image to several locations, relative to the Python script and relative to the HTML, but it just can't find it. I'm trying to make the server as low-level as I can because I want to learn how servers work, so I'm not using any server-related libraries. I'm using only Python's socket library.
I'll appreciate any help to resolve this problem without using other libraries,
EDIT
Here is the relevant Python part :
import socket
import threading
import os
default_error_page = """\
<!DOCTYPE HTML>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<center>
<h1>Response</h1>
<p>Error code: %(code)d</p>
<p>Message: %(status)s.</p>
</center>
</body>
</html>
"""
default_header_status = "HTTP/1.1 %(code)d %(status)s\r\n"
default_header_content_type = "Content-Type: text/html; charset=utf-8\r\n\r\n"
buffer_size = 1024
def get_page(code):
    """Return the HTML body to send for HTTP status *code*.

    For 200 the module-level ``default_error_page`` template is returned
    unchanged. For any other code the contents of ``www/not-found.html``,
    located next to this script, are returned.

    Raises:
        OSError: if ``www/not-found.html`` cannot be opened.
    """
    if code == 200:
        return default_error_page

    # os.path.join copes with an empty dirname (script started from its own
    # directory), where plain concatenation would yield "/www/not-found.html".
    not_found_path = os.path.join(
        os.path.dirname(__file__), "www", "not-found.html"
    )
    # The context manager closes the handle; it used to be left open.
    with open(not_found_path, 'r') as page_file:
        return page_file.read()
class BaseServer:
    """Minimal HTTP server built directly on the ``socket`` module.

    Only the request line is inspected: a request for ``/`` is answered with
    status 200 and every other path with status 404. Connections are handled
    one at a time inside the accept loop.
    """

    # NOTE: these are class attributes, so the listening socket is created
    # once, when the class body runs, and is shared by every instance.
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_name = ""
    host_name = ""  # passed to bind() as the host part of the address
    host_port = 8000  # default port
    is_shutdown = False

    def __init__(self, name):
        """Store *name*; it is used as the name of the server thread."""
        self.server_name = name

    def start_server(self):
        """Start the accept loop on its own thread and return immediately."""
        # Pass the bound method itself. ``target=self.run_server()`` would
        # run the accept loop right here in the calling thread, so the
        # Thread object would never even be created.
        thread = threading.Thread(target=self.run_server, name=self.server_name)
        thread.start()

    def run_server(self):
        """Bind, listen and serve connections until ``is_shutdown`` is set."""
        self.server_socket.bind((self.host_name, self.host_port))  # bind to host
        self.server_socket.listen()
        while not self.is_shutdown:
            conn, addr = self.server_socket.accept()
            self.handle_request(conn, addr)

    def handle_request(self, connection, address):
        """Read one request from *connection*, answer it and close it.

        Args:
            connection: the accepted client socket.
            address: ``(host, port)`` pair of the client, used for logging.
        """
        print(str(address[0]) + " Connected! (port " + str(address[1]) + ")")
        try:
            result = self.parse_request(connection.recv(buffer_size))
            code = 200 if result == 0 else 404
            # parse_response already returns encoded bytes.
            connection.sendall(self.parse_response(code))
        finally:
            # Always release the client socket, even if building or sending
            # the response failed.
            connection.close()

    def parse_request(self, data):
        """Classify the raw request bytes in *data*.

        Returns:
            0 when the request line asks for ``/``, 1 for any other path or
            for a request line that cannot be parsed, and ``None`` when
            *data* is empty.
        """
        if len(data) == 0:
            return
        try:
            request_line = bytes(data).decode('utf-8').split('\r\n')[0]
            command, path, version = request_line.split()
        except ValueError:
            # Covers undecodable bytes (UnicodeDecodeError is a ValueError)
            # and a request line without exactly three fields.
            return 1
        print("command - " + command)
        print("path - " + path)
        print("version - " + version)
        return 0 if path == "/" else 1

    def parse_response(self, code):
        """Build the complete HTTP response for status *code*.

        Returns:
            The status line, the Content-Type header and the page returned
            by ``get_page(code)``, encoded as UTF-8 bytes.
        """
        status = {200: "OK", 404: "NOT FOUND"}.get(code, "ERROR")
        base_header = default_header_status % {'code': code, 'status': status}
        page = str(get_page(code))
        return_string = (base_header + default_header_content_type + page).encode('utf-8')
        print(return_string)
        return return_string
def main():
    """Create the "Home Server" instance and start it."""
    BaseServer("Home Server").start_server()
if __name__ == "__main__":
main()
And this is the HTML:
<html>
<head>
<link rel="stylesheet" type="text/css" href="/style/main.css"/>
<style>
*{
padding:0;
margin:0;
}
body{
background-color:#ffe6b3;
}
h1{
margin-top:30px;
background-color:#ffcc66;
font-size:3em;
display:inline-block;
color:#3a0000;
}
p{
margin-top:80px;
font-size:2em;
color:#3a0000;
}
#img404{
background-image:url(../images/404.gif);
width:100%;
height:50%;
background-repeat:no-repeat;
background-position:center 20%;
}
</style>
</head>
<body>
<center>
<div class=top>
<h1>ERROR 404</h1>
</div>
<p>
Sorry, we could not find the page :(
</p>
<div id="img404">
</div>
</center>
</body>
</html>
Sorry if it's not very readable, but I'm on the phone.
Dima.
Don't use relative paths for image like ../images/img.gif. Rather use full url or url relative to the root.
http://localhost/images/img.gif - full url
/images/img.gif - path relative to root url
Figured out my problem.
After I saw the logs, I realized that the browser sends another request, for the image.
And silly me, my code is:
if path ="/":
status = 0
else:
status = 1
So for every request which is not root("/") the server will return 404.
Oops