How to Insert dictionary values into a html template file using python? - python

I have a html template file as shown below and I want to replace the title and body with dictionary values in my python script.
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>#Want to insert dictionary values here in python></title>
<LINK href="styles.css" rel="stylesheet" type="text/css">
</head>
<body>
<img src="forkit.gif" id="octocat" alt="" />
<!-- Feel free to change this text here -->
<p>
#Want to insert dictionary values here in python>
</p>
<p>
#Want to insert dictionary values here in python>
</p>
</body>
</html>
I'm parsing a JSON file and storing the values in a dictionary, and now I want to insert those values into the HTML file that gets created.
import json
#from lxml import etree
# Load the JSON document into a dictionary. JSON text is UTF-8, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('ninjs_basic.json', 'r', encoding='utf-8') as file:
    resp_str = file.read()
resp_dict = json.loads(resp_str)

# Dump the selected fields as plain text, one after another.
# NOTE: this overwrites result.html with raw values only; it does not fill
# in the HTML template shown above.
with open('result.html', 'w', encoding='utf-8') as output:
    output.write('uri: ' + resp_dict['uri'] + '\n')
    output.write(resp_dict['headline'] + '\n')
    output.write(resp_dict['body_text'])
I tried the code above and had no luck. What would be the right approach here?

Here is an example using SimplifiedDoc.
from simplified_scrapy import SimplifiedDoc,req,utils
import json
# Template to fill in. The <title> content is replaced directly, while
# <placeholder1 /> and <placeholder2 /> mark the spots inside the paragraphs
# where text is inserted. The document starts with the DOCTYPE (no stray
# <body> tag in front of it).
html = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>#Want to insert dictionary values here in python></title>
<LINK href="styles.css" rel="stylesheet" type="text/css">
</head>
<body>
<img src="forkit.gif" id="octocat" alt="" />
<!-- Feel free to change this text here -->
<p>
#Want to insert dictionary values here in python>
<placeholder1 />
</p>
<p>
#Want to insert dictionary values here in python>
<placeholder2 />
</p>
</body>
</html>'''
doc = SimplifiedDoc(html)

# The values would normally come from the JSON file, e.g.:
#     with open('ninjs_basic.json', 'r') as file:
#         resp_dict = json.load(file)

# Modify the document first, so result.html is only opened (and truncated)
# once the new content has been produced successfully.
doc.title.setContent("The title you took from the JSON file")
# NOTE(review): `repleaceSelf` is spelled exactly as in the original answer;
# confirm the method name against the simplified_scrapy API.
doc.placeholder1.repleaceSelf("Want to insert dictionary values here in python")
doc.placeholder2.repleaceSelf("Want to insert dictionary values here in python")

# The template declares charset=utf-8, so write the file as UTF-8.
with open('result.html', 'w', encoding='utf-8') as output:
    output.write(doc.html)

Related

Find duplicate id attributes

Before uploading on my server I want to check if I accidentally defined an id two or more times in one of my html files:
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>The HTML5 Herald</title>
<meta name="description" content="The HTML5 Herald">
<meta name="author" content="SitePoint">
<link rel="stylesheet" href="css/styles.css?v=1.0">
</head>
<body>
<div id="test"></div>
<div id="test"></div>
</body>
</html>
The idea is to print an error message if there are duplicates:
"ERROR: The id="test" is not unique."
You can do this by using find_all to gather all elements with an id attribute, and then collections.Counter to find the ids that occur more than once:
import bs4
import collections
# `html` is the page source as a string (the document shown in the question).
# Name the parser explicitly; otherwise bs4 picks whichever parser happens
# to be installed, which can change the parse result between machines.
soup = bs4.BeautifulSoup(html, 'html.parser')

# Count how often each id value occurs among all elements carrying an id.
ids = collections.Counter(
    a.attrs['id'] for a in soup.find_all(attrs={'id': True})
)

# Report every id that is used more than once.
dups = [key for key, value in ids.items() if value > 1]
for d in dups:
    print('ERROR: The id="{}" is not unique.'.format(d))
>>> ERROR: The id="test" is not unique.
You could use a regex to find all ids in the HTML and then search for duplicates.
For example:
import re
html_page = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>$The HTML5 Herald</title>
<div id="test1"></div>
<meta name="description" content="The HTML5 Herald">
<meta name="author" content="SitePoint">
<link $rel="stylesheet" href="css/styles.css?v=1.0">
</head>
<body>
<div id="test2"></div>
<div id="test2"></div>
</body>
<div id="test3"></div>
</html>
"""

# Match double-quoted id="..." attributes that are preceded by whitespace
# (so attributes such as data-id are not picked up). [^"]+ instead of \w+
# also catches ids containing hyphens, dots or colons, e.g. id="main-nav".
ID_ATTR_RE = re.compile(r'(?<=\s)id="[^"]+"')

ids_match = ID_ATTR_RE.findall(html_page)
print(ids_match) #-> ['id="test1"', 'id="test2"', 'id="test2"', 'id="test3"']
print(len(ids_match)) #-> 4
print(len(set(ids_match))) #-> 3
# the following prints True if there are duplicates in ids_match
print(len(ids_match) != len(set(ids_match))) #-> True

Switch all "href = (link)" with "onclick = (PythonScript(link)) "

I am working on a webscraper that scrapes a website, does some stuff to the body of the website, and outputs that into a new html file. One of the features would be to take any hyperlinks in the html file and instead run a script where the link would be an input for the script.
I want to go from this..
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Scraper</title>
</head>
<body>
<a href="/wiki/Mercury_poisoning" title="Mercury poisoning">
mercury poisoning
</a>
</body>
</html>
To this....
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Scraper</title>
</head>
<body>
<a onclick ='pythonScript(/wiki/Mercury_poisoning)' href="#" title="Mercury poisoning">
mercury poisoning
</a>
</body>
</html>
I did a lot of googling and I read about jQuery and ajax but do not know these tools and would prefer to do this in python. Is it possible to do this using File IO in python?
You can do something like this using BeautifulSoup:
PS: You need to install Beautifulsoup: pip install bs4
from bs4 import BeautifulSoup as bs
html = '''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Scraper</title>
</head>
<body>
<a href="/wiki/Mercury_poisoning" title="Mercury poisoning">
mercury poisoning
</a>
</body>
</html>
'''
soup = bs(html, 'html.parser')

# href=True restricts the search to anchors that actually have an href, so
# link['href'] below cannot raise KeyError on e.g. <a name="top">.
links = soup.find_all('a', href=True)
for link in links:
    # Remember the real target, neutralise the href, and hand the target to
    # the script call in the onclick attribute instead.
    actual_link = link['href']
    link['href'] = '#'
    link['onclick'] = 'pythonScript({})'.format(actual_link)
print(soup)
Output:
<html>
<head>
<meta charset="utf-8"/>
<title>Scraper</title>
</head>
<body>
<a href="#" onclick="pythonScript(/wiki/Mercury_poisoning)" title="Mercury poisoning">
mercury poisoning
</a>
</body>
</html>
Bonus:
You can also create a new HTML file like this:
# str(soup) serialises the modified tree back to markup. The document
# declares charset utf-8, so the file is written as UTF-8 explicitly.
with open('new_html_file.html', 'w', encoding='utf-8') as out:
    out.write(str(soup))

Scraping webpage

I am trying to write a Python script to scrape data from this webpage. I am trying to scrape the data from the second table ('class': 'char-pico-table') and am using this script to do so:
def getPICO(url):
    """Fetch *url* with requests and print the raw response body (bytes)."""
    r = requests.get(url)
    print(r.content)
However, this prints this:
b'<!DOCTYPE html>\n<html class="view">\n <head>\n <title>RobotReviewer: Automating evidence synthesis</title>\n <meta charset="utf-8">\n <meta name="viewport" content="width=device-width, initial-scale=1.0">\n <meta name="google" content="notranslate">\n\n <link rel="stylesheet" type="text/css" href="//maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css">\n <link rel="stylesheet" type="text/css" href="/css/main.css">\n <link rel="stylesheet alternative prefetch" type=text/css href="/css/report.css">\n\n <!-- Preload examples -->\n <link rel="prefetch" href="/report_view/Tvg0-pHV2QBsYpJxE2KW-/html">\n <link rel="prefetch" href="/report_view/_fzGUEvWAeRsqYSmNQbBq/html">\n <link rel="prefetch" href="/report_view/HBkzX1I3Uz_kZEQYeqXJf/html">\n\n <!-- / Preload examples -->\n\n\n <script src="/scripts/modernizr.js"></script>\n <script src="/scripts/spa/scripts/vendor/pdfjs/pdf.js"></script>\n <script src="/scripts/spa/scripts/vendor/compatibility.js"></script>\n <script data-main="/scripts/main" src="/scripts/require.js"></script>\n\n <script>\n PDFJS.disableWebGL = false;\n CSRF_TOKEN = "1508009356##6a03b1bf519972b27a0d871ae4823eb3a3366c0c";\n </script>\n </head>\n\n <body>\n <nav id="top-bar" class="top-bar" data-topbar role="navigation">\n <div>\n <ul class="title-area">\n <li class="name">\n <h1><img src="/img/logo.svg" width="190px"></h1>\n </li>\n </ul>\n\n <section class="top-bar-section">\n <ul class="right">\n <li>About</li>\n </ul>\n </section>\n </div>\n </nav>\n\n <div id="breadcrumbs"></div>\n\n <main id="main"></main>\n\n\n </body>\n</html>'
which is not the output that I see when I view the page in my browser - it contains none of the data that I wish to scrape. Why is this not the case?
When viewing the page in a web browser it looks like this:
Expected Output
Based on the comment from @Shahin, I wrote the following code, which gave me the data in JSON format, from which I was easily able to extract what I needed.
# Build the JSON endpoint for the report; `id` is supplied by the surrounding code.
report_url = 'https://robot-reviewer.vortext.systems/report_view/' + id + '/json'
# Download the report and decode the response body into Python objects.
result = json.loads(requests.get(report_url).content)

python lxml.html add parameter

I have an HTML template to which I want to add some content. The template looks like this:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" >
<head>
<title>Data Base</title>
<link rel="stylesheet" href="stylesheet.css" />
</head>
<body>
<h1>Data Base</h1>
<div class="file_explorer">
</div>
<div class="info_screen">
</div>
</body>
</html>
I want to search for the <div class="file_explorer"></div> and add some parameters to it. Afterwards it should look like this:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" >
<head>
<title>Data Base</title>
<link rel="stylesheet" href="stylesheet.css" />
</head>
<body>
<h1>Data Base</h1>
<div class="file_explorer">
<p class="folder">Folder_1</p>
<p class="folder">Folder_2</p>
</div>
<div class="info_screen">
</div>
</body>
</html>
Therefore I tried to parse the HTML template and wanted to search for the file_explorer div to add the paragraphs. How do I search for it and add the paragraphs afterwards? I tried html.cssselector but it did not work. Please help me. This is my code:
from lxml import html
from os import path
class HtmlGenerator:
    """Groups the helper that rewrites the HTML template in place."""

    @staticmethod
    def modify_html(html_path, list_folders):
        """Parse the template at *html_path* and write it back pretty-printed.

        html_path: path to the template; it is made absolute and is only
            processed when it is an existing file ending in ".html".
        list_folders: iterable of folders that should become paragraphs
            inside <div class="file_explorer"> (not implemented yet, see
            the [MISSING] markers).
        """
        html_path = path.abspath(html_path)
        parser = html.HTMLParser(remove_blank_text=True)
        if path.isfile(html_path) and html_path.endswith(".html"):
            tree = html.parse(html_path, parser)
            # search for <div class="file_explorer"> [MISSING]
            for folder in list_folders:
                # add folder as paragraph to html [MISSING]
                pass
            tree.write(html_path, pretty_print=True)
Thanks in advance.
You can use XPath to find the target div in your template, and then use the E-factory to build the new elements:
from lxml.html import builder as E
....
tree = html.parse(html_path, parser)
root = tree.getroot()
# search for <div class="file_explorer">; the attribute test in the path
# expression is written with "@" ([@class="..."])
div = root.find('.//div[@class="file_explorer"]')
if div is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the append below.
    raise ValueError('no <div class="file_explorer"> found in ' + html_path)
for folder in list_folders:
    # add folder as paragraph to html
    # `folder` is assumed to be a string like 'Folder_1', 'Folder_2', ...
    div.append(E.P(E.CLASS('folder'), folder))
tree.write(html_path, pretty_print=True)

Append before closing body tag in python

ok guys so I have a template.html file like so:
<h1>Hello wolrd</h1>
<div>This is me</div>
And I want to append that to my index file before the closing body tag. Just like so:
<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title></title>
</head>
<body>
<script type="text/ng-template" id="templates/template.html">
<h1>Hello wolrd</h1>
<div>This is me</div>
</script>
</body>
</html>
I've so far gotten to read the file and append to the end of it but I have yet to add the script tags to the file that I am reading and append to the correct spot of my file. This is what I currently have:
#!/usr/bin/env python
import fileinput
# Read the whole source page. `with` closes the file even if reading fails,
# replacing the manual try/finally bookkeeping.
with open('index.html', "r") as to_readfile:
    reading_file = to_readfile.read()

# Append a newline followed by the page contents to the target file
# ('a' mode keeps whatever index2.html already contains).
with open('index2.html', 'a') as writefile:
    writefile.write("\n")
    writefile.write(reading_file)
Any help would be much appreciated. Thank you!
The simplest approach would be to add a placeholder in the layout template and then when processing the layout search for the placeholder and replace it with the contents of the other template.
<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title></title>
</head>
<body>
<script type="text/ng-template" id="templates/template.html">
{{content}}
</script>
</body>
</html>
...
..
.
# Read the layout that contains the {{content}} placeholder.
with open('layout.html', "r") as layout:
    layout_contents = layout.read()

# Read the partial template. str.replace() needs its text, not the file
# object, so the contents are read into a string first.
with open('partial_file.html', 'r') as partial:
    partial_contents = partial.read()

# Substitute the placeholder with the partial's markup.
result = layout_contents.replace("{{content}}", partial_contents)

# Write the rendered page; `with` makes sure the data is flushed and the
# file is closed.
with open("file_to_write.html", "w") as writefile:
    writefile.write("\n")
    writefile.write(result)
.
..
....
You can also work on a much more extensive solution such as the ones used by jinja http://jinja.pocoo.org/docs/templates/#template-inheritance.

Categories

Resources