#!/usr/bin/python
# Import modules for CGI handling
import cgi, cgitb
# Create instance of FieldStorage
form = cgi.FieldStorage()
name = form.getvalue('name')
age = int(form.getvalue('age')) + 1
print "Content-type: text/html"
print
print "<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"
print "<html>"
print "<head><title></title></head>"
print "<body>"
print "<p> Hello, %s</p>" % (name)
print "<p> Next year, you will be %s years old.</p>" % age
print "</body>"
print "</html>"
Whenever I write the DOCTYPE down, I get an Invalid Syntax error. Don't know what the problem is. Help would be appreciated since I'm new to python. Thank you!
Your quotes are conflicting (notice how the syntax highlighting breaks after that line).
Either use single quotes:
print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
Or triple quote it:
print """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"""
Use different quotes:
print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN""http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
The print statement sees the quotes in the middle as ending quotes. You need to escape the inner quotes with \" or use different quotes for the outer string.
print '<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">'
You have double-quoted a string that already contains a double-quote. Python thinks your string ends after PUBLIC, and the next thing appears to be a minus sign followed by a division sign, which is an error. On top of that, you have broken the string into two lines without any continuation characters, which won't work. Use triple-quotes to allow a string to continue from one line to the next (this will also resolve your problem with the embedded " characters).
print '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'''
For that kind of long, multi-line text you might prefer using triple quotes (""").
Coupled with the string format method available on any reasonably recent version of Python, you get a poor man's template engine:
# CGI response template, filled in with str.format(name=..., age=...).
# The blank line after the Content-type header separates the HTTP headers
# from the document body; both DOCTYPE identifiers are fully quoted.
tmpl = """Content-type: text/html

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head><title></title></head>
<body>
<p> Hello, {name}</p>
<p> Next year, you will be {age} years old.</p>
</body>
</html>
"""
print tmpl.format(name='Sylvain', age=40)
Related
I have a large number of HTML documents that must be converted to XML. Not all may look exactly the same. For example, the sample below ends with an HTML comment tag, not with the HTML tag.
Note this question is related to this one.
Here is my code:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<comment>this is an HTML comment</comment>
<comment>this is another HTML comment</comment>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
...
<comment>here is a comment inside the head tag</comment>
</head>
<body>
...
<comment>Comment inside body tag</comment>
<comment>Another comment inside body tag</comment>
<comment>There could be many comments in each file and scattered, not just 1 in the head and three in the body. This is just a sample.</comment>
</body>
</html>
<comment>This comment is the last line of the file</comment>
I wish to wrap the entire document with a custom tag called <root>. So far, the best I can do is wrap <root> around <html>.
root_tag = bs4.Tag(name="root")
soup.html.wrap(root_tag)
How can I position the <root> element such that it wraps the entire document?
A little crude, as this is just wrapping any given file in <root> </root>
See if it works for your use case:
def root_wrap(file):
    """Wrap the entire contents of a file in a ``<root>`` element, in place.

    The file is read completely, then rewritten as ``<root>`` + original
    contents + ``</root>``.  Nothing is parsed: whatever the file contains
    (including any leading DOCTYPE or trailing comments) ends up inside the
    wrapper.

    Args:
        file: Path of the file to rewrite.
    """
    # Read everything first; writing to a handle while iterating over it
    # would overwrite the data that is still waiting to be read.
    with open(file, 'r') as fin:
        content = fin.read()

    with open(file, 'w') as fout:
        fout.write('<root>')
        fout.write(content)
        fout.write('</root>')
Having switched from Fedora 17 to 18, I get different parsing behaviour for the same lxml code, apparently due to different versions of the underlying libraries (libxml2 and libxslt versions changed).
Here's an example of lxml code with different results for the two versions:
from io import BytesIO
from lxml import etree
myHtmlString = \
'<!doctype html public "-//w3c//dtd html 4.0 transitional//en">\r\n'+\
'<html>\r\n'+\
'<head>\r\n'+\
' <title>Title</title>\r\n'+\
'</head>\r\n'+\
'<body/>\r\n'+\
'</html>\r\n'
myFile = BytesIO(myHtmlString)
myTree = etree.parse(myFile, etree.HTMLParser())
myTextElements = myTree.xpath("//text()")
myFullText = ''.join([myEl for myEl in myTextElements])
assert myFullText == 'Title', repr(myFullText)
The f17 version passes the assert, i.e. xpath("//text()") only returns text 'Title', whereas the f18 version fails with output
Traceback (most recent call last):
File "TestLxml.py", line 17, in <module>
assert myFullText == 'Title', repr(myFullText)
AssertionError: '\r\n\r\n Title\r\n\r\n\r\n'
Apparently, the f18 version handles newlines and whitespace differently from the f17 version.
Is there a way to have control over this behaviour? (An optional argument somewhere?)
Or even better, is there a way in which I can get the old behaviour back using the new libraries?
In XML, text() returns the text inside the tags as is (unstripped), so any whitespace characters, tabs, or newlines are included.
It might be that, by constructing the multi-line string with + and \r\n, you are accidentally testing two different strings.
Try changing your string to a triple-quoted string like the example below and test it again.
from io import BytesIO
from lxml import etree
html = '''
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<title>Title</title>
</head>
<body/>
</html>
'''
tree = etree.parse(BytesIO(html), etree.HTMLParser())
text_elements = tree.xpath("//text()")
full_text = ''.join(text_elements)
assert full_text == 'Title', repr(full_text)
You can also see that surrounding the text with spaces or newlines makes them part of what text() returns. See the title below.
html = '''
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<title> Title </title>
</head>
<body/>
</html>
'''
tree = etree.parse(BytesIO(html), etree.HTMLParser())
text_elements = tree.xpath("//text()")
full_text = ''.join(text_elements)
assert full_text == ' Title ', repr(full_text)
If you don't need the spaces you can always call strip() on the string yourself. If you're sure you're getting spaces even though your tags do not contain them, then you should report that as a bug on the lxml mailing list.
I'm trying to translate text out of a template file in a Pyramid project. More or less as in this example: http://docs.pylonsproject.org/projects/pyramid_cookbook/en/latest/chameleon_i18n.html
Now how do I get rid of the <dynamic element> in the comment of my .pot file? I'd like to see the rest of the code along with its tags.
My chameleon template (.pt):
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" xmlns:tal="http://xml.zope.org/namespaces/tal"
xmlns:i18n="http://xml.zope.org/namespaces/i18n"
i18n:domain="MyDomain">
<head>
...
</head>
<body>
<div i18n:translate="MyID">
This will appear in the comments.
<span>This will NOT.</span>
While this will again appear.
</div>
</body>
</html>
I use Babel and Lingua to extract the messages with the following options in my setup.py:
message_extractors = { '.': [
('**.py', 'lingua_python', None ),
('**.pt', 'lingua_xml', None ),
]}
And the relevant output in my .pot file looks like this:
#. Default: This will appear in the comments. <dynamic element> While this will
#. again appear.
#: myproject/templates/base.pt:10
msgid "MyID"
msgstr ""
This is explicitly not supported: a translation should only contain the text - it should never contain markup. Otherwise you would have two problems:
translators could insert markup, which may break your site or create a security problem
a template toolkit would have no way to determine if any characters in a translation
need to be escaped or should be output as-is.
It is common to need to translate items with dynamic components or markup inside them: for those you use the i18n:name attribute. For example you can do this:
<p i18n:translate="">This is <strong i18n:name="very" i18n:translate="">very</strong> important.
That would give you two strings to translate: "This is ${very} important." and "very".
I have this:
response = urllib2.urlopen(url)
html = response.read()
begin = html.find('<title>')
end = html.find('</title>',begin)
title = html[begin+len('<title>'):end].strip()
If the url is http://www.google.com then the title comes out fine as "Google",
but if the url is "http://www.britishcouncil.org/learning-english-gateway" then the title becomes
"<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
<base href="http://www.britishcouncil.org/" />
<META http-equiv="Content-Type" Content="text/html;charset=utf-8">
<meta name="WT.sp" content="Learning;Home Page Smart View" />
<meta name="WT.cg_n" content="Learn English Gateway" />
<META NAME="DCS.dcsuri" CONTENT="/learning-english-gateway.htm">..."
What is actually happening? Why can't I get the title?
That URL returns a document with <TITLE>...</TITLE> and find is case-sensitive. I strongly suggest you use an HTML parser like Beautiful Soup.
Let's analyze why we got that answer. If you open the website and view the source, we note that it doesn't have <title>...</title>. Instead we have <TITLE>...</TITLE>. So what happened to the 2 find calls? Both will be -1!
begin = html.find('<title>') # Result: -1
end = html.find('</title>') # Result: -1
Then begin+len('<title>') will be -1 + 7 = 6, so your final line extracts html[6:-1]. Negative indices are legitimate in Python (for good reasons): they count from the end, so -1 here refers to the last character in html. What you are getting is therefore the substring from index 6 (inclusive, i.e. the seventh character) to the last character (exclusive).
What can we do then? For one, you can use a regular expression that ignores case, or use a proper HTML parser. If this is a one-off and space/performance isn't much of a concern, the quickest approach might be to make a lower-cased copy of html and search in that:
def get_title(html):
    """Return the stripped text between ``<title>`` and ``</title>``.

    The tags are matched case-insensitively (``<TITLE>`` works too); the text
    itself is taken from the original string, so its case is preserved.

    Args:
        html: The document source as a string.

    Returns:
        The title text with surrounding whitespace removed, or ``None`` when
        there is no ``<title>`` tag or no ``</title>`` after it.
    """
    open_tag, close_tag = '<title>', '</title>'

    # Fold only ASCII letters: str.lower() can change the length of a string
    # containing certain non-ASCII characters, which would make offsets found
    # in the lowered copy point at the wrong place in the original.
    html_lowered = ''.join(ch.lower() if ch < '\x80' else ch for ch in html)

    begin = html_lowered.find(open_tag)
    if begin == -1:
        return None

    start = begin + len(open_tag)
    # Look for the closing tag only after the opening one, so a stray
    # '</title>' earlier in the document cannot produce a bogus slice.
    end = html_lowered.find(close_tag, start)
    if end == -1:
        return None

    # Slice the original html so the title keeps its original case.
    return html[start:end].strip()
Working solution with lxml and urllib using Python 3
import lxml.etree, urllib.request
def documenttitle(url):
    """Fetch *url* and return the ``<title>`` element of the HTML document.

    The response is parsed with lxml's HTML parser, configured for UTF-8.

    Args:
        url: Address of the document to fetch.

    Returns:
        The result of ``tree.find('.//title')``: the first ``<title>``
        element, or ``None`` when the document has none.  Read the element's
        ``.text`` attribute to get the title string.
    """
    parser = lxml.etree.HTMLParser(encoding="utf-8")
    # Close the HTTP response as soon as parsing is done instead of leaking it.
    with urllib.request.urlopen(url) as conn:
        tree = lxml.etree.parse(conn, parser=parser)
    return tree.find('.//title')
I tried:
document.doctype = xml.dom.minidom.DocumentType('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "DTD/xhtml1-strict.dtd"')
There is no doctype in the output. How to fix without inserting it by hand?
You shouldn't instantiate classes from minidom directly. It's not a supported part of the API, the ownerDocuments won't tie up and you can get some strange misbehaviours. Instead use the proper DOM Level 2 Core methods:
>>> imp= minidom.getDOMImplementation('')
>>> dt= imp.createDocumentType('html', '-//W3C//DTD XHTML 1.0 Strict//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
(‘DTD/xhtml1-strict.dtd’ is a commonly-used but wrong SystemId. That relative URL would only be valid inside the xhtml1 folder at w3.org.)
Now you've got a DocumentType node, you can add it to a document. According to the standard, the only guaranteed way of doing this is at document creation time:
>>> doc= imp.createDocument('http://www.w3.org/1999/xhtml', 'html', dt)
>>> print doc.toxml()
<?xml version="1.0" ?><!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'><html/>
If you want to change the doctype of an existing document, that's more trouble. The DOM standard doesn't require that DocumentType nodes with no ownerDocument be insertable into a document. However some DOMs allow it, eg. pxdom. minidom kind of allows it:
>>> doc= minidom.parseString('<html xmlns="http://www.w3.org/1999/xhtml"><head/><body/></html>')
>>> dt= minidom.getDOMImplementation('').createDocumentType('html', '-//W3C//DTD XHTML 1.0 Strict//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
>>> doc.insertBefore(dt, doc.documentElement)
<xml.dom.minidom.DocumentType instance>
>>> print doc.toxml()
<?xml version="1.0" ?><!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'><html xmlns="http://www.w3.org/1999/xhtml"><head/><body/></html>
but with bugs:
>>> doc.doctype
# None
>>> dt.ownerDocument
# None
which may or may not matter to you.
Technically, the only reliable way per the standard to set a doctype on an existing document is to create a new document and import the whole of the old document into it!
def setDoctype(document, doctype):
imp= document.implementation
newdocument= imp.createDocument(doctype.namespaceURI, doctype.name, doctype)
newdocument.xmlVersion= document.xmlVersion
refel= newdocument.documentElement
for child in document.childNodes:
if child.nodeType==child.ELEMENT_NODE:
newdocument.replaceChild(
newdocument.importNode(child, True), newdocument.documentElement
)
refel= None
elif child.nodeType!=child.DOCUMENT_TYPE_NODE:
newdocument.insertBefore(newdocument.importNode(child, True), refel)
return newdocument