Show URLs inside list with lxml.builder - python

I need to generate HTML with the lxml package. Here is a sample main function that shows how I do it:
def main():
    from lxml.builder import E
    from lxml import etree  # needed below for etree.tostring

    p_persons = []
    person = ['1']  # counter
    person.append('ID')
    person.append('0. https://www.youtube.com/watch?v=qLsn5aNaVkI 1. https://www.youtube.com/watch?v=MPbO6P3Vtx8 2. https://www.youtube.com/watch?v=jVKWPaFuNng 3. https://www.youtube.com/watch?v=9HFyB4gCOqY 4. https://www.youtube.com/watch?v=muQGef4Df_8')
    person.append('birthplace')
    p_persons.append(person)
    page = (
        E.html(
            E.body(
                E.table(
                    *[E.tr(
                        *[
                            E.td(split(col)) if ind == 1 and col is not None else
                            E.td(str(col)) for ind, col in enumerate(row)
                        ]
                    ) for row in p_persons],
                    border="2"
                )
            )
        )
    )
    with open('result.html', 'w') as f:
        f.write(etree.tostring(page, pretty_print=True).decode('utf-8'))
def split(col):
    from lxml.builder import E
    import re

    muts = re.split(r'\d\.', col)
    links = []
    for idx, mut in enumerate(muts):
        print(mut)
        links.append(str(idx + 1))
        links.append(E.a(mut, href=mut))
        links.append('\n')
    return links
All is fine with simple structures like the one above, but sometimes I need to analyze the data and decide what to output into E.td depending on the content.
I build a person element, which is a list of fields, and then put it into the p_persons list intended for output. The second field (a string containing URLs separated by a counter) shows the structure that needs to be rendered. I need to split this string and show the URLs as a numbered list inside a single E.td cell.
But E.td doesn't accept the result if I pass E.td(split(col)):
Traceback (most recent call last):
File "<stdin>", line 11, in <module>
File "/home/user/functions.py", line 298, in rows_to_html
) for row in rows ]
File "/home/user/functions.py", line 298, in <listcomp>
) for row in rows ]
File "/home/user/functions.py", line 296, in <listcomp>
E.td(str(col)) for ind, col in enumerate(row)
File "src/lxml/builder.py", line 222, in lxml.builder.ElementMaker.__call__
TypeError: bad argument type: list(['1', <Element a at 0x7f1900117c48>, '\n'])
Here is the HTML sample I want to receive:
<!DOCTYPE html>
<html>
  <body>
    <table border="2">
      <tr>
        <td>ID</td>
        <td>
          <ol>
            <li>https://www.youtube.com/watch?v=qLsn5aNaVkI</li>
            <li>https://www.youtube.com/watch?v=MPbO6P3Vtx8</li>
            <li>https://www.youtube.com/watch?v=jVKWPaFuNng</li>
            <li>https://www.youtube.com/watch?v=9HFyB4gCOqY</li>
            <li>https://www.youtube.com/watch?v=muQGef4Df_8</li>
          </ol>
        </td>
        <td>birthplace</td>
      </tr>
    </table>
  </body>
</html>
What is the proper way of doing this? Should I wrap the URLs in a div or something else? I haven't found similar examples on the web.
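For reference, one approach that would produce the desired markup (a minimal sketch, not part of the original question): ElementMaker calls accept strings and Element children as positional arguments, but not a plain Python list, which is why E.td(split(col)) raises the TypeError above. If the helper returns a single E.ol element with one E.li per URL, E.td() accepts it directly; alternatively, the existing list could be unpacked with E.td(*split(col)). The helper name split_to_ol below is hypothetical.

import re
from lxml.builder import E

def split_to_ol(col):
    # Hypothetical helper: split the "0. url 1. url ..." string on the
    # numeric counters and drop empty fragments.
    urls = [part.strip() for part in re.split(r'\d+\.', col) if part.strip()]
    # Wrap each URL in an <li> and return a single <ol> element,
    # which E.td() accepts as a child (unlike a plain list).
    return E.ol(*[E.li(url) for url in urls])

# Usage inside the row-building comprehension would then be:
#   E.td(split_to_ol(col)) if ind == 1 and col is not None else E.td(str(col))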

Related

How could I use PyQuery traversal correctly?

There is a file called "name.txt". Its content is below:
<td>
    <input class="name" value="Michael">
    <input class="age" value="22">
    <input class="location" value="hebei">
</td>
<td>
    <input class="name" value="Jack">
    <input class="age" value="23">
    <input class="location" value="NewYo">
</td>
Now I want to use pyquery to get all the input tags, then iterate over them.
Use .filter to get the name class and age class.
Finally, get the value of name and age and write all the results into a file called 'name_file.txt'.
My code is below:
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq

doc = pq(filename='name.txt')
input = doc('input')
for result in input.items():
    name_result = result.filter('.name')
    age_result = result.filter('.age')
    name = name_result.attr('value')
    age = age_result.attr('value')
    print "%s:%s" % (name, age)
    c = "%s:%s" % (name, age)
    f = file('name_file.txt', 'w')
    f.write(c)
    f.close()
But now I have 2 issues:
1. The results I get are not "Michael:22"; they are "Michael:None" and "None:22".
2. The content of 'name_file.txt' that I wrote is just 'None:None', not all the results I got.
The first problem stems from the fact that you're looping through all your <input ...> elements (collected by doc('input')), so each iteration only gives you either the name or the age, but never both. What you can do is loop through the individual <td> ... </td> blocks and extract the matching children - a bit wasteful, but it keeps in line with your idea:
from pyquery import PyQuery as pq

doc = pq(filename='name.txt')  # open our document from the `name.txt` file
for result in doc('td').items():  # loop through all <td> ... </td> items
    name_result = result.find('.name')  # grab a tag with class="name"
    age_result = result.find('.age')  # grab a tag with class="age"
    name = name_result.attr('value')  # get the name's `value` attribute value
    age = age_result.attr('value')  # get the age's `value` attribute value
    print("{}:{}".format(name, age))  # print it to STDOUT as name:age
As for the second part - you're opening your name_file.txt file in write mode, writing a line and then closing it on each loop iteration. Opening a file in write mode truncates it, so each iteration overwrites whatever was written before and the file ends up holding only the last line. Try doing this instead:
from pyquery import PyQuery as pq

doc = pq(filename='name.txt')  # open our document from the `name.txt` file
with open("name_file.txt", "w") as f:  # open name_file.txt for writing
    for result in doc('td').items():  # loop through all <td> ... </td> items
        name_result = result.find('.name')  # grab a tag with class="name"
        age_result = result.find('.age')  # grab a tag with class="age"
        name = name_result.attr('value')  # get the name's `value` attribute value
        age = age_result.attr('value')  # get the age's `value` attribute value
        print("{}:{}".format(name, age))  # print values to STDOUT as name:age
        f.write("{}:{}\n".format(name, age))  # write to the file as name:age + a newline
from pyquery import PyQuery as pq

doc = pq(filename='text.txt')
input = doc.children('body')
f = file('name_file.txt', 'w')
for x in [result.html() for result in input.items('td')]:
    x = pq(x)
    name = x('input').eq(0).attr('value')
    age = x('input').eq(1).attr('value')
    print "%s:%s" % (name, age)
    c = "%s:%s" % (name, age)
    f.write(c)
f.close()
You cannot have the file-opening statement inside the loop, or else the file gets overwritten and ends up with just one record after every loop iteration.
Similarly, you close it after the loop, not after inserting every record.

Modifying a BeautifulSoup .string with line breaks

I am trying to change the content of an HTML file with BeautifulSoup. This content will be coming from Python-based text, so it will have \n newlines...
newContent = """This is my content \n with a line break."""
newContent = newContent.replace("\n", "<br>")
htmlFile.find_all("div", "product").p.string = newContent
When I do this, the <p> text in the HTML file is changed to this:
This is my content <br> with a line break.
How do I change a string within a BeautifulSoup object and keep the <br> breaks? If the string just contains \n, then it'll create an actual line break.
You need to create separate elements; there isn't one piece of text contained in the <p> tag, but a series of text and <br/> elements.
Rather than replace \n newlines with the text <br/> (which will be escaped), split the text on newlines and insert extra elements in between:
parent = htmlFile.find_all("div", "product")[0].p
lines = newContent.splitlines()
parent.append(htmlFile.new_string(lines[0]))
for line in lines[1:]:
    parent.append(htmlFile.new_tag('br'))
    parent.append(htmlFile.new_string(line))
This uses the Element.append() method to add new elements to the tree, and BeautifulSoup.new_string() and BeautifulSoup.new_tag() to create those extra elements.
Demo:
>>> from bs4 import BeautifulSoup
>>> htmlFile = BeautifulSoup('<p></p>')
>>> newContent = """This is my content \n with a line break."""
>>> parent = htmlFile.p
>>> lines = newContent.splitlines()
>>> parent.append(htmlFile.new_string(lines[0]))
>>> for line in lines[1:]:
... parent.append(htmlFile.new_tag('br'))
... parent.append(htmlFile.new_string(line))
...
>>> print htmlFile.prettify()
<html>
 <head>
 </head>
 <body>
  <p>
   This is my content
   <br/>
   with a line break.
  </p>
 </body>
</html>

Nested for-loop iteration stops

I have two input files: an HTML one, and a CSS file for it. I want to perform some operation on the HTML file based on the contents of the CSS file.
My HTML is like this:
<html>
<head>
    <title></title>
</head>
<body>
    <p class = "cl1" id = "id1"> <span id = "span1"> blabla</span> </p>
    <p class = "cl2" id = "id2"> <span id = "span2"> blablabla</span> <span id = "span3"> qwqwqw </span> </p>
</body>
</html>
Styles for span ids are defined in the CSS file (individually for each span id!).
Before doing the real work (deleting spans based on their style), I was trying just to print out the ids from the HTML and the style description from the CSS corresponding to each id.
Code:
from lxml import etree

tree = etree.parse("file.html")
filein = "file.css"

def f1():
    with open(filein, 'rU') as f:
        for span in tree.iterfind('//span'):
            for line in f:
                if span and span.attrib.has_key('id'):
                    x = span.get('id')
                    if "af" not in x and x in line:
                        print x, line

def main():
    f1()
So, there are two for-loops, which iterate perfectly if separated, but when put together in this function the iteration stops after the first loop:
>> span1 span#span1 { font-weight: bold; font-size: 11.0pt; font-style: normal; letter-spacing: 0em }
How can I fix this?
If, as I think, tree is completely loaded into memory, you could try reversing the loops. That way, you only read through the file filein once:
def f1():
    with open(filein, 'rU') as f:
        for line in f:
            for span in tree.iterfind('//span'):
                if span and span.attrib.has_key('id'):
                    x = span.get('id')
                    if "af" not in x and x in line:
                        print x, line
It happens because you have already read all of filein's lines by the time the second iteration of the outer loop begins.
To make it work, you need to add f.seek(0) before starting the inner loop over filein:
with open(filein, 'rU') as f:
    for span in tree.iterfind('//span'):
        f.seek(0)
        for line in f:
            if span and span.attrib.has_key('id'):
                x = span.get('id')
                if "af" not in x and x in line:
                    print x, line

converting text file to html file with python

I have a text file that contains:
JavaScript 0
/AA 0
OpenAction 1
AcroForm 0
JBIG2Decode 0
RichMedia 0
Launch 0
Colors>2^24 0
uri 0
I wrote this code to convert the text file to HTML:
contents = open("C:\\Users\\Suleiman JK\\Desktop\\Static_hash\\test", "r")
with open("suleiman.html", "w") as e:
    for lines in contents.readlines():
        e.write(lines + "<br>\n")
but the problem I had in the HTML file is that the spacing between the two columns in each line is lost:
JavaScript 0
/AA 0
OpenAction 1
AcroForm 0
JBIG2Decode 0
RichMedia 0
Launch 0
Colors>2^24 0
uri 0
What should I do to keep the same content and the two-column layout as in the text file?
Just change your code to include <pre> and </pre> tags to ensure that your text stays formatted the way you have formatted it in your original text file.
contents = open("C:\\Users\\Suleiman JK\\Desktop\\Static_hash\\test", "r")
with open("suleiman.html", "w") as e:
    for lines in contents.readlines():
        e.write("<pre>" + lines + "</pre> <br>\n")
This is HTML -- use BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup()
body = soup.new_tag('body')
soup.insert(0, body)
table = soup.new_tag('table')
body.insert(0, table)

with open('path/to/input/file.txt') as infile:
    for line in infile:
        row = soup.new_tag('tr')
        col1, col2 = line.split()
        for coltext in (col2, col1):  # important that you reverse the order
            col = soup.new_tag('td')
            col.string = coltext
            row.insert(0, col)
        table.insert(len(table.contents), row)

with open('path/to/output/file.html', 'w') as outfile:
    outfile.write(soup.prettify())
That is because HTML parsers collapse all whitespace. There are two ways you could do it (well, probably many more).
One would be to flag it as "preformatted text" by putting it in <pre>...</pre> tags (a small sketch of this follows the table example below).
The other would be a table (and this is what a table is made for):
<table>
    <tr><td>Javascript</td><td>0</td></tr>
    ...
</table>
Fairly tedious to type out by hand, but easy to generate from your script. Something like this should work:
contents = open("C:\\Users\\Suleiman JK\\Desktop\\Static_hash\\test", "r")
with open("suleiman.html", "w") as e:
    e.write("<table>\n")
    for lines in contents.readlines():
        e.write("<tr><td>%s</td><td>%s</td></tr>\n" % tuple(lines.split()))
    e.write("</table>\n")
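For completeness, here is a minimal sketch of the <pre> alternative mentioned above (not from the original answer): it wraps the whole file content in a single <pre> block so the original spacing is preserved, instead of wrapping each line separately. The output name suleiman_pre.html is hypothetical.

# Sketch of the <pre> approach: keep the text file's own spacing by
# wrapping its entire content in one preformatted block.
with open("C:\\Users\\Suleiman JK\\Desktop\\Static_hash\\test", "r") as contents, \
        open("suleiman_pre.html", "w") as e:  # hypothetical output file name
    e.write("<pre>\n")
    e.write(contents.read())  # <pre> preserves newlines and spaces, no <br> needed
    e.write("</pre>\n")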
You can use a standalone template library like mako or jinja. Here is an example with jinja:
from jinja2 import Template

c = '''<!doctype html>
<html>
<head>
    <title>My Title</title>
</head>
<body>
    <table>
        <thead>
            <tr><th>Col 1</th><th>Col 2</th></tr>
        </thead>
        <tbody>
            {% for col1, col2 in lines %}
            <tr><td>{{ col1 }}</td><td>{{ col2 }}</td></tr>
            {% endfor %}
        </tbody>
    </table>
</body>
</html>'''

t = Template(c)
lines = []
with open('yourfile.txt', 'r') as f:
    for line in f:
        lines.append(line.split())

with open('results.html', 'w') as f:
    f.write(t.render(lines=lines))
If you can't install jinja, then here is an alternative:
header = '<!doctype html><html><head><title>My Title</title></head><body>'
body = '<table><thead><tr><th>Col 1</th><th>Col 2</th></tr></thead><tbody>'
footer = '</tbody></table></body></html>'

with open('input.txt', 'r') as infile, open('output.html', 'w') as output:
    output.write(header + '\n')
    output.write(body + '\n')
    for line in infile:
        col1, col2 = line.rstrip().split()
        output.write('<tr><td>{}</td><td>{}</td></tr>\n'.format(col1, col2))
    output.write(footer)
I have added a title, and here I loop line by line, appending each line inside its own <tr> and <td> tags; it should work as a single table without separate columns. There is no need to use separate <tr></tr> and <td></td> tags for col1 and col2.
Log snippet:
MUTHU PAGE
2019/08/19 19:59:25 MUTHUKUMAR_TIME_DATE,line: 118 INFO | Logger object created for: MUTHUKUMAR_APP_USER_SIGNUP_LOG
2019/08/19 19:59:25 MUTHUKUMAR_DB_USER_SIGN_UP,line: 48 INFO | ***** User SIGNUP page start *****
2019/08/19 19:59:25 MUTHUKUMAR_DB_USER_SIGN_UP,line: 49 INFO | Enter first name: [Alphabet character only allowed, minimum 3 character to maximum 20 chracter]
HTML source page:
'''
<?xml version="1.0" encoding="utf-8"?>
<body>
 <table>
  <p>
   MUTHU PAGE
  </p>
  <tr>
   <td>
    2019/08/19 19:59:25 MUTHUKUMAR_TIME_DATE,line: 118 INFO | Logger object created for: MUTHUKUMAR_APP_USER_SIGNUP_LOG
   </td>
  </tr>
  <tr>
   <td>
    2019/08/19 19:59:25 MUTHUKUMAR_DB_USER_SIGN_UP,line: 48 INFO | ***** User SIGNUP page start *****
   </td>
  </tr>
  <tr>
   <td>
    2019/08/19 19:59:25 MUTHUKUMAR_DB_USER_SIGN_UP,line: 49 INFO | Enter first name: [Alphabet character only allowed, minimum 3 character to maximum 20 chracter]
'''
CODE:
from bs4 import BeautifulSoup

soup = BeautifulSoup(features='xml')
body = soup.new_tag('body')
soup.insert(0, body)
table = soup.new_tag('table')
body.insert(0, table)

with open('C:\\Users\\xxxxx\\Documents\\Latest_24_may_2019\\New_27_jun_2019\\DB\\log\\input.txt') as infile:
    title_s = soup.new_tag('p')
    title_s.string = " MUTHU PAGE "
    table.insert(0, title_s)
    for line in infile:
        row = soup.new_tag('tr')
        col1 = list(line.split('\n'))
        col1 = [each for each in col1 if each != '']
        for coltext in col1:
            col = soup.new_tag('td')
            col.string = coltext
            row.insert(0, col)
        table.insert(len(table.contents), row)

with open('C:\\Users\\xxxx\\Documents\\Latest_24_may_2019\\New_27_jun_2019\\DB\\log\\output.html', 'w') as outfile:
    outfile.write(soup.prettify())

Beautiful Soup line matching

I'm trying to build an HTML table that only contains the table header and the row that is relevant to me. The site I'm using is http://wolk.vlan77.be/~gerben.
I'm trying to get the table header and my table entry so I do not have to look for my own name each time.
What I want to do :
get the html page
Parse it to get the header of the table
Parse it to get the line with table tags relevant to me (so the table row containing lucas)
Build a html page that shows the header and table entry relevant to me
What I am doing now :
get the header with beautifulsoup first
get my entry
add both to an array
pass this array to a method that generates a string that can be printed as html page
def downloadURL(self):
    global input
    filehandle = self.urllib.urlopen('http://wolk.vlan77.be/~gerben')
    input = ''
    for line in filehandle.readlines():
        input += line
    filehandle.close()

def soupParserToTable(self, input):
    global header
    soup = self.BeautifulSoup(input)
    header = soup.first('tr')
    tableInput = '0'
    table = soup.findAll('tr')
    for line in table:
        print line
        print '\n \n'
        if '''lucas''' in line:
            print 'true'
        else:
            print 'false'
        print '\n \n **************** \n \n'
I want to get the line from the HTML file that contains lucas; however, when I run it like this I get this in my output:
****************
<tr><td>lucas.vlan77.be</td> <td><span style="color:green;font-weight:bold">V</span></td> <td><span style="color:green;font-weight:bold">V</span></td> <td><span style="color:green;font-weight:bold">V</span></td> </tr>
false
Now I don't get why it doesn't match, the string lucas is clearly in there :/ ?
It looks like you're over-complicating this.
Here's a simpler version...
>>> import BeautifulSoup
>>> import urllib2
>>> html = urllib2.urlopen('http://wolk.vlan77.be/~gerben')
>>> soup = BeautifulSoup.BeautifulSoup(html)
>>> print soup.find('td', text=lambda data: data.string and 'lucas' in data.string)
lucas.vlan77.be
It's because line is not a string but a BeautifulSoup.Tag instance. Try checking the td value instead:
if '''lucas''' in line.td.string:
