Using Beautiful soup to analyze table in python - python

So I've got a table:
<table border="1" style="width: 100%">
<caption></caption>
<col>
<col>
<tbody>
<tr>
<td>Pig</td>
<td>House Type</td>
</tr>
<tr>
<td>Pig A</td>
<td>Straw</td>
</tr>
<tr>
<td>Pig B</td>
<td>Stick</td>
</tr>
<tr>
<td>Pig C</td>
<td>Brick</td>
</tr>
And I was simply trying to return a JSON string of the table pairs like so:
[["Pig A", "Straw"], ["Pig B", "Stick"], ["Pig C", "Brick"]]
However, with my code I can't seem to get rid of the HTML tags:
stable = soup.find('table')
cells = [ ]
rows = stable.findAll('tr')
for tr in rows[1:4]:
# Process the body of the table
row = []
td = tr.findAll('td')
#td = [el.text for el in soup.tr.finall('td')]
row.append( td[0])
row.append( td[1])
cells.append( row )
return cells
#eventually, I'd like to do this:
#h = json.dumps(cells)
#return h
My output is this:
[[<td>Pig A</td>, <td>Straw</td>], [<td>Pig B</td>, <td>Stick</td>], [<td>Pig C</td>, <td>Brick</td>]]

Use the text property to get only the inner text of the element:
row.append(td[0].text)
row.append(td[1].text)

You can try using lxml library.
from lxml.html import fromstring
import lxml.html as PARSER
#data = open('example.html').read() # You can read it from a html file.
#OR
data = """
<table border="1" style="width: 100%">
<caption></caption>
<col>
<col>
<tbody>
<tr>
<td>Pig</td>
<td>House Type</td>
</tr>
<tr>
<td>Pig A</td>
<td>Straw</td>
</tr>
<tr>
<td>Pig B</td>
<td>Stick</td>
</tr>
<tr>
<td>Pig C</td>
<td>Brick</td>
</tr>
"""
# Parse the HTML fragment held in `data` into an lxml element tree.
root = PARSER.fromstring(data)
main_list = []
# iter("tr") visits only the <tr> elements; it replaces the deprecated
# getiterator() call plus a manual tag comparison.
for ele in root.iter("tr"):
    # text_content() concatenates the text of every cell in the row. The
    # sample markup puts each cell on its own line, so splitting on newlines
    # yields one list entry per cell.
    text = ele.text_content().strip().split('\n')
    main_list.append(text)
print(main_list)
Output:
[['Pig', ' House Type'], ['Pig A', ' Straw'], ['Pig B', ' Stick'], ['Pig C', ' Brick']]

Related

Trying to append a new row to the first row in the table body with BeautifulSoup

I am having trouble appending a new row after the first row (the header row) in the table body (<tbody>).
my code:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('page_content.xml'), 'html.parser')
# append a row to the first row in the table body
row = soup.find('tbody').find('tr')
row.append(soup.new_tag('tr', text='New Cell'))
print(row)
the output:
<tr>
<th>Version</th>
<th>Jira</th>
<th colspan="1">Date/Time</th>
<tr text="New Cell"></tr></tr>
what the output should be:
<tr>
<th>Version</th>
<th>Jira</th>
<th colspan="1">Date/Time</th>
</tr>
<tr text="New Cell"></tr>
the full xml file is:
<h1>Rental Agreement/Editor</h1>
<table class="wrapped">
<colgroup>
<col/>
<col/>
<col/>
</colgroup>
<tbody>
<tr>
<th>Version</th>
<th>Jira</th>
<th colspan="1">Date/Time</th>
<tr text="New Cell"></tr></tr>
<tr>
<td>1.0.1-0</td>
<td>ABC-1234</td>
<td colspan="1">
<br/>
</td>
</tr>
</tbody>
</table>
<p class="auto-cursor-target">
<br/>
</p>
You can use .insert_after:
from bs4 import BeautifulSoup
html_doc = """
<table>
<tr>
<th>Version</th>
<th>Jira</th>
<th colspan="1">Date/Time</th>
</tr>
<tr>
<td> something else </td>
</tr>
</table>
"""
soup = BeautifulSoup(html_doc, "html.parser")
row = soup.select_one("tr:has(th)")
row.insert_after(soup.new_tag("tr", text="New Cell"))
print(soup.prettify())
Prints:
<table>
<tr>
<th>
Version
</th>
<th>
Jira
</th>
<th colspan="1">
Date/Time
</th>
</tr>
<tr text="New Cell">
</tr>
<tr>
<td>
something else
</td>
</tr>
</table>
EDIT: If you want to insert arbitrary HTML code, you can try:
what_to_insert = BeautifulSoup(
'<tr param="xxx">This is new <b>text</b></tr>', "html.parser"
)
row.insert_after(what_to_insert)

Retrieving table values from HTML with the same tag names using Beautiful Soup in Python

I am trying to retrieve all the td text from the table below using Beautiful Soup. Unfortunately, the tag names are all the same, so I either retrieve only the first element or get some elements printed repeatedly, and I am not sure how to go about it.
Below is HTML table snippet:
<div>Table</div>
<table class="Auto" width="100%">
<tr>
<td class="Auto_head">Address</td>
<td class="Auto_head">Name</td>
<td class="Auto_head">Type</td>
<td class="Auto_head">Value IN</td>
<td class="Auto_head">AUTO Statement</td>
<td class="Auto_head">Value OUT</td>
<td class="Auto_head">RESULT</td>
<td class="Auto_head"></td>
</tr>
<tr>
<td class="Auto_body">1</td>
<td class="Auto_body">abc</td>
<td class="Auto_body">yes</td>
<td class="Auto_body">abc123</td>
<td class="Auto_body">jar</td>
<td class="Auto_body">123abc</td>
<td class="Auto_body">PASS</td>
<td class="Auto_body">na</td>
</tr>
What I want is all the text content inside these tags, paired up: for example, the first Auto_head corresponds to the first Auto_body, i.e. Address = 1, and similarly for all the other values.
I have tried find, findAll, findNext and next_sibling, but with no luck. Here is my current Python code:
self.table = self.soup_file.findAll(class_="Table")
self.headers = [tab.find(class_="Auto_head").findNext('td',class_="Auto_head").contents[0] for tab in self.table]
self.data = [data.find(class_="Auto_body").findNext('td').contents[0] for data in self.table]
Get the headers first, then use zip(...) to combine
from bs4 import BeautifulSoup
data = '''\
<table class="Auto" width="100%">
<tr>
<td class="Auto_head">Address</td>
<td class="Auto_head">Name</td>
<td class="Auto_head">Type</td>
</tr>
<tr>
<td class="Auto_body">1</td>
<td class="Auto_body">abc</td>
<td class="Auto_body">yes</td>
</tr>
<tr>
<td class="Auto_body">2</td>
<td class="Auto_body">def</td>
<td class="Auto_body">no</td>
</tr>
<tr>
<td class="Auto_body">3</td>
<td class="Auto_body">ghi</td>
<td class="Auto_body">maybe</td>
</tr>
</table>
'''
soup = BeautifulSoup(data, 'html.parser')
for table in soup.select('table.Auto'):
    # get rows
    rows = table.select('tr')
    if not rows:
        # An empty table has no header row to index; nothing to print.
        continue
    # get headers (first row only)
    headers = [td.text for td in rows[0].select('td.Auto_head')]
    # get details: pair every following row's cells with the headers
    for row in rows[1:]:
        values = [td.text for td in row.select('td.Auto_body')]
        print(dict(zip(headers, values)))
My output:
{'Address': '1', 'Name': 'abc', 'Type': 'yes'}
{'Address': '2', 'Name': 'def', 'Type': 'no'}
{'Address': '3', 'Name': 'ghi', 'Type': 'maybe'}
Get each category first then iterate using zip
s = '''<div>Table</div>
<table class="Auto" width="100%">
<tr>
<td class="Auto_head">Address</td>
<td class="Auto_head">Name</td>
<td class="Auto_head">Type</td>
<td class="Auto_head">Value IN</td>
<td class="Auto_head">AUTO Statement</td>
<td class="Auto_head">Value OUT</td>
<td class="Auto_head">RESULT</td>
<td class="Auto_head"></td>
</tr>
<tr>
<td class="Auto_body">1</td>
<td class="Auto_body">abc</td>
<td class="Auto_body">yes</td>
<td class="Auto_body">abc123</td>
<td class="Auto_body">jar</td>
<td class="Auto_body">123abc</td>
<td class="Auto_body">PASS</td>
<td class="Auto_body">na</td>
</tr></table>'''
soup = BeautifulSoup(s, features='html')
# Collect every header cell and every body cell in document order.
head = soup.find_all(name='td', class_='Auto_head')
body = soup.find_all(name='td', class_='Auto_body')
# zip() pairs the n-th header cell with the n-th body cell and stops at the
# shorter of the two lists.
for one, two in zip(head, body):
    print(f'{one.text}={two.text}')
Address=1
Name=abc
Type=yes
Value IN=abc123
AUTO Statement=jar
Value OUT=123abc
RESULT=PASS
=na
Searching by CSS class
The easiest solution is to add the find_all method at the end of the find
so your code will be
source = requests.get('YOUR URL')
soup=BeautifulSoup(source.text,'html.parser')
data = soup.find('tr').find_all('td')[0]
data = soup.find('tr').find_all('td')[1]
and so on: just change the final list index (0, 1, 2, ...), or use a for loop over the result of find_all instead.

Beautifulsoup iterate to get either <td>sometext</td> or url

I want to create a list of key-value pairs, with the <thead> items as the keys. For the values I want the text of every <td> item, except for <td> items that contain an <a href='url'>; for those I want the URL instead.
Currently I am only able to get the text from all items. How do I get '/someurl' instead of Makulerad and Detaljer?
<table class="table table-bordered table-hover table-striped zero-margin-top">
<thead>
<tr>
<th>Volymsenhet</th>
<th>Pris</th>
<th>Valuta</th>
<th>Handelsplats</th>
<th>url1</th>
<th>url2</th>
</tr>
</thead>
<tbody>
<tr class="iprinactive">
<td>Antal</td>
<td>5,40</td>
<td>SEK</td>
<td>NASDAQ STOCKHOLM AB</td>
<td><a href="/someurl">Makulerad</a></td>
<td>
<a href="/someurl">Detaljer</a>
</td>
</tr>
</tbody>
</table>
My code:
raw_html = simple_get('https://example.com/')
soup = BeautifulSoup(raw_html, 'html.parser')
table = soup.find("table", attrs={"class":"table"})
head = [th.get_text() for th in table.find("tr").find_all("th")]
datasets = []
for row in table.find_all("tr")[1:]:
dataset = dict(zip(head,(td.get_text() for td in row.find_all("td"))))
datasets.append(dataset)
Try this:
simply get the text data of <td> if it doesn't have an <a>. Otherwise get the href value.
from bs4 import BeautifulSoup
raw_html = '''<table class="table table-bordered table-hover table-striped zero-margin-top">
<thead>
<tr>
<th>Volymsenhet</th>
<th>Pris</th>
<th>Valuta</th>
<th>Handelsplats</th>
<th>url1</th>
<th>url2</th>
</tr>
</thead>
<tbody>
<tr class="iprinactive">
<td>Antal</td>
<td>5,40</td>
<td>SEK</td>
<td>NASDAQ STOCKHOLM AB</td>
<td><a href="/someurl">Makulerad</a></td>
<td>
<a href="/someurl">Detaljer</a>
</td>
</tr>
</tbody>
</table>'''
soup = BeautifulSoup(raw_html, 'html.parser')
table = soup.find("table", attrs={"class": "table"})
# The first <tr> in the table is the header row; its <th> texts become the keys.
head = [th.get_text() for th in table.find("tr").find_all("th")]
datasets = []
for row in table.find_all("tr")[1:]:
    # Prefer the link target when the cell wraps an <a>; otherwise keep the
    # cell's text.
    values = [
        td.a['href'] if td.a else td.get_text()
        for td in row.find_all("td")
    ]
    dataset = dict(zip(head, values))
    datasets.append(dataset)
print(datasets)
OUTPUT:
[{'Volymsenhet': 'Antal', 'Pris': '5,40', 'Valuta': 'SEK', 'Handelsplats': 'NASDAQ STOCKHOLM AB', 'url1': '/someurl', 'url2': '/someurl'}]

Parse "<tbody> / <tr> / <td>" with python's BeautifulSoup

I have the following HTML code:
<tbody>
<tr>
<td>1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa</td>
<td>62e907b15cbf27d5425399ebf6f0fb50ebb88f18</td>
<td class="num">66.6771<small class="b-blockExplorer__small">1246</small> BTC</td>
<td class="num">66.6771<small class="b-blockExplorer__small">1246</small> BTC</td>
<td class="num">1089</td>
</tr>
<tr>
<td>12c6DSiU4Rq3P4ZxziKxzrL5LmMBrzjrJX</td>
<td>119b098e2e980a229e139a9ed01a469e518e6f26</td>
<td class="num">50.0572<small class="b-blockExplorer__small">3154</small> BTC</td>
<td class="num">50.0572<small class="b-blockExplorer__small">3154</small> BTC</td>
<td class="num">55</td>
</tr>
<!--- SNIP --->
</tbody>
I want to parse it to get something like:
1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa,62e907b15cbf27d5425399ebf6f0fb50ebb88f18,66.6771,66.6771
12c6DSiU4Rq3P4ZxziKxzrL5LmMBrzjrJX,119b098e2e980a229e139a9ed01a469e518e6f26,50.0572,50.0572
Tried with BeautifulSoup:
soup.select('tbody > tr > td')[rowcount].get_text(strip=True)
I get only the first <td>*</td>
What am I doing wrong?
Try this
for row in soup.select('tbody tr'):
    # Collect the text of every cell in this row, in column order.
    row_text = [x.text for x in row.find_all('td')]
    print(', '.join(row_text))  # You can save or print this string however you want.
Output:
1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa, 62e907b15cbf27d5425399ebf6f0fb50ebb88f18, 66.67711246 BTC, 66.67711246 BTC, 1089
12c6DSiU4Rq3P4ZxziKxzrL5LmMBrzjrJX, 119b098e2e980a229e139a9ed01a469e518e6f26, 50.05723154 BTC, 50.05723154 BTC, 55
I was able to find what you want to scrape by doing the following:
from bs4 import BeautifulSoup
html = """<tbody>
<tr>
<td>1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa</td>
<td>62e907b15cbf27d5425399ebf6f0fb50ebb88f18</td>
<td class="num">66.6771<small class="b-blockExplorer__small">1246</small> BTC</td>
<td class="num">66.6771<small class="b-blockExplorer__small">1246</small> BTC</td>
<td class="num">1089</td>
</tr>
<tr>
<td>12c6DSiU4Rq3P4ZxziKxzrL5LmMBrzjrJX</td>
<td>119b098e2e980a229e139a9ed01a469e518e6f26</td>
<td class="num">50.0572<small class="b-blockExplorer__small">3154</small> BTC</td>
<td class="num">50.0572<small class="b-blockExplorer__small">3154</small> BTC</td>
<td class="num">55</td>
</tr>
<!--- SNIP --->
</tbody>"""
b = BeautifulSoup(html, 'lxml')
for tr in b.find_all('tr'):
    data = tr.find_all('td')
    if len(data) < 4:
        # Skip rows that do not have the four columns read below.
        continue
    # get_text() works whether or not the value is wrapped in an <a>;
    # find('a').text raises AttributeError when the cell contains no link,
    # as in the sample markup above.
    val1 = data[0].get_text(strip=True)
    val2 = data[1].get_text(strip=True)
    # An amount cell reads like "66.6771<small>1246</small> BTC"; the first
    # whitespace-separated token of its text is the number.
    num1 = data[2].text.split()[0]
    num2 = data[3].text.split()[0]
    print(val1, val2, num1, num2)
This results in:
1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa 62e907b15cbf27d5425399ebf6f0fb50ebb88f18 66.67711246 66.67711246
12c6DSiU4Rq3P4ZxziKxzrL5LmMBrzjrJX 119b098e2e980a229e139a9ed01a469e518e6f26 50.05723154 50.05723154

Iterating through a table of rows with beautiful soup in python

I'm trying to parse through a table of rows using beautiful soup and save values of each row in a dict.
One hiccup is that the table uses some rows as section headers. So for any row with the class 'header' I want to define a variable called "section". Here's what I have, but it's not working: the i['class'] lookup fails with "TypeError: string indices must be integers".
Here's what I have:
for i in credits.contents:
if i['class'] == 'header':
section = i.contents
DATA_SET[section] = {}
else:
DATA_SET[section]['data_point_1'] = i.find('td', {'class' : 'data_point_1'}).find('p').contents
DATA_SET[section]['data_point_2'] = i.find('td', {'class' : 'data_point_2'}).find('p').contents
DATA_SET[section]['data_point_3'] = i.find('td', {'class' : 'data_point_3'}).find('p').contents
Example of data:
<table class="credits">
<tr class="header">
<th colspan="3"><h1>HEADER NAME</h1></th>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
<tr class="header">
<th colspan="3"><h1>HEADER NAME</h1></th>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA</p></td>
<td class="data_point_2"><p>DATA</p></td>
<td class="data_point_3"><p>DATA</p></td>
</tr>
</table>
Here is one solution, with a slight adaptation of your example data so that the result is clearer:
from BeautifulSoup import BeautifulSoup
from pprint import pprint
html = '''<body><table class="credits">
<tr class="header">
<th colspan="3"><h1>HEADER 1</h1></th>
</tr>
<tr>
<td class="data_point_1"><p>DATA11</p></td>
<td class="data_point_2"><p>DATA12</p></td>
<td class="data_point_3"><p>DATA12</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA21</p></td>
<td class="data_point_2"><p>DATA22</p></td>
<td class="data_point_3"><p>DATA23</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA31</p></td>
<td class="data_point_2"><p>DATA32</p></td>
<td class="data_point_3"><p>DATA33</p></td>
</tr>
<tr class="header">
<th colspan="3"><h1>HEADER 2</h1></th>
</tr>
<tr>
<td class="data_point_1"><p>DATA11</p></td>
<td class="data_point_2"><p>DATA12</p></td>
<td class="data_point_3"><p>DATA13</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA21</p></td>
<td class="data_point_2"><p>DATA22</p></td>
<td class="data_point_3"><p>DATA23</p></td>
</tr>
<tr>
<td class="data_point_1"><p>DATA31</p></td>
<td class="data_point_2"><p>DATA32</p></td>
<td class="data_point_3"><p>DATA33</p></td>
</tr>
</table></body>'''
soup = BeautifulSoup(html)
rows = soup.findAll('tr')
section = ''
dataset = {}
for row in rows:
    if row.attrs:
        # In this markup only the header rows carry an attribute
        # (class="header"), so a row with attributes opens a new section.
        section = row.text
        dataset[section] = {}
    else:
        # Data row: group each cell's text under its class name within the
        # current section. setdefault also covers a data row that appears
        # before any header row (it lands under the '' section).
        # NOTE(review): relies on cell['class'] being a plain string, as with
        # the BeautifulSoup 3 import used by this snippet -- confirm before
        # porting to bs4.
        bucket = dataset.setdefault(section, {})
        for cell in row.findAll('td'):
            bucket.setdefault(cell['class'], []).append(cell.text)
pprint(dataset)
Produces:
{u'HEADER 1': {u'data_point_1': [u'DATA11', u'DATA21', u'DATA31'],
u'data_point_2': [u'DATA12', u'DATA22', u'DATA32'],
u'data_point_3': [u'DATA12', u'DATA23', u'DATA33']},
u'HEADER 2': {u'data_point_1': [u'DATA11', u'DATA21', u'DATA31'],
u'data_point_2': [u'DATA12', u'DATA22', u'DATA32'],
u'data_point_3': [u'DATA13', u'DATA23', u'DATA33']}}
EDIT ADAPTATION OF YOUR SOLUTION
Your code is neat and has only a couple of issues. You use contents in places where you should use text or findAll -- I repaired that below:
soup = BeautifulSoup(html)
credits = soup.find('table')
section = ''
DATA_SET = {}
# Iterate over the <tr> tags only; .contents would also yield the bare
# whitespace strings between rows, which cannot be indexed by 'class'.
for i in credits.findAll('tr'):
    if i.get('class', '') == 'header':
        # Header row: start a new section keyed by its text.
        section = i.text
        DATA_SET[section] = {}
    else:
        # Data row: store the <p> contents of each data_point cell. Each
        # assignment overwrites the previous row's value for the same key.
        for name in ('data_point_1', 'data_point_2', 'data_point_3'):
            DATA_SET[section][name] = i.find('td', {'class': name}).find('p').contents
print(DATA_SET)
Please note that if successive cells have the same data_point class, then successive rows will replace earlier ones. I suspect this is not an issue in your real dataset, but that is why your code would return this, abbreviated, result:
{u'HEADER 2': {'data_point_2': [u'DATA32'],
'data_point_3': [u'DATA33'],
'data_point_1': [u'DATA31']},
u'HEADER 1': {'data_point_2': [u'DATA32'],
'data_point_3': [u'DATA33'],
'data_point_1': [u'DATA31']}}

Categories

Resources