Extract Text and the image from a webpage using BeautifulSoup

Extract Text and the image from a webpage using BeautifulSoup - python

I seem to have hit a wall and I am looking for some help/guidance.
I am trying to extract data from an HTML page - I can extract the text or the image file alone, but not both together.
Within the HTML file there are multiple occurrences of a heading and its associated image:
Example:
<h2>Builder ind=BOB</h2>
<table border=0 cellpadding=0 cellspacing=0>
<tr>
<td align=left valign=top>
</td>
<td align=left valign=top><br>
<h3>TEST -- TXF 1234 -- 04/01/2020 6:21:42 PM</h3>
<img src="gfx/image117.png" width=997 height=601>
<h2>Builder ind=ROB</h2>
<table border=0 cellpadding=0 cellspacing=0>
<tr>
<td align=left valign=top>
</td>
<td align=left valign=top><br>
<h3>TEST -- EXF 1234 -- 04/01/2020 6:21:42 PM</h3>
<img src="gfx/image118.png" width=997 height=601>
In the example above I am trying to extract the text contained within the h2 tags and the associated img src tag and export them to a csv file
This is the code I have for extracting the image source:
{
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
fname = '\\\\C:\\TEMP\\\PAGE.htm'
html= open(fname)
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src':re.compile('.png')})
for image in images:
print(image['src']+'\n')
How would I go about looping through the file, extracting both the heading text and the image source, and exporting them to a file?
In the final output I am trying to achieve the following in a csv file:
ind=BOB,image117.png
ind=ROB,image118.png
The output that I get currently is:
gfx/image117.png
gfx/image118.png

Try this approach:
from bs4 import BeautifulSoup
import re

# Raw string so the backslashes of the Windows path are taken literally
# (the earlier literal produced a malformed path with doubled backslashes).
fname = r'C:\TEMP\PAGE.htm'

# The context manager closes the file as soon as the document is parsed.
with open(fname) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Escape the dot so that only a literal ".png" in the src attribute matches.
images = bs.find_all('img', {'src': re.compile(r'\.png')})
headings = bs.find_all('h2')

# Pair the n-th heading with the n-th image. zip stops at the shorter list,
# so a page with more images than headings no longer raises IndexError.
for heading, image in zip(headings, images):
    # "Builder ind=BOB" -> "ind=BOB"
    print(heading.text.split(" ")[1] + ", " + image['src'])
Output:
ind=BOB, gfx/image117.png
ind=ROB, gfx/image118.png
Or, if you want to store the output in a CSV file, try this approach:
from bs4 import BeautifulSoup
import re
import csv

fname = 'PAGE.htm'

# The context manager closes the HTML file once it has been parsed.
with open(fname) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Escape the dot so that only a literal ".png" in the src attribute matches.
images = bs.find_all('img', {'src': re.compile(r'\.png')})
headings = bs.find_all('h2')

# newline='' is what the csv module asks for; without it every row is
# followed by an empty line on Windows.
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Pair the n-th heading with the n-th image; zip stops at the shorter list.
    for h2, img in zip(headings, images):
        # "Builder ind=BOB" -> "ind=BOB"
        heading = h2.text.split(" ")[1]
        # Keep only the file name ("gfx/image117.png" -> "image117.png"),
        # which is the format requested in the question.
        image = img['src'].rsplit("/", 1)[-1]
        print(heading, ",", image)
        writer.writerow([heading, image])

from bs4 import BeautifulSoup
html = """
<h2>Builder ind=BOB</h2>
<table border=0 cellpadding=0 cellspacing=0>
<tr>
<td align=left valign=top>
</td>
<td align=left valign=top><br>
<h3>TEST -- TXF 1234 -- 04/01/2020 6:21:42 PM</h3>
<img src="gfx/image117.png" width=997 height=601>
<h2>Builder ind=ROB</h2>
<table border=0 cellpadding=0 cellspacing=0>
<tr>
<td align=left valign=top>
</td>
<td align=left valign=top><br>
<h3>TEST -- EXF 1234 -- 04/01/2020 6:21:42 PM</h3>
<img src="gfx/image118.png" width=997 height=601>
"""
soup = BeautifulSoup(html, 'html.parser')
# find_all is the current spelling; findAll is only kept as a legacy alias.
for item in soup.find_all("h2"):
    # find_next searches forward from the heading, so each <h2> is matched
    # with the first <img> that follows it in the document.
    image = item.find_next("img")
    # A heading with no image after it yields None instead of raising
    # AttributeError.
    src = image.get("src") if image is not None else None
    print("Text: {}, Image: {}".format(item.text, src))
Output:
Text: Builder ind=BOB, Image: gfx/image117.png
Text: Builder ind=ROB, Image: gfx/image118.png

Related

How to parse html table in python

I'm newbie in parsing tables and regular expressions, can you help to parse this in python:
<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>
I need the "3text" and "6text"

You can use CSS selector select() and select_one() to get "3text" and "6text" like below:
import requests
from bs4 import BeautifulSoup
html_doc='''
<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>
'''
soup = BeautifulSoup(html_doc, 'lxml')
# Handle the table one row at a time.
for row in soup.select('tr'):
    # td:nth-child(2) is the second cell of the row.
    cell = row.select_one('td:nth-child(2)')
    # select_one returns None for a row with fewer than two cells; skip it
    # instead of failing on None.text.
    if cell is not None:
        print(cell.text)
You can also use find_all method:
trs = soup.find('table').find_all('tr')
for tr in trs:
    tds = tr.find_all('td')
    # Rows with fewer than two cells have no second column to print.
    if len(tds) > 1:
        print(tds[1].text)
Result:
3text
6text

best way is to use beautifulsoup
from bs4 import BeautifulSoup
html_doc='''
<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>
'''
soup = BeautifulSoup(html_doc, "html.parser")
# Visit every <tr> row in the document ...
for row in soup.find_all("tr"):
    # ... and every <td> cell inside that row ...
    for cell in row.find_all("td"):
        # ... printing the text content of the cell.
        print(cell.text)
in this case it prints
1text 2text
3text 
4text 5text
6text 
but you can grab the texts you want with indexing. In this case you could just go with
# Visit every <tr> row in the document.
for row in soup.find_all("tr"):
    cells = row.find_all("td")
    # Index 1 is the second cell; rows with fewer than two cells are skipped
    # rather than raising IndexError.
    if len(cells) > 1:
        print(cells[1].text)

you could use pythons html.parser: https://docs.python.org/3/library/html.parser.html
The custom parser class keeps a small amount of state about where the parser currently is.
Since you want the second cell of each row, the cell counter (index) is reset at the start of every row, and every cell increments it.
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print the text found in the second ``<td>`` cell of every ``<tr>`` row.

    The parser tracks two pieces of state while the document is fed in:
    whether it is currently inside a ``<td>`` element, and the zero-based
    index of that cell within its row. Text is printed (stripped of
    surrounding whitespace) only while inside the cell with index 1.
    """

    def __init__(self):
        super().__init__()
        # True between a <td> start tag and the matching </td> end tag.
        self.in_cell = False
        # Zero-based index of the current cell in its row; -1 means no cell
        # has been seen yet in the current row.
        self.cell_index = -1

    def handle_starttag(self, tag, attrs):
        """Reset the cell counter on ``<tr>``; advance it on ``<td>``."""
        if tag == 'tr':
            self.cell_index = -1
        elif tag == 'td':
            self.in_cell = True
            self.cell_index += 1

    def handle_endtag(self, tag):
        """Leave the cell when its ``</td>`` end tag is reached."""
        if tag == 'td':
            self.in_cell = False

    def handle_data(self, data):
        """Print ``data`` when it belongs to the second cell of a row."""
        if self.in_cell and self.cell_index == 1:
            print(data.strip())
parser = MyHTMLParser()
parser.feed('''<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>''')
outputs:
> python -u "html_parser_test.py"
3text
6text

Since your question has the beautifulsoup tag attached, I am going to assume that you are happy using this module to tackle the problem you are having. My solution also makes use of the built-in unicodedata module to normalize any escaped characters present within the HTML (e.g. `&nbsp;`).
To parse the table so that you have access to the second field from each row within the table (as per your question), please see the below code/comments.
from bs4 import BeautifulSoup
import unicodedata
table = '''<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>'''
soup = BeautifulSoup(table, 'html.parser')  # Parse HTML table
# Collect the second <td> of each row, row by row. Filtering the flat list of
# all cells by odd index is only correct when every row has exactly two cells.
output = []
for row in soup.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) > 1:
        # NFKC normalization turns a no-break space (U+00A0) into a plain space.
        output.append(unicodedata.normalize('NFKC', cells[1].text))

Try this:
from bs4 import BeautifulSoup
html="""
<table callspacing="0" cellpadding="0">
<tbody><tr>
<td>1text 2text</td>
<td>3text </td>
</tr>
<tr>
<td>4text 5text</td>
<td>6text </td>
</tr>
</tbody></table>"""
soup = BeautifulSoup(html, 'html.parser')
for tr_soup in soup.find_all('tr'):
    td_soup = tr_soup.find_all('td')
    # Print the second cell without surrounding whitespace; rows with fewer
    # than two cells are skipped rather than raising IndexError.
    if len(td_soup) > 1:
        print(td_soup[1].text.strip())

using pandas
In [8]: import pandas as pd
In [9]: df = pd.read_html(html_table)[0]
In [10]: df[1]
Out[10]:
0 3text
1 6text
Name: 1, dtype: object

Unable to acces element while having a SRE match using BeautifulSoup

I scrape the page like this:
# The closing quote must be a plain ASCII apostrophe; a typographic one is a syntax error.
s1 = bs4DerivativePage.find_all('table', class_='not-clickable zebra')
With output:
[<table class="not-clickable zebra" data-price-format="{price}" data-quote-detail="0" data-stream-id="723288" data-stream-quote-option="Standard">
<tbody><tr>
<td><strong>Stop loss-niveau</strong></td>
<td>141,80447</td>
<td class="align-left"><strong>Type</strong></td>
<td>Turbo's</td>
</tr>
<tr>
<td><strong>Financieringsniveau</strong></td>
<td>135,05188</td>
I need to retrieve the value from Financieringsniveau.
The following gives a match:
# The closing quote must be a plain ASCII apostrophe; a typographic one is a syntax error.
finNiveau = re.search('Financieringsniveau', LineIns1)
However, I need the numerical value 135,05188. How does one do this?

You can use .findNext()
Ex:
from bs4 import BeautifulSoup
s = """<table class="not-clickable zebra" data-price-format="{price}" data-quote-detail="0" data-stream-id="723288" data-stream-quote-option="Standard">
<tbody><tr>
<td><strong>Stop loss-niveau</strong></td>
<td>141,80447</td>
<td class="align-left"><strong>Type</strong></td>
<td>Turbo's</td>
</tr>
<tr>
<td><strong>Financieringsniveau</strong></td>
<td>135,05188</td></tr></tbody></table>"""
soup = BeautifulSoup(s, "html.parser")
# Locate the label by its exact text (string= replaces the deprecated text=
# argument), then move forward to the next <td>, which holds the value.
label = soup.find(string="Financieringsniveau")
print(label.find_next("td").text)
Output:
135,05188

Assuming that data-stream-id attribute value is unique (in combination with table tag) you can use CSS selectors and avoid re. This is a fast retrieval method.
from bs4 import BeautifulSoup
html = '''
<table class="not-clickable zebra" data-price-format="{price}" data-quote-detail="0" data-stream-id="723288" data-stream-quote-option="Standard">
<tbody><tr>
<td><strong>Stop loss-niveau</strong></td>
<td>141,80447</td>
<td class="align-left"><strong>Type</strong></td>
<td>Turbo's</td>
</tr>
<tr>
<td><strong>Financieringsniveau</strong></td>
<td>135,05188</td>
'''
soup = BeautifulSoup(html, 'lxml')
# :nth-of-type counts cells within their own row, and no row here has six
# cells, so "td:nth-of-type(6)" matches nothing. The value is the second cell
# of the second row.
print(soup.select_one('table[data-stream-id="723288"] tr:nth-of-type(2) td:nth-of-type(2)').text)

How to extract specific <td> from table

I'm working on a web scraping program using Python & BeautifulSoup. I encountered a problem when scraping a table.
My problem is, I need to extract selected <td> tags only and not the entire table.
I only need the numbers for 52 Week High, 52 Week Low, Earnings Per Share and Price to book value.
Is there any way I can do that?
Sample Table
<table id="TABLE_1">
<tbody id="TBODY_2">
<tr id="TR_3">
<td id="TD_4">
<strong id="STRONG_5">52-Week High:</strong>
</td>
<td id="TD_6">
1,116.00
</td>
<td id="TD_7">
<strong id="STRONG_8">Earnings Per Share TTM (EPS):</strong>
</td>
<td id="TD_9">
47.87 (15.57%)
</td>
<td id="TD_10">
<strong id="STRONG_11">Price to Book Value (P/BV):</strong>
</td>
<td id="TD_12">
2.5481125565
</td>
</tr>
<tr id="TR_13">
<td id="TD_14">
<strong id="STRONG_15">52-Week Low:</strong>
</td>
<td id="TD_16">
867.50
</td>
<td id="TD_17">
<strong id="STRONG_18">Price-Earnings Ratio TTM (P/E):</strong>
</td>
<td id="TD_19">
20.8272404429
</td>
<td id="TD_20">
<strong id="STRONG_21">Return on Equity (ROE):</strong>
</td>
<td id="TD_22">
12.42%
</td>
</tr>
<tr id="TR_23">
<td id="TD_24">
<strong id="STRONG_25">Fair Value:</strong>
</td>
<td id="TD_26">
-
</td>
<td id="TD_27">
<strong id="STRONG_28">Dividends Per Share (DPS):</strong>
</td>
<td id="TD_29">
-
</td>
<td id="TD_30">
<strong id="STRONG_31">Recommendation:</strong>
</td>
<td id="TD_32">
None<span id="SPAN_33"></span>
</td>
</tr>
<tr id="TR_34">
<td id="TD_35">
<strong id="STRONG_36">Last Price:</strong>
</td>
<td id="TD_37">
<span id="SPAN_38"></span> <span id="SPAN_39">984.5</span>
</td>
</tr>
</tbody>
</table>
I have also included my code for your reference.
Any help would be very much appreciated! Thank you!
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
import pandas as pd
myurl = "https://www.investagrams.com/Stock/ac"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(myurl,headers=hdr)
# Open connection to website
uClient = urlopen(req)
# Offloads the content to variable
page_html = uClient.read()
#just closing it
uClient.close()
# html parser
page_soup = soup(page_html, "html.parser")
table = page_soup.find("div", {"id":"FundamentalAnalysisPanel"}).find("table")
print(table.text)

You can do it with findNextSibling method.
import requests
from bs4 import BeautifulSoup
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get('https://www.investagrams.com/Stock/ac', timeout=10)
# Stop on an HTTP error status instead of parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')
# specify table parameters for which you want to find values
parameters = ['52-Week High:', '52-Week Low:', 'Earnings Per Share TTM (EPS):', 'Price-Earnings Ratio TTM (P/E):', 'Price to Book Value (P/BV):']
# iterate all <td> tags and print text of the next sibling (with value),
# if this <td> contains a <strong> label matching one of the parameters.
for td in soup.find_all('td'):
    # Passing the list as string= matches any one of the labels, so no inner
    # loop over the parameters is needed.
    if td.find('strong', string=parameters) is not None:
        value_cell = td.find_next_sibling()
        # A label in the last cell of a row has no sibling holding a value.
        if value_cell is not None:
            print(value_cell.text.strip())
Result:
1,116.00
47.87 (15.57%)
2.5481125565
867.50
20.8272404429

This might be what you want
page_soup = soup(req.data.decode('utf-8'))
#tables = page_soup.find_all('table')
tables = page_soup.find_all('td')
df = pd.read_html(str(tables[i]))
where i is the table you want

How to use BeauifulSoup for parsing data in following example?

I am a beginner in Python and BeautifulSoup and I am trying to make a web scraper. However, I am facing some issues and can't figure out a way out. Here is my issue:
This is the part of the HTML that I want to scrape:
<tr>
<td class="num cell-icon-string" data-sort-value="6">
<td class="cell-icon-string"><a class="ent-name" href="/pokedex/charizard" title="View pokedex for #006 Charizard">Charizard</a></td>
</tr>
<tr>
<td class="num cell-icon-string" data-sort-value="6">
<td class="cell-icon-string"><a class="ent-name" href="/pokedex/charizard" title="View pokedex for #006 Charizard">Charizard</a><br>
<small class="aside">Mega Charizard X</small></td>
</tr>
Now, I want to extract "Charizard" from 1st table row and "Mega Charizard X" from the second row. Right now, I am able to extract "Charizard" from both rows.
Here is my code:
#!/usr/bin/env python3
from bs4 import BeautifulSoup
soup = BeautifulSoup(open("data.html"), "lxml")
poke_boxes = soup.findAll('a', attrs = {'class': 'ent-name'})
for poke_box in poke_boxes:
poke_name = poke_box.text.strip()
print(poke_name)

import bs4
html = '''<tr>
<td class="num cell-icon-string" data-sort-value="6">
<td class="cell-icon-string"><a class="ent-name" href="/pokedex/charizard" title="View pokedex for #006 Charizard">Charizard</a></td>
</tr>
<tr>
<td class="num cell-icon-string" data-sort-value="6">
<td class="cell-icon-string"><a class="ent-name" href="/pokedex/charizard" title="View pokedex for #006 Charizard">Charizard</a><br>
<small class="aside">Mega Charizard X</small></td>
</tr>'''
soup = bs4.BeautifulSoup(html, 'lxml')
in:
[tr.get_text(strip=True) for tr in soup('tr')]
out:
['Charizard', 'CharizardMega Charizard X']
You can use get_text() to concatenate all the text in the tag; strip=True strips the surrounding whitespace from each piece of text.

You'll need to change your logic to go through the rows and check to see if the small element exists, if it does print out that text, otherwise print out the anchor text as you are now.
soup = BeautifulSoup(html, 'lxml')
for tr in soup.find_all('tr'):
    # Prefer the <small> sub-name (e.g. "Mega Charizard X") when the row has one.
    small = tr.find('small')
    if small is not None:
        print(small.text)
        continue
    # Otherwise fall back to the anchor text; rows without an anchor are
    # skipped instead of raising IndexError.
    anchor = tr.find('a')
    if anchor is not None:
        print(anchor.text)

How to only print certain text using BeautifulSoup

I am trying to pull some financial data for city governments using BeautifulSoup (had to convert the files from pdf). I just want to get the data as a csv file and then I'll analyze it in Excel or SAS. My problem is that I do not want to print the "& nbsp;" that is in the original HTML, just the numbers and the row heading. Any suggestions on how I can do this without using regex?
Below is a sample of the html I am looking at. Next is my code (currently just in proof of concept mode, need to prove I can get clean data before moving on). New to Python and programming so any help is appreciated.
<TD class="td1629">Investments (Note 2)</TD>
<TD class="td1605"> </TD>
<TD class="td479"> </TD>
<TD class="td1639">-</TD>
<TD class="td386"> </TD>
<TD class="td116"> </TD>
<TD class="td1634">2,207,592</TD>
<TD class="td479"> </TD>
<TD class="td1605"> </TD>
<TD class="td1580">2,207,592</TD>
<TD class="td301"> </TD>
<TD class="td388"> </TD>
<TD class="td1637">2,882,018</TD>
CODE
import htmllib
import urllib
import urllib2
import re
from BeautifulSoup import BeautifulSoup
CAFR = open("C:/Users/snown/Documents/CAFR2004 BFS Statement of Net Assets.html", "r")
soup = BeautifulSoup(CAFR)
assets_table = soup.find(True, id="page_27").find(True, id="id_1").find('table')
rows = assets_table.findAll('tr')
for tr in rows:
cols = tr.findAll('td')
for td in cols:
text = ''.join(td.find(text=True))
print text+"|",
print

soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
It converts `&nbsp;` and other HTML entities to the appropriate characters.
To write it to a csv file:
>>> import csv
>>> import sys
>>> csv_file = sys.stdout
>>> writer = csv.writer(csv_file, delimiter="|")
>>> soup = BeautifulSoup("<tr><td>1<td> <td>3",
... convertEntities=BeautifulSoup.HTML_ENTITIES)
>>> writer.writerows([''.join(t.encode('utf-8') for t in td(text=True))
... for td in tr('td')] for tr in soup('tr'))
1| |3
I've used t.encode('utf-8') because `&nbsp;` is translated to the non-ASCII U+00A0 (no-break space) character.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract Text and the image from a webpage using BeautifulSoup - python

Related

How to parse html table in python

Unable to acces element while having a SRE match using BeautifulSoup

How to extract specific <td> from table

How to use BeauifulSoup for parsing data in following example?

How to only print certain text using BeautifulSoup

Categories

Resources