Python: extract all the information(src, href, title) inside the class - python

I found that I can extract all the information I want from this HTML. I need to extract the title, href and src from it.
HTML:
<div class="col-md-2 col-sm-2 col-xs-2 home-hot-thumb">
<a itemprop="url" href="/slim?p=3090" class="main">
<img src="/FileUploads/Post/3090.jpg?w=70&h=70&mode=crop" alt="apple" title="apple" />
</a>
</div>
<div class="col-md-2 col-sm-2 col-xs-2 home-hot-thumb">
<a itemprop="url" href="/slim?p=3091" class="main">
<img src="/FileUploads/Post/3091.jpg?w=70&h=70&mode=crop" alt="banana" title="banana" />
</a>
</div>
Code:
import requests
from bs4 import BeautifulSoup

# Fetch the landing page and parse it with the lxml backend.
response = requests.get('http://www.cad.com/')
soup = BeautifulSoup(response.text, "lxml")

# Visit every <div id="home">, then every element with class "main" inside it.
for container in soup.findAll('div', {"id": "home"}):
    for anchor in container.select(".main"):
        link = "http://www.cad.com" + anchor.get('href')
        print(link)
        # The title attribute is looked up on the ".main" element itself.
        print(anchor.get('title'))
I can successfully get href from this, but since title and src are in another line, I don't know how to extract them. After this, I want to save them in excel, so maybe I need to finish one first then do the second one.
Expected output:
/slim?p=3090
apple
/FileUploads/Post/3091.jpg?w=70&h=70&mode=crop" alt="banana" title="banana
/slim?p=3091
banana
/FileUploads/Post/3091.jpg?w=70&h=70&mode=crop" alt="banana" title="banana

My own solution:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = 'http://www.cad.com/'

res = requests.get(BASE_URL)
soup = BeautifulSoup(res.text, "lxml")

# Each thumbnail <div class="home-hot-thumb"> wraps one <a> that wraps one <img>.
for home in soup.find_all('div', {"id": "home"}):
    for thumb in home.find_all('div', {"class": "home-hot-thumb"}):
        title = thumb.img.get('title')
        print(title)
        # urljoin resolves the site-relative path against the base URL, so the
        # leading "/" of the href does not produce "http://www.cad.com//slim...".
        href = urljoin(BASE_URL, thumb.a.get('href'))
        print(href)
        src = urljoin(BASE_URL, thumb.img.get('src'))
        # Drop whatever query string follows the image path (e.g. the
        # "?w=70&h=70&mode=crop" resize parameters).
        print(src.partition('?')[0])

Related

Python: Deleting all divs without class

I want to delete all divs without classes (but not the content that is in the div).
My input
<h1>Test</h1>
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
The output I want
<h1>Test</h1>
<div class="test">
<p>abc</p>
</div>
My try 1
Based on "Deleting a div with a particular class":
from bs4 import BeautifulSoup

markup = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
soup = BeautifulSoup(markup, 'html.parser')
classless_divs = soup.find_all("div", {'class': ''})
for node in classless_divs:
    # decompose() removes the tag together with everything nested inside it.
    node.decompose()
print(soup)
# <h1>Test</h1>
My try 2
from htmllaundry import sanitize

myinput = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
myoutput = sanitize(myinput)
# print() as a function, matching the other attempts on this page.
print(myoutput)
# <p>Test</p><p>abc</p> instead of <h1>Test</h1><div class="test"><p>abc</p></div>
My try 3
Based on "Clean up HTML in python"
from lxml.html.clean import Cleaner


def sanitize(dirty_html):
    """Return dirty_html run through an lxml Cleaner set to remove font and div tags."""
    return Cleaner(remove_tags=('font', 'div')).clean_html(dirty_html)


myhtml = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
cleaned = sanitize(myhtml)
print(cleaned)
# <div><h1>Test</h1><p>abc</p></div>
My try 4
from html_sanitizer import Sanitizer

# Sanitizer built with its default configuration.
sanitizer = Sanitizer()
markup = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
output = sanitizer.sanitize(markup)
print(output)
# <h1>Test</h1><p>abc</p>
Problem: A div element is used to wrap the HTML fragment for the parser, therefore div tags are not allowed. (Source: Manual)
If you want to exclude div without class, preserving its content:
from bs4 import BeautifulSoup

markup = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
soup = BeautifulSoup(markup, "html.parser")

for element in soup.find_all():
    # Skip <div> tags that carry no class attribute; print every other tag.
    if element.name == 'div' and not element.has_attr('class'):
        continue
    print(element)
Output:
<h1>Test</h1>
<div class="test"><p>abc</p></div>
<p>abc</p>
Please check this out:
from bs4 import BeautifulSoup

data="""
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
"""
soup = BeautifulSoup(data, features="html5lib")

# class_=True keeps only the <div> tags that have a class attribute at all.
classed_divs = soup.find_all("div", class_=True)
for classed_div in classed_divs:
    print(classed_div)

I want to get the value of multiple ids inside an a tag that resides in a div

Here's the HTML code:
<div class="sizeBlock">
<div class="size">
<a class="selectSize" id="44526" data-size-original="36.5">36.5</a>
</div>
<div class="size inactive active">
<a class="selectSize" id="44524" data-size-original="40">40</a>
</div>
<div class="size ">
<a class="selectSize" id="44525" data-size-original="40.5">40.5</a>
</div>
</div>
I want to get the values of the id tag and the data-size-original.
Here's my code:
# Collect the id and size of every <a class="selectSize"> in the document.
for sizeBlock in soup.find_all('a', class_="selectSize"):
    aid = sizeBlock.get('id')
    # The markup stores the size in "data-size-original"; there is no
    # "data-size-us" attribute, so reading that one always yields None.
    size = sizeBlock.get('data-size-original')
The problem is that it gets the values of other ids that have the same class "selectSize".
I think this is what you want. You won't have ids and size from data in div class='size inactive active'
# "div.size" alone also matches <div class="size inactive active">, because
# that div still has the "size" class; :not(.inactive) is what excludes it.
for sizeBlock in soup.select('div.size:not(.inactive) a.selectSize'):
    aid = sizeBlock.get('id')
    # The size lives in the "data-size-original" attribute of the anchor.
    size = sizeBlock.get('data-size-original')
Already answered here How to Beautiful Soup (bs4) match just one, and only one, css class
Use soup.select. Here's a simple test:
from bs4 import BeautifulSoup

html_doc = """<div class="size">
<a class="selectSize otherclass" id="44526" data-ean="0193394075362" " data-tprice="" data-sku="1171177-36.5" data-size-original="36.5">5</a>
</div>"""
soup = BeautifulSoup(html_doc, 'html.parser')

# soup.find_all('a', class_="selectSize") would include this anchor, because
# class_ matches any single class in the list. The attribute selector below
# compares the whole class value, so "selectSize otherclass" is not selected.
for sizeBlock in soup.select("a[class='selectSize']"):
    aid = sizeBlock.get('id')
    size = sizeBlock.get('data-size-original')
    # print() as a function so the snippet runs on Python 3.
    print(aid, size)

Find tag with certain child tag using bs4 python

I have an html in the below format.
<div class="consider">
<div class="row">
<p>Text1</p>
</div>
</div>
<div class="consider">
<h2>Hello</h2>
</div>
<div class="Consider">
<div class="row">
<p>Text2</p>
</div>
</div>
I want to get the tag div only where its child tag(div) has class as "row"
this is how you can access it :
from bs4 import BeautifulSoup

content = '<div class="consider"><div class="row"><p>Text1</p></div></div><div class="consider"><h2>Hello</h2></div><div class="Consider"><div class="row"><p>Text2</p></div></div>'
soup = BeautifulSoup(content, 'lxml')

for div in soup.find_all('div', class_='row'):
    if div.parent.name == "div":
        # div.parent is the element you want: a <div> that has a div.row child.
        # A comment alone is not a valid block body, so do something with it.
        print(div.parent)
With select('div > div.row') we select all div tags with class row as direct children of div tag and then through list comprehension we select all parents of these tags:
data = '<div class="consider"><div class="row"><p>Text1</p></div></div><div class="consider"><h2>Hello</h2></div><div class="Consider"><div class="row"><p>Text2</p></div></div>'
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')

# Collect the parent of every div.row that is a direct child of a <div>.
divs = []
for row in soup.select('div > div.row'):
    divs.append(row.parent)
print(divs)
Outputs:
[<div class="consider"><div class="row"><p>Text1</p></div></div>, <div class="Consider"><div class="row"><p>Text2</p></div></div>]

i want to scrape data using python script

I have written python script to scrape data from http://www.cricbuzz.com/cricket-stats/icc-rankings/batsmen-rankings
It is a list of 100 players and I successfully scraped this data. The problem is that when I run the script, instead of scraping the data just once, it scrapes the same data 3 times.
<div class="cb-col cb-col-100 cb-font-14 cb-lst-itm text-center">
<div class="cb-col cb-col-16 cb-rank-tbl cb-font-16">1</div>
<div class="cb-col cb-col-50 cb-lst-itm-sm text-left">
<div class="cb-col cb-col-33">
<div class="cb-col cb-col-50">
<span class=" cb-ico" style="position:absolute;"></span> –
</div>
<div class="cb-col cb-col-50">
<img src="http://i.cricketcb.com/i/stats/fw/50x50/img/faceImages/2250.jpg" class="img-responsive cb-rank-plyr-img">
</div>
</div>
<div class="cb-col cb-col-67 cb-rank-plyr">
<a class="text-hvr-underline text-bold cb-font-16" href="/profiles/2250/steven-smith" title="Steven Smith's Profile">Steven Smith</a>
<div class="cb-font-12 text-gray">AUSTRALIA</div>
</div>
</div>
<div class="cb-col cb-col-17 cb-rank-tbl">906</div>
<div class="cb-col cb-col-17 cb-rank-tbl">1</div>
</div>
And here is the Python script I wrote to scrape each player's data.
import sys,requests,csv,io
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "http://www.cricbuzz.com/cricket-stats/icc-rankings/batsmen-rankings"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Every <div> on the page whose class list contains "text-center".
maindiv = soup.find_all("div", {"class": "text-center"})
for row in maindiv:
    print(row.text)
but instead of scraping the data once, it scrapes the same data 3 times.
Where can I make changes to get data just one time?
Select the table and look for the divs in that:
# Only div.text-center elements nested under the element with id "batsmen-tests".
maindiv = soup.select("#batsmen-tests div.text-center")
for row in maindiv:
    print(row.text)
Your original output and that above gets all the text from the divs as one line which is not really useful, if you just want the player names:
# Anchors inside div.cb-rank-plyr under the element with id "batsmen-tests".
anchors = soup.select("#batsmen-tests div.cb-rank-plyr a")
for anchor in anchors:
    print(anchor.text)
A quick and easy way to get the data in a nice csv format is to just get text from each child:
maindiv = soup.select("#batsmen-tests div.text-center")
# The first matched div is skipped.
for row in maindiv[1:]:
    # First direct text node of each descendant tag; tags without one give None.
    own_texts = (tag.find(text=True, recursive=False) for tag in row.find_all())
    cells = [piece.strip() for piece in filter(None, own_texts)]
    row_data = u",".join(cells)
    if row_data:
        print(row_data)
Now you get output like:
# rank, up/down, name, country, rating, best rank
1,–,Steven Smith,AUSTRALIA,906,1
2,–,Joe Root,ENGLAND,878,1
3,–,Kane Williamson,NEW ZEALAND,876,1
4,–,Hashim Amla,SOUTH AFRICA,847,1
5,–,Younis Khan,PAKISTAN,845,1
6,–,Adam Voges,AUSTRALIA,802,5
7,–,AB de Villiers,SOUTH AFRICA,802,1
8,–,Ajinkya Rahane,INDIA,785,8
9,2,David Warner,AUSTRALIA,772,3
10,–,Alastair Cook,ENGLAND,770,2
11,1,Misbah-ul-Haq,PAKISTAN,764,6
As opposed to:
PositionPlayerRatingBest Rank
Player
1    –Steven SmithAUSTRALIA9061
2    –Joe RootENGLAND8781
3    –Kane WilliamsonNEW ZEALAND8761
4    –Hashim AmlaSOUTH AFRICA8471
5    –Younis KhanPAKISTAN8451
6    –Adam VogesAUSTRALIA8025
The reason you get the output three times is that the website has three categories; you have to select one of them and then search only inside it.
Simplest way of doing it with your code would be to add just one line
import sys,requests,csv,io
from bs4 import BeautifulSoup
# The URL must not contain a space ("batsmen- rankings" points to a different, broken address).
url = "http://www.cricbuzz.com/cricket-stats/icc-rankings/batsmen-rankings"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

# Narrow the search to one category container first, then look for the
# text-center divs only inside it.
specific_div = soup.find_all("div", {"id": "batsmen-tests"})
maindiv = specific_div[0].find_all("div", {"class": "text-center"})
for div in maindiv:
    print(div.text)
This will give similar results with just the Test batsmen; for the other categories, just change the "id" in the specific_div line.

xbmc/kodi python scrape data using BeautifulSoup

I want to edit a Kodi addon that use re.compile to scrape data, and make it use BeautifulSoup4 instead.
The original code is like this:
import urllib, urllib2, re, sys, xbmcplugin, xbmcgui
link = read_url(url)

# One match per <a class="frame..."> entry: (href, title, img src, text after "Length:").
# DOTALL lets ".*?" and ".+?" run across line breaks in the page source.
frame_pattern = re.compile(
    '<a class="frame[^"]*"'
    ' href="(http://somelink.com/section/[^"]+)" '
    'title="([^"]+)">.*?<img src="([^"]+)".+?Length:([^<]+)',
    re.DOTALL,
)
match = frame_pattern.findall(link)
for url, name, thumbnail, length in match:
    addDownLink(name + length, url, 2, thumbnail)
The HTML it is scraping is like this:
<div id="content">
<span class="someclass">
<span class="sec">
<a class="frame" href="http://somlink.com/section/name-here" title="name here">
<img src="http://www.somlink.com/thumb/imgsection/thumbnail.jpg" >
</a>
</span>
<h3 class="title">
name here
</h3>
<span class="details"><span class="length">Length: 99:99</span>
</span>
.
.
.
</div>
How do I get all of url (href), name, length and thumbnail using BeautifulSoup4, and add them in addDownLink(name + length, url, 2, thumbnail)?
from bs4 import BeautifulSoup

html = """<div id="content">
<span class="someclass">
<span class="sec">
<a class="frame" href="http://somlink.com/section/name-here" title="name here">
<img src="http://www.somlink.com/thumb/imgsection/thumbnail.jpg" >
</a>
</span>
<h3 class="title">
name here
</h3>
<span class="details"><span class="length">Length: 99:99</span>
</span>
</div>
"""
soup = BeautifulSoup(html, "lxml")
sec = soup.find("span", {"class": "someclass"})
# get the a tag with the frame class
fr = sec.find("a", {"class": "frame"})
# pull the href from the a/frame and the src from the img nested in it
url, img = fr["href"], fr.find("img")["src"]
# The h3 with the title class holds the name as plain text; it has no <a>
# child in this markup, so select the h3 itself.
name = sec.select("h3.title")[0].text.strip()
# The details span reads "Length: 99:99"; keep what follows the first
# whitespace-separated word and trim the trailing newline.
size = sec.select("span.details")[0].text.split(None, 1)[-1].strip()
print(url, img, name, size)
Which gives you:
('http://somlink.com/section/name-here', 'http://www.somlink.com/thumb/imgsection/thumbnail.jpg', u'name here', u'99:99')
If you have multiple sections, we just need find_all and to apply the logic to each section:
def secs():
    """Yield (url, name, img, size) for every span.someclass section in the module-level html.

    url is the href of the a.frame link, img is the src of the image nested
    in that link, name is the stripped text of h3.title, and size is the
    text of span.details after its first whitespace-separated word.
    """
    soup = BeautifulSoup(html, "lxml")
    for sec in soup.find_all("span", {"class": "someclass"}):
        fr = sec.find("a", {"class": "frame"})
        url, img = fr["href"], fr.find("img")["src"]
        # The sample h3.title contains plain text and no <a>, so "h3.title a"
        # would select nothing and [0] would raise IndexError.
        name = sec.select("h3.title")[0].text.strip()
        size = sec.select("span.details")[0].text.split(None, 1)[-1].strip()
        yield url, name, img, size
If you don't know all the class but you know for instance there is one img tag you can call find on the section:
sec.find("img")["src"]
And the same logic applies to the rest.

Categories

Resources