How to scrape content from nested div html - python

I tried to get price ($50.56) info from this link: https://www.google.com/search?biw=1600&bih=758&output=search&tbm=shop&q=McKleinUSA+17564+N+Series+ARIA+%28Khaki%29&oq=McKleinUSA+17564+N+Series+ARIA+%28Khaki%29&gs_l=products-cc.12...26051.26051.0.27263.1.1.0.0.0.0.71.71.1.1.0....0...1ac.2.64.products-cc..0.0.0....0.AxAVt6XRExI#spd=15005512733707849930
but I keep getting nothing.
I read the link from an Excel file using openpyxl, then use requests and bs4 to scrape the page.
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests

# Read the target URL from the hyperlink stored in cell A2 of the workbook.
wb = load_workbook(filename="test.xlsx")
ws = wb.active  # get_active_sheet() is deprecated in favour of the .active property
html = ws['A2'].hyperlink.target
# Download the page and look the price up by the CSS class of its <span>.
source = requests.get(html).text
soup = BeautifulSoup(source, 'lxml')
test = soup.find('span', attrs={'class': "O8U6h"})

The class names are dynamic, but there is a constant id, and you can use the relationship of the b tag to that id:
import requests
from bs4 import BeautifulSoup as bs

# Anchor on the constant "ires" id rather than a class name, then take the
# first <b> element found beneath it.
url = 'https://www.google.com/search?biw=1600&bih=758&output=search&tbm=shop&q=McKleinUSA+17564+N+Series+ARIA+%28Khaki%29&oq=McKleinUSA+17564+N+Series+ARIA+%28Khaki%29&gs_l=products-cc.12...26051.26051.0.27263.1.1.0.0.0.0.71.71.1.1.0....0...1ac.2.64.products-cc..0.0.0....0.AxAVt6XRExI#spd=15005512733707849930'
page = requests.get(url)
parsed = bs(page.content, 'lxml')
price_tag = parsed.select_one('#ires b')
print(price_tag.text)

Looks like the class names change. I opened the link myself and couldn't find an element with class O8U6h.

Related

BS4 returns [] instead of the wanted HTML tag

I want to parse the given website and scrape the table. To me the code looks right. New to python and web parsing
import requests
from bs4 import BeautifulSoup

page = requests.get('https://delhifightscorona.in/')
# Parse with the 'lxml-xml' parser and collect every <div class="cell">.
parsed = BeautifulSoup(page.text, 'lxml-xml')
print(parsed.find_all('div', {"class": "cell"}))
Doing this returns:
[]
Change your parser and the class and there you have it.
import requests
from bs4 import BeautifulSoup

# Fetch the page, parse it with the built-in HTML parser, and narrow down to
# the grid container before reading its first <h3>.
html = requests.get('https://delhifightscorona.in/').text
page = BeautifulSoup(html, 'html.parser')
grid = page.find('div', {"class": "grid-x grid-padding-x small-up-2"})
print(grid.find("h3").getText())
Output:
423,831
You can choose to print only the cases or the total stats with the date.
import requests
from bs4 import BeautifulSoup

html = requests.get('https://delhifightscorona.in/').text
stats_block = BeautifulSoup(html, 'html.parser').find('div', {"class": "cell medium-5"})
# First the whole block (dates and figures), then only the case count.
print(stats_block.text)
print(stats_block.find('h3').text)

python BeautifulSoup4 - why does result print 5 times?

I'm trying to do some web scraping in python using BeautifulSoup 4.
I am trying to scrape the salary of a public employee. I am doing that successfully, but the result is returned 5 times and I cannot figure out why.
Here is the website I am scraping: https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett
Here is my code example:
import requests
from bs4 import BeautifulSoup

source = requests.get('https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett')
soup = BeautifulSoup(source.text, 'html.parser')
# Visit every <div> on the page. find() searches all descendants, so each
# <div> that contains the pay box produces a match of its own.
main_box = soup.find_all('div')
for i in main_box:
    try:
        x = i.find('div', class_='col-12 col-lg-4 pay')
        z = x.find('h2').text
        print(z)
    except AttributeError:
        # find() returned None: this <div> has no matching descendant.
        pass
And my results are:
$525,000
$525,000
$525,000
$525,000
$525,000
This is the correct salary, but as I said the results print 5 times.
If I go to the page, right click, and 'inspect', I find the class I am looking for, which is 'col-12 col-lg-4 pay', and then within that the 'h2' tag. There is only one 'h2' tag, and I print the text of that.
So it seems I am missing something, but what?
I would just get rid of the for loop and use a more specific find query
import requests
from bs4 import BeautifulSoup

url = 'https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett'
page = BeautifulSoup(requests.get(url).text, 'html.parser')
# A single find() on the "pay" class replaces the loop over every <div>.
pay_box = page.find("div", {"class": "pay"})
salary = pay_box.find('h2')
print(salary.text)
You can also extract this by using a CSS selector:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

url = 'https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett'
page = BeautifulSoup(requests.get(url).text, 'html.parser')
# select() returns a list of matches for the id selector; print the first.
matches = page.select('#paytotal')
print(matches[0].text)

How to receive website link in Python using BeautifulSoup

I want to collect the link /hmarchhak/102217 from a site (https://www.vanglaini.org/) and print it as https://www.vanglaini.org/hmarchhak/102217. Please help.
Img
import requests
import pandas as pd
from bs4 import BeautifulSoup

source = requests.get('https://www.vanglaini.org/').text
soup = BeautifulSoup(source, 'lxml')
# Print the headline, summary and link of every <article> on the front page.
for article in soup.find_all('article'):
    headline = article.a.text
    summary = article.p.text
    # NOTE(review): attribute-style access on a tag looks up a child tag named
    # "href", not the href attribute of the <a> element.
    link = article.a.href
    print(headline)
    print(summary)
    print(link)
    print()
This is my code.
Unless I am missing something headline and summary appear to be the same text. You can use :has with bs4 4.7.1+ to ensure your article has a child href; and this seems to strip out article tag elements that are not part of main body which I suspect is actually your aim
import re

from bs4 import BeautifulSoup as bs
import requests

base = 'https://www.vanglaini.org'
r = requests.get(base)
soup = bs(r.content, 'lxml')
# :has([href]) keeps only <article> elements that contain an element with an
# href attribute (requires bs4 4.7.1+).
for article in soup.select('article:has([href])'):
    headline = article.h5.text.strip()
    # Collapse runs of line breaks inside the summary into single spaces.
    summary = re.sub(r'\n+|\r+', ' ', article.p.text.strip())
    # The href is site-relative, so prefix it with the base URL.
    link = f"{base}{article.a['href']}"
    print(headline)
    print(summary)
    print(link)

BeautifulSoup find_all by table and id class returning no results?

I am trying to scrape box-score data from ProFootball reference. After running into issues with javascript, I turned to selenium to get the initial soup object. I'm trying to find a specific table on a website and subsequently iterate through its rows.
The code works if I simply use find_all('table')[#]; however, the # changes depending on which box score I am looking at, so it isn't reliable. I therefore want to use the id='player_offense' attribute to identify the same table across games, but when I use it, it returns nothing. What am I missing here?
from selenium import webdriver
import os
from bs4 import BeautifulSoup

# path to chromedriver
chrome_path = os.path.expanduser('~/Documents/chromedriver.exe')
driver = webdriver.Chrome(chrome_path)
driver.get('https://www.pro-football-reference.com/boxscores/201709070nwe.htm')
# Parse the page source obtained through the browser, then close the browser.
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
# doesn't work
soup.find('table', id='player_offense')
# works
table = soup.find_all('table')[3]
The data is inside HTML comments. Find the appropriate comment and then extract the table from it:
from io import StringIO

import requests
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import pandas as pd

r = requests.get('https://www.pro-football-reference.com/boxscores/201709070nwe.htm#')
soup = bs(r.content, "lxml")
# Collect every HTML comment node in the document.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for comment in comments:
    # Stop at the first comment whose markup contains the wanted table id.
    if 'id="player_offense"' in comment:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        print(pd.read_html(StringIO(str(comment)))[0])
        break
This also works.
from io import StringIO

from requests_html import HTMLSession, HTML
import pandas as pd

with HTMLSession() as s:
    r = s.get('https://www.pro-football-reference.com/boxscores/201709070nwe.htm')
    # Re-wrap the response text as an HTML object and render it.
    r = HTML(html=r.text)
    r.render()
    table = r.find('table#player_offense', first=True)
    # Wrap the markup in StringIO: passing literal HTML strings to read_html
    # is deprecated in recent pandas releases.
    df = pd.read_html(StringIO(table.html))
    print(df)

How to scrape a table with a blind caption?

I'm scraping a table from a page.
But the table's caption is 'blind'.
Is there no way to extract the table from the site?
Using BeautifulSoup like:
# urllib.request exposes urlopen; it has no member named "urllib".
from urllib.request import urlopen
from bs4 import BeautifulSoup
Take a look at this:
import bs4 as bs
import urllib.request

link = 'http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cn=&cmp_cd=005930&menuType=block'
# Use the response as a context manager so the connection is closed once the
# document has been parsed.
with urllib.request.urlopen(link) as source:
    soup = bs.BeautifulSoup(source, 'html.parser')
# Locate the table by its id and print the text of every cell, row by row.
table = soup.find('table', attrs={'id': 'cTB24'})
for tr in table.find_all('tr'):
    for td in tr.find_all('td'):
        print(td.text)

Categories

Resources