How to scrape price data on page using BeautifulSoup - python

I am new to web scraping and am having trouble figuring out how to scrape all the prices on the webpage below. What I tried returns a blank result; any pointers would be great!
import bs4
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from datetime import datetime
from pytz import timezone
import urllib.request
# Listing page whose prices the asker wants to extract.
url = 'https://www.remax.ca/find-real-estate'
# Download the page; the response object is handed directly to the parser.
page = urlopen(url)
soup = bs4.BeautifulSoup(page,'html.parser')
# Collect every <h3> element carrying the class "price".
# NOTE(review): the asker reports this comes back blank -- presumably the
# prices are not present in the HTML that urlopen receives; confirm by
# inspecting the downloaded markup.
price = soup.findAll('h3', {'class' : 'price'})

First thing, if you use from bs4 import BeautifulSoup, don't use import bs4 too.
Second, write soup = BeautifulSoup(page, 'html.parser')
Then use price = soup.find_all('h3', {'class': 'price'})
After this, "price" should hold all the price elements, but you still need to refine the result, because in that form each item is the whole h3 tag (markup included) rather than just its text.
EDIT
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from datetime import datetime
import urllib.request
# Download the listing page and parse it with the "html.parser" backend.
url = 'https://www.remax.ca/find-real-estate'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
# Collect every <h3 class="price"> element, then print only the text of each
# one so the surrounding markup is left out.
price = soup.find_all('h3', {'class': 'price'})
for p in price:
    print(p.text)
This should do the job. I removed pandas because I do not have it installed.

Related

How to use find_all method in bs4 on an object without class

import requests
from bs4 import BeautifulSoup
# Fetch the index page and keep the decoded body text.
result=requests.get('http://textfiles.com/stories/').text
# Parse that text with the 'lxml' backend.
soup=BeautifulSoup (result, 'lxml')
# Ask for every <tr> element in the parsed document and print the list.
# NOTE(review): the asker reports find() works here but find_all() does not.
stories=soup.find_all('tr')
print (stories)
The find method works, but find_all doesn't. I'm not sure why; maybe it is because the element doesn't have a class?
correct code is
import requests
from bs4 import BeautifulSoup
# Keep the whole response object this time; its raw bytes are parsed below.
result=requests.get('http://textfiles.com/stories/')
# Parse the raw bytes (result.content) with the 'html5lib' backend.
soup = BeautifulSoup(result.content, 'html5lib')
# List of every <tr> element found in the parsed document.
stories=soup.find_all('tr')
you can access each 'tr' by
stories[0]
0 can be replaced with any number in list
You can also use Pandas
eg
import pandas
import requests
from bs4 import BeautifulSoup
# Download the index page, re-serialise it through html5lib, and let pandas
# parse the resulting markup; whatever read_html returns is printed.
response = requests.get('http://textfiles.com/stories/')
pretty_markup = BeautifulSoup(response.content, 'html5lib').prettify()
df = pandas.read_html(pretty_markup)
print(df)

How to breakup webpage text

I assume I have to use the <br/> tags to break up this text and extract only the fixtures, but I cannot figure out how to do it at all!
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd
import pprint
import json
from urllib.request import urlopen
# Article whose fixture list should be extracted.
sample_web_page = 'https://bleacherreport.com/articles/10005879-epl-schedule-2021-22-official-list-of-fixtures-for-new-premier-league-season'
# Download the article with an HTTP GET request.
page = requests.get(sample_web_page)
# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(page.content, "html.parser")
# Collect <p> elements, filtering on an empty class value, and print the
# resulting list unchanged.
z = soup.findAll('p', {'class':''})
print(z)

Getting a specific value from a website in python

I have a link to a product on AliExpress, and I want to get the product's price and print it. Something like this:
import requests
import urllib2
import sys
import urllib.parse
import urllib.request
import re
from bs4 import BeautifulSoup as BS
# Product page used as the example target.
url ="https://www.amazon.com/Optimum-Nutrition-Standard-Naturally-Flavored/dp/B00QQA0H3S?pd_rd_wg=Hiuuc&pd_rd_r=2542737c-992b-4b5c-b7de-05acce3929d5&pd_rd_w=be1cP&ref_=pd_gw_simh&pf_rd_r=WFBB3JV61PKGSDJRW0C1&pf_rd_p=b841581f-e864-5164-afa6-4c18a8348879"
# Open the URL, read the whole body, then close the connection explicitly.
usock = urllib2.urlopen(url)
data = usock.read()
usock.close()
# Parse the downloaded body; no parser argument is passed.
soup = BS(data)
# Take the first <span> whose id is "volume"; indexing with [0] raises
# IndexError if nothing matches.
price = soup.findAll("span", {"id": "volume"})[0]
# Python 2 print statement: outputs the matched tag itself.
print price
In this Amazon link, I want to find the 55.01 value. (Amazon is just an example. The website does not matter; it may be AliExpress, etc.)

Having trouble in getting page source with beautifulsoup

I am trying to get the HTML source of a web page using beautifulsoup.
import bs4 as bs
import requests
import urllib.request
# Forum thread whose HTML source the asker wants to obtain.
sourceUrl='https://www.pakwheels.com/forums/t/planing-a-trip-from-karachi-to-lahore-by-road-in-feb-2017/414115/2.html'
# Download the page and read the full response body.
source=urllib.request.urlopen(sourceUrl).read()
# Parse the body with the 'html.parser' backend and print the parsed document.
soup=bs.BeautifulSoup(source,'html.parser')
print(soup)
I want the HTML source of the page. This is what I am getting now:
'ps.store("siteSettings", {"title":"PakWheels Forums","contact_email":"sami.ullah#pakeventures.com","contact_url":"https://www.pakwheels.com/main/contact_us","logo_url":"https://www.pakwheels.com/assets/logo.png","logo_small_url":"/images/d-logo-sketch-small.png","mobile_logo_url":"data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4NCjwhLS0gR2VuZXJhdG9yOiBBZG9iZSBJbGx1c3RyYXRvciAxNi4wLjAsIFNWRyBFeHBvcnQgUGx1Zy1JbiAuIFNWRyBWZXJzaW9uOiA2LjAwIEJ1aWxkIDApICAtLT4NCjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+DQo8c3ZnIHZlcnNpb249IjEuMSIgaWQ9IkxheWVyXzEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4Ig0KCSB3aWR0aD0iMjQwcHgiIGhlaWdodD0iNjBweCIgdmlld0JveD0iMCAwIDI0MCA2MCIgZW5hYmxlLWJhY2tncm91bmQ9Im5ldyAwIDAgMjQwIDYwIiB4bWw6c3BhY2U9InByZXNlcnZlIj4NCjxwYXRoIGZpbGw9IiNGRkZGRkYiIGQ9Ik02LjkwMiwyMy4yODZDMzQuNzc3LDIwLjI2Miw1Ny4yNC'
Have a look at this code:
from urllib import request
from bs4 import BeautifulSoup
# Open the page and hand the response object straight to BeautifulSoup
# (no parser argument is passed), then print the pretty-printed document.
url_1 = "http://www.google.com"
response = request.urlopen(url_1)
document = BeautifulSoup(response)
print(document.prettify())
Import everything you need correctly. Read this.

list out of range error when using soup.select('placeholder')[0].get_text() in Beautiful soup

New to scraping, and I'm trying to use Beautiful Soup to get the Wheelbase value (and eventually other things) from a Wikipedia page (I'll deal with robots.txt later). This is the guide I've been using.
Two questions
1.) How do I resolve the error below?
2.) How do I scrape the value in the cell that contains the wheelbase? Is it just "td#Wheelbase td"?
The error I get is
File "evscraper.py", line 25, in <module>
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3') [0].get_text()
IndexError: list index out of range
Thanks for any help!
__author__ = 'KirkLazarus'
import re
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import bs4
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia article for the Tesla Model S.
response =requests.get ('https://en.wikipedia.org/wiki/Tesla_Model_S')
# Parse the decoded page text; no parser argument is passed.
soup = bs4.BeautifulSoup(response.text)
# Select <h3> elements inside a <div id="Wheelbase"> and take the first one.
# This is the line that raises "IndexError: list index out of range" in the
# traceback quoted with the question: the selection is empty, so [0] fails.
# NOTE(review): wheelbase_data is not assigned anywhere in this snippet before
# it is indexed.
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3')[0].get_text()
# Python 2 print statement.
print wheelbase_data
Well your first problem is with your selector. There's no div with the ID of "Wheelbase" on that page, so it's returning an empty list.
What follows is by no means perfect, but will get you what you want, only because you know the structure of the page already:
import re
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import bs4
from bs4 import BeautifulSoup
import requests
# Result dictionary; filled in only if the Wheelbase link is found.
wheelbase_data = {}
response = requests.get('https://en.wikipedia.org/wiki/Tesla_Model_S')
soup = bs4.BeautifulSoup(response.text)
# Scan the anchors for the one pointing at the Wheelbase article and stop at
# the first match. Start from None so a page without that link does not leave
# `wheelbase` unbound.
wheelbase = None
for link in soup.find_all('a'):
    if link.get('href') == "/wiki/Wheelbase":
        wheelbase = link
        break
# Climb two levels up from the link and read the text of the first <td> found
# there. This relies on the page layout (presumably the link's grandparent is
# the table row holding the value -- verify against the live page).
if wheelbase is not None:
    wheelbase_data['Wheelbase'] = wheelbase.parent.parent.td.text
It looks like you're looking for the incorrect path. I've had to do something similar in the past. I'm not sure if this is the best approach, but it certainly worked for me.
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
# Table being built: one row per model, one column per infobox field.
car_data = pd.DataFrame()
models = ['Tesla_Model_S', 'Tesla_Model_X']
for model in models:
    wiki = "https://en.wikipedia.org/wiki/{0}".format(model)
    # Send a browser-like User-Agent header with the request.
    header = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(wiki, headers=header)
    page = urllib2.urlopen(req)
    soup = BeautifulSoup(page)
    table = soup.find("table", {"class": "infobox hproduct"})
    if table is None:
        # No matching infobox on this page: skip the model rather than fail
        # with an AttributeError on None.
        continue
    # The first two rows of the table are skipped.
    for row in table.findAll("tr")[2:]:
        labels = row.findAll("th")
        cells = row.findAll("td")
        if not labels or not cells:
            # A row without both a <th> and a <td> has no field/value pair.
            # Checking explicitly replaces the bare `except: pass`, which
            # hid every other kind of error as well.
            continue
        field = labels[0].text.strip()
        val = cells[0].text.strip()
        # .loc creates the row and/or column on first use; it replaces
        # DataFrame.set_value, which has been removed from pandas.
        car_data.loc[model, field] = val
# Bare expression: displays the frame in an interactive session.
car_data

Categories

Resources