Getting a specific value from a website in Python - python

I get a product link that is on AliExpress, then I want to get the product's price value and print it. I tried it like this:
import requests
from bs4 import BeautifulSoup as BS

# Product page to scrape. NOTE(review): the element id "volume" used below
# must match the page's actual price element — inspect the HTML to confirm.
url ="https://www.amazon.com/Optimum-Nutrition-Standard-Naturally-Flavored/dp/B00QQA0H3S?pd_rd_wg=Hiuuc&pd_rd_r=2542737c-992b-4b5c-b7de-05acce3929d5&pd_rd_w=be1cP&ref_=pd_gw_simh&pf_rd_r=WFBB3JV61PKGSDJRW0C1&pf_rd_p=b841581f-e864-5164-afa6-4c18a8348879"

# urllib2 does not exist on Python 3; requests (already imported) replaces
# the urlopen/read/close dance and handles the connection lifecycle itself.
response = requests.get(url)
# Pass an explicit parser so bs4 does not emit a "no parser specified" warning.
soup = BS(response.text, "html.parser")
# find_all returns every matching tag; [0] raises IndexError if none match.
price = soup.find_all("span", {"id": "volume"})[0]
print(price)
In this Amazon link, I want to find the 55.01 value. (Amazon is just an example; the website does not matter — it may be AliExpress, etc.)

Related

How to breakup webpage text

I assume I have to use the &lt;br&gt; tags to break up this text and extract the fixtures only, but I cannot figure it out at all!
import requests
from bs4 import BeautifulSoup

# Article listing the 2021-22 Premier League fixtures.
sample_web_page = 'https://bleacherreport.com/articles/10005879-epl-schedule-2021-22-official-list-of-fixtures-for-new-premier-league-season'
# Request the page over HTTP.
page = requests.get(sample_web_page)
# Build the soup with an explicit stdlib parser.
soup = BeautifulSoup(page.content, "html.parser")
# The fixture lines live in <p> tags that carry no class attribute;
# find_all is the modern spelling of the legacy findAll alias.
z = soup.find_all('p', {'class': ''})
print(z)

How to scrape price data on page using BeautifulSoup

I am new to web scraping and am having trouble figuring out how to scrape all the prices on the webpage below. What I tried returns blank; any pointers would be great!
# NOTE(review): both `import bs4` and `from bs4 import BeautifulSoup` are
# present; only one style is needed — the answer below this question makes
# exactly this point, so the code is left as the asker posted it.
import bs4
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from datetime import datetime
from pytz import timezone
import urllib.request
# Listing page whose <h3 class="price"> headings hold the asking prices.
url = 'https://www.remax.ca/find-real-estate'
# Fetch the raw HTML; urlopen returns a file-like response object.
page = urlopen(url)
soup = bs4.BeautifulSoup(page,'html.parser')
# findAll is the legacy bs4 alias for find_all; it returns a (possibly
# empty) list of matching tags. NOTE(review): the question reports this
# comes back blank — presumably the prices are rendered client-side by
# JavaScript and absent from the raw HTML; confirm against the page.
price = soup.findAll('h3', {'class' : 'price'})
First thing, if you use from bs4 import BeautifulSoup, don't use import bs4 too.
Second, write soup = BeautifulSoup(page, 'html.parser')
Then use price = soup.find_all('h3', {'class': 'price'})
After this, you should have all the prices in "price", but you will still need to refine the result, as in this form you will capture all the markup from the h3 tags.
EDIT
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Listing page whose <h3 class="price"> tags hold the asking prices.
url = 'https://www.remax.ca/find-real-estate'
# Fetch the raw HTML; urlopen returns a file-like response object that
# BeautifulSoup can consume directly.
page = urlopen(url)
# Explicit parser avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(page, 'html.parser')
# Every price heading on the page.
price = soup.find_all('h3', {'class': 'price'})
# Print just the text of each tag, dropping the surrounding markup.
for p in price:
    print(p.text)
This should do the job. I eliminated pandas because I do not have it installed.

Crawl site with infinite scrolling

I'm trying to crawl flipkart, but flipkart does not load its page at once. So I'm not able to crawl it. Please help.
from bs4 import BeautifulSoup
import requests

# Flipkart deals page. NOTE(review): Flipkart fills these cards in with
# JavaScript after the initial load, so the class below may not exist in
# the raw HTML that requests receives — confirm by checking r.content.
url = "https://www.flipkart.com/offers-list/weekend-specials?screen=dynamic&pk=contentTheme%3DLS-Nov-Weekend_widgetType%3DdealCard&wid=4.dealCard.OMU&otracker=hp_omu_Weekend+Specials_1"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# Deal cards carry the (framework-generated) class "iUmrbN".
name = soup.find_all("div", {"class": "iUmrbN"})
for i in name:
    # Python 3 print function replaces the original's py2 print statement.
    print(i.text)
This is not giving any output.

list out of range error when using soup.select('placeholder')[0].get_text() in Beautiful soup

New to scraping, and I'm trying to use Beautiful Soup to get the Wheelbase value (eventually other things) from a Wikipedia page (I'll deal with robots.txt later). This is the guide I've been using.
Two questions
1.) How do I resolve the error below?
2.) How do I scrape the value in the cell that contains wheelbase is it just "td#Wheelbase td" ?
The error I get is
File "evscraper.py", line 25, in <module>
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3') [0].get_text()
IndexError: list index out of range
Thanks for any help!
__author__ = 'KirkLazarus'
import re
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import bs4
from bs4 import BeautifulSoup
import requests
# Fetch the Tesla Model S article.
response =requests.get ('https://en.wikipedia.org/wiki/Tesla_Model_S')
# NOTE(review): no parser argument — bs4 will guess one and warn.
soup = bs4.BeautifulSoup(response.text)
# NOTE(review): this is the failing line from the traceback above. There is
# no <div id="Wheelbase"> on the page, so select() returns an empty list and
# [0] raises IndexError. wheelbase_data is also never initialized, so even a
# successful select would raise NameError here. Left unchanged — this bug is
# what the question is asking about.
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3')[0].get_text()
# Python 2 print statement (the question's code predates Python 3 syntax).
print wheelbase_data
Well your first problem is with your selector. There's no div with the ID of "Wheelbase" on that page, so it's returning an empty list.
What follows is by no means perfect, but will get you what you want, only because you know the structure of the page already:
import bs4
import requests

wheelbase_data = {}
response = requests.get('https://en.wikipedia.org/wiki/Tesla_Model_S')
# Explicit parser avoids bs4's "no parser specified" warning.
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# The infobox row labelled "Wheelbase" links to /wiki/Wheelbase; find that
# anchor, then walk up to its table row and read the value cell.
wheelbase = None
for link in soup.find_all('a'):
    if link.get('href') == "/wiki/Wheelbase":
        wheelbase = link
        break

# Fail loudly if the page layout changed, instead of a confusing NameError.
if wheelbase is None:
    raise RuntimeError("no link to /wiki/Wheelbase found on the page")
# anchor -> <th> label cell -> <tr> row; the row's <td> holds the value.
wheelbase_data['Wheelbase'] = wheelbase.parent.parent.td.text
It looks like you're following the incorrect path. I've had to do something similar in the past. I'm not sure if this is the best approach, but it certainly worked for me.
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

car_data = pd.DataFrame()
models = ['Tesla_Model_S', 'Tesla_Model_X']
for model in models:
    wiki = "https://en.wikipedia.org/wiki/{0}".format(model)
    # Wikipedia rejects the default urllib user agent, so present a browser UA.
    header = {'User-Agent': 'Mozilla/5.0'}
    # urllib2 is Python 2 only; urllib.request is its Python 3 home.
    req = urllib.request.Request(wiki, headers=header)
    page = urllib.request.urlopen(req)
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find("table", {"class": "infobox hproduct"})
    # Skip the first two header rows; each remaining row is a label/value pair.
    for row in table.findAll("tr")[2:]:
        try:
            field = row.findAll("th")[0].text.strip()
            val = row.findAll("td")[0].text.strip()
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # supported fast scalar setter.
            car_data.at[model, field] = val
        except (IndexError, AttributeError):
            # Rows lacking both a <th> and a <td> (section headers) are skipped.
            pass
car_data

Python BeautifulSoup cannot find the data in the page source

On this website I can see the data I want, and when I inspect the element I find it; however, it is not there in the page source, so I cannot scrape it:
from bs4 import BeautifulSoup
import urllib.request

# Match page to scrape. NOTE(review): much of this page is filled in by
# JavaScript after load, so several selectors below find only empty tags
# in the raw HTML — which matches the behaviour the question describes.
urls = "http://euw.lolesports.com/tourney/match/1833"
hdr = {'User-Agent': 'Mozilla/5.0'}
# urllib2 is Python 2 only; urllib.request is its Python 3 home.
req = urllib.request.Request(urls, headers=hdr)
page = urllib.request.urlopen(req)
# Explicit parser avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(page, 'html.parser')

tournament = soup.find_all('div', {'class': ['field-item even']})
print(tournament)
print(soup.title.text)

# Kick-off time lives in the datetime attribute of the local-time span.
match_time = soup.find_all('span', {'class': ['local-time']})
for tag in match_time:
    time = tag.get('datetime', None)
    if time is not None:
        print(time)

# Embedded VOD players. The original queried 'href', but iframes carry
# their target in 'src', so that lookup always returned None.
vid = soup.find_all('iframe', {'class': ['media-youtube-player']})
for tag in vid:
    vidlink = tag.get('src', None)
    if vidlink is not None:
        print(vidlink)

teams = soup.find_all('h5', {'class': ['team-name']})
for tag in teams:
    # replace_with is the modern spelling of the legacy replaceWith alias;
    # it detaches the tag from the tree, leaving tag.string still readable.
    tag.replace_with('')
    print(tag.string)

print(soup.find_all('span', {'class': ['winner-holder']}))
I was able to retrieve the title and the teams, but with everything else I had no luck. When I click on each element to inspect it, I can see the data there; however, when I view the page source, they are empty tags, which is why I think I am not getting any results.
Is there a way to overcome this ?

Categories

Resources