Webscraping Amazon with BeautifulSoup - python

I am trying to webscrape Amazon's reviews: https://www.amazon.com/Python-Crash-Course-Hands-Project-Based/dp/1593276036/ref=sr_1_3?ie=UTF8&qid=1541450645&sr=8-3&keywords=python
Here is my code:
import requests as req
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Kevin\'s_request'}
r = req.get('https://www.amazon.com/Python-Crash-Course-Hands-Project-Based/dp/1593276036/ref=sr_1_3?ie=UTF8&qid=1541450645&sr=8-3&keywords=python', headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
soup.find(class_="a-expander-content a-expander-partial-collapse-content")
I only end up with an empty list. I am using Python 3.6.4 in Jupyter Notebooks and BS 4

Try this approach. It turns out that your selector could not find anything, so I've changed it to serve the purpose:
import requests
from bs4 import BeautifulSoup
def get_reviews(s, url):
    s.headers['User-Agent'] = 'Mozilla/5.0'
    response = s.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    return soup.find_all("div", {"data-hook": "review-collapsed"})

if __name__ == '__main__':
    link = 'https://www.amazon.com/Python-Crash-Course-Hands-Project-Based/dp/1593276036/ref=sr_1_3?ie=UTF8&qid=1541450645&sr=8-3&keywords=python'
    with requests.Session() as s:
        for review in get_reviews(s, link):
            print(f'{review.text}\n')

Not sure what's happening on your side, but this code works fine for me.
Here it goes (Python 3.6, BS4 4.6.3):
import requests
from bs4 import BeautifulSoup
def s_comments(url):
    headers = {'User-Agent': 'Bob\'s_request'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise ConnectionError
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find_all(class_="a-expander-content a-expander-partial-collapse-content")

url = 'https://www.amazon.com/dp/1593276036'
reviews = s_comments(url)
for i, review in enumerate(reviews):
    print('---- {} ----'.format(i))
    print(review.text)

Related

Extracting json when web scraping

I was following a python guide on web scraping and there's one line of code that won't work for me. I'd appreciate it if anybody could help me figure out what the issue is, thanks.
from bs4 import BeautifulSoup
import json
import re
import requests
url = 'https://finance.yahoo.com/quote/AAPL/analysis?p=AAPL'
page = requests.get(url)
soup = BeautifulSoup(page.content, "lxml")
script = soup.find('script', text=re.compile('root\.App\.main'))
json_text = re.search(r'^\s*root\.App\.main\s*=\s*({.*?})\s*;\s*$',script.string, flags=re.MULTILINE).group(1)
Error Message:
json_text = re.search(r'^\s*root\.App\.main\s*=\s*({.*?})\s*;\s*$',script.string, flags=re.MULTILINE).group(1)
AttributeError: 'NoneType' object has no attribute 'string'
Link to the guide I was looking at: https://www.mattbutton.com/how-to-scrape-stock-upgrades-and-downgrades-from-yahoo-finance/
The main issue, in my opinion, is that you should add a user-agent to your request so that you get the expected HTML:
headers = {'user-agent':'Mozilla/5.0'}
page = requests.get(url, headers=headers)
Note: First of all, take a deeper look into your soup to check whether the expected information is available.
Example
import re
import json
import requests
from bs4 import BeautifulSoup
url = 'https://finance.yahoo.com/quote/AAPL/analysis?p=AAPL'
headers = {'user-agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
script = soup.find('script', text=re.compile(r'root\.App\.main'))
json_text = json.loads(re.search(r'^\s*root\.App\.main\s*=\s*({.*?})\s*;\s*$', script.string, flags=re.MULTILINE).group(1))
json_text
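Once json_text is parsed, the analyst upgrade/downgrade data from the guide sits a few levels down in the store objects. The key path and field names below are assumptions based on how Yahoo's root.App.main payload has typically been structured, so verify them against your own output:
stores = json_text['context']['dispatcher']['stores']  # assumed path, inspect your payload
history = stores['QuoteSummaryStore']['upgradeDowngradeHistory']['history']
for entry in history[:5]:
    print(entry['firm'], entry['toGrade'])  # field names are assumptions as well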

When using 'requests.get()', the html is less than the actual html

The HTML of the site fetched in Python is much shorter than the actual HTML I see when pressing F12 on the site, and I don't know why the two results are different. Other sites return their HTML normally, but I wonder why only this site does not.
Here is the Code:
import requests
from bs4 import BeautifulSoup
result = requests.get('https://www.overbuff.com/heroes')
soup = BeautifulSoup(result.text, "html.parser")
print(soup)
You may be blocked by the page and should try it with some headers like:
headers = {"user-agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
Example
import requests
from bs4 import BeautifulSoup
url = "https://www.overbuff.com/heroes"
headers = {"user-agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
print(soup)
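To confirm the headers actually changed what the server returned, a quick sanity check is to look at the status code and search the raw HTML for a string you expect on the page (the hero name 'Ana' below is just an assumed marker):
print(r.status_code)    # 200 means the request was accepted
print('Ana' in r.text)  # True suggests the hero listing is present in the raw HTML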

How to obtain all image links from a Reddit page's posts using requests in Python

Here is the code I am working with, using Python requests; the variable page passed in is "http://www.reddit.com/r/okbuddyretard/top/?t=day". I'm not sure what I am doing wrong such that I am unable to read the HTML and pull all of the images from the posts.
def return_image_links(page):
    page = requests.get(page, headers=headers)
    if page.status_code == 200:
        page.html.render(sleep=5, timeout=8)  # Not sure if I am doing this right either
        urls = page.html.xpath("a[ends-with(#src, '.jpg')]")  # This is the part I can't figure out
        print(urls)
        image_urls = []
        for url in urls:
            if is_image(url):
                image_urls.append(url)
            else:
                image_urls.extend(get_imgur_gallery_links(url))
        return image_urls
    else:
        print('Bad response from reddit server.')
        sys.exit()
The line urls = page.html.xpath("a[ends-with(#src, '.jpg')]") is simply wrong: you seem to be trying to use methods that live in the lxml module, through attributes (html, xpath) that do not exist on the class requests.Response.
The following is an example which uses requests and BeautifulSoup to parse the HTML and access the images:
from bs4 import BeautifulSoup as bs
import requests
import sys

def return_image_links(url, headers):
    page = requests.get(url, headers=headers)
    if page.status_code == 200:
        soup = bs(page.text, "html.parser")
        img_tags = soup.select('img[alt="Post image"]')
        image_urls = []
        for img_tag in img_tags:
            image_urls.append(img_tag['src'])
        return image_urls
    else:
        print('Bad response from reddit server.')
        sys.exit()

url = "http://www.reddit.com/r/okbuddyretard/top/?t=day"
headers = {"User-Agent": 'Mozilla/5.0'}
imgs = return_image_links(url, headers)
print(*imgs, sep='\n')
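If the img tags you want are missing from the raw HTML (parts of new Reddit are rendered client-side), a common alternative is Reddit's JSON listing endpoint, reached by requesting the listing path with a .json suffix. This is a sketch under that assumption; the key layout follows Reddit's public listing format:
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('http://www.reddit.com/r/okbuddyretard/top.json',
                    params={'t': 'day'}, headers=headers)
resp.raise_for_status()
posts = resp.json()['data']['children']  # one entry per post in the listing
image_urls = [p['data']['url'] for p in posts
              if p['data']['url'].endswith(('.jpg', '.png'))]
print(*image_urls, sep='\n')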

Web scraping with beautifulsoup not finding anything

I'm trying to scrape coinmarketcap.com just to get an update of a certain currency's price, and also just to learn how to web scrape. I'm still a beginner and can't figure out where I'm going wrong, because whenever I try to run it, it just tells me there is nothing there, even though I know that element does exist on the page. Any help is appreciated!
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/currencies/electroneum/'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')
price = soup.find('data-currency-price data-usd=')
print (price)
If you are going to be doing a lot of this, consider making a single call to the official API to get all the prices, then extract what you want. The following is from the site, with an amendment by me to show the desired value for Electroneum. The API guidance shows how to retrieve one currency at a time as well, though that requires a higher plan than the basic one.
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'
parameters = {
    'start': '1',
    'limit': '5000',
    'convert': 'USD',
}
headers = {
    'Accepts': 'application/json',
    'X-CMC_PRO_API_KEY': 'yourKey',
}
session = Session()
session.headers.update(headers)
try:
    response = session.get(url, params=parameters)
    # print(response.text)
    data = json.loads(response.text)
    print(data['data'][64]['quote']['USD']['price'])
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
You can always deploy a loop and check against a desired list, e.g.:
interested = ['Electroneum', 'Ethereum']
for item in data['data']:
    if item['name'] in interested:
        print(item)
For your current example:
You can use an attribute selector for data-currency-value
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/currencies/electroneum/'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')
soup.select_one('[data-currency-value]').text
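If the element is absent (the page layout changed or the request was blocked), select_one returns None and .text raises an AttributeError, so a guarded variant of the same lookup can help:
value_tag = soup.select_one('[data-currency-value]')
print(value_tag.text if value_tag is not None else 'price element not found')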
You can use the class attribute to get the value.
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/currencies/electroneum/'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')
price = soup.find('span', attrs={"class": "h2 text-semi-bold details-panel-item--price__value"})
print(price.text)
Output :
0.006778
You can get the value like this:
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/currencies/electroneum/'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')
price = soup.find("span", id="quote_price").get('data-usd')
print (price)
You should try to be more specific in how you want to find the item.
You are currently using soup.find('...'), and I am not sure what you have put inside it, as you wrote data-currency-price data-usd=. Is that an ID or a class name?
Why not try finding the item using an ID:
soup.find(id="link3")
or find by tag:
soup.find("relevant tag name like div or a")
or something like this:
find_this = soup.find("a", id="ID HERE")
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/currencies/electroneum/'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')
x = soup.find(id="quote_price").text
print(x)
It is better to look it up by ID, or to search through soup.find_all(text="data-currency-price data-usd")[1].text

Make a list of all pages of a site in Python

I am trying to list all pages of a site in Python for scraping with BeautifulSoup. What I currently have is this:
team_urls = ['http://www.lyricsfreak.com/e/ed+sheeran/thinking+out+loud_21083784.html',
'http://www.lyricsfreak.com/e/ed+sheeran/photograph_21058341.html',
'http://www.lyricsfreak.com/e/ed+sheeran/a+team_20983411.html',
'http://www.lyricsfreak.com/e/ed+sheeran/i+see+fire_21071421.html',
'http://www.lyricsfreak.com/e/ed+sheeran/perfect_21113253.html',
'http://www.lyricsfreak.com/e/ed+sheeran/castle+on+the+hill_21112527.html',
'http://www.lyricsfreak.com/e/ed+sheeran/supermarket+flowers_21113249.html',
'http://www.lyricsfreak.com/e/ed+sheeran/lego+house_20983415.html',
'http://www.lyricsfreak.com/e/ed+sheeran/even+my+dad+does+sometimes_21085123.html',
'http://www.lyricsfreak.com/e/ed+sheeran/kiss+me_20983414.html',
'http://www.lyricsfreak.com/e/ed+sheeran/shape+of+you_21113143.html',
'http://www.lyricsfreak.com/e/ed+sheeran/i+see+fire_21071421.html'
]
I would like to call a function to pull all sites starting with http://www.lyricsfreak.com/e/ed+sheeran/, as I know the current list is sloppy and there are approximately 30 more available that I'd rather not just add manually.
In Python 2.x, you could create your list of URLs as follows:
from bs4 import BeautifulSoup
import urllib2
base_url = 'http://www.lyricsfreak.com'
request = urllib2.Request(base_url + '/e/ed+sheeran/', headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
response = urllib2.urlopen(request)
soup = BeautifulSoup(response.read(), 'html.parser')
urls = []
for tr in soup.select('tbody tr'):
    urls.append(base_url + tr.td.a['href'])
print urls
This would create a urls list starting:
['http://www.lyricsfreak.com/e/ed+sheeran/a+team_20983411.html', 'http://www.lyricsfreak.com/e/ed+sheeran/afire+love_21084845.html', ...
In Python 3.x, this can be modified as follows:
from bs4 import BeautifulSoup
import urllib.request
base_url = 'http://www.lyricsfreak.com'
resp = urllib.request.urlopen(base_url + '/e/ed+sheeran/')
soup = BeautifulSoup(resp, 'html.parser')
urls = []
for tr in soup.select('tbody tr'):
    urls.append(base_url + tr.td.a['href'])
print(urls)
Or use the requests library as follows:
from bs4 import BeautifulSoup
import requests
base_url = 'http://www.lyricsfreak.com'
response = requests.get(base_url + '/e/ed+sheeran/')
soup = BeautifulSoup(response.text, 'html.parser')
urls = []
for tr in soup.select('tbody tr'):
    urls.append(base_url + tr.td.a['href'])
Install using:
pip install requests
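Whichever variant you use, the scraped table can contain repeat entries (the hand-written list above already has i+see+fire twice), so it may be worth de-duplicating the result while preserving order:
urls = list(dict.fromkeys(urls))  # drops duplicates, keeps first-seen order
print(urls)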
