BeautifulSoup Amazon Product Detail - python

I can't scrape the HTML of the "Product Details" section (you'll find it by scrolling down the webpage) using requests or requests_html.
find_all returns a 0-size object... Any help?
from bs4 import BeautifulSoup
from requests import session
from requests_html import HTMLSession

s = HTMLSession()
#s = session()
r = s.get("https://www.amazon.com/dp/B094HWN66Y")
soup = BeautifulSoup(r.text, 'html.parser')
len(soup.find_all("div", {"id":"detailBulletsWrapper_feature_div"}))

Product details with different information:
Code:
from bs4 import BeautifulSoup
import requests
cookies = {'session': '131-1062572-6801905'}
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
r = requests.get("https://www.amazon.com/dp/B094HWN66Y",headers=headers,cookies=cookies)
print(r)
soup = BeautifulSoup(r.text, 'lxml')
key = [x.get_text(strip=True).replace('\u200f\n','').replace('\u200e','').replace(':\n','').replace('\n', '').strip() for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span.a-text-bold')][:13]
#print(key)
value = [x.get_text(strip=True) for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span:nth-child(2)')]
#print(value)
product_details = {k: v for k, v in zip(key, value)}
print(product_details)
Output:
{'ASIN': 'B094HWN66Y', 'Publisher': 'Boldwood Books (September 7, 2021)', 'Publication date': 'September 7, 2021', 'Language': 'English', 'File size': '1883 KB', 'Text-to-Speech': 'Enabled', 'Screen Reader': 'Supported', 'Enhanced typesetting': 'Enabled', 'X-Ray': 'Enabled', 'Word Wise': 'Enabled', 'Print length': '332 pages', 'Page numbers source ISBN': '1800487622', 'Lending': 'Not Enabled'}
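A caveat: zipping two separately selected lists can drift out of alignment if any bullet is missing its value span. A more robust pairing (a sketch, assuming the same detail-bullet-list markup as above) reads the label and value from within each li so they can't get out of sync:
# Sketch: pair each label with the value from the same bullet.
details = {}
for li in soup.select('ul.detail-bullet-list > li'):
    label = li.select_one('span.a-text-bold')
    if label is None:
        continue
    value = label.find_next_sibling('span')
    if value is None:
        continue
    # strip the RTL/LTR marks and the trailing colon from the label
    k = label.get_text(strip=True).replace('\u200f', '').replace('\u200e', '').strip().rstrip(':').strip()
    details[k] = value.get_text(strip=True)
print(details)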

This is an example of how to scrape the title of the product using bs4 and requests, and it's easily expandable to getting other info from the product.
The reason yours doesn't work is that your request has no headers, so Amazon realises you're a bot and doesn't want you scraping their site. This is shown by your request coming back as <Response [503]> and explained in r.text.
I believe Amazon has an API for this (which they'd probably like you to use), but scraping like this should be fine for small-scale stuff.
import requests
import bs4

# Amazon doesn't like you scraping them, but these headers should stop them
# from noticing a small number of requests
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})

def main():
    url = "https://www.amazon.com/dp/B094HWN66Y"
    title = get_title(url)
    print("The title of %s is: %s" % (url, title))

def get_title(url: str) -> str:
    """Returns the title of the amazon product."""
    # The request
    r = requests.get(url, headers=HEADERS)
    # Parse the content
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    title = soup.find("span", attrs={"id": 'productTitle'}).string
    return title

if __name__ == "__main__":
    main()
Output:
The title of https://www.amazon.com/dp/B094HWN66Y is: Will They, Won't They?
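One caveat: .string raises AttributeError when the span is missing, e.g. when Amazon serves the 503 bot page instead of the product. A defensive sketch of the same lookup (get_title_safe is a hypothetical variant, not part of the code above):
def get_title_safe(url: str):
    """Like get_title, but returns None instead of raising when blocked."""
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        return None  # likely blocked, e.g. the 503 bot-detection page
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    tag = soup.find("span", attrs={"id": "productTitle"})
    return tag.get_text(strip=True) if tag else None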

Related

Amazon scraper no data display

import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

headers = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})
search_query = 'home office'.replace(' ', '+')
base_url = 'https://www.amazon.com/s?k={0}'.format(search_query)
items = []
for i in range(1, 11):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'})
I don't know why, but each time I run the code it only appends the strings together and gives me the links to the pages. It doesn't scrape any data from the page at all. I attached a screenshot of my screen as well.
The main issue is that you never append / return / print your ResultSet:
...
for i in range(1, 11):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    items.extend(soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'}))
print(items)
Example
This will iterate the ResultSet and store each item as a dict with specific information in your list:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

headers = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})
search_query = 'home office'.replace(' ', '+')
base_url = 'https://www.amazon.com/s?k={0}'.format(search_query)
items = []
for i in range(1, 2):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for item in soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'}):
        items.append({
            'title': item.h2.text,
            'url': item.a.get('href')
        })
items
Output
[{'title': 'Raven Pro Document Scanner - Huge Touchscreen, High Speed Color Duplex Feeder (ADF), Wireless Scan to Cloud, WiFi, Ethernet, USB, Home or Office Desktop ',
'url': '/sspa/click?ie=UTF8&spc=MTo4NzYzMDkwMjIzMjg0MTI3OjE2NjAzOTk5ODA6c3BfYXRmOjIwMDAyMzU4Mzg0OTg2MTo6MDo6&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&url=%2FRaven-Pro-Document-Scanner-Touchscreen%2Fdp%2FB07MFRJWY6%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dhome%2Boffice%26qid%3D1660399980%26sr%3D8-1-spons%26psc%3D1'},
{'title': 'Home Office Desk Chair, Ergonomic Mesh Executive Office Chair with 3 Position Tilt Function, Comfortable High Back Black Computer Chair with 3D Adjustable Armrest & Lumbar Support, FANMEN ',
'url': '/Ergonomic-Executive-Comfortable-Adjustable-FANMEN/dp/B09KRKX9FT/ref=sr_1_2?keywords=home+office&qid=1660399980&sr=8-2'},
{'title': 'bonsaii Paper Shredder for Home Use,6-Sheet Crosscut Paper and Credit Card Shredder for Home Office,Home Shredder with Handle for Document,Mail,Staple,Clip-3.4 Gal Wastebasket(C237-B) ',
'url': '/bonsaii-Paper-Shredder-6-Sheet-Crosscut-Paper-Design-Home-Shredder-Clip-3-4-Gal-Wastebasket-C237-B/dp/B0834J2SVR/ref=sr_1_3?keywords=home+office&qid=1660399980&sr=8-3'},...]

Web scraping review section on agoda website with post request

I need help retrieving the review section from the Agoda website.
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm

filename = "hotel.csv"
f = open(filename, "w", encoding="utf-8")
f.write("title, rating, review\n")

api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
# note: this dict was originally missing its "User-Agent" key (it was a set, not a dict)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
# for loop for multiple page scrape
for x in tqdm(range(1, 10)):
    post_data = {"hotelId": "2252947",
                 "providerId": "332",
                 "demographicId": "0",
                 "page": str(x),
                 "pageSize": "20",
                 "sorting": "7",
                 "providerIds": [332],
                 "isReviewPage": "false",
                 "isCrawlablePage": "true",
                 "filters": {"language": [], "room": []},
                 "searchKeyword": "",
                 "searchFilters": []}
    html = requests.post(api_url, data=post_data)
    values = html.text
    soup = BeautifulSoup(values, "html.parser")
    hotels = soup.find_all("div", {"class": "review-comment"})
    for hotel in hotels:
        try:
            rating = hotel.find("div", {"class": "Review-comment-leftScore"}).text
            title = hotel.find("p", {"class": "Review-comment-bodyTitle"}).text
            review = hotel.find("p", {"class": "Review-comment-bodyText"}).text
            f.write(title + ", " + rating + ", " + review + "\n")
        except TypeError:
            continue
f.close()
The POST data is what I get from the Firefox network monitor when I change pages in the review section.
The hotel: Hotel Page
I tried the JSON method but I don't understand it.
I think your API endpoint or data is wrong, because if you just print the response you get <Response [415]>.
It should be 200.
html.json()
{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.13', 'title': 'Unsupported Media Type', 'status': 415, 'traceId': '00-68f23e7f0431e7bffae420112667ed1b-6306a38dd716894d-00'}
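A 415 "Unsupported Media Type" usually means the server wants a JSON body rather than the form-encoded one that data= sends (form encoding also mangles the nested providerIds and filters values). A sketch of the likely fix, untested against the Agoda endpoint:
# assumption: the endpoint expects application/json, which requests' json=
# parameter sets automatically
html = requests.post(api_url, json=post_data, headers=headers)
print(html.status_code)  # should be 200 once the content type matches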

Indeed Job Scraper only for Postings with External Link

Currently using the below Python scraper to pull job title, company, salary, and description. Looking for a way to take it one step further by filtering only results where the application link is a URL to the company website, as opposed to the 'Easily Apply' postings that send the application through Indeed. Is there a way to do this?
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Software%20Engineer&l=Austin%2C%20TX&ts=1630951951455&rq=1&rsIdx=1&fromage=last&newcount=6&vjk=c8f4815c6ecfa793'
    r = requests.get(url, headers=headers)  # 200 is OK, 404 is page not found
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

# <span title="API Developer"> API Developer </span>
def transform(soup):
    divs = soup.find_all('div', class_='slider_container')
    for item in divs:
        if item.find(class_='label'):
            continue  # need to fix: if a job has a 'new' span before the title span, this skips the job completely
        title = item.find('span').text.strip()
        company = item.find('span', class_="companyName").text.strip()
        description = item.find('div', class_="job-snippet").text.strip().replace('\n', '')
        try:
            salary = item.find('span', class_="salary-snippet").text.strip()
        except:
            salary = ""
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'description': description
        }
        jobList.append(job)
        # print("Seeking a: "+title+" to join: "+company+" paying: "+salary+". Job description: "+description)
    return

jobList = []
# go through multiple pages
for i in range(0, 100, 10):  # 0-90, stepping in 10s
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)
print(len(jobList))
df = pd.DataFrame(jobList)
print(df.head())
df.to_csv('jobs.csv')
My approach is as follows:
Find the href from the <a> tag for each job card on the initial page, then send a request to each of those links and grab the external job link from there (if the "Apply on Company Site" button is available).
Code snippet:
# function which gets external job links
def get_external_link(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # if the Apply On Company Site button is available, fetch the link
    if soup.find('a', attrs={"referrerpolicy": "origin"}) is not None:
        external_job_link = soup.find('a', attrs={"referrerpolicy": "origin"})
        print(external_job_link['href'])

# add this piece of code to the transform function
def transform(soup):
    cards = soup.find('div', class_='mosaic-provider-jobcards')
    links = cards.find_all("a", class_=lambda value: value and value.startswith("tapItem"))
    # for each job link in the page, call get_external_link
    for link in links:
        get_external_link('https://www.indeed.com' + (link['href']))
Note: you can also use the page source of these new requests to fetch data like the title, company, salary, and description that you previously scraped from the main page.
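To actually filter the job list rather than just print the links, one possible tweak (a sketch; this return-based variant is an assumption, not tested against Indeed) is to have get_external_link return the URL so transform() can skip postings without one:
def get_external_link(url):
    """Returns the external application URL, or None for 'Easily Apply' posts."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    a = soup.find('a', attrs={"referrerpolicy": "origin"})
    return a['href'] if a else None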

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I use wget from the server to request the page URL, I get the full page; it's only my script that doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows
the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>.
If anyone has any idea how this could be done, I would be grateful to hear it.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double check the headers (I've modified yours a bit - see code below)
check your query input; maybe some of them are malformed?
don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
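And following the "don't spray the server" advice above, here's a small sketch of how several queries from one EC2 could be paced (the query strings below are illustrative):
import time

# hypothetical batch of queries; sleeping between them gives Akamai less
# reason to flag the EC2's traffic
for q in ['スマートキー 電波遮断ケース', 'キーケース']:
    print(main_rakuten_product_search(q))
    time.sleep(2)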

Python Attribute error in web scraping for a popup

I am trying to scrape https://steamcommunity.com/market/listings/730/AWP%20%7C%20Atheris%20%28Field-Tested%29 and extract the price of each listed item together with its Inspect in Game link.
You can find the Inspect in Game button by clicking on the arrow popup that appears when you move the cursor over the image of any listed item below.
I managed to scrape the price data, but the same method doesn't scrape the Inspect in Game link.
Here's my code
from bs4 import BeautifulSoup
import requests
import re

print("Fetching Data")
url = 'https://steamcommunity.com/market/listings/730/AWP%20%7C%20Atheris%20%28Field-Tested%29'
response = requests.get(url)
# getting the source code of the page: response.text
data = response.text
soup = BeautifulSoup(data, 'html.parser')
jobs = soup.find_all('div', {'class': re.compile(r'market_listing_row market_recent_listing_row listing_\d')})
for job in jobs:
    price_tag = job.find('span', {'class': 'market_listing_price market_listing_price_with_fee'})
    price = price_tag.text[8:] if price_tag else "Sold!"
    link = job.find('a', {'class': 'popup_menu_item'}).get('href')
    print('PRICE : ', price, link)
I got the error AttributeError: 'NoneType' object has no attribute 'get', probably because this class only appears in the HTML after someone clicks the arrow button on the picture, which I can't do from the code. Can anyone suggest a way to get the link of the Inspect in Game button?
Go through the API. I believe it already excludes sold items:
from bs4 import BeautifulSoup
import requests
import re
import math
import time

print("Fetching Data")
with requests.Session() as s:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    s.get('https://steamcommunity.com/')
    cookiesDict = s.cookies.get_dict()
    headers.update({'Cookie': 'sessionid=' + cookiesDict['sessionid']})

    url = 'https://steamcommunity.com/market/listings/730/AWP%20%7C%20Atheris%20%28Field-Tested%29/render/'
    payload = {
        'query': '',
        'start': '0',
        'count': '100',
        'country': 'US',
        'language': 'english',
        'currency': '1'}

    results = {}
    jsonData = s.get(url, params=payload, headers=headers).json()
    total_count = jsonData['total_count']
    total_pages = math.ceil(total_count/100)

    for page in range(0, total_pages):
        #page=1
        time.sleep(5)
        payload = {
            'query': '',
            'start': '%s' % (page*100),
            'count': '100',
            'country': 'US',
            'language': 'english',
            'currency': '1'}
        jsonData = s.get(url, params=payload, headers=headers).json()
        results.update(jsonData['listinginfo'])
        print('Acquired page %s of %s...' % (page, total_pages))

for k, v in results.items():
    price = (v['converted_price'] + v['converted_fee']) / 100
    link = v['asset']['market_actions'][0]['link']
    print('PRICE : $%.02f %s' % (price, link))
Output:
....
PRICE : $4.64 steam://rungame/730/76561202255233023/+csgo_econ_action_preview%20M%listingid%A%assetid%D2353320232219212327
PRICE : $4.64 steam://rungame/730/76561202255233023/+csgo_econ_action_preview%20M%listingid%A%assetid%D5237312699887394313
PRICE : $4.64 steam://rungame/730/76561202255233023/+csgo_econ_action_preview%20M%listingid%A%assetid%D17060803922484197923
PRICE : $4.64 steam://rungame/730/76561202255233023/+csgo_econ_action_preview%20M%listingid%A%assetid%D3054184006200365805
PRICE : $4.64 steam://rungame/730/76561202255233023/+csgo_econ_action_preview%20M%listingid%A%assetid%D5237312699887394313
....
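Note that the links still contain the %listingid% and %assetid% placeholders. A sketch of filling them in from the same JSON (treating the field layout as an assumption: the listinginfo key is the listing id and the asset carries its own id):
for k, v in results.items():
    link = v['asset']['market_actions'][0]['link']
    # k is the listing id; v['asset']['id'] is the asset id (both strings)
    inspect_link = link.replace('%listingid%', k).replace('%assetid%', v['asset']['id'])
    print(inspect_link)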
