Amazon scraper no data display - python

import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
headers = ({'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'})
search_query = 'home office'.replace(' ', '+')
base_url = 'https://www.amazon.com/s?k={0}'.format(search_query)
items = []
for i in range(1, 11):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'})
I don't know why, but each time I run the code it only concatenates the strings and gives me the links to the pages. It doesn't scrape any data from the pages at all.

Main issue is that you never append / return / print your ResultSet:
...
for i in range(1, 11):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    items.extend(soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'}))
print(items)
Example
This will iterate the ResultSet and store each item as a dict with specific information in your list:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
headers = ({'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'})
search_query = 'home office'.replace(' ', '+')
base_url = 'https://www.amazon.com/s?k={0}'.format(search_query)
items = []
for i in range(1, 2):
    print('Processing {0}...'.format(base_url + '&page={0}'.format(i)))
    response = requests.get(base_url + '&page={0}'.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for item in soup.find_all('div', {'class': 's-result-item', 'data-component-type': 's-search-result'}):
        items.append({
            'title':item.h2.text,
            'url': item.a.get('href')
        })
items
Output
[{'title': 'Raven Pro Document Scanner - Huge Touchscreen, High Speed Color Duplex Feeder (ADF), Wireless Scan to Cloud, WiFi, Ethernet, USB, Home or Office Desktop ',
'url': '/sspa/click?ie=UTF8&spc=MTo4NzYzMDkwMjIzMjg0MTI3OjE2NjAzOTk5ODA6c3BfYXRmOjIwMDAyMzU4Mzg0OTg2MTo6MDo6&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&url=%2FRaven-Pro-Document-Scanner-Touchscreen%2Fdp%2FB07MFRJWY6%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dhome%2Boffice%26qid%3D1660399980%26sr%3D8-1-spons%26psc%3D1'},
{'title': 'Home Office Desk Chair, Ergonomic Mesh Executive Office Chair with 3 Position Tilt Function, Comfortable High Back Black Computer Chair with 3D Adjustable Armrest & Lumbar Support, FANMEN ',
'url': '/Ergonomic-Executive-Comfortable-Adjustable-FANMEN/dp/B09KRKX9FT/ref=sr_1_2?keywords=home+office&qid=1660399980&sr=8-2'},
{'title': 'bonsaii Paper Shredder for Home Use,6-Sheet Crosscut Paper and Credit Card Shredder for Home Office,Home Shredder with Handle for Document,Mail,Staple,Clip-3.4 Gal Wastebasket(C237-B) ',
'url': '/bonsaii-Paper-Shredder-6-Sheet-Crosscut-Paper-Design-Home-Shredder-Clip-3-4-Gal-Wastebasket-C237-B/dp/B0834J2SVR/ref=sr_1_3?keywords=home+office&qid=1660399980&sr=8-3'},...]

Related

Web scraping review section on agoda website with post request

I need help with retrieving the review section from the Agoda website.
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm
filename = "hotel.csv"
f = open(filename, "w", encoding="utf-8")
headers = "title, rating, review\n"
f.write(headers)
api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
# for loop for multiple page scrape
for x in tqdm(range(1,10)):
    post_data = {"hotelId":"2252947",
                 "providerId":"332",
                 "demographicId":"0",
                 "page":str(x),
                 "pageSize":"20",
                 "sorting":"7",
                 "providerIds":[332],
                 "isReviewPage":"false",
                 "isCrawlablePage":"true",
                 "filters":{"language":[],"room":[]},
                 "searchKeyword":"",
                 "searchFilters":[]}
    html = requests.post(api_url, data=post_data)
    values = html.text
    soup = BeautifulSoup(values, "html.parser")
    hotels = soup.find_all("div", {"class": "review-comment"})
    for hotel in hotels:
        try:
            rating = hotel.find("div", {"class":"Review-comment-leftScore"}).text
            title = hotel.find("p", {"class":"Review-comment-bodyTitle"}).text
            review = hotel.find("p", {"class":"Review-comment-bodyText"}).text
            f.write(title + ", "+ rating + ", " + review + "\n")
        except TypeError:
            continue
f.close()
This is the post data I get from the Firefox network monitor when I change the page in the review section.
The hotel: Hotel Page
I tried the JSON method but I don't understand it.
I think your API endpoint or data is wrong, because if you just print the response you get <Response [415]>.
It should be 200.
html.json()
{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.13', 'title': 'Unsupported Media Type', 'status': 415, 'traceId': '00-68f23e7f0431e7bffae420112667ed1b-6306a38dd716894d-00'}
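A 415 means the server rejects the body's media type, so the endpoint most likely expects JSON rather than form-encoded data. A minimal sketch, assuming the same payload captured from the network monitor, is to send it with requests' json= parameter (which sets Content-Type: application/json) instead of data=:
import requests

api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
post_data = {"hotelId": "2252947", "providerId": "332", "demographicId": "0",
             "page": "1", "pageSize": "20", "sorting": "7", "providerIds": [332],
             "isReviewPage": "false", "isCrawlablePage": "true",
             "filters": {"language": [], "room": []},
             "searchKeyword": "", "searchFilters": []}

# json= serialises the dict and sets Content-Type: application/json;
# data= sends it form-encoded, which this endpoint appears to reject with 415.
resp = requests.post(api_url, json=post_data, headers=headers)
print(resp.status_code)
print(resp.json() if resp.ok else resp.text)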

How to select first element in multi-valued html tags?

I'm developing a web scraper to collect some information from AllMusic. However, I am having difficulty returning the correct information when there is more than one option inside the tag (e.g. href).
Question: I need to return the first music genre for each artist. My code works when there is one value per artist; however, in situations with more than one music genre, I'm not able to select just the first one.
Here is the code created:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
artists =['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane','The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
performer = []
links = []
genre = []
for artist in artists:
    url= urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
    soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
    div = soup.select("div.name")[0]
    link = div.find_all('a')[0]['href']
    links.append(link)
    for l in links:
        soup = BeautifulSoup(requests.get(l, headers=headers).content, "html.parser")
        divGenre= soup.select("div.genre")[0]
        genres = divGenre.find('a')
        performer.append(artist)
        genre.append(genres.text)
df = pd.DataFrame(zip(performer, genre, links), columns=["artist", "genre", "link"])
df
Hopefully I understand your question right - the main issue is that you iterate the links inside your for-loop, and that causes the repetition.
You may want to change your strategy: try to get all the information in one iteration and store it in a more structured way.
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
artists =['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane','The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
data = []
for artist in artists:
    url= urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
    soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
    link = soup.select_one("div.name a").get('href')
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    data.append({
        'artist':artist,
        'genre':soup.select_one("div.genre a").text,
        'link':link
    })
print(pd.DataFrame(data).to_markdown(index=False))
Output
| artist        | genre      | link                                                       |
|:--------------|:-----------|:-----------------------------------------------------------|
| Alexander 23  | Pop/Rock   | https://www.allmusic.com/artist/alexander-23-mn0003823464  |
| Alex & Sierra | Folk       | https://www.allmusic.com/artist/alex-sierra-mn0003280540   |
| Tion Wayne    | Rap        | https://www.allmusic.com/artist/tion-wayne-mn0003666177    |
| Tom Cochrane  | Pop/Rock   | https://www.allmusic.com/artist/tom-cochrane-mn0000931015  |
| The Waked     | Electronic | https://www.allmusic.com/artist/the-waked-mn0004025091     |
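One caveat with the one-liner selectors: select_one() returns None when nothing matches, so an artist with no search hit or no genre link would raise an AttributeError. A guarded variant of the same loop (same selectors, just with None checks) could look like this:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

artists = ['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane', 'The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
data = []

for artist in artists:
    url = urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
    soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
    name_link = soup.select_one("div.name a")
    if name_link is None:
        # no search result for this artist - record the gap and move on
        data.append({'artist': artist, 'genre': None, 'link': None})
        continue

    link = name_link.get('href')
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    genre_link = soup.select_one("div.genre a")
    data.append({
        'artist': artist,
        'genre': genre_link.text if genre_link else None,
        'link': link
    })

print(pd.DataFrame(data).to_markdown(index=False))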

find() ==> how to extract attribute="value"

I want to extract the attribute value "705-419-1151" from this element:
<a href="javascript:void(0)" class="mlr__item__cta jsMlrMenu" title="Get the Phone Number" data-phone="705-419-1151">
from bs4 import BeautifulSoup
import requests

url='https://www.yellowpages.ca/search/si/2/hvac+services/Ontario+ON'
r = requests.get(url, headers = headers)
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('div', class_ ='listing__content__wrapper')
for item in articles:
    tel = item.find('li' , {'data-phone' : 'attr(data-phone)'}).get()
    print(tel)
How can I do this?
Try to focus while processing the data: select your elements more specifically, and always check that an element is available before calling methods on it:
e.get('data-phone') if(e := item.select_one('[data-phone]')) else None
Example
This example stores the results in a list of dicts, so you can easily create a DataFrame and save it in a specific format.
import requests
import pandas as pd
from bs4 import BeautifulSoup
url='https://www.yellowpages.ca/search/si/2/hvac+services/Ontario+ON'
headers = {'user-agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' , 'Accept-Language': 'en-US, en;q=0.5'}
r = requests.get(url, headers = headers)
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('div', class_ ='listing__content__wrapper')
data = []
for item in articles:
    com = e.get_text(strip=True, separator='\n') if(e := item.select_one('[itemprop="name"]')) else None
    add = e.text.strip() if(e := item.select_one('[itemprop="address"]')) else None
    tel = e.get('data-phone') if(e := item.select_one('[data-phone]')) else None
    data.append({
        'com':com,
        'add':add,
        'tel':tel
    })
#create a csv file with results
pd.DataFrame(data).to_csv('filename.csv', index=False)
Output of data
[{'com': '1\nCity Experts',
'add': '17 Raffia Ave, Richmond Hill, ON L4E 4M9',
'tel': '416-858-3051'},
{'com': '2\nAssociateair Mechanical Systems Ltd',
'add': '40-81 Auriga Dr, Nepean, ON K2E 7Y5',
'tel': '343-700-1174'},
{'com': '3\nAffordable Comfort Heating & Cooling',
'add': '54 Cedar Pointe Dr, Unit 1207 Suite 022, Barrie, ON L4N 5R7',
'tel': '705-300-9536'},
{'com': '4\nHenderson Metal Fabricating Co Ltd',
'add': '76 Industrial Park Cres, Sault Ste Marie, ON P6B 5P2',
'tel': '705-910-5895'},...]

BeautifulSoup Amazon Product Detail

I can't scrape the HTML of the "Product Details" section (scrolling down the webpage you'll find it) using requests or requests_html.
find_all returns a 0-size object... Any help?
from requests import session
from requests_html import HTMLSession
from bs4 import BeautifulSoup

s = HTMLSession()
#s = session()
r = s.get("https://www.amazon.com/dp/B094HWN66Y")
soup = BeautifulSoup(r.text, 'html.parser')
len(soup.find_all("div", {"id":"detailBulletsWrapper_feature_div"}))
Product details with different information:
Code:
from bs4 import BeautifulSoup
import requests
cookies = {'session': '131-1062572-6801905'}
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
r = requests.get("https://www.amazon.com/dp/B094HWN66Y",headers=headers,cookies=cookies)
print(r)
soup = BeautifulSoup(r.text, 'lxml')
key = [x.get_text(strip=True).replace('\u200f\n','').replace('\u200e','').replace(':\n','').replace('\n', '').strip() for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span.a-text-bold')][:13]
#print(key)
value = [x.get_text(strip=True) for x in soup.select('ul.a-unordered-list.a-nostyle.a-vertical.a-spacing-none.detail-bullet-list > li > span > span:nth-child(2)')]
#print(value)
product_details = {k:v for k, v in zip(key, value)}
print(product_details)
Output:
{'ASIN': 'B094HWN66Y', 'Publisher': 'Boldwood Books (September 7, 2021)', 'Publication date':
'September 7, 2021', 'Language': 'English', 'File size': '1883 KB', 'Text-to-Speech': 'Enabled', 'Screen Reader': 'Supported', 'Enhanced typesetting': 'Enabled', 'X-Ray': 'Enabled', 'Word
Wise': 'Enabled', 'Print length': '332 pages', 'Page numbers source ISBN': '1800487622', 'Lending': 'Not Enabled'}
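The two separate select() calls depend on the keys and values lining up one-to-one (hence the [:13] slice). An alternative sketch, assuming the same detail-bullet-list markup, is to walk each li once and take the bold label together with the span that follows it, so the pairs can't drift apart:
import requests
from bs4 import BeautifulSoup

cookies = {'session': '131-1062572-6801905'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}

r = requests.get("https://www.amazon.com/dp/B094HWN66Y", headers=headers, cookies=cookies)
soup = BeautifulSoup(r.text, 'lxml')

product_details = {}
for li in soup.select('ul.detail-bullet-list > li'):
    label = li.select_one('span.a-text-bold')                   # the bold key, e.g. "ASIN"
    value = label.find_next_sibling('span') if label else None  # the span right after it
    if label and value:
        key = label.get_text(strip=True).replace('\u200f', '').replace('\u200e', '').rstrip(' :')
        product_details[key] = value.get_text(strip=True)
print(product_details)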
This is an example of how to scrape the title of the product using bs4 and requests, easily expandable to getting other info from the product.
The reason yours doesn't work is that your request has no headers, so Amazon realises you're a bot and doesn't want you scraping their site. This is shown by your request being returned as <Response [503]> and explained in r.text.
I believe Amazon have an API for this (that they'd probably like you to use) but it'll be fine to scrape like this for small-scale stuff.
import requests
import bs4
# Amazon don't like you scraping them, however these headers should stop them from noticing a small number of requests
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'})
def main():
    url = "https://www.amazon.com/dp/B094HWN66Y"
    title = get_title(url)
    print("The title of %s is: %s" % (url, title))

def get_title(url: str) -> str:
    """Returns the title of the amazon product."""
    # The request
    r = requests.get(url, headers=HEADERS)
    # Parse the content
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    title = soup.find("span", attrs={"id": 'productTitle'}).string
    return title

if __name__ == "__main__":
    main()
Output:
The title of https://www.amazon.com/dp/B094HWN66Y is: Will They, Won't They?
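Since the blocked request shows up as <Response [503]>, it is also worth guarding get_title() so a block page fails with a clear error instead of an AttributeError on .string. A small variant of the function above (same headers and selector, just with checks added):
import requests
import bs4

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
           'Accept-Language': 'en-US, en;q=0.5'}

def get_title(url: str) -> str:
    """Returns the product title, or raises if the page was blocked or changed."""
    r = requests.get(url, headers=HEADERS)
    # Amazon answers bot-flagged requests with a 503 captcha page,
    # so stop here rather than parsing the wrong HTML.
    if r.status_code != 200:
        raise RuntimeError("Request failed with status %s" % r.status_code)
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    title_tag = soup.find("span", attrs={"id": 'productTitle'})
    if title_tag is None:
        raise RuntimeError("productTitle not found - the layout may have changed")
    return title_tag.get_text(strip=True)

print(get_title("https://www.amazon.com/dp/B094HWN66Y"))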

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. In another case, when I use wget from the server to request the page URL, I get the full page, but my script doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on the local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows
this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has an idea how this could be solved, I would be grateful to hear it.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double check the headers (I've modified yours a bit - see code below)
check your query input; maybe some of them are malformed?
don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
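On the last point about request volume: if you run this for many queries from the same EC2, a simple precaution (my own sketch, assuming main_rakuten_product_search() from the code above is defined) is to pause between searches so Akamai/Rakuten has less reason to rate-limit the instance's IP:
from time import sleep

queries = [
    '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース ',
    # add further search keywords here
]

for q in queries:
    print(main_rakuten_product_search(q))
    sleep(2)  # a short pause between searches keeps the request rate low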
