How do I scrape the product links from this page? - python

I am trying to scrape the product links from an Amazon listing page, but my code only gives me a handful of links.
The page is https://www.amazon.com/s?rh=n%3A1069242&fs=true&ref=lp_1069242_sar
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
r = requests.get('https://www.amazon.com/s?rh=n%3A1069242&fs=true&ref=lp_1069242_sar')
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.find_all('a', href=True):
    print(link['href'])

Here is the working solution:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
base_url = 'https://www.amazon.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36', 'session': '141-2320098-4829807'}
r = requests.get('https://www.amazon.com/s?rh=n%3A1069242&fs=true&ref=lp_1069242_sar', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
# The product-title links all share this class; the hrefs are relative, so join them onto the base URL
for link in soup.find_all('a', class_="a-link-normal s-underline-text s-underline-link-text a-text-normal", href=True):
    href = link['href']
    full_url = urljoin(base_url, href)
    print(full_url)
Output:
https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_browse_office-products_sr_pg1_1?ie=UTF8&adId=A05861132UJ9W79S82Z3&url=%2FFiskars-Inch-Student-Scissors-Pack%2Fdp%2FB08CL355MN%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26qid%3D1633717907%26s%3Doffice-products%26sr%3D1-1-spons%26psc%3D1&qualifier=1633717907&id=1565389383398743&widgetName=sp_atf_browse
https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_browse_office-products_sr_pg1_1?ie=UTF8&adId=A0918144191FAIKGYK3YC&url=%2FFiskars-Inch-Blunt-Kids-Scissors%2Fdp%2FB00TJSS9ZW%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26qid%3D1633717907%26s%3Doffice-products%26sr%3D1-2-spons%26psc%3D1&qualifier=1633717907&id=1565389383398743&widgetName=sp_atf_browse
https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_browse_office-products_sr_pg1_1?ie=UTF8&adId=A09889161KB2CNO5NB8QC&url=%2FLind-Kitchen-Dispenser-Decorative-Stationery%2Fdp%2FB07VRLW5C6%2Fref%3Dsr_1_3_sspa%3Fdchild%3D1%26qid%3D1633717907%26s%3Doffice-products%26sr%3D1-3-spons%26psc%3D1&qualifier=1633717907&id=1565389383398743&widgetName=sp_atf_browse
https://www.amazon.com/Zebra-Pen-Retractable-Ballpoint-18-Count/dp/B00M382RJO/ref=sr_1_4?dchild=1&qid=1633717907&s=office-products&sr=1-4
...and so on.
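The sponsored results come back wrapped in picassoRedirect URLs. If you want the underlying product page instead, one approach (a sketch, assuming the wrapper always carries the product path in a url query parameter, as in the output above) is to unquote it:

from urllib.parse import urlparse, parse_qs, urljoin

def unwrap_sponsored(link, base_url='https://www.amazon.com'):
    # Sponsored links hide the product path in a `url` query parameter;
    # parse_qs percent-decodes it. Plain links pass through unchanged.
    parsed = urlparse(link)
    if 'picassoRedirect' in parsed.path:
        qs = parse_qs(parsed.query)
        if 'url' in qs:
            return urljoin(base_url, qs['url'][0])
    return link

Calling unwrap_sponsored(full_url) inside the loop above would print the /dp/... product pages rather than the redirect wrappers.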

Related

Python requests returning a different result than the original page (browser)

I am trying to build a simple web scraper to monitor Nike's site here in Brazil.
Basically, I want to track products that are in stock right now, to check when new products are added.
My problem is that when I navigate to https://www.nike.com.br/snkrs#estoque in a browser, I see different products compared to what I get with Python's requests.
Here is the code I am using:
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
url = 'https://www.nike.com.br/snkrs#estoque'
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
len(soup.find_all(class_='produto produto--comprar'))
This code gives me 40, but in the browser I can see 56 products (screenshot: https://prnt.sc/26jeo1i).
The data comes from a different endpoint, split across 3 pages:
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
productList = []
for p in [1, 2, 3]:
    # Each page of the stock listing is served separately
    url = f'https://www.nike.com.br/Snkrs/Estoque?p={p}&demanda=true'
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    productList += soup.find_all(class_='produto produto--comprar')
Output:
print(len(productList))
56
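From there you can pull whatever you need out of each product card. A minimal sketch, assuming each produto--comprar element wraps an <a> tag with the product link (worth verifying against the live markup):

for card in productList:
    a = card.find('a', href=True)
    if a:
        print(a['href'])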

I am scraping the reviews of a post but nothing is returned

I am trying to scrape the reviews from this page, but nothing is scraped. Please help me solve it; I would be very thankful.
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
r = requests.get('https://www.realpatientratings.com/botox-cosmetic')
soup = BeautifulSoup(r.content, 'lxml')
tag = soup.find_all('p', class_='text')
for u in tag:
    print(u.text)
After checking the XHR requests, I found out that you're fetching the wrong page: the reviews are loaded from a separate endpoint.
Try:
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
r = requests.get('https://www.realpatientratings.com/reviews/procreviewfilters?type=surgical&star=&procedureId=147&sort=new&location=&state=0&within=0')
soup = BeautifulSoup(r.content, 'lxml')
tag = soup.find_all('p', class_='text')
for u in tag:
    print(u.text)
The only change is the URL: https://www.realpatientratings.com/botox-cosmetic became https://www.realpatientratings.com/reviews/procreviewfilters?type=surgical&star=&procedureId=147&sort=new&location=&state=0&within=0, the XHR endpoint that actually returns the reviews.
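For readability, the same request can also be written with requests' params argument; the names and values below are exactly the ones from the query string above:

import requests
from bs4 import BeautifulSoup

params = {
    'type': 'surgical',
    'star': '',
    'procedureId': 147,  # taken from the endpoint above
    'sort': 'new',
    'location': '',
    'state': 0,
    'within': 0,
}
r = requests.get('https://www.realpatientratings.com/reviews/procreviewfilters', params=params)
soup = BeautifulSoup(r.content, 'lxml')
for u in soup.find_all('p', class_='text'):
    print(u.text)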

How to collect all specified href's?

In this test model I can collect the href value of the first ('tr', class_='rowLive'). I've tried to write a loop to collect all the other hrefs, but it always gives IndentationError: expected an indented block, or complains that I'm trying to use find instead of find_all.
How should I proceed to collect all the hrefs?
import requests
from bs4 import BeautifulSoup
url = 'http://sports.williamhill.com/bet/pt/betlive/9'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
jogos = soup.find_all('tr', class_='rowLive')
jogo = jogos[0]
linksgame = jogo.find('a', href=True).attrs['href'].strip()
print(linksgame)
jogos is a list, so you can loop over it and call find() for an <a> on every iteration:
import requests
from bs4 import BeautifulSoup
url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
for tag in jogos:
    print(tag.find("a", href=True)["href"])
Or:
print([tag.find("a", href=True)["href"] for tag in jogos])
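One caveat: if any rowLive row happens to lack an <a> tag, find() returns None and the subscript raises a TypeError. A defensive variant (a sketch; the page as shown may never hit this case):

links = []
for tag in jogos:
    a = tag.find("a", href=True)
    if a is not None:  # skip rows without a link
        links.append(a["href"])
print(links)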

Web scraping a website with JSON content gives ValueError

I am trying to scrape an API call with requests. This is the endpoint: https://www.nseindia.com/api/event-calendar
Following is the error that it gives me:
ValueError: No JSON object could be decoded
Following is the code:
import requests
import json
import time
from bs4 import BeautifulSoup
url = 'https://www.nseindia.com/api/event-calendar'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
request = requests.get(url,headers=headers)
data = json.loads(request.text)
print(data)
How can I scrape this website?
Try this:
import requests
from bs4 import BeautifulSoup
url = 'https://www.nseindia.com/companies-listing/corporate-filings-event-calendar'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
request = requests.get(url,headers=headers)
soup = BeautifulSoup(request.text,'html.parser')
print(soup)
The table is probably being dynamically generated with JavaScript, so requests alone won't see it. You need Selenium and a headless browser for that.
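A minimal Selenium sketch along those lines (assuming Chrome and a matching chromedriver are installed; the exact table selector would still need to be checked against the rendered page):

import time
from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)

driver.get('https://www.nseindia.com/companies-listing/corporate-filings-event-calendar')
time.sleep(5)  # crude wait for the JS to render; WebDriverWait would be cleaner
soup = BeautifulSoup(driver.page_source, 'html.parser')  # page_source includes JS-rendered HTML
driver.quit()

print(soup.find('table'))  # the calendar table, once rendered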

When I use urllib2 to crawl a website, the result has no tags such as html, body

import urllib2
url = 'http://www.bilibili.com/video/av1669338'
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers={"User-Agent":user_agent}
request=urllib2.Request(url,headers=headers)
response=urllib2.urlopen(request)
text = response.read()
text[:100]
'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\xcd}ys\x1bG\xb2\xe7\xdfV\xc4|\x87\x1exhRk\x81\xb8\x08\x10\x90E\xfa\x89\xb2f\x9f\xe3\xd9\xcf\x9e\x1dyb7\xec\tD\x03h\x90\x90p\t\x07)yf"D\xf9I&EI\xd4}\x91\xb6.\xeb\xb0e\x93\x94%Y\xbc$E\xccW\x194\x00\xfe\xe5\xaf\xf0~Y\xd5\xd5\xa8\xeeF\x83\xa7'
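Those first two bytes, \x1f\x8b, are the gzip magic number: the server returned a gzip-compressed body, and urllib2 does not decompress it for you. If you want to stay on urllib2, a minimal sketch of decompressing by hand (Python 2, matching the question):

import urllib2, gzip
from StringIO import StringIO

url = 'http://www.bilibili.com/video/av1669338'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
raw = response.read()
# Decompress only when the server says the body is gzipped
if response.info().get('Content-Encoding') == 'gzip':
    raw = gzip.GzipFile(fileobj=StringIO(raw)).read()
print raw[:100]  # now plain HTML

Both answers below switch to requests, which handles gzip transparently: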
import requests
from bs4 import BeautifulSoup
def data():
    url = 'http://www.bilibili.com/video/av1669338'
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    response = requests.get(url, headers=headers)  # requests decompresses gzip automatically
    data = response.content
    _html = BeautifulSoup(data, 'html.parser')
    _meta = _html.head.select('meta[name=keywords]')
    print _meta[0]['content']

data()
Try this:
import bs4, requests
res = requests.get("http://www.bilibili.com/video/av1669338")
soup = bs4.BeautifulSoup(res.content, "lxml")
result = soup.find("meta", attrs = {"name":"keywords"}).get("content")
print result
