beautifulsoup not returning all html

beautifulsoup not returning all html - python

import requests
from bs4 import BeautifulSoup
# Fetch the Amazon search results page; no custom headers are sent with this request.
r = requests.get('https://www.amazon.com/s?k=iphone+5s&ref=nb_sb_noss')
c = r.content
# Parse the raw response bytes with the 'html.parser' backend.
soup = BeautifulSoup(c, 'html.parser')
# Collect every <span> whose class attribute matches this class string.
# NOTE: the name `all` shadows the Python builtin all().
all = soup.find_all("span", {"class": "a-size-medium a-color-base a-text-normal"})
print(all)
This is my simple Python script that tries to scrape a page on Amazon, but not all of the HTML is returned in the "soup" variable; therefore I get nothing when trying to find a specific series of tags and extract them.

Try the code below; it should do the trick for you.
You forgot to add headers to your request.
import requests
from bs4 import BeautifulSoup
# Scrape the product-title <span> elements from an Amazon search results page.
# A browser-like User-Agent header is sent with the request; the version
# without headers did not get the full HTML back.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.com/s?k=iphone+5s&ref=nb_sb_noss'
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx status instead of silently parsing an error page.
response.raise_for_status()
# Dump the raw HTML so the returned page can be inspected by eye.
print(response.text)
soup = BeautifulSoup(response.content, features="lxml")
my_all = soup.find_all("span", {"class": "a-size-medium a-color-base a-text-normal"})
print(my_all)

Related

Scraping website URL from Google Image Search result

Search link: https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR
from bs4 import BeautifulSoup
import requests  # requests.get() is used below but was never imported in this snippet

search_link = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'
all_links = []
# NOTE: search_link contains no '{}' placeholder, so format(i) returns the
# same URL on every pass of this loop.
for i in range(1, 10):
    url = search_link.format(i)
    #print("url: " +url)
    # The request is sent without any custom headers.
    r = requests.get(url)
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    # Only anchors that carry this class and have an href attribute are kept.
    all = soup.find_all('a', {'class': 'ArticleTeaserSearchResultItem_link'}, href=True)
    for item in all:
        print(item)
        print(item['href'])
        all_links.append(item['href'])
print(all_links)
I found some code on the internet, but it is not working. After I run the code, the list is empty. Does anyone have any idea about this? Thank you very much.

You don't specify headers, so your request returns an empty page. One more thing: there is no ArticleTeaserSearchResultItem_link class on the page. You can try this code:
from bs4 import BeautifulSoup
import requests
# Collect the distinct absolute https:// links found inside the
# <div id="search"> container of a Google search results page.
url = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'
# Browser-like headers; the request without them came back as an empty page.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}
all_links = []
# Without a timeout, requests can wait indefinitely on an unresponsive server.
soup = BeautifulSoup(requests.get(url, headers=headers, timeout=30).text, 'lxml')
# find() returns None when the page has no such container; guard against it
# so the script prints an empty list instead of raising AttributeError.
search_div = soup.find('div', {'id': 'search'})
if search_div is not None:
    # href=True skips anchors without an href attribute; for those,
    # .get('href') is None and calling .startswith() on it would raise.
    for anchor in search_div.find_all('a', href=True):
        link = anchor['href']
        if link.startswith('https://'):
            all_links.append(link)
# De-duplicate before printing (set order is arbitrary).
print(list(set(all_links)))
OUTPUT:
['https://www.quadrantkindercentra.nl/pedagogisch-werkplan-kdv-villa-kakelbont-2020/?w=5.6.3254479.1.27.32.red+iphone+6s', 'https://itstechprice.com/apple-iphone-6-price-in-nigeria/', 'https://olist.ng/mobile_phones-apple-iphone_6', 'https://naijaprice.com/apple-iphone-prices-in-nigeria/', 'https://www.walmart.com/browse/cell-phones/apple-ios-prepaid-phones/1105910_4527935_1072335_1231295_1231296', 'https://www.amazon.in/VcareGadGets-Apple-iPhone-Shining-Gloss/dp/B07FND9S6M', 'https://www.amazon.in/VCARE-GADGETS-Marble-White-iPhone/dp/B07P8CQZNY']

Beautiful soup text returns blank

I'm trying to scrape a website, but it returns blank. Can you help, please? What am I missing?
import requests
from bs4 import BeautifulSoup
# Page to scrape.
URL = 'https://ks.wjx.top/jq/50921280.aspx'
# The request is sent without any custom headers.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Print only the text content of the parsed document.
print(soup.text)

To get a response, add the User-Agent header to requests.get(); otherwise, the website thinks that you're a bot and will block you.
import requests
from bs4 import BeautifulSoup
# Fetch the page with a browser-like User-Agent and print the parsed HTML.
URL = "https://ks.wjx.top/jq/50921280.aspx"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
# Without a timeout, requests can wait indefinitely on an unresponsive server.
page = requests.get(URL, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx status instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
print(soup.prettify())

Elements on page don't exist when scraping wsj.com

I am using Python to scrape a webpage. This is my code:
import requests
from bs4 import BeautifulSoup
# Set local variables
URL = 'https://www.wsj.com/market-data/bonds'
# The request is sent without any custom headers.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Get Master data table and Last update from URL
# NOTE: the class string below ends with a trailing space.
table = soup.find("table", attrs={"class": "WSJTables--table--1QzSOCfq "})
print(table)
The result of that code is nothing--I can't find the table and not sure why.
Any suggestions?

You need to add the User-Agent header; otherwise the page thinks that you're a bot and will block you. Also note that you had an extra space in your class name.
import requests
from bs4 import BeautifulSoup
# Fetch the WSJ bonds page with a browser-like User-Agent and print the
# data table identified by its class name.
URL = 'https://www.wsj.com/market-data/bonds'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
# Without a timeout, requests can wait indefinitely on an unresponsive server.
page = requests.get(URL, headers=HEADERS, timeout=30)
# Fail loudly on a 4xx/5xx status instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# The class name is written without a trailing space.
table = soup.find("table", attrs={"class": "WSJTables--table--1QzSOCfq"})
print(table)

BeautifulSoup Find periodically returns None

I am trying to get a value from a class. Sometimes find returns the value I need, but at other times it does not work.
Code:
import requests
from bs4 import BeautifulSoup
# Catalog page to scrape.
url = 'https://beru.ru/catalog/molotyi-kofe/76321/list'
# Browser-like User-Agent; the two adjacent string literals are joined into one value.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Take the fifth whitespace-separated token of the matched <div>'s text.
# NOTE: find() returns None when no such <div> is present, and .text then raises AttributeError.
item_count = (soup.find('div', class_='_2StYqKhlBr')).text.split()[4]
print(item_count)

The reason you get the value sometimes and sometimes not is that the website is protected by a CAPTCHA.
When the request is blocked by the CAPTCHA,
the URL becomes something like the following:
https://beru.ru/showcaptcha?retpath=https://beru.ru/catalog/molotyi-kofe/76321/list?ncrnd=4561_aa1b86c2ca77ae2b0831c4d95b9d85a4&t=0/1575204790/b39289ef083d539e2a4630548592a778&s=7e77bfda14c97f6fad34a8a654d9cd16
You can verify this by parsing the response content:
import requests
from bs4 import BeautifulSoup
# Check whether the catalog request was answered with a CAPTCHA page:
# print the expected catalog <div> elements (if any) and the source URL of
# every CAPTCHA image found in the response.
r = requests.get(
    'https://beru.ru/catalog/molotyi-kofe/76321/list')
soup = BeautifulSoup(r.text, 'html.parser')
# find_all is the current method name; findAll is a legacy alias for it.
for item in soup.find_all('div', attrs={'class': '_2StYqKhlBr _1wAXjGKtqe'}):
    print(item)
for item in soup.find_all('div', attrs={'class': 'captcha__image'}):
    for captcha in item.find_all('img'):
        print(captcha.get('src'))
And you will get the CAPTCHA image link:
https://beru.ru/captchaimg?aHR0cHM6Ly9leHQuY2FwdGNoYS55YW5kZXgubmV0L2ltYWdlP2tleT0wMEFMQldoTnlaVGh3T21WRmN4NWFJRUdYeWp2TVZrUCZzZXJ2aWNlPW1hcmtldGJsdWU,_0/1575206667/b49556a86deeece9765a88f635c7bef2_df12d7a36f0e2d36bd9c9d94d8d9e3d7

Beautiful Soup in Python cannot find id despite the id existing

The soup.find method returns None instead of the product title, despite productTitle existing in the page.
It works on amazon.it but not on amazon.com.
import requests
from bs4 import BeautifulSoup
# Product page to scrape.
url = r'https://www.amazon.com/SanDisk-128GB-Extreme-microSD-Adapter/dp/B07FCMKK5X/ref=sr_1_1?fst=as:off&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=HWWSV1CX6VJBC57MRVP6&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1564513802&rnid=16225007011&s=computers-intl-ship&sr=1-1'
# Browser-like User-Agent sent with the request.
headers = {'User-Agent' : r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' }
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# Look up the element with id="productTitle"; find() returns None when it is absent.
# NOTE: despite its name, `fullprice` holds the product-title element.
fullprice = soup.find(id='productTitle')
print(fullprice)

It seems you just need a User-Agent header.
import requests
from bs4 import BeautifulSoup as bs
# Fetch an Amazon product page with a User-Agent header and print the
# content of the element whose name attribute is "description".
headers = {'user-agent': 'Mozilla/5.0'}
# Without a timeout, requests can wait indefinitely on an unresponsive server.
r = requests.get('https://www.amazon.com/SanDisk-128GB-Extreme-microSD-Adapter/dp/B07FCMKK5X/ref=sr_1_1?fst=as:off&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=HWWSV1CX6VJBC57MRVP6&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1564513802&rnid=16225007011&s=computers-intl-ship&sr=1-1', headers=headers, timeout=30)
soup = bs(r.content, 'html.parser')
# select_one() returns None when nothing matches; subscripting None would
# raise TypeError, so report the miss explicitly instead.
description = soup.select_one('[name="description"]')
if description is None:
    print('No element with name="description" found in the response')
else:
    print(description['content'])

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

beautifulsoup not returning all html - python

Related

Scraping website URL from Google Image Search result

Beautiful soup text returns blank

Elements on page don't exist when scraping wsj.com

BeautifulSoup Find periodically returns None

Beautiful Soup in Python cannot find id despite the id existing

Categories

Resources