Scraping website URL from Google Image Search result - python

Search link: https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR
from bs4 import BeautifulSoup
search_link = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'
all_links=[]
for i in range(1,10):
url= search_link.format(i)
#print("url: " +url)
r = requests.get(url)
c = r.content
soup = BeautifulSoup(c, 'html.parser')
all = soup.find_all('a', {'class': 'ArticleTeaserSearchResultItem_link'}, href=True)
for item in all:
print(item)
print(item['href'])
all_links.append(item['href'])
print(all_links)
I found some code from the internet but it is not working. After I run the code, the list is empty. Anyone has any idea about this? Thank you very much.

You dont specify headers, and your request give empty page. One more thing, there is no - ArticleTeaserSearchResultItem_link class. You can try this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}
all_links=[]
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
for href in soup.find('div', {'id': 'search'}).find_all('a'):
if href.get('href').startswith('https://'):
all_links.append(href.get('href'))
print(list(set(all_links)))
OUTPUT:
['https://www.quadrantkindercentra.nl/pedagogisch-werkplan-kdv-villa-kakelbont-2020/?w=5.6.3254479.1.27.32.red+iphone+6s', 'https://itstechprice.com/apple-iphone-6-price-in-nigeria/', 'https://olist.ng/mobile_phones-apple-iphone_6', 'https://naijaprice.com/apple-iphone-prices-in-nigeria/', 'https://www.walmart.com/browse/cell-phones/apple-ios-prepaid-phones/1105910_4527935_1072335_1231295_1231296', 'https://www.amazon.in/VcareGadGets-Apple-iPhone-Shining-Gloss/dp/B07FND9S6M', 'https://www.amazon.in/VCARE-GADGETS-Marble-White-iPhone/dp/B07P8CQZNY']

Related

Web Scraping a list of links from Tripadvisor

I'm trying to create a webscraper that will return a list of links to individual objects from the website example website.
The code I wrote takes the list of pages and returns the list of links to each attraction, but in the wrong way (the links are not one after the other):
Could someone help me to correct this code so that it would take list of links like below?
I will be grateful for any help.
My code:
import requests
from bs4 import BeautifulSoup
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
restaurantLinks = open('pages.csv')
print(restaurantLinks)
urls = [url.strip() for url in restaurantLinks.readlines()]
restlist = []
for link in urls:
print("Opening link:"+str(link))
response=requests.get(link, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
productlist = soup.find_all('div', class_='cNjlV')
print(productlist)
productlinks =[]
for item in productlist:
for link in item.find_all('a', href=True):
productlinks.append('https://www.tripadvisor.com'+link['href'])
print(productlinks)
restlist.append(productlinks)
print(restlist)
df = pd.DataFrame(restlist)
df.to_csv('links.csv')
Instead of append() elements to your list try to extend() it:
restlist.extend(productlinks)
Example
import requests
from bs4 import BeautifulSoup
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
urls = ['https://www.tripadvisor.com/Attractions-g187427-Activities-oa60-Spain.html']
restlist = []
for link in urls:
print("Opening link:"+str(link))
response=requests.get(link, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
restlist.extend(['https://www.tripadvisor.com'+a['href'] for a in soup.select('a:has(h3)')])
df = pd.DataFrame(restlist)
df.to_csv('links.csv', index=False)

Beautiful soup text returns blank

I'm trying to scrape a website, but it returns blank, can you help please? what am i missing?
import requests
from bs4 import BeautifulSoup
URL = 'https://ks.wjx.top/jq/50921280.aspx'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.text)
To get a response, add the User-Agent header to requests.get(), otherwise, the website thinks that your a bot, and will block you.
import requests
from bs4 import BeautifulSoup
URL = "https://ks.wjx.top/jq/50921280.aspx"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
print(soup.prettify())

requests.get(url) not returning for this specific url

I'm trying to use requests.get(url).text to get the HTML from this website. However, when requests.get(url) is called with this specific url, it never returns no matter how long I wait. This works with other urls, but this one specifically is giving me trouble. Code is below
from bs4 import BeautifulSoup
import requests
source = requests.get('https://www.carmax.com/cars/all', allow_redirects=True).text
soup = BeautifulSoup(source, 'lxml')
print(soup.prettify().encode('utf-8'))
Thanks for any help!
Try:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
html = requests.get("https://www.carmax.com/cars/all",headers=headers)
soup = BeautifulSoup(html.content, 'html.parser')
print(soup.prettify())

beautifulsoup not returning all html

import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.amazon.com/s?k=iphone+5s&ref=nb_sb_noss')
c = r.content
soup = BeautifulSoup(c, 'html.parser')
all = soup.find_all("span", {"class": "a-size-medium a-color-base a-text-normal"})
print(all)
so this is my simple script of python trying to scrape a page in amazon but not all the html is returned in the "soup" variable therefor i get nothing when trying to find a specific series of tags and extract them.
Try the below code, it should do the trick for you.
You actually missed to add headers in your code
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.com/s?k=iphone+5s&ref=nb_sb_noss'
response = requests.get(url, headers=headers)
print(response.text)
soup = BeautifulSoup(response.content, features="lxml")
my_all = soup.find_all("span", {"class": "a-size-medium a-color-base a-text-normal"})
print(my_all)

Beautiful Soup in Python cannot find id despite the id existing

the soup.find method returns None instead of the product title despite the productTitle existing in the page.
It works on amazon.it but not on amazon.com
import requests
from bs4 import BeautifulSoup
url = r'https://www.amazon.com/SanDisk-128GB-Extreme-microSD-Adapter/dp/B07FCMKK5X/ref=sr_1_1?fst=as:off&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=HWWSV1CX6VJBC57MRVP6&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1564513802&rnid=16225007011&s=computers-intl-ship&sr=1-1'
headers = {'User-Agent' : r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' }
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
fullprice = soup.find(id='productTitle')
print(fullprice)
Seems you just need User-Agent header
import requests
from bs4 import BeautifulSoup as bs
headers = {'user-agent': 'Mozilla/5.0'}
r = requests.get('https://www.amazon.com/SanDisk-128GB-Extreme-microSD-Adapter/dp/B07FCMKK5X/ref=sr_1_1?fst=as:off&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=HWWSV1CX6VJBC57MRVP6&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1564513802&rnid=16225007011&s=computers-intl-ship&sr=1-1', headers = headers)
soup = bs(r.content, 'html.parser')
print(soup.select_one('[name="description"]')['content'])

Categories

Resources