Web scraper returning zero in terminal - python

I'm trying to scrape this website
https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who=bygg&bf=1&page=1
And I've put a def getQuestions(tag) in the who={tag} part of the url and that works fine. When I try to add def getQuestions(tag, page) page={page} it just returns 0 in the terminal, and I really hope no clue what could be causing this.
Here is the full code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}
questionlist = []
def getQuestions(tag, page):
url = 'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={bygg}&bf=1&page={page}'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
for item in questions:
question = {
'title': item.find('a', {'class': 'link-primary'}).text,
'link': item.find('a', {'class': 'link-primary'})['href'],
'nummer': item.find('a', {'class': 'link-body'})['href'],
'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
'RegÅr': item.find('div', {'class': 'col text-center'}).text,
}
questionlist.append(question)
return
for x in range(1,5):
getQuestions('bygg', x)
print(len(questionlist))
Any help would be appreciated. Best regards!

Change the string in url variable to f-string:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}
def getQuestions(tag, page):
questionlist = []
url = f"https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
questions = soup.find_all("div", {"class": "box-white p-0 mb-4"})
for item in questions:
question = {
"title": item.find("a", {"class": "link-primary"}).text,
"link": item.find("a", {"class": "link-primary"})["href"],
"nummer": item.find("a", {"class": "link-body"})["href"],
"address": item.find("address", {"class": "mt-2 mb-0"}).text,
"RegÅr": item.find("div", {"class": "col text-center"}).text,
}
questionlist.append(question)
return questionlist
out = []
for x in range(1, 5):
out.extend(getQuestions("bygg", x))
print(len(out))
Prints:
80

Try changing your url to this:
url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
You didn't quite have your f-Strings set up right

Related

Selecting links within a div tag using beautiful soup

I am trying to run the following code
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
params = {
'q': 'Machine learning,
'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers,
params=params).text
soup = BeautifulSoup(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
profiles=result.select('.gs_a a')['href']
The following output (error) is being shown
"TypeError: list indices must be integers or slices, not str"
What is it I am doing wrong?
The following is tested and works:
import requests
from bs4 import BeautifulSoup as bs
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
params = {
'q': 'Machine learning',
'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers,
params=params).text
soup = bs(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
profiles=result.select('.gs_a a')
for p in profiles:
print(p.get('href'))
Result in terminal:
/citations?user=rSVIHasAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
/citations?user=yxUduqMAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=9Vdfc2sAAAAJ&hl=en&oi=sra
/citations?user=lXYKgiYAAAAJ&hl=en&oi=sra
/citations?user=xzss3t0AAAAJ&hl=en&oi=sra
/citations?user=BFdcm_gAAAAJ&hl=en&oi=sra
/citations?user=okf5bmQAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
In your code, you were trying to obtain the href attribute from a list (soup.select returns a list, and soup.select_one return a single element).
See BeautifulSoup documentation here

how to iterate through a list of URL and save it to CSV?

import requests
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'}
URL = "https://www.amazon.com/TRESemm%C3%A9-Botanique-Shampoo-Nourish-Replenish/dp/B0199WNJE8/ref=sxin_14_pa_sp_search_thematic_sspa?content-id=amzn1.sym.a15c61b7-4b93-404d-bb70-88600dfb718d%3Aamzn1.sym.a15c61b7-4b93-404d-bb70-88600dfb718d&crid=2HG5WSUDCJBMZ&cv_ct_cx=hair%2Btresemme&keywords=hair%2Btresemme&pd_rd_i=B0199WNJE8&pd_rd_r=28d72361-7f35-4b1a-be43-98e7103da70c&pd_rd_w=6UL4P&pd_rd_wg=JtUqB&pf_rd_p=a15c61b7-4b93-404d-bb70-88600dfb718d&pf_rd_r=DFPZNAG391M5JS55R6HP&qid=1660432925&sprefix=hair%2Btresemme%2Caps%2C116&sr=1-3-a73d1c8c-2fd2-4f19-aa41-2df022bcb241-spons&smid=A3DEFW12560V8M&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExQlM3VFpGRVM5Tk8wJmVuY3J5cHRlZElkPUEwNjE5MjQwM01JV0FNN1pOMlRHSSZlbmNyeXB0ZWRBZElkPUEwNTA1MDQyMlQ5RjhRQUxIWEdaUiZ3aWRnZXROYW1lPXNwX3NlYXJjaF90aGVtYXRpYyZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1"
webpage = requests.get(URL, headers=headers)
soup = BeautifulSoup(webpage.content)
rank = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[0]
Category = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[2:6]
Category = ' '.join(Category)
type(rank)
type(Category)
import string
for char in string.punctuation:
rank = rank.replace(char, '')
print(rank)
print(Category)
I have other URLs similar to this and I want to loop through them: Here are the links: How can I loop through them and save them to a csv file. Thank you very much in advanced!
URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
You could use a for-loop to iterate the list:
for url in URL:
webpage = requests.get(url, headers=headers)
soup = BeautifulSoup(webpage.content)
Note: amazon do not want to be scraped, so it is a question of time, that they will block you. May use some delay, rotating proxy, ...
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'}
URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
data = []
for url in URL:
webpage = requests.get(url, headers=headers)
soup = BeautifulSoup(webpage.content)
data.append({
'url':url,
'rank':soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].split()[0][1:],
'category':soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller") a').text.split('Top 100 in ')[-1]
})
pd.DataFrame(data).to_csv('myfile.csv', index=False)

Why is this web scrape not working on python?

I haven’t recently been using the code attached. For the past few weeks, it has been working completely fine and always produced results. However, I used this today and for some reason it didn’t work. Could you please help and provide a solution to the problem.
import requests, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {"q": "dji", "hl": "en", 'gl': 'us', 'tbm': 'shop'}
response = requests.get("https://www.google.com/search",
params=params,
headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
# list with two dict() combined
shopping_data = []
shopping_results_dict = {}
for shopping_result in soup.select('.sh-dgr__content'):
title = shopping_result.select_one('.Lq5OHe.eaGTj h4').text
product_link = f"https://www.google.com{shopping_result.select_one('.Lq5OHe.eaGTj')['href']}"
source = shopping_result.select_one('.IuHnof').text
price = shopping_result.select_one('span.kHxwFf span').text
try:
rating = shopping_result.select_one('.Rsc7Yb').text
except:
rating = None
try:
reviews = shopping_result.select_one('.Rsc7Yb').next_sibling.next_sibling
except:
reviews = None
try:
delivery = shopping_result.select_one('.vEjMR').text
except:
delivery = None
shopping_results_dict.update({
'shopping_results': [{
'title': title,
'link': product_link,
'source': source,
'price': price,
'rating': rating,
'reviews': reviews,
'delivery': delivery,
}]
})
shopping_data.append(dict(shopping_results_dict))
print(title)
Because .select in for shopping_result in soup.select('.sh-dgr__content'): could not find any element so it gives you an empty list. Therefor the body of the for-loop is not executed. Python jumps out of the loop.
title only exists and is defined when the body of the for loop executes.
You should make sure you used a correct method to find your element(s).

I want to scrape these product information

import requests
from bs4 import BeautifulSoup
import pandas as pd
baseurl='https://twillmkt.com'
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
r =requests.get('https://twillmkt.com/collections/denim')
soup=BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div',class_='ProductItem__Wrapper')
productlinks=[]
Title=[]
Brand=[]
Colour=[]
for links in tra:
for link in links.find_all('a',href=True)[1:]:
comp=baseurl+link['href']
productlinks.append(comp)
for link in productlinks:
r =requests.get(link,headers=headers)
soup=BeautifulSoup(r.content, 'html.parser')
title=soup.find('h1').text
Title.append(title)
price=soup.find('span',class_="money").text
Brand.append(price)
Product_Features=soup.find_all('li').text
Colour.append(Product_Features)
df = pd.DataFrame(
{"Title": Title, "Price": Brand,"Product_Features":Colour}
)
print(df)
I scrape the title and price but difficult to scrape these information SIZE,product feature,material,model size,image
Single Page Link
https://twillmkt.com/products/light-blue-butterfly-print-slim-leg-denim?variant=39498848403534
i'll give you the css selectors for the elements you want. you can catch them with soup.select()
size: select[id^="product-select"] option
product feature: div.ProductMeta__Description.Rte li
material: div.ProductMeta__Description.Rte p:nth-of-type(2)
model size: div.ProductMeta__Description.Rte p:nth-of-type(3)
image: div[id^="Image"] span img

AttributeError: 'NoneType' object has no attribute 'get_text' with input id

url = 'https://www.amazon.ca/Powerextra-Replacement-Battery-Compatible-NP-FZ100/dp/B07PQQQ82K/ref=bmx_4?pd_rd_w=TNcoX&pf_rd_p=d9347c84-a27a-4c22-b959-7bb71382273b&pf_rd_r=E7RKN5D6HSM4TFZQJCGT&pd_rd_r=e8bff283-bc96-4b77-b31a-f413b4d62e40&pd_rd_wg=wje3Z&pd_rd_i=B07PQQQ82K&psc=1'
headers = 'headers = { "user-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}'
page = requests.get(url, headers= headers)
soup = BeautifulSoup(page.content, "lxml")
title = soup.find('input', {'id': 'mbb-offeringID-1'}).get_text()
print(title)
I also tried:
title = soup.find('input', {'id': 'mbb-offeringID-1'}).get('value')
print(title)
Then I tried if it works in general with a "span id" and it gave the same error
title = soup.find(id = "productTitle").get_text()
Try to add "Accept-Language": "en-US,en;q=0.5" HTTP header for correct response from the server:
import requests
from bs4 import BeautifulSoup
url = "https://www.amazon.ca/Powerextra-Replacement-Battery-Compatible-NP-FZ100/dp/B07PQQQ82K/ref=bmx_4?pd_rd_w=TNcoX&pf_rd_p=d9347c84-a27a-4c22-b959-7bb71382273b&pf_rd_r=E7RKN5D6HSM4TFZQJCGT&pd_rd_r=e8bff283-bc96-4b77-b31a-f413b4d62e40&pd_rd_wg=wje3Z&pd_rd_i=B07PQQQ82K&psc=1"
headers = headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
"Accept-Language": "en-US,en;q=0.5",
}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, "lxml")
title = (
soup.find("input", {"id": "mbb-offeringID-1"})
.find_next("span")
.get_text(strip=True, separator=" ")
)
print(title)
Prints:
Asurion 3-Year Protection Plan for $14.99

Categories

Resources