Web Page Scraping with BeautifulSoup - python

I am new and I am trying to the links of each products for this web page all subpages (1-8): https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua
I have a loop to go over each page but for some reason on page 7 it only brings 20 products and no products on page 8
This is the function that gets me all the URL for each product on each page:
def get_all_product_url(base_url):
# Set up link and gets all URLs
page = requests.get(base_url, stream=True)
soup = BeautifulSoup(page.content, 'html.parser',from_encoding='utf-8')
url_list = []
try:
products = soup.find_all('div', {'class':'jsx-3418419141 product-thumbnail'})
except:
return url_list
for i in products:
url = i.find("a").get('href')
if 'https://www.sodimac.cl' in url:
url_list.append(url)
else:
url_list.append('https://www.sodimac.cl'+url)
# Return all web address without duplicates
return list(set(url_list))
When I run it for page 8 I get an emply list
base_url = "https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua?currentpage=8"
page = requests.get(base_url, stream=True)
soup = BeautifulSoup(page.content, 'html.parser',from_encoding='utf-8')
url_list = get_all_product_url(base_url)
url_list
If you run it for page 1, you will get 28 entries
base_url = "https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua?currentpage=1"
page = requests.get(base_url, stream=True)
soup = BeautifulSoup(page.content, 'html.parser',from_encoding='utf-8')
url_list = get_all_product_url(base_url)
url_list
Any help I reall appreciate it.
Thanks

Related

anyone please guide me how can i do web scarping multiple pages of booking.com -

This is the link url
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_c
Hotel_name = doc.find_all("div",{'class' : "fcab3ed991 a23c043802"})
this gives me the result of all hotel names in page number, 1, but how can I get the hotel names of all the pages?
I've tried this
import requests
from bs4 import BeautifulSoup
# Initialize the page number
page_number = 0
while True:
# Increment the page number
page_number += 1
# Make the GET request to the URL
url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
response = requests.get(url)
# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Extract the hotel information
hotels = soup.find_all('div', {'class' : "fcab3ed991 a23c043802"})
if not hotels:
break
for hotel in hotels:
price = hotel.find('div', {' data-testid="title'}).text
print(f"{price}")
but it gives me an empty list as an output.
Avoid selecting elements by classes that looks highly dynamic and use HTML structure instead. Check the number of total results and use it in range() to iterate the results.
Example
import requests, re
from bs4 import BeautifulSoup
data = []
soup = BeautifulSoup(
requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
headers={'user-agent':'some agent'}
).text)
num_results = int(re.search(r'\d+',soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))
for i in range(0,int(num_results/25)):
soup = BeautifulSoup(
requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
headers={'user-agent':'some agent'}
).text
)
data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])
data

Create a specific Web Scraper

I am making the effort to learn to scrape in Python and in this case my idea is to make a tool that obtains data from a web page. I have a problem in proposing the "for" to go through the page and collect the data of each box (item) as they are:
IDoffer
List
Title
Location
content
phone
It is not a task, it is my own initiative but I am not moving forward for which I thank you for your help.
Here is what I have of code:
from bs4 import BeautifulSoup
import requests
URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0
for i in range(0, MAX_PAGES):
#Building the URL
if i > 0:
url = "%s%d" % (URL_BASE, i)
else:
url = URL_BASE
#We make the request to the web
req = requests.get(url)
#We check that the request returns a Status Code = 200
statusCode = req.status_code
if statusCode == 200:
#We pass the HTML content of the web to a BeautifulSoup () object
html = BeautifulSoup(req.text, "html.parser")
#We get all the divs where the inputs are
entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})
#We go through all the inputs and extract info
for entrada1 in entradas_IDoffer:
#THIS ARE SOME ATTEMPS
#Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
#location = entrada.find('div', {'class': 'list-location-region'}).getText()
#content = entrada.find('div', {'class': 'tx'}).getText()
#phone = entrada.find('div', {'class': 'telefonos'}).getText()
#Offer Title
entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
for entrada2 in entradas_Title:
counter += 1
Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
counter += 1
IDoffer = entrada1.find('div', {'class': 'x5'}).getText()
#Location
#entradas_location = html.find_all('div', {'class': 'aditem-detail'})
#for entrada4 in entradas_location:
# counter += 1
# location = entrada4.find('div', {'class': 'list-location-region'}).getText()
#Offer content
#entradas_content = html.find_all('div', {'class': 'aditem-detail'})
#for entrada3 in entradas_content:
# counter += 1
# content = entrada3.find('div', {'class': 'tx'}).getText()
print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(),url,Title))
else:
try:
r = requests.head(req)
print(r.status_code)
except requests.ConnectionError:
print("failed to connect")
break
#If the page no longer exists and it gives me a 400
Correct entradas_IDoffer,
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under "a" tag not "div"
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
do like this for other data
they might be loading Phone number with javascript so you may not able to get that with bs4, you can get that using selenium.
You wrote very lengthy code to loop through multiple pages, just do this to go through page 1 and 2 using range. Put url in formatted string.
for page in range(1, 3):
url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup
for page in range(1, 5):
url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
for entrada in entradas_IDoffer:
title = entrada.find("a", class_="aditem-detail-title").text.strip()
ID = entrada.find("div", class_="x5").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
print(title, ID, location, content)

Python - Pagination loop

I have one question.
In my code i have:
r = session.get("https://xxxxxxx.com/online/GIRL")
print (r.status_code)
print (r.cookies)
soups = BeautifulSoup(r.content, 'html5lib')
def getPeopleLinks(page):
links = []
for link in soups.find_all('a'):
url = link.get('href')
if url:
if 'profile/' in url:
links.append(url)
return links
How i can get list of all profiles on the all available pages? (ex. 1 2 3 4 5 6 etc)?
and put it to Links[]
The webcode is:
<div class="pages"><span>1</span>
2
3
4
<a accesskey="x" href="online/GIRL/2">Next</a></div>
Thanks!
ADDED:
Thanks for answer. Others pagination site have the same html as main site, so i need only read all users from all pagination (2,3,4,5 etc)
For my main site all working fine, i only need to add all users from all pagination sites to LINKS[]
login_data = {
'login': 'xxxxx',
'pass': 'xxxx',
'back_url': ''
}
def getPeopleLinks(page):
links = []
for link in soups.find_all('a'):
url = link.get('href')
if url:
if 'profil/' in url:
links.append(url)
return links
with requests.Session() as session:
url = "https://xxxxx.com/login/?form_login=1"
post = session.post(url, data=login_data, headers=headers)
print (post.status_code)
print (post.cookies)
r = session.get("https://xxxx.com/online/Girls")
print (r.status_code)
print (r.cookies)
soups = BeautifulSoup(r.content, 'html5lib')
x = getPeopleLinks(soups)
print(x)
for path in x:
sleep(3)
url = 'http://www.xxxx.com' + path
page = urllib.request.urlopen(url)
print(url)
The output is:
http://www.xxxx.com/profile/nickname
That its all working fine, but only for:
https://xxxx.com/online/Girls
I need to read all users from all pagination sites
Thanks :)

Convert a result in the console into a list Python

I want to web scrape a list of urls from a web site and then open them one by one.
I can get the list of all urls but then try to turn into a list things get wrong.
When I print the list, intead of geting [url1, urls2...] I get something like this in the console:
[url1,url2,url3] dif line
[url1,url2,url3,url4] difline
[url1,url2,url3,url4,url5]
Find the my script bellow:
driver = webdriver.Chrome()
my_url="https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
driver.get(my_url)
time.sleep(3)
content = driver.page_source.encode('utf-8').strip()
page_soup = soup(content,"html.parser")
links =[]
for link in page_soup.find_all('a', href=True):
url=link['href']
ai=str(url)
links.append(ai)
print(links)
links.append(ai)
print(links)
I have rewritten your code a little. First you need to load and scrap main page to get all links from "href". After that just use scraped urls in a loop to get next pages.
Also there is some junk in "href" which isn't url so you have to clean it first.
I prefer requests to do GET.
http://docs.python-requests.org/en/master/
I hope it helps.
from bs4 import BeautifulSoup
import requests
def main():
links = []
url = "https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
web_page = requests.get(url)
soup = BeautifulSoup(web_page.content, "html.parser")
a_tags = soup.find_all('a', href=True)
for a in a_tags:
links.append(a.get("href"))
print(links) # just to demonstrate that links are there
cleaned_list = []
for link in links:
if "http" in link:
cleaned_list.append(link)
print(cleaned_list)
return cleaned_list
def load_pages_from_links(urls):
user_agent = {'User-agent': 'Mozilla/5.0'}
links = urls
downloaded_pages = {}
if len(links) == 0:
return "There are no links."
else:
for nr, link in enumerate(links):
web_page = requests.get(link, headers=user_agent)
downloaded_pages[nr] = web_page.content
print(downloaded_pages)
if __name__ == "__main__":
links = main()
load_pages_from_links(links)

Python web scraping page loop

Appreciate this is been asked many time on here but I cant seem to get it to work for me.
I've written a scraper which successfully scrapes everything I need from the first page of the site. But, I cant figure out how to get it to loop through the various pages.
The url simply increments like this BLAH/3 + 'page=x'
I haven't been learning to code for very long, so any advice would be appreciated!
import requests
from bs4 import BeautifulSoup
url = 'http://www.URL.org/BLAH1/BLAH2/BLAH3'
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for link in soup.find_all("a"):
"<a href='>%s'>%s</a>" %(link.get("href"), link.text)
# Fetch and print general data from title class
general_data = soup.find_all('div', {'class' : 'title'})
for item in general_data:
name = print(item.contents[0].text)
address = print(item.contents[1].text.replace('.',''))
care_type = print(item.contents[2].text)
Update:
r = requests.get('http://www.URL.org/BLAH1/BLAH2/BLAH3')
for page in range(10):
r = requests.get('http://www.URL.org/BLAH1/BLAH2/BLAH3' + 'page=' + page)
soup = BeautifulSoup(r.content, "html.parser")
#print(soup.prettify())
# String substitution for HTML
for link in soup.find_all("a"):
"<a href='>%s'>%s</a>" %(link.get("href"), link.text)
# Fetch and print general data from title class
general_data = soup.find_all('div', {'class' : 'title'})
for item in general_data:
name = print(item.contents[0].text)
address = print(item.contents[1].text.replace('.',''))
care_type = print(item.contents[2].text)
Update 2!:
import requests
from bs4 import BeautifulSoup
url = 'http://www.URL.org/BLAH1/BLAH2/BLAH3&page='
for page in range(10):
r = requests.get(url + str(page))
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for link in soup.find_all("a"):
print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))
# Fetch and print general data from title class
general_data = soup.find_all('div', {'class' : 'title'})
for item in general_data:
print(item.contents[0].text)
print(item.contents[1].text.replace('.',''))
print(item.contents[2].text)
To loop pages with page=x you need for loop like this>
import requests
from bs4 import BeautifulSoup
url = 'http://www.housingcare.org/housing-care/results.aspx?ath=1%2c2%2c3%2c6%2c7&stp=1&sm=3&vm=list&rp=10&page='
for page in range(10):
print('---', page, '---')
r = requests.get(url + str(page))
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for link in soup.find_all("a"):
print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))
# Fetch and print general data from title class
general_data = soup.find_all('div', {'class' : 'title'})
for item in general_data:
print(item.contents[0].text)
print(item.contents[1].text.replace('.',''))
print(item.contents[2].text)
Every page can be different and better solution needs more inforamtion about page. Sometimes you can get link to last page and then you can use this information instead 10 in range(10)
Or you can use while True to loop and break to leave loop if there is no link to next page. But first you have to show this page (url to real page) in question.
EDIT: example how to get link to next page and then you get all pages - not only 10 pages as in previous version.
import requests
from bs4 import BeautifulSoup
# link to first page - without `page=`
url = 'http://www.housingcare.org/housing-care/results.aspx?ath=1%2c2%2c3%2c6%2c7&stp=1&sm=3&vm=list&rp=10'
# only for information, not used in url
page = 0
while True:
print('---', page, '---')
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for link in soup.find_all("a"):
print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))
# Fetch and print general data from title class
general_data = soup.find_all('div', {'class' : 'title'})
for item in general_data:
print(item.contents[0].text)
print(item.contents[1].text.replace('.',''))
print(item.contents[2].text)
# link to next page
next_page = soup.find('a', {'class': 'next'})
if next_page:
url = next_page.get('href')
page += 1
else:
break # exit `while True`

Categories

Resources