How to get text from next pages using BeautifulSoup in Python 3?

I'm trying to get all the game outcomes for every page for a team. So far I am able to get every opponent 1 vs. opponent 2 matchup and its score, but I don't know how to move to the next page to get the rest of the data. Would I find the next page link and put it in a while loop? Here is the link to the team I want:
http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches
This is what I have so far; it gets all the team matches played and their scores, but only for the first page.
def all_match_outcomes():
    for match_outcomes in match_history_url():
        rest_server(True)
        page = requests.get(match_outcomes).content
        soup = BeautifulSoup(page, 'html.parser')
        team_name_element = soup.select_one('div.teamNameHolder')
        team_name = team_name_element.find('h1').text.replace('- Team Overview', '')
        for match_outcome in soup.select('table.simple.gamelist.profilelist tr'):
            opp1 = match_outcome.find('span', {'class': 'opp1'}).text
            opp2 = match_outcome.find('span', {'class': 'opp2'}).text
            opp1_score = match_outcome.find('span', {'class': 'hscore'}).text
            opp2_score = match_outcome.find('span', {'class': 'ascore'}).text
            if match_outcome(True):  # if teams have past matches
                print(team_name, '%s %s:%s %s' % (opp1, opp1_score, opp2_score, opp2))

Get the last page number and iterate page by page until you hit the last page.
Complete working code:
import re
import requests
from bs4 import BeautifulSoup

url = "http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches"
with requests.Session() as session:
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # locate the "Last" page link
    last_page_link = soup.find("span", text="Last").parent["href"]
    # extract the last page number
    last_page_number = int(re.search(r"page=(\d+)$", last_page_link).group(1))

    print("Processing page number 1")
    # TODO: extract data

    # iterate over all pages starting from page 2 (since we are already on page 1)
    for page_number in range(2, last_page_number + 1):
        print("Processing page number %d" % page_number)
        link = "http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches?page=%d" % page_number
        response = session.get(link)
        soup = BeautifulSoup(response.content, "html.parser")
        # TODO: extract data
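To fill in the two TODO markers, the row parsing from the question can be factored into a helper and called once per page. A minimal sketch reusing the question's own selectors, assuming every page renders the same gamelist table markup:

def extract_matches(soup, team_name='Natus Vincere'):
    # reuse the selectors from the question; skip rows without score spans
    for row in soup.select('table.simple.gamelist.profilelist tr'):
        opp1 = row.find('span', {'class': 'opp1'})
        opp2 = row.find('span', {'class': 'opp2'})
        hscore = row.find('span', {'class': 'hscore'})
        ascore = row.find('span', {'class': 'ascore'})
        if opp1 and opp2 and hscore and ascore:
            print(team_name, '%s %s:%s %s' % (opp1.text, hscore.text, ascore.text, opp2.text))

Calling extract_matches(soup) at each TODO marker then prints every match on every page.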

Related

Anyone please guide me: how can I do web scraping of multiple pages of booking.com?

This is the URL:
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_c
Hotel_name = doc.find_all("div",{'class' : "fcab3ed991 a23c043802"})
This gives me all the hotel names on page 1, but how can I get the hotel names from all the pages?
I've tried this:
import requests
from bs4 import BeautifulSoup

# Initialize the page number
page_number = 0

while True:
    # Increment the page number
    page_number += 1
    # Make the GET request to the URL
    url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the hotel information
    hotels = soup.find_all('div', {'class': "fcab3ed991 a23c043802"})
    if not hotels:
        break
    for hotel in hotels:
        name = hotel.find('div', {'data-testid': 'title'}).text
        print(f"{name}")
But it gives me an empty list as output.
Avoid selecting elements by classes that look highly dynamic; use the HTML structure instead. Check the total number of results and use it in range() to iterate over the result pages.
Example
import re
import requests
from bs4 import BeautifulSoup

data = []

soup = BeautifulSoup(
    requests.get(
        'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
        headers={'user-agent': 'some agent'}
    ).text,
    'html.parser'
)

# total number of results, read from the element just above the pagination block
num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results / 25)):
    soup = BeautifulSoup(
        requests.get(
            f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
            headers={'user-agent': 'some agent'}
        ).text,
        'html.parser'
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])

print(data)
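Note that this example assumes booking.com serves 25 results per page, which is why the loop steps the offset by 25 and divides num_results by 25; if the rows parameter in the URL actually controls the page size, both numbers should be kept in sync with it.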

Create a specific Web Scraper

I am making an effort to learn scraping in Python, and in this case my idea is to make a tool that obtains data from a web page. I am having trouble writing the for loop that goes through the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
This is not homework; it is my own initiative, but I am not making progress, so I thank you for your help.
Here is the code I have:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE
    # We make the request to the web
    req = requests.get(url)
    # We check that the request returns a status code 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the divs where the items are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})
        # We go through all the items and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            #Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            #location = entrada.find('div', {'class': 'list-location-region'}).getText()
            #content = entrada.find('div', {'class': 'tx'}).getText()
            #phone = entrada.find('div', {'class': 'telefonos'}).getText()
            # Offer title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()
            # Location
            #entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            #for entrada4 in entradas_location:
            #    counter += 1
            #    location = entrada4.find('div', {'class': 'list-location-region'}).getText()
            # Offer content
            #entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            #for entrada3 in entradas_content:
            #    counter += 1
            #    content = entrada3.find('div', {'class': 'tx'}).getText()
            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        try:
            r = requests.head(url)  # fixed: requests.head() expects a URL, not a Response
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break
    # If the page no longer exists and it gives me a 400
The correct entradas_IDoffer is:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
The title is located under an "a" tag, not a "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other data.
The phone number is probably loaded with JavaScript, so you may not be able to get it with bs4; you can get it using Selenium.
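A minimal sketch of the Selenium route, assuming a chromedriver is installed; the telefonos class is taken from the commented attempt in the question, so treat it as a guess about the rendered markup:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')

# once JavaScript has rendered the page, the phone elements are part of the DOM
for el in driver.find_elements(By.CLASS_NAME, 'telefonos'):  # hypothetical class name
    print(el.text)

driver.quit()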
You wrote very lengthy code to loop through multiple pages; just use range() like this to go through pages 1 and 2, putting the page number into the URL with an f-string:
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page. Here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function to get a random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url

# ========================
# finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile('/\w+:').search(href):
                return True
    return False
# validlink() ===========

# the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end() ====

# looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class': 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
# =============================
# first article done

# find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
    print(  # incomplete -- this is the part I cannot figure out
# ======================
I know I'm doing this last part wrong. I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the random site that the random page takes me to, then pull the link and title off of that site, and then pull the Wikipedia links off of that page, which is what I am trying to do in that last bit of code. At that point I want to print all of the links it finds after they have been tested against my validlink function at the top.
Again, forgive me for being new and not understanding this, but please help; I cannot figure it out. So the question is: I need a snippet of code that pulls all of the website links off of the Wikipedia page (note that I still don't know how to do this; the for loop was my best guess based on my own research), tests the links against my validlink function, and prints out all of the valid links.
If you want it as a list, then create a new list and append() each url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'):  # you don't need `re` to check it
            if not re.compile('/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))
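To then visit each valid link and pull its title, as the question asks, the relative /wiki/ paths can be joined back onto the domain. A minimal sketch, assuming valid_urls has been filled by the loop above:

from urllib.parse import urljoin

for url in valid_urls:
    full_url = urljoin('https://en.wikipedia.org', url)
    r = requests.get(full_url)
    page = BeautifulSoup(r.text, 'html.parser')
    heading = page.find('h1', {'class': 'firstHeading'})
    if heading:
        print(full_url, '->', heading.text)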

How do I scrape data from multiple webpages with BeautifulSoup?

I have a problem with the following code, and I am sorry, I am new to all this. I want to add the strings in the FullPage list to the actual URL, then visit the resulting pages and scrape some data from them. So far it has been good, but I do not know how to make it visit the other links in the list.
The output only gives me the data of one page, but I need the data for 30 pages. How can I make this program go over each link?
The URL has a pattern: the first part is 'http://arduinopak.com/Prd.aspx?Cat_Name=' and the second part is the product category name.
import urllib2
from bs4 import BeautifulSoup

FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
            'Robotics-and-Copters']

urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]

for n in FullPage:
    URL = urlp1 + n
    page = urllib2.urlopen(URL)
    bsObj = BeautifulSoup(page, "html.parser")

descList = bsObj.findAll('div', attrs={"class": "panel-default"})
for desc in descList:
    print(desc.getText(separator=u' '))
If you want to scrape each link, then moving the last 3 lines of your code into the loop will do it.
Your current code fetches all the links but keeps only the last BeautifulSoup object reference. You could instead store them all in an array, or process each one before visiting the next URL (as shown below).
for n in FullPage:
    URL = urlp1 + n
    page = urllib2.urlopen(URL)
    bsObj = BeautifulSoup(page, "html.parser")

    descList = bsObj.findAll('div', attrs={"class": "panel-default"})
    for desc in descList:
        print(desc.getText(separator=u' '))
Also, note that names using PascalCase are by convention reserved for classes. FullPage would usually be written as full_page, or FULL_PAGE if it's meant to be a constant.
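If you would rather store everything first and process it afterwards (the other option mentioned above), a minimal sketch that collects the text into a list, keeping the question's Python 2 urllib2 style:

all_descriptions = []

for n in FullPage:
    URL = urlp1 + n
    page = urllib2.urlopen(URL)
    bsObj = BeautifulSoup(page, "html.parser")
    # collect the text of every matching div instead of printing immediately
    for desc in bsObj.findAll('div', attrs={"class": "panel-default"}):
        all_descriptions.append(desc.getText(separator=u' '))

print("%d descriptions scraped" % len(all_descriptions))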

Python web scraping page loop

I appreciate this has been asked many times on here, but I can't seem to get it to work for me.
I've written a scraper which successfully scrapes everything I need from the first page of the site, but I can't figure out how to get it to loop through the various pages.
The URL simply increments like this: BLAH/3 + 'page=x'
I haven't been learning to code for very long, so any advice would be appreciated!
import requests
from bs4 import BeautifulSoup

url = 'http://www.URL.org/BLAH1/BLAH2/BLAH3'
r = requests.get(url)  # this request was missing from the snippet
soup = BeautifulSoup(r.content, "html.parser")

# String substitution for HTML
for link in soup.find_all("a"):
    "<a href='>%s'>%s</a>" % (link.get("href"), link.text)

# Fetch and print general data from title class
general_data = soup.find_all('div', {'class': 'title'})
for item in general_data:
    name = print(item.contents[0].text)
    address = print(item.contents[1].text.replace('.', ''))
    care_type = print(item.contents[2].text)
Update:
r = requests.get('http://www.URL.org/BLAH1/BLAH2/BLAH3')

for page in range(10):
    r = requests.get('http://www.URL.org/BLAH1/BLAH2/BLAH3' + 'page=' + page)  # TypeError: page is an int, needs str(page)
    soup = BeautifulSoup(r.content, "html.parser")
    #print(soup.prettify())

    # String substitution for HTML
    for link in soup.find_all("a"):
        "<a href='>%s'>%s</a>" % (link.get("href"), link.text)

    # Fetch and print general data from title class
    general_data = soup.find_all('div', {'class': 'title'})
    for item in general_data:
        name = print(item.contents[0].text)
        address = print(item.contents[1].text.replace('.', ''))
        care_type = print(item.contents[2].text)
Update 2:
import requests
from bs4 import BeautifulSoup

url = 'http://www.URL.org/BLAH1/BLAH2/BLAH3&page='

for page in range(10):
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")

    # String substitution for HTML
    for link in soup.find_all("a"):
        print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))

    # Fetch and print general data from title class
    general_data = soup.find_all('div', {'class': 'title'})
    for item in general_data:
        print(item.contents[0].text)
        print(item.contents[1].text.replace('.', ''))
        print(item.contents[2].text)
To loop pages with page=x you need a for loop like this:
import requests
from bs4 import BeautifulSoup

url = 'http://www.housingcare.org/housing-care/results.aspx?ath=1%2c2%2c3%2c6%2c7&stp=1&sm=3&vm=list&rp=10&page='

for page in range(10):
    print('---', page, '---')
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")

    # String substitution for HTML
    for link in soup.find_all("a"):
        print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))

    # Fetch and print general data from title class
    general_data = soup.find_all('div', {'class': 'title'})
    for item in general_data:
        print(item.contents[0].text)
        print(item.contents[1].text.replace('.', ''))
        print(item.contents[2].text)
Every page can be different, and a better solution needs more information about the page. Sometimes you can get a link to the last page and then use that number instead of the hard-coded 10 in range(10), as in the sketch below.
Or you can use while True to loop and break to leave the loop when there is no link to the next page. But first you would have to show this page (the URL of the real page) in the question.
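A minimal sketch of the last-page approach, assuming soup already holds the parsed first page and that the pager exposes its page links as a elements with a page query parameter (the pagination selector is hypothetical and depends on the real site):

import re

# hypothetical pager markup: <div class="pagination"> <a href="...page=N"> ... </div>
page_links = soup.select('div.pagination a[href*="page="]')
if page_links:
    last_page = max(int(re.search(r'page=(\d+)', a['href']).group(1)) for a in page_links)
else:
    last_page = 1

for page in range(1, last_page + 1):
    pass  # fetch and parse each page as in the loop above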
EDIT: an example of how to get the link to the next page, so you get all pages, not only 10 pages as in the previous version.
import requests
from bs4 import BeautifulSoup

# link to first page - without `page=`
url = 'http://www.housingcare.org/housing-care/results.aspx?ath=1%2c2%2c3%2c6%2c7&stp=1&sm=3&vm=list&rp=10'

# only for information, not used in url
page = 0

while True:
    print('---', page, '---')

    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # String substitution for HTML
    for link in soup.find_all("a"):
        print("<a href='>%s'>%s</a>" % (link.get("href"), link.text))

    # Fetch and print general data from title class
    general_data = soup.find_all('div', {'class': 'title'})
    for item in general_data:
        print(item.contents[0].text)
        print(item.contents[1].text.replace('.', ''))
        print(item.contents[2].text)

    # link to next page
    next_page = soup.find('a', {'class': 'next'})
    if next_page:
        url = next_page.get('href')
        page += 1
    else:
        break  # exit `while True`
