How do I scrape data from multiple webpages with BeautifulSoup? - python

I have a problem with the following code (sorry, I am new to all this). I want to append the strings in the FullPage list to the base URL, then visit each resulting page and scrape some data from it. So far it works, but I do not know how to make it visit the other links in the list.
The output only gives me the data of one page, but I need the data for 30 pages. How can I make this program go over each link?
The URL has a pattern: the first part is 'http://arduinopak.com/Prd.aspx?Cat_Name=' and the second part is the product category name.
import urllib2
from bs4 import BeautifulSoup

FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
            'Robotics-and-Copters']

urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]

for n in FullPage:
    URL = urlp1 + n

page = urllib2.urlopen(URL)
bsObj = BeautifulSoup(page, "html.parser")

descList = bsObj.findAll('div', attrs={"class": "panel-default"})
for desc in descList:
    print(desc.getText(separator=u' '))

If you want to scrape each link, then moving the last few lines of your code into the loop will do it.

Your current code builds the URL for every page but fetches and parses only the last one, so you keep just a single BeautifulSoup object reference. You could instead store them all in a list, or process each page before visiting the next URL (as shown below).
for n in FullPage:
    URL = urlp1 + n
    page = urllib2.urlopen(URL)
    bsObj = BeautifulSoup(page, "html.parser")

    descList = bsObj.findAll('div', attrs={"class": "panel-default"})
    for desc in descList:
        print(desc.getText(separator=u' '))
Also, note that PascalCase names are by convention reserved for classes. FullPage would usually be written as full_page, or FULL_PAGE if it is meant to be a constant.
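If you would rather collect everything first and work with it afterwards, a minimal sketch of that alternative could look like this (it uses requests instead of urllib2, and a hypothetical results dict keyed by category name):

import requests
from bs4 import BeautifulSoup

full_page = ['New-Arrivals-2017-6', 'Big-Sales-click-here',
             'Arduino-Development-boards', 'Robotics-and-Copters']
base = "http://www.arduinopak.com/Prd.aspx?Cat_Name="

results = {}  # hypothetical container: category name -> list of description texts
for name in full_page:
    html = requests.get(base + name).text
    soup = BeautifulSoup(html, "html.parser")
    results[name] = [div.get_text(separator=u' ')
                     for div in soup.find_all('div', attrs={"class": "panel-default"})]

# process the collected data after all pages have been visited
for name, descriptions in results.items():
    print(name, len(descriptions))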

Related

Can anyone please guide me on how to scrape multiple pages of booking.com?

This is the URL:
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_c
Hotel_name = doc.find_all("div",{'class' : "fcab3ed991 a23c043802"})
This gives me all the hotel names on page 1, but how can I get the hotel names from all the pages?
I've tried this
import requests
from bs4 import BeautifulSoup

# Initialize the page number
page_number = 0

while True:
    # Increment the page number
    page_number += 1

    # Make the GET request to the URL
    url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the hotel information
    hotels = soup.find_all('div', {'class' : "fcab3ed991 a23c043802"})
    if not hotels:
        break

    for hotel in hotels:
        price = hotel.find('div', {' data-testid="title'}).text
        print(f"{price}")
But it gives me an empty list as output.
Avoid selecting elements by class names that look highly dynamic and use the HTML structure instead. Check the total number of results and use it in range() to iterate over the result pages.
Example
import requests, re
from bs4 import BeautifulSoup

data = []

soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent':'some agent'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent':'some agent'}
        ).text
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])

data
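After the loop, data holds one title per property card across the fetched pages; a quick sanity check might be:

print(len(data))   # total number of titles collected
print(data[:5])    # first few hotel names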

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page. Here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function get random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url
#========================

#finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile('/\w+:').search(href):
                return True
    return False
#validlink()===========

#the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end()====

#looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class' : 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
#=============================
#first article done

#find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
    print(
#======================
#======================
I know I'm doing this last part wrong. I'm new to Python, so I have no idea how to go about it. What I need is to pull all of the links from the page the random URL takes me to, along with that page's link and title.
Then I need to pull the Wikipedia links off of that page, which is what I am trying to do in the last bit of code above.
At that point I want to print all of the links it finds, after they have been tested against my valid-links function at the top.
Again, forgive me for being new and not understanding this, but please help; I cannot figure this out.
So my question is: I need a snippet of code that pulls all of the links off of the Wikipedia page (I still don't know how to do that; the for loop was my best guess based on my own research), tests each link against my validlink function, and prints out all of the valid links.
If you want it as a list, then create a new list and append() each url to it if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
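A slightly more compact variant of the same filtering (just a sketch; on Python 3, dict.fromkeys keeps the first occurrence of each url, so duplicates are dropped while order is preserved):

all_hrefs = [link.get('href', '') for link in soup.find_all('a')]
valid_urls = list(dict.fromkeys(h for h in all_hrefs if validlink(h)))
print(valid_urls)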
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'): # you don't need `re` to check it
            if not re.compile('/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))
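As a follow-up sketch: the collected hrefs are relative paths like /wiki/..., so if you want to request them afterwards you can join them with the site root first, for example:

from urllib.parse import urljoin  # Python 3; on Python 2 use: from urlparse import urljoin

absolute_urls = [urljoin('https://en.wikipedia.org', u) for u in valid_urls]
print(absolute_urls[:5])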

How to get complete href links using beautifulsoup in python

I am trying to get the top movie names by genre. I couldn't get the complete href links for that; I am stuck with only partial href links.
With the following code I got:
https://www.imdb.com/search/title?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=animation&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=biography&sort=user_rating,desc&title_type=feature&num_votes=25000,
.........
Like that, but I want the top 100 movie names for each genre, like Action, Adventure, Animation, Biography, ...
I tried the following code:
from bs4 import BeautifulSoup
import requests

url = 'https://www.imdb.com'
main_url = url + '/chart/top'
res = requests.get(main_url)
soup = BeautifulSoup(res.text, 'html.parser')

for href in soup.find_all(class_='subnav_item_main'):
    # print(href)
    all_links = url + href.find('a').get('href')
    print(all_links)
I want the complete link from each anchor, as shown below:
/search/title?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=FM1ZEBQ7E9KGQSDD441H&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1"
You need another loop over those urls, and a limit so you only get 100. I store the results in a dictionary whose keys are the genres and whose values are lists of films. Note that original titles may appear, e.g. The Mountain II (2016) is listed as Dag II (its original title).
links is a list of tuples where I keep the genre as the first item and the url as the second.
import requests, pprint
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

url = 'https://www.imdb.com/chart/top'
genres = {}

with requests.Session() as s:
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    links = [(i.text, urljoin(url, i['href'])) for i in soup.select('.subnav_item_main a')]

    for link in links:
        r = s.get(link[1])
        soup = bs(r.content, 'lxml')
        genres[link[0].strip()] = [i['alt'] for i in soup.select('.loadlate', limit = 100)]

pprint.pprint(genres)
Sample output: a dict mapping each genre name (e.g. Action, Adventure) to a list of up to 100 film titles.
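A hypothetical usage example, assuming 'Action' appears among the scraped genre names:

action = genres.get('Action', [])
print(len(action))   # up to 100 titles
print(action[:5])    # first few titles in the genre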

Extracting 'href' under 'a' tag

I'm trying to extract the link inside an <a href="link">... tag.
As there are multiple rows, I iterate over every one of them. The first link per row is the one I need, so I use find_all('tr') and find('a').
I know find('a') can return a NoneType, but I do not know how to work around this.
I had a piece of code that worked but is inefficient (shown in the comments).
import urllib.request
import bs4 as bs

sauce = urllib.request.urlopen('https://morocco.observation.org/soortenlijst_wg_v3.php')
soup = bs.BeautifulSoup(sauce, 'lxml')

tabel = soup.find('table', {'class': 'tablesorter'})

for i in tabel.find_all('tr'):
    # if 'view' in i.get('href'):
    #     link_list.append(i.get('href'))
    link = i.find('a')
    #<a class="z1" href="/soort/view/164?from=1987-12-05&to=2019-05-31">Common Reed Bunting - <em>Emberiza schoeniclus</em></a>
How do I retrieve the link under href, working around the NoneType, so that I get only /soort/view/164?from=1987-12-05&to=2019-05-31?
Thanks in advance.
A logical way is to use nth-of-type to isolate the target column
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://morocco.observation.org/soortenlijst_wg_v3.php')
soup = bs(r.content, 'lxml')
base = 'https://morocco.observation.org'

urls = [base + item['href'] for item in soup.select('#mytable_S td:nth-of-type(3) a')]
You could also pass a list of classes
urls = [base + item['href'] for item in soup.select('.z1, .z2,.z3,.z4')]
Or even use the starts-with operator, ^, on the class:
urls = [base + item['href'] for item in soup.select('[class^=z]')]
Or the contains operator, *, on the href:
urls = [base + item['href'] for item in soup.select('[href*=view]')]
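An equivalent sketch using urljoin instead of manual string concatenation (same selector as above; urljoin handles the leading slash for you):

from urllib.parse import urljoin

urls = [urljoin(base, item['href']) for item in soup.select('#mytable_S td:nth-of-type(3) a')]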
Read about different css selector methods here: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
link = i.find('a')
_href = link['href']
print(_href)
O/P:
"/soort/view/164?from=1987-12-05&to=2019-05-31?"
This is not a complete URL; you should concatenate it with the domain name:
new_url = "https://morocco.observation.org"+_href
print(new_url)
O/p:
https://morocco.observation.org/soort/view/164?from=1987-12-05&to=2019-05-31?
Update:
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests

resp = requests.get("https://morocco.observation.org/soortenlijst_wg_v3.php")
soup = BeautifulSoup(resp.text, 'lxml')
tabel = soup.find('table', {'class': 'tablesorter'})
base_url = "https://morocco.observation.org"

for i in tabel.find_all('tr'):
    link = i.find('a', href=True)
    if link is None or not isinstance(link, Tag):
        continue
    url = base_url + link['href']
    print(url)
O/P:
https://morocco.observation.org/soort/view/248?from=1975-05-05&to=2019-06-01
https://morocco.observation.org/soort/view/174?from=1989-12-15&to=2019-06-01
https://morocco.observation.org/soort/view/57?from=1975-05-05&to=2019-06-01
https://morocco.observation.org/soort/view/19278?from=1975-05-13&to=2019-06-01
https://morocco.observation.org/soort/view/56?from=1993-03-25&to=2019-06-01
https://morocco.observation.org/soort/view/1504?from=1979-05-25&to=2019-06-01
https://morocco.observation.org/soort/view/78394?from=1975-05-09&to=2019-06-01
https://morocco.observation.org/soort/view/164?from=1987-12-05&to=2019-06-01

Parsing a range of urls using Urllib2 or Beautifulsoup

I am trying to get data from a site that has the following form: "http://www.mysite.here?pageNo=2"
How do I get the HTML data from a consecutive range of pages using urllib2 and/or BeautifulSoup? This code only gives me the HTML for a single page.
import urllib2
from bs4 import BeautifulSoup

for x in range(1, 450):
    numb = str(x)
    url = "http://www.mysite.here?pageNo=" + numb
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")

print soup
On each iteration you create a new variable with the name soup, overwriting the previous one.
You need storage for all pages - a list of pages - and you need to append to it on each iteration.
import urllib2
from bs4 import BeautifulSoup

pages = []

for x in range(1, 450):
    numb = str(x)
    url = "http://www.mysite.here?pageNo=" + numb
    page = urllib2.urlopen(url).read()
    pages.append(BeautifulSoup(page, "html.parser"))

print pages
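Once all pages are stored, you can extract whatever you need from each one afterwards; a minimal sketch (the 'tr' selector is just a placeholder for whatever markup your site actually uses):

for i, soup in enumerate(pages, start=1):
    rows = soup.find_all('tr')  # placeholder selector - adjust to the real page structure
    print("page %d: %d rows" % (i, len(rows)))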
You can create a list (e.g. pages = []) and append each parsed page to it, as shown above.
If you want just one soup object instead, you need to add the contents at each step, for instance the body's elements:
import urllib2
from bs4 import BeautifulSoup

soup = BeautifulSoup("<html><body></body></html>", "html.parser")  # initialize an empty soup

for x in range(1, 450):
    numb = str(x)
    url = "http://www.mysite.here?pageNo=" + numb
    page = urllib2.urlopen(url).read()
    tmpsoup = BeautifulSoup(page, "html.parser")
    # iterate over a copy: append() moves each element out of tmpsoup.body,
    # which would otherwise skip elements while iterating
    for element in list(tmpsoup.body.children):
        soup.body.append(element)
