Python scraper advice - python

I have been working on a scraper for a little while now, and have come very close to getting it to run as intended. My code is as follows:
import urllib.request
from bs4 import BeautifulSoup

# Crawls main site to get a list of city URLs
def getCityLinks():
    city_sauce = urllib.request.urlopen('https://www.prodigy-living.co.uk/') # Enter url here
    city_soup = BeautifulSoup(city_sauce, 'html.parser')
    the_city_links = []
    for city in city_soup.findAll('div', class_="city-location-menu"):
        for a in city.findAll('a', href=True, text=True):
            the_city_links.append('https://www.prodigy-living.co.uk/' + a['href'])
    return the_city_links

# Crawls each of the city web pages to get a list of unit URLs
def getUnitLinks():
    getCityLinks()
    for the_city_links in getCityLinks():
        unit_sauce = urllib.request.urlopen(the_city_links)
        unit_soup = BeautifulSoup(unit_sauce, 'html.parser')
        for unit_href in unit_soup.findAll('a', class_="btn white-green icon-right-open-big", href=True):
            yield('the_url' + unit_href['href'])

the_unit_links = []
for link in getUnitLinks():
    the_unit_links.append(link)

# Soups returns all of the html for the items in the_unit_links
def soups():
    for the_links in the_unit_links:
        try:
            sauce = urllib.request.urlopen(the_links)
            for things in sauce:
                soup_maker = BeautifulSoup(things, 'html.parser')
                yield(soup_maker)
        except:
            print('Invalid url')

# Below scrapes property name, room type and room price
def getPropNames(soup):
    try:
        for propName in soup.findAll('div', class_="property-cta"):
            for h1 in propName.findAll('h1'):
                print(h1.text)
    except:
        print('Name not found')

def getPrice(soup):
    try:
        for price in soup.findAll('p', class_="room-price"):
            print(price.text)
    except:
        print('Price not found')

def getRoom(soup):
    try:
        for theRoom in soup.findAll('div', class_="featured-item-inner"):
            for h5 in theRoom.findAll('h5'):
                print(h5.text)
    except:
        print('Room not found')

for soup in soups():
    getPropNames(soup)
    getPrice(soup)
    getRoom(soup)
When I run this, it returns all the prices for all the urls picked up. However, it does not return the names or the rooms, and I am not really sure why. I would really appreciate any pointers on this, or ways to improve my code - I have been learning Python for a few months now!

I think that the links you are scraping will in the end redirect you to another website, in which case your scraping functions will not be useful!
For instance, the link for a room in Birmingham redirects you to another website.
Also, be careful in your usage of the find and find_all methods in BS. The first returns only one tag (as when you want a single property name), while find_all() returns a list, allowing you to get, for instance, multiple room prices and types.
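As a quick illustration of the difference (a made-up snippet, not taken from the site itself):
from bs4 import BeautifulSoup

html = '<div><h1>Some Property</h1><p class="room-price">£100</p><p class="room-price">£120</p></div>'
soup = BeautifulSoup(html, "html.parser")
print(soup.find("h1").get_text())                   # find() -> a single Tag: "Some Property"
for p in soup.find_all("p", class_="room-price"):   # find_all() -> a list of Tags
    print(p.get_text())                             # "£100", then "£120"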
Anyway, I have simplified your code a bit, and this is how I came across your issue. Maybe you would like to get some inspiration from it:
import requests
from bs4 import BeautifulSoup

main_url = "https://www.prodigy-living.co.uk/"

# Getting individual cities url
re = requests.get(main_url)
soup = BeautifulSoup(re.text, "html.parser")
city_tags = soup.find("div", class_ = "footer-city-nav") # Bottom of the page is not loaded dynamically
cities_links = [main_url + tag["href"] for tag in city_tags.find_all("a")] # Links to cities

# Getting the individual links to the apts
indiv_apts = []

for link in cities_links[0:4]:
    print("At link:", link)
    re = requests.get(link)
    soup = BeautifulSoup(re.text, "html.parser")
    links_tags = soup.find_all("a", class_ = "btn white-green icon-right-open-big")
    for url in links_tags:
        indiv_apts.append(main_url + url.get("href"))

# Now defining your functions
def GetName(tag):
    print(tag.find("h1").get_text())

def GetType_Price(tags_list):
    for tag in tags_list:
        print(tag.find("h5").get_text())
        print(tag.find("p", class_ = "room-price").get_text())

# Now scraping each of the apts - name, price, room.
for link in indiv_apts[0:2]:
    print("At link:", link)
    re = requests.get(link)
    soup = BeautifulSoup(re.text, "html.parser")
    property_tag = soup.find("div", class_ = "property-cta")
    rooms_tags = soup.find_all("div", class_ = "featured-item")
    GetName(property_tag)
    GetType_Price(rooms_tags)
You will see that right at the second element of the list, you will get an AttributeError, as you are no longer on your website's pages. Indeed:
>>> print(indiv_apts[1])
https://www.prodigy-living.co.uk/http://www.iqstudentaccommodation.com/student-accommodation/birmingham/penworks-house?utm_source=prodigylivingwebsite&utm_campaign=birminghampagepenworksbutton&utm_medium=referral # You will not scrape the expected link right at the beginning
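One way to guard against that (a minimal sketch with a hypothetical same_site helper, not part of the original answer) is to resolve each href with urljoin and skip anything that leaves the site:
from urllib.parse import urljoin, urlparse

def same_site(base, href):
    """Resolve href against base and return the full url only if it stays on the same domain."""
    full = urljoin(base, href)
    if urlparse(full).netloc == urlparse(base).netloc:
        return full
    return None  # e.g. an absolute link to iqstudentaccommodation.com

# Inside the loop that builds indiv_apts, one could then write:
# full = same_site(main_url, url.get("href"))
# if full:
#     indiv_apts.append(full)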
Next time, come with a precise problem to solve, or otherwise take a look at the Code Review section.
On find and find_all: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all
Finally, I think it also answers your question here: https://stackoverflow.com/questions/42506033/urllib-error-urlerror-urlopen-error-errno-11001-getaddrinfo-failed
Cheers :)

Related

Trying to get a value from HTML code using Beautiful Soup but having a hard time getting it

I am trying to find the value shown in the picture below on the website https://www.coop.se/butiker-erbjudanden/coop/coop-ladugardsangen-/ with the help of Beautiful Soup. But the only value I get is the price number and not the "st" value.
Here is the code I try to use to get it:
test = product.find('span', class_='Splash-content ')
print(Price.text)
import requests
from bs4 import BeautifulSoup as bsoup

site_source = requests.get("https://www.coop.se/butiker-erbjudanden/coop/coop-ladugardsangen-/").content
soup = bsoup(site_source, "html.parser")

all_items = soup.find("div", class_="Section Section--margin")
item_list = soup.find_all("span", class_="Splash-content")
for item in item_list:
    print("Price: ", item.find("span", class_="Splash-priceLarge").text)
    if item.find("span", class_="Splash-priceSub Splash-priceUnitNoDecimal"):
        print("Unit: ", item.find("span", class_="Splash-priceSub Splash-priceUnitNoDecimal").text)
In some cases the unit is missing, so we want to make sure we handle that.
My understanding is that you basically want to print the price and unit of each item, so that is what I attempt to do.
Try with:
import urllib.request
from urllib.error import HTTPError
from bs4 import BeautifulSoup

url = "https://www.coop.se/butiker-erbjudanden/coop/coop-ladugardsangen-/"
try:
    page = urllib.request.urlopen(url, timeout=20)
except HTTPError as e:
    page = e.read()

soup = BeautifulSoup(page, 'html.parser')
body = soup.find('body')
result = body.find("span", class_="Splash-content")
print(result.get_text())
For me it worked!

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page. Here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function get random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url
#========================

# finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile('/\w+:').search(href):
                return True
    return False
#validlink()===========

# the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end()====

# looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class' : 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
#=============================
# first article done

# find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
    print(
#======================
I know I'm doing this last part wrong. I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the random page the script lands on, then pull the link and title off of that site.
Then I need to pull the Wikipedia links off of that page, which is what I am trying to do in that last bit of code. Here's another snip:
At this point I want to print all of the links it finds after they have been tested against my validlink function at the top.
Again, forgive me for being new and not understanding this, but please help; I cannot figure this out.
So the question I have is: I need to create a snippet of code that will pull all of the website links off of the Wikipedia page (note that I still don't know how to do this; the for loop was my best guess based on my own research), then test the links I pulled against my validlink function, and print out all of the valid links.
If you want it as a list, then create a new list and append() the url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'): # you don't need `re` to check it
            if not re.compile('/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))

How do you move to a new page when web scraping with BeautifulSoup?

Below I have code that pulls the records off Craigslist. Everything works great, but I need to be able to go to the next set of records and repeat the same process; being new to programming, I am stuck. From looking at the page code, it looks like I should be clicking the arrow button contained in the span here until it contains no href:
next >
I was thinking that maybe this was a loop within a loop but I suppose this could be a try/except situation too. Does that sound right? How would you implement that?
import requests
from urllib.request import urlopen
import pandas as pd

response = requests.get("https://nh.craigslist.org/d/computer-parts/search/syp")
soup = BeautifulSoup(response.text,"lxml")
listings = soup.find_all('li', class_= "result-row")
base_url = 'https://nh.craigslist.org/d/computer-parts/search/'
next_url = soup.find_all('a', class_= "button next")

dates = []
titles = []
prices = []
hoods = []

while base_url !=
    for listing in listings:
        datar = listing.find('time', {'class': ["result-date"]}).text
        dates.append(datar)
        title = listing.find('a', {'class': ["result-title"]}).text
        titles.append(title)
        try:
            price = listing.find('span', {'class': "result-price"}).text
            prices.append(price)
        except:
            prices.append('missing')
        try:
            hood = listing.find('span', {'class': "result-hood"}).text
            hoods.append(hood)
        except:
            hoods.append('missing')

#write the lists to a dataframe
listings_df = pd.DataFrame({'Date': dates, 'Titles' : titles, 'Price' : prices, 'Location' : hoods})

#write to a file
listings_df.to_csv("craigslist_listings.csv")
For each page you crawl you can find the next url to crawl and add it to a list.
This is how I would do it, without changing your code too much. I added some comments so you understand what's happening, but leave me a comment if you need any extra explanation:
import requests
from urllib.request import urlopen
import pandas as pd
from bs4 import BeautifulSoup

base_url = 'https://nh.craigslist.org/d/computer-parts/search/syp'
base_search_url = 'https://nh.craigslist.org'
urls = []
urls.append(base_url)

dates = []
titles = []
prices = []
hoods = []

while len(urls) > 0: # while we have urls to crawl
    print(urls)
    url = urls.pop(0) # removes the first element from the list of urls
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"lxml")
    next_url = soup.find('a', class_= "button next") # finds the next urls to crawl
    if next_url: # if it's not an empty string
        urls.append(base_search_url + next_url['href']) # adds next url to crawl to the list of urls to crawl
    listings = soup.find_all('li', class_= "result-row") # get all current url listings
    # this is your code unchanged
    for listing in listings:
        datar = listing.find('time', {'class': ["result-date"]}).text
        dates.append(datar)
        title = listing.find('a', {'class': ["result-title"]}).text
        titles.append(title)
        try:
            price = listing.find('span', {'class': "result-price"}).text
            prices.append(price)
        except:
            prices.append('missing')
        try:
            hood = listing.find('span', {'class': "result-hood"}).text
            hoods.append(hood)
        except:
            hoods.append('missing')

#write the lists to a dataframe
listings_df = pd.DataFrame({'Date': dates, 'Titles' : titles, 'Price' : prices, 'Location' : hoods})

#write to a file
listings_df.to_csv("craigslist_listings.csv")
Edit: You are also forgetting to import BeautifulSoup in your code, which I added in my response
Edit2: You only need to find the first instance of the next button, as the page can (and in this case does) have more than one next button.
Edit3: For this to crawl computer parts, base_url should be changed to the one present in this code
This is not a direct answer to how to access the "next" button, but it may be a solution to your problem. When I've web scraped in the past, I used the URLs of each page to loop through search results.
On Craigslist, when you click "next page" the URL changes. There's usually a pattern to this change that you can take advantage of. I didn't have too long a look, but it looks like the second page of Craigslist is https://nh.craigslist.org/search/syp?s=120, and the third is https://nh.craigslist.org/search/syp?s=240. It looks like that final part of the URL changes by 120 each time.
You could create a list of multiples of 120, and then build a for loop to add this value on to the end of each URL.
Then you have your current for loop nested in this for loop.
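A rough sketch of that idea (the search URL and the offsets here are assumptions based on the pattern above, so check them against the live page):
import requests
from bs4 import BeautifulSoup

base_url = "https://nh.craigslist.org/search/syp"   # assumed from the URLs quoted above
titles = []

for offset in range(0, 600, 120):                    # first five pages: s=0, 120, 240, ...
    response = requests.get(base_url, params={"s": offset})
    soup = BeautifulSoup(response.text, "lxml")
    for listing in soup.find_all("li", class_="result-row"):
        title_tag = listing.find("a", class_="result-title")
        if title_tag:
            titles.append(title_tag.text)

print(len(titles), "titles collected")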

Python - Show Results from all Pages not just the first page (Beautiful Soup)

I have been making a simple scraper using Beautiful Soup to get the food hygiene ratings of restaurants based on a postcode entered by the user. The code works and takes results from the URL correctly.
What I need help with is how to get all the results to display, not just the results from the first page.
My code is below:
import requests
from bs4 import BeautifulSoup
pc = input("Please enter postcode")
url = "https://www.scoresonthedoors.org.uk/search.php?name=&address=&postcode="+pc+"&distance=1&search.x=8&search.y=6&gbt_id=0&award_score=&award_range=gt"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
g_data = soup.findAll("div", {"class": "search-result"})
for item in g_data:
print (item.find_all("a", {"class": "name"})[0].text)
try:
print (item.find_all("span", {"class": "address"})[0].text)
except:
pass
try:
print (item.find_all("div", {"class": "rating-image"})[0].text)
except:
pass
I have discovered by looking at the URL that the page shown depends on a variable in the URL string called page:
https://www.scoresonthedoors.org.uk/search.php?award_sort=ALPHA&name=&address=BT147AL&x=0&y=0&page=2#results
The pagination code for the Next Page button is:
<a style="float: right" href="?award_sort=ALPHA&name=&address=BT147AL&x=0&y=0&page=3#results" rel="next " title="Go forward one page">Next <i class="fa fa-arrow-right fa-3"></i></a>
Is there a way I can get my code to find out how many pages of results are presented and then grab the results from each of these pages?
Would the best solution be to have code that alters the URL string to change "page=" each time (e.g. a for loop), or is there a way to find a solution using the information in the pagination link code?
Many thanks to anyone who provides help or looks at this question.
You're actually going the right way. Generating the paginated urls to scrape beforehand is a good approach.
I actually nearly wrote the whole code. What you want to look at first is the find_max_page() function, which consists of taking the max page from the pagination string. With this number, you can then generate all the urls that you need to scrape, and scrape them one by one.
Check the code below; it's pretty much all there.
import requests
from bs4 import BeautifulSoup


class RestaurantScraper(object):

    def __init__(self, pc):
        self.pc = pc # the input postcode
        self.max_page = self.find_max_page() # The number of pages available
        self.restaurants = list() # the final list of restaurants where the scraped data will be at the end of the process

    def run(self):
        for url in self.generate_pages_to_scrape():
            restaurants_from_url = self.scrape_page(url)
            self.restaurants += restaurants_from_url # we add the restaurants to the global restaurants list

    def create_url(self):
        """
        Create a core url to scrape
        :return: A url without pagination (= page 1)
        """
        return "https://www.scoresonthedoors.org.uk/search.php?name=&address=&postcode=" + self.pc + \
               "&distance=1&search.x=8&search.y=6&gbt_id=0&award_score=&award_range=gt"

    def create_paginated_url(self, page_number):
        """
        Create a paginated url
        :param page_number: pagination (integer)
        :return: A paginated url
        """
        return self.create_url() + "&page={}".format(str(page_number))

    def find_max_page(self):
        """
        Find the number of pages for a specific search.
        :return: The number of pages (integer)
        """
        r = requests.get(self.create_url())
        soup = BeautifulSoup(r.content, "lxml")
        pagination_soup = soup.findAll("div", {"id": "paginator"})
        pagination = pagination_soup[0]
        page_text = pagination("p")[0].text
        return int(page_text.replace('Page 1 of ', ''))

    def generate_pages_to_scrape(self):
        """
        Generate all the paginated urls using the max_page attribute previously scraped.
        :return: List of urls
        """
        return [self.create_paginated_url(page_number) for page_number in range(1, self.max_page + 1)]

    def scrape_page(self, url):
        """
        This is coming from your original code snippet. It probably needs a bit of work, but you get the idea.
        :param url: Url to scrape and get data from.
        :return:
        """
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.findAll("div", {"class": "search-result"})
        restaurants = list()
        for item in g_data:
            name = item.find_all("a", {"class": "name"})[0].text
            restaurants.append(name)
            try:
                print(item.find_all("span", {"class": "address"})[0].text)
            except:
                pass
            try:
                print(item.find_all("div", {"class": "rating-image"})[0].text)
            except:
                pass
        return restaurants


if __name__ == '__main__':
    pc = input('Give your post code')
    scraper = RestaurantScraper(pc)
    scraper.run()
    print("{} restaurants scraped".format(str(len(scraper.restaurants))))

Filtering certain items from a Python list of links generated by BeautifulSoup

I am writing a web scraper to scrape some information off of the website JW Pepper for a sheet music database. I am using BeautifulSoup and Python to do this.
Here is my code:
# a barebones program I created to scrape the description and audio file off the JW pepper website, will eventually be used in a music database
import urllib2
import re
from bs4 import BeautifulSoup
linkgot = 0
def linkget():
search = "http://www.jwpepper.com/sheet-music/search.jsp?keywords=" # this is the url without the keyword that comes up when searching something
print("enter the name of the desired piece")
keyword = raw_input("> ") # this will add the keyword to the url
url = search + keyword
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)
all_links = soup.findAll("a")
link_dict = []
item_dict = []
for link in all_links:
link_dict.append(link.get('href')) # adds a list of the the links found on the page to link_dict
item_dict.append(x for x in link_dict if '.item' in x) #sorts them occording to .item
print item_dict
linkget()
The "print" command returns this: [ at 0x10ec6dc80>], which returns nothing when I google it.
Your filtering of the list was going wrong. Rather than filtering in a separate loop, you could just build the list if .item is present, as follows:
from bs4 import BeautifulSoup
import urllib2

def linkget():
    search = "http://www.jwpepper.com/sheet-music/search.jsp?keywords=" # this is the url without the keyword that comes up when searching something
    print("enter the name of the desired piece")
    keyword = raw_input("> ") # this will add the keyword to the url
    url = search + keyword
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    link_dict = []
    item_dict = []
    for link in soup.findAll("a", href=True):
        href = link.get('href')
        link_dict.append(href) # adds the links found on the page to link_dict
        if '.item' in href:
            item_dict.append(href)
    for href in item_dict:
        print href

linkget()
Giving you something like:
/Festival-of-Carols/4929683.item
/Festival-of-Carols/4929683.item
/Festival-of-Carols/4929683.item
...
