Scraping author names from a website with try/except using Python

I am trying to use try/except to scrape through the different pages of a URL containing author data. I need a set of author names from the first 10 pages of this website.
# Import Packages
import requests
import bs4
from bs4 import BeautifulSoup as bs

# Output list
authors = []

# Website Main Page URL
URL = 'http://quotes.toscrape.com/'
res = requests.get(URL)
soup = bs4.BeautifulSoup(res.text, "lxml")

# Get the contents from the first page
for item in soup.select(".author"):
    authors.append(item.text)

page = 1
pagesearch = True

# Get the contents from pages 2-10
while pagesearch:
    # Check if page is available
    try:
        req = requests.get(URL + '/' + 'page/' + str(page) + '/')
        soup = bs(req.text, 'html.parser')
        page = page + 1
        for item in soup.select(".author"):  # Append the author class from the webpage html
            authors.append(item.text)
    except:
        print("Page not found")
        pagesearch == False
        break  # Break if no page is remaining

print(set(authors))  # Print the output as a unique set of author names
The first page doesn't have a page number in its URL, so I treated it separately. I'm using the try/except block to iterate through all of the possible pages, expecting an exception to be raised and the loop to break once the last page has been scanned.
When I run the program, it enters an infinite loop, whereas it should print the "Page not found" message once the pages run out. When I interrupt the kernel, I see the correct result and my exception message, but nothing before that. I get the following result:
Page not found
{'Allen Saunders', 'J.K. Rowling', 'Pablo Neruda', 'J.R.R. Tolkien', 'Harper Lee', 'J.M. Barrie',
'Thomas A. Edison', 'J.D. Salinger', 'Jorge Luis Borges', 'Haruki Murakami', 'Dr. Seuss', 'George
Carlin', 'Alexandre Dumas fils', 'Terry Pratchett', 'C.S. Lewis', 'Ralph Waldo Emerson', 'Jim
Henson', 'Suzanne Collins', 'Jane Austen', 'E.E. Cummings', 'Jimi Hendrix', 'Khaled Hosseini',
'George Eliot', 'Eleanor Roosevelt', 'André Gide', 'Stephenie Meyer', 'Ayn Rand', 'Friedrich
Nietzsche', 'Mother Teresa', 'James Baldwin', 'W.C. Fields', "Madeleine L'Engle", 'William
Nicholson', 'George R.R. Martin', 'Marilyn Monroe', 'Albert Einstein', 'George Bernard Shaw',
'Ernest Hemingway', 'Steve Martin', 'Martin Luther King Jr.', 'Helen Keller', 'Charles M. Schulz',
'Charles Bukowski', 'Alfred Tennyson', 'John Lennon', 'Garrison Keillor', 'Bob Marley', 'Mark
Twain', 'Elie Wiesel', 'Douglas Adams'}
What could be the reason for this? Thanks.

I think that's because the page literally exists. The exception would only be raised if there were no page for the browser to show.
But when you make a request for this one:
http://quotes.toscrape.com/page/11/
the site still returns a page that bs4 can parse; it just contains no authors, so requests.get() never raises and the loop never ends.
How do you stop at page 11? You can check for the presence of the "Next" page button.
Thanks for reading.
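For example, here is a minimal sketch of that idea. It assumes the "Next" button is rendered as a <li class="next"> element, which is what quotes.toscrape.com appears to use, so check the page source first:

import requests
from bs4 import BeautifulSoup

authors = []
page = 1

while True:
    res = requests.get('http://quotes.toscrape.com/page/' + str(page) + '/')
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('.author'):
        authors.append(item.text)
    if soup.select_one('li.next') is None:  # no "Next" button -> last page reached
        break
    page += 1

print(set(authors))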

Try using the built-in range() function to go from pages 1-10 instead:
import requests
from bs4 import BeautifulSoup

url = "http://quotes.toscrape.com/page/{}/"
authors = []

for page in range(1, 11):
    response = requests.get(url.format(page))
    print("Requesting Page: {}".format(response.url))
    soup = BeautifulSoup(response.content, "html.parser")
    for tag in soup.select(".author"):
        authors.append(tag.text)

print(set(authors))

Related

Trouble scraping Wikipedia articles using Python

This is a Wikipedia article containing a list of articles about notable computer scientists. I have to write a script that collects the following info for each one of them:
Their full name
The number of awards they have
The universities they've attended
I've already written the following code to gather the links to each article.
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_computer_scientists"
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')
lines = soup.find(id="mw-content-text").find_all("li")

valid_links = []
for line in lines:
    link = line.find("a")
    if link['href'].find("/wiki/") == -1:
        continue
    if link.text == "Lists portal":
        break
    valid_links.append("https://en.wikipedia.org" + link['href'])
It's also pretty easy to get their full name (it's just the title of each article). However, I'm having trouble writing a script that can get 2 & 3 correctly for each one.
What I have so far is the following:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
scientist_name = soup.find(id="firstHeading").string
soup.find(id="mw-content-text").find("table", class_="infobox biography vcard")
scientist_education = "PLACEHOLDER"
scientist_awards = "PLACEHOLDER"
You can try with the following code:
import requests
import re
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_computer_scientists"
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')
lines = soup.find(id="mw-content-text").find_all("li")

valid_links = []
for line in lines:
    link = line.find("a")
    if link['href'].find("/wiki/") == -1:
        continue
    if link.text == "Lists portal":
        break
    valid_links.append("https://en.wikipedia.org" + link['href'])

for url in valid_links:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    name = soup.find(id="firstHeading").string
    edu = soup.find(lambda tag: len(tag.find_all()) == 0 and "Institutions" in tag.text)
    edux = [i.text.strip() for i in edu.find_next_siblings("td")] if edu else []
    awards = soup.find(lambda tag: len(tag.find_all()) == 0 and "Awards" in tag.text)
    awardsx = [i.text.strip() for i in awards.find_next_siblings("td")] if awards else []
    res = {"name": name, "education": edux, "awards": awardsx}
    print(res)
It returns the following output:
{'name': 'Atta ur Rehman Khan', 'education': ['Ajman University King Saud University University of Malaya Sohar University COMSATS University Air University (Pakistan Air Force) Qurtuba University'], 'awards': []}
{'name': 'Wil van der Aalst', 'education': ['RWTH Aachen University'], 'awards': []}
{'name': 'Scott Aaronson', 'education': ['University of Texas at Austin\nMassachusetts Institute of Technology\nInstitute for Advanced Study\nUniversity of Waterloo'], 'awards': ['Alan T. Waterman Award\nPECASE\nTomassoni–Chisesi Prize\nACM Prize in Computing']}
{'name': 'Rediet Abebe', 'education': ['University of California, BerkeleyHarvard UniversityCornell UniversityUniversity of Cambridge'], 'awards': ['Andrew Carnegie Fellow (2022)Harvard Society of Fellows (2019)MIT Technology Review Innovators Under 35 (2019)']}
....
However, I believe there are better options for crawling this page, such as Scrapy. Additionally, if that is the case, you could run your spiders in the cloud using a service like estela.
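For reference, a rough (untested) Scrapy sketch of the same crawl could look like the following; the selectors simply assume the same Wikipedia markup used in the requests/bs4 version above, and it follows every /wiki/ link rather than stopping at "Lists portal":

import scrapy

class ScientistSpider(scrapy.Spider):
    name = "scientists"
    start_urls = ["https://en.wikipedia.org/wiki/List_of_computer_scientists"]

    def parse(self, response):
        # Follow every /wiki/ link found in the list items of the article body
        for href in response.css("#mw-content-text li a::attr(href)").getall():
            if href.startswith("/wiki/"):
                yield response.follow(href, callback=self.parse_scientist)

    def parse_scientist(self, response):
        # Page title plus any text next to an "Awards" header in the infobox
        yield {
            "name": response.css("#firstHeading ::text").get(),
            "awards": response.xpath(
                "//th[contains(., 'Awards')]/following-sibling::td//text()").getall(),
        }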

How can Beautiful Soup loop through a list of URLs to scrape multiple text fields

I am attempting to loop through a stored list of URLs to scrape stats about footballers (age, name, club etc).
My list of URLs is stored as playerLinks
playerLinks[:5]
['https://footystats.org/players/england/martyn-waghorn',
'https://footystats.org/players/norway/stefan-marius-johansen',
'https://footystats.org/players/england/grady-diangana',
'https://footystats.org/players/england/jacob-brown',
'https://footystats.org/players/england/josh-onomah']
If I attempt to scrape an individual link with the following code, I am able to retrieve a result.
testreq = Request('https://footystats.org/players/england/dominic-solanke', headers=headers)
html_test = urlopen(testreq)
testsoup = BeautifulSoup(html_test, "html.parser")
testname = testsoup.find('p','col-lg-7 lh14e').text
print(testname)
#Dominic Solanke
However, when I loop through my list of URLs I receive errors. Below is the code I am using, but to no avail.
names = []

# For each player page...
for i in range(len(playerLinks)):
    reqs2 = Request(playerLinks[i], headers=headers)
    html_page = urlopen(reqs2)
    psoup2 = BeautifulSoup(html_page, "html.parser")
    for x in psoup2.find('p','col-lg-7 lh14e').text
        names.append(x.get('text'))
Once I fix the name scrape I will need to repeat the process for other stats. I have pasted the HTML of the page below. Do I need to nest another loop within? At the moment I receive either 'invalid syntax' errors or 'no text object' errors.
"<div class="row cf lightGrayBorderBottom "> <p class="col-lg-5 semi-bold lh14e bbox mild-small">Full Name</p> <p class="col-lg-7 lh14e">Dominic Solanke</p></div>"
With the loop switched to requests and find_all, I'm getting the following output:
Code:
import bs4, requests
from bs4 import BeautifulSoup

playerLinks = ['https://footystats.org/players/england/martyn-waghorn',
               'https://footystats.org/players/norway/stefan-marius-johansen',
               'https://footystats.org/players/england/grady-diangana',
               'https://footystats.org/players/england/jacob-brown',
               'https://footystats.org/players/england/josh-onomah']

names = []

# For each player page...
for i in range(len(playerLinks)):
    reqs2 = requests.get(playerLinks[i])
    psoup2 = BeautifulSoup(reqs2.content, "html.parser")
    for x in psoup2.find_all('p','col-lg-7 lh14e'):
        names.append(x.text)
        print(names)

#print(names)
Output:
['Martyn Waghorn']
['Martyn Waghorn', 'England']
['Martyn Waghorn', 'England', 'Forward']
['Martyn Waghorn', 'England', 'Forward', '31 (23 January 1990)']
['Martyn Waghorn', 'England', 'Forward', '31 (23 January 1990)', '71st / 300 players']
Some of these links have blank content. I suspect you need to either be logged in and/or be paying for a subscription (they do offer an API, but the free tier only allows for 1 league).
But to correct that output, move your print statement to the end instead of printing every time you append to the list:
import requests
from bs4 import BeautifulSoup

playerLinks = ['https://footystats.org/players/england/martyn-waghorn',
               'https://footystats.org/players/norway/stefan-marius-johansen',
               'https://footystats.org/players/england/grady-diangana',
               'https://footystats.org/players/england/jacob-brown',
               'https://footystats.org/players/england/josh-onomah']

names = []

# For each player page...
for link in playerLinks:
    reqs2 = requests.get(link)
    psoup2 = BeautifulSoup(reqs2.content, "html.parser")
    for x in psoup2.find_all('p','col-lg-7 lh14e'):
        names.append(x.text)

print(names)
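As a follow-up, here is a small, hedged sketch of how you might skip the links that come back blank, assuming a page with none of the 'col-lg-7 lh14e' paragraphs is one hidden behind a login or subscription:

import requests
from bs4 import BeautifulSoup

# playerLinks is the same list of URLs defined above
names = []
for link in playerLinks:
    resp = requests.get(link)
    soup = BeautifulSoup(resp.content, "html.parser")
    fields = [p.text for p in soup.find_all('p', 'col-lg-7 lh14e')]
    if not fields:                     # nothing scraped from this page
        print("No data for {} (possibly behind a login/subscription)".format(link))
        continue
    names.append(fields[0])            # the first field on these pages is the player's full name

print(names)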

BeautifulSoup scrape the first title tag in each <li>

I have some code that goes through the cast list of a show or movie on Wikipedia, scraping all the actors' names and storing them. The current code finds all the <a> tags in the list and stores their title attributes. It currently goes:
import requests
from bs4 import BeautifulSoup

URL = input()
website_url = requests.get(URL).text
soup = BeautifulSoup(website_url, 'lxml')
section = soup.find('span', id='Cast').parent

Stars = []
for x in section.find_next('ul').find_all('a'):
    title = x.get('title')
    print(title)
    if title is not None:
        Stars.append(title)
    else:
        continue
While this partially works, there are two downsides:
It doesn't work if the actor doesn't have a Wikipedia page hyperlink.
It also scrapes any other hyperlink title it finds. e.g. https://en.wikipedia.org/wiki/Indiana_Jones_and_the_Kingdom_of_the_Crystal_Skull returns ['Harrison Ford', 'Indiana Jones (character)', 'Bullwhip', 'Cate Blanchett', 'Irina Spalko', 'Bob cut', 'Rosa Klebb', 'From Russia with Love (film)', 'Karen Allen', 'Marion Ravenwood', 'Ray Winstone', 'Sallah', 'List of characters in the Indiana Jones series', 'Sexy Beast', 'Hamstring', 'Double agent', 'John Hurt', 'Ben Gunn (Treasure Island)', 'Treasure Island', 'Courier', 'Jim Broadbent', 'Marcus Brody', 'Denholm Elliott', 'Shia LaBeouf', 'List of Indiana Jones characters', 'The Young Indiana Jones Chronicles', 'Frank Darabont', 'The Lost World: Jurassic Park', 'Jeff Nathanson', 'Marlon Brando', 'The Wild One', 'Holes (film)', 'Blackboard Jungle', 'Rebel Without a Cause', 'Switchblade', 'American Graffiti', 'Rotator cuff']
Is there a way I can get BeautifulSoup to scrape the first two words in each <li>? Or is there an even better solution for what I am trying to do?
You can use CSS selectors to grab only the first <a> in each <li>:
for x in section.find_next('ul').select('li > a:nth-of-type(1)'):
Example
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/Indiana_Jones_and_the_Kingdom_of_the_Crystal_Skull#Cast'
website_url = requests.get(URL).text
soup = BeautifulSoup(website_url, 'lxml')
section = soup.find('span', id='Cast').parent

Stars = []
for x in section.find_next('ul').select('li > a:nth-of-type(1)'):
    Stars.append(x.get('title'))

Stars
Output
['Harrison Ford',
'Cate Blanchett',
'Karen Allen',
'Ray Winstone',
'John Hurt',
'Jim Broadbent',
'Shia LaBeouf']
You can use a regex to fetch all the names from the text content of each <li> and just take the first name pair; this also fixes the issue where an actor doesn't have a Wikipedia page hyperlink.
import re
re.findall("([A-Z]{1}[a-z]+) ([A-Z]{1}[a-z]+)", <text_content_from_li>)
Example:
text = "Cate Blanchett as Irina Spalko, a villainous Soviet agent. Screenwriter David Koepp created the character."
re.findall("([A-Z]{1}[a-z]+) ([A-Z]{1}[a-z]+)",text)
Output:
[('Cate', 'Blanchett'), ('Irina', 'Spalko'), ('Screenwriter', 'David')]
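To tie this to the cast list itself, a hedged sketch that borrows the section/<ul> lookup from the answer above and keeps only the first name pair in each <li>:

import re
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/Indiana_Jones_and_the_Kingdom_of_the_Crystal_Skull'
soup = BeautifulSoup(requests.get(URL).text, 'lxml')
section = soup.find('span', id='Cast').parent

cast = []
for li in section.find_next('ul').find_all('li'):
    # Keep the first name pair in the <li> text; note the simple regex
    # clips names with inner capitals (e.g. "LaBeouf" -> "La")
    match = re.findall(r"([A-Z][a-z]+) ([A-Z][a-z]+)", li.get_text())
    if match:
        cast.append(" ".join(match[0]))

print(cast)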
There is considerable variation in the HTML for cast sections within the film listings on Wikipedia. Perhaps look to an API to get this info?
E.g. imdb8 allows for a reasonable number of calls, which you could use with the following endpoint:
https://imdb8.p.rapidapi.com/title/get-top-cast
There also seems to be a Python IMDb API, as sketched below.
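For instance, a short hedged sketch with the IMDbPY package (pip install imdbpy); the call names below are from memory, so verify them against the package's documentation:

from imdb import IMDb  # IMDbPY

ia = IMDb()
movie = ia.get_movie('0367882')                  # numeric IMDb id, without the 'tt' prefix
cast = movie.get('cast', [])                     # list of Person objects
print([person['name'] for person in cast[:10]])  # first ten credited actors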
Or choose something with more regular HTML. For example, if you take the IMDb film IDs in a list you can extract the full cast and the main actors from IMDb as follows. To get the shorter cast list I am filtering out the rows which occur at/after the text "Rest" within "Rest of cast listed alphabetically:".
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

movie_ids = ['tt0367882', 'tt7126948']
base = 'https://www.imdb.com'

with requests.Session() as s:
    for movie_id in movie_ids:
        link = f'https://www.imdb.com/title/{movie_id}/fullcredits?ref_=tt_cl_sm'
        # print(link)
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        print(soup.select_one('title').text)
        full_cast = [(i.img['title'], base + i['href']) for i in soup.select('.cast_list [href*=name]:has(img)')]
        main_cast = [(i.img['title'], base + i['href']) for i in soup.select('.cast_list tr:not(:has(.castlist_label:contains(cast)) ~ tr, :has(.castlist_label:contains(cast))) [href*=name]:has(img)')]
        df_full = pd.DataFrame(full_cast, columns=['Actor', 'Link'])
        df_main = pd.DataFrame(main_cast, columns=['Actor', 'Link'])
        # print(df_full)
        print(df_main)

How to extract player names using Python with BeautifulSoup from cricinfo

I'm learning beautiful soup. I want to extract the player names i.e. the playing eleven for both teams from cricinfo.com. The exact link is "https://www.espncricinfo.com/series/13266/scorecard/439146/west-indies-vs-south-africa-1st-t20i-south-africa-tour-of-west-indies-2010"
The problem is that the website only displays the players under the class "wrap batsmen" if they have batted. Otherwise they are placed under the class "wrap dnb". I want to extract all the players irrespective of whether they have batted or not. How can I maintain two arrays (one for each team) that will dynamically search for players in "wrap batsmen" and "wrap dnb" (if required)?
This is my attempt:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

years = []
# Years we will be analyzing
for i in range(2010, 2018):
    years.append(i)

names = []

# URL page we will scraping (see image above)
url = "https://www.espncricinfo.com/series/13266/scorecard/439146/west-indies-vs-south-africa-1st-t20i-south-africa-tour-of-west-indies-2010"

# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html, features="html.parser")

for a in range(0, 1):
    names.append([a.getText() for a in soup.find_all("div", class_="cell batsmen")[1:][a].findAll('a', limit=1)])

soup = soup.find_all("div", class_="wrap dnb")
print(soup[0])
While this is possible with BeautifulSoup, it's not the best tool for the job. All that data (and much more) is available through the API. Simply pull that and then you can parse the JSON to get what you want (and more). Here's a quick script, though, to get the 11 players for each team.
You can find the API URL by using dev tools (Ctrl-Shift-I) and seeing what requests the browser makes (look at Network -> XHR in the side panel; you may need to click around to get it to make the request/call).
import requests

url = 'https://site.web.api.espn.com/apis/site/v2/sports/cricket/13266/summary'
payload = {
    'contentorigin': 'espn',
    'event': '439146',
    'lang': 'en',
    'region': 'gb',
    'section': 'cricinfo'}

jsonData = requests.get(url, params=payload).json()
roster = jsonData['rosters']

players = {}
for team in roster:
    players[team['team']['displayName']] = []
    for player in team['roster']:
        playerName = player['athlete']['displayName']
        players[team['team']['displayName']].append(playerName)
Output:
print (players)
{'West Indies': ['Chris Gayle', 'Andre Fletcher', 'Dwayne Bravo', 'Ramnaresh Sarwan', 'Narsingh Deonarine', 'Kieron Pollard', 'Darren Sammy', 'Nikita Miller', 'Jerome Taylor', 'Sulieman Benn', 'Kemar Roach'], 'South Africa': ['Graeme Smith', 'Loots Bosman', 'Jacques Kallis', 'AB de Villiers', 'Jean-Paul Duminy', 'Johan Botha', 'Alviro Petersen', 'Ryan McLaren', 'Roelof van der Merwe', 'Dale Steyn', 'Charl Langeveldt']}

Scraping data from a website with Infinite Scroll?

I am trying to scrape a website for titles as well as other items, but for the sake of brevity I'll focus on just the game titles.
I have tried using Selenium and Beautiful Soup in tandem to grab the titles, but I cannot seem to get all the September releases no matter what I do. In fact, I get some of the August game titles as well. I think it has to do with the fact that the page has no ending (infinite scroll). How would I grab just the September titles? Below is the code I used; I have tried to use scrolling, but I do not think I understand how to use it properly.
EDIT: My goal is to be able to eventually get each month by changing a few lines of code.
from selenium import webdriver
from bs4 import BeautifulSoup

titles = []

chromedriver = 'C:/Users/Chase The Great/Desktop/Podcast/chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.releases.com/l/Games/2019/9/')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()

soup = BeautifulSoup(res, 'lxml')
for title in soup.find_all(class_='calendar-item-title'):
    titles.append(title.text)
I expect to get 133 titles, but I get some August titles plus only part of the September titles, as such:
['SubaraCity', 'AER - Memories of Old', 'Vambrace: Cold Soul', 'Agent A: A Puzzle in Disguise', 'Bubsy: Paws on Fire!', 'Grand Brix Shooter', 'Legend of the Skyfish', 'Vambrace: Cold Soul', 'Obakeidoro!', 'Pokemon Masters', 'Decay of Logos', 'The Lord of the Rings: Adventure ...', 'Heave Ho', 'Newt One', 'Blair Witch', 'Bulletstorm: Duke of Switch Edition', 'The Ninja Saviors: Return of the ...', 'Re:Legend', 'Risk of Rain 2', 'Decay of Logos', 'Unlucky Seven', 'The Dark Pictures Anthology: Man ...', 'Legend of the Skyfish', 'Astral Chain', 'Torchlight II', 'Final Fantasy VIII Remastered', 'Catherine: Full Body', 'Root Letter: Last Answer', 'Children of Morta', 'Himno', 'Spyro Reignited Trilogy', 'RemiLore: Lost Girl in the Lands ...', 'Divinity: Original Sin 2 - Defini...', 'Monochrome Order', 'Throne Quest Deluxe', 'Super Kirby Clash', 'Himno', 'Post War Dreams', 'The Long Journey Home', 'Spice and Wolf VR', 'WRC 8', 'Fantasy General II', 'River City Girls', 'Headliner: NoviNews', 'Green Hell', 'Hyperforma', 'Atomicrops', 'Remothered: Tormented Fathers']
Seems to me that in order to get only September, first you want to grab only the section for September:
section = soup.find('section', {'class': 'Y2019-M9 calendar-sections'})
Then once you fetch the section for September, get all the titles, which are in <a> tags, like this:
for title in section.find_all('a', {'class': ' calendar-item-title subpage-trigg'}):
    titles.append(title.text)
Please note that none of the above has been tested.
UPDATE:
The problem is that every time you load the page, it gives you only the very first section, which contains only 24 items; to access the rest you have to scroll down (infinite scroll).
If you open the browser developer tools, select Network and then XHR, you will notice that every time you scroll and load the next "page" there is a request with a URL similar to this:
https://www.releases.com/calendar/nextAfter?blockIndex=139&itemIndex=23&category=Games&regionId=us
My guess is that blockIndex is meant for the month and itemIndex is for each page loaded. If you are looking only for the month of September, blockIndex will always be 139 in that request; the challenge is to get the next itemIndex for the next page so you can construct your next request.
The next itemIndex will always be the last itemIndex of the previous request.
I made a script that does what you want using only BeautifulSoup. Use it at your own discretion; there are some constants that could be extracted dynamically, but I think this should give you a head start:
import json
import requests
from bs4 import BeautifulSoup

DATE_CODE = 'Y2019-M9'
LAST_ITEM_FIRST_PAGE = f'calendar-item col-xs-6 to-append first-item calendar-last-item {DATE_CODE}-None'
LAST_ITEM_PAGES = f'calendar-item col-xs-6 to-append calendar-last-item {DATE_CODE}-None'
INITIAL_LINK = 'https://www.releases.com/l/Games/2019/9/'
BLOCK = 139

titles = []

def get_next_page_link(div: BeautifulSoup):
    index = div['item-index']
    return f'https://www.releases.com/calendar/nextAfter?blockIndex={BLOCK}&itemIndex={index}&category=Games&regionId=us'

def get_content_from_requests(page_link):
    headers = requests.utils.default_headers()
    headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    req = requests.get(page_link, headers=headers)
    return BeautifulSoup(req.content, 'html.parser')

def scroll_pages(link: str):
    print(link)
    page = get_content_from_requests(link)
    for div in page.findAll('div', {'date-code': DATE_CODE}):
        item = div.find('a', {'class': 'calendar-item-title subpage-trigg'})
        if item:
            # print(f'TITLE: {item.getText()}')
            titles.append(item.getText())
    last_index_div = page.find('div', {'class': LAST_ITEM_FIRST_PAGE})
    if not last_index_div:
        last_index_div = page.find('div', {'class': LAST_ITEM_PAGES})
    if last_index_div:
        scroll_pages(get_next_page_link(last_index_div))
    else:
        print(f'Found: {len(titles)} Titles')
        print('No more pages to scroll finishing...')

scroll_pages(INITIAL_LINK)

with open(f'titles.json', 'w') as outfile:
    json.dump(titles, outfile)
If your goal is to use Selenium, I think the same principle applies, unless you have it scroll as it loads the page; a rough sketch of that follows below.
Replacing INITIAL_LINK, DATE_CODE and BLOCK accordingly will get you other months as well.
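Here is that rough (untested) Selenium sketch: it assumes the page keeps appending items as you scroll, and you would still need to filter to the September section as shown above:

import time
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome('C:/Users/Chase The Great/Desktop/Podcast/chromedriver.exe')
driver.get('https://www.releases.com/l/Games/2019/9/')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)                        # give the next block time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:        # nothing new loaded -> stop scrolling
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

titles = [t.text for t in soup.find_all(class_='calendar-item-title')]
print(len(titles))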
