How to scrape websites with Python and beautiful soup - python

I am trying to scrape results from the bbc sport website. I've got the scores working but when trying to add team names the program prints out none 1-0 none (for example). This is the code:
from bs4 import BeautifulSoup
import urllib.request
import csv
url = 'http://www.bbc.co.uk/sport/football/teams/derby-county/results'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page)
for match in soup.select('table.table-stats tr.report'):
team1 = match.find('span', class_='team-home')
team2 = match.find('span', class_='team-away')
score = match.abbr
print(team1.string, score.string, team2.string)

It looks like you are searching for tags that are not there. For instance class_="team-home teams" is in the html, but class_='team-home' is not. The following code prints the first team name:
tables = soup.find_all("table", class_="table-stats")
tables[0].find("span", class_="team-home teams").text
# u' Birmingham '

Here is a possible solution which gets the home and away team names, the final score, the match date and the competition name via BeautifulSoup and puts it in a DataFrame.
import requests
import pandas as pd
from bs4 import BeautifulSoup
#Get the relevant webpage set the data up for parsing
url = "http://www.bbc.co.uk/sport/football/teams/derby-county/results"
r = requests.get(url)
soup=BeautifulSoup(r.content,"lxml")
#set up a function to parse the "soup" for each category of information and put it in a DataFrame
def get_match_info(soup,tag,class_name,column_name):
info_array=[]
for info in soup.find_all('%s'%tag,attrs={'class':'%s'%class_name}):
info_array.append({'%s'%column_name:info.text})
return pd.DataFrame(info_array)
#for each category pass the above function the relevant information i.e. tag names
date = get_match_info(soup,"td","match-date","Date")
home_team = get_match_info(soup,"span","team-home teams","Home Team")
score = get_match_info(soup,"span","score","Score")
away_team = get_match_info(soup,"span","team-away teams","Away Team")
competition = get_match_info(soup,"td","match-competition","Competition")
#Concatenate the DataFrames to present a final table of all the above info
match_info = pd.concat([date,home_team,score,away_team,competition],ignore_index=False,axis=1)
print match_info

Related

I am trying to give path to the variable so that I can scrape the information contained in that path. But, I m getting empty list

I am trying to make web scraper using Python and the basic concept I am using here is,
create empty list --> use 'for loop' to loop through the element on the web page. --> append that info in the empty list --> convert that list to row and column using pandas --> finally to a csv.
the code that I made is
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.imdb.com/find?q=top+1000+movies&ref_=nv_sr_sm"
results=requests.get(url,headers=headers)
soup=BeautifulSoup(results.text,"html.parser")
# print(soup.prettify())
#initializing empty lists where the data will go
titles =[]
years = []
times = []
imdb_rating = []
metascores = []
votes = []
us_gross = []
movie_div = soup.find_all('div',class_='lister-list')
#initiating the loop for scraper
for container in movie_div:
#tiles
name=container.tr.td.a.text
titles.append(name)
print(titles)
the website I want to scrap is 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'. I need help to know how can i give correct path to the variable 'name', so that i can extract the name of the movie given in name_of_movei, in the HTML script of the page. Because each time I am getting output as empty list.
This example will parse name, year, rating from the table and creates a dataframe from it:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.imdb.com/chart/top/"
headers = {"Accept-Language": "en-US, en;q=0.5"}
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
all_data = []
for row in soup.select(".lister-list > tr"):
name = row.select_one(".titleColumn a").text.strip()
year = row.select_one(".titleColumn .secondaryInfo").text.strip()
rating = row.select_one(".imdbRating").text.strip()
# ...other variables
all_data.append([name, year, rating])
df = pd.DataFrame(all_data, columns=["Name", "Year", "Rating"])
print(df.head().to_markdown(index=False))
Prints:
Name
Year
Rating
The Shawshank Redemption
(1994)
9.2
The Godfather
(1972)
9.2
The Dark Knight
(2008)
9
The Godfather: Part II
(1974)
9
12 Angry Men
(1957)
8.9

How to put exception while scraping data off the specific tag with Beautiful Soup?

I'm scraping the data off multiple websites. I need to scrape the list of names and heights of the players in the team. However the tag that I use seems to identify names of players AND coaches. I figured I need to maybe put an exception that signals not to scrape the names of coaches. What am I doing wrong?
Coaches names tag found under - class="sidearm-roster-coach-name
Here's the snippet of my code
import requests
from bs4 import BeautifulSoup
import pandas as pd
urls = ['https://www.brooklyncollegeathletics.com/sports/womens-swimming-and-diving/roster',
'https://athletics.baruch.cuny.edu/sports/womens-swimming-and-diving/roster',
'https://queensknights.com/sports/womens-swimming-and-diving/roster']
df_list = list() ##list to prevent dataframe from resetting on each iteration
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
height_swimmers = soup.findAll('span', class_ = "sidearm-roster-player-height")
first_name_data = soup.findAll('span', class_ = "sidearm-roster-player-first-name")
last_name_data = soup.findAll('span', class_ = "sidearm-roster-player-last-name")
first_name = []
last_name = []
height = []
hello i think you need something like this I tested it and it doesn't print coach names
import requests
from bs4 import BeautifulSoup
import pandas as pd
urls = ['https://www.brooklyncollegeathletics.com/sports/womens-swimming-and-diving/roster',
'https://athletics.baruch.cuny.edu/sports/womens-swimming-and-diving/roster',
'https://queensknights.com/sports/womens-swimming-and-diving/roster']
df_list = list() ##list to prevent dataframe from resetting on each iteration
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
player_full_names_container = soup.find_all('div', class_="sidearm-roster-player-name") # container for player names
height_swimmers = soup.findAll('span', class_ = "sidearm-roster-player-height")
player_full_names = [name.find("h3").get_text().strip() for name in player_full_names_container] # List of player Full names DONT INCLUDE COACH NAMES
coaches_container = soup.findAll("div", class_="sidearm-roster-coach-name") # coaches container section to check if person with given name is coach or not
coach_names = [name.find("p").get_text().strip() for name in coaches_container] # List of Coach names
for name in player_full_names:
print(name)
print("--------------")

Empty Dataframe when scraping specific column from website

I wanted to try to scrape some specific columns (Company details column) in the CNBC Nasdaq 100 website specifically the Adobe stocks, below is the snippet of my code
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
def get_company_info(url):
original_url = url
key = {}
l = []
page_response = requests.get(url, timeout=240)
page_content = BeautifulSoup(page_response.content, "html.parser")
name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
description = page_content.find_all('div',{"class":"moduleBox"})
for items in description:
for i in range(len(items.find_all("tr"))-1):
# Gather data
key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
shares = items.find_all("td").find("table",attrs={"id":"shares"})
for rest_of_items in shares:
for i in range(len(items.find_all("tr"))-1):
key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
# Print ("")
l.append(key)
key['name'] = name
df = pd.DataFrame(l)
print(df)
return key, df
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in dataframe so that I can change to CSV file, but my code keep showing empty dataframe result, Below are the error shown
The result I wanted is something like this
The information you are looking for is not available in the url you requested. This is because the information is fetched by the page using a JavaScript. Which in turn requests a different URL which provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
You can create dataframe and export to csv.

Python 3 Scrape yellow Pages

Im trying to scrape the data off of yellowpages but am running into where I can't get the text of each business name and address/phone. I'm using the code below, where am I going wrong? I'm trying to print the text of each business but only printing it out for the sake of seeing it right now as I test but once I'm done then Im going to save the data to csv.
import csv
import requests
from bs4 import BeautifulSoup
#dont worry about opening this file
"""with open('cities_louisiana.csv','r') as cities:
lines = cities.read().splitlines()
cities.close()"""
for city in lines:
print(city)
url = "http://www.yellowpages.com/search? search_terms=businesses&geo_location_terms=amite+LA&page="+str(count)
for city in lines:
for x in range (0, 50):
print("http://www.yellowpages.com/search?search_terms=businesses&geo_location_terms=amite+LA&page="+str(x))
page = requests.get("http://www.yellowpages.com/search?search_terms=businesses&geo_location_terms=amite+LA&page="+str(x))
soup = BeautifulSoup(page.text, "html.parser")
name = soup.find_all("div", {"class": "v-card"})
for name in name:
try:
print(name.contents[0]).find_all(class_="business-name").text
#print(name.contents[1].text)
except:
pass
You should iterate over search results, then, for every search result locate the business name (the element with the "business-name" class) and the address (the element with the "adr" class):
for result in soup.select(".search-results .result"):
name = result.select_one(".business-name").get_text(strip=True, separator=" ")
address = result.select_one(".adr").get_text(strip=True, separator=" ")
print(name, address)
.select() and .select_one() are handy CSS selector methods.

How can I retrieve the price on airbnb using beautifulsoup to scrape?

I'd like to scrape airbnb's listings by city (for the 5 cities listed in the code) and would like to gather information such as: price, a link to the listing, room type, # of guests, etc.
I was able to get the link, but I'm having trouble getting the price.
from bs4 import BeautifulSoup
import requests
import csv
from urllib.parse import urljoin # For joining next page url with base url
from datetime import datetime # For inserting the current date and time
start_url_nyc = "https://www.airbnb.com/s/New-York--NY--United-States"
start_url_mia = "https://www.airbnb.com/s/Miami--FL--United-States"
start_url_la = "https://www.airbnb.com/s/Los_Angeles--CA--United-States"
start_url_sf = "https://www.airbnb.com/s/San_Francisco--CA--United-States"
start_url_orl = "https://www.airbnb.com/s/Orlando--FL--United-States"
def scrape_airbnb(url):
# Set up the URL Request
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
# Iterate over search results
for search_result in soup.find_all('div', 'infoContainer_tfq3vd'):
# Parse the name and price and record the time
link_end = search_result.find('a').get('href')
link = "https://www.airbnb.com" + link_end
price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
return (price)
print(scrape_airbnb(start_url_orl))
This is the html code:
<span data-pricerate="true" data-reactid=".91165im9kw.0.2.0.3.2.1.0.$0.$grid_0.$0/=1$=01$16085565.$=1$16085565.0.2.0.1.0.0.0.1:1">552</span>
This is your code
price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
first:
Some attributes, like the data-* attributes in HTML 5, have names that can’t be used as the names of keyword arguments:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
You can use these attributes in searches by putting them into a
dictionary and passing the dictionary into find_all() as the attrs
argument:
data_soup.find_all(attrs={"data-foo": "value"})
# [<div data-foo="value">foo!</div>]
than:
price = search_result.find('span', attrs={"data-pricerate":"true"})
this will return a span tag which contains price as string, just use .text
price = search_result.find('span', attrs={"data-pricerate":"true"}).text

Categories

Resources