Scraping multiple pages with bs4 Beautiful Soup - only scrapes the first page - python

*** My code is for practice only!
I'm trying to scrape the names and teams that each player in FPL from their website https://www.premierleague.com/ and I got some problems with the code.
The problem is it's only getting the page with the '-1' in the end of the url, wihch I haven't even inculded in my pages list!
there isn't any logic with the pages - the basic url is https://www.premierleague.com/players?se=363&cl= while the number after the '=' seems to be random. so I created a list of the numbers and added it to the url with a for loop:
my code:
import requests
from bs4 import BeautifulSoup
import pandas
plplayers = []
pl_url = 'https://www.premierleague.com/players?se=363&cl='
pages_list = ['1', '2', '131', '34']
for page in pages_list:
r = requests.get(pl_url + page)
c = r.content
soup = BeautifulSoup(c, 'html.parser')
player_names = soup.find_all('a', {'class': 'playerName'})
for x in player_names:
player_d = {}
player_teams = []
player_href = x.get('href')
player_info_url = 'https://www.premierleague.com/' + player_href
player_r = requests.get(player_info_url, headers=headers)
player_c = player_r.content
player_soup = BeautifulSoup(player_c, 'html.parser')
team_tag = player_soup.find_all('td', {'class': 'team'})
for team in team_tag:
try:
team_name = team.find('span', {'class': 'long'}).text
if '(Loan)' in team_name:
team_name.replace(' (Loan) ', '')
if team_name not in player_teams:
player_teams.append(team_name)
player_d['NAME'] = x.text
player_d['TEAMS'] = player_teams
except:
pass
plplayers.append(player_d)
df = pandas.DataFrame(plplayers)
df.to_csv('plplayers.txt')

I would comment this but I'm new and don't have enough reputation this so I'll have to keep it in an answer.
It looks like when you made a request to store in player_r you specified a headers parameter but didn't actually make a headers variable.
If you replace player_r = requests.get(player_info_url, headers=headers)with player_r = requests.get(player_info_url) instead, your code should run perfectly. At least, it did on my machine.

Related

How to scrape embedded integers on a website

I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to workout a way of reliably identifying and scraping the relationship between the dataset title and the like integer:
as it is embedded in the HTML as below:
I have used a scraper previously to get information about the resource urls. In that case I was able to capture the last child a of parent h3 with a parent having class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the url scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
r = s.get(url)
soup = bs(r.content, 'lxml')
base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
return links
results = []
#debug = []
with requests.Session() as s:
for page in range(1,2): #set number of pages
links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
for link in links:
data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
json_api_links.append(data)
#debug.append((link, data))
resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set
for link in resources:
try:
r = s.get(link).json() #entire package info
data_sets.append(r)
title = r['result']['title'] #certain items
if 'resources' in r['result']:
urls = ' , '.join([item['url'] for item in r['result']['resources']])
else:
urls = 'N/A'
except:
title = 'N/A'
urls = 'N/A'
results.append((title, urls))
with open('data.csv','w', newline='') as f:
w = csv.writer(f)
w.writerow(['Title','Resource Url'])
for row in results:
w.writerow(row)
My desired output would appear like this:
The approach is pretty straight forward. Your given website contains required elements in a list Tag. And what you need to do, is to get source code of that <li> tag, and just fetch Heading, which has a certain class and Same goes for like count.
The catch in like count is, the text comprises of some noise. To fix that, you can use regular expression to extract digits ('\d+') from given input of likes count. Following code gives desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd
source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text,'lxml')
element = sp.find_all('li',{'class':"dataset-item"})
heading = []
likeList = []
for i in element:
try:
header = i.find('a',{'class':"searchpartnership-url-analytics"})
heading.append(header.text)
except:
header = i.find('a')
heading.append(header.text)
like = i.find('span',{'id':'likes-count'})
likeList.append(re.findall('\d+',like.text)[0])
dict = {'Title': heading, 'Likes': likeList}
df = pd.DataFrame(dict,index=False)
print(df)
Hope it helped!
You could use the following.
I am using a css selector with Or syntax to retrieve title and likes as one list (as every publication has both). I then use slicing to separate titles from likes.
from bs4 import BeautifulSoup as bs
import requests
import csv
def get_titles_and_likes(s, url, css_selector):
r = s.get(url)
soup = bs(r.content, 'lxml')
info = [item.text.strip() for item in soup.select(css_selector)]
titles = info[::2]
likes = info[1::2]
return list(zip(titles,likes))
results = []
with requests.Session() as s:
for page in range(1,10): #set number of pages
data = get_titles_and_likes(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count')
results.append(data)
results = [i for item in results for i in item]
with open(r'data.csv','w', newline='') as f:
w = csv.writer(f)
w.writerow(['Title','Likes'])
for row in results:
w.writerow(row)

I can't browse the pages Beautifulsoup

I am a beginner in Web-scraping and I am following this tutorial to extract movie data from this link, I chose to extract movies between 2016 and 2019 for the test. I get just 25 lines but I want more than 30000.
Do you think it's possible ?
this is the code :
from requests import get
from bs4 import BeautifulSoup
import csv
import pandas as pd
from time import sleep
from random import randint
from time import time
from IPython.core.display import clear_output
headers = {"Accept-Language": "en-US, en;q=0.5"}
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]
url = 'https://www.imdb.com/search/title?release_date=2016-01-01,2019-05-01'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
start_time = time()
requests = 0
for year_url in years_url:
# For every page in the interval 1-4
for page in pages:
# Make a get request
response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)
# Pause the loop
sleep(randint(8,15))
# Monitor the requests
requests += 1
elapsed_time = time() - start_time
print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
clear_output(wait = True)
# Throw a warning for non-200 status codes
if response.status_code != 200:
warn('Request: {}; Status code: {}'.format(requests, response.status_code))
# Break the loop if the number of requests is greater than expected
if requests > 72:
warn('Number of requests was greater than expected.')
# Parse the content of the request with BeautifulSoup
page_html = BeautifulSoup(response.text, 'html.parser')
# Select all the 50 movie containers from a single page
mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
# Extract data from individual movie container
for container in movie_containers:
# If the movie has Metascore, then extract:
if container.find('div', class_ = 'ratings-metascore') is not None:
# The name
name = container.h3.a.text
names.append(name)
# The year
year = container.h3.find('span', class_ = 'lister-item-year').text
years.append(year)
# The IMDB rating
imdb = float(container.strong.text)
imdb_ratings.append(imdb)
# The Metascore
m_score = container.find('span', class_ = 'metascore').text
metascores.append(int(m_score))
# The number of votes
vote = container.find('span', attrs = {'name':'nv'})['data-value']
votes.append(int(vote))
movie_ratings = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})
#data cleansing
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()
movie_ratings['year'].unique()
movie_ratings.to_csv('movie_ratings.csv')
Start by double checking your indentation through out (in fact - naughty naughty - it is wrong in that tutorial. I am guessing it wasn't properly proof read after publishing and the code has wrongly been left aligned repeatedly).
To illustrate, you currently have something like (reduced lines of code shown)
for year_url in years_url:
for page in pages:
response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)
page_html = BeautifulSoup(response.text, 'html.parser')
Your indentation means, if code runs at all, you are only working with last url you intended to visit in terms of actual html parsing.
It should be:
for year_url in years_url:
for page in pages:
response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)
page_html = BeautifulSoup(response.text, 'html.parser')
Indentation gives meaning in python.
https://docs.python.org/3/reference/lexical_analysis.html?highlight=indentation
Leading whitespace (spaces and tabs) at the beginning of a logical
line is used to compute the indentation level of the line, which in
turn is used to determine the grouping of statements.
It's hard to tell exactly what the issue is here because of the lack of functions but from what I see, you need to parse each page separately.
After every request, you need to parse the text. However, I suspect the main issue is the ordering of your code, I would suggest using functions.

How do I simultaneously scrape two pages and produce two distinct lists within one nested 'for-loop'?

I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages need to end up on distinctly named lists.
To explain with example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]
for url in urls:
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
for cat_list in categories:
cat_list.append(player_name)
print(ws48_list)
print(ws_list)
This ends up printing two identical lists when I was shooting for 2 lists unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists. Just create new ones. Make a function to do the scrape and pass each url in turn to it.
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
def parse_page(url, headers={}):
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
return [a.text for a in section.find_all('a')]
ws_list, ws48_list = [parse_page(url) for url in urls]
print('ws_list = %r' % ws_list)
print('ws8_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
for i, url in enumerate(urls):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
categories[i].append(player_name)
print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs
def scrape(url):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
names = []
for a in section.find_all('a'):
player_name = a.text
names.append(player_name)
return names
ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')
print(ws_list)
print(ws48_list)

How do I scrape data from multiple webpages with BeauitfulSoup?

I have a problem with the following code and I am sorry, I am new to this all, I want to add the strings in the FullPage list to the actual URL and then I want to visit them and scrape some data from the pages. So far, It has been good but I do not know how to make it visit the other links in the list.
The output will only give me the data of one page but I need the data for 30 pages, how can I make this program to go over each link?
The URL has a pattern, the first part has 'http://arduinopak.com/Prd.aspx?Cat_Name=' and then the second part has the product category name.
import urllib2
from bs4 import BeautifulSoup
FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
'Robotics-and-Copters']
urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]
for n in FullPage:
URL = urlp1 + n
page = urllib2.urlopen(URL)
bsObj = BeautifulSoup(page, "html.parser")
descList = bsObj.findAll('div', attrs={"class": "panel-default"})
for desc in descList:
print(desc.getText(separator=u' '))
import urllib2
from bs4 import BeautifulSoup
FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
'Robotics-and-Copters']
urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]
for n in FullPage:
URL = urlp1 + n
page = urllib2.urlopen(URL)
bsObj = BeautifulSoup(page, "html.parser")
descList = bsObtTj.findAll('div', attrs={"class": "panel-default"})
for desc in descList:
print(desc.geext(separator=u' '))
If you want to scape each links then moving last 3 lines of your code into loop will do it.
Your current code fetches all the links but it stores only one BeautifulSoup object reference. You could instead store them all in the array or process them before visiting another URL (as shown below).
for n in FullPage:
URL = urlp1 + n
page = urllib2.urlopen(URL)
bsObj = BeautifulSoup(page, "html.parser")
descList = bsObj.findAll('div', attrs={"class": "panel-default"})
for desc in descList:
print(desc.getText(separator=u' '))
Also, note that the names using PascalCase are by convention reserved for classes. FullPage would usually be written as fullPage or FULL_PAGE if it's meant to be constant.

How can I make the output with pair of list : content in my python code?

I have been developing a python web-crawler for this website. I made two functions, which works well as separately.
One is to collect the list of stocks and
Another is to collect the content data of each list.
I would like to make the output of my code with pairs of
"list#1/content#1",
"list#2/content#2",
"list#3/content#3",
What needs to be modified in my code in order to achieve this?
Thanks.
from bs4 import BeautifulSoup
import urllib.request
CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
BASE_PAGE = 'http://www.bobaedream.co.kr'
def fetch_post_list():
for i in range(20,21):
URL = CAR_PAGE_TEMPLATE + str(i)
res = urllib.request.urlopen(URL)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='cyber')
#print ("Page#", i)
# 50 lists per each page
lists=table.find_all('tr', itemtype="http://schema.org/Article")
count=0
for lst in lists:
if lst.find_all('td')[3].find('em').text:
lst_price=lst.find_all('td')[3].find('em').text
lst_title=lst.find_all('td')[1].find('a').text
lst_link = lst.find_all('td')[1].find('a')['href']
lst_photo_url=''
if lst.find_all('td')[0].find('img'):
lst_photo_url = lst.find_all('td')[0].find('img')['src']
count+=1
else: continue
#print('#',count, lst_title, lst_photo_url, lst_link, lst_price)
return lst_link
def fetch_post_content(lst_link):
URL = BASE_PAGE + lst_link
res = urllib.request.urlopen(URL)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')
#Basic Information
table = soup.find('div', class_='rightarea')
# Number, Year, Mileage, Gas Type, Color, Accident
content_table1 = table.find_all('div')[0]
dds = content_table1.find_all('dd')
for dd in dds:
car_span_t = dd.find_all('span', {'class': 't'})[0]
car_span_s = dd.find_all('span', {'class': 's'})[0]
#print(car_span_t.text, ':', car_span_s.text)
# Seller Information
content_table2 = table.find_all('div')[1]
dds2 = content_table2.find_all('dd')
for dd2 in dds2:
seller_span_t = dd.find_all('span', {'class': 't'})[0]
seller_span_s = dd.find_all('span', {'class': 's'})[0]
#print(seller_span_t.text, ':', seller_span_s.text)
return dds

Categories

Resources