I am importing links to boxscores from this webpage
http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html
This is how I am doing it now. I get the links from the first page.
url = 'http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
pages=[] # This grabs the links from the page
for a in boxscores:
pages.append(urllib2.urlopen(basepath + a['href']).read())
Then in a new window I do this.
newsoup = pages[1] # I am manually changing this every time
soup = BeautifulSoup(newsoup)
def _unpack(row, kind='td'):
return [val.text for val in row.findAll(kind)]
tables = soup('table')
linescore = tables[1]
linescore_rows = linescore.findAll('tr')
roadteamQ1 = float(_unpack(linescore_rows[1])[1])
roadteamQ2 = float(_unpack(linescore_rows[1])[2])
roadteamQ3 = float(_unpack(linescore_rows[1])[3])
roadteamQ4 = float(_unpack(linescore_rows[1])[4]) # add OT rows if ???
roadteamFinal = float(_unpack(linescore_rows[1])[-3])
hometeamQ1 = float(_unpack(linescore_rows[2])[1])
hometeamQ2 = float(_unpack(linescore_rows[2])[2])
hometeamQ3 = float(_unpack(linescore_rows[2])[3])
hometeamQ4 = float(_unpack(linescore_rows[2])[4]) # add OT rows if ???
hometeamFinal = float(_unpack(linescore_rows[2])[-3])
misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
roadteam = str(_unpack(misc_stats_rows[0])[0]).strip()
hometeam = str(_unpack(misc_stats_rows[0])[1]).strip()
datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()
year = 2012
from dateutil.parser import parse
parsedDate = parse(date)
date = parsedDate.replace(year)
month = parsedDate.month
day = parsedDate.day
modDate = str(day)+str(month)+str(year)
gameid = modDate + roadteam + hometeam
data = {'roadteam': [roadteam],
'hometeam': [hometeam],
'roadQ1': [roadteamQ1],
'roadQ2': [roadteamQ2],
'roadQ3': [roadteamQ3],
'roadQ4': [roadteamQ4],
'homeQ1': [hometeamQ1],
'homeQ2': [hometeamQ2],
'homeQ3': [hometeamQ3],
'homeQ4': [hometeamQ4]}
globals()["%s" % gameid] = pd.DataFrame(data)
df = pd.DataFrame.load('df')
df = pd.concat([df, globals()["%s" % gameid]])
df.save('df')
How can I automate this so I don't have to manually change newsoup = pages[1] manually and scrape all of the boxscores that are linked from the first url in one go. I am pretty new to python and lacking in some understanding of the basics.
So in the first code box you collect the pages
So in the second code box you have to loop this, if I understood it
for page in pages:
soup = BeautifulSoup(page)
# rest of the code here
Related
I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, meaning that it should be working. However whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is the code is finalizing on the last number and formatting that into the URL, whenever it should formatting each number into the URL instead.
I have tried looking at other people with similar issues but haven't been able to find a solution. I believe this may be a code formatting error but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
if last_out_str != "": # check to make sure the date field isn't empty
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
if last_out > after_date: # if check to see if the date is after last_out
address_links.append(url + '-full') #add adddress_links to the list, -full makes the link show all data
print(address_links)
for url in address_links: #loop through the urls in address_links list
r = s.get(url)
soup = bs(r.content, 'lxml')
ad2 = (soup.title.string) #grab the web title which is used for the filename
ad2 = ad2.replace('Dogecoin', '')
ad2 = ad2.replace('Address', '')
ad2 = ad2.replace('-', '')
filename = ad2.replace(' ', '')
sections = soup.find_all(class_='table-striped')
for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
oldprofit = section.find_all('td')[11].text #Get the profit
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
balance = section.find_all('td')[0].text #Get the wallet balance
amount_recieved = section.find_all('td')[3].text #Get amount recieved
ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from
# amount recieved
ins = ins.replace('ins', '')
ins = ins.replace(' ', '')
ins = float(ins)
first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
fr = first_recieved.replace('first', '')
fr = fr.replace(':', '')
fr = fr.replace(' ', '')
last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
lr = last_recieved.replace('last', '')
lr = lr.replace(':', '')
lr = lr.replace(' ', '')
amount_sent = section.find_all('td')[7].text #Get the amount sent
outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
outs = outs.replace('outs', '')
outs = outs.replace(' ', '')
outs = float(outs)
first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
fs = first_sent.replace('first', '') #clean up first outgoing transaction date
fs = fs.replace(':', '')
fs = fs.replace(' ', '')
last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
ls = ls.replace(':', '')
ls = ls.replace(' ', '')
dbalance = section.find_all('td')[0].select('b') #get the balance of doge
dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
for data in dbalance: #used to clean the text up
balance = data.text
for data1 in dusd: #used to clean the text up
usd = data1.text
# Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
goal = float(30000)
if profit < goal:
continue
#Select wallets with under 2000 transactions
trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
trans_limit = float(2000)
if trans > trans_limit:
continue
# Create Info Dataframe using the data from above
info = {
'Balance': [balance],
'USD Value': [usd],
'Wallet Profit': [profit],
'Amount Recieved': [amount_recieved],
'First Recieved': [fr],
'Last Recieved': [lr],
'Amount Sent': [amount_sent],
'First Sent': [fs],
'Last Sent': [ls],
}
gf = pd.DataFrame(info)
a = 'a'
if a:
df = \
pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
directory = '/Users/chris/Desktop/Files' #directory for the file to go to
file = f'{filename}.xlsx'
writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
with pd.ExcelWriter(writer) as writer:
df.to_excel(writer, sheet_name='transactions')
gf.to_excel(writer, sheet_name='info')
Check your indentation - In your question the loops are on the same level, so loop that make the requests is iterating over all the pages but results are never processed until iterating is done. That is why it only works for the last page.
Move your loops, that should handle the response and extract elements into your first loop:
...
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
...
for url in address_links:
...
I am trying to iterate through player seasons on NBA.com and pull shooting statistics after each click of the season dropdown menu. After each click, I get the error message "list index out of range" for:
headers = table[1].findAll('th')
It seems to me that the page doesn't load all the way before the source data is saved.
Looking at other similar questions, I have tried using an browser.implicitly_wait() for each loop, but I am still getting the same error. It also doesn't seem that the browser waits after more than the first iteration of the loop.
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
player_id = str(1629216)
url = 'https://www.nba.com/stats/player/' + player_id + "/shooting/"
browser = Chrome(executable_path='/usr/local/bin/chromedriver')
browser.get(url)
select = Select(browser.find_element_by_xpath('/html/body/main/div/div/div/div[4]/div/div/div/div/div[1]/div[1]/div/div/label/select'))
options = select.options
for index in range(0, len(options)):
select.select_by_index(index)
browser.implicitly_wait(5)
src = browser.page_source
parser = BeautifulSoup(src, "lxml")
table = parser.findAll("div", attrs = {"class":"nba-stat-table__overflow"})
headers = table[1].findAll('th')
headerlist = [h.text.strip() for h in headers[1:]]
headerlist = [a for a in headerlist if not '\n' in a]
headerlist.append('AST%')
headerlist.append('UAST%')
row_labels = table[1].findAll("td", {"class": "first"})
row_labels_list = [r.text.strip() for r in row_labels[0:]]
rows = table[1].findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
df = pd.DataFrame(data=player_stats, columns=headerlist, index = row_labels_list)
print(df)
I found my own answer. I used time.sleep(1) at the top of the loop to give the browser a second to load all the way. Without this delay, the pages source code did not have the appropriate table that I am scraping.
Responding to those who answered - I did not want to go the api route, but I have seen people scrape nba.com using that method. Table[1] is the correct table; just needed the source code a chance to load after the I loop through the season dropdown.
select.select_by_index(index)
time.sleep(1)
src = browser.page_source
parser = BeautifulSoup(src, "lxml")
table = parser.findAll("div", attrs = {"class":"nba-stat-table__overflow"})
headers = table[1].findAll('th')
headerlist = [h.text.strip() for h in headers[1:]]
headerlist = [a for a in headerlist if not '\n' in a]
headerlist.append('AST%')
headerlist.append('UAST%')
row_labels = table[1].findAll("td", {"class": "first"})
row_labels_list = [r.text.strip() for r in row_labels[0:]]
rows = table[1].findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
df = pd.DataFrame(data=player_stats, columns=headerlist, index = row_labels_list)
print(df)
I am working on a screen scraper to pull football statistics down from www.pro-football-reference.com. I'm currently scraping from the main player's stat page and then diving into their individual page with their statistics by year.
I was able to implement this process successfully with my first set of players (quarterbacks, using the Passing Table). However, when I attempted to re-create the process to get running back data, I am reciving an additional column in my data frame with the values "Unnamed: x_level_0". This is my first experience with HTML data so I'm not sure what piece I missed, I just assumed it would be the same code as the quarterbacks.
Below is the QB Code sample and the correct dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300
#Passing Data
r = requests.get(url+ '/years/' + str(year) + '/passing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_passing')
job_elems = results.find_all('tr')
df = []
LastNameList = []
FirstNameList = []
for i,row in enumerate(parsed_table.find_all('tr')[2:]):
dat = row.find('td', attrs={'data-stat': 'player'})
if dat != None:
name = dat.a.get_text()
print(name)
stub = dat.a.get('href')
#pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
#print(pos)
# grab this players stats
tdf = pd.read_html(url + stub)[1]
for k,v in tdf.iterrows():
#Scrape 2020 stats, if no 2020 stats move on
try:
FindYear=re.search(".*2020.*",v['Year'])
if FindYear:
#If Year for stats is current year append data to dataframe
#get Name data
fullName = row.find('td', {'class':'left'})['csk']
findComma = fullName.find(',',0,len(fullName))
lName = fullName[0:findComma]
fName = fullName[findComma + 1:len(fullName)]
LastNameList.append(lName)
FirstNameList.append(fName)
#get basic stats
df.append(v)
except:
pass
This output looks like the following:
Philip Rivers
Year 2020
Age 39
Tm IND
Pos qb
No. 17
G 1
GS 1
Below is the RB Code sample and the incorrect dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300
#Rushing Data
r = requests.get(url+ '/years/' + str(year) + '/rushing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_rushing')
job_elems = results.find_all('tr')
df = []
LastNameList = []
FirstNameList = []
for i,row in enumerate(parsed_table.find_all('tr')[2:]):
dat = row.find('td', attrs={'data-stat': 'player'})
if dat != None:
name = dat.a.get_text()
print(name)
stub = dat.a.get('href')
print(stub)
#pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
#print(pos)
# grab this players stats
tdf = pd.read_html(url + stub)[1]
for k,v in tdf.iterrows():
print(v)
#Scrape 2020 stats, if no 2020 stats move on
try:
FindYear=re.search(".*2020.*",v['Year'])
print('found 2020')
if FindYear:
#If Year for stats is current year append data to dataframe
#get Name data
fullName = row.find('td', {'class':'left'})['csk']
findComma = fullName.find(',',0,len(fullName))
lName = fullName[0:findComma]
fName = fullName[findComma + 1:len(fullName)]
LastNameList.append(lName)
FirstNameList.append(fName)
#get basic stats
df.append(v)
except:
pass
This output looks like the following:
Unnamed: 0_level_0 Year 2020
Unnamed: 1_level_0 Age 26
Unnamed: 2_level_0 Tm TEN
Unnamed: 3_level_0 Pos rb
Unnamed: 4_level_0 No. 22
Games G 1
GS 1
Rushing Rush 31
Yds 116
TD 0
An example URL where this data is pulled from is: https://www.pro-football-reference.com/players/J/JacoJo01.htm
And it is pulling the Rushing & Receiving. Is there something additional I need to be on the lookout for when it comes to parsing HTML?
I attempted to add index_col = 1 into my tdf = pd.read_html(url + stub)[1]. However, that just kind of grouped the two values into one column.
Any input on this would be greatly appreciated. If I can provide any further information, please let me know.
Thank you
You can try this code to parse the table passing for each player (Now I get the players from https://www.pro-football-reference.com/years/2020/passing.htm but you can pass any player URL to it:
import requests
from bs4 import BeautifulSoup
def scrape_player(player_name, player_url, year="2020"):
out = []
soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
row = soup.select_one('table#passing tr:has(th:contains("{}"))'.format(year))
if row:
tds = [player_name] + [t.text for t in row.select('th, td')]
headers = ['Name'] + [th.text for th in row.find_previous('thead').select('th')]
out.append(dict(zip(headers, tds)))
return out
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
print(player.text)
for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
all_data.append(data)
df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this csv:
EDIT: To parse Rushing&Receiving, you can use this script:
import requests
from bs4 import BeautifulSoup, Comment
def scrape_player(player_name, player_url, year="2020"):
out = []
soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
soup = BeautifulSoup(soup.select_one('#rushing_and_receiving_link').find_next(text=lambda t: isinstance(t, Comment)), 'html.parser')
row = soup.select_one('table#rushing_and_receiving tr:has(th:contains("{}"))'.format(year))
if row:
tds = [player_name] + [t.text for t in row.select('th, td')]
headers = ['Name'] + [th.text for th in row.find_previous('thead').select('tr')[-1].select('th')]
out.append(dict(zip(headers, tds)))
return out
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
print(player.text)
for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
all_data.append(data)
df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this CSV:
I'm quite new to python and have written a script using beautifulsoup to parse a website table. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data on the first page 8 times (number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup
first_year = 2020
last_year = 2020
for i in range(last_year-first_year+1):
year = str(first_year + i)
print("Running for year:", year)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID="+year).text
soup = BeautifulSoup(text, "html.parser")
options = soup.findAll("option")
opts = []
for option in options:
if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
opts.append({option["value"]: option.contents[0]})
for opt in opts:
for key, value in opt.items():
print("Doing option:", value)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
pages_soup = BeautifulSoup(text, "html.parser")
p = pages_soup.findAll("a")
pages = 8
if "&Page=" in str(p[-2]):
pages = int(p[-2].contents[0])
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(text, "html.parser")
tbody = pages_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))
Thankyou.
Try this..
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(parse, "html.parser")
tbody = p_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))
I developed this program to scrape newegg for ps4 prices. However I want to scrape multiple pages. Here is what I have but once it scrapes the first page the program stops. Basically I am trying to change the link so 'pages-1' changes to 2,3,4 etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv
page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
for info in soup.find_all('div', class_='item-container'):
prod = info.find('a', class_='item-title').text.strip()
price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
if u'$' not in price:
price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
ship = info.find('li', class_='price-ship').text.strip()
print(prod)
print(price)
print(ship)
csv_writer.writerow([prod, price, ship])
prod_num += 1
if prod_num > 35: #there is about 35 items per newegg page
page_num += 1
# print(price.splitlines()[1])
print('-----------')
csv_file.close()
i found the page limit num here
and i think you can get the page limit by xpath or other ways:
# xpath syntax may like this
# //span[#class='list-tool-pagination-text']
hope it's useful for you
If you noticed, Next "button" tag of last page has attribute "disabled", So [tag_name].has_attr('disabled') return True . Using this you can manage pagination.
import requests
from bs4 import BeautifulSoup
import csv
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2
while len(url):
PAGE_NO+=1
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
all_divs = soup.find_all('div', attrs={'class':'item-info'})
for item in all_divs:
prod = ""
price = ""
ship = ""
# get product name
prod = item.find('a', attrs={'class':'item-title'})
if prod:
prod = prod.text.strip()
# get price
price_part = item.find('li', attrs={'class':'price-current'})
if price_part:
price_part1 = price_part.strong
if price_part1:
price_part1 = price_part1.text.strip()
price_part2 = price_part.sup
if price_part2:
price_part2 = price_part2.text.strip()
if price_part1 and price_part2:
price = price_part1 + price_part2
# get shipping info
ship = item.find('li', attrs={'class':'price-ship'})
if ship:
ship = ship.text.strip()
csv_writer.writerow([prod, price, ship])
# manage pagination
next_button = soup.find('button', attrs={'title': 'Next'})
if not(next_button.has_attr('disabled')):
url = URL_PART1 + str(PAGE_NO) + URL_PART2
else:
url = ""