I am working on a screen scraper to pull football statistics down from www.pro-football-reference.com. I'm currently scraping from the main player's stat page and then diving into their individual page with their statistics by year.
I was able to implement this process successfully with my first set of players (quarterbacks, using the Passing Table). However, when I attempted to re-create the process to get running back data, I received an additional column in my data frame with the values "Unnamed: x_level_0". This is my first experience with HTML data, so I'm not sure what piece I missed; I just assumed the same code would work as it did for the quarterbacks.
Below is the QB Code sample and the correct dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
# Scrape the season passing table, then open every listed player's own page
# and keep the stat rows that belong to the requested season.
p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300

# Passing Data
r = requests.get(url + '/years/' + str(year) + '/passing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_passing')
job_elems = results.find_all('tr')

df = []
LastNameList = []
FirstNameList = []

# The first two <tr> elements of the table are skipped.
for row in parsed_table.find_all('tr')[2:]:
    dat = row.find('td', attrs={'data-stat': 'player'})
    if dat is None:
        # rows without a player cell carry no link to follow
        continue
    name = dat.a.get_text()
    print(name)
    stub = dat.a.get('href')
    # grab this player's stats: the second table pandas finds on the page
    tdf = pd.read_html(url + stub)[1]
    for _, v in tdf.iterrows():
        # Keep only rows of the requested season; skip everything else.
        try:
            season = v['Year']
        except KeyError:
            # this table has no 'Year' column at all
            continue
        # str() lets the check work for numeric and missing cells as well,
        # and `year` is used instead of a hard-coded "2020".
        if str(year) not in str(season):
            continue
        # get name data: the 'csk' attribute holds "Last,First"
        try:
            fullName = row.find('td', {'class': 'left'})['csk']
        except (KeyError, TypeError):
            # no such cell, or the cell has no 'csk' attribute
            continue
        lName, _, fName = fullName.partition(',')
        LastNameList.append(lName)
        FirstNameList.append(fName)
        # get basic stats
        df.append(v)
This output looks like the following:
Philip Rivers
Year 2020
Age 39
Tm IND
Pos qb
No. 17
G 1
GS 1
Below is the RB Code sample and the incorrect dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
# Same flow as the passing scraper, pointed at the rushing table: list the
# players of the season, open each player page and collect the 2020 rows.
p = 1
maxp = 300
year = 2020
url = 'https://www.pro-football-reference.com'

# Rushing Data
r = requests.get('{}/years/{}/rushing.htm'.format(url, year))
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_rushing')
job_elems = results.find_all('tr')

df, LastNameList, FirstNameList = [], [], []

for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    dat = row.find('td', attrs={'data-stat': 'player'})
    if dat is None:
        continue
    player_link = dat.a
    name = player_link.get_text()
    print(name)
    stub = player_link.get('href')
    print(stub)
    # grab this player's stats: second table pandas finds on the page
    tdf = pd.read_html(url + stub)[1]
    for k, v in tdf.iterrows():
        print(v)
        # Scrape 2020 stats, if no 2020 stats move on
        try:
            # NOTE(review): the printed rows show two-level labels
            # ("Unnamed: 0_level_0", "Year"); presumably v['Year'] does not
            # resolve against them and the error is swallowed below - confirm.
            FindYear = re.search(".*2020.*", v['Year'])
            # printed whenever the search did not raise, match or not
            print('found 2020')
            if not FindYear:
                continue
            # row is for the current year: record the name parts
            fullName = row.find('td', {'class': 'left'})['csk']
            findComma = fullName.find(',')
            lName = fullName[:findComma]
            fName = fullName[findComma + 1:]
            LastNameList.append(lName)
            FirstNameList.append(fName)
            # get basic stats
            df.append(v)
        except:
            pass
This output looks like the following:
Unnamed: 0_level_0 Year 2020
Unnamed: 1_level_0 Age 26
Unnamed: 2_level_0 Tm TEN
Unnamed: 3_level_0 Pos rb
Unnamed: 4_level_0 No. 22
Games G 1
GS 1
Rushing Rush 31
Yds 116
TD 0
An example URL where this data is pulled from is: https://www.pro-football-reference.com/players/J/JacoJo01.htm
And it is pulling the Rushing & Receiving. Is there something additional I need to be on the lookout for when it comes to parsing HTML?
I attempted to add index_col = 1 into my tdf = pd.read_html(url + stub)[1]. However, that just kind of grouped the two values into one column.
Any input on this would be greatly appreciated. If I can provide any further information, please let me know.
Thank you
You can try this code to parse the passing table for each player (for now I get the players from https://www.pro-football-reference.com/years/2020/passing.htm, but you can pass any player URL to it):
import requests
from bs4 import BeautifulSoup
def scrape_player(player_name, player_url, year="2020"):
    """Return the passing-table row of *year* for one player.

    The page at *player_url* is downloaded and the first row of
    ``table#passing`` whose header cell contains *year* is selected.
    The result is a list holding one dict that maps column header text to
    cell text (with the player name under ``'Name'``), or an empty list when
    no such row exists.
    """
    response = requests.get(player_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    selector = 'table#passing tr:has(th:contains("{}"))'.format(year)
    row = soup.select_one(selector)
    if not row:
        return []

    # cell texts, prefixed with the player name
    tds = [player_name]
    tds.extend(cell.text for cell in row.select('th, td'))

    # header texts come from the <thead> preceding the matched row
    headers = ['Name']
    headers.extend(th.text for th in row.find_previous('thead').select('th'))

    return [dict(zip(headers, tds))]
# Walk every player linked from the season passing table, collect their
# season rows and dump the result to data.csv.
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
    print(player.text)
    # scrape_player returns a list, so it can be appended wholesale
    all_data.extend(
        scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href'])
    )

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this csv:
EDIT: To parse Rushing&Receiving, you can use this script:
import requests
from bs4 import BeautifulSoup, Comment
def scrape_player(player_name, player_url, year="2020"):
    """Return the rushing & receiving row of *year* for one player.

    The table markup is taken from the first HTML comment that follows the
    ``#rushing_and_receiving_link`` element and parsed separately. The first
    row of ``table#rushing_and_receiving`` whose header cell contains *year*
    is returned as a one-element list of ``{header text: cell text}`` dicts,
    with the player name under ``'Name'``.

    An empty list is returned when the link element, the comment, or the
    season row is missing, so pages without this table are skipped instead
    of raising.
    """
    soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')

    anchor = soup.select_one('#rushing_and_receiving_link')
    if anchor is None:
        return []

    commented_markup = anchor.find_next(text=lambda t: isinstance(t, Comment))
    if commented_markup is None:
        return []

    table_soup = BeautifulSoup(commented_markup, 'html.parser')
    row = table_soup.select_one(
        'table#rushing_and_receiving tr:has(th:contains("{}"))'.format(year)
    )
    if row is None:
        return []

    tds = [player_name] + [t.text for t in row.select('th, td')]
    # only the last header row is used for the column names
    header_row = row.find_previous('thead').select('tr')[-1]
    headers = ['Name'] + [th.text for th in header_row.select('th')]
    return [dict(zip(headers, tds))]
# Players are taken from the season passing table; each player page is then
# scraped with scrape_player and everything is written to data.csv.
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
player_links = soup.select('table#passing [data-stat="player"] a')
for player in player_links:
    print(player.text)
    player_rows = scrape_player(
        player.text,
        'https://www.pro-football-reference.com' + player['href'],
    )
    for data in player_rows:
        all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this CSV:
Related
I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
# Request each ranking page in turn; the page number is the URL suffix.
for page in range(4, 7):
    page_url = f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html'
    r = s.get(page_url)
    print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, meaning that it should be working. However whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is that the code is finalizing on the last number and formatting only that into the URL, when it should be formatting each number into the URL instead.
I have tried looking at other people with similar issues but haven't been able to find a solution. I believe this may be a code formatting error but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# Collect wallet links from the "top 100 richest Dogecoin addresses" pages,
# then export every wallet that passes the profit / transaction-count filters
# to its own .xlsx workbook.
# NOTE(review): the indentation of this script was lost in the paste, so the
# nesting of the loops below cannot be read off the text.
# cut-off date for the "last out" column: 1 Jan 2021
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
# NOTE(review): `r` is reassigned on every pass; if the parsing below is not
# nested inside this loop, only the last page's response is ever processed.
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
if last_out_str != "": # check to make sure the date field isn't empty
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
if last_out > after_date: # keep only wallets whose last-out date is after the cut-off
address_links.append(url + '-full') #add the link to address_links, -full makes the link show all data
print(address_links)
for url in address_links: #loop through the urls in address_links list
r = s.get(url)
soup = bs(r.content, 'lxml')
# the page title, stripped of 'Dogecoin', 'Address', '-' and spaces, becomes the file name
ad2 = (soup.title.string) #grab the web title which is used for the filename
ad2 = ad2.replace('Dogecoin', '')
ad2 = ad2.replace('Address', '')
ad2 = ad2.replace('-', '')
filename = ad2.replace(' ', '')
sections = soup.find_all(class_='table-striped')
for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
# profit: text of the 12th <td>, with 'USD', spaces and commas removed
oldprofit = section.find_all('td')[11].text #Get the profit
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
balance = section.find_all('td')[0].text #Get the wallet balance
amount_recieved = section.find_all('td')[3].text #Get amount received
# the number of incoming transactions is the text between the parentheses
ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from
# amount received
ins = ins.replace('ins', '')
ins = ins.replace(' ', '')
ins = float(ins)
first_recieved = section.find_all('td')[4].text #Get the date of the first incoming transaction
fr = first_recieved.replace('first', '')
fr = fr.replace(':', '')
fr = fr.replace(' ', '')
last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
lr = last_recieved.replace('last', '')
lr = lr.replace(':', '')
lr = lr.replace(' ', '')
amount_sent = section.find_all('td')[7].text #Get the amount sent
# the number of outgoing transactions is the text between the parentheses
outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
outs = outs.replace('outs', '')
outs = outs.replace(' ', '')
outs = float(outs)
first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
fs = first_sent.replace('first', '') #clean up first outgoing transaction date
fs = fs.replace(':', '')
fs = fs.replace(' ', '')
last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
ls = ls.replace(':', '')
ls = ls.replace(' ', '')
dbalance = section.find_all('td')[0].select('b') #get the balance of doge
dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
# each loop keeps the text of the last item it visits
for data in dbalance: #used to clean the text up
balance = data.text
for data1 in dusd: #used to clean the text up
usd = data1.text
# Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
goal = float(30000)
if profit < goal:
continue
#Select wallets with under 2000 transactions
trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
trans_limit = float(2000)
if trans > trans_limit:
continue
# Create Info Dataframe using the data from above
info = {
'Balance': [balance],
'USD Value': [usd],
'Wallet Profit': [profit],
'Amount Recieved': [amount_recieved],
'First Recieved': [fr],
'Last Recieved': [lr],
'Amount Sent': [amount_sent],
'First Sent': [fs],
'Last Sent': [ls],
}
gf = pd.DataFrame(info)
# `a` is a non-empty string, so the condition below is always true
a = 'a'
if a:
# the wallet page is fetched again (outside the session) and the table with id "table_maina" is read
df = \
pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
directory = '/Users/chris/Desktop/Files' #directory for the file to go to
file = f'{filename}.xlsx'
# NOTE(review): an ExcelWriter is created here and then passed to a second
# pd.ExcelWriter call on the next line - confirm this is intended.
writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
with pd.ExcelWriter(writer) as writer:
# one workbook per wallet: transaction table plus the summary row
df.to_excel(writer, sheet_name='transactions')
gf.to_excel(writer, sheet_name='info')
Check your indentation - in your question the loops are on the same level, so the loop that makes the requests iterates over all the pages, but the results are never processed until that iteration is done. That is why it only works for the last page.
Move your loops, that should handle the response and extract elements into your first loop:
...
for page in range(4, 7):  # Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')  # Format the page number into url
    print(page)
    soup = bs(r.content, 'lxml')
    table_elements = soup.select('tr')[1:]
    address_links = []
    # Both loops sit INSIDE the page loop, so every page's response is
    # handled before the next page is requested.
    for element in table_elements:
        ...
    for url in address_links:
        ...
I am trying to scrape a table from multiple pages for different weeks, however I keep on getting the results from this url https://www.boxofficemojo.com/weekly/2018W52/ , here's the code I am using:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import re
# Fetch the weekly Box Office Mojo page for each year/week combination and
# collect the cleaned cell texts of every table row in all_rows.
# NOTE(review): indentation was lost in this paste; the loop nesting cannot be
# read off the text.
pages = np.arange(2015,2016)  # half-open range: contains only 2015
week = ['01','02','03','04','05','06','07','08','09']
week1 = np.arange(10,11)  # half-open range: contains only 10
for x in week1:
week.append(x)  # appended as a NumPy integer; the earlier entries are zero-padded strings
week  # bare expression: only echoes the list in an interactive session
# NOTE(review): `soup` is not assigned anywhere above this line in the snippet.
mov = soup.find_all("table", attrs={"class": "a-bordered"})
print("Number of tables on site: ",len(mov))
all_rows= []
all_rows= []  # repeats the line above
for page in pages:
for x in week:
# URL has the form .../weekly/<year>W<week>/ ; `url` holds the response object
url = requests.get('https://www.boxofficemojo.com/weekly/'+str(page)+'W'+str(x)+'/')
soup = BeautifulSoup(url.text, 'lxml')
mov = soup.find_all("table", attrs={"class": "a-bordered"})
table1 = mov[0]  # first matching table
body = table1.find_all("tr")
head = body[0]  # header row; not used again in this snippet
body_rows = body[1:]
sleep(randint(2,10))  # random pause of 2-10 seconds
for row_num in range(len(body_rows)):
row = []
for row_item in body_rows[row_num].find_all("td"):
# drop non-breaking spaces, newlines and commas from the cell text
aa = re.sub("(\xa0)|(\n)|,","",row_item.text)
row.append(aa)
all_rows.append(row)
print('Page', page, x)
Assuming you want 52 weeks from each year why not generate the links in advance then use pandas to retrieve the table, create a list of such dataframes and concatenate those into a final dataframe?
import pandas as pd
def get_table(url):
    """Read the first HTML table at *url* and tag it with its year and week.

    *url* must end in a ``<year>W<week>`` segment, e.g.
    ``https://www.boxofficemojo.com/weekly/2015W01``. The returned DataFrame
    is the first table pandas finds on the page, with two extra integer
    columns, ``year`` and ``week_yr``.

    Raises ValueError when the last path segment is not ``<year>W<week>``.
    """
    # Read the period from the last path segment instead of fixed character
    # offsets, so a different base URL or a trailing slash cannot shift it.
    period = url.rstrip('/').rsplit('/', 1)[-1]
    year_text, _, week_text = period.partition('W')
    year = int(year_text)
    week_yr = int(week_text)
    df = pd.read_html(url)[0]
    df['year'] = year
    df['week_yr'] = week_yr
    return df
# Build every weekly URL up front, then read and stack the tables.
years = ['2015', '2016']
weeks = [str(i).zfill(2) for i in range(1, 53)]
base = 'https://www.boxofficemojo.com/weekly'
urls = [f'{base}/{year}W{week}' for week in weeks for year in years]
# get_table takes the URL only (it derives year and week itself); passing a
# second argument raised TypeError.
results = pd.concat([get_table(url) for url in urls])
You might then look at ways of speeding things up e.g.
from multiprocessing import Pool, cpu_count
import pandas as pd
def get_table(url):
    """Read the first HTML table at *url* and tag it with its year and week.

    *url* must end in a ``<year>W<week>`` segment, e.g.
    ``https://www.boxofficemojo.com/weekly/2015W01``. The returned DataFrame
    is the first table pandas finds on the page, with two extra integer
    columns, ``year`` and ``week_yr``.

    Raises ValueError when the last path segment is not ``<year>W<week>``.
    """
    # Read the period from the last path segment instead of fixed character
    # offsets, so a different base URL or a trailing slash cannot shift it.
    period = url.rstrip('/').rsplit('/', 1)[-1]
    year_text, _, week_text = period.partition('W')
    year = int(year_text)
    week_yr = int(week_text)
    df = pd.read_html(url)[0]
    df['year'] = year
    df['week_yr'] = week_yr
    return df
if __name__ == '__main__':
    # Fetch all weekly tables in parallel worker processes and stack them.
    years = ['2015', '2016']
    weeks = [str(i).zfill(2) for i in range(1, 53)]
    base = 'https://www.boxofficemojo.com/weekly'
    urls = [f'{base}/{year}W{week}' for week in weeks for year in years]
    # Leave one core free, but never request zero workers: on a single-core
    # machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
    workers = max(1, cpu_count() - 1)
    with Pool(workers) as p:
        results = p.map(get_table, urls)
    final = pd.concat(results)
    print(final)
I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' in 'a' in 'div' like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent': 'Mozilla/5.0'}


def get_soup():
    """Download ``page`` with ``headers`` and return it parsed by BeautifulSoup."""
    # The context manager closes the session (and its pooled connections)
    # once the page has been fetched; previously it was never closed.
    with requests.Session() as session:
        pageTree = session.get(page, headers=headers)
        return BeautifulSoup(pageTree.content, 'html.parser')


pageSoup = get_soup()


def print_links():
    """Print the href of the first <a> inside every ``div.title`` element.

    The values are printed exactly as they appear in the markup.
    """
    jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
    for anchor in jobLink:
        print(anchor['href'])
I am trying to make a list but my result is simply text and does not seem to be a link like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
    """Return one ``{'title': ..., 'href': ...}`` dict per ``div.title`` element.

    For a div that contains an ``<a>`` with an ``href`` attribute, the title
    is the anchor's text and ``href`` is the attribute value. For any other
    div the title is the div's own text and ``href`` is ``None``.
    """
    jobs = []
    for title_div in pageSoup.find_all('div', class_='title'):
        # find('href') looked for a <href> TAG, which never matches; the URL
        # is the href ATTRIBUTE of the <a> tag inside the div.
        link = title_div.find('a', href=True)
        if link is not None:
            jobs.append({'title': link.text,
                         'href': link.attrs['href']})
        else:
            jobs.append({'title': title_div.text, 'href': None})
    return jobs
I would regex out the required info from the returned html and construct each url from the parameters the page's javascript uses to build it dynamically. Interestingly, the total number of listings is different when using requests than when using a browser. You can manually enter the number of listings, e.g. 6175 (currently), or use the number returned by the request (which is lower, so you miss some results). You could also use selenium to get the correct initial result count. You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can introduce a limit parameter to increase results_per_page up to 50 e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
Furthermore, it seems that it is possible to retrieve more results than are actually given as the total results count on the webpage.
py with 10 per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
# Page through the Indeed search results with plain requests and collect
# title / company / view-job URL for every listing found in the page script.
p = re.compile(r"jobmap\[\d+\]= ({.*?})")          # one jobmap[...] entry per listing
p1 = re.compile(r"var searchUID = '(.*?)';")       # token used as the tk= URL parameter
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',',''))
    # Alternative: use the count reported to requests (lower than the browser's):
    #number_of_pages = math.ceil(number_of_listings/listings_per_page)
    number_of_pages = math.ceil(6175/listings_per_page) #manually calculated

    for page in range(1, number_of_pages + 1):
        if page > 1:
            # Offset of the first listing on this page: 10 for page 2, 20 for
            # page 3, ... (10*page-1 gave 19 and skipped listings 10-18).
            start = listings_per_page * (page - 1)
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(start))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]

        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title' : data['title']
                  ,'company' : data['cmp']
                  ,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
                  }
            final[counter] = row
            counter+=1

# `final` maps a running index to a row dict, so transpose to get one row per listing
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
If you want to use selenium to get correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Same scraper as the requests-only version, but the total number of
# listings is first read with a headless Chrome session.
options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options = options)
try:
    d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',',''))
finally:
    # always shut the browser down, even if the lookup above fails
    d.quit()

p = re.compile(r"jobmap\[\d+\]= ({.*?})")          # one jobmap[...] entry per listing
p1 = re.compile(r"var searchUID = '(.*?)';")       # token used as the tk= URL parameter
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    # Use the count obtained through selenium; the hard-coded 6175 left that
    # value unused.
    number_of_pages = math.ceil(number_of_listings / listings_per_page)

    for page in range(1, number_of_pages + 1):
        if page > 1:
            # Offset of the first listing on this page: 10 for page 2, 20 for
            # page 3, ... (10*page-1 gave 19 and skipped listings 10-18).
            start = listings_per_page * (page - 1)
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(start))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]

        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title' : data['title']
                  ,'company' : data['cmp']
                  ,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
                  }
            final[counter] = row
            counter+=1

# `final` maps a running index to a row dict, so transpose to get one row per listing
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
from bs4 import BeautifulSoup
import requests
import csv
# Download the CoinGecko front page and print five parallel lists:
# coin names, ticker signs, prices, market caps and liquidity figures.
url = "https://coingecko.com/en"
page = requests.get(url)
html_doc = page.content
soup = BeautifulSoup(html_doc, "html.parser")


def _by_class(tag_name, css_class):
    """Return every *tag_name* element whose class attribute is *css_class*."""
    return soup.find_all(tag_name, attrs={"class": css_class})


# raw elements, one result set per column
coinname = _by_class("div", "coin-content center")
coin_sign = _by_class("div", "coin-icon mr-2 center flex-column")
coinvalue = _by_class("td", "td-price price text-right ")
marketcap = _by_class("td", "td-market_cap cap ")
Liquidity = _by_class("td", "td-liquidity_score lit text-right ")

# text of the nested <span> of each element
coin_name = [div.a.span.text for div in coinname]
coinsign = [sign.span.text for sign in coin_sign]
Coinvalue = [Value.a.span.text for Value in coinvalue]
Marketcap = [cap.div.span.text for cap in marketcap]
marketliquidity = [liquidity.a.span.text for liquidity in Liquidity]

for column in (coin_name, coinsign, Coinvalue, Marketcap, marketliquidity):
    print(column)
I want to save the output into a csv file with 5 columns. Column 1 will be "coin_name", Column 2 will be "coinsign", Column 3 will be "coinvalue", Column 4 will be "Marketcap", and Column 5 will be "Marketliquidity". How can I solve this?
I also want to limit the data I receive, as I want to receive only 100 coin_name but I received 200 coin_name.
from bs4 import BeautifulSoup
import requests
import csv
# Scrape the CoinGecko front page and write name, sign, price, market cap
# and liquidity of at most the first 100 coins to coins.csv.
url = "https://coingecko.com/en"
page = requests.get(url)
soup = BeautifulSoup(page.content,"html.parser")
#Instead of assigning variable and looping you can use list comprehension.
names = [div.a.span.text for div in soup.find_all("div",attrs={"class":"coin-content center"})]
signs = [sign.span.text for sign in soup.find_all("div",attrs={"class":"coin-icon mr-2 center flex-column"})]
values = [value.a.span.text for value in soup.find_all("td",attrs={"class":"td-price price text-right "})]
caps = [cap.div.span.text for cap in soup.find_all("td",attrs={"class":"td-market_cap cap "})]
liquidities = [liquidity.a.span.text for liquidity in soup.find_all("td", attrs={"class": "td-liquidity_score lit text-right "})]

# zip() stops at the shortest column, so a page with fewer than 100 coins no
# longer raises IndexError.
rows = list(zip(names, signs, values, caps, liquidities))
with open('coins.csv', mode='w',newline='') as coins:
    writer = csv.writer(coins, delimiter=',', quotechar='"')
    #Take only first 100 coins
    writer.writerows(rows[:100])
The output will be
Bitcoin,BTC,"$6,578.62","$113,894,498,118","$1,476,855,331"
Ethereum,ETH,$224.49,"$22,995,876,618","$1,256,303,216"
EOS,EOS,$5.73,"$5,193,319,905","$708,339,006"
XRP,XRP,$0.48,"$19,249,618,341","$564,378,978"
Litecoin,LTC,$57.80,"$3,388,966,637","$486,289,650"
NEO,NEO,$18.11,"$1,177,368,159","$160,733,208"
Monero,XMR,$113.64,"$1,871,890,512","$55,235,745"
I am importing links to boxscores from this webpage
http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html
This is how I am doing it now. I get the links from the first page.
# Load the team's past-results page and download every linked boxscore page.
url = 'http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
# anchors whose href contains "boxscore"
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
# raw HTML of each boxscore page, in link order
pages = [urllib2.urlopen(basepath + a['href']).read() for a in boxscores]
Then in a new window I do this.
# Parse one downloaded boxscore page: quarter scores, team names and date,
# then append the game as one row to the DataFrame stored under 'df'.
newsoup = pages[1] # I am manually changing this every time
soup = BeautifulSoup(newsoup)


def _unpack(row, kind='td'):
    """Return the text of every *kind* element found in *row*."""
    cells = row.findAll(kind)
    return [val.text for val in cells]


tables = soup('table')

# line score: row 1 is the road team, row 2 the home team
linescore = tables[1]
linescore_rows = linescore.findAll('tr')
road_line = _unpack(linescore_rows[1])
home_line = _unpack(linescore_rows[2])

roadteamQ1 = float(road_line[1])
roadteamQ2 = float(road_line[2])
roadteamQ3 = float(road_line[3])
roadteamQ4 = float(road_line[4]) # add OT rows if ???
roadteamFinal = float(road_line[-3])

hometeamQ1 = float(home_line[1])
hometeamQ2 = float(home_line[2])
hometeamQ3 = float(home_line[3])
hometeamQ4 = float(home_line[4]) # add OT rows if ???
hometeamFinal = float(home_line[-3])

# team names come from the first row of the sixth table
misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
team_cells = _unpack(misc_stats_rows[0])
roadteam = str(team_cells[0]).strip()
hometeam = str(team_cells[1]).strip()

# the game date comes from the first row of the seventh table
datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()
year = 2012
from dateutil.parser import parse
parsedDate = parse(date)
date = parsedDate.replace(year)
month = parsedDate.month
day = parsedDate.day
modDate = str(day) + str(month) + str(year)
gameid = modDate + roadteam + hometeam

data = {
    'roadteam': [roadteam],
    'hometeam': [hometeam],
    'roadQ1': [roadteamQ1],
    'roadQ2': [roadteamQ2],
    'roadQ3': [roadteamQ3],
    'roadQ4': [roadteamQ4],
    'homeQ1': [hometeamQ1],
    'homeQ2': [hometeamQ2],
    'homeQ3': [hometeamQ3],
    'homeQ4': [hometeamQ4],
}
# the one-row frame is also published as a module global named after the game id
game_frame = pd.DataFrame(data)
globals()["%s" % gameid] = game_frame
df = pd.DataFrame.load('df')
df = pd.concat([df, game_frame])
df.save('df')
How can I automate this so I don't have to change newsoup = pages[1] manually every time, and can instead scrape all of the boxscores that are linked from the first url in one go? I am pretty new to python and lacking in some understanding of the basics.
In the first code box you collect the pages.
In the second code box you then have to loop over them, if I understood correctly:
# Run the parsing code once per downloaded page instead of picking
# pages[1] by hand.
for page in pages:
    soup = BeautifulSoup(page)
    # rest of the code here