here is my code
from pandas import Series, DataFrame
import datetime
import requests
import lxml
import yfinance as yf
import time
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
def web_content_div(web_content,class_path):
web_content_div = web_content.find_all('div',{'class': class_path})
try:
spans = web_content_div[0].find_all('span')
texts = [span.get_text() for span in spans]
except IndexError:
texts = []
return texts
def real_time_price(stock_code):
url = 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code
try :
r = requests.get(url)
web_content = BeautifulSoup(r.text,'lxml')
texts = web_content_div(web_content, 'My(6px) Pos(r) smartphone_Mt(6px)')
if texts != []:
price, change = texts[0],texts[1]
else:
price , change = [] , []
texts = web_content_div(web_content,'D(ib) W(1/2) Bxz(bb) Pstart(12px) Va(t) ie-7_D(i) ie-7_Pos(a) smartphone_D(b) smartphone_W(100%) smartphone_Pstart(0px) smartphone_BdB smartphone_Bdc($seperatorColor)')
if texts != []:
for count, EX in enumerate(texts):
if EX == 'Ex-Dividend Date':
EXdate = texts[count + 1]
else:
EXdate = []
texts = web_content_div(web_content,'D(ib) W(1/2) Bxz(bb) Pend(12px) Va(t) ie-7_D(i) smartphone_D(b) smartphone_W(100%) smartphone_Pend(0px) smartphone_BdY smartphone_Bdc($seperatorColor)')
if texts != []:
for count, vol in enumerate(texts):
if vol == 'Volume':
volume = texts[count + 1]
else:
volume = []
except ConnectionError:
price, change,EXdate,volume = [],[],[],[]
return price, change, EXdate,volume
stock=['awr','dov','nwn','emr','gpc','pg','ph','mmm','ginf','jnj','ko','lanc','low','fmcb'
'cl','ndsn','hrl','abm','cwt','tr','frt','scl','swk','tgt','cbsh','mo','syy']
while(True):
info = []
col = []
for stock_code in stock:
time_stamp = datetime.datetime.now() - datetime.timedelta(hours=10)
time_stamp = time_stamp.strftime('%Y-%M-%D %H:%M:%S')
price,change,EXdate,volume = real_time_price(stock_code)
info.append(price)
info.extend([change])
info.extend([EXdate])
info.extend([volume])
time.sleep(1)
col = [time_stamp]
col.extend(info)
df = DataFrame(col, columns=['price', 'change', 'EX-dividend-date', 'Volume'], index=stock)
df.T
print(col)
what i want to see is:
price change ex-dividend-date volume
awr
dov
nwn
emr
etc...
then i fixed my code like this
while(True):
info_price = []
info_change = []
info_exdate = []
info_volume = []
for stock_code in stock:
price,change,EXdate,volume = real_time_price(stock_code)
info_price.append(price)
info_change.append(change)
info_exdate.append(EXdate)
info_volume.append(volume)
time.sleep(1)
df = DataFrame(columns={"price":info_price,"change":info_change,"EX-dividend-date":info_exdate,"volume":info_volume},index=stock)
df.T
print(df)
but it print as....for example
price change EX-dividend-date volume
awr NaN NaN NaN NaN
it just print nothing at all!!
so. what should i do get right output..
please help me. how can i handle this problem
I really appreciate your help:)!!!
stock=['awr','dov','nwn','emr','gpc','pg','ph','mmm','ginf','jnj','ko','lanc','low','fmcb'
'cl','ndsn','hrl','abm','cwt','tr','frt','scl','swk','tgt','cbsh','mo','syy']
while(True):
info_timestamp = []
info_price = []
info_change = []
info_exdate = []
info_volume = []
for stock_code in stock:
time_stamp = datetime.datetime.now() - datetime.timedelta(hours=10)
time_stamp = time_stamp.strftime('%Y-%M-%D %H:%M:%S')
price,change,EXdate,volume = real_time_price(stock_code)
info_timestamp.append(time_stamp)
info_price.append(price)
info_change.append(change)
info_exdate.append(EXdate)
info_volume.append(volume)
time.sleep(1)
df = DataFrame({"timestamp":info_timestamp,"price":info_price,"change":info_change,"EX-dividend-date":info_exdate,"volume":info_volume},index=stock)
df.T
Related
I keep getting the below keyerror and can't figure out what it means or what I should be doing different.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
my code is
from bs4 import BeautifulSoup
import pandas
import os
SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")
box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]
def parse_html(box_score):
with open(box_score) as f:
html = f.read()
soup = BeautifulSoup(html)
[s.decompose() for s in soup.select("tr.over_header")] # this removes the tr tag with class over_header from the html
[s.decompose() for s in soup.select("tr.thead")]
return soup
def read_line_score(soup):
line_score = pandas.read_html(str(soup), attrs = {"id": "line_score"})[0]
cols = list(line_score.columns)
cols[0] = "team"
cols[-1] = "total"
line_score.columns = cols
line_score = line_score[["team", "total"]]
return line_score
def read_stats(soup, team, stat):
df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
df = df.apply(pandas.to_numeric, errors="coerce")
return df
def read_season_info(soup):
nav = soup.select("#bottom_nav_container")[0]
hrefs = [a["href"] for a in nav.find_all("a")]
season = os.path.basename(hrefs[1]).split("_")[0]
return season
base_cols = None
games = []
for box_score in box_scores:
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"]) #grabs just the teams who played each other
summaries = []
for team in teams:
basic = read_stats(soup, team, "basic")
advanced = read_stats(soup, team, "advanced")
totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
totals.index = totals.index.str.lower() # to lower case
maxes = pandas.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
maxes.index = maxes.index.str.lower() + "_max"
summary = pandas.concat([totals, maxes])
if base_cols is None:
base_cols = list(summary.index.drop_duplicates(keep="first"))
base_cols = [b for b in base_cols if "bpm" not in b]
summary - summary[base_cols]
summaries.append(summary)
summary = pandas.concat(summaries, asix=1).T
game = pandas.concat([summary, line_score], axis=1)
game["home"] = [0, 1]
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"
full_game = pandas.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info("soup")
full_game["date"] = os.path.basename(box_score)[:8]
full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)
if len(games) % 100 == 0:
print(f"{len(games)} / {len(box_scores)}")
I'm having trouble with my web scraper not getting the "Odds" values and not sure what is wrong. For each piece of information, I am using a try/except to see if the element is available. I'm not sure what is wrong with getting the Odds values though. Thanks for the help
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
odd_margin = '-'
odd_avail = False
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
sim_team_win = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds, simulation margin points, and optimal bet
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
if sim_margin > odd_margin:
bet = odd_team_win
else:
if odd_team_win == homeTeam:
bet = awayTeam
else:
bet = homeTeam
else:
diff = odd_margin + sim_margin
bet = sim_team_win
else:
diff = -1
bet = '-'
# Create table
row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet' : bet}
rows.append(row)
df = pd.DataFrame(rows)
df = df.sort_values(by = ['Diff'], ascending = False)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code everything works perfectly and gets all other values but all the odds values in the table are '-'.
I added a few things into the code, to account for
If the odds are Even (versus if there are no odds
If a team doesn't have a logo, to still but the team name
As far as the odds not showing. Check the csv file to see if it's there. If it is, might just be a preference you need to change in pycharm (might be just cutting off some of the string)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
if 'Even' in matchup:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = 0
else:
odd_margin = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
if 'wins' in data[2].text:
sim_team_win = data[2].text.split('wins')[0].strip()
else:
sim_team_win = '-'
odd_avail = False
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds and simulation margin points
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
else:
diff = odd_margin + sim_margin
else:
diff = '-'
# Create table
row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team':homeTeam, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
rows.append(row)
df = pd.DataFrame(rows)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')
As commented you probably need to use Selenium. You could replace the requests lib and replace the request statements with sth like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with selenium and I was able to get the table that you were looking for, but with the class_=... does not work for some reason.
Instead browsing at the scraped data I found that it has an attribute id. So maybe try also this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, by doing the get requests with the selenium lib.
Hope that was helpful :)
Cheers
I'm getting to error 'find() takes no keyword arguments' on the line of code place = racers.find('td', class_='horse_number').get_text()
I presume this is due to the nested for loop - is find onto find the problem??
My goal is to get detail of the race in first loop, second loop reiterate over each runner within the race, third for loop to get the times that meet each nested if statement.
for race in results:
race_number = race.find('td', class_='raceNumber').get_text()
race_name1 = race.find('td', class_='raceTitle').get_text()
race_title1 = race.find('td', class_='raceInformation').get_text()
race_title1 = ' '.join(race_title1.split())
race_distance1 = race.find('td', class_='distance').get_text()
tableofdata = race.find('table', class_='raceFieldTable')
for racers in tableofdata:
place = racers.find('td', class_='horse_number').get_text()
horsename = racers.find('a', class_='horse_name_link')
horsename = horsename.text.replace('HorseName: ', '') if horsename else ''
prizemoney = racers.find('td', class_='prizemoney')
prizemoney = prizemoney.text.replace('Prizemoney: ', '') if prizemoney else ''
barrier = racers.find('td', class_='barrier')
barrier = barrier.text.replace('Row: ', '') if barrier else ''
#tabnumber = race.find('td', class_='horse_number')
#tabnumber = tabnumber.text.replace('HorseNumber: ', '') if tabnumber else ''
#print(tabnumber, tr2)
trainer = racers.find_all('td', class_='trainer-short')
trainer = trainer.text.replace('Trainer: ', '') if trainer else ''
driver = racers.find_all('td', class_='driver-short')
driver = driver.text.replace('Driver: ', '') if driver else ''
margin = racers.find_all('td', class_='margin')
margin = margin.text.replace('Margin: ', '') if margin else ''
startingprice = racers.find_all('td', class_='starting_price')
startingprice = startingprice.text.replace('StartingOdds: ', '')
startingprice = startingprice.replace('Â', ' ')if startingprice else ''
stewardscomments = racers.find_all('span', class_='stewardsTooltip')
stewardscomments = stewardscomments.text.replace('StewardsComments: ', '') if horsename else ''
scratchingnumber = racers.find_all('td', class_='number')
scratchingnumber = scratchingnumber.text.replace('Scratching: ', '') if scratchingnumber else ''
tableoftimes = race.find('table', class_='raceTimes')
for row in tableoftimes.select('td>strong:contains(":")'):
for t in row:
if "Track Rating:" in t:
trackrating = t.next_element.strip()
else:
trackrating = ''
if "Gross Time:" in t:
grosstime = t.next_element.strip()
else:
grosstime = ''
if "Mile Rate:" in t:
milerate = t.next_element.strip()
else:
milerate = ''
if "Lead Time:" in t:
leadtime = t.next_element.strip()
else:
leadtime = ''
if "First Quarter:" in t:
firstquarter = t.next_element.strip()
else:
firstquarter = ''
if "Second Quarter:" in t:
secondquarter = t.next_element.strip()
else:
secondquarter = ''
if "Third Quarter:" in t:
thirdquarter = t.next_element.strip()
else:
thirdquarter = ''
if "Fourth Quarter:" in t:
fourthquarter = t.next_element.strip()
else:
fourthquarter = ''
Last query is this replace doesnt work - still prints $2.40Â onto csv file
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
....
startingprice = startingprice.replace('Â', ' ')if startingprice else ''
....
writer.writerow([tr2, race_number, race_name1, race_title1, race_distance1, place, horsename, prizemoney, barrier, trainer, driver, margin, startingprice, stewardscomments, scratchingnumber, trackrating, grosstime, milerate, leadtime, firstquarter, secondquarter, thirdquarter, fourthquarter])
UPDATED
Start of HTML with scraping looks like below
from datetime import datetime, date, timedelta
import requests
import re
import csv
import os
import numpy
import pandas as pd
from bs4 import BeautifulSoup as bs
from simplified_scrapy import SimplifiedDoc,req,utils
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = bs(webpage_response.content, "html.parser")
format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
enddate = datetime(2020, 4, 20)
#prints header in csv
writer.writerow(['Venue', 'RaceNumber', 'RaceName', 'RaceTitle', 'RaceDistance', 'Place', 'HorseName', 'Prizemoney', 'Row', 'Trainer', 'Driver', 'Margin', 'StartingOdds', 'StewardsComments', 'Scratching', 'TrackRating', 'Gross_Time', 'Mile_Rate', 'Lead_Time', 'First_Quarter', 'Second_Quarter', 'Third_Quarter', 'Fourth_Quarter'])
while enddate <= yesterday:
enddate += timedelta(days=1)
enddate1 = enddate.strftime("%d-%m-%y")
new_url = base_url + str(enddate1)
soup12 = requests.get(new_url)
soup1 = bs(soup12.content, "html.parser")
table1 = soup1.find('table', class_='meetingListFull')
tr = table1.find_all('tr', {'class':['odd', 'even']})
for tr1 in tr:
tr2 = tr1.find('a').get_text()
tr3 = tr1.find('a')['href']
newurl = base1_url + tr3
with requests.Session() as s:
webpage_response = s.get(newurl)
soup = bs(webpage_response.content, "html.parser")
#soup1 = soup.select('.content')
results = soup.find_all('div', {'class':'forPrint'})
#resultsv2 = soup.find_all('table', {'class':'raceFieldTable'})
Expect the CSV to look like
I'm trying to extract all WC 2019 players batting stats, query got stuck with an error "list index out of range" at the player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle exception or PASS to get complete team player stats?
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
str(player_id) + \
'.htmlclass=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()
temp_data = OrderedDict()
list_of_dict = []
bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
rows = table_body[1].find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [x.text.strip() for x in cols]
temp_data = OrderedDict()
for i in range(len(cols)):
temp_data["Runs"] = cols[0]
temp_data["Mins"] = cols[1]
temp_data["BF"] = cols[2]
temp_data["fours"] = cols[3]
temp_data["sixs"] = cols[4]
temp_data["SR"] = cols[5]
temp_data["POS"] = cols[6]
temp_data["Dismissal"] = cols[7]
temp_data["Inns"] = cols[8]
temp_data["Opposition"] = cols[10]
temp_data["Ground"] = cols[11]
temp_data["Date"] = cols[12]
temp_data["player"] = player
temp_data["playerid"] = player_id
list_of_dict.append(temp_data)
df = pd.DataFrame(list_of_dict)
df
df.to_sql("dummy", con, if_exists="append")
I'd like to extract all WC squad wise player stats.