Python web scraper not getting certain values

My web scraper isn't getting the "Odds" values and I'm not sure what's wrong. For each piece of information, I use a try/except to check whether the element is available, but the Odds values never come through. Thanks for the help.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')

# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]

# Get all rows in table body
table_rows = table.find_all('tr')
rows = []

# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()

    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        odd_margin = '-'
        odd_avail = False

    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False

    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        sim_team_win = '-'
        odd_avail = False

    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()

    # Get simulation margin
    try:
        sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False

    # If all variables available, determine odds, simulation margin points, and optimal bet
    if odd_avail == True:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
            if sim_margin > odd_margin:
                bet = odd_team_win
            else:
                if odd_team_win == homeTeam:
                    bet = awayTeam
                else:
                    bet = homeTeam
        else:
            diff = odd_margin + sim_margin
            bet = sim_team_win
    else:
        diff = -1
        bet = '-'

    # Create table
    row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet': bet}
    rows.append(row)

df = pd.DataFrame(rows)
df = df.sort_values(by=['Diff'], ascending=False)
print(df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code, everything works and all the other values are retrieved, but every value in the Odds column of the table is '-'.
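One way to narrow this down (a debugging suggestion, not part of the original question): print the raw text of the matchup cell. If the page no longer separates the matchup from the odds with a non-breaking space, the split('\xa0') raises on every row and the except branch sets every margin to '-'. A minimal sketch:

import requests
from bs4 import BeautifulSoup

# Print the raw matchup-cell text for the first few rows; repr() makes
# invisible separators such as '\xa0' visible.
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
for tr in soup.find('table').find_all('tr')[1:4]:
    tds = tr.find_all('td')
    if len(tds) > 1:
        print(repr(tds[1].text.strip()))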

I added a few things to the code to account for:
1. odds that are listed as Even (as opposed to there being no odds at all)
2. a team that doesn't have a logo, so the team name is still captured
As far as the odds not showing: check the csv file to see if they're there. If they are, it might just be a preference you need to change in PyCharm (it might be cutting off part of the string).
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')

# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]

# Get all rows in table body
table_rows = table.find_all('tr')
rows = []

# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()

    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        if 'Even' in matchup:
            matchup, odds = data[1].text.strip().split('\xa0')
            odd_margin = 0
        else:
            odd_margin = '-'
            odd_avail = False

    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()

    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False

    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        if 'wins' in data[2].text:
            sim_team_win = data[2].text.split('wins')[0].strip()
        else:
            sim_team_win = '-'
            odd_avail = False

    # Get simulation margin
    try:
        sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False

    # If all variables available, determine odds and simulation margin points
    if odd_avail == True:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
        else:
            diff = odd_margin + sim_margin
    else:
        diff = '-'

    # Create table
    row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team': homeTeam, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
    rows.append(row)

df = pd.DataFrame(rows)
print(df.to_string())
# df.to_csv('odds.csv', index=False)
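A refinement worth considering (my suggestion, not part of the original answer): the bare except: clauses also swallow unrelated errors such as typos, which makes failures like the one in the question harder to diagnose. The lines inside the first try can realistically only raise ValueError (from the unpacking or float()) or IndexError, so catching just those is safer. A sketch with a hypothetical cell value:

# Hypothetical matchup-cell text, for illustration only.
cell_text = 'Michigan vs #4 Duke\xa0Duke by 5.5'
try:
    matchup, odds = cell_text.strip().split('\xa0')
    odd_margin = float(odds.split('by')[-1].strip())
except (ValueError, IndexError):  # only the failures these lines can produce
    matchup = cell_text.strip()
    odd_margin = '-'
print(matchup, odd_margin)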

Related

None of [([ ])] are in the columns

I keep getting the KeyError below and can't figure out what it means or what I should be doing differently.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
My code is:
from bs4 import BeautifulSoup
import pandas
import os

SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")

box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]

def parse_html(box_score):
    with open(box_score) as f:
        html = f.read()
    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]  # this removes the tr tag with class over_header from the html
    [s.decompose() for s in soup.select("tr.thead")]
    return soup

def read_line_score(soup):
    line_score = pandas.read_html(str(soup), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols
    line_score = line_score[["team", "total"]]
    return line_score

def read_stats(soup, team, stat):
    df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    df = df.apply(pandas.to_numeric, errors="coerce")
    return df

def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

base_cols = None
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])  # grabs just the teams who played each other

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
        totals.index = totals.index.str.lower()  # to lower case

        maxes = pandas.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pandas.concat([totals, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        summary - summary[base_cols]
        summaries.append(summary)
    summary = pandas.concat(summaries, asix=1).T

    game = pandas.concat([summary, line_score], axis=1)
    game["home"] = [0, 1]

    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pandas.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info("soup")

    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")

    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

finance using python DataFrame

Here is my code:
from pandas import Series, DataFrame
import datetime
import requests
import lxml
import yfinance as yf
import time
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup

def web_content_div(web_content, class_path):
    web_content_div = web_content.find_all('div', {'class': class_path})
    try:
        spans = web_content_div[0].find_all('span')
        texts = [span.get_text() for span in spans]
    except IndexError:
        texts = []
    return texts

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code
    try:
        r = requests.get(url)
        web_content = BeautifulSoup(r.text, 'lxml')
        texts = web_content_div(web_content, 'My(6px) Pos(r) smartphone_Mt(6px)')
        if texts != []:
            price, change = texts[0], texts[1]
        else:
            price, change = [], []
        texts = web_content_div(web_content, 'D(ib) W(1/2) Bxz(bb) Pstart(12px) Va(t) ie-7_D(i) ie-7_Pos(a) smartphone_D(b) smartphone_W(100%) smartphone_Pstart(0px) smartphone_BdB smartphone_Bdc($seperatorColor)')
        if texts != []:
            for count, EX in enumerate(texts):
                if EX == 'Ex-Dividend Date':
                    EXdate = texts[count + 1]
                else:
                    EXdate = []
        texts = web_content_div(web_content, 'D(ib) W(1/2) Bxz(bb) Pend(12px) Va(t) ie-7_D(i) smartphone_D(b) smartphone_W(100%) smartphone_Pend(0px) smartphone_BdY smartphone_Bdc($seperatorColor)')
        if texts != []:
            for count, vol in enumerate(texts):
                if vol == 'Volume':
                    volume = texts[count + 1]
                else:
                    volume = []
    except ConnectionError:
        price, change, EXdate, volume = [], [], [], []
    return price, change, EXdate, volume

stock = ['awr','dov','nwn','emr','gpc','pg','ph','mmm','ginf','jnj','ko','lanc','low','fmcb'
         'cl','ndsn','hrl','abm','cwt','tr','frt','scl','swk','tgt','cbsh','mo','syy']

while(True):
    info = []
    col = []
    for stock_code in stock:
        time_stamp = datetime.datetime.now() - datetime.timedelta(hours=10)
        time_stamp = time_stamp.strftime('%Y-%M-%D %H:%M:%S')
        price, change, EXdate, volume = real_time_price(stock_code)
        info.append(price)
        info.extend([change])
        info.extend([EXdate])
        info.extend([volume])
        time.sleep(1)
    col = [time_stamp]
    col.extend(info)
    df = DataFrame(col, columns=['price', 'change', 'EX-dividend-date', 'Volume'], index=stock)
    df.T
    print(col)
What I want to see is:

        price  change  ex-dividend-date  volume
awr
dov
nwn
emr
etc...
Then I fixed my code like this:
while(True):
    info_price = []
    info_change = []
    info_exdate = []
    info_volume = []
    for stock_code in stock:
        price, change, EXdate, volume = real_time_price(stock_code)
        info_price.append(price)
        info_change.append(change)
        info_exdate.append(EXdate)
        info_volume.append(volume)
        time.sleep(1)
    df = DataFrame(columns={"price": info_price, "change": info_change, "EX-dividend-date": info_exdate, "volume": info_volume}, index=stock)
    df.T
    print(df)
But it prints, for example:

        price  change  EX-dividend-date  volume
awr       NaN     NaN               NaN     NaN

It prints nothing at all! So, what should I do to get the right output? Please help me, how can I handle this problem? I really appreciate your help :)!!!
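The core issue here (a fact about the pandas API, worth spelling out): the columns argument to DataFrame only sets column labels. Passing a dict there uses just its keys, so the frame is created with no data at all and every cell is NaN. The dict of values has to be the first (data) argument. A minimal demonstration:

import pandas as pd

# Passing the dict to `columns` keeps only its keys -> an all-NaN frame.
bad = pd.DataFrame(columns={"price": [1.0, 2.0]}, index=["awr", "dov"])
print(bad)   # a 'price' column exists, but every value is NaN

# Passing the dict as the data argument fills the values in.
good = pd.DataFrame({"price": [1.0, 2.0]}, index=["awr", "dov"])
print(good)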
stock = ['awr','dov','nwn','emr','gpc','pg','ph','mmm','ginf','jnj','ko','lanc','low','fmcb'
         'cl','ndsn','hrl','abm','cwt','tr','frt','scl','swk','tgt','cbsh','mo','syy']

while(True):
    info_timestamp = []
    info_price = []
    info_change = []
    info_exdate = []
    info_volume = []
    for stock_code in stock:
        time_stamp = datetime.datetime.now() - datetime.timedelta(hours=10)
        time_stamp = time_stamp.strftime('%Y-%M-%D %H:%M:%S')
        price, change, EXdate, volume = real_time_price(stock_code)
        info_timestamp.append(time_stamp)
        info_price.append(price)
        info_change.append(change)
        info_exdate.append(EXdate)
        info_volume.append(volume)
        time.sleep(1)
    df = DataFrame({"timestamp": info_timestamp, "price": info_price, "change": info_change, "EX-dividend-date": info_exdate, "volume": info_volume}, index=stock)
    df.T
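One more detail worth flagging (my observation, not from the thread): df.T returns a transposed copy and does not modify df in place, so on its own that line has no effect. To keep and show the transposed frame:

df = df.T  # .T returns a new DataFrame; reassign to keep it
print(df)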

Python requests.get() loop returns nothing

When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
for page in range(100, 350):
    page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=" + str(page) + "&res=pm")
    page.encoding = page.apparent_encoding
    if not page:
        pass
    else:
        soup = BeautifulSoup(page.text, 'html.parser')
        ghana_tbody = soup.find_all('tbody')
        sleep(randint(2,10))
        for container in ghana_tbody:
            #### CANDIDATES ####
            candidate = container.find_all('div', class_='can par')
            for data in candidate:
                cand = data.find('h4')
                for info in cand:
                    if cand is not None:
                        can2 = info.get_text()
                        can.append(can2)
            #### PARTY NAMES ####
            partyn = container.find_all('h5')
            for data in partyn:
                if partyn is not None:
                    partyn2 = data.get_text()
                    pty_n.append(partyn2)
            #### CANDIDATE VOTES ####
            votec = container.find_all('td', class_='votes')
            for data in votec:
                if votec is not None:
                    votec2 = data.get_text()
                    cv1.append(votec2)
            #### CANDIDATE VOTE SHARE ####
            cansh = container.find_all('td', class_='percent')
            for data in cansh:
                if cansh is not None:
                    cansh2 = data.get_text()
                    cvs1.append(cansh2)
        #### TOTAL VOTES ####
        tfoot = soup.find_all('tr', class_='total')
        for footer in tfoot:
            fvote = footer.find_all('td', class_='votes')
            for data in fvote:
                if fvote is not None:
                    fvote2 = data.get_text()
                    fvoteindiv = [fvote2]
                    fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                    vot1.extend(fvotelist)
Thanks in advance for your help!
I've made some simplifications. The major changes that needed to be made were:
ghana_tbody = soup.find_all('table', class_='canResults')
can2 = info # not info.get_text()
I have only tested this against page 112; life is too short.
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep

can = []
pty_n = []
cv1 = []
cvs1 = []
vot1 = []

START_PAGE = 112
END_PAGE = 112

for page in range(START_PAGE, END_PAGE + 1):
    # Build the URL from the loop variable so changing START_PAGE/END_PAGE takes effect.
    page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=" + str(page) + "&res=pm")
    page.encoding = page.apparent_encoding
    if not page:
        pass
    else:
        soup = BeautifulSoup(page.text, 'html.parser')
        ghana_tbody = soup.find_all('table', class_='canResults')
        sleep(randint(2,10))
        for container in ghana_tbody:
            #### CANDIDATES ####
            candidate = container.find_all('div', class_='can par')
            for data in candidate:
                cand = data.find('h4')
                for info in cand:
                    can2 = info  # not info.get_text()
                    can.append(can2)
            #### PARTY NAMES ####
            partyn = container.find_all('h5')
            for data in partyn:
                partyn2 = data.get_text()
                pty_n.append(partyn2)
            #### CANDIDATE VOTES ####
            votec = container.find_all('td', class_='votes')
            for data in votec:
                votec2 = data.get_text()
                cv1.append(votec2)
            #### CANDIDATE VOTE SHARE ####
            cansh = container.find_all('td', class_='percent')
            for data in cansh:
                cansh2 = data.get_text()
                cvs1.append(cansh2)
        #### TOTAL VOTES ####
        tfoot = soup.find_all('tr', class_='total')
        for footer in tfoot:
            fvote = footer.find_all('td', class_='votes')
            for data in fvote:
                fvote2 = data.get_text()
                fvoteindiv = [fvote2]
                fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                vot1.extend(fvotelist)

print('can = ', can)
print('pty_n = ', pty_n)
print('cv1 = ', cv1)
print('cvs1 = ', cvs1)
print('vot1 = ', vot1)
Prints:
can = ['Kwadwo Baah Agyemang', 'Daniel Osei', 'Anyang - Kusi Samuel', 'Mary Awusi']
pty_n = ['NPP', 'NDC', 'IND', 'IND']
cv1 = ['14,966', '9,709', '8,648', '969', '34292']
cvs1 = ['43.64', '28.31', '25.22', '2.83', '\xa0']
vot1 = ['34292', '34292', '34292', '34292']
Be sure to first change START_PAGE and END_PAGE to 100 and 350 respectively.

Q: What is the workaround for nested for loop error - find() takes no keyword arguments

I'm getting the error 'find() takes no keyword arguments' on this line of code: place = racers.find('td', class_='horse_number').get_text()
I presume this is due to the nested for loop. Is calling find() on the result of another find() the problem?
My goal is to get the details of the race in the first loop, iterate over each runner within the race in the second loop, and use the third loop to get the times that match each nested if statement.
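For what it's worth (my reading of the error, not confirmed in the thread): iterating directly over a Tag such as tableofdata yields its raw children, which include NavigableString whitespace nodes, and on a string .find() is the built-in str.find, which takes no keyword arguments, hence the TypeError. Iterating over the row Tags instead keeps .find as the BeautifulSoup method. A minimal sketch:

from bs4 import BeautifulSoup

html = "<table class='raceFieldTable'><tr><td class='horse_number'>1</td></tr></table>"
tableofdata = BeautifulSoup(html, 'html.parser').find('table', class_='raceFieldTable')

# Iterating the Tag itself can yield NavigableStrings, whose .find is str.find:
#   for racers in tableofdata: racers.find('td', class_='horse_number')  -> TypeError

# Iterating the <tr> Tags keeps .find as the BeautifulSoup method:
for racers in tableofdata.find_all('tr'):
    place = racers.find('td', class_='horse_number').get_text()
    print(place)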
for race in results:
    race_number = race.find('td', class_='raceNumber').get_text()
    race_name1 = race.find('td', class_='raceTitle').get_text()
    race_title1 = race.find('td', class_='raceInformation').get_text()
    race_title1 = ' '.join(race_title1.split())
    race_distance1 = race.find('td', class_='distance').get_text()
    tableofdata = race.find('table', class_='raceFieldTable')
    for racers in tableofdata:
        place = racers.find('td', class_='horse_number').get_text()
        horsename = racers.find('a', class_='horse_name_link')
        horsename = horsename.text.replace('HorseName: ', '') if horsename else ''
        prizemoney = racers.find('td', class_='prizemoney')
        prizemoney = prizemoney.text.replace('Prizemoney: ', '') if prizemoney else ''
        barrier = racers.find('td', class_='barrier')
        barrier = barrier.text.replace('Row: ', '') if barrier else ''
        #tabnumber = race.find('td', class_='horse_number')
        #tabnumber = tabnumber.text.replace('HorseNumber: ', '') if tabnumber else ''
        #print(tabnumber, tr2)
        trainer = racers.find_all('td', class_='trainer-short')
        trainer = trainer.text.replace('Trainer: ', '') if trainer else ''
        driver = racers.find_all('td', class_='driver-short')
        driver = driver.text.replace('Driver: ', '') if driver else ''
        margin = racers.find_all('td', class_='margin')
        margin = margin.text.replace('Margin: ', '') if margin else ''
        startingprice = racers.find_all('td', class_='starting_price')
        startingprice = startingprice.text.replace('StartingOdds: ', '')
        startingprice = startingprice.replace('Â', ' ') if startingprice else ''
        stewardscomments = racers.find_all('span', class_='stewardsTooltip')
        stewardscomments = stewardscomments.text.replace('StewardsComments: ', '') if horsename else ''
        scratchingnumber = racers.find_all('td', class_='number')
        scratchingnumber = scratchingnumber.text.replace('Scratching: ', '') if scratchingnumber else ''
    tableoftimes = race.find('table', class_='raceTimes')
    for row in tableoftimes.select('td>strong:contains(":")'):
        for t in row:
            if "Track Rating:" in t:
                trackrating = t.next_element.strip()
            else:
                trackrating = ''
            if "Gross Time:" in t:
                grosstime = t.next_element.strip()
            else:
                grosstime = ''
            if "Mile Rate:" in t:
                milerate = t.next_element.strip()
            else:
                milerate = ''
            if "Lead Time:" in t:
                leadtime = t.next_element.strip()
            else:
                leadtime = ''
            if "First Quarter:" in t:
                firstquarter = t.next_element.strip()
            else:
                firstquarter = ''
            if "Second Quarter:" in t:
                secondquarter = t.next_element.strip()
            else:
                secondquarter = ''
            if "Third Quarter:" in t:
                thirdquarter = t.next_element.strip()
            else:
                thirdquarter = ''
            if "Fourth Quarter:" in t:
                fourthquarter = t.next_element.strip()
            else:
                fourthquarter = ''
The last issue is that this replace doesn't work; it still prints $2.40Â into the csv file.
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
....
startingprice = startingprice.replace('Â', ' ') if startingprice else ''
....
writer.writerow([tr2, race_number, race_name1, race_title1, race_distance1, place, horsename, prizemoney, barrier, trainer, driver, margin, startingprice, stewardscomments, scratchingnumber, trackrating, grosstime, milerate, leadtime, firstquarter, secondquarter, thirdquarter, fourthquarter])
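A likely explanation (my suggestion, not from the thread): the page marks prices with a non-breaking space (U+00A0), and 'Â' is what that character looks like when UTF-8 bytes are decoded as Latin-1, so the scraped string may actually contain '\xa0' or the two-character sequence 'Â\xa0' rather than a lone 'Â'. Replacing the non-breaking space itself is more robust. A sketch with a hypothetical scraped value:

raw = '$2.40\xa0'  # hypothetical scraped price containing a non-breaking space

# Normalize both the non-breaking space and any mojibake form of it.
clean = raw.replace('\xa0', ' ').replace('Â', ' ').strip()
print(repr(clean))  # '$2.40'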
UPDATED
The start of the scraping code looks like this:
from datetime import datetime, date, timedelta
import requests
import re
import csv
import os
import numpy
import pandas as pd
from bs4 import BeautifulSoup as bs
from simplified_scrapy import SimplifiedDoc, req, utils

file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)

base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = bs(webpage_response.content, "html.parser")

format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
enddate = datetime(2020, 4, 20)

# prints header in csv
writer.writerow(['Venue', 'RaceNumber', 'RaceName', 'RaceTitle', 'RaceDistance', 'Place', 'HorseName', 'Prizemoney', 'Row', 'Trainer', 'Driver', 'Margin', 'StartingOdds', 'StewardsComments', 'Scratching', 'TrackRating', 'Gross_Time', 'Mile_Rate', 'Lead_Time', 'First_Quarter', 'Second_Quarter', 'Third_Quarter', 'Fourth_Quarter'])

while enddate <= yesterday:
    enddate += timedelta(days=1)
    enddate1 = enddate.strftime("%d-%m-%y")
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = bs(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    tr = table1.find_all('tr', {'class': ['odd', 'even']})
    for tr1 in tr:
        tr2 = tr1.find('a').get_text()
        tr3 = tr1.find('a')['href']
        newurl = base1_url + tr3
        with requests.Session() as s:
            webpage_response = s.get(newurl)
            soup = bs(webpage_response.content, "html.parser")
            #soup1 = soup.select('.content')
            results = soup.find_all('div', {'class': 'forPrint'})
            #resultsv2 = soup.find_all('table', {'class':'raceFieldTable'})
I expect the CSV to look like:

Handling exception with list out of range

I'm trying to extract the batting stats of all WC 2019 players. The query got stuck with a "list index out of range" error at this player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle or pass over the exception so that I get complete stats for every team's players?
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
    str(player_id) + \
    '.htmlclass=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()
temp_data = OrderedDict()
list_of_dict = []
bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
rows = table_body[1].find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [x.text.strip() for x in cols]
    temp_data = OrderedDict()
    for i in range(len(cols)):
        temp_data["Runs"] = cols[0]
        temp_data["Mins"] = cols[1]
        temp_data["BF"] = cols[2]
        temp_data["fours"] = cols[3]
        temp_data["sixs"] = cols[4]
        temp_data["SR"] = cols[5]
        temp_data["POS"] = cols[6]
        temp_data["Dismissal"] = cols[7]
        temp_data["Inns"] = cols[8]
        temp_data["Opposition"] = cols[10]
        temp_data["Ground"] = cols[11]
        temp_data["Date"] = cols[12]
        temp_data["player"] = player
        temp_data["playerid"] = player_id
    list_of_dict.append(temp_data)
df = pd.DataFrame(list_of_dict)
df
df.to_sql("dummy", con, if_exists="append")
I'd like to extract all WC squad wise player stats.
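A hedged workaround (my sketch; the 13-cell assumption simply comes from the indices the loop already uses): innings rows such as "DNB" or "absent hurt" have fewer td cells, so indexing cols[12] raises IndexError and kills the whole run. Guarding each row, as a drop-in replacement for the loop in the question, lets the scrape continue past such players:

for row in rows:
    cols = [x.text.strip() for x in row.find_all('td')]
    if len(cols) < 13:  # fewer cells than the loop body indexes (cols[0]..cols[12])
        continue        # skip DNB / summary rows instead of crashing
    temp_data = OrderedDict()
    temp_data["Runs"] = cols[0]
    # ... the other fields exactly as in the question ...
    temp_data["Date"] = cols[12]
    temp_data["player"] = player
    temp_data["playerid"] = player_id
    list_of_dict.append(temp_data)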
