How can I loop through to pages using selenium? - python

I am trying to scrape data from Oddsportal but I have an incomplete code.
How can I loop through pages for the competition and the season?
I have just started on Selenium and I am very new to it.
My current code is:
browser = webdriver.Chrome()
browser.get("https://www.oddsportal.com/soccer/england/premier-league/results/")
df = pd.read_html(browser.page_source, header=0)[0]
dateList = []
gameList = []
scoreList = []
home_odds = []
draw_odds = []
away_odds = []
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
date = row[1].split('-')[0]
continue
time = row[1]
dateList.append(date)
gameList.append(row[2])
scoreList.append(row[3])
home_odds.append(row[4])
draw_odds.append(row[5])
away_odds.append(row[6])
result = pd.DataFrame({'date': dateList,
'game': gameList,
'score': scoreList,
'Home': home_odds,
'Draw': draw_odds,
'Away': away_odds})

You have to create a for loop first
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[1].text
league = count[2].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
game_data.country.append(country)
game_data.league.append(league)
return game_data
# input URLs here
urls = {}
if __name__ == '__main__':
results = None
for url in urls:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)

Related

unable to implement explicit wait in the code

I am trying to apply explicit wait in the code till the page loads and then I can extract the data. I have tried this solution however I dont know where to insert the same in the code.
browser.implicitly_wait does not seem to work and I dont know why.
code:
import os
import threading
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Un-comment next line to supress logging:
options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
# print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def generate_matches(pgSoup, defaultVal=None):
evtSel = {
'time': 'p.whitespace-nowrap',
'game': 'a div:has(>a[title])',
'score': 'a:has(a[title])+div.hidden',
'home_odds': 'a:has(a[title])~div:not(.hidden)',
'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
}
events, current_group = [], {}
pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
for evt in pgSoup.select('div[set]>div:last-child'):
if evt.parent.select(f':scope>div:first-child+div+div'):
cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
evt.parent.select_one(s) for s in
[':scope>div:first-child+div>div:first-child',
':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
current_group = dict(zip(['date', 'country', 'league'], cgVals))
if pgDate: current_group['date'] = pgDate
evtRow = {'date': current_group.get('date', defaultVal)}
for k, v in evtSel.items():
v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
evtTeams = evt.select('a div>a[title]')
evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
evtRow['country'] = current_group.get('country', defaultVal)
evtRow['league'] = current_group.get('league', defaultVal)
events.append(evtRow)
return events
def parse_data(url, return_urls=False):
browser = create_driver()
browser.get(url)
browser.implicitly_wait(30) # I could not get Explicit wait to work here. implicity_wait does not seem to work at all.
soup = bs(browser.page_source, "lxml")
game_data = GameData()
game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
for row in generate_matches(soup, defaultVal=nan):
for k in game_keys: getattr(game_data, k).append(row.get(k, nan))
if return_urls:
if return_urls:
a_cont = soup.find('div', {'class': 'tabs'})
if a_cont is None:
a_tags = []
else:
a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
urls = [
'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
if not a_tag['href'].startswith('#') # sections in current page
and 'active-item-calendar' not in a_tag['class'] # current page
]
print(pd.DataFrame(urls, columns=['urls']))
return game_data, urls
return game_data
if __name__ == '__main__':
games = None
pool = ThreadPool(5)
# Get today's data and the Urls for the other days:
url_today = 'https://www.oddsportal.com/matches/soccer'
game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
game_data_results = pool.imap(parse_data, urls)
############################ BUILD DATAFRAME ############################
game_n, added_todayGame = 0, False
for game_data in game_data_results:
try:
game_n += 1
gd_df = pd.DataFrame(game_data.__dict__)
games = gd_df if games is None else pd.concat([games, gd_df])
if not added_todayGame:
game_n += 1
gdt_df = pd.DataFrame(game_data_today.__dict__)
games, added_todayGame = pd.concat([games, gdt_df]), True
except Exception as e:
print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
##########################################################################
print('!?NO GAMES?!' if games is None else games) ## print(games)
# ensure all the drivers are "quitted":
del threadLocal # a little extra insurance
import gc
gc.collect()
Where would I insert explicit wait till the page loads fully and then extract the dataframe games?

None of [([ ])] are in the columns

I keep getting the below keyerror and can't figure out what it means or what I should be doing different.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
my code is
from bs4 import BeautifulSoup
import pandas
import os
SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")
box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]
def parse_html(box_score):
with open(box_score) as f:
html = f.read()
soup = BeautifulSoup(html)
[s.decompose() for s in soup.select("tr.over_header")] # this removes the tr tag with class over_header from the html
[s.decompose() for s in soup.select("tr.thead")]
return soup
def read_line_score(soup):
line_score = pandas.read_html(str(soup), attrs = {"id": "line_score"})[0]
cols = list(line_score.columns)
cols[0] = "team"
cols[-1] = "total"
line_score.columns = cols
line_score = line_score[["team", "total"]]
return line_score
def read_stats(soup, team, stat):
df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
df = df.apply(pandas.to_numeric, errors="coerce")
return df
def read_season_info(soup):
nav = soup.select("#bottom_nav_container")[0]
hrefs = [a["href"] for a in nav.find_all("a")]
season = os.path.basename(hrefs[1]).split("_")[0]
return season
base_cols = None
games = []
for box_score in box_scores:
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"]) #grabs just the teams who played each other
summaries = []
for team in teams:
basic = read_stats(soup, team, "basic")
advanced = read_stats(soup, team, "advanced")
totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
totals.index = totals.index.str.lower() # to lower case
maxes = pandas.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
maxes.index = maxes.index.str.lower() + "_max"
summary = pandas.concat([totals, maxes])
if base_cols is None:
base_cols = list(summary.index.drop_duplicates(keep="first"))
base_cols = [b for b in base_cols if "bpm" not in b]
summary - summary[base_cols]
summaries.append(summary)
summary = pandas.concat(summaries, asix=1).T
game = pandas.concat([summary, line_score], axis=1)
game["home"] = [0, 1]
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"
full_game = pandas.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info("soup")
full_game["date"] = os.path.basename(box_score)[:8]
full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)
if len(games) % 100 == 0:
print(f"{len(games)} / {len(box_scores)}")

Iterate over URLS for Webscraping using BeautifulSoup

This is my code to scrape odds from www.oddsportal.com.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Un-comment next line to supress logging:
options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
# print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def generate_matches(table):
global country, league
tr_tags = table.findAll('tr')
for tr_tag in tr_tags:
if 'class' not in tr_tag.attrs:
continue
tr_class = tr_tag['class']
if 'dark' in tr_class:
th_tag = tr_tag.find('th', {'class': 'first2 tl'})
a_tags = th_tag.findAll('a')
country = a_tags[0].text
league = a_tags[1].text
elif 'deactivate' in tr_class:
td_tags = tr_tag.findAll('td')
yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
td_tags[4].text, td_tags[5].text, country, league
def parse_data(url):
browser = create_driver()
browser.get(url)
soup = bs(browser.page_source, "lxml")
div = soup.find('div', {'id': 'col-content'})
table = div.find('table', {'class': 'table-main'})
h1 = soup.find('h1').text
m = re.search(r'\d+ \w+ \d{4}$', h1)
game_date = m[0]
game_data = GameData()
for row in generate_matches(table):
game_data.date.append(game_date)
game_data.time.append(row[0])
game_data.game.append(row[1])
game_data.score.append(row[2])
game_data.home_odds.append(row[3])
game_data.draw_odds.append(row[4])
game_data.away_odds.append(row[5])
game_data.country.append(row[6])
game_data.league.append(row[7])
return game_data
# URLs go here
urls = {
"https://www.oddsportal.com/matches/soccer/20210903/",
}
if __name__ == '__main__':
results = None
# To limit the number of browsers we will use
# (set to a large number if you don't want a limit):
MAX_BROWSERS = 5
pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
for game_data in pool.imap(parse_data, urls):
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(results)
# print(results.head())
# ensure all the drivers are "quitted":
del threadLocal
import gc
gc.collect() # a little extra insurance
Currently, the code just gets data for one urls. I would like
I am trying to integrate this part into my code that allows the pages to be iterated over all the links for "Yesterday, today, tomorrow and the next 5 days" as below:
This part of another code allows to get the URLs.
browser = webdriver.Chrome()
def get_urls(browser, landing_page):
browser.get(landing_page)
urls = [i.get_attribute('href') for i in
browser.find_elements_by_css_selector(
'.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
return urls
....
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
How do I get the urls to integrate with my code and iterate to provide me with one single dataframe?
I had to make some adjustments to function generate_matches since the returning of certain class names was not reliable. And I removed global statements from that function that I never have never should have had.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Un-comment next line to supress logging:
options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
# print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def generate_matches(table):
tr_tags = table.findAll('tr')
for tr_tag in tr_tags:
if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
th_tag = tr_tag.find('th', {'class': 'first2 tl'})
a_tags = th_tag.findAll('a')
country = a_tags[0].text
league = a_tags[1].text
else:
td_tags = tr_tag.findAll('td')
yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
td_tags[4].text, td_tags[5].text, country, league
def parse_data(url, return_urls=False):
browser = create_driver()
browser.get(url)
soup = bs(browser.page_source, "lxml")
div = soup.find('div', {'id': 'col-content'})
table = div.find('table', {'class': 'table-main'})
h1 = soup.find('h1').text
m = re.search(r'\d+ \w+ \d{4}$', h1)
game_date = m[0]
game_data = GameData()
for row in generate_matches(table):
game_data.date.append(game_date)
game_data.time.append(row[0])
game_data.game.append(row[1])
game_data.score.append(row[2])
game_data.home_odds.append(row[3])
game_data.draw_odds.append(row[4])
game_data.away_odds.append(row[5])
game_data.country.append(row[6])
game_data.league.append(row[7])
if return_urls:
span = soup.find('span', {'class': 'next-games-date'})
a_tags = span.findAll('a')
urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
return game_data, urls
return game_data
if __name__ == '__main__':
results = None
pool = ThreadPool(5) # We will be getting, however, 7 URLs
# Get today's data and the Urls for the other days:
game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
urls.pop(1) # Remove url for today: We already have the data for that
game_data_results = pool.imap(parse_data, urls)
for i in range(8):
game_data = game_data_today if i == 1 else next(game_data_results)
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(results)
# print(results.head())
# ensure all the drivers are "quitted":
del threadLocal
import gc
gc.collect() # a little extra insurance
Prints:
date time game score home_odds draw_odds away_odds country league
0 07 Sep 2021 00:00 Pachuca W - Monterrey W  0:1 +219 +280 -106  Mexico Liga MX Women
1 07 Sep 2021 01:05 Millonarios - Patriotas 1:0 -303 +380 +807  Colombia Primera A
2 07 Sep 2021 02:00 Club Tijuana W - Club Leon W  4:0 -149 +293 +311  Mexico Liga MX Women
3 07 Sep 2021 08:30 Suzhou Dongwu - Nanjing City 0:0 +165 +190 +177  China Jia League
4 07 Sep 2021 08:45 Kuching City FC - Sarawak Utd. 1:0 +309 +271 -143  Malaysia Premier League
... ... ... ... ... ... ... ... ... ...
1305 14 Sep 2021 21:45 Central Cordoba - Atl. Tucuman +192 +217 +146 13  Argentina Liga Profesional
1306 14 Sep 2021 22:00  Colo Colo - Everton -141 +249 +395 11  Chile Primera Division
1307 14 Sep 2021 23:30  Columbus Crew - New York Red Bulls - - - 1  USA MLS
1308 14 Sep 2021 23:30  New York City - FC Dallas - - - 1  USA MLS
1309 14 Sep 2021 23:30  Toronto FC - Inter Miami - - - 1  USA MLS
[1310 rows x 9 columns]
I'd suggest you to integrate this method when iterating over urls.
Code snippet-
#assuming you have a list of start_urls
start_urls=['https://www.oddsportal.com/matches/soccer/20210903/']
urls=[]
#get links for Yesterday, today, tomorrow and the next 5 days
for start_url in start_urls:
driver.get(start_url)
html_source=driver.page_source
soup=BeautifulSoup(html_source,'lxml')
dates=soup.find('span',class_='next-games-date')
links=dates.find_all('a')
for link in links:
urls.append(('https://www.oddsportal.com'+link['href']))
#get data from each link
for url in urls:
driver.get(url)
#function call to parse data
#function call to append data

IndexError: list index out of range while webscraping

This code is giving an IndexError: list index out of range
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.score = []
self.date = []
self.time = []
self.country = []
self.league = []
self.game = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
def append(self, score):
pass
def get_urls(browser, landing_page):
browser.get(landing_page)
urls = [i.get_attribute('href') for i in
browser.find_elements_by_css_selector(
'.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
return urls
def parse_data(html):
global league
df = pd.read_html(html, header=0)[0]
# print(len(df.index))
# print(df.columns)
html = browser.page_source
soup = bs(html, "lxml")
# print(len(soup.select('#table-matches tr')))
scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
soup.select('#table-matches tr:nth-of-type(n+2)')]
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[0].text
game_data = GameData()
game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
leagues = [i.text for i in soup.select('.first2 > a:last-child')]
n = 0
for number, row in enumerate(df.itertuples()):
if n == 0 or '»' in row[1]:
league = leagues[n]
n += 1
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
country = row[1].split('»')[0]
continue
game_time = row[1]
print(len(scores))
print(len(scores))
game_data.date.append(game_date)
game_data.time.append(game_time)
game_data.country.append(country)
game_data.league.append(league)
game_data.game.append(row[2])
game_data.score.append(scores[number])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
return game_data
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
when I try to find out about the error by
print(len(scores))
print(scores[number])
346
2:3
346
0:2
346
1:3
346
1:1
......
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 112, in <module>
game_data = parse_data(html)
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 84, in parse_data
print(scores[number])
IndexError: list index out of range
While
print(scores[number])
is
2:3
0:2
1:1
on a good day
How can i resolve this?

Python web scraper not getting certain values

I'm having trouble with my web scraper not getting the "Odds" values and not sure what is wrong. For each piece of information, I am using a try/except to see if the element is available. I'm not sure what is wrong with getting the Odds values though. Thanks for the help
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
odd_margin = '-'
odd_avail = False
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
sim_team_win = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds, simulation margin points, and optimal bet
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
if sim_margin > odd_margin:
bet = odd_team_win
else:
if odd_team_win == homeTeam:
bet = awayTeam
else:
bet = homeTeam
else:
diff = odd_margin + sim_margin
bet = sim_team_win
else:
diff = -1
bet = '-'
# Create table
row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet' : bet}
rows.append(row)
df = pd.DataFrame(rows)
df = df.sort_values(by = ['Diff'], ascending = False)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code everything works perfectly and gets all other values but all the odds values in the table are '-'.
I added a few things into the code, to account for
If the odds are Even (versus if there are no odds
If a team doesn't have a logo, to still but the team name
As far as the odds not showing. Check the csv file to see if it's there. If it is, might just be a preference you need to change in pycharm (might be just cutting off some of the string)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
if 'Even' in matchup:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = 0
else:
odd_margin = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
if 'wins' in data[2].text:
sim_team_win = data[2].text.split('wins')[0].strip()
else:
sim_team_win = '-'
odd_avail = False
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds and simulation margin points
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
else:
diff = odd_margin + sim_margin
else:
diff = '-'
# Create table
row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team':homeTeam, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
rows.append(row)
df = pd.DataFrame(rows)
print (df.to_string())
# df.to_csv('odds.csv', index=False)

Categories

Resources