Iterate over URLS for Webscraping using BeautifulSoup - python

This is my code to scrape odds from www.oddsportal.com.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re
class Driver:
    """Wraps a headless Chrome WebDriver so it is quit automatically when
    the wrapper object is garbage-collected (one per scraping thread)."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
    """Return this thread's Chrome instance, creating it on first use.

    The Driver wrapper is cached in thread-local storage so each pool
    thread reuses a single browser.
    """
    cached = getattr(threadLocal, 'the_driver', None)
    if cached is None:
        cached = Driver()
        threadLocal.the_driver = cached
    return cached.driver
class GameData:
    """Column-wise accumulator for scraped match rows.

    Each attribute is one output column; GameData.__dict__ feeds straight
    into pd.DataFrame.
    """

    # One parallel list per output column, in DataFrame column order.
    _FIELDS = ('date', 'time', 'game', 'score', 'home_odds',
               'draw_odds', 'away_odds', 'country', 'league')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])
def generate_matches(table):
    """Yield one tuple per completed game row in *table*.

    Header rows (class 'dark') update the current country/league; data rows
    (class 'deactivate') are yielded as
    (time, game, score, home_odds, draw_odds, away_odds, country, league).
    """
    # Track the current section header in locals -- the original used
    # module-level globals, which the author themselves flagged as wrong.
    # None defaults avoid a NameError if a data row precedes any header.
    country = league = None
    for tr_tag in table.findAll('tr'):
        if 'class' not in tr_tag.attrs:
            continue
        tr_class = tr_tag['class']
        if 'dark' in tr_class:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        elif 'deactivate' in tr_class:
            td_tags = tr_tag.findAll('td')
            if len(td_tags) < 6:
                # Malformed/short row -- skip instead of raising IndexError.
                continue
            yield (td_tags[0].text, td_tags[1].text, td_tags[2].text,
                   td_tags[3].text, td_tags[4].text, td_tags[5].text,
                   country, league)
def parse_data(url):
    """Load *url* in this thread's browser and return a populated GameData."""
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    # The page heading ends with the date, e.g. "... 03 Sep 2021".
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    return game_data
# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    results = None
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
    for game_data in pool.imap(parse_data, urls):
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to accumulate frames.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
Currently, the code only gets data for one URL. I would like it to iterate over all of them.
I am trying to integrate this part into my code that allows the pages to be iterated over all the links for "Yesterday, today, tomorrow and the next 5 days" as below:
This part of another code allows to get the URLs.
browser = webdriver.Chrome()

def get_urls(browser, landing_page):
    """Return the hrefs of the other day tabs (yesterday, tomorrow, +5 days)
    from the matches landing page.

    NOTE(review): find_elements_by_css_selector was removed in Selenium 4;
    newer code should use browser.find_elements(By.CSS_SELECTOR, ...).
    """
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
    return urls
....
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
How do I get the urls to integrate with my code and iterate to provide me with one single dataframe?

I had to make some adjustments to the function generate_matches, since the class names it relied on were not returned reliably. I also removed the global statements from that function, which I never should have had.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re
class Driver:
    """Headless Chrome wrapper; quits the browser when garbage-collected."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
    """Return this thread's cached Chrome driver, creating it on first use."""
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver
class GameData:
    """Parallel lists of scraped fields, one per output column;
    GameData.__dict__ maps straight onto a pd.DataFrame."""

    def __init__(self):
        self.date = []        # page date each row was scraped from
        self.time = []        # kick-off time
        self.game = []        # "Home - Away"
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []
def generate_matches(table):
    """Yield (time, game, score, home_odds, draw_odds, away_odds, country,
    league) for every data row in *table*.

    Rows with class 'dark' are section headers that set the current
    country/league for the rows that follow.
    """
    # Defaults in case a data row appears before any 'dark' header row;
    # the original raised UnboundLocalError in that case.
    country = league = None
    for tr_tag in table.findAll('tr'):
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            if len(td_tags) < 6:
                # Skip short/malformed rows instead of raising IndexError.
                continue
            yield (td_tags[0].text, td_tags[1].text, td_tags[2].text,
                   td_tags[3].text, td_tags[4].text, td_tags[5].text,
                   country, league)
def parse_data(url, return_urls=False):
    """Scrape *url* into a GameData; with return_urls=True additionally
    return the 'next games' day URLs found on the page."""
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)  # heading ends with the page date
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    if return_urls:
        # Day-selector links above the table, e.g. yesterday/tomorrow/+5 days.
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data
if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        # Slot today's already-fetched data in at position 1.
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # accumulate with pd.concat instead.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
Prints:
date time game score home_odds draw_odds away_odds country league
0 07 Sep 2021 00:00 Pachuca W - Monterrey W  0:1 +219 +280 -106  Mexico Liga MX Women
1 07 Sep 2021 01:05 Millonarios - Patriotas 1:0 -303 +380 +807  Colombia Primera A
2 07 Sep 2021 02:00 Club Tijuana W - Club Leon W  4:0 -149 +293 +311  Mexico Liga MX Women
3 07 Sep 2021 08:30 Suzhou Dongwu - Nanjing City 0:0 +165 +190 +177  China Jia League
4 07 Sep 2021 08:45 Kuching City FC - Sarawak Utd. 1:0 +309 +271 -143  Malaysia Premier League
... ... ... ... ... ... ... ... ... ...
1305 14 Sep 2021 21:45 Central Cordoba - Atl. Tucuman +192 +217 +146 13  Argentina Liga Profesional
1306 14 Sep 2021 22:00  Colo Colo - Everton -141 +249 +395 11  Chile Primera Division
1307 14 Sep 2021 23:30  Columbus Crew - New York Red Bulls - - - 1  USA MLS
1308 14 Sep 2021 23:30  New York City - FC Dallas - - - 1  USA MLS
1309 14 Sep 2021 23:30  Toronto FC - Inter Miami - - - 1  USA MLS
[1310 rows x 9 columns]

I'd suggest you to integrate this method when iterating over urls.
Code snippet-
# assuming you have a list of start_urls
start_urls = ['https://www.oddsportal.com/matches/soccer/20210903/']
urls = []
# get links for Yesterday, today, tomorrow and the next 5 days
# (assumes a `driver` WebDriver instance is already in scope)
for start_url in start_urls:
    driver.get(start_url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    dates = soup.find('span', class_='next-games-date')
    links = dates.find_all('a')
    for link in links:
        # hrefs on the page are relative; prefix the site root
        urls.append(('https://www.oddsportal.com' + link['href']))
# get data from each link
for url in urls:
    driver.get(url)
    # function call to parse data
    # function call to append data

Related

unable to implement explicit wait in the code

I am trying to apply an explicit wait in the code until the page loads, and only then extract the data. I have tried this solution; however, I don't know where to insert it in the code.
browser.implicitly_wait does not seem to work, and I don't know why.
code:
import os
import threading
from math import nan
from multiprocessing.pool import ThreadPool
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
class Driver:
    """Headless Chrome wrapper; quits the browser when garbage-collected."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')
threadLocal = threading.local()
def create_driver():
    """Return this thread's cached Chrome driver, creating it on first use."""
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver
class GameData:
    """Parallel lists of scraped fields; __dict__ feeds pd.DataFrame."""

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []
def generate_matches(pgSoup, defaultVal=None):
    """Extract one dict per event row from the (new-layout) matches page.

    Returns a list of dicts keyed date/time/game/score/home_odds/draw_odds/
    away_odds/country/league; missing fields get *defaultVal*.
    """
    # CSS selectors for the per-event fields, relative to the event container.
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])',
        'score': 'a:has(a[title])+div.hidden',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }
    events, current_group = [], {}
    # Page-level date parsed from the H1 (text after the first comma).
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
    for evt in pgSoup.select('div[set]>div:last-child'):
        if evt.parent.select(f':scope>div:first-child+div+div'):
            # This container starts a new date/country/league group; pull the
            # three header values from the group's first child.
            cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
                evt.parent.select_one(s) for s in
                [':scope>div:first-child+div>div:first-child',
                 ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                 ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            # Prefer the page-level H1 date when available.
            if pgDate: current_group['date'] = pgDate
        evtRow = {'date': current_group.get('date', defaultVal)}
        for k, v in evtSel.items():
            # Collapse internal whitespace in each extracted string.
            v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
            evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
        evtTeams = evt.select('a div>a[title]')
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)
        events.append(evtRow)
    return events
def parse_data(url, return_urls=False):
    """Fetch *url*, parse its match rows into a GameData; with
    return_urls=True also return the day-tab URLs found on the page."""
    browser = create_driver()
    browser.get(url)
    # NOTE(review): implicitly_wait only affects Selenium find_element*
    # calls; it does nothing for browser.page_source. An explicit
    # WebDriverWait on a selector such as 'div[set]' before reading
    # page_source is the reliable fix the question asks about.
    browser.implicitly_wait(30)
    soup = bs(browser.page_source, "lxml")
    game_data = GameData()
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys:
            getattr(game_data, k).append(row.get(k, nan))
    # The original had this `if return_urls:` duplicated on two consecutive
    # lines; once is enough.
    if return_urls:
        a_cont = soup.find('div', {'class': 'tabs'})
        if a_cont is None:
            a_tags = []
        else:
            a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
        urls = [
            'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
            if not a_tag['href'].startswith('#')  # sections in current page
            and 'active-item-calendar' not in a_tag['class']  # current page
        ]
        print(pd.DataFrame(urls, columns=['urls']))
        return game_data, urls
    return game_data
if __name__ == '__main__':
    games = None
    pool = ThreadPool(5)
    # Get today's data and the Urls for the other days:
    url_today = 'https://www.oddsportal.com/matches/soccer'
    game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
    game_data_results = pool.imap(parse_data, urls)
    ############################ BUILD DATAFRAME ############################
    game_n, added_todayGame = 0, False
    for game_data in game_data_results:
        try:
            game_n += 1
            gd_df = pd.DataFrame(game_data.__dict__)
            games = gd_df if games is None else pd.concat([games, gd_df])
            # Splice today's (already fetched) data in after the first result.
            if not added_todayGame:
                game_n += 1
                gdt_df = pd.DataFrame(game_data_today.__dict__)
                games, added_todayGame = pd.concat([games, gdt_df]), True
        except Exception as e:
            print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
    ##########################################################################
    print('!?NO GAMES?!' if games is None else games)  ## print(games)
    # ensure all the drivers are "quitted":
    del threadLocal  # a little extra insurance
    import gc
    gc.collect()
Where would I insert explicit wait till the page loads fully and then extract the dataframe games?

IndexError: list index out of range while webscraping

This code is giving an IndexError: list index out of range
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
    """Parallel-list container for scraped rows (one list per column)."""

    _COLUMNS = ('score', 'date', 'time', 'country', 'league',
                'game', 'home_odds', 'draw_odds', 'away_odds')

    def __init__(self):
        for column in self._COLUMNS:
            setattr(self, column, [])

    def append(self, score):
        """No-op kept for compatibility with the original interface."""
        pass
def get_urls(browser, landing_page):
    """Return hrefs of the other day tabs from the matches landing page."""
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
    return urls
def parse_data(html):
    """Parse one matches page into a GameData.

    NOTE(review): this is the code the IndexError question is about --
    `scores[number]` indexes a list built from the page's <tr> elements
    using `number`, which enumerates the pandas rows (including header and
    section rows that are skipped with `continue`), so the two indices
    drift apart and can run past the end of `scores`.
    """
    global league
    df = pd.read_html(html, header=0)[0]
    # print(len(df.index))
    # print(df.columns)
    # The *html* argument is shadowed here; the page is re-read from the
    # global browser -- presumably the same page. TODO confirm.
    html = browser.page_source
    soup = bs(html, "lxml")
    # print(len(soup.select('#table-matches tr')))
    # One entry per table row from the second <tr> on (nan when no score).
    scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
              soup.select('#table-matches tr:nth-of-type(n+2)')]
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[0].text
    game_data = GameData()
    # Date from the '.bold' link's href segment, e.g. .../20210903/
    game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
    leagues = [i.text for i in soup.select('.first2 > a:last-child')]
    n = 0
    for number, row in enumerate(df.itertuples()):
        # NOTE(review): '»' in row[1] raises TypeError when row[1] is not a
        # string; the isinstance check below comes too late for this line.
        if n == 0 or '»' in row[1]:
            league = leagues[n]
            n += 1
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            # Section row: "Country » League" -- update country, skip row.
            country = row[1].split('»')[0]
            continue
        game_time = row[1]
        print(len(scores))
        print(len(scores))
        game_data.date.append(game_date)
        game_data.time.append(game_time)
        game_data.country.append(country)
        game_data.league.append(league)
        game_data.game.append(row[2])
        game_data.score.append(scores[number])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    return game_data
if __name__ == '__main__':
    start_url = "https://www.oddsportal.com/matches/soccer/"
    urls = []
    browser = webdriver.Chrome()
    results = None
    urls = get_urls(browser, start_url)
    urls.insert(0, start_url)
    for number, url in enumerate(urls):
        # get_urls already left the browser on start_url (index 0).
        if number > 0:
            browser.get(url)
        html = browser.page_source
        game_data = parse_data(html)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)
when I try to find out about the error by
print(len(scores))
print(scores[number])
346
2:3
346
0:2
346
1:3
346
1:1
......
Traceback (most recent call last):
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 112, in <module>
game_data = parse_data(html)
File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_10.py", line 84, in parse_data
print(scores[number])
IndexError: list index out of range
While
print(scores[number])
is
2:3
0:2
1:1
on a good day
How can I resolve this?

Python beautifulsoup code not looping elements correctly

I'm not sure what the problem is. But I have a small script using Selenium and Beautifulsoup 4 to visit and parse contents of www.oddsportal.com
Code below not looping for league
The row no is [1] for game_data.league.append(count[1].text) but the value is repeating for that webpage instead for every row.
My code:
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
    """Parallel lists of scraped fields; __dict__ feeds pd.DataFrame."""

    def __init__(self):
        self.score = []
        self.date = []
        self.time = []
        self.country = []
        self.league = []
        self.game = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []

    def append(self, score):
        # Intentionally a no-op in the original; kept as-is.
        pass
def get_urls(browser, landing_page):
    """Return hrefs of the other day tabs from the matches landing page."""
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
    return urls
def parse_data(html):
    """Parse one matches page into a GameData.

    NOTE(review): league is taken from count[1] -- the first header on the
    page -- so every row gets the same league value; that repetition is the
    bug this question is about (fixed in the answer below).
    """
    df = pd.read_html(html, header=0)[0]
    # The *html* argument is shadowed; the page is re-read from the global
    # browser -- presumably the same page. TODO confirm.
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[0].text
    game_data = GameData()
    # Date from the '.bold' link's href segment, e.g. .../20210610/
    game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            # Section row: "Country » League" -- update country, skip row.
            country = row[1].split('»')[0]
            continue
        game_time = row[1]
        score = row[3] if row[3] else nan
        game_data.date.append(game_date)
        game_data.time.append(game_time)
        game_data.country.append(country)
        game_data.league.append(count[1].text)
        game_data.game.append(row[2])
        game_data.score.append(score)
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    return game_data
if __name__ == '__main__':
    start_url = "https://www.oddsportal.com/matches/soccer/"
    urls = []
    browser = webdriver.Chrome()
    results = None
    urls = get_urls(browser, start_url)
    urls.insert(0, start_url)
    for number, url in enumerate(urls):
        # get_urls already left the browser on start_url (index 0).
        if number > 0:
            browser.get(url)
        html = browser.page_source
        game_data = parse_data(html)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)
results:
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| | score | date | time | country | league | game | home_odds | draw_odds | away_odds |
+=====+=========================+============+========+===========+===============+=========================+=============+=============+=============+
| 496 | Inter Turku - Mariehamn | 2021-06-10 | 15:00 | Finland | Veikkausliiga | Inter Turku - Mariehamn | 1.4 | 4.6 | 7.49 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 497 | KTP - HIFK | 2021-06-10 | 15:30 | Finland | Veikkausliiga | KTP - HIFK | 3.42 | 3.17 | 2.18 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 498 | Haka - HJK | 2021-06-10 | 15:30 | Finland | Veikkausliiga | Haka - HJK | 6.56 | 4.25 | 1.47 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 499 | SJK - KuPS | 2021-06-10 | 15:30 | Finland | Veikkausliiga | SJK - KuPS | 3.34 | 3.25 | 2.18 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 500 | Lahti - Ilves | 2021-06-10 | 15:30 | Finland | Veikkausliiga | Lahti - Ilves | 2.5 | 3.08 | 2.93 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
How do I get to loop the correct values for every row instead of the same value for the entire page?
To answer your specific problem, and not address the other issues I see, you need to alter your logic for determining when to add league
if n == 0 or '»' in row[1]:
league = leagues[n]
n+=1
I would also retrieve leagues as its own list:
leagues = [i.text for i in soup.select('.first2 > a:last-child')]
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
    """Parallel lists of scraped fields; __dict__ feeds pd.DataFrame."""

    def __init__(self):
        self.score = []
        self.date = []
        self.time = []
        self.country = []
        self.league = []
        self.game = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []

    def append(self, score):
        # Intentionally a no-op in the original; kept as-is.
        pass
def get_urls(browser, landing_page):
    """Return hrefs of the other day tabs from the matches landing page."""
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
    return urls
def parse_data(html):
    """Parse one matches page into a GameData (answer version: the league is
    looked up per section from the `leagues` list instead of being a single
    page-wide value)."""
    df = pd.read_html(html, header=0)[0]
    # The *html* argument is shadowed; the page is re-read from the global
    # browser -- presumably the same page. TODO confirm.
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[0].text
    game_data = GameData()
    # Date from the '.bold' link's href segment, e.g. .../20210610/
    game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
    # One league name per section header on the page, in document order.
    leagues = [i.text for i in soup.select('.first2 > a:last-child')]
    n = 0
    for row in df.itertuples():
        # Advance to the next league on the first row and on every
        # "Country » League" section row.
        # NOTE(review): '»' in row[1] assumes row[1] is a string here;
        # a non-string first column would raise TypeError -- verify.
        if n == 0 or '»' in row[1]:
            league = leagues[n]
            n += 1
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            country = row[1].split('»')[0]
            continue
        game_time = row[1]
        score = row[3] if row[3] else nan
        game_data.date.append(game_date)
        game_data.time.append(game_time)
        game_data.country.append(country)
        game_data.league.append(league)
        game_data.game.append(row[2])
        game_data.score.append(score)
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    return game_data
if __name__ == '__main__':
    start_url = "https://www.oddsportal.com/matches/soccer/"
    urls = []
    browser = webdriver.Chrome()
    results = None
    urls = get_urls(browser, start_url)
    urls.insert(0, start_url)
    for number, url in enumerate(urls):
        # get_urls already left the browser on start_url (index 0).
        if number > 0:
            browser.get(url)
        html = browser.page_source
        game_data = parse_data(html)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)

How can I loop through to pages using selenium?

I am trying to scrape data from Oddsportal but I have an incomplete code.
How can I loop through pages for the competition and the season?
I have just started on Selenium and I am very new to it.
My current code is:
browser = webdriver.Chrome()
browser.get("https://www.oddsportal.com/soccer/england/premier-league/results/")
# First HTML table on the page holds the results grid.
df = pd.read_html(browser.page_source, header=0)[0]
dateList = []
gameList = []
scoreList = []
home_odds = []
draw_odds = []
away_odds = []
for row in df.itertuples():
    if not isinstance(row[1], str):
        continue
    elif ':' not in row[1]:
        # Rows without a kick-off time are date separators like
        # "10 Jun 2021 - ...": keep the date part for subsequent rows.
        date = row[1].split('-')[0]
        continue
    time = row[1]
    dateList.append(date)
    gameList.append(row[2])
    scoreList.append(row[3])
    home_odds.append(row[4])
    draw_odds.append(row[5])
    away_odds.append(row[6])
result = pd.DataFrame({'date': dateList,
                       'game': gameList,
                       'score': scoreList,
                       'Home': home_odds,
                       'Draw': draw_odds,
                       'Away': away_odds})
You have to create a for loop first
class GameData:
    """Holds scraped columns as parallel lists; __dict__ maps straight to a
    pd.DataFrame."""

    _COLUMNS = ('date', 'time', 'game', 'score', 'home_odds',
                'draw_odds', 'away_odds', 'country', 'league')

    def __init__(self):
        for column in self._COLUMNS:
            setattr(self, column, [])
def parse_data(url):
    """Scrape a results page (tournamentTable layout) into a GameData."""
    browser.get(url)
    df = pd.read_html(browser.page_source, header=0)[0]
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    # On the results layout the header links are [home, country, league].
    # NOTE(review): assumed from the indices used here -- verify on the page.
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            # Date separator row, e.g. "10 Jun 2021 - ..."
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
# input URLs here
# NOTE(review): {} is an empty dict; an empty set() or list would express
# "collection of URLs" better, but iteration behaves the same when empty.
urls = {}

if __name__ == '__main__':
    results = None
    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        # DataFrame.append was removed in pandas 2.0 -- use pd.concat.
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)

Python 3, bs4, webcrawler; error connecting too website

I am trying to build a web-crawler for a specific website.
But for some reason I won't connect to the website.
I get an error (one I raise myself) saying it can't connect.
Using Selenium to call up the website, I can see that it doesn't connect.
As a newbie I am probably making a stupid mistake but I can't figure out what.
Hoping you are willing to help me.
import csv
import requests
import datetime
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
captcha = input('Press Enter after bypassing Captcha')
# def get_driver():
# driver = webdriver.Chrome()
# return driver
def get_driver():
    """Create and return a headless Chrome WebDriver."""
    # initialize options
    options = webdriver.ChromeOptions()
    # pass in headless argument to options
    options.add_argument('--headless')
    # initialize driver; the 'chrome_options' keyword was deprecated and
    # removed in Selenium 4 -- 'options' is the supported parameter.
    driver = webdriver.Chrome(options=options)
    return driver
def connect_to_base(browser, page_number):
    """Load results page *page_number* and wait for the listings to render.

    Returns True once an element with class 'result-content' is present,
    False after three failed attempts.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    html = None
    links = None
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for table element with id = 'map' to load
            # before returning True
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            connection_attempts += 1
            print(f'Error connecting to {base_url}')
            print(f'Attempt #{connection_attempts}')
    return False
def parse_html(html):
    """Parse a results page, visit each listing, and return a list of dicts
    with address ('adres'), characteristics ('kenmerken'), valuation
    ('waarde') and the listing URL.

    Uses the module-level `browser` to navigate to each listing page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
    # Make empty lists with header lines
    output_list = []
    listing = 1
    for items in inside:
        href = items.get('href')
        # The original called href.format(page) with an undefined 'page'
        # variable (NameError); the href is already the listing URL.
        url1 = href
        # XPath attribute selectors use '@', not '#' (original bug).
        if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
            browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        connection_attempts = 0
        while connection_attempts < 3:
            try:
                browser.get(url1)
                WebDriverWait(browser, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
                # The original 'return True' aborted the whole function on the
                # first successful load; break only ends the retry loop.
                break
            except Exception:
                connection_attempts += 1
                # The original printed undefined 'base_url' here (NameError).
                print(f'Error connecting to {url1}')
                print(f'Attempt #{connection_attempts}')
        # 'html' alone is not a valid parser name for BeautifulSoup.
        details = BeautifulSoup(browser.page_source, 'html.parser')
        adres = details.find_all('div', {'class': 'detail-address'})
        try:
            adres = adres[0].get_text(separator=',', strip=True)
        except IndexError:  # the original 'Indexerror' raised NameError
            adres = "Unknown"
        kenmerken = details.find_all('div', {'class': 'detail-tab-content kenmerken'})
        try:
            tr_kenmerken = ','.join([td.text.strip() for td in kenmerken[0].select('td.value')])
        except IndexError:
            tr_kenmerken = 'Unknown'
        waarde = details.find_all('div', {'class': 'detail-tab-content woningwaarde'})
        try:
            tr_waarde = ','.join([td.text.strip() for td in waarde[0].select('td.value')])
        except IndexError:
            tr_waarde = 'Unknown'
        informatie = {
            'adres': adres,
            'kenmerken': tr_kenmerken,
            'waarde': tr_waarde,
            'url': href
        }
        output_list.append(informatie)
        listing += 1
    return output_list
def get_load_time(article_url):
    """Return the server response time for *article_url* in seconds, or the
    string 'Loading Error' when the request fails."""
    # Present a desktop browser user agent.
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        response = requests.get(
            article_url, headers=request_headers, stream=True, timeout=3.000)
        return response.elapsed.total_seconds()
    except Exception:
        return 'Loading Error'
def write_to_file(output_list, filename):
    """Append each row dict from *output_list* to *filename* as CSV.

    Rows are the dicts built by parse_html, keyed adres/kenmerken/waarde/url.
    """
    # 'link' in the original fieldnames did not match the rows' 'url' key,
    # which makes DictWriter raise ValueError. Open the file once instead of
    # per row, and pass newline='' as the csv module requires.
    fieldnames = ['adres', 'kenmerken', 'waarde', 'url']
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(output_list)
def run_process(page_number, filename, browser):
    """Scrape one results page and append its listings to *filename*."""
    if connect_to_base(browser, page_number):
        sleep(2)  # give the page a moment to finish rendering
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to jaap')
if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl pages 1..3
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
I see you fixed EC.presence_of_element_located((By.ID,{'class':'result-content'})) to be EC.presence_of_element_located((By.CLASS_NAME,'result-content')))
Next, you might have an issue with (depending where the browser is opened) of having to bypass/clicking a javascript that says you are ok and accept cookies.
But all that code seems to be an awful lot of work considering the data is stored as a json format in the script tags from the html. Why not just simply use requests, pull out the json, convert to dataframe, then write to csv?
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
def run_process(page_number):
    """Fetch listing page `page_number` from jaap.nl and return its
    properties as a flat DataFrame.

    The page embeds its listing data as JSON inside a
    <script id="page-data"> tag, so plain requests + BeautifulSoup is
    enough — no browser automation needed.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
    # (removed in modern pandas).
    df = pd.json_normalize(jsonData['properties'])
    return df
if __name__ == '__main__':
    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append is deprecated (removed in pandas 2.0), and repeated
    # appends inside a loop are quadratic; pd.concat on a list is the
    # supported idiom.
    frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        frames.append(run_process(current_page))
    final_df = pd.concat(frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
Output:
Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds
and the csv file that looks like:
app area detailsUrl expired houseTypeValue id latLng latLng.latitude latLng.longitude location.city location.street location.zipcode lotSize market numberOfRooms openHouseDate openHouseTimes openhouse photo price priceToShow showoffColor showoffCustomText showoffPhotoText spotlight status veiling
0 False 165 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 6899666 NaN 52.368420 4.833631 AMSTERDAM Hof van Versailles 61 1064NX 216 sale 4 None None False 10014EAAF8B8883668593EFAC9E5FF1C 595000.0 595000.0 None None None False Sale False
1 True 211 /te-koop/noord+holland/groot-amsterdam/amsterd... False Appartement 10585731 NaN 52.327550 4.889076 AMSTERDAM Beysterveld 35 1083KA Onbekend sale 4 None None False E4F9E5BC7BC90B5B92C7BD8D48B7A677 925000.0 925000.0 None None None False Sale False
2 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Dubbele bovenwoning 11731386 NaN 52.341890 4.896053 AMSTERDAM Uiterwaardenstraat 320 2 1079DC Onbekend sale 5 None None False AB9F45B2CD4AD7879C5A80F18092F9D4 750000.0 750000.0 None None None False SoldConditionally False
3 False 269 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 11840681 NaN 52.358266 4.875508 AMSTERDAM Korte van Eeghenstraat 4 1071ER 107 sale 9 None None False A3DF2B1D426B5E4D501503C5D0E66966 3100000.0 3100000.0 None None None False Sale False
4 False 100 /te-koop/noord+holland/groot-amsterdam/amsterd... False Tussenwoning 12152943 NaN 52.421245 4.899478 AMSTERDAM Pieter A v Heijningestraat 9 1035SV 83 sale 5 None None False 55C6F589523FA553D67A709776DD70DD 399000.0 399000.0 None None None False Sale False
5 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Bovenwoning 15796874 NaN NaN NaN AMSTERDAM Eerste Amstelvlietpad 20 1096GB Onbekend sale 3 None None False AE822B627ED096310B9ECBE7756340C8 1200000.0 1200000.0 None None None False Sale False
6 True 76 /te-koop/noord+holland/groot-amsterdam/amsterd... False Benedenwoning 10580650 NaN 52.346010 4.888799 AMSTERDAM Grevelingenstraat 18 HS 1078KP Onbekend sale 2 None None False 6FD1011D917E776DCF4DA836B5FFEE3E 550000.0 550000.0 None None None False SoldConditionally False
7 False 298 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9623182 NaN 52.330610 4.862902 AMSTERDAM Cannenburg 51 1081GW 651 sale 7 None None False 15FA170B99D4E2DEA03B6FC27E3B5B74 2495000.0 2495000.0 None None None False Sale False
8 False 270 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 15791215 NaN 52.347780 5.004530 AMSTERDAM Nico Jessekade 189 1087MR 200 sale 9 None None False 6EA5C0CDA0475DFC88A3A918A6B2909A 1549000.0 1549000.0 None None None False SoldConditionally False
9 False 201 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9617942 NaN 52.377391 4.764554 AMSTERDAM Osdorperweg 803 1067SW 1348 sale 6 None None False 4680429D99EC5AC47C950D57A77DF1EB 950000.0 950000.0 None None None False Sale False
UPDATE:
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
def run_process(page_number):
    """Scrape listing page `page_number`, then visit every property's detail
    page and merge its HTML tables into the listing DataFrame.

    Bug fixed: the original reassigned `page_number = 1` on the first line,
    shadowing the parameter so every call scraped page 1 regardless of the
    argument. Also replaces deprecated pandas.io.json.json_normalize /
    DataFrame.append with pd.json_normalize / pd.concat and drops dead
    `w = 1` debugger assignments.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    df = pd.json_normalize(jsonData['properties'])
    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']
    detail_frames = []
    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)
        for each in dfs:
            # Skip tables that carry no data at all.
            if each.isnull().all().all():
                continue
            each = each.dropna(axis=0, how='all')
            # "Voorziening" tables have a distance column appended per value
            # and need special reshaping below.
            specialCase = False
            for col in list(each.columns):
                if each[col].dtypes == 'object':
                    if each[col].str.contains('Voorziening').any():
                        specialCase = True
                        break
            if specialCase:
                # Strip trailing '. ' noise from the text cells.
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                # Join value and distance with a '---' marker, then split the
                # marker back out into '<col>' / '<col>.distance' columns.
                cols1 = list(each.iloc[2:, 0])
                each = each.iloc[2:, :]
                each[1] = each[1] + '---' + each[2]
                each = each.iloc[:, -2]
                each.index = cols1
                each = each.to_frame().T
                propRow = each
                propRow.index = [0]
                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(
                        propRow[col].str.split('---', expand=True).rename(
                            columns={0: col, 1: col + '.distance'}),
                        left_index=True, right_index=True)
                propRow = temp_df
            else:
                # Generic key/value table: strip noise, transpose so keys
                # become '<tableName>_<key>' column labels of a single row.
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                temp_df = each.T
                cols = [temp_df.index[0] + '_' + colName
                        for colName in list(temp_df.iloc[0, :])]
                propRow = temp_df.iloc[-1, :]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]
            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)
        # Re-key the single detail row to the listing's index for the merge.
        propDetails.index = [idx]
        detail_frames.append(propDetails)
    allPropDetails = pd.concat(detail_frames, sort=True) if detail_frames else pd.DataFrame()
    df = df.merge(allPropDetails, how='left', left_index=True, right_index=True)
    return df
if __name__ == '__main__':
    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    # Gather one DataFrame per page, then concatenate once: DataFrame.append
    # is deprecated (removed in pandas 2.0) and looped appends are quadratic.
    page_frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        page_frames.append(run_process(current_page))
    final_df = pd.concat(page_frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

Categories

Resources