from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd
s = Service(r"C:\selenium driver\chromedriver.exe")  # raw string so the backslashes are not treated as escapes
driver = webdriver.Chrome(service=s)
companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []
driver.get("https://www.saveface.co.uk/search/")
driver.implicitly_wait(10)
blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")
for block in range(30):
    company_name = blocks[block].find_element(By.XPATH, "//h3[@class='resulttitle']").text.strip()
    companies_names.append(company_name)
    person_name = blocks[block].find_element(By.XPATH, "//p[@class='name_wrapper']").text.strip()
    persons_names.append(person_name)
    phone_number = blocks[block].find_element(By.XPATH, "//div[@class='searchContact phone']").text.strip()
    phones_numbers.append(phone_number)
    location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']").text.strip()
    locations.append(location)
    opening_hour = blocks[block].find_element(By.XPATH, "//li[@class='opening-hours']").text.strip()
    opening_hours.append(opening_hour)
    profile = blocks[block].find_element(By.XPATH, "//a[@class='visitpage']").get_attribute("href")
    all_profiles.append(profile)
    print(company_name, person_name, phone_number, location, opening_hour, profile)
    if block == 29:
        two_page = driver.find_element(By.XPATH, "//a[@class='facetwp-page']")
        two_page.click()
        driver.implicitly_wait(10)
        blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")
for i in range(len(all_profiles)):
    driver.get(all_profiles[i])
    description = driver.find_element(By.XPATH, "//div[@class='desc-text-left']").text.strip()
    descriptions.append(description)
    website_link = driver.find_element(By.XPATH, "//a[@class='visitwebsite website']").get_attribute("href")
    websites_links.append(website_link)
    driver.implicitly_wait(10)
driver.close()
df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)
df.to_csv('saveface.csv',index=False)
#print(df)
This is the result:
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
To restrict the search to the subtree rooted at the context node, your expression should start with .//, so you have to replace // with .// in each of the calls of the form
... = blocks[block].find_element(...)
The meaning of // is to search the document from the document's root, ignoring the context node blocks[block] altogether.
Moreover, notice that not all of the blocks have a location. In that case
location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']")
will raise a NoSuchElementException. To avoid this, you have to wrap the call in a try...except block.
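For example, a minimal sketch of the location lookup with both fixes applied (the relative .// XPath plus the try...except guard), reusing the class names and the placeholder string from the question:

from selenium.common.exceptions import NoSuchElementException

try:
    # search only inside the current block, not from the document root
    location = blocks[block].find_element(By.XPATH, ".//li[@class='cls_loc']").text.strip()
except NoSuchElementException:
    location = "Not found on the site"
locations.append(location)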
UPDATE
Scraping 400 blocks with Selenium takes about 1 minute on my computer; with BeautifulSoup it takes less than 1 second! The slow part is scraping the profiles, because for each of them we have to download a new webpage, but even that is much faster with BeautifulSoup.
So I wrote a script without using Selenium, just BeautifulSoup (you can install it by running pip install beautifulsoup4 in the terminal):
import requests
from bs4 import BeautifulSoup
url = 'https://www.saveface.co.uk/search/'
soup = BeautifulSoup(requests.get(url).text, "html.parser")
css_selector = {
    'company name' : ".title",
    'person name' : ".name_wrapper",
    'phone number' : ".phone",
    'location' : ".cls_loc",
    'opening hours': ".opening-hours",
    'profile link' : ".visitpage",
}
data = {key:[] for key in list(css_selector)+['description','website link']}
number_of_pages = int(str(soup).split('total_pages":')[1].split('}')[0])
for page in range(2,number_of_pages+2):
    blocks = soup.select('.clientresult')
    for idx,block in enumerate(blocks):
        print(f'blocks {idx+1}/{len(blocks)}',end='\r')
        for key in list(css_selector):
            try:
                if 'link' in key:
                    data[key] += [ block.select_one(css_selector[key])['href'] ]
                else:
                    data[key] += [ block.select_one(css_selector[key]).text.strip().replace('\r\n',', ') ]
            except AttributeError:
                data[key] += ['*missing value*']
    if page <= number_of_pages:
        print('\nloading page', page)
        url_page = f'{url}?fwp_paged={page}'
        soup = BeautifulSoup(requests.get(url_page).text, "html.parser")
print('\nno more pages to load, moving to scrape profile links...')
for idx,url in enumerate(data['profile link']):
    print(f"profile link {idx+1}/{len(data['profile link'])} ",end='\r')
    soup_profile = BeautifulSoup(requests.get(url).text, "html.parser")
    try:
        data['description'] += [soup_profile.select_one('.clinicContent > .description').text.strip()]
    except AttributeError:
        data['description'] += ['*missing value*']
    try:
        data['website link'] += [soup_profile.select_one('.visitwebsite')['href']]
    except AttributeError:
        data['website link'] += ['*missing value*']
Output (it took about 8 minutes to complete the execution)
blocks 400/400
loading page 2
blocks 109/109
no more pages to load, moving to scrape profile links...
profile link 509/509
Then you can easily create the dataframe by running pd.DataFrame(data)
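For example (the pandas import is added here for completeness, and the CSV file name simply mirrors the question's script):

import pandas as pd

df = pd.DataFrame(data)
df.to_csv('saveface.csv', index=False)
print(df.head())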
This is the new code, but it returns the same output on every page. Why?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd
s = Service(r"C:\selenium driver\chromedriver.exe")  # raw string so the backslashes are not treated as escapes
driver = webdriver.Chrome(service=s)
companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []
driver.get("https://www.saveface.co.uk/search/")
driver.implicitly_wait(10)
pages = driver.find_elements(By.XPATH, ".//a[@class='facetwp-page']")
for page in range(len(pages)+1):
    blocks = driver.find_elements(By.XPATH, ".//div[@class='result clientresult']")
    for block in range(10):
        try:
            company_name = blocks[block].find_element(By.XPATH, ".//h3[@class='resulttitle']").text.strip()
            companies_names.append(company_name)
        except:
            companies_names.append("Not found on the site")
        try:
            person_name = blocks[block].find_element(By.XPATH, ".//p[@class='name_wrapper']").text.strip()
            persons_names.append(person_name)
        except:
            persons_names.append("Not found on the site")
        try:
            phone_number = blocks[block].find_element(By.XPATH, ".//div[@class='searchContact phone']").text.strip()
            phones_numbers.append(phone_number)
        except:
            phones_numbers.append("Not found on the site")
        try:
            location = blocks[block].find_element(By.XPATH, ".//li[@class='cls_loc']").text.strip()
            locations.append(location)
        except:
            locations.append("Not found on the site")
        try:
            opening_hour = blocks[block].find_element(By.XPATH, ".//li[@class='opening-hours']").text.strip()
            opening_hours.append(opening_hour)
        except:
            opening_hours.append("Not found on the site")
        try:
            profile = blocks[block].find_element(By.XPATH, ".//a[@class='visitpage']").get_attribute("href")
            all_profiles.append(profile)
        except:
            all_profiles.append("Not found on the site")
    two_page = driver.find_element(By.XPATH, ".//a[@class='facetwp-page']")
    two_page.click()
for i in range(len(all_profiles)):
    try:
        driver.get(all_profiles[i])
        driver.implicitly_wait(10)
        try:
            description = driver.find_element(By.XPATH, ".//div[@class='desc-text-left']").text.strip()
            descriptions.append(description)
        except:
            descriptions.append("Not found on the site")
        try:
            website_link = driver.find_element(By.XPATH, ".//a[@class='visitwebsite website']").get_attribute("href")
            websites_links.append(website_link)
        except:
            websites_links.append("Not found on the site")
    except:
        descriptions.append("Not found on the site")
        websites_links.append("Not found on the site")
    driver.implicitly_wait(10)
driver.close()
df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)
df.to_csv('saveface.csv',index=False)
print(df)
I am using BeautifulSoup in Python to scrape football statistics from this website: https://www.skysports.com/premier-league-results/2020-21. Yet the site only shows the first 200 games of the season; the remaining 180 games are behind a "show more" button. The button does not change the URL, so I can't just modify the URL.
This is my code:
from bs4 import BeautifulSoup
import requests
scores_html_text = requests.get('https://www.skysports.com/premier-league-results/2020-21').text
scores_soup = BeautifulSoup(scores_html_text, 'lxml')
fixtures = scores_soup.find_all('div', class_ = 'fixres__item')
This only gets the first 200 fixtures.
How would I access the html past the show more button?
The hidden results are inside a <script> tag, so to get all 380 results you need to parse it additionally:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.skysports.com/premier-league-results/2020-21"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
script = soup.select_one('[type="text/show-more"]')
script.replace_with(BeautifulSoup(script.contents[0], "html.parser"))
all_data = []
for item in soup.select(".fixres__item"):
    all_data.append(item.get_text(strip=True, separator="|").split("|")[:5])
    all_data[-1].append(
        item.find_previous(class_="fixres__header2").get_text(strip=True)
    )
df = pd.DataFrame(
    all_data, columns=["Team 1", "Score 1", "Score 2", "Time", "Team 2", "Date"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
Team 1 Score 1 Score 2 Time Team 2 Date
0 Arsenal 2 0 16:00 Brighton and Hove Albion Sunday 23rd May
1 Aston Villa 2 1 16:00 Chelsea Sunday 23rd May
2 Fulham 0 2 16:00 Newcastle United Sunday 23rd May
3 Leeds United 3 1 16:00 West Bromwich Albion Sunday 23rd May
...
377 Crystal Palace 1 0 15:00 Southampton Saturday 12th September
378 Liverpool 4 3 17:30 Leeds United Saturday 12th September
379 West Ham United 0 2 20:00 Newcastle United Saturday 12th September
and saves the same rows to data.csv.
I am not aware of how to do this with BeautifulSoup, but this is how I would do it using Selenium (note that I am very new to Selenium, so there are probably better ways of doing this).
The imports used are:
from selenium import webdriver
import time
You will also need to download the Chrome webdriver (assuming that you are on Chrome), and place it in the same directory as your script, or in your library path.
There will be a cookies popup which you have to work around:
# prepare the driver
URL = "https://www.skysports.com/premier-league-results/2020-21"
driver = webdriver.Chrome()
driver.get(URL)
# wait so that driver has loaded before we look for the cookies popup
time.sleep(2)
# accept cookies popup, which occurs in an iframe
# begin by locating the iframe and switching into it
frame = driver.find_element_by_id('sp_message_iframe_533903')
driver.switch_to.frame(frame)
# find the accept button (inspect element and copy the XPath of the button)
driver.find_element_by_xpath('//*[@id="notice"]/div[3]/button[1]').click()
# switch back to the main document before continuing
driver.switch_to.default_content()
time.sleep(2)
driver.refresh()
# find "show more text" button and click
driver.find_element_by_class_name("plus-more__text").click()
I tried going up a few levels and this worked; you might need to process it a bit more.
from bs4 import BeautifulSoup
import requests
scores_html_text = requests.get('https://www.skysports.com/premier-league-results/2020-21').text
scores_soup = BeautifulSoup(scores_html_text,'lxml')
fixtures = scores_soup.find(class_ = 'site-layout-secondary block page-nav__offset grid')
print(fixtures)
I want to extract date and summary of an article in a website, here is my code
from bs4 import BeautifulSoup
from selenium import webdriver
full_url = 'https://www.wsj.com/articles/readers-favorite-summer-recipes-11599238648?mod=searchresults&page=1&pos=20'
url0 = full_url
browser0 = webdriver.Chrome('C:/Users/liuzh/Downloads/chromedriver_win32/chromedriver')
browser0.get(url0)
html0 = browser0.page_source
page_soup = BeautifulSoup(html0, 'html5lib')
date = page_soup.find_all("time", class_="timestamp article__timestamp flexbox__flex--1")
sub_head = page_soup.find_all("h2", class_="sub-head")
print(date)
print(sub_head)
I got the following result; how can I obtain the standard form? (e.g. Sept. 4, 2020 12:57 pm ET; This Labor Day weekend, we’re...)
[<time class="timestamp article__timestamp flexbox__flex--1">
Sept. 4, 2020 12:57 pm ET
</time>]
[<h2 class="sub-head" itemprop="description">This Labor Day weekend, we’re savoring the last of summer with a collection of seasonal recipes shared by Wall Street Journal readers. Each one comes with a story about what this food means to a family and why they return to it each year.</h2>]
Thanks.
Try something like:
for d in date:
    print(d.text.strip())
Given your sample html, output should be:
Sept. 4, 2020 12:57 pm ET
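The summary can be pulled out the same way; a small sketch using the sub_head list the question already collects:

for h in sub_head:
    print(h.text.strip())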
I'm trying to pull data from a website and have been looking and trying to learn for weeks. This is what I'm trying:
import requests
from bs4 import BeautifulSoup as Soup

req = requests.get('http://www.rushmore.tv/schedule')
soup = Soup(req.text, "html.parser")
soup.find('home-section-wrap center', id="section-home")
print soup.find
but it's returning something to do with Steam, which is completely random considering that nothing I am doing is related to Steam.
<bound method BeautifulSoup.find of \n<td class="listtable_1" height="16">\n\n 76561198134729239\n \n</td>>
What I'm trying to do is scrape a div by its ID and print the contents. I'm extremely new to this. Cheers
Use this:
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.rushmore.tv/schedule')
soup = BeautifulSoup(r.text, "html.parser")
for row in soup.find('ul', id='myUL').findAll('li'):
    print(row.text)
Partial Output:
10:30 - 13:30 Olympics: Women's Curling, Canada vs China (CA Coverage) - Channel 21
10:30 - 11:30 Olympics: Freestyle, Men's Half Pipe (US Coverage) - Channel 34
11:30 - 14:45 Olympics: BBC Coverage - Channel 92
11:30 - 19:30 Olympics: BBC Red Button Coverage - Channel 103
11:30 - 13:30 Olympics: Women's Curling, Great Britain vs Japan - Channel 105
13:00 - 15:30 Olympics: Men's Ice Hockey: Slovenia vs Norway - Channel 11
13:30 - 15:30 Olympics: Men's Ice Hockey: Slovenia vs Norway (JIP) - Channel 21
13:30 - 21:30 Olympics: DE Coverage - Channel 88
14:45 - 18:30 Olympics: BBC Coverage - Channel 91
Try running the following code:
import urllib2
from bs4 import BeautifulSoup
quote_page='http://www.rushmore.tv/schedule'
def page_scrapper(quote_page):
    print(quote_page+' is being processed... ')
    page = urllib2.urlopen(quote_page) #Let's open the page...
    soup = BeautifulSoup(page,'html.parser') #And now we parse it with BSoup parser..
    box = soup.find('ul', attrs = {'id': 'myUL'}) #Save the contents of the 'ul' tag with id myUL (it contains the schedule)
    print(box) #and print it!

page_scrapper(quote_page)
This should do the trick.
EDIT - added some lines of code
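If you want the individual schedule lines rather than the raw HTML of the whole list, you could iterate over the items inside box, much as the first answer does, e.g. by replacing print(box) inside the function with:

for li in box.find_all('li'):
    print(li.text)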
I am trying to parse an ESPN webpage to get the date, time, and teams playing in each NFL game for a given week using BeautifulSoup. I am able to get most of the information; however, I am having trouble with the time information.
For some reason, the text inside the a tag is not being returned.
The html for one of the a tags is:
<a data-dateformat="time1" name="&lpos=nfl:schedule:time" href="/nfl/game?gameId=400874572">12:00 PM</a>
I am looking to get the "12:00 PM" in between the a tags, but instead I get:
<a data-dateformat="time1" href="/nfl/game?gameId=400874572" name="&lpos=nfl:schedule:time"></a>
which doesn't have any text in between the tags.
Here is what I have used to parse the webpage.
import urllib2
from bs4 import BeautifulSoup
def parse_nfl_schedule_espn():
    schedule = BeautifulSoup(urllib2.urlopen("http://www.espn.com/nfl/schedule/_/week/10").read(), "lxml")
    for date in schedule.find_all('h2'):
        #separate by game
        game_info = date.nextSibling.find_all('tr')
        date = str(date).split(">")
        date = date[1].split("<")
        date = date[0]
        #print date
        for i in range(len(game_info)):
            #separate each part of the game row
            value = game_info[i].find_all('td')
            #skip the <thead> row
            if len(value) > 1:
                #away team abv
                away = str(value[0].find('abbr')).split(">")
                away = away[1].split("<")
                away = away[0]
                #home team abv
                home = str(value[1].find('abbr')).split(">")
                home = home[1].split("<")
                home = home[0]
                time = value[2].find_all('a')
                print time
                #print "%s at %s" % (away, home)

if __name__ == "__main__":
    parse_nfl_schedule_espn()
Any help/suggestions would be much appreciated.
You will need to use something like Selenium to get the HTML. This would then allow the browser to run any Javascript. This can be done as follows:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
def parse_nfl_schedule_espn():
    browser = webdriver.Firefox(firefox_binary=FirefoxBinary())
    browser.get("http://www.espn.com/nfl/schedule/_/week/10")
    schedule = BeautifulSoup(browser.page_source, "lxml")
    for date in schedule.find_all('a', attrs={'data-dateformat' : "time1"}):
        print date.text

if __name__ == "__main__":
    parse_nfl_schedule_espn()
Which would display the following:
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
9:05 PM
9:25 PM
9:25 PM
1:30 AM
1:30 AM
You could also investigate "headless" solutions such as PhantomJS to avoid having to see a browser window being displayed.
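For instance, more recent Selenium versions can run Chrome without a visible window via its options; a rough sketch (not part of the original answer, and independent of PhantomJS):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run Chrome without opening a window
browser = webdriver.Chrome(options=options)
browser.get("http://www.espn.com/nfl/schedule/_/week/10")
print(browser.page_source[:200])  # the rendered HTML is still available as usual
browser.quit()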