The code is not scraping the text when using Beautiful Soup findAll, as it returns an empty result. There are other issues with the code after this, but at this stage I am trying to solve the first problem. I am pretty new to this, so I understand the code structure may be less than ideal; I come from a VBA background.
import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary  # Adds chromedriver binary to path

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\Users\mmanenica\Documents\chromedriver.exe")

#click the search button on Austenders to return all Awarded Contracts
import time

#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')

#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
    print("Search Button not found")
    exit;
print('Search Button found')
Search_Results.click()

#Pause code to prevent blocking by website
time.sleep(1)

i = 0
Awarded = []
#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        exit;
    i = i + 1
    time.sleep(2)

    #Loop through all the Detail links on the current Search Results Page
    print("Checking search results page " + str(i))
    print(driver.current_url)
    soup = BeautifulSoup(driver.current_url, features='lxml')

    #Find all Contract detail links in the current search results page
    Details = soup.findAll('div', {'class': 'list-desc-inner'})
    for each_Contract in Details:
        #Loop through each Contract details link and scrape all the detailed
        #Contract information page
        Details_Page = each_Contract.find('a', {'class': 'detail'}).get('href')
        driver.get(Details_Page)

        #Scrape all the data in the Awarded Contract page
        #r = requests.get(driver.current_url)
        soup = BeautifulSoup(driver.current_url, features='lxml')

        #find a list of all the Contract Info (contained in the 'Contact-Heading'
        #class of the span element)
        Contract = soup.find_all('span', {'class': 'Contact-Heading'})
        Contract_Info = [span.get_text() for span in Contract]

        #find a list of all the Summary Contract info which is in the text of
        #the 'list_desc_inner' class
        Sub = soup.find_all('div', {'class': 'list_desc_inner'})
        Sub_Info = [div.get_text() for div in Sub]

        #Combine the lists into a unified list and append to the Awarded table
        Combined = [Contract_Info, Sub_Info]
        Awarded.append[Combined]

        #Go back to the Search Results page (from the Detailed Contract page)
        driver.back()

    #Go to the next Search Page by clicking on the Next button at the bottom of the page
    Next_Page.click()
    time.sleep(3)

print(Awarded.Shape)
As stated, you are not actually feeding the HTML source into BeautifulSoup. So the first thing to change is soup = BeautifulSoup(driver.current_url, features='lxml') to soup = BeautifulSoup(driver.page_source, features='lxml').
Second issue: some of the elements have no <a> tag with class=detail, so you can't get an href from a NoneType. I added a try/except to skip over those cases (not sure if that gives your desired results though). You could also just drop that class and say Details_Page = each_Contract.find('a').get('href').
Next, that href is only the path part of the URL, so you need to prepend the root: driver.get('https://www.tenders.gov.au' + Details_Page).
I also do not see anything on the page with class=Contact-Heading.
You also refer to 'class': 'list-desc-inner' at one point, then 'class': 'list_desc_inner' at another. Again, I don't see a class=list_desc_inner.
Next, to append a list to a list you want Awarded.append(Combined), not Awarded.append[Combined].
I also added .strip() to clean up some of the whitespace in the text.
Anyway, there's a lot you still need to fix and clean up, and I don't know what your expected output should be, but hopefully this gets you started.
Also, as stated in the comments, you could just click the download button and get the results straight away, but maybe you're doing it the hard way for practice...
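If you do want to try the download route, something like this might work as a minimal sketch only; the 'Download' link text is an assumption I have not verified against the site, so swap in whatever the real export control is called:

from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.get('https://www.tenders.gov.au/cn/search')
driver.find_element_by_name("SearchButton").click()
time.sleep(1)
try:
    # Assumed link text -- check the actual export/download control on the results page
    driver.find_element_by_partial_link_text('Download').click()
    time.sleep(5)  # give the export time to come down
except Exception:
    print("Download control not found - fall back to scraping page by page")

In any case, here is the cleaned-up version of your page-by-page approach: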
import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary  # Adds chromedriver binary to path

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")

#click the search button on Austenders to return all Awarded Contracts
import time

#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')

#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
    print("Search Button not found")
    exit;
print('Search Button found')
Search_Results.click()

#Pause code to prevent blocking by website
time.sleep(1)

i = 0
Awarded = []
#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        exit;
    i = i + 1
    time.sleep(2)

    #Loop through all the Detail links on the current Search Results Page
    print("Checking search results page " + str(i))
    print(driver.current_url)
    soup = BeautifulSoup(driver.page_source, features='lxml')

    #Find all Contract detail links in the current search results page
    Details = soup.findAll('div', {'class': 'list-desc-inner'})
    for each_Contract in Details:
        #Loop through each Contract details link and scrape all the detailed
        #Contract information page
        try:
            Details_Page = each_Contract.find('a', {'class': 'detail'}).get('href')
            driver.get('https://www.tenders.gov.au' + Details_Page)

            #Scrape all the data in the Awarded Contract page
            #r = requests.get(driver.current_url)
            soup = BeautifulSoup(driver.page_source, features='lxml')

            #find a list of all the Contract Info (contained in the 'Contact-Heading'
            #class of the span element)
            Contract = soup.find_all('span', {'class': 'Contact-Heading'})
            Contract_Info = [span.text.strip() for span in Contract]

            #find a list of all the Summary Contract info which is in the text of
            #the 'list-desc-inner' class
            Sub = soup.find_all('div', {'class': 'list-desc-inner'})
            Sub_Info = [div.text.strip() for div in Sub]

            #Combine the lists into a unified list and append to the Awarded table
            Combined = [Contract_Info, Sub_Info]
            Awarded.append(Combined)

            #Go back to the Search Results page (from the Detailed Contract page)
            driver.back()
        except:
            continue

    #Go to the next Search Page by clicking on the Next button at the bottom of the page
    Next_Page.click()
    time.sleep(3)

driver.close()
print(Awarded.Shape)
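One more thing: Awarded is a plain Python list, so the final print(Awarded.Shape) will fail (lists have no Shape attribute). Since you already import pandas, a quick way to inspect and save what you collected is a DataFrame; this is just a sketch that picks up the Awarded list from the code above, assumes each entry stays as [Contract_Info, Sub_Info], and uses an example output filename:

import pandas as pd

df = pd.DataFrame(Awarded, columns=['Contract_Info', 'Sub_Info'])
print(df.shape)   # lowercase .shape, and only on a DataFrame/array, not a list
df.to_csv('awarded_contracts.csv', index=False)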
I have the below code for opening the "New" link of a page which has the data I want to scrape (as in the screenshot). It's working OK and actually clicking the link, but the soup I get is still for the content under "Popular" (as in the screenshot).
What am I doing wrong?
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.homeworkmarket.com/fields/business-finance")
time.sleep(2)
doc = driver.find_elements_by_xpath('//*[@id="wrapper"]/div[2]/div[1]/div[1]/div[3]/div[1]/ul/li[1]/a')[0]
doc.click()
time.sleep(10)
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
The rest of the code for scraping href links:
question_links = soup.find_all(class_='css-e5w42e')
final_links = []
for link in question_links:
    if 'href' in link.attrs:
        link = 'https://www.homeworkmarket.com' + str(link.attrs['href'])
        print(link)
        final_links.append(link)
You do not need to click on New, because the elements are already present in the HTML DOM:
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.implicitly_wait(30)
driver.get("https://www.homeworkmarket.com/fields/business-finance")
for link in driver.find_elements(By.XPATH, "(*//a[text()='New']/ancestor::div[contains(@class,'css')])[3]/following-sibling::div/section/descendant::a[contains(@class,'css')]"):
    print(link.get_attribute('href'))
The initial 80 links are from the Popular tab and the rest should be from the New tab.
This doesn't open a new page, it just expands an area on the existing page.
The page source contains this data even before it is expanded visually, which is why clicking that button still gives you the same page data when you do:
page = driver.page_source
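So you can feed that page source straight into the BeautifulSoup code you already have, without clicking anything. A minimal sketch reusing the css-e5w42e class from your snippet (I have not re-checked that class against the live page, and the fixed sleep is just a crude wait):

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.homeworkmarket.com/fields/business-finance")
time.sleep(10)  # crude wait for the page to render; an explicit wait would be better

soup = BeautifulSoup(driver.page_source, 'html.parser')
final_links = []
for link in soup.find_all(class_='css-e5w42e'):
    if 'href' in link.attrs:
        final_links.append('https://www.homeworkmarket.com' + link.attrs['href'])
print(final_links)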
I'm trying to create a script in Python to scrape the title and link of different posts from a webpage when they meet a certain condition. I want the script to print the rest of the results that appear after a certain text, Alternative to Chromedriver in this case. However, my current (faulty) attempt only prints the text Alternative to Chromedriver itself.
import requests
from bs4 import BeautifulSoup
URL = "https://stackoverflow.com/questions/tagged/web-scraping?tab=Newest"
check_title = "Alternative to Chromedriver"
res = requests.get(URL)
soup = BeautifulSoup(res.text,'html.parser')
for item in soup.select(".summary .question-hyperlink"):
    if check_title!=item.get_text(strip=True):continue
    title = item.get_text(strip=True)
    link = item.get("href")
    print(title,link)
How can I let the script parse the rest of the results that appear after a certain text?
Try:
import requests
from bs4 import BeautifulSoup
URL = "https://stackoverflow.com/questions/tagged/web-scraping?tab=Newest"
check_title = "Alternative to Chromedriver"
res = requests.get(URL)
soup = BeautifulSoup(res.text,'html.parser')
# Initialise a flag to track where to start printing from
start_printing = False
for item in soup.select(".summary .question-hyperlink"):
    title = item.get_text(strip=True)
    # Keep iterating until the required text is found. Initialise it only once
    if not start_printing and check_title == title:
        start_printing = True
        continue
    if start_printing:
        link = item.get("href")
        print(title,link)
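If you prefer, the same "skip until the marker" idea can be written with itertools.dropwhile. This is just an alternative sketch, not what the answer above uses:

import requests
from bs4 import BeautifulSoup
from itertools import dropwhile

URL = "https://stackoverflow.com/questions/tagged/web-scraping?tab=Newest"
check_title = "Alternative to Chromedriver"
soup = BeautifulSoup(requests.get(URL).text, 'html.parser')
items = soup.select(".summary .question-hyperlink")
# Drop items until the marker title is reached, then slice off the marker itself
for item in list(dropwhile(lambda i: i.get_text(strip=True) != check_title, items))[1:]:
    print(item.get_text(strip=True), item.get("href"))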
Because you have the if statement "if the title is not Alternative to Chromedriver then continue with the next title", it only prints when the title equals 'Alternative to Chromedriver'. To print all titles, change the code like this:
for item in soup.select(".summary .question-hyperlink"):
    title = item.get_text(strip=True)
    link = item.get("href")
    if check_title == title:
        print(f'Found it: {title}, {link}')
    else:
        print(title,link)
#Web scrapting of image with Python /questions/61035199/web-scrapting-of-image-with-python
#Imported functions not working in puppeteer /questions/61035043/imported-functions-not-working-in-puppeteer
#Trying to build a webscraper following a tutorial and keep getting attribute error for findall /questions/61034690/trying-to-build-a-webscraper-following-a-tutorial-and-keep-getting-attribute-err
#Python Selenium Web Scraping Hidden Div /questions/61034439/python-selenium-web-scraping-hidden-div
#Found it: Alternative to Chromedriver, /questions/61034224/alternative-to-chromedriver
Look at the output above: the last one has a title equal to 'Alternative to Chromedriver', so it prints out 'Found it'; the others just print title, link.
Taking the example of this site:
https://www.imglobal.com
The contact number is present in the body of the website.
Certain sites have it at the top, along with the menu tabs, and certain sites in the bottom footer.
I have developed a routine to locate the element and its position using
element.location
element.location_once_scrolled_into_view
and to scroll the element into view with
browser.execute_script("arguments[0].scrollIntoView()", element)
Is there a way to directly determine whether the element is present in the header/body/footer of the webpage, with the help of the tags, using Selenium or bs4 in Python?
EDIT
Header example:
https://www.moeck.com/
Footer example:
https://www.andrew-kelly.co.uk/
Could you not just use element.parent and loop upward until you find one of your targeted tags?
Like something like this:
from bs4 import BeautifulSoup as soup
html = """<html><header><div><span class="phone">123456789</span></div><body><div></div><footer><div></div></footer>"""
location = ['header','body','footer']
page = soup(html, 'html.parser')
element = page.find('span',{'class':'phone'})
while (element.parent):
    if element.parent.name in location:
        print("Phone is in " + element.parent.name)
        break
    else:
        element = element.parent
EDIT:
To check class name too:
from bs4 import BeautifulSoup

html = """<html><header class='test-class'><div><span class="phone">123456789</span></div><body><div></div><footer><div></div></footer>"""
location = ['header','body','footer']
soup = BeautifulSoup(html, 'html.parser')
element = soup.find('span',{'class':'phone'})
while (element.parent):
    # guard against parents with no class attribute (get('class') returns None)
    if element.parent.name in location and 'test-class' in (element.parent.get('class') or []):
        print("Phone is in " + element.parent.name)
        break
    else:
        element = element.parent
I am trying to scrape data from this webpage
http://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;team=5;template=results;type=batting
I need to copy the contents of the table and put them in a CSV file, then go to the next page and append the contents of those pages to the same file. I am able to scrape the table; however, when I try to loop over clicking the next button using Selenium webdriver's click, it goes to the next page and stops. This is my code.
driver = webdriver.Chrome(executable_path = 'path')
url = 'http://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;team=5;template=results;type=batting'
def data_from_cricinfo(url):
    driver.get(url)
    pgsource = str(driver.page_source)
    soup = BeautifulSoup(pgsource, 'html5lib')
    data = soup.find_all('div', class_ = 'engineTable')
    for tr in data:
        info = tr.find_all('tr')
        # grab data
    next_link = driver.find_element_by_class_name('PaginationLink')
    next_link.click()
data_from_cricinfo(url)
Is there any way to click Next for all pages using a loop and copy the contents of all pages into the same file? Thanks in advance.
You can do something like the below to traverse all the pages (through the Next button) and parse the data from the table:
from selenium import webdriver
from bs4 import BeautifulSoup
URL = 'http://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;team=5;template=results;type=batting'
driver = webdriver.Chrome()
driver.get(URL)
while True:
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    table = soup.find_all(class_='engineTable')[2]
    for info in table.find_all('tr'):
        data = [item.text for item in info.find_all("td")]
        print(data)
    try:
        driver.find_element_by_partial_link_text('Next').click()
    except:
        break
driver.quit()
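Since the goal is to append every page's rows to one CSV file, the same loop can write as it goes. Here is a sketch using the csv module (the output filename is just an example):

import csv
from selenium import webdriver
from bs4 import BeautifulSoup

URL = 'http://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;team=5;template=results;type=batting'
driver = webdriver.Chrome()
driver.get(URL)
with open('batting_stats.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    while True:
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        table = soup.find_all(class_='engineTable')[2]
        for info in table.find_all('tr'):
            data = [item.text.strip() for item in info.find_all('td')]
            if data:  # skip header/empty rows, which have no <td> cells
                writer.writerow(data)
        try:
            driver.find_element_by_partial_link_text('Next').click()
        except:
            break
driver.quit()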
I am trying to write a Python script that lists all the links in a webpage that contain some substring. The problem I am running into is that the webpage has multiple "pages" so that it doesn't clutter the screen. Take a look at https://www.go-hero.net/jam/17/solutions/1/1/C++ for an example.
This is what I have so far:
import requests
from bs4 import BeautifulSoup
url = "https://www.go-hero.net/jam/17/solutions/1/1/C++"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html5lib")
links = soup.find_all('a')
for tag in links:
    link = tag.get('href', None)
    if link is not None and 'GetSource' in link:
        print(link)
Any suggestions on how I might get this to work? Thanks in advance.
Edit/Update: Using Selenium, you could click the page links before scraping, so all the content accumulates in the page source. Many websites with pagination don't keep all the text in the HTML as you click through the pages, but I noticed that the example you provided does. Take a look at this SO question for a quick example of making Selenium work with BeautifulSoup. Here is how you could use it in your code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
original_url = "https://www.go-hero.net/jam/17/solutions/1/1/C++"
driver.get(original_url)
# click the links for pages 1-29
for i in range(1, 30):
    path_string = '/jam/17/solutions/1/1/C++#page-' + str(i)
    driver.find_element_by_xpath('//a[@href="' + path_string + '"]').click()

# scrape from the accumulated html
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")
links = soup.find_all('a')

# proceed as normal from here
for tag in links:
    link = tag.get('href', None)
    if link is not None and 'GetSource' in link:
        print(link)
Original Answer: For the link you provided above, you could simply loop through possible urls and run your scraping code in the loop:
import requests
from bs4 import BeautifulSoup
original_url = "https://www.go-hero.net/jam/17/solutions/1/1/C++"
# scrape from the original page (has no page number)
response = requests.get(original_url)
soup = BeautifulSoup(response.content, "html5lib")
links = soup.find_all('a')
# prepare to scrape from the pages numbered 1-29
# (note that the original page is not numbered, and the next page is "#page-1")
url_suffix = '#page-'
for i in range(1, 30):
    # add page number to the url
    paginated_url = original_url + url_suffix + str(i)
    response = requests.get(paginated_url)
    soup = BeautifulSoup(response.content, "html5lib")
    # append resulting list to 'links' list
    links += soup.find_all('a')
# proceed as normal from here
for tag in links:
    link = tag.get('href', None)
    if link is not None and 'GetSource' in link:
        print(link)
I don't know if you mind that you'll get duplicates in your results. You will get duplicate results in your link list as the code currently stands, but you could add the links to a Set or something instead to easily remedy that.
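For example, picking up the links list from the code above, collecting the matching hrefs into a set removes the duplicates:

# Collect into a set instead of printing immediately, so each link appears once
unique_links = set()
for tag in links:
    link = tag.get('href', None)
    if link is not None and 'GetSource' in link:
        unique_links.add(link)
for link in sorted(unique_links):
    print(link)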