from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time

url = 'http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId='
driver = webdriver.Chrome(r'C:\chromedriver.exe')
driver.get(url)
driver.maximize_window()

soup = bs(driver.page_source, 'html.parser')
table = soup.find('table', {'id': 'gvSearchDentistlist'})

next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'gvSearchDentistlist'})
    try:
        rows = table.find_all('tr')
        for row in rows:
            if len(row.find_all('td')) == 6:
                data = row.find_all('td')
                name = data[1].text.strip()
                print("NAME:" + name)
                root_url = data[5].input['onclick'].split(",")[4]
                link = 'http://dciindia.gov.in/' + root_url
                print("LINK:" + link)
    except:
        pass
    try:
        driver.find_element_by_xpath('//*[@id="gvSearchDentistlist"]/tbody/tr[52]/td/table/tbody/tr/td[1]').click()
        time.sleep(1)
    except:
        print('No more pages')
        next_page = False

driver.close()
I am not able to click through to the next page. I don't know the last page number, and there is no 'Next' button to click; the pages are just numbered 1, 2, 3, and so on.
I opened the site,
http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId=
and found that the page displays the total number of records.
With a little arithmetic, total records divided by rows per page, rounded up, you get the total page count.
You can use that count to decide whether a next page exists.
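As a rough sketch of that idea (the label id 'lblTotalRecords' and the 50-rows-per-page size below are assumptions, not read from the site; inspect the page for the real values):

import math

# hypothetical id for the "Total Records: N" label -- check the page source for the real one
total_text = driver.find_element_by_id('lblTotalRecords').text
total_records = int(''.join(ch for ch in total_text if ch.isdigit()))

page_size = 50  # assumed number of rows per page
total_pages = math.ceil(total_records / page_size)

for page_num in range(2, total_pages + 1):
    # the pager renders the page numbers as plain links: 1, 2, 3, ...
    driver.find_element_by_link_text(str(page_num)).click()
    time.sleep(1)
    # ... parse the table for the current page here, as in your loop body ...

Note that ASP.NET grid pagers often show only a window of page numbers plus a '...' link, so you may need to click '...' to advance the window before a given number becomes clickable.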
My code gets links/HTML from different "sections" of a page.
It prints two links per section, but I only want the first one printed.
The expected output should not contain the links ending with "video", which my code currently includes.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
jam = []
baseurl = 'https://meetinglibrary.asco.org'
driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('a', class_='ng-star-inserted')

for item in productlist:
    for link in item.find_all('a', href=True):
        jam.append(baseurl + link['href'])
print(jam)
You can check a condition before appending to the list:
...
for item in productlist:
    ahrefs = item.find_all('a', href=True)
    for index in range(len(ahrefs)):
        if (index % 2 == 0) and ('video' not in ahrefs[index]['href']):
            jam.append(baseurl + ahrefs[index]['href'])
print(jam)
...
Let me know how it goes after trying. Good luck!
Use os.path.basename to get the last segment of the URL, and use the in operator to check whether "video" appears in it:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os

driver = webdriver.Chrome()
jam = []
baseurl = 'https://meetinglibrary.asco.org'
driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('a', class_='ng-star-inserted')

for item in productlist:
    for link in item.find_all('a', href=True):
        url = link['href']
        if "video" not in os.path.basename(url):
            jam.append(baseurl + url)
print(jam)
result:
['https://meetinglibrary.asco.org/record/185955/abstract',
'https://meetinglibrary.asco.org/record/185955/slide',
'https://meetinglibrary.asco.org/record/185954/abstract',
'https://meetinglibrary.asco.org/record/186048/abstract',
'https://meetinglibrary.asco.org/record/186048/slide',
'https://meetinglibrary.asco.org/record/190197/slide',
'https://meetinglibrary.asco.org/record/192623/slide',
'https://meetinglibrary.asco.org/record/185414/abstract',
'https://meetinglibrary.asco.org/record/185414/slide',
'https://meetinglibrary.asco.org/record/185415/abstract',
'https://meetinglibrary.asco.org/record/185415/slide',
'https://meetinglibrary.asco.org/record/185473/abstract',
'https://meetinglibrary.asco.org/record/185473/slide',
'https://meetinglibrary.asco.org/record/187584/slide',
'https://meetinglibrary.asco.org/record/188561/slide',
'https://meetinglibrary.asco.org/record/186710/abstract',
'https://meetinglibrary.asco.org/record/186710/slide',
'https://meetinglibrary.asco.org/record/186699/abstract',
'https://meetinglibrary.asco.org/record/186699/slide',
'https://meetinglibrary.asco.org/record/186698/abstract',
'https://meetinglibrary.asco.org/record/186698/slide',
'https://meetinglibrary.asco.org/record/187720/slide',
'https://meetinglibrary.asco.org/record/187480/abstract',
'https://meetinglibrary.asco.org/record/187480/slide',
'https://meetinglibrary.asco.org/record/191961/slide',
'https://meetinglibrary.asco.org/record/192626/slide',
'https://meetinglibrary.asco.org/record/186983/abstract',
'https://meetinglibrary.asco.org/record/186983/slide',
'https://meetinglibrary.asco.org/record/188580/abstract',
'https://meetinglibrary.asco.org/record/188580/slide',
'https://meetinglibrary.asco.org/record/189047/abstract',
'https://meetinglibrary.asco.org/record/189047/slide',
'https://meetinglibrary.asco.org/record/190223/slide',
'https://meetinglibrary.asco.org/record/190273/slide',
'https://meetinglibrary.asco.org/record/184812/abstract',
'https://meetinglibrary.asco.org/record/184812/slide',
'https://meetinglibrary.asco.org/record/184927/slide',
'https://meetinglibrary.asco.org/record/184805/abstract',
'https://meetinglibrary.asco.org/record/184805/slide',
'https://meetinglibrary.asco.org/record/184811/abstract',
'https://meetinglibrary.asco.org/record/184811/slide',
'https://meetinglibrary.asco.org/record/185576/slide',
'https://meetinglibrary.asco.org/record/190147/slide']
I am scraping goodreads.com using Selenium and Beautiful Soup. I am able to get the results for the first page, but when I pass the URL for the second page, it loads the first page and returns only the first page's results. I tried different page numbers and every one of them loads the first page. What could be the reason, and how do I get around it?
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager

# First-page site URL: https://www.goodreads.com/shelf/show/business?page=1
driver = webdriver.Chrome(ChromeDriverManager().install())

# collectors for the scraped fields
title, author, desc, rating = [], [], [], []

# Reading the second page
driver.get("https://www.goodreads.com/shelf/show/non-fiction?page=2")
time.sleep(3)

summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'bookTitle')]")
job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]

for job_link in job_links:
    driver.get(job_link)
    # Closing the pop-up window
    try:
        close = driver.find_element_by_class_name('gr-iconButton')
        close.click()
    except:
        close = "None"
    try:
        # Taking book description
        more = driver.find_element_by_css_selector("#description > a:nth-child(3)").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        #for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
        #    print(item.text)
        sections = soup.findAll("span", id=re.compile("^freeText"))[:2]
        print("message ")
        i = 0
        for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
            i = i + 1
            if i == 2:
                desc.append(item.text)
    except:
        more = "None"
    try:  # Taking book title
        # time.sleep(2)
        job_title = driver.find_element_by_xpath("//h1[@class='gr-h1 gr-h1--serif']").text
        #job_title = driver.find_element_by_id('bookTitle').find_element_by_class_name('gr-h1 gr-h1--serif').text
        title.append(job_title)
        #print(title)
    except:
        job_title = "None"
    # Taking Author name
    try:
        # time.sleep(2)
        authors = driver.find_element_by_xpath("//a[@class='authorName']").text
        author.append(authors)
        #print(author)
    except:
        authors = "None"
    # Taking Ratings
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    rate = soup.find("span", itemprop="ratingValue").text.strip()
    rates = rate.replace('\n', '')
    rating.append(rates)

driver.close()
Output:
I am able to scrape book title, author name, book description, and rating for the first page only.
You need to log in first to scrape data on the other pages.
Try adding the following code to your script:
driver = webdriver.Chrome(ChromeDriverManager().install())

# Add the code below right after webdriver.Chrome()
driver.get("https://www.goodreads.com/user/sign_in")
time.sleep(5)
driver.find_element_by_css_selector("#user_email").send_keys("your email")
driver.find_element_by_css_selector("#user_password").send_keys("your password")
driver.find_element_by_xpath("//input[@type='submit' and @value='Sign in']").click()
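After the sign-in completes, re-requesting the paged shelf URL from your question should return the second page instead of falling back to the first:

# with an authenticated session, page 2 should now load as expected
driver.get("https://www.goodreads.com/shelf/show/non-fiction?page=2")
time.sleep(3)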
My code is stopping short before finishing all the tasks.
It should be:
1 - Getting a link from the search results of fitness classes to go to the individual studio page.
2 - Then, from the individual studio page (first for loop):
A) Grab the studio name and write it to CSV.
B) Grab a link to a fitness class from the class schedule.
3 - Open the class page link and grab the class name (second for loop).
It completes step 2 and, instead of continuing to step 3, it goes back to the initial search results page and repeats step 1 for the next studio in order.
What am I doing wrong? Thanks in advance!
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as browser_wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
import csv

# initialize the chrome browser
browser = webdriver.Chrome(executable_path=r'./chromedriver')

# URL
class_pass_url = 'https://www.classpass.com'

# Create file and write the first row; added encoding type as write was giving errors
f = open('ClassPass.csv', 'w', encoding='utf-8')
headers = 'Studio, Name, Description, Image, Address, Phone, Email, Website\n'
f.write(headers)

# classpass results page
page = "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9"
browser.get(page)

# Browser waits
browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

# Scrolls to bottom of page to reveal all classes
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Extract page source and parse
page_source = browser.page_source
page_soup = soup(page_source, "html.parser")

# Parse class listings: looks through results and gets link to class page
sessions = page_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

for session in sessions:
    # gets link to class page
    session_link = class_pass_url + session.a['href']
    browser.get(session_link)
    browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

    # parses class page
    session_page_source = browser.page_source
    session_soup = soup(session_page_source, "html.parser")

    # get studio name
    try:
        studio = session_soup.find('h2', {'class': 'gamma'}).text
    except (AttributeError, TypeError,) as e:
        pass

    # write studio name
    f.write(studio.replace(',', '|') + "\n")
    print('got studio name')

    # gets link to individual class in the class schedule table
    classses = page_soup.findAll('section', {'class': '_33uV0qMCu2Sfk4M3oTJjVv'})
    for classs in classses:
        classs_link = class_pass_url + classs.a['href']
        browser.get(classs_link)
        browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

        # parses individual class page
        classses_page_source = browser.page_source
        classses_soup = soup(classses_page_source, "html.parser")
        try:
            classs_name = session_soup.find('span', {'data-component': 'LocalizableMessage'}).text
        except (AttributeError, TypeError,) as e:
            pass

        # gets class names
        f.write(classs_name.replace(',', '|') + "\n")
        print('got class name')
I'm not quite sure about your goal, since your question and code aren't explained very well.
But from my point of view, I think this is what you're after.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


def Main():
    r = requests.get(
        "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9")
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = []
    for item in soup.findAll("a", {'class': '_3Rgmjog5fetGEXICK2gVhh'}):
        item = item.get("href")
        urls.append(f"https://classpass.com{item}")
    return urls


options = Options()
options.add_argument('--headless')


def Second():
    urls = Main()
    studios = []
    links = []
    driver = webdriver.Firefox(options=options)
    for url in urls:
        print(f"Extracting: {url}")
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        studio = soup.find('h2', {'class': 'gamma'}).text
        studios.append(studio)
        for item in soup.findAll("a", {'href': True}):
            item = item.get("href")
            if item.startswith("/classes/"):
                print(item)
                links.append(f"https://www.classpass.com{item}")
    driver.quit()
    return links


def Third():
    links = Second()
    driver = webdriver.Firefox(options=options)
    for link in links:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            name = soup.find(
                'span', {'data-component': 'LocalizableMessage'}).text
            print(name)
        except:
            pass
    driver.quit()


Third()
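One design note on why this works where your version stalls: the rewrite collects the class-schedule links from each studio page it has just loaded, whereas your inner loop searches page_soup (the search-results page) for the schedule sections and then reads the class name from session_soup (the studio page). If you would rather keep your original structure, the minimal change is along these lines (a sketch reusing the names from your code):

# inside the `for session in sessions:` loop, build schedule links from the
# studio page you just parsed (session_soup), not the search-results page
classses = session_soup.findAll('section', {'class': '_33uV0qMCu2Sfk4M3oTJjVv'})
for classs in classses:
    classs_link = class_pass_url + classs.a['href']
    browser.get(classs_link)
    classses_soup = soup(browser.page_source, "html.parser")
    # and read the class name from the class page, not the studio page
    classs_name = classses_soup.find('span', {'data-component': 'LocalizableMessage'}).text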
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time

url = 'https://www.mciindia.org/CMS/information-desk/indian-medical-register'
driver = webdriver.Chrome(r'C:\chromedriver.exe')
driver.get(url)

driver.find_element_by_xpath("//div[@class='col-sm-4']//button[@class='multiselect dropdown-toggle btn btn-default']").click()
driver.find_elements_by_xpath("//label[contains(text(),'2015')]")
driver.find_element_by_xpath("//button[@id='doctor_advance_Details']").click()

soup = bs(driver.page_source, 'html.parser')
table = soup.find('table', {'id': 'doct_info5'})
headers = [header.text.strip() for header in table.find_all('th')]

next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'doct_info5'})
    try:
        rows = table.find_all('tr')
        for row in rows:
            if len(row.find_all('td')) == 7:
                data = row.find_all('td')
                name = data[4].text.strip()
                root_url = data[6].a['href'].split("'")[1]
                id_url = data[6].a['href'].split("'")[3]
                link = root_url + 'ViewDetails.aspx?ID=' + id_url
                print('Link: %s' % (link))
    except:
        pass
    time.sleep(5)
    try:
        driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
    except:
        print('No more pages')
        next_page = False

driver.close()
The code written above is not clicking on the year using XPath. I tried clicking the specific years to fetch the data. Is there any other way to fetch the data from the above link, other than Selenium WebDriver? Can we extract it using just BeautifulSoup and requests?
I also want to extract the links under 'View' from the 'Next' page tab, i.e., across n number of pages.
from bs4 import BeautifulSoup
import requests

r = requests.get('https://old.mciindia.org/InformationDesk/IndianMedicalRegister.aspx')
soup = BeautifulSoup(r.text, 'lxml')

for links in soup.find_all('tr', class_='row'):
    for link in links.find_all('a', id='lnkDesc'):
        print(link['href'])
This should get you started:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time

url = 'https://old.mciindia.org/InformationDesk/IndianMedicalRegister.aspx'
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
driver.get(url)

driver.find_element_by_xpath("//a[contains(text(),'Year of Registration')]").click()
driver.find_elements_by_css_selector("input[type='text']")[-1].send_keys("2015")
driver.find_element_by_css_selector("input[value='Submit']").click()

soup = bs(driver.page_source, 'html.parser')
table = soup.find('table', {'id': 'dnn_ctr588_IMRIndex_GV_Search'})
headers = [header.text.strip() for header in table.find_all('th')]

next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'dnn_ctr588_IMRIndex_GV_Search'})
    rows = table.find_all('tr')
    for row in rows:
        if len(row.find_all('td')) == 7:
            data = row.find_all('td')
            name = data[4].text.strip()
            root_url = data[6].a['href'].split("'")[1]
            id_url = data[6].a['href'].split("'")[3]
            link = root_url + 'ViewDetails.aspx?ID=' + id_url
            print('Name: %-50s\t Link: %s' % (name, link))
    time.sleep(5)
    try:
        driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
    except:
        print('No more pages')
        next_page = False

driver.close()
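On the "can we just use BeautifulSoup and requests" part of the question: the old MCI site is an ASP.NET WebForms page, so the 'Next' link fires a __doPostBack rather than a plain GET, and a requests-only approach has to replay that form post. Here is a hedged sketch; the hidden-field mechanism is standard WebForms, but the exact __EVENTTARGET value for this pager is an assumption you would confirm from the page source:

import requests
from bs4 import BeautifulSoup

url = 'https://old.mciindia.org/InformationDesk/IndianMedicalRegister.aspx'
session = requests.Session()
soup = BeautifulSoup(session.get(url).text, 'html.parser')

def hidden_fields(soup):
    # collect __VIEWSTATE, __EVENTVALIDATION, etc. that WebForms expects back
    return {inp.get('name'): inp.get('value', '')
            for inp in soup.find_all('input', type='hidden')
            if inp.get('name')}

payload = hidden_fields(soup)
payload['__EVENTTARGET'] = 'dnn$ctr588$IMRIndex$GV_Search'  # assumed pager target
payload['__EVENTARGUMENT'] = 'Page$2'                       # GridView paging argument
r = session.post(url, data=payload)
next_soup = BeautifulSoup(r.text, 'html.parser')  # parse this page's rows as before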