I want to extract the links in 'View' from the 'Next' page tab as well, i.e. from n number of pages.
from bs4 import BeautifulSoup
import requests
r = requests.get('https://old.mciindia.org/InformationDesk/IndianMedicalRegister.aspx')
soup = BeautifulSoup(r.text,'lxml')
for links in soup.find('tr',class_='row'):
    for link in links.find('a',id_='lnkDesc'):
        print link['href']
This should get you started:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
url = 'https://old.mciindia.org/InformationDesk/IndianMedicalRegister.aspx'
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
driver.get(url)
driver.find_element_by_xpath("//a[contains(text(),'Year of Registration')]").click()
driver.find_elements_by_css_selector("input[type='text']")[-1].send_keys("2015")
driver.find_element_by_css_selector("input[value='Submit']").click()
soup = bs(driver.page_source, 'html.parser')
table = soup.find('table',{'id':'dnn_ctr588_IMRIndex_GV_Search'})
headers = [ header.text.strip() for header in table.find_all('th') ]
next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table',{'id':'dnn_ctr588_IMRIndex_GV_Search'})
    rows = table.find_all('tr')
    for row in rows:
        if len(row.find_all('td')) == 7:
            data = row.find_all('td')
            name = data[4].text.strip()
            # the 'View' cell holds a javascript-style href; its quoted arguments
            # contain the base URL and the record id, so split on single quotes
            root_url = data[6].a['href'].split("'")[1]
            id_url = data[6].a['href'].split("'")[3]
            link = root_url + 'ViewDetails.aspx?ID=' + id_url
            print('Name: %-50s\t Link: %s' % (name, link))
    time.sleep(5)
    try:
        driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
    except:
        print('No more pages')
        next_page = False
driver.close()
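As an aside, the fixed time.sleep(5) only gives the postback time to finish. If you prefer, you could wait explicitly for the old grid to go stale after clicking Next. A minimal sketch, assuming the Next link triggers a full page refresh (WebDriverWait and expected_conditions are standard Selenium helpers):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# remember the current grid, click Next, then wait until that element goes stale,
# i.e. the page has been replaced by the next page of results
old_table = driver.find_element_by_id('dnn_ctr588_IMRIndex_GV_Search')
driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
WebDriverWait(driver, 10).until(EC.staleness_of(old_table))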
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
url = 'https://www.mciindia.org/CMS/information-desk/indian-medical-register'
driver = webdriver.Chrome('C:\chromedriver.exe')
driver.get(url)
driver.find_element_by_xpath("//div[@class='col-sm-4']//button[@class='multiselect dropdown-toggle btn btn-default']").click()
driver.find_elements_by_xpath("//label[contains(text(),'2015')]")
driver.find_element_by_xpath("//button[@id='doctor_advance_Details']").click()
soup = bs(driver.page_source, 'html.parser')
table = soup.find('table',{'id':'doct_info5'})
headers = [ header.text.strip() for header in table.find_all('th') ]
next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table',{'id':'doct_info5'})
    try:
        rows = table.find_all('tr')
        for row in rows:
            if len(row.find_all('td')) == 7:
                data = row.find_all('td')
                name = data[4].text.strip()
                root_url = data[6].a['href'].split("'")[1]
                id_url = data[6].a['href'].split("'")[3]
                link = root_url + 'ViewDetails.aspx?ID=' + id_url
                print('Link: %s' % (link))
    except:
        pass
    time.sleep(5)
    try:
        driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
    except:
        print('No more pages')
        next_page = False
driver.close()
The code written above is not clicking on the year via the XPath; I tried clicking the specific years to fetch the data. Is there any other way to fetch the data from the above link? Is there an alternative to Selenium WebDriver, i.e. can we extract it using just BeautifulSoup and requests?
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
url = 'http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId='
driver = webdriver.Chrome('C:\chromedriver.exe')
driver.get(url)
driver.maximize_window()
soup = bs(driver.page_source, 'html.parser')
table = soup.find('table',{'id':'gvSearchDentistlist'})
next_page = True
while next_page == True:
    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table',{'id':'gvSearchDentistlist'})
    try:
        rows = table.find_all('tr')
        for row in rows:
            if len(row.find_all('td')) == 6:
                data = row.find_all('td')
                name = data[1].text.strip()
                print("NAME:" + name)
                root_url = data[5].input['onclick'].split(",")[4]
                link = 'http://dciindia.gov.in/' + root_url
                print("LINK:" + link)
    except:
        pass
    try:
        driver.find_element_by_xpath('//*[@id="gvSearchDentistlist"]/tbody/tr[52]/td/table/tbody/tr/td[1]').click()
        time.sleep(1)
    except:
        print('No more pages')
        next_page = False
driver.close()
I am not able to click on the next page. I don't know the last page number, and there is no 'Next' button to click; the pages are given as 1, 2, 3, and so on.
I opened the site,
http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId=
and found that the page shows the total number of records. You just need a little arithmetic to get the total page count, and you can then use that to decide whether there is a next page or not.
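In other words, if the page reports the total number of records and you know how many rows are shown per page, the page count is a ceiling division. A minimal sketch, where total_records and records_per_page are hypothetical figures you would read off the page:

import math

total_records = 1234      # hypothetical: read this number from the results page
records_per_page = 50     # hypothetical: count the rows shown on one page

total_pages = math.ceil(total_records / records_per_page)
for page_number in range(2, total_pages + 1):
    # page 1 is already loaded; the pager links are plain page numbers, so
    # something like driver.find_element_by_link_text(str(page_number)).click()
    # could be used here before re-parsing driver.page_source
    print('would visit page', page_number)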
I can't seem to get this to work. My script goes to a site and scrapes the data into my info variable, but when I try to pull the href out of a specific class I get None, or it just doesn't work no matter what combinations I try. Where am I going wrong? When I scrape into my info variable, there is a class='business-name' element with the href inside it.
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
info = soup.findAll('div', {'class': 'info'})
for each_business in info:
    # This is the spot that is broken. I can't make it work!
    yp_bus_url = each_business.get('class_','business-name')['href']
    print(yp_bus_url)
You can also do this:
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
With the change here (be sure to assign the list to whatever you want):
#info = soup.findAll('div', {'class': 'info'})
info = soup.select("[class~=business-name]")
[i.get('href') for i in info]
Returns:
['/new-orleans-la/mip/upperline-restaurant-526381149?lid=1001797484770',
'/new-orleans-la/mip/brunos-tavern-451091659?lid=451091659',
'/new-orleans-la/mip/lafittes-blacksmith-shop-bar-19195002?lid=19195002',
'/new-orleans-la/mip/johnny-whites-pub-grill-5198728?lid=5198728',
'/new-orleans-la/mip/chart-room-6924442?lid=6924442',
'/new-orleans-la/mip/golden-lantern-8517918?lid=8517918',
'/new-orleans-la/mip/ryans-irish-pub-inc-851820?lid=851820',
'/new-orleans-la/mip/d-b-a-2084747?lid=2084747',
'/new-orleans-la/mip/parlays-13663513?lid=13663513',
'/new-orleans-la/mip/apple-barrel-18379645?lid=18379645',
'/new-orleans-la/mip/snake-jakes-xmas-club-lounge-4531421?lid=4531421',
'/new-orleans-la/mip/port-of-call-394043?lid=394043',
'/new-orleans-la/mip/coops-place-14511722?lid=14511722',
'/new-orleans-la/mip/twi-ro-pa-466224645?lid=466224645',
'/new-orleans-la/mip/krazy-korner-11594425?lid=11594425',
'/new-orleans-la/mip/bourbon-o-480103567?lid=480103567',
'/new-orleans-la/mip/hi-ho-lounge-458821090?lid=458821090',.....]
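Note that these hrefs are relative paths. If you need absolute URLs, you can join them with the site root; a small sketch, where the base URL is an assumption:

from urllib.parse import urljoin

base = 'https://www.yellowpages.com'   # assumed site root
full_urls = [urljoin(base, i.get('href')) for i in info]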
I think this is what you need:
for each_business in info:
    yp_bus_url = each_business.find('a', {'class': 'business-name'}).get('href')
    print(yp_bus_url)
The below code should work for you:
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
info = soup.findAll('div', {'class': 'info'})
for each_business in info:
    # Your fix here
    for a in each_business.find_all('a', href=True):
        print("Found the URL:", a['href'])
I need to access the following website: http://mothoq.com/store/22, scroll down until I see the phone icon, click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data I need, except for the phone number.
I have tried to use
soup.find_all('p', attrs={"align":"center"})
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align":"center"})
        storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']
print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
try:
    storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except:
    pass
print(storePhone)
Does anyone know how to scrape a list of URLs from the same website with BeautifulSoup? list = ['url1', 'url2', 'url3'...]
==========================================================================
My code to extract a list of urls:
import requests
from bs4 import BeautifulSoup

url = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=2'
url1 = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=3'
url2 = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=4'
r = requests.get(url)
r1 = requests.get(url1)
r2 = requests.get(url2)
data = r.text
soup = BeautifulSoup(data, 'lxml')
links = []
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))
data1 = r1.text
soup = BeautifulSoup(data1, 'lxml')
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))
data2 = r2.text
soup = BeautifulSoup(data2, 'lxml')
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))
new = ['http://www.hkjc.com/chinese/racing/']*1123
url_list = ['{}{}'.format(x, y) for x, y in zip(new, links)]
Code to extract data from a single page URL:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'myurl'
r = requests.get(url)
r.encoding = 'utf-8'
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')
soup.findAll('tr')[27].findAll('td')
column_headers = [th.getText() for th in
                  soup.findAll('tr')[27].findAll('td')]
data_rows = soup.findAll('tr')[29:67]
data_rows
player_data = [[td.getText() for td in data_rows[i].findAll('td', {'class': ['htable_text', 'htable_eng_text']})]
               for i in range(len(data_rows))]
player_data_02 = []
for i in range(len(data_rows)):
    player_row = []
    for td in data_rows[i].findAll('td'):
        player_row.append(td.getText())
    player_data_02.append(player_row)
df = pd.DataFrame(player_data, columns=column_headers[:18])
Based on your links, collecting a subset of the table data goes like this:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
url_list = ['http://www.hkjc.com/english/racing/horse.asp?HorseNo=S217','http://www.hkjc.com/english/racing/horse.asp?HorseNo=A093','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V344','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V077', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=P361', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=T103']
for link in url_list:
    r = requests.get(link)
    r.encoding = 'utf-8'
    html_content = r.text
    soup = BS(html_content, 'lxml')
    table = soup.find('table', class_='bigborder')
    if not table:
        continue
    trs = table.find_all('tr')
    if not trs:
        continue  # if no trs are found, start the next iteration with the next link
    headers = trs[0]
    headers_list = []
    for td in headers.find_all('td'):
        headers_list.append(td.text)
    headers_list += ['Season']
    headers_list.insert(19, 'pseudocol1')
    headers_list.insert(20, 'pseudocol2')
    headers_list.insert(21, 'pseudocol3')
    res = []
    row = []
    season = ''
    for tr in trs[1:]:
        if 'Season' in tr.text:
            season = tr.text
        else:
            tds = tr.find_all('td')
            for td in tds:
                row.append(td.text.strip('\n').strip('\r').strip('\t').strip('"').strip())  # clean data
            row.append(season.strip())
            res.append(row)
            row = []
    res = [i for i in res if i[0] != '']
    df = pd.DataFrame(res, columns=headers_list)
    del df['pseudocol1'], df['pseudocol2'], df['pseudocol3']
    del df['VideoReplay']
    df.to_csv('/home/username/' + str(url_list.index(link)) + '.csv')
If you want to store the data from all the tables in one dataframe, this little modification will do the trick:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
url_list = ['http://www.hkjc.com/english/racing/horse.asp?HorseNo=S217','http://www.hkjc.com/english/racing/horse.asp?HorseNo=A093','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V344','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V077', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=P361', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=T103']
res = []  # placing res outside of the loop
for link in url_list:
    r = requests.get(link)
    r.encoding = 'utf-8'
    html_content = r.text
    soup = BS(html_content, 'lxml')
    table = soup.find('table', class_='bigborder')
    if not table:
        continue
    trs = table.find_all('tr')
    if not trs:
        continue  # if no trs are found, start the next iteration with the next link
    headers = trs[0]
    headers_list = []
    for td in headers.find_all('td'):
        headers_list.append(td.text)
    headers_list += ['Season']
    headers_list.insert(19, 'pseudocol1')
    headers_list.insert(20, 'pseudocol2')
    headers_list.insert(21, 'pseudocol3')
    row = []
    season = ''
    for tr in trs[1:]:
        if 'Season' in tr.text:
            season = tr.text
        else:
            tds = tr.find_all('td')
            for td in tds:
                row.append(td.text.strip('\n').strip('\r').strip('\t').strip('"').strip())
            row.append(season.strip())
            res.append(row)
            row = []
res = [i for i in res if i[0] != '']  # outside of the loop
df = pd.DataFrame(res, columns=headers_list)  # outside of the loop
del df['pseudocol1'], df['pseudocol2'], df['pseudocol3']
del df['VideoReplay']
df.to_csv('/home/Username/' + 'tables.csv')  # outside of the loop