Python scraping gmc-uk.org

I am trying to scrape this website: https://www.gmc-uk.org/doctors/register/LRMP.asp
Below is the code I wrote, but it is not working:
import requests, csv, re, sys
from lxml import html

def parser1(keyword, source):
    with open(str(keyword) + '.csv', 'wb') as export:
        writer = csv.writer(export)
        for each in re.findall('<tr><td class="listapplettablerows" >(.+?)</tr>', source, re.DOTALL):
            new_each = '<td class="listapplettablerows" >' + each
            source = html.fromstring(new_each)
            lines = source.xpath('//td[@class="listapplettablerows"]//text()')
            #print (lines)
            try:
                writer.writerow([lines[0], lines[1], lines[2], lines[3], lines[4], lines[5], lines[6]])
            except:
                writer.writerow([lines[0], lines[1], lines[2], lines[3], lines[4], None, lines[5]])

def make_requests(url, keyword, SWETS):
    s = requests.Session()
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'en-US,en;q=0.9',
               'Host': 'webcache.gmc-uk.org',
               #'Cookie': '_ga=GA1.2.1612314458.1511275507; _gid=GA1.2.1054886815.1511275507',
               'Referer': 'http://webcache.gmc-uk.org/gmclrmp_enu/start.swe?SWENeedContext=false&SWECmd=GetCachedFrame&W=t&SWEACn=7691&_sn=AVN6CAdOO0TLfHYEWmkfiCc5NXsWqEWnu1QinbOLc8NU.5VYcL46LP-V1h1wBqvlQYqNVBRCbMk6wOV9ByGHIw6-NgaeeOCxe-VxSekkxnLHXZZSKGnrBiJaYUTe-S7K.d3nInri.S4wG6fk0CD4JAEKBxpsYv8C0hibwdV3LcAlTqBpiFSlHFjguoh8q8WZOtzdmX07Geg_&SWEC=1&SWEFrame=top._sweclient._sweview&SWEBID=-1&SRN=&SWETS=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    s.get('https://www.gmc-uk.org/doctors/register/LRMP.asp')
    r = s.get(url)
    formdata = {'s_3_1_5_0': '',
                's_3_1_3_0': '',
                's_3_1_9_0': keyword,
                's_3_1_6_0': '60',
                's_3_1_4_0': '40',
                's_3_1_7_0': '',
                'SWEFo': 'SWEForm3_0',
                'SWEField': 's_3_1_10_0',
                'SWENeedContext': 'true',
                'SWENoHttpRedir': 'true',
                'W': 't',
                'SWECmd': 'InvokeMethod',
                'SWEMethod': 'NewQuerySearch',
                'SWERowIds': '',
                'SWESP': 'false',
                'SWEVI': '',
                'SWESPNR': '',
                'SWEPOC': '',
                'SWESPNH': '',
                'SWEH': '',
                'SWETargetView': '',
                'SWEDIC': 'false',
                '_sn': url.split('_sn=')[1].split('&')[0],
                'SWEReqRowId': '1',
                'SWEView': 'GMC WEB Doctor Search',
                'SWEC': '1',
                'SWERowId': 'VRId-0',
                'SWETVI': '',
                'SWEW': '',
                'SWEBID': re.findall('navigator.id = "(.+?)"', r.text, re.DOTALL)[0],
                'SWEM': '',
                'SRN': '',
                'SWESPa': '',
                'SWETS': SWETS,
                'SWEContainer': '',
                'SWEWN': '',
                'SWEKeepContext': '0',
                'SWEApplet': 'GMC WEB Health Provider Search Applet',
                'SWETA': ''}
    headers['Referer'] = url
    r1 = s.post('http://webcache.gmc-uk.org/gmclrmp_enu/start.swe', data=formdata)
    if 'Sorry but we cannot find a record that matches your search' not in r1.text:
        parser1(keyword, r1.text)

make_requests(sys.argv[1], sys.argv[2], sys.argv[3])
The problem is the SWETS key in the formdata dictionary. When I inspected the network traffic, I found that the website expects a POST request with SWETS set to a 13-digit GMT Unix timestamp (milliseconds). However, I cannot work out where the correct value comes from: I do not see it in any of the JavaScript responses from the server, and when I send a freshly generated 13-digit timestamp the site says "invalid input". Please take a look and advise on possible steps.
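For reference, a 13-digit GMT stamp is just the Unix epoch time in milliseconds. Below is a minimal sketch of generating one, plus a guess at pulling a server-issued SWETS value out of an earlier response; the idea that the server embeds the token in a previous HTML/JS response is an assumption, not confirmed behaviour of this site.

import re
import time

# 13-digit "GMT linux datetime stamp" = Unix epoch time in milliseconds.
swets_guess = str(int(time.time() * 1000))

# Assumption: the server may embed the SWETS token it expects in an earlier
# HTML/JS response (e.g. in a cached-frame URL). Grepping for it is a guess.
def find_swets(response_text):
    match = re.search(r'SWETS=(\d{13})', response_text)
    return match.group(1) if match else None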

Related

I am trying to navigate through the pages of a website and scrape its links, but the same page's data is scraped even after changing the page number.

from bs4 import BeautifulSoup
import requests
import pymongo

def traverse_source():
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            "path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
            "Sec-fetch-site": "same-origin",
        }
        source_url = ""
        source_data = requests.get(source_url, headers=headers)
        print(source_data.headers)
        source_url = None
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        destination = source_soup.find_all("h3", attrs={'class': 'toc__item__title'})
        for dest in destination:
            try:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
            except:
                pass
        source_soup = None
    print(article_links)

if __name__ == "__main__":
    traverse_source()
Here, even after incrementing the page number, the content of the first page is always scraped. I tried navigating through the pages with a GET request (changing the URL), but it still scrapes the data of page 1.
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
s = requests.Session()
s.headers.update(headers)
big_list = []

for x in tqdm(range(9)):
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = soup.select('div.articleCitation')
    for t in titles:
        url = t.select_one('h3 a').get('href')
        header = t.select_one('h3 a').text
        try:
            authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
        except Exception as e:
            authors = 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))

df = pd.DataFrame(list(set(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]

Python/JSON Code - Where was this variable located?

This code is from a separate submission. If you look at the lines:
each['AuthorString']
each['Title']
I'm wondering where the user got these variables from. I navigated to the JSON page (Link) and could not find these variables. Maybe I'm on the wrong page? Screenshots would help.
Here is the code:
import requests

session_ids = ['13619', '13736']
for session_id in session_ids:
    url = 'https://cdn-solr.asco.org/solr/ml/mlselect'
    payload = '?_format=json&wt=json&indent=true&q=SessionId:' + session_id + '&start=0&rows=30&sort=score%20desc,%20SessionId%20asc&fq=RecordType:sessions&facet=true&f.Year.facet.sort=index&facet.field={!key=Year}Year&facet.field={!key=subject_thes}subject_thes&facet.field={!key=MediaTypes}MediaTypes&facet.field={!key=fctSessionType}fctSessionType&facet.pivot={!key=MeetingName}fctMeetingName,fctTrack&spellcheck.maxCollationTries=100'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    jsonData = requests.get(url + payload, headers=headers).json()
    sessionParticipationID = jsonData['response']['docs'][0]['SessionParticipationID']
    session_id_list = '%20OR%20'.join(sessionParticipationID)
    payload = '?_format=json&wt=json&indent=true&sort=PresentationOrderWithinSession%20asc,%20ISODateString%20asc,%20ISODateStringEnd%20asc&fl=_id,%20score,%20ISODateString,%20ISODateStringEnd,%20ISODateString_1,%20ISODateStringEnd_1,%20Year,%20Title,%20tempAbstractID,%20MediaID,%20VideoID,%20EdBookID,%20edBookTitle,%20PosterID,%20edBookTitle,%20SessionTitle,%20SessionTypeId,%20AuthorString,%20AbstID,%20Role,%20FullName,%20PosterBoard,%20Institution,%20ProgramTitle,%20MeetingName,%20FirstAuthor&q=_id:(' + session_id_list + ')&rows=' + str(len(sessionParticipationID))
    jsonData = requests.get(url + payload, headers=headers).json()
    title_auth = []  # <-- to make a list of {title: author} dictionaries
    for each in jsonData['response']['docs']:
        title = each['Title']  # this line
        author = each['AuthorString']  # and this
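One way to see where those field names come from is to print the keys of a document returned by the second Solr query; the fl= parameter in the payload above explicitly requests Title and AuthorString, so they should appear in each doc. A minimal sketch, assuming the requests above have already run inside the loop:

# Inspect the fields the Solr response actually returns for one document.
# Assumes jsonData is the result of the second requests.get(...).json() above.
first_doc = jsonData['response']['docs'][0]
print(sorted(first_doc.keys()))  # should include 'Title' and 'AuthorString'
print(first_doc.get('Title'), '-', first_doc.get('AuthorString'))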

How can I append these two beautiful soup objects into a dataframe simultaneously?

I'm trying to add the dates and dividends from Yahoo Finance into a dataframe at the same time, so that the dates and dividends stay matched up. I'm not really sure how to do this because I don't know how to use both for loops at the same time. Hoping you might be able to make an attempt. Thank you!
import pandas as pd
from datetime import date
import sys
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/MSFT/history?period1=1570838400&period2=1602460800&interval=div%7Csplit&filter=div&frequency=1d&includeAdjustedClose=true"
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.text, "html.parser")
dividend = soup.findAll("strong")
date = soup.find_all('td', class_='Py(10px) Ta(start) Pend(10px)')
dividends = []
for f in dividend:
    dividends.append(f.text)
dates = []
for f in date:
    dates.append(f.text)
print(dates, dividends)
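If the two lists line up one-to-one, you don't need to run both loops at once: zip the lists you already built into rows and hand them to pandas. A minimal sketch, assuming dates and dividends from the code above have the same length and order:

# Pair each scraped date with the dividend scraped at the same position.
# Assumes dates[i] corresponds to dividends[i]; if the page layout breaks
# that alignment, rows will be mismatched.
df = pd.DataFrame(list(zip(dates, dividends)), columns=["Date", "Dividend"])
print(df.head())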

Scraping the seat layout page of BookMyShow using Python

I am trying to scrape the BookMyShow website to find movie details, such as the show times for which tickets are available and how many seats are available. I have figured out how to get the show timings that have seats available, but now I want to get the total seats available for a show. My code is:
import requests
from bs4 import BeautifulSoup
import json

base_url = "https://in.bookmyshow.com"
s = requests.session()
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
r = s.get("https://in.bookmyshow.com/vizag/movies", headers=headers)
print(r.status_code)
soup = BeautifulSoup(r.text, "html.parser")
movies_list = soup.find("div", {"class": "__col-now-showing"})
movies = movies_list.findAll("a", {"class": "__movie-name"})
for movie in movies:
    print(movie.text)
show = []
containers = movies_list.findAll("div", {"class": "card-container"})
for container in containers:
    try:
        detail = container.find("div", {"class": "__name overflowEllipses"})
        button = container.find("div", {"class": "book-button"})
        print(detail.text)
        print(button.a["href"])
        url_ticket = base_url + button.a["href"]
        show.append(url_ticket)
    except:
        pass
for i in show:
    print(i)
for t in show:
    res = s.get(t, headers=headers)
    bs = BeautifulSoup(res.text, "html.parser")
    movie_name = bs.find("div", {"class": "cinema-name-wrapper"})
    print(movie_name.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    venue_list = bs.find("ul", {"id": "venuelist"})
    venue_names = venue_list.findAll("li", {"class": "list"})
    try:
        for i in venue_names:
            vn = i.find("div", {"class": "__name"})
            print(vn.text.replace(" ", "").replace("\t", "").replace("\n", ""))
            show_times = i.findAll("div", {"data-online": "Y"})
            for st in show_times:
                print(st.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    except:
        pass
    print("\n")
heads = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "origin": "https://in.bookmyshow.com",
    "referer": "https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
rr = s.post("https://b-eu.simility.com/b?c=bookmyshow&v=1.905&ec=BLOFaZ2HdToCxwcr&cl=0&si=5a76bfce6ae4a00027767ae9&sc=3B0CB9F4-4A27-4588-9FB4-A2A2760569BC&uc=D834EDA4-57E4-4889-A34F-473AC6BBDDBB&e=Seatlayout&cd=.simility.com&r=0&st=1517731803171&s=792a6c66313a2032223133302633343a2c393a322e3c202422636e312a382037633f3c606669673e61653e6338323230353f3c35616f3b2a2c2269663a203820606765696d7371606f77282e2a61663320327e70756f2e2a63643e20326c776e6e242861643f20326e75666e24206166342a306c75666e2422636e352a386c776e64262073692032223348324b403b4436253e43323d2f3c3538322f314440362f493843323d3438353633404b202e20776b2838224e3a3b34454e433c2f3735473c273638323b2541333e4425363531434b3c40424e464a422226206a66303120326c636c79672422626e303a203864636479672c28716c32342838253131322e2a7966323f203231353b353f31333a323b3b353326207b643428382a32202e207b6e302230767a756526207b663420382a6f6c2d5f512a2c2279663f203859206d642f5559202422656420552e2071663028383026207b6431392032204f6d7861666e6125372630202255616c666d757b2a4c542a33382e3031225f6b6c3436332a7a363e2b2841707a6e6d55676049617e2d3539352633362a2a434a564f4e242a6e6961672847656969672b22416a7a656f6525343b2e3024313a313b2c333b3822536b6469726925373b352c31342a2620736e3338223a2855616c313020242871643b362a3a224d6d67656e67224164612e282e2a73643b342a383a3036242871643b352a3a313f313e2e2071663932203a32343c2c227966393b2038333d39342c28716c323028383a362e20716c38332230303c2c22686639362038767a7f672c28606c313628383b2e206066393d203a282f3a30303f363c353a3a332a2620626e3330223a282024207565332a3076727f672422776d302a385920756d68656c282e2a65787a677a6b6f676c7c6b6e2d7d676a676c285f24207565342a3020576f60436974282e2a756535203228556568496174205d676a454e202e2a7d65323d203274727f6724207565312a30202d3b333c3833323a31333a202e2a7a66312838535b226b72786e6b61637c636d6e257a25676f656564672f616a7a656f6527726c66222620616c766770666b6e2d7a666e2d7663677f6770202e2a496a72656f6d20504e4428526e77656164202c6477646c5d26592a6372726e61696374636d662f706e642a2e206f6a626c606d6e656b666a68607863676d68676c6d6865676e67696f6a62636b202e2a496a72656f6d20504e4428546b67756d78202c6477646c5d26592a6372726e61696374636d662f78276c69616e2e63787a6e6969637c696f642d702f726c636b66202c286b667465786c696e2f6c636b662f7066776f696e282e2a4c63766b7e6f2243666b6d6e74282e66776e6e5f245120617a726469636b76616d6c2d7a257a72617a6b2577696e677e6b6c672f6b6e6f2226207f69646f74616c676166656b66617a766d722e6e6e64202e2055616e6776636c6d2043656c7c676c76224c6f617273727c696f6422456d66776e6d282e223b2c3c2e38243338303b205f5577",headers =heads) # i got the link while i was inspecting the booking tickets page
f = s.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204#!seatlayout")  # this is the page that is displayed when we click the show time
ff = f.text
j = json.loads(ff)
print(j)
Once I get the source code of this page, I can extract seat availability easily, but I am unable to get that page. How can I do this? Thanks in advance!
Steps:
1) Use Selenium to click on the show-time block:
driver.find_element_by_xpath('<enter xpath>').click()
To find the XPath, inspect the element in your browser, right-click it, and choose "Copy XPath".
time.sleep(4)  # wait 4 seconds for the page to appear
2) Get the HTML source code using:
html = driver.page_source
Then use Beautiful Soup to scrape the page:
soup = BeautifulSoup(html, 'html.parser')
Find all anchor tags with class '_available' and count them, then find all anchor tags with class '_blocked' and count them. From these counts you can work out the total number of seats and the number of available seats; a sketch pulling these steps together follows below.
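A minimal sketch of those steps, assuming Selenium 3-style method names; the XPath placeholder and the '_available'/'_blocked' class names are taken from the answer above and may need adjusting against the live page:

import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204")

# 1) Click the show-time block (replace the placeholder with the XPath copied
#    from your browser's inspector).
driver.find_element_by_xpath('<enter xpath>').click()
time.sleep(4)  # wait for the seat-layout page to render

# 2) Parse the rendered page and count seats by class (class names assumed
#    from the answer above).
soup = BeautifulSoup(driver.page_source, 'html.parser')
available = soup.find_all('a', class_='_available')
blocked = soup.find_all('a', class_='_blocked')
print('Available seats:', len(available))
print('Total seats:', len(available) + len(blocked))

driver.quit()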

BeautifulSoup Scraping no output

import requests
from bs4 import BeautifulSoup

def findPosts():
    url = 'http://espn.go.com/nba/scoreboard'
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.3"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    team1 = soup.find_all('a', {'name': "&lpos=nba:scoreboard:team"})
    score1 = soup.find_all('td', {'class': 'total'})
    print(team1)
    print(score1)

findPosts()
I am receiving empty lists, but I am sure that the page's source code contains the elements I specified. Is there anything in the BeautifulSoup documentation that I am missing?
The data on that page is dynamically created through JavaScript. If you right-click in your browser -> View Source and look for the anchors with the name you provided, you will find nothing.
From what I can tell, all of the JSON data needed to build the page is already embedded in it, so you don't need to make any extra requests to get the data you want.
To find the JSON data on the page, I searched for one of the team names (Mavericks) and saw a massive JavaScript object containing what appears to be the data you want to scrape.
You can extract the JSON using a regex and access the data using dict notation:
from bs4 import BeautifulSoup
import requests
import re
import json
url='http://espn.go.com/nba/scoreboard'
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.3"
soup = BeautifulSoup(requests.get(url, headers = headers).text, "html.parser")
script = soup.find_all('script')[7].get_text()
map_search = re.search('^.*?= (\{.*);window.*', script)
mapData = map_search.group(1)
mapDataObj = json.loads(mapData)
scores = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['linescores']
name = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['team']['shortDisplayName']
total_score = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['score']
print('Team: %s' % name)
for score in scores:
    print('Score: %s' % score['value'])
print('Total score: %s' % total_score)
Output:
Team: Pacers
Score: 19
Score: 24
Score: 27
Score: 30
Total score: 100
