I am trying to scrape the bookmyshow website to find movie details such as the times at which tickets are available and how many seats are available. I have worked out how to get the show timings for which seats are available, but now I want to get the total number of seats available in a given show. My code is:
import requests
from bs4 import BeautifulSoup
import json
base_url = "https://in.bookmyshow.com"
s = requests.session()
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}


def _squash(text):
    """Return *text* with every space, tab and newline removed."""
    return text.replace(" ", "").replace("\t", "").replace("\n", "")


# Fetch the "now showing" listing and print every movie name on it.
r = s.get("https://in.bookmyshow.com/vizag/movies", headers=headers)
print(r.status_code)
soup = BeautifulSoup(r.text, "html.parser")
movies_list = soup.find("div", {"class": "__col-now-showing"})
movies = movies_list.findAll("a", {"class": "__movie-name"})
for movie in movies:
    print(movie.text)

# Collect the absolute booking URL of every movie card that has one.
show = []
containers = movies_list.findAll("div", {"class": "card-container"})
for container in containers:
    detail = container.find("div", {"class": "__name overflowEllipses"})
    button = container.find("div", {"class": "book-button"})
    # Skip incomplete cards explicitly instead of hiding every possible
    # error behind a bare ``except: pass``.
    if detail is None or button is None or button.a is None:
        continue
    href = button.a.get("href")
    if not href:
        continue
    print(detail.text)
    print(href)
    show.append(base_url + href)

for i in show:
    print(i)

# For each booking page print the movie, its venues and the show times
# that are flagged as bookable online (data-online="Y").
for t in show:
    res = s.get(t, headers=headers)
    bs = BeautifulSoup(res.text, "html.parser")
    movie_name = bs.find("div", {"class": "cinema-name-wrapper"})
    print(_squash(movie_name.text))
    venue_list = bs.find("ul", {"id": "venuelist"})
    venue_names = venue_list.findAll("li", {"class": "list"})
    for venue in venue_names:
        vn = venue.find("div", {"class": "__name"})
        # A venue without a name block is skipped on its own; previously one
        # such entry silently aborted all remaining venues of the movie.
        if vn is None:
            continue
        print(_squash(vn.text))
        for st in venue.findAll("div", {"data-online": "Y"}):
            print(_squash(st.text))
    print("\n")
heads = {
"accept":"*/*",
"accept-encoding":"gzip, deflate, br",
"accept-language":"en-US,en;q=0.9",
"origin":"https://in.bookmyshow.com",
"referer":"https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
rr = s.post("https://b-eu.simility.com/b?c=bookmyshow&v=1.905&ec=BLOFaZ2HdToCxwcr&cl=0&si=5a76bfce6ae4a00027767ae9&sc=3B0CB9F4-4A27-4588-9FB4-A2A2760569BC&uc=D834EDA4-57E4-4889-A34F-473AC6BBDDBB&e=Seatlayout&cd=.simility.com&r=0&st=1517731803171&s=792a6c66313a2032223133302633343a2c393a322e3c202422636e312a382037633f3c606669673e61653e6338323230353f3c35616f3b2a2c2269663a203820606765696d7371606f77282e2a61663320327e70756f2e2a63643e20326c776e6e242861643f20326e75666e24206166342a306c75666e2422636e352a386c776e64262073692032223348324b403b4436253e43323d2f3c3538322f314440362f493843323d3438353633404b202e20776b2838224e3a3b34454e433c2f3735473c273638323b2541333e4425363531434b3c40424e464a422226206a66303120326c636c79672422626e303a203864636479672c28716c32342838253131322e2a7966323f203231353b353f31333a323b3b353326207b643428382a32202e207b6e302230767a756526207b663420382a6f6c2d5f512a2c2279663f203859206d642f5559202422656420552e2071663028383026207b6431392032204f6d7861666e6125372630202255616c666d757b2a4c542a33382e3031225f6b6c3436332a7a363e2b2841707a6e6d55676049617e2d3539352633362a2a434a564f4e242a6e6961672847656969672b22416a7a656f6525343b2e3024313a313b2c333b3822536b6469726925373b352c31342a2620736e3338223a2855616c313020242871643b362a3a224d6d67656e67224164612e282e2a73643b342a383a3036242871643b352a3a313f313e2e2071663932203a32343c2c227966393b2038333d39342c28716c323028383a362e20716c38332230303c2c22686639362038767a7f672c28606c313628383b2e206066393d203a282f3a30303f363c353a3a332a2620626e3330223a282024207565332a3076727f672422776d302a385920756d68656c282e2a65787a677a6b6f676c7c6b6e2d7d676a676c285f24207565342a3020576f60436974282e2a756535203228556568496174205d676a454e202e2a7d65323d203274727f6724207565312a30202d3b333c3833323a31333a202e2a7a66312838535b226b72786e6b61637c636d6e257a25676f656564672f616a7a656f6527726c66222620616c766770666b6e2d7a666e2d7663677f6770202e2a496a72656f6d20504e4428526e77656164202c6477646c5d26592a6372726e61696374636d662f706e642a2e206f6a626c606d6e656b666a68607863676d68676c6d6865676e67696f6a
62636b202e2a496a72656f6d20504e4428546b67756d78202c6477646c5d26592a6372726e61696374636d662f78276c69616e2e63787a6e6969637c696f642d702f726c636b66202c286b667465786c696e2f6c636b662f7066776f696e282e2a4c63766b7e6f2243666b6d6e74282e66776e6e5f245120617a726469636b76616d6c2d7a257a72617a6b2577696e677e6b6c672f6b6e6f2226207f69646f74616c676166656b66617a766d722e6e6e64202e2055616e6776636c6d2043656c7c676c76224c6f617273727c696f6422456d66776e6d282e223b2c3c2e38243338303b205f5577",headers =heads) # i got the link while i was inspecting the booking tickets page
# Request the booking page for one show and try to decode the body as JSON.
# Unlike the earlier requests in this script, no custom headers are passed here.
# NOTE(review): "#!seatlayout" is a URL fragment; fragments are resolved by the
# browser and are not sent to the server, so this presumably returns the plain
# booking-page HTML rather than seat data - confirm by inspecting f.text.
f = s.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204#!seatlayout") # this is the page gets displayed when we click the show time
ff = f.text
# json.loads raises a ValueError (json.JSONDecodeError) unless the body is valid JSON.
j = json.loads(ff)
print(j)
Once I have the source code of this page, I can get the seat availability easily, but I am unable to retrieve that page. How can I do this? Thanks in advance!
Steps:
1) use selenium to click on the time showing block
driver.find_element_by_xpath('<enter xpath>').click()
To find the XPath, open the browser's "Inspect Element" panel, right-click the element in the HTML tree, and choose Copy → Copy XPath.
time.sleep(4) # wait for 4 seconds for the page to appear
2) Get the html source code using
html = driver.page_source
then use beautiful soup to scrap the page
soup = BeautifulSoup(html,'html.parser')
Find all <a> tags with class '_available' and count them, then
find all <a> tags with class '_blocked' and count them.
From these two counts you can work out the number of available seats and the total number of seats (available + blocked).
Related
I need help with the use of BeautifulSoup. How do I extract article text and article title from a list of URLs? Here is my current state of the code::
# For each row of `df`, download the URL held in the row's first value and
# print the text of the element with class 'td-post-content'.
for i in range(0,len(df)):
j = df.iloc[i].values
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
page = requests.get(j[0],headers = headers)#loading text in url
soup = BeautifulSoup(page.content,'html.parser')#parsing url text
content = soup.findAll(attrs = {'class':'td-post-content'})#extracting only text part
# NOTE(review): indexing [0] raises IndexError whenever findAll matched nothing.
content = content[0].text
# `title` holds every element with class 'entry-title'; it is not used below.
title = soup.findAll(attrs = {'class':'entry-title'})#extracting title of website
#title = title[16].text
print(content)
But I get an IndexError. Can anyone help?
You need to add some checks and error handling in your code.
# Walk the rows of `df`, fetch the URL stored in each row's first value and
# print the article body; stop at the first request that fails or is empty.
for row_index in range(len(df)):
    row = df.iloc[row_index].values
    target_url = row[0]
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    page = requests.get(target_url, headers=headers)

    # Report the failing URL and bail out before trying to parse anything.
    request_failed = page.status_code != 200 or not page.content
    if request_failed:
        prefix = 'Empty content - ' if not page.content else ''
        print(f'{prefix}{page.status_code} {page.reason} - with url {target_url}')
        break

    soup = BeautifulSoup(page.content, 'html.parser')

    # Fall back to an empty string when no 'td-post-content' element exists,
    # which is what prevents the IndexError.
    matches = soup.findAll(attrs={'class': 'td-post-content'})
    if len(matches) > 0:
        content = matches[0].text
    else:
        content = ''

    # Collected but not used further.
    title = soup.findAll(attrs={'class': 'entry-title'})
    print(content)
Before getting soup, this checks whether your request worked, and then for content, it only tries to get the text if findAll returned anything at all - otherwise it just sets as ''; this avoids the index error.
If findAll should be returning something, you should set a break at this point as well and examine your soup and/or page.content - you might not be getting the html you are expecting. While blockers should trigger the if page.status_code != 200 part, many sites require login, or require you to confirm some thing/s or render parts of the page dynamically with js, so that data would be missing from the request response...
from bs4 import BeautifulSoup
import requests
import pymongo
def traverse_source(page_count=9):
    """Collect article links from every page of the issue's table of contents.

    The page index is sent as the ``pageStart`` query parameter of the issue
    URL. Putting it in a ``path`` request header has no effect on which page
    is fetched, so that header is no longer sent.

    Args:
        page_count: Number of table-of-contents pages to fetch, starting at
            ``pageStart=0``. Defaults to 9, the value previously hard-coded.

    Returns:
        The list of absolute article URLs found (also printed).
    """
    site_root = "https://www.annemergmed.com"
    issue_url = f"{site_root}/issue/S0196-0644(21)X0012-1"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        "Sec-fetch-site": "same-origin",
    }

    article_links = []
    for pgindx in range(page_count):
        source_data = requests.get(
            issue_url, params={"pageStart": pgindx}, headers=headers
        )
        print(source_data.headers)
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        for dest in source_soup.find_all("h3", attrs={'class': 'toc__item__title'}):
            anchor = dest.a
            # Headings without a link are skipped explicitly rather than by
            # swallowing every exception.
            if anchor is None or not anchor.get('href'):
                continue
            article_links.append(site_root + anchor['href'])
    print(article_links)
    return article_links


if __name__ == "__main__":
    traverse_source()
Here, even after incrementing the page number in the URL, the content of the first page is always scraped. I tried navigating through the pages with GET requests (changing the URL each time), but the scraper still returns the data of page 1.
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# One session so the User-Agent is sent with every page request.
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(9)):
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    for t in soup.select('div.articleCitation'):
        # Look the title link up once; a citation without one cannot yield a
        # title or URL, so it is skipped instead of raising AttributeError.
        title_link = t.select_one('h3 a')
        if title_link is None:
            continue
        url = title_link.get('href')
        header = title_link.text
        # Only a missing author list falls back to 'Unknown'; other errors
        # are no longer masked by a blanket ``except Exception``.
        authors_tag = t.select_one('ul.toc__item__authors')
        authors = authors_tag.get_text(strip=True) if authors_tag is not None else 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))

# set() removes rows that appear on more than one page.
df = pd.DataFrame(list(set(big_list)), columns = ['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]
I have a list of movies that I want to scrap the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
# Named `movie_titles` rather than `list` so the builtin is not shadowed.
movie_titles = ['Se7en','Cinema Paradiso','The Shining','Toy Story 3','Capernaum']
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}

# Maps each movie title to the <a> tags found in its genre box.
gen2 = {}
for movie_title in movie_titles:
    # The space before "movie genre" matters: without it the query becomes
    # e.g. "Se7enmovie genre".
    user_query = f'{movie_title} movie genre'
    # `params` lets requests URL-encode the spaces in the query.
    page = requests.get(
        'https://www.google.co.in/search',
        params={'q': user_query},
        headers=headers,
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c is not None:
        gen2[movie_title] = c.findAll('a')
But it returns an empty dict, so I checked one by one and it worked, for example:
import requests
from bs4 import BeautifulSoup
# Stand-alone check for one movie; here the query string has a space between
# the title and "movie genre".
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# This snippet looks up class 'KKHQ8c'.
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
# Every iteration assigns to the same key, so only the last <a> tag is kept.
for genre in genres:
h['Se7en']=genre
So I find out that in the for loop the variable c is returning None.
I can't figure out why! It only return None inside the loop.
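Currently, your URLs are of the form
https://www.google.co.in/search?q=Se7enmovie genre
(the title runs straight into "movie genre" because no space is added between them, and the remaining spaces are not URL-encoded), so the results returned by Google aren't accurate for all the movies.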
You can change it to
`for i in list:
i="+".join(i.split(" "));
user_query = i + "+movie+genre"
URL = 'https://www.google.com/search?q=+'+user_query`
also, movies that belong to a single genre like Cinema Paradiso are in a div with class name "Z0LcW".
This is my code, I want to take the location's name and link, the variable "lugares" finds multiple item-containers, but I only want the first one [0]; then goes the for loop, but I can't find the span classes.
from bs4 import BeautifulSoup
import requests
# Collect [name, link] pairs for the locations listed in the first
# "items-container" div of the page.
b=[]
# Site root, prepended to the hrefs found below.
i="https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page =requests.get(url,headers=encabezado)
soup = BeautifulSoup(page.content,"html.parser")
lugares = soup.find_all("div",{"class":"items-container"})
# From here on `lugares` is a single Tag (the first container), not a list.
lugares=lugares[0]
print(len(lugares))
# NOTE(review): iterating a Tag yields its direct children, not the nested
# <span class="item"> elements; presumably this is why the span lookup below
# fails - confirm against the page markup.
for lugar in lugares:
locationlink = i + str(lugar.find("span",{"class":"item"}).find("a")["href"])
location= lugar.find("span",{"class":"item"}).text
a=[location,locationlink]
b.append(a)
There are multiple ways to reach this goal; the best one depends on what you expect and what you want to do with this information in later processing.
First Option
If you are just looking for the infos of first location you can do the following:
lugar = soup.select_one('div.items-container a')
b = [lugar.text, f'{i}{lugar["href"]}']
or
lugar = soup.select('div.items-container a')[0]
b = [lugar.text, f'{i}{lugar["href"]}']
Both select the first <a> in the <div> with class items-container.
Output
['Huixquilucan','https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']
Alternative
If you are interested to get all at once, you should use a list of dicts, so later on you just have to iterate it and get all information in place:
[{'name':x.text, 'link':f'{i}{x["href"]}'} for x in soup.select('div.items-container a')]
Output
[{'name': 'Huixquilucan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1'},
{'name': 'Naucalpan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/naucalpan/v1c1098l10710p1'},
{'name': 'Atizapán',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/atizapan/v1c1098l10662p1'},
{'name': 'Metepec',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/metepec-edomex/v1c1098l10707p1'},...]
Example (showing results of both)
from bs4 import BeautifulSoup
import requests
i = "https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}

page = requests.get(url, headers=encabezado)
soup = BeautifulSoup(page.content, "html.parser")

# Option 1: only the first anchor inside the items container.
lugar = soup.select_one('div.items-container a')
first_name = lugar.text
first_link = i + lugar["href"]
b = [first_name, first_link]
print(f'First lugar:\n {b} \n')

# Option 2: every anchor inside the items container, as a list of dicts.
allLugaros = []
for x in soup.select('div.items-container a'):
    entry = {'name': x.text, 'link': i + x["href"]}
    allLugaros.append(entry)

print(f'First lugar from lugaros (list of dict):\n {allLugaros[0]} \n')
print(f'All lugaros as list of dict:\n {allLugaros} \n')
First, you need to get all the spans inside the first container, lugares[0].
Then you need to iterate over each span to get the link and text of each location.
The Code:
from bs4 import BeautifulSoup
import requests
b = []
i = "https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page = requests.get(url, headers=encabezado)
soup = BeautifulSoup(page.content, "html.parser")
lugares = soup.find_all("div", {"class": "items-container"})
print(len(lugares))

# Prefix for the location links; constant, so defined once outside the loop.
site = "www.vivanuncios.com.mx"

# Get all spans of the first container (none when the page has no container,
# which would otherwise raise IndexError on lugares[0]).
spans = lugares[0].find_all("span", {"class": "item"}) if lugares else []

# Iterate through each span and collect [location, locationlink] pairs.
for span in spans:
    anchor = span.find("a")
    # A span without a usable link carries no location to record.
    if anchor is None or not anchor.get("href"):
        continue
    location = anchor.text
    locationlink = f"{site}{anchor['href']}"
    b.append([location, locationlink])

# Only show the first entry when at least one location was found.
if b:
    print(b[0])
Output:
['Huixquilucan', 'www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']
Currently using the below Python scraper to pull Job title, Company, Salary, and Description. Looking for a way to take it one step further by filtering only results where application link is URL to company website, as opposed to the 'Easily Apply' postings that send application through Indeed. Is there a way to do this?
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(page):
    """Download one Indeed results page and return it as parsed HTML.

    Args:
        page: Result offset of the page to fetch, sent as the ``start`` query
            parameter (NOTE(review): ``start`` is assumed to be Indeed's
            paging parameter - confirm against the live site).

    Returns:
        A BeautifulSoup document built from the response body.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    url = 'https://www.indeed.com/jobs?q=Software%20Engineer&l=Austin%2C%20TX&ts=1630951951455&rq=1&rsIdx=1&fromage=last&newcount=6&vjk=c8f4815c6ecfa793'
    # The second positional argument of requests.get is `params`, so the
    # headers must be passed by keyword or the User-Agent ends up in the query
    # string. `page` is appended to the existing query as `start`.
    r = requests.get(url, params={'start': page}, headers=headers)  # 200 is OK, 404 is page not found
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
# <span title="API Developer"> API Developer </span>
def transform(soup):
    """Append one dict per job card found in *soup* to the global ``jobList``.

    Each dict has the keys ``title``, ``company``, ``salary`` and
    ``description``; ``salary`` is ``""`` when the card shows none.

    The title is read from the span that carries a ``title`` attribute
    (e.g. ``<span title="API Developer">``), so cards that show a "new" label
    span before the title are no longer skipped. Cards lacking a title,
    company or description element are skipped.

    Args:
        soup: Parsed results page; only ``find_all`` is called on it and
            ``find`` / ``.text`` on the cards it returns.
    """
    for item in soup.find_all('div', class_='slider_container'):
        title_tag = item.find('span', title=True)
        company_tag = item.find('span', class_="companyName")
        snippet_tag = item.find('div', class_="job-snippet")
        if title_tag is None or company_tag is None or snippet_tag is None:
            continue

        # An absent salary is expected, so test for it instead of catching
        # every exception with a bare ``except``.
        salary_tag = item.find('span', class_="salary-snippet")
        salary = salary_tag.text.strip() if salary_tag is not None else ""

        job = {
            'title': title_tag.text.strip(),
            'company': company_tag.text.strip(),
            'salary': salary,
            'description': snippet_tag.text.strip().replace('\n', ''),
        }
        jobList.append(job)
# Filled by transform(); one dict per scraped job.
jobList = []

# go through multiple pages
for i in range(0, 100, 10):  # result offsets 0-90, stepping in 10's
    print(f'Getting page, {i}')
    # Pass the loop offset; a constant 0 here requests the same page each time.
    c = extract(i)
    transform(c)

print(len(jobList))
df = pd.DataFrame(jobList)
print(df.head())
df.to_csv('jobs.csv')
My approach is as follows-
Find the href from the <a> tag for each job card on the initial page, and then send a request to each of those links, and grab the external job link (If "Apply on Company Site" button is available) from there.
Code snippet-
#function which gets external job links
def get_external_link(url):
    """Fetch a job page and report its external application link, if any.

    Args:
        url: Absolute URL of the job page.

    Returns:
        The ``href`` of the first ``<a referrerpolicy="origin">`` element
        (the "Apply On Company Site" button), which is also printed, or
        ``None`` when the page has no such element.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, not `headers`.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # if Apply On Company Site button is available, fetch the link
    external_job_link = soup.find('a', attrs={"referrerpolicy": "origin"})
    if external_job_link is None:
        return None
    print(external_job_link['href'])
    return external_job_link['href']
#add this piece of code to transform function
def transform(soup):
    """Call get_external_link for every job card link found in *soup*.

    Job links are the ``<a>`` elements inside the
    ``mosaic-provider-jobcards`` div whose class starts with ``tapItem``.
    Nothing happens when that container is missing from the page.
    """
    cards = soup.find('div', class_='mosaic-provider-jobcards')
    if cards is None:
        # No job-card container on this page; calling find_all on None
        # would raise AttributeError.
        return
    links = cards.find_all("a", class_=lambda value: value and value.startswith("tapItem"))
    # for each job link in the page call get_external_link
    for link in links:
        href = link.get('href')
        if href:
            get_external_link('https://www.indeed.com' + href)
Note: you can also use the page source returned by these per-job requests to extract the title, company, salary and description that you previously scraped from the main results page.