BeautifulSoup Scraping no output - python

import requests
from bs4 import BeautifulSoup
def findPosts():
url = 'http://espn.go.com/nba/scoreboard'
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.3"
soup = BeautifulSoup(requests.get(url, headers = headers).text, "html.parser")
team1 = soup.find_all('a',{'name' : "&lpos=nba:scoreboard:team"})
score1 = soup.find_all('td',{'class' : 'total'})
print(team1)
print(score1)
findPosts()
I am receiving an empty list, but I am sure that the url source code contains the elements that I specified. Is there anything in the BeautifulSoup Documentation that I am using?

The data on that page is dynamically created through Javascript. If you right click in your browser -> view source, and look for the anchors with the name you provided, you will find nothing.
From what I can tell, all of the JSON data for the page to be created is already on the page, so you don't need to make any extra requests to get the data you want.
To find the JSON data on the page, I searched for one of the team names (Mavericks) and saw a massive Javascript object containing what appears to be the data you want to scrape.

You can extract the json using regex and access the data using dict notation:
from bs4 import BeautifulSoup
import requests
import re
import json
url='http://espn.go.com/nba/scoreboard'
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.3"
soup = BeautifulSoup(requests.get(url, headers = headers).text, "html.parser")
script = soup.find_all('script')[7].get_text()
map_search = re.search('^.*?= (\{.*);window.*', script)
mapData = map_search.group(1)
mapDataObj = json.loads(mapData)
scores = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['linescores']
name = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['team']['shortDisplayName']
total_score = mapDataObj['events'][0]['competitions'][0]['competitors'][1]['score']
print 'Team: %s' % name
for score in scores:
print('Score: %s' % score['value'])
print('Total score: %s' % total_score)
Output :
Team: Pacers
Score: 19
Score: 24
Score: 27
Score: 30
Total score: 100

Related

I am trying to navigate through the pages of a website and scrape its links but the same page data is scraped even after changing page number

from bs4 import BeautifulSoup
import requests
import pymongo
def traverse_source():
article_links = []
for pgindx in range(9):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
"path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
"Sec-fetch-site": "same-origin",
}
source_url = ""
source_data = requests.get(source_url,headers = headers)
print(source_data.headers)
source_url = None
source_soup = BeautifulSoup(source_data.content,"html.parser")
destination = source_soup.find_all("h3",attrs = {'class': 'toc__item__title' })
for dest in destination:
try:
article_links.append("https://www.annemergmed.com"+dest.a['href'])
except:
pass
source_soup = None
print(article_links)
if __name__ == "__main__":
traverse_source()
Here even after incrementing the page number in the URL, the content of the first webpage is always scraped. I tried navigating through the pages using GET method (changing the URL) but still even after changing the source url, it is still scraping the data of page number 1
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
s = requests.Session()
s.headers.update(headers)
big_list = []
for x in tqdm(range(9)):
r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
soup = BeautifulSoup(r.text, 'html.parser')
titles = soup.select('div.articleCitation')
for t in titles:
url = t.select_one('h3 a').get('href')
header = t.select_one('h3 a').text
try:
authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
except Exception as e:
authors = 'Unknown'
big_list.append((header, f'https://www.annemergmed.com{url}', authors))
df = pd.DataFrame(list(set(big_list)), columns = ['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]

Beautiful soup web scraping returning None-Python

I have a list of movies that I want to scrap the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
list=['Se7en','Cinema Paradiso','The Shining','Toy Story 3','Capernaum']
gen2 = {}
for i in list:
user_query = i +'movie genre'
URL = 'https://www.google.co.in/search?q=' + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
c = soup.find(class_='EDblX DAVP1')
print(c)
if c != None:
genres = c.findAll('a')
gen2[i]= genres
But it returns an empty dict, so I checked one by one and it worked, for example:
import requests
from bs4 import BeautifulSoup
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
h['Se7en']=genre
So I find out that in the for loop the variable c is returning None.
I can't figure out why! It only return None inside the loop.
Currently, your URLs are of the form
URLs
so the returned results(google) aren't accurate for all the movies.
You can change it to
`for i in list:
i="+".join(i.split(" "));
user_query = i + "+movie+genre"
URL = 'https://www.google.com/search?q=+'+user_query`
also, movies that belong to a single genre like Cinema Paradiso are in a div with class name "Z0LcW".

Can't get rid of a loop when certain conditions are met

I've created a script in python to get the first 400 links of search results from bing. It's not sure that there will always be at least 400 results. In this case the number of results is around 300. There are 10 results in it's landing page. However, the rest of the results can be found traversing next pages. The problem is when there is no more next page link in there, the webpage displays the last results over and over again.
Search keyword is michael jackson and ths is a full-fledged link
How can I get rid of the loop when there are no more new results or the results are less than 400?`
I've tried with:
import time
import requests
from bs4 import BeautifulSoup
link = "https://www.bing.com/search?"
params = {'q': 'michael jackson','first': ''}
def get_bing_results(url):
q = 1
while q<=400:
params['first'] = q
res = requests.get(url,params=params,headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
})
soup = BeautifulSoup(res.text,"lxml")
for link in soup.select("#b_results h2 > a"):
print(link.get("href"))
time.sleep(2)
q+=10
if __name__ == '__main__':
get_bing_results(link)
As I mentioned in the comments, couldn't you do something like this:
import time
import requests
from bs4 import BeautifulSoup
link = "https://www.bing.com/search?"
params = {'q': 'michael jackson','first': ''}
def get_bing_results(url):
q = 1
prev_soup = str()
while q <= 400:
params['first'] = q
res = requests.get(url,params=params,headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
})
soup = BeautifulSoup(res.text,"lxml")
if str(soup) != prev_soup:
for link in soup.select("#b_results h2 > a"):
print(link.get("href"))
prev_soup = str(soup)
else:
break
time.sleep(2)
q+=10
if __name__ == '__main__':
get_bing_results(link)

Scraping seat layout page of book my show using python

I am trying to scrape the bookmyshow website for finding out movie details like at what time tickets are available and how many seats are available. I have got to find how to get the show timings in which seats are available but now i want to get total seats avaialble in that show. My code is :
import requests
from bs4 import BeautifulSoup
import json
base_url = "https://in.bookmyshow.com"
s =requests.session()
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
r = s.get("https://in.bookmyshow.com/vizag/movies", headers = headers)
print(r.status_code)
soup = BeautifulSoup(r.text,"html.parser")
movies_list = soup.find("div",{"class":"__col-now-showing"})
movies = movies_list.findAll("a",{"class":"__movie-name"})
for movie in movies:
print(movie.text)
show = []
containers = movies_list.findAll("div",{"class":"card-container"})
for container in containers:
try:
detail = container.find("div",{"class":"__name overflowEllipses"})
button = container.find("div",{"class":"book-button"})
print(detail.text)
print(button.a["href"])
url_ticket = base_url + button.a["href"]
show.append(url_ticket)
except:
pass
for i in show:
print(i)
for t in show:
res = s.get(t,headers=headers)
bs = BeautifulSoup(res.text,"html.parser")
movie_name = bs.find("div",{"class":"cinema-name-wrapper"})
print(movie_name.text.replace(" ","").replace("\t","").replace("\n",""))
venue_list = bs.find("ul",{"id":"venuelist"})
venue_names = venue_list.findAll("li",{"class":"list"})
try:
for i in venue_names:
vn = i.find("div",{"class":"__name"})
print(vn.text.replace(" ","").replace("\t","").replace("\n",""))
show_times = i.findAll("div",{"data-online":"Y"})
for st in show_times:
print(st.text.replace(" ","").replace("\t","").replace("\n",""))
except:
pass
print("\n")
heads = {
"accept":"*/*",
"accept-encoding":"gzip, deflate, br",
"accept-language":"en-US,en;q=0.9",
"origin":"https://in.bookmyshow.com",
"referer":"https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
rr = s.post("https://b-eu.simility.com/b?c=bookmyshow&v=1.905&ec=BLOFaZ2HdToCxwcr&cl=0&si=5a76bfce6ae4a00027767ae9&sc=3B0CB9F4-4A27-4588-9FB4-A2A2760569BC&uc=D834EDA4-57E4-4889-A34F-473AC6BBDDBB&e=Seatlayout&cd=.simility.com&r=0&st=1517731803171&s=792a6c66313a2032223133302633343a2c393a322e3c202422636e312a382037633f3c606669673e61653e6338323230353f3c35616f3b2a2c2269663a203820606765696d7371606f77282e2a61663320327e70756f2e2a63643e20326c776e6e242861643f20326e75666e24206166342a306c75666e2422636e352a386c776e64262073692032223348324b403b4436253e43323d2f3c3538322f314440362f493843323d3438353633404b202e20776b2838224e3a3b34454e433c2f3735473c273638323b2541333e4425363531434b3c40424e464a422226206a66303120326c636c79672422626e303a203864636479672c28716c32342838253131322e2a7966323f203231353b353f31333a323b3b353326207b643428382a32202e207b6e302230767a756526207b663420382a6f6c2d5f512a2c2279663f203859206d642f5559202422656420552e2071663028383026207b6431392032204f6d7861666e6125372630202255616c666d757b2a4c542a33382e3031225f6b6c3436332a7a363e2b2841707a6e6d55676049617e2d3539352633362a2a434a564f4e242a6e6961672847656969672b22416a7a656f6525343b2e3024313a313b2c333b3822536b6469726925373b352c31342a2620736e3338223a2855616c313020242871643b362a3a224d6d67656e67224164612e282e2a73643b342a383a3036242871643b352a3a313f313e2e2071663932203a32343c2c227966393b2038333d39342c28716c323028383a362e20716c38332230303c2c22686639362038767a7f672c28606c313628383b2e206066393d203a282f3a30303f363c353a3a332a2620626e3330223a282024207565332a3076727f672422776d302a385920756d68656c282e2a65787a677a6b6f676c7c6b6e2d7d676a676c285f24207565342a3020576f60436974282e2a756535203228556568496174205d676a454e202e2a7d65323d203274727f6724207565312a30202d3b333c3833323a31333a202e2a7a66312838535b226b72786e6b61637c636d6e257a25676f656564672f616a7a656f6527726c66222620616c766770666b6e2d7a666e2d7663677f6770202e2a496a72656f6d20504e4428526e77656164202c6477646c5d26592a6372726e61696374636d662f706e642a2e206f6a626c606d6e656b666a68607863676d68676c6d6865676e67696f6a62636b202e2a496a72656f6d20504e4428546b67756d78202c6477646c5d26592a6372726e61696374636d662f78276c69616e2e63787a6e6969637c696f642d702f726c636b66202c286b667465786c696e2f6c636b662f7066776f696e282e2a4c63766b7e6f2243666b6d6e74282e66776e6e5f245120617a726469636b76616d6c2d7a257a72617a6b2577696e677e6b6c672f6b6e6f2226207f69646f74616c676166656b66617a766d722e6e6e64202e2055616e6776636c6d2043656c7c676c76224c6f617273727c696f6422456d66776e6d282e223b2c3c2e38243338303b205f5577",headers =heads) # i got the link while i was inspecting the booking tickets page
f = s.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204#!seatlayout") # this is the page gets displayed when we click the show time
ff = f.text
j = json.loads(ff)
print(j)
After i get the source code of this page i can get seats availability easily. But i am unable to get that page. How to do this? Thanks in Advance!
Steps:
1) use selenium to click on the time showing block
driver.find_element_by_xpath('<enter xpath>').click()
find xpath using inspect element and then click on element then copy you will get the option for copy xpath
time.sleep(4) # wait for 4 seconds for the page to appear
2) Get the html source code using
html = driver.page_source
then use beautiful soup to scrap the page
soup = BeautifulSoup(html,'html.parser')
Find all a href tag having class ='_available' and count them and then
find all a href tag having class = '_blocked' and count them
using these data you can find total no of seats and available seats

Want to scrape data using 10 different keywords in URL for 2 pages and write scraped data to csv using Python 3.6.2 and BS4

I have the code ready for one keyword and its working fine. Next problem is I want to do the scrape for 10 different keywords and save them in one csv file with the keyword name on column/row. I think we can give csv file as input and it picks keyword one by one and does scrape. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = "http://www.amazon.in/s/ref=sr_pg_2?
rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
#excluding page from base_url for further adding
res = []
for page in range(1,3):
request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) # here adding page
if request.status_code == 404: #added just in case of error
break
soup = BeautifulSoup(request.content, "lxml")
for url in soup.find_all('li', class_ = 's-result-item'):
res.append([url.get('data-asin'), url.get('id')])
df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords, replace on needed ones.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"
keywords_list = ['helmets for men', 'helmets for women']
keyword = 'helmets for men'
#excluding page from base_url for further adding
res = []
for page in range(1,3):
for keyword in keywords_list:
request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) # here adding page
if request.status_code == 404: #added just in case of error
break
soup = BeautifulSoup(request.content, "lxml")
for url in soup.find_all('li', class_ = 's-result-item'):
res.append([url.get('data-asin'), url.get('id'), keyword])
df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')

Categories

Resources