My question can be explained quite well with the attached screenshot.
I am scraping the following page: https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1
Table 1 lists the team. The second column contains the player. I need the link, as you can see in the screenshot at the bottom left.
When I look at the data frame normally, I only get the following in this cell: "Oliver BaumannO. BaumannTorwart". But I am looking for "https://www.transfermarkt.de/oliver-baumann/profil/spieler/55089".
You guys got any ideas?
Code:
import pandas as pd
import requests

# Global variables
HEADS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
dateiname = 'test.xlsx'


def get_response(url):
    # Perform the URL request
    try:
        response = requests.get(url, headers=HEADS)
    except AttributeError:
        print('AttributeError')
    return response


def scraping_kader(response):
    try:
        dfs = pd.read_html(response.text)
        #dfs = dfs.to_html(escape=False)
        print(dfs[1])
        print(dfs[1].iloc[0, :])
    except ImportError:
        print(' ImportError')
    except ValueError:
        print(' ValueError')
    except AttributeError:
        print(' AttributeError')


response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
As far as I know, read_html() only gets the text from the table; it doesn't care about links, hidden elements, attributes, etc.
You need a module like BeautifulSoup or lxml to work with the full HTML and pull out the needed information manually.
soup = BeautifulSoup(response.text, 'html.parser')

all_tooltips = soup.find_all('td', class_='hauptlink')
for item in all_tooltips:
    item = item.find('a', class_='spielprofil_tooltip')
    if item:
        print(item['href'])  #, item.text)
This example only gets the links, but you can get other elements in the same way.
import requests
from bs4 import BeautifulSoup
#import pandas as pd

HEADS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}


def get_response(url):
    try:
        response = requests.get(url, headers=HEADS)
    except AttributeError:
        print('AttributeError')
    return response


def scraping_kader(response):
    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        all_tooltips = soup.find_all('td', class_='hauptlink')
        for item in all_tooltips:
            item = item.find('a', class_='spielprofil_tooltip')
            if item:
                print(item['href'])  #, item.text)

        #print(dfs[1])
        #print(dfs[1].iloc[0, :])
    except ImportError:
        print(' ImportError')
    except ValueError:
        print(' ValueError')
    except AttributeError:
        print(' AttributeError')


# --- main ---
response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
Result
/oliver-baumann/profil/spieler/55089
/philipp-pentke/profil/spieler/8246
/luca-philipp/profil/spieler/432671
/stefan-posch/profil/spieler/223974
/kevin-vogt/profil/spieler/84435
/benjamin-hubner/profil/spieler/52348
/kevin-akpoguma/profil/spieler/160241
/kasim-adams/profil/spieler/263801
/ermin-bicakcic/profil/spieler/51676
/havard-nordtveit/profil/spieler/42234
/melayro-bogarde/profil/spieler/476915
/konstantinos-stafylidis/profil/spieler/148967
/pavel-kaderabek/profil/spieler/143798
/joshua-brenet/profil/spieler/207006
/florian-grillitsch/profil/spieler/195736
/diadie-samassekou/profil/spieler/315604
/dennis-geiger/profil/spieler/251309
/ilay-elmkies/profil/spieler/443752
/christoph-baumgartner/profil/spieler/324278
/mijat-gacinovic/profil/spieler/215864
/jacob-bruun-larsen/profil/spieler/293281
/sargis-adamyan/profil/spieler/125614
/felipe-pires/profil/spieler/327911
/robert-skov/profil/spieler/270393
/ihlas-bebou/profil/spieler/237164
/andrej-kramaric/profil/spieler/46580
/ishak-belfodil/profil/spieler/111039
/munas-dabbur/profil/spieler/145866
/klauss/profil/spieler/498862
/maximilian-beier/profil/spieler/578392
That helps me.
I have now copied the table with pandas and replaced the column containing the name with the links from your BS4 code. Works!
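For reference, a minimal sketch of that combination; it assumes table index 1 is still the squad table and that the scraped profile links come back in the same order as the table rows:

import pandas as pd
import requests
from bs4 import BeautifulSoup

HEADS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
url = 'https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1'

response = requests.get(url, headers=HEADS)

# squad table, as read before
squad = pd.read_html(response.text)[1]

# player profile links, in document order
soup = BeautifulSoup(response.text, 'html.parser')
links = []
for td in soup.find_all('td', class_='hauptlink'):
    a = td.find('a', class_='spielprofil_tooltip')
    if a:
        links.append('https://www.transfermarkt.de' + a['href'])

# attach the links only if the counts line up (the table may contain extra rows)
if len(links) == len(squad):
    squad['Profil-Link'] = links
    squad.to_excel('test.xlsx', index=False)
else:
    print(f'{len(squad)} table rows vs. {len(links)} links - check the alignment')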
Related
So basically I am using the code below to scrape the image URLs of the credit cards from the respective links in the explore_more_url variable.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json, requests, re
from selenium import webdriver
driver = webdriver.Chrome(executable_path="C:\\Users\\Hari\\Downloads\\chromedriver.exe")
img_url = []
explore_more_url = ['https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits', 'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account', 'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits', 'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card', 'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits']
for x in explore_more_url:
    driver.get(x)
    soup_1 = BeautifulSoup(driver.page_source, 'lxml')
    img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))

print(img_url)
Output :
Traceback (most recent call last):
File "C:\Users\Hari\PycharmProjects\Card_Prj\axis.py", line 82, in <module>
img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))
AttributeError: 'NoneType' object has no attribute 'get'
The images are something like this in each link:
What is the appropriate code that I could use so that I can get exactly what I am expecting?
One way of getting the image might be this:
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

page = requests.get("https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card", headers=headers).text
img_src_ = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')["src"]

with open(img_src_.rsplit("/")[-1], "wb") as image:
    image.write(requests.get(f"https://www.axisbank.com{img_src_}").content)
Output: a .jpg file in the script's local directory:
ace-product-landing-web-version-1920x360.jpg
EDIT: To get just the source urls, try this:
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
explore_more_url = [
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits',
'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account',
'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits',
'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card',
'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits',
]
img_urls = []
for url in explore_more_url:
    page = requests.get(url, headers=headers).text
    try:
        img_src_ = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')["src"]
        print(f"Finding image source url for {url}")
        img_urls.append(f"https://www.axisbank.com{img_src_}")
    except (KeyError, TypeError):
        continue

print(img_urls)
Output:
['https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/ace-product-landing-web-version-1920x360.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/health-and-wellness-product-page-1920x360_v1.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/freecharge-product-landing-page-desktop-banner-revised.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/indian-oil-banner-desktop.jpg', 'https://www.axisbank.com/img/magnuscard/apply-now.png', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/flipkart-abcc-desk.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/product-landing-page-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/neo-credit-card-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/privilege-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/miles---more-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/desktop-select-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/insta-easy-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/platinum-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/my-wings-credit-card.jpg']
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json


def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id ="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY


file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()

    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"

    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])

    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
File "scra.py", line 17, in check
doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting page to pagedata = page.json() but it only made it worse.
Try using this instead of html.fromstring
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
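For context, the AttributeError most likely appears because the loop further down reassigns the name html (html = req.get(i)), which shadows the from lxml import html module import; by the time check() runs, html is a plain string rather than the lxml.html module, so renaming that variable would also fix it. Either way, here is a minimal sketch of check() reworked around BeautifulSoup, targeting the same id="availability" element the original XPath pointed at:

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


def check(url):
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'html.parser')
    # same element the XPath targeted: <div id="availability">...</div>
    availability = soup.find('div', id='availability')
    return availability.get_text(strip=True) if availability else None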
I've created a script in Python to get the first 400 links of search results from Bing. There may not always be 400 results; in this case there are around 300. The landing page holds 10 results, and the rest can be found by traversing the next pages. The problem is that when there is no more next-page link, the webpage displays the last results over and over again.
The search keyword is michael jackson and this is a full-fledged link.
How can I get rid of the loop when there are no more new results or the results are fewer than 400?
I've tried with:
import time
import requests
from bs4 import BeautifulSoup

link = "https://www.bing.com/search?"
params = {'q': 'michael jackson', 'first': ''}


def get_bing_results(url):
    q = 1
    while q <= 400:
        params['first'] = q
        res = requests.get(url, params=params, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
        })
        soup = BeautifulSoup(res.text, "lxml")
        for link in soup.select("#b_results h2 > a"):
            print(link.get("href"))
        time.sleep(2)
        q += 10


if __name__ == '__main__':
    get_bing_results(link)
As I mentioned in the comments, couldn't you do something like this:
import time
import requests
from bs4 import BeautifulSoup

link = "https://www.bing.com/search?"
params = {'q': 'michael jackson', 'first': ''}


def get_bing_results(url):
    q = 1
    prev_soup = str()
    while q <= 400:
        params['first'] = q
        res = requests.get(url, params=params, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
        })
        soup = BeautifulSoup(res.text, "lxml")
        if str(soup) != prev_soup:
            for link in soup.select("#b_results h2 > a"):
                print(link.get("href"))
            prev_soup = str(soup)
        else:
            break
        time.sleep(2)
        q += 10


if __name__ == '__main__':
    get_bing_results(link)
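If Bing injects anything dynamic into the markup (timestamps, tracking parameters), str(soup) may never repeat exactly and the page comparison could fail to trigger. Under that assumption, an alternative sketch that de-duplicates by the result hrefs themselves and stops once a page yields nothing new:

import time
import requests
from bs4 import BeautifulSoup

link = "https://www.bing.com/search?"
params = {'q': 'michael jackson', 'first': ''}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}


def get_bing_results(url, limit=400):
    seen = set()
    q = 1
    while q <= limit:
        params['first'] = q
        res = requests.get(url, params=params, headers=headers)
        soup = BeautifulSoup(res.text, "lxml")
        hrefs = [a.get("href") for a in soup.select("#b_results h2 > a")]
        new = [h for h in hrefs if h and h not in seen]
        if not new:  # nothing unseen on this page -> stop paging
            break
        for h in new:
            print(h)
            seen.add(h)
        time.sleep(2)
        q += 10


if __name__ == '__main__':
    get_bing_results(link)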
I am trying to scrape the bookmyshow website to find out movie details, such as at what times tickets are available and how many seats are free. I have figured out how to get the show timings in which seats are available, but now I want to get the total seats available in that show. My code is:
import requests
from bs4 import BeautifulSoup
import json

base_url = "https://in.bookmyshow.com"
s = requests.session()
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

r = s.get("https://in.bookmyshow.com/vizag/movies", headers=headers)
print(r.status_code)

soup = BeautifulSoup(r.text, "html.parser")
movies_list = soup.find("div", {"class": "__col-now-showing"})
movies = movies_list.findAll("a", {"class": "__movie-name"})
for movie in movies:
    print(movie.text)

show = []
containers = movies_list.findAll("div", {"class": "card-container"})
for container in containers:
    try:
        detail = container.find("div", {"class": "__name overflowEllipses"})
        button = container.find("div", {"class": "book-button"})
        print(detail.text)
        print(button.a["href"])
        url_ticket = base_url + button.a["href"]
        show.append(url_ticket)
    except:
        pass

for i in show:
    print(i)

for t in show:
    res = s.get(t, headers=headers)
    bs = BeautifulSoup(res.text, "html.parser")
    movie_name = bs.find("div", {"class": "cinema-name-wrapper"})
    print(movie_name.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    venue_list = bs.find("ul", {"id": "venuelist"})
    venue_names = venue_list.findAll("li", {"class": "list"})
    try:
        for i in venue_names:
            vn = i.find("div", {"class": "__name"})
            print(vn.text.replace(" ", "").replace("\t", "").replace("\n", ""))
            show_times = i.findAll("div", {"data-online": "Y"})
            for st in show_times:
                print(st.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    except:
        pass
    print("\n")
heads = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "origin": "https://in.bookmyshow.com",
    "referer": "https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
rr = s.post("https://b-eu.simility.com/b?c=bookmyshow&v=1.905&ec=BLOFaZ2HdToCxwcr&cl=0&si=5a76bfce6ae4a00027767ae9&sc=3B0CB9F4-4A27-4588-9FB4-A2A2760569BC&uc=D834EDA4-57E4-4889-A34F-473AC6BBDDBB&e=Seatlayout&cd=.simility.com&r=0&st=1517731803171&s=792a6c66313a2032223133302633343a2c393a322e3c202422636e312a382037633f3c606669673e61653e6338323230353f3c35616f3b2a2c2269663a203820606765696d7371606f77282e2a61663320327e70756f2e2a63643e20326c776e6e242861643f20326e75666e24206166342a306c75666e2422636e352a386c776e64262073692032223348324b403b4436253e43323d2f3c3538322f314440362f493843323d3438353633404b202e20776b2838224e3a3b34454e433c2f3735473c273638323b2541333e4425363531434b3c40424e464a422226206a66303120326c636c79672422626e303a203864636479672c28716c32342838253131322e2a7966323f203231353b353f31333a323b3b353326207b643428382a32202e207b6e302230767a756526207b663420382a6f6c2d5f512a2c2279663f203859206d642f5559202422656420552e2071663028383026207b6431392032204f6d7861666e6125372630202255616c666d757b2a4c542a33382e3031225f6b6c3436332a7a363e2b2841707a6e6d55676049617e2d3539352633362a2a434a564f4e242a6e6961672847656969672b22416a7a656f6525343b2e3024313a313b2c333b3822536b6469726925373b352c31342a2620736e3338223a2855616c313020242871643b362a3a224d6d67656e67224164612e282e2a73643b342a383a3036242871643b352a3a313f313e2e2071663932203a32343c2c227966393b2038333d39342c28716c323028383a362e20716c38332230303c2c22686639362038767a7f672c28606c313628383b2e206066393d203a282f3a30303f363c353a3a332a2620626e3330223a282024207565332a3076727f672422776d302a385920756d68656c282e2a65787a677a6b6f676c7c6b6e2d7d676a676c285f24207565342a3020576f60436974282e2a756535203228556568496174205d676a454e202e2a7d65323d203274727f6724207565312a30202d3b333c3833323a31333a202e2a7a66312838535b226b72786e6b61637c636d6e257a25676f656564672f616a7a656f6527726c66222620616c766770666b6e2d7a666e2d7663677f6770202e2a496a72656f6d20504e4428526e77656164202c6477646c5d26592a6372726e61696374636d662f706e642a2e206f6a626c606d6e656b666a68607863676d68676c6d6865676e67696f6a62636b202e2a496a72656f6d20504e4428546b67756d78202c6477646c5d26592a6372726e61696374636d662f78276c69616e2e63787a6e6969637c696f642d702f726c636b66202c286b667465786c696e2f6c636b662f7066776f696e282e2a4c63766b7e6f2243666b6d6e74282e66776e6e5f245120617a726469636b76616d6c2d7a257a72617a6b2577696e677e6b6c672f6b6e6f2226207f69646f74616c676166656b66617a766d722e6e6e64202e2055616e6776636c6d2043656c7c676c76224c6f617273727c696f6422456d66776e6d282e223b2c3c2e38243338303b205f5577",headers =heads) # i got the link while i was inspecting the booking tickets page
f = s.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204#!seatlayout")  # this is the page that gets displayed when we click the show time
ff = f.text
j = json.loads(ff)
print(j)
Once I get the source code of this page I can get the seat availability easily, but I am unable to get that page. How do I do this? Thanks in advance!
Steps:
1) Use Selenium to click on the block showing the time:
driver.find_element_by_xpath('<enter xpath>').click()
To find the XPath, use inspect element, right-click the element, and choose Copy; you will get the option to copy the XPath.
time.sleep(4)  # wait 4 seconds for the page to appear
2) Get the HTML source code using:
html = driver.page_source
Then use Beautiful Soup to scrape the page:
soup = BeautifulSoup(html, 'html.parser')
Find all a tags with class '_available' and count them, then find all a tags with class '_blocked' and count them. Using these counts you can work out the total number of seats and the available seats; see the sketch below.
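Here is a minimal sketch tying these steps together; it assumes the seat-layout anchors really carry the '_available' and '_blocked' classes mentioned above, and the XPath placeholder still has to be filled in from inspect element:

import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204")

# 1) click the show-time block (replace the placeholder with the copied XPath)
driver.find_element_by_xpath('<enter xpath>').click()
time.sleep(4)  # wait for the seat layout to appear

# 2) parse the rendered page and count seats by class
soup = BeautifulSoup(driver.page_source, 'html.parser')
available = soup.find_all('a', class_='_available')
blocked = soup.find_all('a', class_='_blocked')

print("available seats:", len(available))
print("blocked seats:", len(blocked))
print("total seats:", len(available) + len(blocked))

driver.quit()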
I have the code ready for one keyword and it's working fine. The next problem is that I want to run the scrape for 10 different keywords and save them in one CSV file with the keyword name in a column/row. I think we could give a CSV file as input so it picks the keywords one by one and does the scrape. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
#excluding page from base_url for further adding
res = []

for page in range(1, 3):
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])

df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords; replace them with the ones you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"

keywords_list = ['helmets for men', 'helmets for women']

#excluding page from base_url for further adding
res = []

for page in range(1, 3):
    for keyword in keywords_list:
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
        if request.status_code == 404:  # added just in case of error
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')
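If the keywords should come from a CSV file instead, as suggested in the question, the hard-coded list could be replaced with something like the snippet below; the file name keywords.csv and the column name keyword are assumptions, and the rest of the script stays the same.

import pandas as pd

# hypothetical input file: keywords.csv with a single column named "keyword"
keywords_list = pd.read_csv('keywords.csv')['keyword'].dropna().tolist()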