Web scraping the review section of the Agoda website with a POST request - Python

I need help retrieving the review section from the Agoda website.
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm

filename = "hotel.csv"
f = open(filename, "w", encoding="utf-8")
headers = "title, rating, review\n"
f.write(headers)

api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

# for loop for multiple-page scraping
for x in tqdm(range(1, 10)):
    post_data = {
        "hotelId": "2252947",
        "providerId": "332",
        "demographicId": "0",
        "page": str(x),
        "pageSize": "20",
        "sorting": "7",
        "providerIds": [332],
        "isReviewPage": "false",
        "isCrawlablePage": "true",
        "filters": {"language": [], "room": []},
        "searchKeyword": "",
        "searchFilters": []
    }
    html = requests.post(api_url, data=post_data)
    values = html.text
    soup = BeautifulSoup(values, "html.parser")
    hotels = soup.find_all("div", {"class": "review-comment"})
    for hotel in hotels:
        try:
            rating = hotel.find("div", {"class": "Review-comment-leftScore"}).text
            title = hotel.find("p", {"class": "Review-comment-bodyTitle"}).text
            review = hotel.find("p", {"class": "Review-comment-bodyText"}).text
            f.write(title + ", " + rating + ", " + review + "\n")
        except TypeError:
            continue
f.close()
The POST data is what I get from the Firefox network monitor when I change the page in the review section.
The hotel: Hotel Page
I tried the JSON method, but I don't understand it.

I think your API endpoint or data is wrong, because if you just try to print the response you get <Response [415]>.
It should be 200.
html.json()
{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.13', 'title': 'Unsupported Media Type', 'status': 415, 'traceId': '00-68f23e7f0431e7bffae420112667ed1b-6306a38dd716894d-00'}
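A 415 Unsupported Media Type usually means the endpoint expects a JSON body, while requests' data= sends a form-encoded one. Below is a minimal sketch of the same request sent with json= (which also sets Content-Type: application/json); the site may still require extra headers or cookies, and the response schema isn't shown in this thread, so inspect it before parsing:
import requests

api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
post_data = {
    "hotelId": "2252947",
    "providerId": "332",
    "demographicId": "0",
    "page": "1",
    "pageSize": "20",
    "sorting": "7",
    "providerIds": [332],
    "isReviewPage": "false",
    "isCrawlablePage": "true",
    "filters": {"language": [], "room": []},
    "searchKeyword": "",
    "searchFilters": [],
}

# json= serialises the payload and sets Content-Type: application/json,
# which is exactly what a 415 Unsupported Media Type response is complaining about.
resp = requests.post(api_url, json=post_data, headers=headers)
print(resp.status_code)  # expect 200 instead of 415
if resp.ok:
    data = resp.json()
    # The response schema isn't documented in this thread, so inspect the
    # structure first before pulling out review fields.
    print(type(data))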

Related

Indeed Job Scraper only for Postings with External Link

I am currently using the Python scraper below to pull job title, company, salary, and description. I am looking for a way to take it one step further by filtering only results where the application link is a URL to the company website, as opposed to the 'Easily Apply' postings that send the application through Indeed. Is there a way to do this?
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Software%20Engineer&l=Austin%2C%20TX&ts=1630951951455&rq=1&rsIdx=1&fromage=last&newcount=6&vjk=c8f4815c6ecfa793'
    r = requests.get(url, headers)  # 200 is OK, 404 is page not found
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

# <span title="API Developer"> API Developer </span>
def transform(soup):
    divs = soup.find_all('div', class_='slider_container')
    for item in divs:
        if item.find(class_='label'):
            continue  # need to fix: if it finds a job that has a 'new' span before the title span, it skips the job completely
        title = item.find('span').text.strip()
        company = item.find('span', class_="companyName").text.strip()
        description = item.find('div', class_="job-snippet").text.strip().replace('\n', '')
        try:
            salary = item.find('span', class_="salary-snippet").text.strip()
        except:
            salary = ""
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'description': description
        }
        jobList.append(job)
        # print("Seeking a: "+title+" to join: "+company+" paying: "+salary+". Job description: "+description)
    return

jobList = []

# go through multiple pages
for i in range(0, 100, 10):  # 0-40 stepping in 10's
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

print(len(jobList))
df = pd.DataFrame(jobList)
print(df.head())
df.to_csv('jobs.csv')
My approach is as follows:
find the href from the <a> tag for each job card on the initial page, then send a request to each of those links and grab the external job link (if the "Apply on Company Site" button is available) from there.
Code snippet:
# function which gets external job links
def get_external_link(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # if the "Apply On Company Site" button is available, fetch the link
    if soup.find('a', attrs={"referrerpolicy": "origin"}) is not None:
        external_job_link = soup.find('a', attrs={"referrerpolicy": "origin"})
        print(external_job_link['href'])

# add this piece of code to the transform function
def transform(soup):
    cards = soup.find('div', class_='mosaic-provider-jobcards')
    links = cards.find_all("a", class_=lambda value: value and value.startswith("tapItem"))
    # for each job link on the page, call get_external_link
    for link in links:
        get_external_link('https://www.indeed.com' + link['href'])
Note: you can also use the page source of these new requests to fetch the data (title, company, salary, description) that you previously scraped from the main page.
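To turn that into an actual filter rather than just printed links, one option is to have get_external_link return the href (or None) and only keep jobs that have one. A sketch along those lines, taking the referrerpolicy="origin" and tapItem selectors on trust from the answer above:
import requests
from bs4 import BeautifulSoup

jobList = []

def get_external_link(url):
    """Return the external 'Apply on Company Site' href, or None for 'Easily Apply' postings."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Assumption carried over from the answer: the external-apply anchor has referrerpolicy="origin".
    anchor = soup.find('a', attrs={"referrerpolicy": "origin"})
    return anchor['href'] if anchor is not None else None

def transform(soup):
    cards = soup.find('div', class_='mosaic-provider-jobcards')
    links = cards.find_all("a", class_=lambda value: value and value.startswith("tapItem"))
    for link in links:
        job_url = 'https://www.indeed.com' + link['href']
        external = get_external_link(job_url)
        if external is None:
            continue  # 'Easily Apply' posting -> skip it
        jobList.append({'job_url': job_url, 'external_link': external})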

Scraping live stock prices using Google Finance

I have recently started learning Python, and one of my first projects is to get live stock prices from Google Finance using BeautifulSoup. Basically, I am looking up a stock and setting a price alert.
Here is what my code looks like:
import requests
import time
import tkinter
from bs4 import BeautifulSoup

def st_Price(symbol):
    baseurl = 'http://google.com/finance/quote/'
    URL = baseurl + symbol + ":NSE?hl=en&gl=in"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_="YMlKec fxKbKc")
    result = results.__str__()
    # print(result)
    res = result.split("₹")[1].split("<")[0]
    res_flt = float(res.replace(",", ""))
    return res_flt

def main():
    sym = input("Enter Stock Symbol : ")
    price = input("Enter desired price : ")
    x = st_Price(sym)
    while x < float(price):
        print(x)
        t1 = time.perf_counter()
        x = st_Price(sym)
        t2 = time.perf_counter()
        print("Internal refresh time is {}".format(t2 - t1))
    else:
        print("The Stock {} achieved price greater than {}".format(sym, x))
        root = tkinter.Tk()
        root.geometry("150x150")
        tkinter.messagebox.showinfo(title="Price Alert", message="Stock Price {} greater Than {}".format(x, price))
        root.destroy()

if __name__ == "__main__":
    main()
I am looking up the following class in the page HTML: HTML element for the Stock
The code works perfectly fine, but it takes too much time to fetch the information:
Enter Stock Symbol : INFY
Enter desired price : 1578
1574.0
Internal refresh time is 9.915285099999892
1574.0
Internal refresh time is 7.2284357999997155
I am not too familiar with HTML; by referring to online documentation I was able to figure out how to scrape the necessary part.
Is there any way to reduce the time it takes to fetch the data?
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
Also, when using the requests library, the default user-agent is python-requests, so websites can tell that a bot or script is sending the request rather than a real user. Check what your user-agent is and pass it in the request headers.
To get just the current price you would need the CSS selector AHmHk .fxKbKc via the select_one() bs4 method, which could also change in the future.
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
html = requests.get(f"https://www.google.com/finance/quote/INFY:NSE", headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
current_price = soup.select_one(".zzDege").text
print(current_price)
# ₹1,860.50
Code and full example in the online IDE to scrape current price and right panel data:
from bs4 import BeautifulSoup
import requests, lxml, json
from itertools import zip_longest

def scrape_google_finance(ticker: str):
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    }

    html = requests.get(f"https://www.google.com/finance/quote/{ticker}", headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    ticker_data = {"right_panel_data": {},
                   "ticker_info": {}}

    ticker_data["ticker_info"]["title"] = soup.select_one(".zzDege").text
    ticker_data["ticker_info"]["current_price"] = soup.select_one(".AHmHk .fxKbKc").text

    right_panel_keys = soup.select(".gyFHrc .mfs7Fc")
    right_panel_values = soup.select(".gyFHrc .P6K39c")

    for key, value in zip_longest(right_panel_keys, right_panel_values):
        key_value = key.text.lower().replace(" ", "_")
        ticker_data["right_panel_data"][key_value] = value.text

    return ticker_data

data = scrape_google_finance(ticker="INFY:NSE")

# ensure_ascii=False to display the Indian Rupee ₹ symbol
print(json.dumps(data, indent=2, ensure_ascii=False))
print(data["right_panel_data"].get("ceo"))
Outputs:
{
  "right_panel_data": {
    "previous_close": "₹1,882.95",
    "day_range": "₹1,857.15 - ₹1,889.60",
    "year_range": "₹1,311.30 - ₹1,953.90",
    "market_cap": "7.89T INR",
    "p/e_ratio": "36.60",
    "dividend_yield": "1.61%",
    "primary_exchange": "NSE",
    "ceo": "Salil Parekh",
    "founded": "Jul 2, 1981",
    "headquarters": "Bengaluru, KarnatakaIndia",
    "website": "infosys.com",
    "employees": "292,067"
  },
  "ticker_info": {
    "title": "Infosys Ltd",
    "current_price": "₹1,860.50"
  }
}
Salil Parekh
If you want to scrape more data with a line-by-line explanation, there's a Scrape Google Finance Ticker Quote Data in Python blog post of mine.
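On the original timing concern: a good chunk of each refresh is the new HTTPS connection that requests opens on every call. Reusing a single requests.Session keeps the connection alive between refreshes; a minimal sketch, assuming the page structure (and the YMlKec fxKbKc class from the question) is unchanged:
import requests
from bs4 import BeautifulSoup

# One Session reuses the underlying TCP/TLS connection across calls,
# which removes most of the per-request handshake overhead.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
})

def st_price(symbol: str) -> float:
    url = f"https://www.google.com/finance/quote/{symbol}:NSE?hl=en&gl=in"
    page = session.get(url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    # Same class name as the question's code; Google Finance class names can change at any time.
    element = soup.find(class_="YMlKec fxKbKc")
    return float(element.text.replace("₹", "").replace(",", ""))

print(st_price("INFY"))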

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website. I am using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I use wget from the server to request the same page URL, I get the full page; it is only my script that does not work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on my local machine:
[
    {
        "title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
    }
]
But I don't get any response when running it on my server; it just shows this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be done, I would be grateful to hear it.
I've tried your code on an EC2 instance in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double-check the headers (I've modified yours a bit - see the code below)
check your query input; maybe some of it is malformed?
don't spray the Rakuten server with too many requests from one EC2; maybe they're blocking you already (see the sketch after the code below)
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
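On the last point in the checklist above (not hammering the server), a small hedged addition is to space the queries out; the delay range below is an arbitrary choice, not something Rakuten documents, and the helper simply wraps main_rakuten_product_search() from the tuned code:
import time
import random

def polite_search(queries, min_delay=2.0, max_delay=5.0):
    """Run several searches with a random pause between them.

    Builds on main_rakuten_product_search() defined above; the delay range
    is an arbitrary choice, so tune it to whatever the site tolerates.
    """
    results = {}
    for q in queries:
        results[q] = main_rakuten_product_search(q)
        time.sleep(random.uniform(min_delay, max_delay))  # back off between requests
    return results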

Scraping the seat layout page of BookMyShow using Python

I am trying to scrape the BookMyShow website to find out movie details, such as at what times tickets are available and how many seats are available. I have figured out how to get the show timings in which seats are available, but now I want to get the total seats available for that show. My code is:
import requests
from bs4 import BeautifulSoup
import json

base_url = "https://in.bookmyshow.com"
s = requests.session()
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
r = s.get("https://in.bookmyshow.com/vizag/movies", headers=headers)
print(r.status_code)
soup = BeautifulSoup(r.text, "html.parser")
movies_list = soup.find("div", {"class": "__col-now-showing"})
movies = movies_list.findAll("a", {"class": "__movie-name"})
for movie in movies:
    print(movie.text)

show = []
containers = movies_list.findAll("div", {"class": "card-container"})
for container in containers:
    try:
        detail = container.find("div", {"class": "__name overflowEllipses"})
        button = container.find("div", {"class": "book-button"})
        print(detail.text)
        print(button.a["href"])
        url_ticket = base_url + button.a["href"]
        show.append(url_ticket)
    except:
        pass
for i in show:
    print(i)

for t in show:
    res = s.get(t, headers=headers)
    bs = BeautifulSoup(res.text, "html.parser")
    movie_name = bs.find("div", {"class": "cinema-name-wrapper"})
    print(movie_name.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    venue_list = bs.find("ul", {"id": "venuelist"})
    venue_names = venue_list.findAll("li", {"class": "list"})
    try:
        for i in venue_names:
            vn = i.find("div", {"class": "__name"})
            print(vn.text.replace(" ", "").replace("\t", "").replace("\n", ""))
            show_times = i.findAll("div", {"data-online": "Y"})
            for st in show_times:
                print(st.text.replace(" ", "").replace("\t", "").replace("\n", ""))
    except:
        pass
    print("\n")

heads = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "origin": "https://in.bookmyshow.com",
    "referer": "https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
rr = s.post("https://b-eu.simility.com/b?c=bookmyshow&v=1.905&ec=BLOFaZ2HdToCxwcr&cl=0&si=5a76bfce6ae4a00027767ae9&sc=3B0CB9F4-4A27-4588-9FB4-A2A2760569BC&uc=D834EDA4-57E4-4889-A34F-473AC6BBDDBB&e=Seatlayout&cd=.simility.com&r=0&st=1517731803171&s=792a6c66313a2032223133302633343a2c393a322e3c202422636e312a382037633f3c606669673e61653e6338323230353f3c35616f3b2a2c2269663a203820606765696d7371606f77282e2a61663320327e70756f2e2a63643e20326c776e6e242861643f20326e75666e24206166342a306c75666e2422636e352a386c776e64262073692032223348324b403b4436253e43323d2f3c3538322f314440362f493843323d3438353633404b202e20776b2838224e3a3b34454e433c2f3735473c273638323b2541333e4425363531434b3c40424e464a422226206a66303120326c636c79672422626e303a203864636479672c28716c32342838253131322e2a7966323f203231353b353f31333a323b3b353326207b643428382a32202e207b6e302230767a756526207b663420382a6f6c2d5f512a2c2279663f203859206d642f5559202422656420552e2071663028383026207b6431392032204f6d7861666e6125372630202255616c666d757b2a4c542a33382e3031225f6b6c3436332a7a363e2b2841707a6e6d55676049617e2d3539352633362a2a434a564f4e242a6e6961672847656969672b22416a7a656f6525343b2e3024313a313b2c333b3822536b6469726925373b352c31342a2620736e3338223a2855616c313020242871643b362a3a224d6d67656e67224164612e282e2a73643b342a383a3036242871643b352a3a313f313e2e2071663932203a32343c2c227966393b2038333d39342c28716c323028383a362e20716c38332230303c2c22686639362038767a7f672c28606c313628383b2e206066393d203a282f3a30303f363c353a3a332a2620626e3330223a282024207565332a3076727f672422776d302a385920756d68656c282e2a65787a677a6b6f676c7c6b6e2d7d676a676c285f24207565342a3020576f60436974282e2a756535203228556568496174205d676a454e202e2a7d65323d203274727f6724207565312a30202d3b333c3833323a31333a202e2a7a66312838535b226b72786e6b61637c636d6e257a25676f656564672f616a7a656f6527726c66222620616c766770666b6e2d7a666e2d7663677f6770202e2a496a72656f6d20504e4428526e77656164202c6477646c5d26592a6372726e61696374636d662f706e642a2e206f6a626c606d6e656b666a68607863676d68676c6d6865676e67696f6a62636b202e2a496a72656f6d20504e4428546b67756d78202c6477646c5d26592a6372726e61696374636d662f78276c69616e2e63787a6e6969637c696f642d702f726c636b66202c286b667465786c696e2f6c636b662f7066776f696e282e2a4c63766b7e6f2243666b6d6e74282e66776e6e5f245120617a726469636b76616d6c2d7a257a72617a6b2577696e677e6b6c672f6b6e6f2226207f69646f74616c676166656b66617a766d722e6e6e64202e2055616e6776636c6d2043656c7c676c76224c6f617273727c696f6422456d66776e6d282e223b2c3c2e38243338303b205f5577",headers =heads) # i got the link while i was inspecting the booking tickets page
f = s.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204#!seatlayout") # this is the page gets displayed when we click the show time
ff = f.text
j = json.loads(ff)
print(j)
After I get the source code of this page, I can get the seat availability easily. But I am unable to get that page. How can I do this? Thanks in advance!
Steps:
1) Use Selenium to click on the show-time block:
driver.find_element_by_xpath('<enter xpath>').click()
Find the XPath using "Inspect element": right-click the element in the inspector and you will get the option to copy its XPath.
time.sleep(4)  # wait 4 seconds for the page to appear
2) Get the HTML source code using:
html = driver.page_source
Then use Beautiful Soup to scrape the page:
soup = BeautifulSoup(html, 'html.parser')
Find all <a> tags with class '_available' and count them, then find all <a> tags with class '_blocked' and count them.
Using these numbers you can work out the total number of seats and the number of available seats, as in the sketch below.
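Putting the two steps together, a rough sketch of the counting; the '<enter xpath>' placeholder, the _available/_blocked class names, and the older Selenium find_element_by_xpath API are all taken from the answer above rather than verified against the live page:
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://in.bookmyshow.com/buytickets/chalo-vizag/movie-viza-ET00064364-MT/20180204")

# The XPath is a placeholder -- copy it from "Inspect element -> Copy XPath" as described above.
driver.find_element_by_xpath('<enter xpath>').click()
time.sleep(4)  # wait for the seat layout to render

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Class names assumed from the answer above; verify them against the live page source.
available = soup.find_all('a', class_='_available')
blocked = soup.find_all('a', class_='_blocked')

print("Available seats:", len(available))
print("Total seats:", len(available) + len(blocked))

driver.quit()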

Want to scrape data using 10 different keywords in the URL for 2 pages and write the scraped data to CSV using Python 3.6.2 and BS4

I have the code ready for one keyword and it is working fine. The next problem is that I want to run the scrape for 10 different keywords and save the results in one CSV file, with the keyword name in a column/row. I think we can give a CSV file as input so that it picks the keywords one by one and scrapes them. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
# excluding page from base_url for further adding

res = []
for page in range(1, 3):
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])

df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords; replace them with the ones you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"

keywords_list = ['helmets for men', 'helmets for women']
keyword = 'helmets for men'
# excluding page from base_url for further adding

res = []
for page in range(1, 3):
    for keyword in keywords_list:
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
        if request.status_code == 404:  # added just in case of error
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')
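If the keywords really should come from a CSV file, as the question suggests, they can be read into keywords_list before the loop. A minimal sketch, assuming a file named keywords.csv with one keyword per row in its first column (both the filename and the layout are assumptions):
import csv

# Assumed layout: keywords.csv has one keyword per row in its first column.
with open('keywords.csv', newline='', encoding='utf-8') as f:
    keywords_list = [row[0] for row in csv.reader(f) if row]

print(keywords_list)  # e.g. ['helmets for men', 'helmets for women']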
