I have recently started learning python and one of my first project is to get live stock prices from Google finance using beautifulsoup. Basically I am looking up for a stock and setting a price alert.
here is what my code looks like.
import requests
import time
import tkinter
from bs4 import BeautifulSoup
def st_Price(symbol):
baseurl = 'http://google.com/finance/quote/'
URL = baseurl + symbol + ":NSE?hl=en&gl=in"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(class_="YMlKec fxKbKc")
result = results.__str__()
#print(result)
res = result.split("₹")[1].split("<")[0]
res_flt = float(res.replace(",",""))
return res_flt
def main():
sym = input("Enter Stock Symbol : ")
price = input("Enter desired price : ")
x = st_Price(sym)
while x < float(price):
print(x)
t1 = time.perf_counter()
x = st_Price(sym)
t2 = time.perf_counter()
print("Internal refresh time is {}".format(t2-t1))
else:
print("The Stock {} achieved price greater than {}".format(sym,x))
root = tkinter.Tk()
root.geometry("150x150")
tkinter.messagebox.showinfo(title="Price Alert",message="Stock Price {} greater Than {}".format(x,price))
root.destroy()
if __name__ == "__main__":
main()
I am looking up following class in the Page HTML:
HTML element for the Stock
The code works perfectly fine but it takes too much time to fetch the information:
Enter Stock Symbol : INFY
Enter desired price : 1578
1574.0
Internal refresh time is 9.915285099999892
1574.0
Internal refresh time is 7.2284357999997155
I am not too much familiar with HTML. By referring online documentation I was able to figure out how to scrape necessary part.
Is there any way to reduce the time to fetch the data ?
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
Also, when using the requests library, the default requests user-agent is python-requests so websites understand that it's a bot or a script that sends a request, not a real user. Check what's your user-agent and pass it request headers.
To get just the current price you would need to use such CSS selector AHmHk .fxKbKc via the select_one() bs4 method, which could also change in the future.
from bs4 import BeautifulSoup
import requests, lxml
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
html = requests.get(f"https://www.google.com/finance/quote/INFY:NSE", headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
current_price = soup.select_one(".zzDege").text
print(current_price)
# ₹1,860.50
Code and full example in the online IDE to scrape current price and right panel data:
from bs4 import BeautifulSoup
import requests, lxml, json
from itertools import zip_longest
def scrape_google_finance(ticker: str):
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
# https://www.whatismybrowser.com/detect/what-is-my-user-agent
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
html = requests.get(f"https://www.google.com/finance/quote/{ticker}", headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
ticker_data = {"right_panel_data": {},
"ticker_info": {}}
ticker_data["ticker_info"]["title"] = soup.select_one(".zzDege").text
ticker_data["ticker_info"]["current_price"] = soup.select_one(".AHmHk .fxKbKc").text
right_panel_keys = soup.select(".gyFHrc .mfs7Fc")
right_panel_values = soup.select(".gyFHrc .P6K39c")
for key, value in zip_longest(right_panel_keys, right_panel_values):
key_value = key.text.lower().replace(" ", "_")
ticker_data["right_panel_data"][key_value] = value.text
return ticker_data
data = scrape_google_finance(ticker="INFY:NSE")
# ensure_ascii=False to display Indian Rupee ₹ symbol
print(json.dumps(data, indent=2, ensure_ascii=False))
print(data["right_panel_data"].get("ceo"))
Outputs:
{
"right_panel_data": {
"previous_close": "₹1,882.95",
"day_range": "₹1,857.15 - ₹1,889.60",
"year_range": "₹1,311.30 - ₹1,953.90",
"market_cap": "7.89T INR",
"p/e_ratio": "36.60",
"dividend_yield": "1.61%",
"primary_exchange": "NSE",
"ceo": "Salil Parekh",
"founded": "Jul 2, 1981",
"headquarters": "Bengaluru, KarnatakaIndia",
"website": "infosys.com",
"employees": "292,067"
},
"ticker_info": {
"title": "Infosys Ltd",
"current_price": "₹1,860.50"
}
}
Salil Parekh
If you want to scrape more data with a line-by-line explanation, there's a Scrape Google Finance Ticker Quote Data in Python blog post of mine.
Related
I am trying to run this code in idle 3.10.6 and I am not seeing any kind of data that should be extracted from Indeed. All this data should be in the output when I run it but it isn't. Below is the input statement
#Indeed data
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(page):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko"}
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
r = requests.get(url,headers)
soup = BeautifulSoup(r.content, "html.parser")
return soup
def transform(soup):
divs = soup.find_all("div", class_ = "jobsearch-SerpJobCard")
for item in divs:
title = item.find ("a").text.strip()
company = item.find("span", class_="company").text.strip()
try:
salary = item.find("span", class_ = "salarytext").text.strip()
finally:
salary = ""
summary = item.find("div",{"class":"summary"}).text.strip().replace("\n","")
job = {
"title":title,
"company":company,
'salary':salary,
"summary":summary
}
joblist.append(job)
joblist = []
for i in range(0,40,10):
print(f'Getting page, {i}')
c = extract(10)
transform(c)
df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
Here is the output I get
Getting page, 0
Getting page, 10
Getting page, 20
Getting page, 30
Empty DataFrame
Columns: []
Index: []
Why is this going on and what should I do to get that extracted data from indeed? What I am trying to get is the jobtitle,company,salary, and summary information. Any help would be greatly apprieciated.
The URL string includes {page}, bit it's not an f-string, so it's not being interpolated, and the URL you are fetching is:
https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}
That returns an error page.
So you should add an f before opening quote when you set url.
Also, you are calling extract(10) each time, instead of extract(i).
This is the correct way of using url
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}".format(page=page)
r = requests.get(url,headers)
here r.status_code gives an error 403 which means the request is forbidden.The site will block your request from fullfilling.use indeed job search Api
need help with retrieving review section from agoda website.
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm
filename = "hotel.csv"
f = open(filename, "w", encoding="utf-8")
headers = "title, rating, review\n"
f.write(headers)
api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
# for loop for multiple page scrap
for x in tqdm(range(1,10)):
post_data = {"hotelId":"2252947",
"providerId":"332",
"demographicId":"0",
"page":str(x),
"pageSize":"20",
"sorting":"7",
"providerIds":[332],
"isReviewPage":"false",
"isCrawlablePage":"true",
"filters":{"language":[],"room":[]},
"searchKeyword":"",
"searchFilters":[]}
html = requests.post(api_url, data=post_data)
values = html.text
soup = BeautifulSoup(values, "html.parser")
hotels = soup.find_all("div", {"class": "review-comment"})
for hotel in hotels:
try:
rating = hotel.find("div", {"class":"Review-comment-leftScore"}).text
title = hotel.find("p", {"class":"Review-comment-bodyTitle"}).text
review = hotel.find("p", {"class":"Review-comment-bodyText"}).text
f.write(title + ", "+ rating + ", " + review + "\n")
except TypeError:
continue
f.close()
post data i get from firefox network monitor when i change the page on the review section.
the hotel: Hotel Page
tried the json method but i dont understand
I think your api endpoint or data is wrong. 'cause if you try just to print, you get <Response [415]>.
Should be 200.
html.json()
{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.13', 'title': 'Unsupported Media Type', 'status': 415, 'traceId': '00-68f23e7f0431e7bffae420112667ed1b-6306a38dd716894d-00'}
Hi guys i am trying to create a program in python that compares prices from websites but i cant get the prices. I have managed to ge the title of the product and the quantity using the code bellow.
page = requests.get(urls[7],headers=Headers)
soup = BeautifulSoup(page.text, 'html.parser')
title = soup.find("h1",{"class" : "Titlestyles__TitleStyles-sc-6rxg4t-0 fDKOTS"}).get_text().strip()
quantity = soup.find("li", class_="quantity").get_text().strip()
total_price = soup.find('div', class_='Pricestyles__ProductPriceStyles-sc-118x8ec-0 fzwZWj price')
print(title)
print(quantity)
print(total_price)
Iam trying to get the price from this website (Iam creating a program do look for diper prices lol) https://www.drogasil.com.br/fralda-huggies-tripla-protecao-tamanho-m.html .
the price is not coming even if i get the text it always says that its nonetype.
Some of the information is built up via javascript from data stored in <script> sections in the HTML. You can access this directly by searching for it and using Python's JSON library to decode it into a Python structure. For example:
from bs4 import BeautifulSoup
import requests
import json
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
url = 'https://www.drogasil.com.br/fralda-huggies-tripla-protecao-tamanho-m.html'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
script = soup.find('script', type='application/ld+json')
data = json.loads(script.text)
title = data['name']
total_price = data['offers']['price']
quantity = soup.find("li", class_="quantity").get_text().strip()
print(title)
print(quantity)
print(total_price)
Giving you:
HUGGIES FRALDAS DESCARTAVEL INFANTIL TRIPLA PROTECAO TAMANHO M COM 42 UNIDADES
42 Tiras
38.79
I recommend you add print(data) to see what other information is available.
I'm scraping the activities to do in Paris from TripAdvisor (https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html).
The code that I've written works well, but I haven't still found a way to obtain the rating of each activity. The rating in Tripadvisor is represented from 5 rounds, I need to know how many of these rounds are colored.
I obtain nothing in the "rating" field.
Following the code:
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
wd.get("https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html")
import pprint
detail_tours = []
for tour in list_tours:
url = tour.find_elements_by_css_selector("a")[0].get_attribute("href")
title = ""
reviews = ""
rating = ""
if(len(tour.find_elements_by_css_selector("._1gpq3zsA._1zP41Z7X")) > 0):
title = tour.find_elements_by_css_selector("._1gpq3zsA._1zP41Z7X")[0].text
if(len(tour.find_elements_by_css_selector("._7c6GgQ6n._22upaSQN._37QDe3gr.WullykOU._3WoyIIcL")) > 0):
reviews = tour.find_elements_by_css_selector("._7c6GgQ6n._22upaSQN._37QDe3gr.WullykOU._3WoyIIcL")[0].text
if(len(tour.find_elements_by_css_selector(".zWXXYhVR")) > 0):
rating = tour.find_elements_by_css_selector(".zWXXYhVR")[0].text
detail_tours.append({'url': url,
'title': title,
'reviews': reviews,
'rating': rating})
I would use BeautifulSoup in a way similar to the suggested code. (I would also recommend you study the structure of the html, but seeing the original code I don't think that's necessary.)
import requests
from bs4 import BeautifulSoup
import re
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
resp = requests.get('https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html', headers=header)
if resp.status_code == 200:
soup = BeautifulSoup(resp.text, 'lxml')
cards = soup.find_all('div', {'data-automation': 'cardWrapper'})
for card in cards:
rating = card.find('svg', {'class': 'zWXXYhVR'})
match = re.match('Punteggio ([0-9,]+)', rating.attrs['aria-label'])[1]
print(float(match.replace(',', '.')))
And a small bonus-info, the part in the link preceeded by oa (In the example below: oa60), indicates the starting offset, which runs in 30 result increments - So in case you want to change pages, you can change your link to include oa30, oa60, oa90, etc.: https://www.tripadvisor.it/Attractions-g187147-Activities-c42-oa60-Paris_Ile_de_France.html
I am scraping data from the Rakuten Japanese e-commerce website. I am using requests-html and Beautiful soup.
And the problem is when I request from my local pc (127.0.0.1) it's working fine. But when I request from my ec2 server getting Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> this message and no data or HTML page is found. And another case when I use wget and request from the server the page URL I get a full page. But my script doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def get_search_url(query_keyword):
base_url = 'https://search.rakuten.co.jp/search/mall/'
headers = {
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
session = HTMLSession()
session.headers.update(headers)
request_url = base_url + query_keyword
resp = session.get(request_url)
soup = BeautifulSoup(resp.text, "lxml")
return soup
def feature_product_details(url):
output_list = []
for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
product_title = i.find("div", attrs={"class": "content title"})
if product_title is not None:
product_title = product_title.getText()
else:
product_title = ""
output = {
'title': product_title,
}
output_list.append(output)
print(output_list)
return output_list
def main_rakuten_product_search(query):
query_keyword = query
page = get_search_url(query_keyword)
product_lists = feature_product_details(page)
return product_lists
if __name__ == '__main__':
queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
main_rakuten_product_search(queries)
Sample output when running local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But don't get any response when running it on my server: Just show
this message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea on how this could be done, I would be grateful to hear.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are few things to check:
make sure your EC2 has the right ports open
double check the headers (I've modified yours a bit - see code below)
check your query input; maybe some of them are malformed?
don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def get_search_url(query_keyword):
base_url = 'https://search.rakuten.co.jp/search/mall/'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
'referer': 'https://www.rakuten.co.jp/'
}
session = HTMLSession()
session.headers.update(headers)
return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")
def feature_product_details(url):
output_list = []
for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
product_title = i.find("div", attrs={"class": "content title"})
if product_title is not None:
product_title = product_title.getText()
else:
product_title = ""
output_list.append({'title': product_title})
return output_list
def main_rakuten_product_search(query):
return feature_product_details(get_search_url(query))
if __name__ == '__main__':
queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
print(main_rakuten_product_search(queries))