I'm a complete beginner.
I would like to change this code so that I can read the ASINs line by line from a CSV or text file and get notified when Amazon is out of stock.
I hope someone has an idea.
Thank you in advance.
Python script for an Amazon product availability checker:
# importing libraries
from lxml import html
import requests
from time import sleep
import time
import schedule
import smtplib

receiver_email_id = "EMAIL_ID_OF_USER"

def check(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(url, headers=headers)
    # wait a while before parsing the page
    for i in range(20):
        sleep(3)
    doc = html.fromstring(page.content)
    # note: the XPath originally used [#id ...], which is invalid; @id is the correct syntax
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
    RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
    AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
    return AVAILABILITY
def ReadAsin():
    Asin = 'B07XCRYSDT'
    url = "http://www.amazon.de/dp/" + Asin
    print("Processing: " + url)
    ans = check(url)
    arr = [
        'Only 1 left in stock.',
        'Only 2 left in stock.',
        'In stock.']
    print(ans)
    if ans in arr:
        # send an email to the user if the product is available
        # (sendemail() is defined elsewhere in the original script, not shown here)
        sendemail(ans, Asin)

def job():
    print("Tracking....")
    ReadAsin()

schedule.every(1).minutes.do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
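For the file-reading part of the question, here is a minimal sketch, assuming a plain text file named asins.txt (a hypothetical name) with one ASIN per line; a one-column CSV works the same way. To be notified when Amazon is out of stock rather than in stock, the condition is simply inverted:

def read_asins(path="asins.txt"):  # hypothetical filename
    # read one ASIN per line, skipping blank lines
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def job():
    print("Tracking....")
    for asin in read_asins():
        url = "http://www.amazon.de/dp/" + asin
        print("Processing: " + url)
        ans = check(url)
        print(ans)
        # notify when the availability text does NOT indicate stock
        if ans is None or 'In stock' not in ans:
            sendemail(ans, asin)  # sendemail() as in the original script (not shown)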
I have recently started learning Python, and one of my first projects is to get live stock prices from Google Finance using BeautifulSoup. Basically, I am looking up a stock and setting a price alert.
Here is what my code looks like:
import requests
import time
import tkinter
import tkinter.messagebox  # messagebox must be imported explicitly; plain "import tkinter" does not load it
from bs4 import BeautifulSoup

def st_Price(symbol):
    baseurl = 'http://google.com/finance/quote/'
    URL = baseurl + symbol + ":NSE?hl=en&gl=in"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_="YMlKec fxKbKc")
    result = str(results)
    #print(result)
    res = result.split("₹")[1].split("<")[0]
    res_flt = float(res.replace(",", ""))
    return res_flt

def main():
    sym = input("Enter Stock Symbol : ")
    price = input("Enter desired price : ")
    x = st_Price(sym)
    while x < float(price):
        print(x)
        t1 = time.perf_counter()
        x = st_Price(sym)
        t2 = time.perf_counter()
        print("Internal refresh time is {}".format(t2 - t1))
    else:
        print("The Stock {} achieved price greater than {}".format(sym, x))
        root = tkinter.Tk()
        root.geometry("150x150")
        tkinter.messagebox.showinfo(title="Price Alert", message="Stock Price {} greater Than {}".format(x, price))
        root.destroy()

if __name__ == "__main__":
    main()
I am looking up the following class in the page HTML:
[screenshot: HTML element for the stock]
The code works perfectly fine, but it takes too much time to fetch the information:
Enter Stock Symbol : INFY
Enter desired price : 1578
1574.0
Internal refresh time is 9.915285099999892
1574.0
Internal refresh time is 7.2284357999997155
I am not very familiar with HTML; by referring to online documentation I was able to figure out how to scrape the necessary part.
Is there any way to reduce the time it takes to fetch the data?
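One general note: much of the per-request cost here is the fresh TCP/TLS connection opened on every call, so reusing a single requests.Session between polls often cuts the refresh time noticeably. A minimal sketch, keeping the same parsing logic as st_Price above:

import requests
from bs4 import BeautifulSoup

# one shared session: connections are pooled and reused across calls
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def st_price(symbol):
    URL = 'http://google.com/finance/quote/' + symbol + ":NSE?hl=en&gl=in"
    page = session.get(URL)  # reuses the open connection instead of reconnecting
    soup = BeautifulSoup(page.content, 'html.parser')
    result = str(soup.find(class_="YMlKec fxKbKc"))
    res = result.split("₹")[1].split("<")[0]
    return float(res.replace(",", ""))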
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
Also, when using the requests library, the default user-agent is python-requests, so websites understand that it's a bot or a script sending the request, not a real user. Check what your user-agent is and pass it in the request headers.
To get just the current price, you need the .AHmHk .fxKbKc CSS selector via the bs4 select_one() method (note that these class names could change in the future).
from bs4 import BeautifulSoup
import requests, lxml

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

html = requests.get("https://www.google.com/finance/quote/INFY:NSE", headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

# .AHmHk .fxKbKc selects the current price
current_price = soup.select_one(".AHmHk .fxKbKc").text
print(current_price)
# ₹1,860.50
Code and a full example to scrape the current price and right-panel data:
from bs4 import BeautifulSoup
import requests, lxml, json
from itertools import zip_longest

def scrape_google_finance(ticker: str):
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    }

    html = requests.get(f"https://www.google.com/finance/quote/{ticker}", headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    ticker_data = {"right_panel_data": {},
                   "ticker_info": {}}

    ticker_data["ticker_info"]["title"] = soup.select_one(".zzDege").text
    ticker_data["ticker_info"]["current_price"] = soup.select_one(".AHmHk .fxKbKc").text

    right_panel_keys = soup.select(".gyFHrc .mfs7Fc")
    right_panel_values = soup.select(".gyFHrc .P6K39c")

    for key, value in zip_longest(right_panel_keys, right_panel_values):
        key_value = key.text.lower().replace(" ", "_")
        ticker_data["right_panel_data"][key_value] = value.text

    return ticker_data

data = scrape_google_finance(ticker="INFY:NSE")

# ensure_ascii=False to display the Indian rupee ₹ symbol
print(json.dumps(data, indent=2, ensure_ascii=False))
print(data["right_panel_data"].get("ceo"))
Outputs:
{
  "right_panel_data": {
    "previous_close": "₹1,882.95",
    "day_range": "₹1,857.15 - ₹1,889.60",
    "year_range": "₹1,311.30 - ₹1,953.90",
    "market_cap": "7.89T INR",
    "p/e_ratio": "36.60",
    "dividend_yield": "1.61%",
    "primary_exchange": "NSE",
    "ceo": "Salil Parekh",
    "founded": "Jul 2, 1981",
    "headquarters": "Bengaluru, KarnatakaIndia",
    "website": "infosys.com",
    "employees": "292,067"
  },
  "ticker_info": {
    "title": "Infosys Ltd",
    "current_price": "₹1,860.50"
  }
}
Salil Parekh
If you want to scrape more data with a line-by-line explanation, there's a Scrape Google Finance Ticker Quote Data in Python blog post of mine.
I am scraping data from the Rakuten Japanese e-commerce website, using requests-html and BeautifulSoup.
The problem is that when I send the request from my local PC (127.0.0.1) it works fine, but when I send it from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is returned. Oddly, when I fetch the same page URL from the server with wget, I get the full page; only my script fails.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on my local machine:

[
    {
        "title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)"
    }
]
But I don't get any response when running it on my server; it just shows this message:
Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be fixed, I would be grateful.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
- make sure your EC2 has the right ports open
- double-check the headers (I've modified yours a bit; see the code below)
- check your query input; maybe some of the queries are malformed?
- don't spray the Rakuten server with too many requests from one EC2; maybe they're blocking you already (a simple delay between requests, as sketched right after this list, can help)
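A minimal sketch of such a delay, assuming the HTMLSession used in the code below (the one-second base delay and three retries are arbitrary choices):

import time

def polite_get(session, url, delay=1.0, retries=3):
    # pause before every request so we don't hammer the server,
    # and back off further if we appear to be blocked (non-200 response)
    resp = None
    for attempt in range(retries):
        time.sleep(delay * (attempt + 1))  # 1s, 2s, 3s, ...
        resp = session.get(url)
        if resp.status_code == 200:
            break
    return resp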
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
In this code I'm trying to fetch NSE option chain data in Python.
Tool: Spyder 4
Python: 3.7
The code is not throwing any error; I don't know what I'm doing wrong.
PRINT 1 gives proper output as JSON data, but PRINT 2 and PRINT 3 show no output.
Can someone please help me debug this code?
import requests
import json
import pandas as pd
import xlwings as xw
from df2gspread import df2gspread as d2g
import gspread
from oauth2client.service_account import ServiceAccountCredentials

pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 75)
pd.set_option('display.max_row', 2500)

url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
           "accept-language": "en-US,en;q=0.9,hi;q=0.8", "accept-encoding": "gzip, deflate, br"}
cookie_dict = {'bm_sv': 'AA02590AB18B4FC4A036CC62F5230694~8py6nqGfKvu3P4aKZoNpf4HZOUYQJ4i6JMyPMX14ksLZYE+0HlglIA3S2AAa9JGJPvXrBHcJ7uS2ZMcMq3f+FZ/ttHuqFzuAmMf1ZnI9hFgpqB7USISOoa3NfzMufwVAd0U7MgeSxF7+GjuyOuApyOQcoHmyr53hB4JLSqd0U1s'}

session = requests.session()
for cookie in cookie_dict:
    session.cookies.set(cookie, cookie_dict[cookie])

expiry = '16-Jul-2020'

def fetch_oi():
    r = session.get(url, headers=headers).json()
    # print(r)  # PRINT 1 - THIS PRINT IS WORKING
    if expiry:
        ce_values = [data['CE'] for data in r['records']['data'] if "CE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
        pe_values = [data['PE'] for data in r['records']['data'] if "PE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
    else:
        ce_values = [data['CE'] for data in r['filtered']['data'] if "CE" in data]
        pe_values = [data['PE'] for data in r['filtered']['data'] if "PE" in data]
    print(ce_values)  # PRINT 2 - NO OUTPUT, NO ERROR
    ce_data = pd.DataFrame(ce_values)
    pe_data = pd.DataFrame(pe_values)
    ce_data = ce_data.sort_values(['strikePrice'])
    pe_data = pe_data.sort_values(['strikePrice'])
    print(ce_values)  # PRINT 3 - NO OUTPUT, NO ERROR

def main():
    fetch_oi()

if __name__ == '__main__':
    main()
Your str conversion was failing, and your requests call had missing parameters. I have modified your code; the version below should work:
import requests
import json
import pandas as pd

new_url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(new_url, headers=headers)
dajs = json.loads(page.text)

def fetch_oi(expiry_dt):
    ce_values = [data['CE'] for data in dajs['records']['data'] if "CE" in data and data['expiryDate'] == expiry_dt]
    pe_values = [data['PE'] for data in dajs['records']['data'] if "PE" in data and data['expiryDate'] == expiry_dt]
    ce_dt = pd.DataFrame(ce_values).sort_values(['strikePrice'])
    pe_dt = pd.DataFrame(pe_values).sort_values(['strikePrice'])
    print(ce_dt[['strikePrice', 'lastPrice']])

def main():
    expiry_dt = '27-Aug-2020'
    fetch_oi(expiry_dt)

if __name__ == '__main__':
    main()
NSE has now added two main cookies, nsit and nseappid, which determine whether you are an authentic user.
I couldn't find how these two cookies are set in the browser. On the first visit to the NSE site they are set somehow, of course with some expiration. For each resource request, e.g. https://www.nseindia.com/api/option-chain-indices?COUNTER, these two cookies need to be set on the request in order to get data.
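A minimal sketch of that flow with plain requests (essentially what the scripts below do): request the homepage once so the cookies are issued, then reuse them for the API call. The cookie names are taken from the description above:

import requests

session = requests.Session()
session.headers.update({'user-agent': 'Mozilla/5.0'})

# first visit: NSE sets its auth cookies (nsit, nseappid, ...) on the session
session.get("https://www.nseindia.com/")
print([c.name for c in session.cookies])  # should include nsit and nseappid

# subsequent API calls reuse those cookies automatically
data = session.get("https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY").json()
print(data['records']['expiryDates'][:3])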
Maybe I am late with this answer, but the script below is working fine for me:
import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; '
                         'x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}

main_url = "https://www.nseindia.com/"
response = requests.get(main_url, headers=headers)
# print(response.status_code)
cookies = response.cookies

url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
bank_nifty_oi_data = requests.get(url, headers=headers, cookies=cookies)
print(bank_nifty_oi_data.json())
You can call the URL repeatedly until you get the data:
import json
import time
import requests

urlheader = {'user-agent': 'Mozilla/5.0'}  # the headers dict from the question's code is assumed here
symbol = 'NIFTY'

url = 'https://www.nseindia.com/api/option-chain-indices?symbol=' + symbol
found = False
while not found:
    try:
        data = requests.get(url, headers=urlheader).content
        data2 = data.decode('utf-8')
        df = json.loads(data2)
        expiry_dt = df['records']['expiryDates'][0]
        found = True
    except Exception:
        time.sleep(1)  # brief pause before retrying, so we don't hammer the server
I have the code ready for one keyword and it's working fine. The next problem is that I want to run the scrape for 10 different keywords and save the results in one CSV file, with the keyword name in its own column. I think we could give a CSV file as input so it picks the keywords one by one and scrapes each of them. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
# excluding page from base_url for further adding

res = []
for page in range(1, 3):
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])

df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords; replace them with the ones you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"
keywords_list = ['helmets for men', 'helmets for women']
# excluding page from base_url for further adding

res = []
for page in range(1, 3):
    for keyword in keywords_list:
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page and keyword
        if request.status_code == 404:  # added just in case of error
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')
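If you'd rather feed the keywords in from a file, as the question suggests, a minimal sketch (assuming a plain text/CSV file named keywords.csv, a hypothetical name, with one keyword per line) would replace the hardcoded list:

# read one keyword per line, skipping blank lines
with open('keywords.csv') as f:  # hypothetical filename
    keywords_list = [line.strip() for line in f if line.strip()]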
I feel puzzled.
My idea is to send a request to the URL, extract the POST data from the web page, and then send it back to the web page. When I used urllib.request in Python, it failed, but when I used requests instead, it worked!
Please tell me why...
Here is the code; the commented-out lines are the urllib.request version.
import urllib.request
import http.cookiejar
import re
import requests

loginUrl = 'https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'

# Here is the urllib.request code
#cookies = http.cookiejar.MozillaCookieJar()
#handler = urllib.request.HTTPCookieProcessor(cookies)
#opener = urllib.request.build_opener(handler)

headers = {
    'Origin': 'http://passport.csdn.net',
    'Referer': 'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER'
}

# Here is the requests code
s = requests.Session()
data = s.get(loginUrl)
data = data.text
#request = urllib.request.Request(loginUrl)
#response = urllib.request.urlopen(request)
#data = response.read().decode('utf-8')

# get the values of lt and execution from the web page
pattern_lt = re.compile('<input type="hidden" name="lt" value="(.*?)" />', re.S)
lt = re.findall(pattern_lt, data)
lt = lt[0]
pattern_exe = re.compile('<input type="hidden" name="execution" value="(.*?)" />', re.S)
exe = re.findall(pattern_exe, data)
exe = exe[0]

postDict = {
    'username': 'qinyufeng_hdq#163.com',
    'password': 'csdn690076598',
    'lt': lt,
    'execution': exe,
    '_eventId': 'submit'
}

r = s.post(loginUrl, data=postDict)
#postData = urllib.parse.urlencode(postDict).encode()
#request = urllib.request.Request(loginUrl, postData, headers)
#response = opener.open(request)
#data = response.read().decode('UTF-8')
print(r.text)
I'm not good at English; I hope you get my idea, and thank you for reading my problem.
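A note on the likely cause, visible in the commented-out lines above: the initial GET uses urllib.request.urlopen(request) directly, which bypasses the cookie-aware opener, so the session cookie from the login page never lands in the cookiejar and the lt/execution tokens no longer match the session when the POST is sent. requests.Session shares cookies between the GET and the POST automatically, which is why that version works. A sketch of the urllib version with both calls going through the same opener (untested against CSDN, whose login flow may have changed):

import urllib.request, urllib.parse
import http.cookiejar

cookies = http.cookiejar.MozillaCookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookies))

# GET through the opener, so the session cookie is captured in the cookiejar
data = opener.open(urllib.request.Request(loginUrl)).read().decode('utf-8')
# ... extract lt and execution from `data` with the same regexes as above ...

# POST through the same opener, so the same session cookie is sent back
postData = urllib.parse.urlencode(postDict).encode()
response = opener.open(urllib.request.Request(loginUrl, postData, headers))
print(response.read().decode('utf-8'))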