Web scraping NSE option chain data in Python

In this code I'm trying to fetch NSE option chain data in Python.
Tool - Spyder 4
Python - 3.7
The code does not throw any error, and I don't know what I'm doing wrong.
PRINT 1 gives the proper JSON output, but PRINT 2 and PRINT 3 show no output at all.
Can someone please help me debug this code?
import requests
import json
import pandas as pd
import xlwings as xw
from df2gspread import df2gspread as d2g
import gspread
from oauth2client.service_account import ServiceAccountCredentials

pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 75)
pd.set_option('display.max_row', 2500)

url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
           "accept-language": "en-US,en;q=0.9,hi;q=0.8",
           "accept-encoding": "gzip, deflate, br"}
cookie_dict = {'bm_sv': 'AA02590AB18B4FC4A036CC62F5230694~8py6nqGfKvu3P4aKZoNpf4HZOUYQJ4i6JMyPMX14ksLZYE+0HlglIA3S2AAa9JGJPvXrBHcJ7uS2ZMcMq3f+FZ/ttHuqFzuAmMf1ZnI9hFgpqB7USISOoa3NfzMufwVAd0U7MgeSxF7+GjuyOuApyOQcoHmyr53hB4JLSqd0U1s'}

session = requests.session()
for cookie in cookie_dict:
    session.cookies.set(cookie, cookie_dict[cookie])

expiry = '16-Jul-2020'

def fetch_oi():
    r = session.get(url, headers=headers).json()
    # print(r)  # PRINT 1 - THIS PRINT IS WORKING
    if expiry:
        ce_values = [data['CE'] for data in r['records']['data'] if "CE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
        pe_values = [data['PE'] for data in r['records']['data'] if "PE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
    else:
        ce_values = [data['CE'] for data in r['filtered']['data'] if "CE" in data]
        pe_values = [data['PE'] for data in r['filtered']['data'] if "PE" in data]
    print(ce_values)  # PRINT 2 - NO OUTPUT, NO ERROR
    ce_data = pd.DataFrame(ce_values)
    pe_data = pd.DataFrame(pe_values)
    ce_data = ce_data.sort_values(['strikePrice'])
    pe_data = pe_data.sort_values(['strikePrice'])
    print(ce_values)  # PRINT 3 - NO OUTPUT, NO ERROR

def main():
    fetch_oi()

if __name__ == '__main__':
    main()

Your str conversion was failing and the requests call was missing parameters. I have modified your code; the version below should work:
import requests
import json
import pandas as pd

new_url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(new_url, headers=headers)
dajs = json.loads(page.text)

def fetch_oi(expiry_dt):
    ce_values = [data['CE'] for data in dajs['records']['data'] if "CE" in data and data['expiryDate'] == expiry_dt]
    pe_values = [data['PE'] for data in dajs['records']['data'] if "PE" in data and data['expiryDate'] == expiry_dt]
    ce_dt = pd.DataFrame(ce_values).sort_values(['strikePrice'])
    pe_dt = pd.DataFrame(pe_values).sort_values(['strikePrice'])
    print(ce_dt[['strikePrice', 'lastPrice']])

def main():
    expiry_dt = '27-Aug-2020'
    fetch_oi(expiry_dt)

if __name__ == '__main__':
    main()
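If the filter still comes back empty, the expiry string you pass in may simply not match any expiry in the response. As a quick sanity check, here is a minimal sketch (assuming the records object in the JSON exposes an expiryDates list) that prints the expiry dates the API actually returns and feeds one of them to the function above:

# Sketch: discover valid expiry dates before filtering.
# Assumes `dajs` has already been fetched as in the snippet above.
available_expiries = dajs['records']['expiryDates']
print(available_expiries)
fetch_oi(available_expiries[0])   # call the function with a known-good expiry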

Now they have added two main cookies which determine whether you are an authentic user or not.
Cookie names: nsit, nseappid
I couldn't find out how these two cookies are set in the browser. On the first visit to the NSE site these two cookies are set somehow, of course with some expiration. For each resource request, e.g. https://www.nseindia.com/api/option-chain-indices?COUNTER, these two cookies need to be sent with the request in order to get data.
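A minimal sketch of that idea with requests (the cookie names come from this answer; everything else is illustrative): hit the home page once on a Session so the server can set nsit and nseappid, then reuse the same Session for the API calls.

import requests

headers = {'user-agent': 'Mozilla/5.0'}
session = requests.Session()

# First visit the home page; the server sets its cookies on the session.
session.get('https://www.nseindia.com/', headers=headers)
print('nsit' in session.cookies, 'nseappid' in session.cookies)   # sanity check

# Later API calls on the same session send those cookies automatically.
resp = session.get('https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY',
                   headers=headers)
print(resp.status_code)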

Maybe I am late with this answer, but the script below is working fine for me:
import requests
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
main_url = "https://www.nseindia.com/"
response = requests.get(main_url, headers=headers)
#print(response.status_code)
cookies = response.cookies
url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
bank_nifty_oi_data = requests.get(url, headers=headers, cookies=cookies)
print(bank_nifty_oi_data.json())

You can call the URL repeatedly until you get the data:
# Assumes `symbol` and `urlheader` (the request headers) are defined earlier in the script.
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=' + symbol
found = False
while not found:
    try:
        data = requests.get(url, headers=urlheader).content
        data2 = data.decode('utf-8')
        df = json.loads(data2)
        expiry_dt = df['records']['expiryDates'][0]
        found = True
    except:
        pass
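The bare while/except loop will spin forever if the endpoint keeps refusing, so a bounded variant may be safer. Here is a sketch reusing url and urlheader from the snippet above; the retry count and delay are arbitrary:

import time

df = None
for attempt in range(5):                          # arbitrary cap on retries
    try:
        resp = requests.get(url, headers=urlheader, timeout=10)
        df = resp.json()
        expiry_dt = df['records']['expiryDates'][0]
        break
    except Exception:
        time.sleep(2 * (attempt + 1))             # simple linear backoff
if df is None:
    raise RuntimeError('Could not fetch option chain data after 5 attempts')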

Related

How to use one script's output data as input for another script

from datetime import timedelta, date
from nsepy import get_history
import requests
import json
import codecs
import pandas as pd

baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}

session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)

df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
mini_df = df['symbol']
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# print(df)
# print(mini_df)
print(mini_df.to_string(index=False))
mini_df = []

def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    print(stock_fut[["Underlying", "Change in OI", "Open Interest"]])

a = []
for i in range(0, len(a)):
    print(a[i])
    importdata(a[i])
Here I want to use the mini_df output values (the symbols) as the input values for a (line 32 of the script), so that I can get 3 days of historical data, but I'm getting no output for the historical data. How do I do that?
I'm pretty sure the culprit is here:
a = []
for i in range(0, len(a)):
    print(a[i])
    importdata(a[i])
You iterate over range(0, len(a)), but at that moment a is empty, so the loop body never runs. You probably have to populate a first; what goes into it depends on what you want to achieve.
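For example, a minimal sketch (assuming you want every symbol from mini_df and drop the later mini_df = [] line that rebinds it to an empty list):

# Build the input list from the symbols Series instead of starting from [].
a = mini_df.tolist()          # mini_df is df['symbol'] from the script above
for stock in a:
    print(stock)
    importdata(stock)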

Web scraping and pagination with Python, CSV, BeautifulSoup and pandas

The database at https://aviation-safety.net/wikibase/ runs from the year 1902 to 2022. The code presented here captures some years but misses others: the years before 1912 and the year after 2021 are not captured. I want to scrape all accidents for each type of aircraft, either for all years or by year(s). The web DB starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently the code dumps the results into a .csv file, but it could also be SQLite.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    # use a default-looking header to cover my tracks, in case they block requests
    # that don't have "accept" and "user-agent", which sometimes happens
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    page_container = soup.find('div', {'class': 'pagenumbers'})
    # get the maximum number of pages using a list comprehension: take all the links
    # at the bottom of the page ('a' tags), split each [href] on "=", take the last
    # piece ([-1]), turn it into an integer, and keep the max, i.e. the last page number
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table', {'class': 'hp'})
        regex = re.compile('list.*')
        for index, row in enumerate(table.find_all('tr', {'class': regex})):
            if index == 0:
                continue
            acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue
            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text
            item = {
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator': acc_operator,
                'acc_fat': acc_fat,
                'acc_location': acc_location,
                'acc_dmg': acc_dmg
            }
            info.append(item)
    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)

if __name__ == "__main__":
    START = 1901
    STOP = 2023
    years = [year for year in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
Lmao, I wrote that code for someone on this site once before. I've edited it to work for the missing years here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    try:
        headers = {
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }
        url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text, 'html.parser')
        page_container = soup.find('div', {'class': 'pagenumbers'})
        try:
            pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
        except:
            pages = 1
        info = []
        for page in range(1, pages + 1):
            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            print(new_url)
            data = requests.get(new_url, headers=headers)
            soup = BeautifulSoup(data.text, 'html.parser')
            table = soup.find('table', {'class': 'hp'})
            regex = re.compile('list.*')
            for row in table.find_all('tr', {'class': regex}):
                acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
                try:
                    acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            acc_date = datetime.strptime("01-01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                        except ValueError:
                            continue
                acc_type = row.find_all('td')[1].text
                acc_reg = row.find_all('td')[2].text
                acc_operator = row.find_all('td')[3].text
                acc_fat = row.find_all('td')[4].text
                acc_location = row.find_all('td')[5].text
                acc_dmg = row.find_all('td')[7].text
                item = {
                    'acc_link': acc_link,
                    'acc_date': acc_date,
                    'acc_type': acc_type,
                    'acc_reg': acc_reg,
                    'acc_operator': acc_operator,
                    'acc_fat': acc_fat,
                    'acc_location': acc_location,
                    'acc_dmg': acc_dmg
                }
                info.append(item)
        return info
    except Exception as e:
        print(e, url)
        return []

if __name__ == "__main__":
    START = 2022
    STOP = 2023
    years = [year for year in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
        list_of_dicts = list(final_list)
        flat_list = [item for sublist in list_of_dicts for item in sublist]  # flatten the list of lists into one big list
        df = pd.DataFrame(flat_list)
        df.to_csv('all_years_aviation-safety.csv', index=False)

Amazon Availability Checker

I'm a complete beginner. I would like to change this code so that I can read the ASINs line by line from a CSV file or a text file and get notified when Amazon is out of stock.
I hope someone has an idea. Thank you in advance.
Python script for Amazon product availability checker
# importing libraries
from lxml import html
import requests
from time import sleep
import time
import schedule
import smtplib

receiver_email_id = "EMAIL_ID_OF_USER"

def check(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(url, headers=headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
    return AVAILABILITY

def ReadAsin():
    Asin = 'B07XCRYSDT'
    url = "http://www.amazon.de/dp/" + Asin
    print("Processing: " + url)
    ans = check(url)
    arr = [
        'Only 1 left in stock.',
        'Only 2 left in stock.',
        'In stock.']
    print(ans)
    if ans in arr:
        # sending email to user if
        # in case product available
        sendemail(ans, Asin)

def job():
    print("Tracking....")
    ReadAsin()

schedule.every(1).minutes.do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
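To do what the question asks, one option is to read the ASINs from a file and loop over them. A minimal sketch, assuming a text file named asins.txt with one ASIN per line (the filename and the out-of-stock condition are illustrative, and sendemail is the notification helper the original script calls but does not show):

def ReadAsinsFromFile(path='asins.txt'):
    # Read one ASIN per line, skipping blank lines.
    with open(path) as f:
        asins = [line.strip() for line in f if line.strip()]
    for asin in asins:
        url = "http://www.amazon.de/dp/" + asin
        print("Processing: " + url)
        ans = check(url)
        print(asin, ans)
        # Notify when the availability text does not indicate stock.
        if ans is None or 'in stock' not in ans.lower():
            sendemail(ans, asin)

You would then call ReadAsinsFromFile() from job() instead of ReadAsin().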

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website. I am using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I use wget on the server to request the same page URL, I get the full page; it is only my script that doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on my local machine:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be done, I would be grateful to hear it.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double-check the headers (I've modified yours a bit - see the code below)
check your query input; maybe some of the queries are malformed
don't spray the Rakuten server with too many requests from one EC2; maybe they're already blocking you
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))

Python 3: the same URL fails with urllib.request but succeeds with requests?

I feel puzzled. My idea is to send a request to the URL, extract the POST data from the web page, and then send it back to the page. When I used urllib.request in Python it failed, but when I used requests instead, it worked. Please tell me why.
Here is the code; the commented-out lines are the urllib.request version.
import urllib.request
import http.cookiejar
import re
import requests

loginUrl = 'https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'

# Here is the urllib.request code
# cookies = http.cookiejar.MozillaCookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookies)
# opener = urllib.request.build_opener(handler)

headers = {
    'Origin': 'http://passport.csdn.net',
    'Referer': 'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER'
}

# Here is the requests code
s = requests.Session()
data = s.get(loginUrl)
data = data.text
# request = urllib.request.Request(loginUrl)
# response = urllib.request.urlopen(request)
# data = response.read().decode('utf-8')

# I get the values of lt and execution from the web page
pattern_lt = re.compile('<input type="hidden" name="lt" value="(.*?)" />', re.S)
lt = re.findall(pattern_lt, data)
lt = lt[0]
pattern_exe = re.compile('<input type="hidden" name="execution" value="(.*?)" />', re.S)
exe = re.findall(pattern_exe, data)
exe = exe[0]

postDict = {
    'username': 'qinyufeng_hdq@163.com',
    'password': 'csdn690076598',
    'lt': lt,
    'execution': exe,
    '_eventId': 'submit'
}

r = s.post(loginUrl, data=postDict)
# postData = urllib.parse.urlencode(postDict).encode()
# request = urllib.request.Request(loginUrl, postData, headers)
# response = opener.open(request)
# data = response.read().decode('UTF-8')
print(r.text)
I'm not good at English; I hope you get my idea, and thank you for reading my problem.
