Python requests 401 error but URL opens in browser

I am trying to pull the json from this location -
https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY
This opens fine in my browser, but using requests in Python throws a 401 permission error. I have tried adding headers with different arguments, but to no avail.
Interestingly, the JSON on this page does not load in the browser either until https://www.nseindia.com has been opened separately. I believe it requires some kind of authentication, but I am surprised it works in the browser without any.
Is there a way to extract the information from this url? Any help is much appreciated.
Here is my implementation -
import requests
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
# This throws a 401 response error
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
# This throws a 'Connection aborted' error
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)"})

To get the correct data, first load the cookies from another URL with requests.get() and then make the Ajax request to load the JSON:
import json
import requests

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'

with requests.session() as s:
    # load cookies:
    s.get('https://www.nseindia.com/get-quotes/derivatives?symbol=BANKNIFTY', headers=headers)
    # get data:
    data = s.get(url, headers=headers).json()
    # print data to screen:
    print(json.dumps(data, indent=4))
Prints:
{
    "records": {
        "expiryDates": [
            "03-Sep-2020",
            "10-Sep-2020",
            "17-Sep-2020",
            "24-Sep-2020",
            "01-Oct-2020",
            "08-Oct-2020",
            "15-Oct-2020",
            "22-Oct-2020",
            "29-Oct-2020",
            "26-Nov-2020"
        ],
        "data": [
            {
                "strikePrice": 18100,
                "expiryDate": "03-Sep-2020",
                "CE": {
                    "strikePrice": 18100,
                    "expiryDate": "03-Sep-2020",
                    "underlying": "BANKNIFTY",
                    "identifier": "OPTIDXBANKNIFTY03-09-2020CE18100.00",
                    "openInterest": 1,
                    "changeinOpenInterest": 1,
                    "pchangeinOpenInterest": 0,
                    "totalTradedVolume": 2,
                    "impliedVolatility": 95.51,
                    "lastPrice": 6523.6,
                    "change": 2850.1000000000004,
                    "pChange": 77.58540901048048,
                    "totalBuyQuantity": 2925,
                    "totalSellQuantity": 2800,
                    "bidQty": 25,
                    "bidprice": 6523.6,
                    "askQty": 25,
                    "askPrice": 6570.3,
                    "underlyingValue": 24523.8
                },
                "PE": {
                    "strikePrice": 18100,
                    "expiryDate": "03-Sep-2020",
                    "underlying": "BANKNIFTY",
                    ...and so on.
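
As a small usage sketch (my illustration, assuming the response structure printed above), individual fields can be read from data instead of dumping the whole payload:

# Illustrative only; assumes `data` holds the JSON shown above.
records = data['records']

# All available expiry dates:
print(records['expiryDates'])

# Underlying value taken from the first row that has a CE leg:
for row in records['data']:
    if 'CE' in row:
        print(row['CE']['underlyingValue'])
        break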

Related

Python requests GET not getting the JSON payload?

I am trying to get the JSON data from the following URL:
import requests as r
url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
resp = r.get(url, stream=True, timeout=20, headers=header)
j = resp.json()
I do get JSON from this, but when I inspect the request in the browser, the data shown in the Response payload is not what I see in j.
I have never faced this problem before, and my search led me to questions about POST requests.
I tested it using Postman. The User-Agent value is your problem; you can simply remove it and it will work.
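Here is a minimal sketch of that suggestion (my illustration, not part of the original answer), dropping the User-Agent entry and keeping the rest of the request from the question:

import requests

url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
# Assumption: keep X-Requested-With but drop User-Agent, as suggested above.
header = {"X-Requested-With": "XMLHttpRequest"}

resp = requests.get(url, stream=True, timeout=20, headers=header)
print(resp.json())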
I might be wrong and may have misunderstood the question, but comparing the data returned in the UI with the data returned by the API shows they are the same:
import json
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json'
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}

# Load the URL in a real browser; Chrome renders the raw JSON inside a <pre> tag.
driver = webdriver.Chrome()
driver.get(url)
content = driver.find_element(By.XPATH, '//pre').text
driver.quit()

# Fetch the same URL with requests.
response = requests.get(url,
                        stream=True,
                        timeout=20,
                        headers=header)

# Both should contain the same payload.
print(json.loads(content) == response.json())
assert json.loads(content) == response.json()

GET request is not loading in Python

I am trying to get data from the following website. https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE
I tried the following:
Get the whole url in requests:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
Get the base webpage and add the params:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp', params = {'symbol':'SBIN','segmentLink':'3','symbolCount':'2','series':'EQ','dateRange':' ','fromDate':'01-01-2020','toDate':'31-12-2020','dataType':'PRICEVOLUMEDELIVERABLE'})
used the urllib:
f = urllib.request.urlopen('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
None of the above methods works; they just keep loading indefinitely.
Thanks in advance.
Don't forget to add a User-Agent to the request header, like this:
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
response = requests.get('your_url', headers=header)
print(response)
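Applied to the URL from the question, that would look roughly like the sketch below (my illustration, not part of the original answer): the same headers as above, with the query string passed via params as in the second attempt, and an assumed timeout value so the request does not hang indefinitely.

import requests

header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
params = {
    'symbol': 'SBIN', 'segmentLink': '3', 'symbolCount': '2', 'series': 'EQ',
    'dateRange': ' ', 'fromDate': '01-01-2020', 'toDate': '31-12-2020',
    'dataType': 'PRICEVOLUMEDELIVERABLE'
}
# timeout is an assumed value, not from the original question or answer
response = requests.get(
    'https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp',
    params=params, headers=header, timeout=30)
print(response.status_code)
print(response.text[:500])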

Scraping Amazon using BeautifulSoup on AWS Lambda

I'm working on a project that requires me to scrape product titles/names from Amazon using AWS Lambda. My code is as follows:
import json
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import requests
import base64
def lambda_handler(event, context):
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}
    URL = "https://www.amazon.in/Amazon-Brand-Solimo-Foliage-Bedsheet/dp/B076ZTSW47/ref=sr_1_6_mod_primary_lightning_deal?dchild=1&pd_rd_r=53f449ad-419e-466d-bab8-2a09a026efc6&pd_rd_w=5XoS9&pd_rd_wg=VXhi2&pf_rd_p=d848f904-cfb3-4815-b7b2-fe0f44e4153f&pf_rd_r=B9HGM8VTCBCPSTBNW5Z6&qid=1616001168&refinements=p_n_format_browse-bin%3A19560802031&s=kitchen&sbo=Tc8eqSFhUl4VwMzbE4fw%2Fw%3D%3D&smid=AT95IG9ONZD7S&sr=1-6"
    content = requests.get(URL, headers=headers)
    soup = BeautifulSoup(content.text, 'html.parser')
    title = soup.find("span", attrs={"id": 'productTitle'}).string.strip()
    return {
        'title': title,
    }
For some reason, this only works when my URL is from amazon.in (India) but not amazon.com (US). (Note: I am not in India; unsure if that plays a role.)
If I keep everything else the same and simply change the URL to something from amazon.com (US), I get the error below. When I inspected both the .in and .com pages, both had a span element with the id 'productTitle'.
for example:
URL = https://www.amazon.com/Saucony-Mens-Kinvara-Running-Shoe/dp/B07Q8Y4GQL/?_encoding=UTF8&pd_rd_w=c0VM8&pf_rd_p=de0c3fe6-321f-473e-bef6-6a700af423d3&pf_rd_r=S7482G8JKWVS6GB5ADG8&pd_rd_r=faa67bb1-ca9b-4c1d-a730-21c49cfd9b35&pd_rd_wg=KiSz4&ref_=pd_gw_trq_rep_sims_gw
I am getting this error:
Response
{
    "errorMessage": "'NoneType' object has no attribute 'string'",
    "errorType": "AttributeError",
    "stackTrace": [
        [
            "/var/task/lambda_function.py",
            14,
            "lambda_handler",
            "title = soup.find(\"span\", attrs={\"id\":'productTitle'}).string.strip()"
        ]
    ]
}
I am pretty sure this has something to do with the User-Agent and Amazon disallowing scraping. I am very new to web scraping, so please let me know if I am doing something wrong or if there are any other changes to make. The code I have is incredibly simple, so I am fairly sure the error is with the headers/User-Agent, but once again I am very new to this and need some direction on it.
EDIT: @MendelG suggested an answer that works in other IDEs such as PyCharm or Spyder, but it still gives the same error on AWS Lambda. Does anybody know why Lambda executes it differently?
Add "upgrade-insecure-requests": "1" to your headers:
headers = {
    "upgrade-insecure-requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
}
import requests
from bs4 import BeautifulSoup
headers = {
    "upgrade-insecure-requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
}
URL = "https://www.amazon.com/Saucony-Mens-Kinvara-Running-Shoe/dp/B07Q8Y4GQL/?_encoding=UTF8&pd_rd_w=c0VM8&pf_rd_p=de0c3fe6-321f-473e-bef6-6a700af423d3&pf_rd_r=S7482G8JKWVS6GB5ADG8&pd_rd_r=faa67bb1-ca9b-4c1d-a730-21c49cfd9b35&pd_rd_wg=KiSz4&ref_=pd_gw_trq_rep_sims_gw"
soup = BeautifulSoup(requests.get(URL, headers=headers).content, "html.parser")
print(soup.find("span", attrs={"id": "productTitle"}).string.strip())
Output:
Saucony Men's Kinvara 10 Running Shoe
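
If the same code still raises the NoneType error on Lambda, one way to see what Amazon actually returned is to guard the lookup before calling .string. This is only an illustrative check (my assumption is that a missing productTitle means a bot-check page was served), not part of the answer above:

import requests
from bs4 import BeautifulSoup

headers = {
    "upgrade-insecure-requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
}
URL = "https://www.amazon.com/Saucony-Mens-Kinvara-Running-Shoe/dp/B07Q8Y4GQL/?_encoding=UTF8&pd_rd_w=c0VM8&pf_rd_p=de0c3fe6-321f-473e-bef6-6a700af423d3&pf_rd_r=S7482G8JKWVS6GB5ADG8&pd_rd_r=faa67bb1-ca9b-4c1d-a730-21c49cfd9b35&pd_rd_wg=KiSz4&ref_=pd_gw_trq_rep_sims_gw"

resp = requests.get(URL, headers=headers)
soup = BeautifulSoup(resp.content, "html.parser")
title_tag = soup.find("span", attrs={"id": "productTitle"})

if title_tag is None:
    # Likely a bot-check/CAPTCHA page rather than the product page.
    print("productTitle not found; HTTP status:", resp.status_code)
else:
    print(title_tag.get_text(strip=True))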

How to get a PDF file from CourseHero?

Can this be done with JavaScript or something else? I found the code below in Python; does it actually return anything, or does it work for getting a PDF file from CourseHero?
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.3987.78 Safari/537.36'
}
data = {
    "client": "web",
    "query": "scrape",
    "view": "list_w",
    "filters": {
        "type": ["document"],
        "doc_type": [],
    },
    "sort": "relevancy",
    "limit": 20,
    "offset": 0,
    "callout_types": ["textbook"]
}

response = requests.post(
    'https://www.coursehero.com/api/v2/search/', headers=headers, json=data)
data = response.json()

for result in data['results']:
    url = f"https://www.coursehero.com/file/{result['document']['db_filename']}"
    print(f"'{result['core']['title']}' URL: {url}")

# Login and extract download URL from HTML
#
# response = requests.get(url, headers=headers)
# soup = BeautifulSoup(response.content, 'lxml')
# download_url = soup.select('...')
#
# OR
#
# Download file via direct HTTP request if URL is returned via XHR request
#
# download_url = 'https://www.coursehero.com/...'
# requests.get(download_url, headers=headers)
The Course Hero front end sends a POST request to https://www.coursehero.com/api/v2/search and renders the search results with JavaScript, so you can just fetch the JSON via an HTTP request; the code above is a full example. I don't have a paid account, so the last part of the code is commented out as pseudo-code.

Access denied - python selenium - even after using User-Agent and other headers

Using Python, I am trying to extract the option chain data table published publicly by the NSE exchange at https://www.nseindia.com/option-chain
I tried using a requests session as well as Selenium, but somehow the website does not allow a bot to extract the data.
Below are the attempts I made:
Instead of plain requests, I set up a session, first fetched a csrf_token from https://www.nseindia.com/api/csrf-token, and then called the URL. However, the website seems to perform some additional authorization using JavaScript.
Studying the XHR and JS tabs of the Chrome developer console, the website seems to use certain JS scripts for first-time authorisation, so I used Selenium this time. I also passed User-Agent and Accept-Language arguments in the headers (as per this Stack Overflow answer) while loading the driver. But access is still blocked by the website.
Is there anything obvious that I am missing? Or will the website block every attempt at automated extraction of data using selenium/requests + Python? Either way, how do I extract this data?
Below is my current code (to get the table contents from https://www.nseindia.com/option-chain):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
opts = Options()
opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36")
opts.add_argument("Accept-Language=en-US,en;q=0.5")
opts.add_argument("Accept=text/html")
driver = webdriver.Chrome(executable_path="C:\\chromedriver.exe",chrome_options=opts)
#driver.get('https://www.nseindia.com/api/csrf-token')
driver.get('https://www.nseindia.com/')
#driver.get('https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY')
driver.get('https://www.nseindia.com/option-chain')
The data is loaded via JavaScript from an external URL, but you first need to load cookies by visiting another URL:
import json
import requests

symbol = 'NIFTY'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=' + symbol

with requests.session() as s:
    # load cookies:
    s.get('https://www.nseindia.com/get-quotes/derivatives?symbol=' + symbol, headers=headers)
    # get data:
    data = s.get(url, headers=headers).json()
    # print data to screen:
    print(json.dumps(data, indent=4))
Prints:
{
    "records": {
        "expiryDates": [
            "03-Sep-2020",
            "10-Sep-2020",
            "17-Sep-2020",
            "24-Sep-2020",
            "01-Oct-2020",
            "08-Oct-2020",
            "15-Oct-2020",
            "22-Oct-2020",
            "29-Oct-2020",
            "26-Nov-2020",
            "31-Dec-2020",
            "25-Mar-2021",
            "24-Jun-2021",
            "30-Dec-2021",
            "30-Jun-2022",
            "29-Dec-2022",
            "29-Jun-2023"
        ],
        "data": [
            {
                "strikePrice": 4600,
                "expiryDate": "31-Dec-2020",
                "PE": {
                    "strikePrice": 4600,
                    "expiryDate": "31-Dec-2020",
                    "underlying": "NIFTY",
                    "identifier": "OPTIDXNIFTY31-12-2020PE4600.00",
                    "openInterest": 19,
                    "changeinOpenInterest": 0,
                    "pchangeinOpenInterest": 0,
                    "totalTradedVolume": 0,
                    "impliedVolatility": 0,
                    "lastPrice": 31,
                    "change": 0,
                    "pChange": 0,
                    "totalBuyQuantity": 10800,
                    "totalSellQuantity": 0,
                    "bidQty": 900,
                    "bidprice": 3.05,
                    "askQty": 0,
                    "askPrice": 0,
                    "underlyingValue": 11647.6
                }
            },
            {
                "strikePrice": 5000,
                "expiryDate": "31-Dec-2020",
                ...and so on.
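
As a follow-up usage sketch (again my illustration, assuming the structure printed above), the rows for a single expiry can be filtered out of the same data dictionary:

# Illustrative only; assumes `data` holds the JSON shown above.
expiry = '31-Dec-2020'
rows = [row for row in data['records']['data'] if row['expiryDate'] == expiry]

for row in rows:
    pe = row.get('PE', {})
    print(row['strikePrice'], pe.get('lastPrice'))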
