Python requests GET not getting the JSON payload? - python

I am trying to get the JSON data from the following URL:
import requests as r
url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
resp = r.get(url, stream=True, timeout=20, headers=header)
j = resp.json()
This does return JSON, but when I inspect the request in the browser's developer tools I can see data in the Response payload section that is not present in j.
I have never faced this problem before, and my searches only led me to questions about POST requests.

I tested it using Postman. The User-Agent value is your problem; you could simply remove it and it will work.
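For reference, a minimal sketch of what that suggestion looks like (assuming the endpoint still serves the file; only the XHR header is kept and the User-Agent is left at the requests default):
import requests

url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
# Leave the User-Agent at the requests default; send only the XHR hint.
header = {"X-Requested-With": "XMLHttpRequest"}
resp = requests.get(url, timeout=20, headers=header)
print(resp.json())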

I might be wrong and have misread the question, but the data you get from the UI and the data you get from the API appear to be the same:
import json
import requests
from selenium import webdriver
url = 'https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json'
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
driver = webdriver.Chrome()
driver.get(url)
content = driver.find_element_by_xpath('//pre').text
driver.quit()
response = requests.get(url,
                        stream=True,
                        timeout=20,
                        headers=header)
print(json.loads(content) == response.json())
assert json.loads(content) == response.json()

Related

requests.get(link, headers=headers).json() fails to load, but in the browser it loads fine

I am trying to access a URL with requests.get, but in the terminal I always get this error:
Failed to open https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q=%7B%22%24match%22%3A%7B%22collectionSymbol%22%3A%22meekolony%22%7D%2C%22%24sort%22%3A%7B%22takerAmount%22%3A1%2C%22createdAt%22%3A-1%7D%2C%22%24skip%22%3A0%2C%22%24limit%22%3A5%7D
However, if I click that same link and open it in the browser, it loads just fine.
Here is the relevant part of the code:
link = "https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q=%7B%22%24match%22%3A%7B%22collectionSymbol%22%3A%22"+str(nameCollection)+"%22%7D%2C%22%24sort%22%3A%7B%22takerAmount%22%3A1%2C%22createdAt%22%3A-1%7D%2C%22%24skip%22%3A0%2C%22%24limit%22%3A5%7D"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# make the request from the website and convert the data into a JSON file
try:
    # print(resp) will show all the data collected
    resp = requests.get(link, headers=headers).json()
except:
    return None
It is strange because a few weeks ago everything worked perfectly, so maybe the website recently changed the way it exposes the data?
Thank you in advance for your help :)
You may have more success with urllib3 as follows:
import sys
import io
import json
import urllib3
URL = 'https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q={"$match":{"collectionSymbol":"meekolony"},"$sort":{"takerAmount":1,"createdAt":-1},"$skip":0,"$limit":5}'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0'
}
with urllib3.PoolManager() as http:
    try:
        if (r := http.request('GET', URL, headers=HEADERS, preload_content=False)).status != 200:
            raise Exception(f'HTTP status {r.status}')
        r.auto_close = False
        j = json.load(io.TextIOWrapper(r))
        print(json.dumps(j, indent=2))
    except Exception as e:
        print(e, file=sys.stderr)

GET request is not loading in Python

I am trying to get data from the following website. https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE
I tried the following:
Getting the whole URL with requests:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
Getting the base webpage and adding the params:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp', params = {'symbol':'SBIN','segmentLink':'3','symbolCount':'2','series':'EQ','dateRange':' ','fromDate':'01-01-2020','toDate':'31-12-2020','dataType':'PRICEVOLUMEDELIVERABLE'})
Using urllib:
f = urllib.request.urlopen('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
None of the above methods work; the requests just hang indefinitely.
Thanks in advance.
Don't forget to add a User-Agent to the request headers, like this:
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
response = requests.get('your_url', headers=header)
print(response)
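Applied to the URL from the question, that might look like the sketch below (the www1.nseindia.com endpoint may also require cookies or additional headers, so treat this as a starting point rather than a guaranteed fix):
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'X-Requested-With': 'XMLHttpRequest',
}
# same query parameters as in the question
params = {
    'symbol': 'SBIN', 'segmentLink': '3', 'symbolCount': '2', 'series': 'EQ',
    'dateRange': ' ', 'fromDate': '01-01-2020', 'toDate': '31-12-2020',
    'dataType': 'PRICEVOLUMEDELIVERABLE',
}
response = requests.get(
    'https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp',
    params=params, headers=header, timeout=20)
print(response.status_code)
print(response.text[:500])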

Python script to download file from button on website

I want to download an xls file by clicking the button "Export to excel" from the following url: https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD.
More specifically, the button: name = "ctl00$MainContent$btndata". I've already been able to do this using Selenium, but I plan on building a Docker image with this script and running it as a container, because this xls is regularly updated and I need the most current data on my local machine, and it doesn't make sense to have a browser open that often just to fetch this data. I understand there are headless versions of Chrome and Firefox, although I don't believe they support downloads. I also understand that a simple wget-style GET will not work in this situation because the button is not a static link to the resource. Maybe there's a completely different approach for downloading and updating this data on my computer?
import urllib
import requests
from bs4 import BeautifulSoup
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
    'Origin': 'https://www.tampagov.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
    'Accept-Encoding': 'gzip,deflate,br',
    'Accept-Language': 'en-US,en;q=0.5',
}

class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD'
# first HTTP request without form data
f = myopener.open(url)
soup = BeautifulSoup(f, "html.parser")
# parse and retrieve two vital form values
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
formData = (
    ('__EVENTVALIDATION', eventvalidation),
    ('__VIEWSTATE', viewstate),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('Host', 'apps,tampagov.net'),
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'))
payload = urllib.urlencode(formData)
# second HTTP request with form data
r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", params=payload)
print(r.status_code, r.reason)
First: I removed import urllib because requests is enough.
Some issues you have:
You don't need to create a nested tuple and then apply urllib.urlencode; use a dictionary instead. That is one reason why requests is so popular.
You'd better populate all the parameters for the HTTP POST request, as I did below; otherwise the request may be rejected by the backend.
I added some simple code to save the content locally.
PS: for those form parameters, you can get their values by analysing the HTML returned by the HTTP GET (one generic way to collect them is sketched just after the sample). You can also customize the parameters as you need, like page size etc.
Below is a working sample:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def downloadExcel():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
        'Origin': 'https://www.tampagov.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    r = requests.get("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", headers=headers)
    if not r.status_code == 200:
        print('Error')
        return
    # parse and retrieve two vital form values
    soup = BeautifulSoup(r.content, "html.parser")
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    print('__VIEWSTATE:', viewstate)
    print('__EVENTVALIDATION:', eventvalidation)
    formData = {
        '__EVENTVALIDATION': eventvalidation,
        '__VIEWSTATE': viewstate,
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATEGENERATOR': '49DF2C80',
        'MainContent_RadScriptManager1_TSM': """;;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:59e0a739-153b-40bd-883f-4e212fc43305:ea597d4b:b25378d2;Telerik.Web.UI, Version=2015.2.826.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:c2ba43dc-851e-4009-beab-3032480b6a4b:16e4e7cd:f7645509:24ee1bba:c128760b:874f8ea2:19620875:4877f69a:f46195d3:92fe8ea0:fa31b949:490a9d4e:bd8f85e4:58366029:ed16cbdc:2003d0b8:88144a7a:1e771326:aa288e2d:b092aa46:7c926187:8674cba1:ef347303:2e42e72a:b7778d6c:c08e9f8a:e330518b:c8618e41:e4f8f289:1a73651d:16d8629e:59462f1:a51ee93e""",
        'search_block_form': '',
        'ctl00$MainContent$btndata': 'Export to Excel',
        'ctl00_MainContent_RadWindow1_C_RadGridVehicles_ClientState': '',
        'ctl00_MainContent_RadWindow1_ClientState': '',
        'ctl00_MainContent_RadWindowManager1_ClientState': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl00$PageSizeComboBox': '20',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time$dateInput': '',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_dateInput_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00","lastSetTextBoxValue":""}',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_ClientState': '{"minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00"}',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1address': '',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1address_ClientState': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1case_description': '',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1case_description_ClientState': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_grid': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1report_number': '',
        'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1report_number_ClientState': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_max_date': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_rowcount': '',
        'ctl00$MainContent$RadGrid1$ctl00$ctl03$ctl01$PageSizeComboBox': '20',
        'ctl00_MainContent_RadGrid1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState': '',
        'ctl00_MainContent_RadGrid1_rfltMenu_ClientState': '',
        'ctl00_MainContent_RadGrid1_gdtcSharedTimeView_ClientState': '',
        'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_SD': '[]',
        'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_AD': '[[1900,1,1],[2099,12,31],[2018,3,29]]',
        'ctl00_MainContent_RadGrid1_ClientState': '',
    }
    # second HTTP request with form data
    r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", data=formData, headers=headers)
    print('received:', r.status_code, len(r.content))
    with open(r"C:\Users\xxx\Desktop\test\test\apps.xls", "wb") as handle:
        for data in tqdm(r.iter_content()):
            handle.write(data)
downloadExcel()
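As for the PS above, here is the promised sketch of one generic way to gather those form parameters: collect every hidden input field from the GET response with BeautifulSoup instead of selecting __VIEWSTATE and __EVENTVALIDATION one by one. This is only an illustration; the exact field names and values depend on whatever the page renders at the time.
def collect_hidden_fields(soup):
    # Build a dict of every hidden <input name=... value=...> the server rendered,
    # which is a natural starting point for the POST body.
    return {
        field['name']: field.get('value', '')
        for field in soup.select('input[type=hidden]')
        if field.get('name')
    }
The formData dict could then start from collect_hidden_fields(soup) and be updated with the button name and any filter values you need.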
Find out the URL you need to fetch, as @Sphinx explains, and then simulate the request using something similar to:
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({...})
data = data.encode('ascii')
with urllib.request.urlopen("http://...", data) as fd:
    print(fd.read().decode('utf-8'))
Take a look at the documentation of urllib.

Python scraping response data

I would like to capture the response data from a specific website.
I have this site:
https://enjoy.eni.com/it/milano/map/
and if I open the browser's debug console I can see a POST request that returns a JSON response.
How can I get this response in Python by scraping the website?
Thanks
Apparently the web service validates a PHPSESSID cookie, so we need to obtain it first using a proper User-Agent:
import requests
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
r = requests.get('https://enjoy.eni.com/it/milano/map/', headers=headers)
session_id = r.cookies['PHPSESSID']
headers['Cookie'] = 'PHPSESSID={};'.format(session_id)
res = requests.post('https://enjoy.eni.com/ajax/retrieve_vehicles', headers=headers, allow_redirects=False)
json_obj = json.loads(res.content)
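As a variation on the same idea, a requests.Session would manage the PHPSESSID cookie automatically; a sketch under the same assumptions about the endpoint:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
with requests.Session() as s:
    s.headers.update(headers)
    # the first GET stores PHPSESSID in the session's cookie jar
    s.get('https://enjoy.eni.com/it/milano/map/')
    res = s.post('https://enjoy.eni.com/ajax/retrieve_vehicles', allow_redirects=False)
    json_obj = res.json()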

Simulating an AJAX request with Python using the requests lib

Why does requests not download a response for this webpage?
#!/usr/bin/python
import requests
headers = {
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
    'Referer': 'http://sportsbeta.ladbrokes.com/football',
}
payload = {
    'N': '4294966750',
    'facetCount_156%23327': '12',
    'facetCount_157%23325': '8',
    'form-trigger': 'moreId',
    'moreId': '156%23327',
    'pageId': 'p_football_home_page',
    'pageType': 'EventClass',
    'type': 'ajaxrequest'
}
url='http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController'
r = requests.post(url, data=payload, headers=headers)
These are the parameters of the POST that I see in Firebug, and there the response contains a list of football leagues, yet when I run my Python script like this I get nothing back.
(You can see the request in Firefox by clicking See All in the competitions section of the left-hand nav bar and looking at the XHR in Firebug. The Firebug response shows the HTML body as expected.)
Does anyone have any ideas? Could my handling of the % symbols in the payload be causing any trouble?
EDIT: Attempt using a session
from requests import Request, Session

# turn post string into dict:
def parsePOSTstring(POSTstr):
    paramList = POSTstr.split('&')
    paramDict = dict([param.split('=') for param in paramList])
    return paramDict

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
           'Referer': 'http://sportsbeta.ladbrokes.com/football'
           }
# prep the data (POSTstr copied from Firebug raw source)
POSTstr = ("moreId=156%23327&facetCount_156%23327=12&event=&N=4294966750&pageType=EventClass&"
           "pageId=p_football_home_page&type=ajaxrequest&eventIDNav=&removedSelectionNav=&"
           "currentSelectedId=&form-trigger=moreId")
payload = parsePOSTstring(POSTstr)
# end url
url = 'http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController'
# start a session to manage cookies, and visit the football page first so the referer agrees
s = Session()
s.get('http://sportsbeta.ladbrokes.com/football')
# now visit the desired url with headers/data
r = s.post(url, data=payload, headers=headers)
# print output
print r.text  # this is empty
Working curl
curl 'http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController' \
  -H 'Cookie: JSESSIONID=DE93158F07E02DD3CC1CC32B1AA24A9E.ecomprodsw015; geoCode=FRA; FLAGS=en|en|uk|default|ODDS|0|GBP; ECOM_BETA_SPORTS=1; PLAYED=4%7C0%7C0%7C0%7C0%7C0%7C0' \
  -H 'Referer: http://sportsbeta.ladbrokes.com/football' \
  -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' \
  --data 'facetCount_157%23325=8&moreId=156%23327&facetCount_156%23327=12&event=&N=4294966750&pageType=EventClass&pageId=p_football_home_page&type=ajaxrequest&eventIDNav=&removedSelectionNav=&currentSelectedId=&form-trigger=moreId' \
  --compressed
Yet this curl works.
Here's the smallest working example that I can come up with:
from requests import Session

session = Session()

# HEAD requests ask for *just* the headers, which is all you need to grab the
# session cookie
session.head('http://sportsbeta.ladbrokes.com/football')

response = session.post(
    url='http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController',
    data={
        'N': '4294966750',
        'form-trigger': 'moreId',
        'moreId': '156#327',
        'pageType': 'EventClass'
    },
    headers={
        'Referer': 'http://sportsbeta.ladbrokes.com/football'
    }
)

print response.text
You just weren't decoding the percent-encoded POST data properly, so # was being represented as %23 in the actual POST data (e.g. 156%23327 should've been 156#327).
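To make the decoding step concrete, a small illustration (Python 3 urllib.parse; in the question's Python 2 code the equivalent is urllib.unquote):
from urllib.parse import unquote

# Firebug shows the form data percent-encoded; requests encodes it again,
# so decode the captured values before putting them in the payload dict.
print(unquote('156%23327'))               # -> 156#327
print(unquote('facetCount_156%23327'))    # -> facetCount_156#327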
