Simulating an AJAX request with Python using the requests library

Why does requests not download a response for this webpage?
#!/usr/bin/python
import requests
headers={ 'content-type':'application/x-www-form-urlencoded; charset=UTF-8',
'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
'Referer' : 'http://sportsbeta.ladbrokes.com/football',
}
payload={'N': '4294966750',
'facetCount_156%23327': '12',
'facetCount_157%23325': '8',
'form-trigger':'moreId',
'moreId':'156%23327',
'pageId':'p_football_home_page',
'pageType':'EventClass',
'type':'ajaxrequest'
}
url='http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController'
r = requests.post(url, data=payload, headers=headers)
These are the parameters of the POST that I see in Firebug, and there the response contains a list (of football leagues), yet when I run my Python script like this I get nothing back.
(You can see the request in Firefox by clicking See All in the competitions section of the left-hand nav bar of the link and looking at the XHR in Firebug. The Firebug response shows the HTML body as expected.)
Does anyone have any ideas? Could my handling of the % symbols in the payload be causing any trouble?
EDIT: Attempt using session
from requests import Request, Session
#turn post string into dict:
def parsePOSTstring(POSTstr):
    paramList = POSTstr.split('&')
    paramDict = dict([param.split('=') for param in paramList])
    return paramDict
headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
'Referer' : 'http://sportsbeta.ladbrokes.com/football'
}
#prep the data (POSTstr copied from Firebug raw source)
POSTstr = "moreId=156%23327&facetCount_156%23327=12&event=&N=4294966750&pageType=EventClass&
pageId=p_football_home_page&type=ajaxrequest&eventIDNav=&removedSelectionNav=&
currentSelectedId=&form-trigger=moreId"
payload = parsePOSTstring(POSTstr)
#end url
url='http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController'
#start a session to manage cookies, and visit football page first so referer agrees
s = Session()
s.get('http://sportsbeta.ladbrokes.com/football')
#now visit the desired url with headers/data
r = s.post(url, data=payload, headers=headers)
#print output
print r.text #this is empty
Working curl
curl 'http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController' \
  -H 'Cookie: JSESSIONID=DE93158F07E02DD3CC1CC32B1AA24A9E.ecomprodsw015; geoCode=FRA; FLAGS=en|en|uk|default|ODDS|0|GBP; ECOM_BETA_SPORTS=1; PLAYED=4%7C0%7C0%7C0%7C0%7C0%7C0' \
  -H 'Referer: http://sportsbeta.ladbrokes.com/football' \
  -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' \
  --data 'facetCount_157%23325=8&moreId=156%23327&facetCount_156%23327=12&event=&N=4294966750&pageType=EventClass&pageId=p_football_home_page&type=ajaxrequest&eventIDNav=&removedSelectionNav=&currentSelectedId=&form-trigger=moreId' \
  --compressed
Yet this curl works.

Here's the smallest working example that I can come up with:
from requests import Session
session = Session()
# HEAD requests ask for *just* the headers, which is all you need to grab the
# session cookie
session.head('http://sportsbeta.ladbrokes.com/football')
response = session.post(
url='http://sportsbeta.ladbrokes.com/view/EventDetailPageComponentController',
data={
'N': '4294966750',
'form-trigger': 'moreId',
'moreId': '156#327',
'pageType': 'EventClass'
},
headers={
'Referer': 'http://sportsbeta.ladbrokes.com/football'
}
)
print response.text
You just weren't decoding the percent-encoded POST data properly, so # was being represented as %23 in the actual POST data (e.g. 156%23327 should've been 156#327).
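For reference, here is a minimal sketch of decoding a raw percent-encoded POST string into a dict before handing it to requests. It assumes Python 3, where parse_qsl lives in urllib.parse (on Python 2 it is in the urlparse module); neither is used in the original post.
# Sketch: parse_qsl percent-decodes both keys and values, so 'moreId=156%23327'
# becomes ('moreId', '156#327'); requests then re-encodes the payload itself.
from urllib.parse import parse_qsl

POSTstr = ("moreId=156%23327&facetCount_156%23327=12&event=&N=4294966750&"
           "pageType=EventClass&pageId=p_football_home_page&type=ajaxrequest&"
           "eventIDNav=&removedSelectionNav=&currentSelectedId=&form-trigger=moreId")
payload = dict(parse_qsl(POSTstr, keep_blank_values=True))
print(payload['moreId'])  # -> 156#327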

Related

Python requests GET not getting the JSON payload?

I am trying to get the JSON data from the following URL:
import requests as r
url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
header = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
"X-Requested-With": "XMLHttpRequest"
}
resp = r.get(url, stream=True, timeout=20, headers=header)
j = resp.json()
I do get JSON from this, but when I inspect the request in the browser I can see data in the Response payload section that is not in j.
I have never faced this problem before, and my search led me to questions about POST requests.
I tested it using Postman.
The User-Agent value is your problem; you could simply remove it and it will work.
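A minimal sketch of that suggestion, assuming the endpoint still responds the way it did when this was asked: make the same GET without the User-Agent header.
import requests

url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
# Same request as in the question, but with the User-Agent header removed.
resp = requests.get(url, timeout=20, headers={"X-Requested-With": "XMLHttpRequest"})
print(resp.status_code)
print(resp.json())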
I might be wrong and may have misunderstood the question, but comparing the data shown in the UI with the data returned by the API shows that they are the same:
import json
import requests
from selenium import webdriver
url = 'https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json'
header = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
"X-Requested-With": "XMLHttpRequest"
}
driver = webdriver.Chrome()
driver.get(url)
content = driver.find_element_by_xpath('//pre').text
driver.quit()
response = requests.get(url,
stream=True,
timeout=20,
headers=header
)
print(json.loads(content) == response.json())
assert json.loads(content) == response.json()

GET request is not loading in Python

I am trying to get data from the following website. https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE
I tried the following:
Passing the whole URL to requests:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
Passing the base URL and adding the params:
response = requests.get('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp', params = {'symbol':'SBIN','segmentLink':'3','symbolCount':'2','series':'EQ','dateRange':' ','fromDate':'01-01-2020','toDate':'31-12-2020','dataType':'PRICEVOLUMEDELIVERABLE'})
Using urllib:
f = urllib.request.urlopen('https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=SBIN&segmentLink=3&symbolCount=2&series=EQ&dateRange=+&fromDate=01-01-2020&toDate=31-12-2020&dataType=PRICEVOLUMEDELIVERABLE')
None of the above methods works; they just load indefinitely.
Thanks in advance.
Don't forget to add a User-Agent to the request headers, like this:
header = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
"X-Requested-With": "XMLHttpRequest"
}
response = requests.get('your_url', headers=header)
print(response)
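Putting this together for the original URL, a hedged sketch (the NSE site may additionally require cookies from a prior visit to its homepage, so this is not guaranteed to work unchanged):
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'X-Requested-With': 'XMLHttpRequest',
}
params = {
    'symbol': 'SBIN', 'segmentLink': '3', 'symbolCount': '2', 'series': 'EQ',
    'dateRange': '+', 'fromDate': '01-01-2020', 'toDate': '31-12-2020',
    'dataType': 'PRICEVOLUMEDELIVERABLE',
}
# Same headers as above, with the query string supplied via params.
response = requests.get(
    'https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp',
    params=params, headers=header, timeout=20,
)
print(response.status_code)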

Problems with API access on OLX

Good afternoon.
I am trying to scrape the information for each item found at this link, but when I launch the requests to obtain the links to the information I need, I can't get them. I inspected the page and saw that it is backed by an API, but I couldn't access it. Can someone help me with this? I really don't handle APIs very well.
This is my request to verify access:
url = 'https://www.olx.com.co/api/relevance/search?category=16&facet_limit=100&location=1000001&location_facet_limit=20&page=1&user=1776310a947x4a045a04'
headers = {
'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
'accept': '*/*',
'accept-encoding':'gzip, deflate, br',
'accept-language':'es-ES,es;q=0.9'
}
req = requests.get(url, headers = headers)
req.content
Note: excuse my english
Thank you!
It works fine; just print(req.json()).
import requests
import json
url = 'https://www.olx.com.co/api/relevance/search?category=16&facet_limit=100&location=1000001&location_facet_limit=20&page=1&user=1776310a947x4a045a04'
headers = {
'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
'accept': '*/*',
'accept-encoding':'gzip, deflate, br',
'accept-language':'es-ES,es;q=0.9'
}
req = requests.get(url, headers = headers)
req.json()
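As a small follow-up sketch (the structure of the OLX response is not shown in the question, so the code below only inspects the top-level shape rather than assuming any key names):
# Inspect the top-level structure of the JSON before digging for item links.
data = req.json()
if isinstance(data, dict):
    print(list(data.keys()))
else:
    print(type(data), len(data))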

Scrapy-Splash doesn't set custom request headers

I am trying to scrape a website using Scrapy + Splash in Python 2.7.
The website uses JavaScript to generate most of the HTML, which is why I need Splash.
First, I make a FormRequest with Scrapy to log in to the website. It is successful.
I then extract "access_token" from the JSON response, because it should be used in the next request as an "Authorization" header, to confirm to the website that I am logged in.
jsonresp = json.loads(response.body_as_unicode())
self.token = 'Bearer ' + jsonresp['access_token']
self.my_headers['Authorization'] = self.token
Before proceeding with SplashRequest, I decided to test the session with scrapy.Request. I passed cookies and the new headers:
yield scrapy.Request('https://www.example.com/products', cookies=self.cookies, dont_filter=True, callback=self.parse_pages, headers=self.my_headers)
The HTML from result.body confirmed that I was logged in. Great!
Calling response.request.headers showed that 'Authorization' header was also sent.
{'Accept-Language': ['en-US,en;q=0.5'],
'Accept-Encoding': ['gzip,deflate'],
'Accept': ['application/json, text/plain, */*'],
'User-Agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'],
'Connection': ['keep-alive'],
'Referer': ['https://www.example.com/Web'],
'Cookie': ["___cookies___"],
'Content-Type': ['application/x-www-form-urlencoded'],
'Authorization': ['Bearer Zyb9c20JW0LLJCTA-GmLtEeL9A48se_AviN9xajP8NZVE8r6TddoPHC6dJnmbQ4RCddM8QVJ2v23ey-kq5f8S12uLMXlLF_WzInNI9eaI29WAcIwNK-FixBpDm4Ws3SqXdwBIXfkqYhd6gJs4BP7sNpAKc93t-A4ws9ckpTyih2cHeC8KGQmTnQXLOYch2XIyT5r9verzRMMGHEiu6kgJWK9yRL19PVqCWDjapYbtutKiTRKD1Q35EHjruBJgJD-Fg_iyMovgYkfy9XtHpAEuUvL_ascWHWvrFQqV-19p-6HQPocEuri0Vu0NsAqutfIbi420_zhD8sDFortDmacltNOw-3f6H1imdGstXE_2GQ']}
Cookie DEBUG showed that all cookies were sent without issues.
After that I substituted scrapy.Request with SplashRequest:
yield SplashRequest('https://www.example.com/products', cookies=self.cookies, callback=self.parse_pages, args={"lua_source": lua_script, 'headers':self.my_headers}, endpoint='execute', errback=self.errors)
lua_script:
lua_script = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(2))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
html = splash:html(),
}
end
"""
However, the HTML that I got from Splash response showed that I was not logged in.
Cookie DEBUG didn't show any issues - the same cookies were sent as before.
But here is what I got from calling response.request.headers:
{'Accept-Language': ['en'],
'Accept-Encoding': ['gzip,deflate'],
'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'],
'User-Agent': ['Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'],
'Cookie': ["___cokies___"],
'Content-Type': ['application/json']}
As you can see, Splash didn't set my custom headers; it just combined my cookies with its own default headers.
I tried setting my own headers both as SplashRequest function arguments and inside lua_script, but neither approach worked.
My question is: how do I set my own request headers in Splash?
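One avenue worth trying, offered only as a hedged sketch that has not been verified against this site, is Splash's splash:set_custom_headers, which applies custom headers to subsequent requests instead of relying solely on the headers argument of splash:go. The argument names below mirror the question's spider.
lua_script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    -- apply the custom headers (passed in via args) to every request
    splash:set_custom_headers(splash.args.headers)
    assert(splash:go(splash.args.url))
    assert(splash:wait(2))
    return {html = splash:html()}
end
"""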

Python script to download file from button on website

I want to download an xls file by clicking the button "Export to excel" from the following url: https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD.
More specifically the button: name = "ctl00$MainContent$btndata". I've already been able to do this using Selenium, but I plan on building a Docker image with this script and running it as a container, because this xls is regularly updated and I need the most current data on my local machine, and it doesn't make sense to have a browser open that often just to fetch it. I understand there are headless versions of Chrome and Firefox, although I don't believe they support downloads. I also understand that a plain web GET will not work in this situation, because the button is not a static link to the resource. Maybe there's a completely different approach for downloading and updating this data on my computer?
import urllib
import requests
from bs4 import BeautifulSoup
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
'Origin': 'https://www.tampagov.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'en-US,en;q=0.5',
}
class MyOpener(urllib.FancyURLopener):
version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD'
# first HTTP request without form data
f = myopener.open(url)
soup = BeautifulSoup(f, "html.parser")
# parse and retrieve two vital form values
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
formData = (
('__EVENTVALIDATION', eventvalidation),
('__VIEWSTATE', viewstate),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Encoding', 'gzip, deflate, br'),
('Accept-Language', 'en-US,en;q=0.5'),
('Host', 'apps,tampagov.net'),
('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'))
payload = urllib.urlencode(formData)
# second HTTP request with form data
r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", params=payload)
print(r.status_code, r.reason)
First: I removed import urllib because requests is enough.
Some issues you have:
You don't need to create a nested tuple and then apply urllib.urlencode; use a plain dictionary instead. That convenience is one reason requests is so popular.
You should populate all the parameters for the HTTP POST request, as I did below; otherwise the request may be rejected by the backend.
I added some simple code to save the content locally.
PS: for those form parameters, you can get their values by analyzing the HTML returned by the HTTP GET. You can also customize the parameters as you need, e.g. the page size.
Below is a working sample:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def downloadExcel():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=.08',
        'Origin': 'https://www.tampagov.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    r = requests.get("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", headers=headers)
    # parse and retrieve two vital form values
    if not r.status_code == 200:
        print('Error')
        return
    soup = BeautifulSoup(r.content, "html.parser")
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    print('__VIEWSTATE:', viewstate)
    print('__EVENTVALIDATION:', eventvalidation)
    formData = {
'__EVENTVALIDATION': eventvalidation,
'__VIEWSTATE': viewstate,
'__EVENTTARGET': '',
'__EVENTARGUMENT': '',
'__VIEWSTATEGENERATOR': '49DF2C80',
'MainContent_RadScriptManager1_TSM':""";;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:59e0a739-153b-40bd-883f-4e212fc43305:ea597d4b:b25378d2;Telerik.Web.UI, Version=2015.2.826.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:c2ba43dc-851e-4009-beab-3032480b6a4b:16e4e7cd:f7645509:24ee1bba:c128760b:874f8ea2:19620875:4877f69a:f46195d3:92fe8ea0:fa31b949:490a9d4e:bd8f85e4:58366029:ed16cbdc:2003d0b8:88144a7a:1e771326:aa288e2d:b092aa46:7c926187:8674cba1:ef347303:2e42e72a:b7778d6c:c08e9f8a:e330518b:c8618e41:e4f8f289:1a73651d:16d8629e:59462f1:a51ee93e""",
'search_block_form':'',
'ctl00$MainContent$btndata':'Export to Excel',
'ctl00_MainContent_RadWindow1_C_RadGridVehicles_ClientState':'',
'ctl00_MainContent_RadWindow1_ClientState':'',
'ctl00_MainContent_RadWindowManager1_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl00$PageSizeComboBox':'20',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RDIPFdispatch_time$dateInput':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_dateInput_ClientState':'{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00","lastSetTextBoxValue":""}',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RDIPFdispatch_time_ClientState':'{"minDateStr":"1900-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00"}',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1address':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1address_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1case_description':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1case_description_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_grid':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$RadComboBox1report_number':'',
'ctl00_MainContent_RadGrid1_ctl00_ctl02_ctl02_RadComboBox1report_number_ClientState':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_max_date':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl02$ctl02$FilterTextBox_out_rowcount':'',
'ctl00$MainContent$RadGrid1$ctl00$ctl03$ctl01$PageSizeComboBox':'20',
'ctl00_MainContent_RadGrid1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState':'',
'ctl00_MainContent_RadGrid1_rfltMenu_ClientState':'',
'ctl00_MainContent_RadGrid1_gdtcSharedTimeView_ClientState':'',
'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_SD':'[]',
'ctl00_MainContent_RadGrid1_gdtcSharedCalendar_AD':'[[1900,1,1],[2099,12,31],[2018,3,29]]',
'ctl00_MainContent_RadGrid1_ClientState':'',
    }
    # second HTTP request with form data
    r = requests.post("https://apps.tampagov.net/CallsForService_Webapp/Default.aspx?type=TPD", data=formData, headers=headers)
    print('received:', r.status_code, len(r.content))
    with open(r"C:\Users\xxx\Desktop\test\test\apps.xls", "wb") as handle:
        for data in tqdm(r.iter_content()):
            handle.write(data)

downloadExcel()
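One optional refinement, sketched here as a hypothetical helper that is not part of the answer above: ASP.NET pages often return an HTML error page with status 200, so it can help to check the Content-Type before writing the response to disk as .xls.
def looks_like_excel(response):
    # Heuristic: file downloads usually advertise an Excel or generic binary
    # Content-Type, while error pages come back as text/html.
    ctype = response.headers.get('Content-Type', '').lower()
    return 'excel' in ctype or 'spreadsheet' in ctype or 'octet-stream' in ctype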
Find out the URL you need to fetch, as @Sphinx explains, and then simulate the request using something similar to:
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({...})
data = data.encode('ascii')
with urllib.request.urlopen("http://...", data) as fd:
    print(fd.read().decode('utf-8'))
Take a look at the documentation of urllib.
