I am using Requests to parse some data on a server. However, I keep getting a 503 response. The request headers have cookies in them, but my method does not seem to be handling them properly.
I am also a bit confused as to what I should be doing with cookies and when full stop. The website is http://epgservices.sky.com/nevermiss/and my code is below.
Headers and params look correct when viewed in Google Dev Tools, other than the cookies are missing when I use Requests. Any ideas?
import json
import requests
from urllib3.util import Retry
from requests.adapters import HTTPAdapter
from requests import Session, exceptions
import re
import traceback
from cookielib import LWPCookieJar
class sky_ondemand:
session = requests.Session()
jar = session.cookies
url = 'http://epgservices.sky.com'
movie_path = ''.join(movie_path)
headers = {
'Host': 'epgservices.sky.com',
'Connection': 'keep-alive',
'Accept': 'application/json, text/javascript, */*',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
'Referer': 'http://epgservices.sky.com/never-miss/index.htm',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'
}
params = {
'queryType': 'movieQuery',
'query': '',
'exactMatch': 'false',
'genre': '',
'startsWith': 'all',
'sortBy': 'requested',
'pageNum': '1',
'pageSize': '10',
'src': 'movieLetterButton'
}
r = session.get(url, params=params, headers=headers, cookies=jar)
data = r.content
print(data)
Sorted this if anyone is interested....was nothing to do with the cookies...the url should have been 'http://epgservices.sky.com/tvlistings-proxy/NeverMissProxy/neverMissMovieSearchRequest.json?'
Related
I'm getting a 419 page expired status code when using requests on this site. I gathered the information for the headers and data by monitoring the network tab of the developer console. How can I use the Python requests module to successfully login?
import requests
url = 'https://rates.itgtrans.com/login'
headers = {
'authority': 'rates.itgtrans.com',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'upgrade-insecure-requests': '1',
'origin': 'https://rates.itgtrans.com',
'content-type': 'application/x-www-form-urlencoded',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'referer': 'https://rates.itgtrans.com/login',
'accept-language': 'en-US,en;q=0.9',
'cookie': 'XSRF-TOKEN=eyJpdiI6IkEzbi9JQkVwbWloZTM1UVdSdVJtK0E9PSIsInZhbHVlIjoiM1pxQVYxajhPcWdlZ1NlYlVMSUlyQzFISVpPNjNrMVB0UmNYMXZGa0crSmYycURoem1vR0FzRUMrNjB2bXFPbCs4U3ZyeGM4ZVNLZ1NjRGVmditUMldNUUNmYmVzeTY2WS85VC93a1c0M0JUMk1Jek00TTNLVnlPb2VVRXpiN0ciLCJtYWMiOiJkNjQyMTMwMGRmZmQ4YTg0ZTNhZDgzODQ5M2NiMmE2ODdlYjRlOTIyMWE5Yjg4YzEyMTBjNTI2ODQxY2YxMzNkIiwidGFnIjoiIn0%3D; draymaster_session=eyJpdiI6Im9vUDZabmlYSTY0a1lSNGdYZzZHT0E9PSIsInZhbHVlIjoiMGVVcSs2T3RheGhMeDNVVFJUQjRmb212TkoySVY5eWFjeVNHT1lGWE9sRHdtR3JTa0REZFhMTzNJeisyTjNOZ1hrQnNscWY0dXBheFFaRFhIdDAvUlFMOFdvTFdaOXBoejcwb2ZDNFNMdDZ6MUFxT2dHU3hlNVkxZmpiTnd2Z0QiLCJtYWMiOiIwN2RmZTc1ZDUzYzViYTgzYWU1MjFjNjIxZjYzMzY3MDE0YjI4MDhkMWMwMTVkYmYxYWM2MzQ0ODM1YzRkNDY1IiwidGFnIjoiIn0%3D'
}
data = {
'_token': 'o8jJ4tR3PHkuz5TR2kuoHwBAdHd5RczFx2rlul1C',
'email': '****',
'password': '****',
'button': ''
}
with requests.Session() as s:
cookies = s.cookies
p = s.post(url='https://rates.itgtrans.com/login', data=data, headers=headers, cookies=cookies)
print(p)
As for me all problem is that you always use the same _token.
Server for every user should generate new uniq token which is valid only few minutes - all for security reason (so hacker can't get it and use it after longer time)
BTW: went I run your code and get page with status 419 and display p.text then I see HTML with text Page Expired which can confirm that you use expired token.
You should always GET this page and search new token in HTML
<input name="_token" type="hidden" value="Xz0pJ0djGVnfaRMuXNDGMdBmZRbc55Ql2Q2CTPit"/>
and use this value in POST
I don't have account on this page but using fresh token from <input name="_token"> I get status 200 instead of 419.
import requests
from bs4 import BeautifulSoup
url = 'https://rates.itgtrans.com/login'
headers = {
'authority': 'rates.itgtrans.com',
'cache-control': 'max-age=0',
'origin': 'https://rates.itgtrans.com',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'referer': 'https://rates.itgtrans.com/login',
'accept-language': 'en-US,en;q=0.9',
}
data = {
'_token': '-empty-',
'email': '****',
'password': '****',
'button': ''
}
with requests.Session() as s:
# --- first GET page ---
response = s.get(url='https://rates.itgtrans.com/login', headers=headers)
#print(response.text)
# --- search fresh token in HTML ---
soup = BeautifulSoup(response.text)
token = soup.find('input', {'name': "_token"})['value']
print('token:', token)
# --- run POST with new token ---
data['_token'] = token
response = s.post(url='https://rates.itgtrans.com/login', data=data, headers=headers)
#print(response.text)
print('status_code:', response.status_code)
BTW:
I get 200 even if I don't use headers.
Because code uses Session so I don't have to copy cookies from GET to POST because Session copies them automatically.
In the following code, I am trying to do POST method to microsoft online account, and I am starting with a page that requires to post an email. This is my try till now
import requests
from bs4 import BeautifulSoup
url = 'https://moe-register.emis.gov.eg/account/login?ReturnUrl=%2Fhome%2FRegistrationForm'
headers ={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie':'__RequestVerificationToken=vdS3aPPg5qQ2bH9ADTppeKIVJfclPsMI6dqB6_Ru11-2XJPpLfs7jBlejK3n0PZuYl-CwuM2hmeCsXzjZ4bVfj2HGLs2KOfBUphZHwO9cOQ1; .AspNet.MOEEXAMREGFORM=ekeG7UWLA6OSbT8ZoOBYpC_qYMrBQMi3YOwrPGsZZ_3XCuCsU1BP4uc5QGGE2gMnFgmiDIbkIk_8h9WtTi-P89V7ME6t_mBls6T3uR2jlllCh0Ob-a-a56NaVNIArqBLovUnLGMWioPYazJ9DVHKZY7nR_SvKVKg2kPkn6KffkpzzHOUQAatzQ2FcStZBYNEGcfHF6F9ZkP3VdKKJJM-3hWC8y62kJ-YWD0sKAgAulbKlqcgL1ml6kFoctt2u66eIWNm3ENnMbryh8565aIk3N3UrSd5lBoO-3Qh8jdqPCCq38w3cURRzCd1Z1rhqYb3V2qYs1ULRT1_SyRXFQLrJs5Y9fsMNkuZVeDp_CKfyzM',
'Host': 'moe-register.emis.gov.eg',
'Origin': 'https://moe-register.emis.gov.eg',
'Referer': 'https://moe-register.emis.gov.eg/account/login?ReturnUrl=%2Fhome%2FRegistrationForm',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
with requests.session() as s:
# r = s.post(url)
#soup = BeautifulSoup(r.content, 'lxml')
data = {'EmailAddress': '476731809#matrouh1.moe.edu.eg'}
r_post = s.post(url, data=data, headers=headers, verify=False)
soup = BeautifulSoup(r_post.content, 'lxml')
print(soup)
What I got is the same page that requires the post of the email again. I expected to get the page that requires sign-in password..
This is the starting page
and this is an example of the email that needed to be posted 476731809#matrouh1.moe.edu.eg
** I have tried to use such a code but I got the page sign in again (although the credentials are correct)
Can you please try this code
import requests
from bs4 import BeautifulSoup
url = 'https://login.microsoftonline.com/common/login'
s = requests.Session()
res = s.get('https://login.microsoftonline.com')
cookies = dict(res.cookies)
res = s.post(url,
auth=('476731809#matrouh1.moe.edu.eg', 'Std#050202'),
verify=False,
cookies=cookies)
soup = BeautifulSoup(res.text, 'html.parser')
print(soup)
I checked out the page and following seems to be working:
import requests
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'Origin': 'https://moe-register.emis.gov.eg',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': 'https://moe-register.emis.gov.eg/account/login',
'Accept-Language': 'en-US,en;q=0.9,gl;q=0.8,fil;q=0.7,hi;q=0.6',
}
data = {
'EmailAddress': '476731809#matrouh1.moe.edu.eg'
}
response = requests.post('https://moe-register.emis.gov.eg/account/authenticate', headers=headers, data=data, verify=False)
Your POST endpoint seems to be wrong, since you need to re-direct from /login to /authenticate to proceed with the request (I am on a mac so my user-agent may be different than yours/required, you can change that from the headers variable).
I am trying to scrape a website using POST request to fill the form:
http://www.planning2.cityoflondon.gov.uk/online-applications/search.do?action=advanced
in python, this goes as follow:
import requests
import webbrowser
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'JSESSIONID=OwXG0Hkxj+X9ELygHZa-aLQ5.undefined; _ga=GA1.3.1911942552.',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.planning2.cityoflondon.gov.uk',
'Origin': 'http://www.planning2.cityoflondon.gov.uk',
'Referer': 'http://www.planning2.cityoflondon.gov.uk/online-applications/search.do?action=advanced',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
data = {
'searchCriteria.developmentType': '002',
'date(applicationReceivedStart)': '01/08/2000',
'date(applicationReceivedEnd)': '01/08/2018'
}
url = 'http://www.planning2.cityoflondon.gov.uk/online-applications/advancedSearchResults.do?action=firstPage'
test_file = 'planning_app.html'
with requests.Session() as session:
r = session.post(url, headers = headers, data = data)
with open (test_file, 'w') as file:
file.write(r.text)
webbrowser.open(test_file)
As you can see from the page reopened with webbrowser, this gives an error of outdated cookie.
For this to work I would need to manually go to the webpage, perform a query while opening the inspect panel of google chrome on the network tab, look at the cookie in the requests header and copy paste the cookie in my code. This would work until of course the cookie is expired again.
I tried to automate that retrieval of the cookie by doing the following:
headers_get = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': 'www.planning2.cityoflondon.gov.uk',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
with requests.Session() as session:
c = session.get('http://www.planning2.cityoflondon.gov.uk/online-applications/', headers = headers_get)
headers['Cookie'] = 'JSESSIONID=' + list(c.cookies.get_dict().values())[0]
r = session.post(url, headers = headers, data = data)
with open (test_file, 'w') as file:
file.write(r.text)
webbrowser.open(test_file)
I would expect this to work as it is simply automating what i do manually:
Go to the page of the GET request, get the cookie from it add said cookie to the headers dict of the POST request.
However I still receive the 'server error' page from the POST requests.
Anyone would be able to get an understanding of why this happen?
The requests.post accept cookies name parameter. Using it instead of sending cookies directly in header may fix the problem:
with requests.Session() as session:
c = session.get('http://www.planning2.cityoflondon.gov.uk/online- applications/', headers = headers_get)
# Also, you can set with cookies=session.cookies
r = session.post(url, headers = headers, data = data, cookies=c.cookies)
Basically I suppose there may be some javascript logic on the site, which isn't executed with the use of requests.post. If that's the case, to resolve that you have to use selenium for filling and submitting form.
Please see Dynamic Data Web Scraping with Python, BeautifulSoup which has similar problem - javascript not executed.
The response web page is as below when to slect title and input wordpress.
Here is my python code to pass arguments for get method with python3.
import urllib.request
import urllib.parse
url = 'http://www.it-ebooks.info/'
values = {'q': 'wordpress','type': 'title'}
data = urllib.parse.urlencode(values).encode(encoding='utf-8',errors='ignore')
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' }
request = urllib.request.Request(url=url, data=data,headers=headers,method='GET')
response = urllib.request.urlopen(request)
buff = response.read()
html = buff.decode("utf8")
print(html)
I can't get the desired output web page.
How to pass arguments for get method with urllib in my example?
The data kwarg of urllib.request.Request is only used for POST requests as it modifies the request's body.
GET requests simply use URL parameters, so you should append these to the url:
params = '?q=wordpress&type=title'
url = 'http://www.it-ebooks.info/search/{}'.format(params)
You can of course take the time and generalize this into a generic function.
is better if you use the library called requests
import requests
headers = {
'DNT': '1',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'es-ES,es;q=0.8,en;q=0.6',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Referer': 'http://www.it-ebooks.info/',
'Connection': 'keep-alive',
}
r = requests.get('http://www.it-ebooks.info/search/?q=wordpress&type=title', headers=headers)
print r.content
I am trying to get the HTML page back from sending a POST request:
import httplib
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
headers = {
'Host': 'digitalvita.pitt.edu',
'Connection': 'keep-alive',
'Content-Length': '325',
'Origin': 'https://digitalvita.pitt.edu',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
'Referer': 'https://digitalvita.pitt.edu/index.php',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Cookie': 'PHPSESSID=lvetilatpgs9okgrntk1nvn595'
}
data = {
'action': 'search',
'xdata': '<search id="1"><context type="all" /><results><ordering>familyName</ordering><pagesize>100000</pagesize><page>1</page></results><terms><name>d</name><school>All</school></terms></search>',
'request': 'search'
}
data = urllib.urlencode(data)
print data
req = urllib2.Request('https://digitalvita.pitt.edu/dispatcher.php', data, headers)
response = urllib2.urlopen(req)
the_page = response.read()
soup=BeautifulSoup(the_page)
print soup
Can anyone tell me how to make it work?
Do not specify a Content-Length header, urllib2 calculates it for you. As it is, your header specifies the wrong length:
>>> data = urllib.urlencode(data)
>>> len(data)
319
Without that header the rest of the posted code works fine for me.