I have a problem while trying to save cookies to a file using FileCookieJar's save method. Here is my code:
#!/usr/bin/python
import httplib, cookielib, urllib2, json, time
from datetime import date

class FoN:
    def __init__(self):
        self.cookiefile = "cookies.txt"
        self.cj = cookielib.FileCookieJar(self.cookiefile)

    def login(self, login, password):
        js = json.JSONEncoder().encode({"login": login, "password": password})
        req = urllib2.Request("http://www.example.com/user/login", js)
        res = urllib2.urlopen(req)
        self.cj.extract_cookies(res, req)
        self.cj.save(self.cookiefile, ignore_discard=True)
        # f is a log file opened elsewhere in the script
        f.write("Login: " + login + ", result: " + str(res.read().count("true")) + "\n")
        time.sleep(2)
        return res
So it fails at self.cj.save(self.cookiefile, ignore_discard=True), raising a NotImplementedError exception, which is what the documentation says it should do. But my question is: how do I save cookies to the file then? I even tried wrapping the call in a try clause, but that didn't help at all.
The base FileCookieJar does not implement .save. To get saving, you should use one of the subclasses, like MozillaCookieJar or LWPCookieJar.
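For the code in the question, swapping the jar class should be enough; a minimal sketch of the changed constructor (the rest of the class stays as it was):

import cookielib

class FoN:
    def __init__(self):
        self.cookiefile = "cookies.txt"
        # LWPCookieJar implements save(); the base FileCookieJar raises
        # NotImplementedError, which is the error seen in the question
        self.cj = cookielib.LWPCookieJar(self.cookiefile)

Because the jar was constructed with the filename, the later call can also be shortened to self.cj.save(ignore_discard=True).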
I have written sample code to demonstrate:
auto-handling cookies: cookies in memory
auto-handling cookies: cookies in a file
two supported formats:
LWP
Mozilla
two kinds of operation:
save to file
load from file
code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Function: Handling cookies in Python: auto-handle cookies, save them to a cookie file, and load them back from a file.
          http://www.crifan.com/python_auto_handle_cookie_and_save_to_from_cookie_file
Version:  2013-01-15
Author:   Crifan
Contact:  admin (at) crifan.com
"""
import os
import cookielib
import urllib2

def pythonAutoHandleCookie():
    """
    Demo how to auto handle cookies in Python:
    cookies in memory
    cookies in file:
        save cookies to file
            LWP format
            Mozilla format
        load cookies from file
    """
    print "1. Demo how to auto handle cookies (in memory)"
    cookieJarInMemory = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJarInMemory))
    urllib2.install_opener(opener)
    print "after init, cookieJarInMemory=", cookieJarInMemory  # empty jar

    # !!! from here on, urllib2 will handle cookies automatically
    demoUrl = "http://www.google.com/"
    response = urllib2.urlopen(demoUrl)
    # at this point we have already got the response cookies
    print "after urllib2.urlopen, cookieJarInMemory=", cookieJarInMemory  # now holds the cookies set by the response

    print "2. Demo how to auto handle cookies in file, LWP format"
    cookieFilenameLWP = "localCookiesLWP.txt"
    cookieJarFileLWP = cookielib.LWPCookieJar(cookieFilenameLWP)
    # creates (and saves to) the new cookie file
    cookieJarFileLWP.save()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJarFileLWP))
    urllib2.install_opener(opener)
    # !!! from here on, urllib2 will handle cookies automatically
    demoUrl = "http://www.google.com/"
    response = urllib2.urlopen(demoUrl)
    # update cookies, save cookies into the file
    cookieJarFileLWP.save()
    # for the demo, print the cookies in the file
    print "LWP cookies:"
    print open(cookieFilenameLWP).read(os.path.getsize(cookieFilenameLWP))
    # #LWP-Cookies-2.0
    # Set-Cookie3: PREF="ID=34c1415b570a93ae:FF=0:NW=1:TM=1358236121:LM=1358236121:S=gEVVojW4x37ht5n-"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2015-01-15 07:48:41Z"; version=0
    # Set-Cookie3: NID="67=JI_uEwUm5GDrQ_vCwAp2z_YGU7MdLm5CLMa4CNLF7RQuTDMzrrk1EjRddGcnpoFbht81LaV9spxZQQInf0mPS6lDrvcRqBBL5NOTmy8SwOzA6HWC3iTIo4-o3fO1Udkv"; path="/"; domain=".google.com.hk"; path_spec; domain_dot; expires="2013-07-17 07:48:41Z"; HttpOnly=None; version=0
    # Set-Cookie3: PREF="ID=8f7e4efca89bdb1b:U=f85a4afa4db021aa:FF=2:LD=zh-CN:NW=1:TM=1358236121:LM=1358236121:S=2WR59hDWutdnUJtF"; path="/"; domain=".google.com.hk"; path_spec; domain_dot; expires="2015-01-15 07:48:41Z"; version=0

    print "3. Demo how to auto handle cookies in file, Mozilla format"
    cookieFilenameMozilla = "localCookiesMozilla.txt"
    cookieJarFileMozilla = cookielib.MozillaCookieJar(cookieFilenameMozilla)
    # creates (and saves to) the new cookie file
    cookieJarFileMozilla.save()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJarFileMozilla))
    urllib2.install_opener(opener)
    # !!! from here on, urllib2 will handle cookies automatically
    demoUrl = "http://www.google.com/"
    response = urllib2.urlopen(demoUrl)
    # update cookies, save cookies into the file
    cookieJarFileMozilla.save()
    # for the demo, print the cookies in the file
    print "Mozilla cookies:"
    print open(cookieFilenameMozilla).read(os.path.getsize(cookieFilenameMozilla))
    # # Netscape HTTP Cookie File
    # # http://www.netscape.com/newsref/std/cookie_spec.html
    # # This is a generated file! Do not edit.
    # .google.com TRUE / FALSE 1421308121 PREF ID=0e05040dd979207c:FF=0:NW=1:TM=1358236121:LM=1358236121:S=jcFid2XgXMIhPUPl
    # .google.com.hk TRUE / FALSE 1374047321 NID 67=klMI_Z5ZPWDjUYrWSUHIE_kYI77_ziJaL0kWRoUGThagME86LKY7H-MNa2wAMI_GklIwYcD8t82qPinxzLd4GLDbmWT0OVLCXhRj0wQDC57dTNAsTs4lhVR7Yjvj2tfn
    # .google.com.hk TRUE / FALSE 1421308121 PREF ID=028f8b736db06a9a:U=6ba6d080847c8de6:FF=2:LD=zh-CN:NW=1:TM=1358236121:LM=1358236121:S=_1BcC5v3G0ZojVz8

    print "4. read cookies from file"
    parseAndSavedCookieFile = "parsedAndSavedCookies.txt"
    parsedCookieJarFile = cookielib.MozillaCookieJar(parseAndSavedCookieFile)
    #parsedCookieJarFile = cookielib.MozillaCookieJar(cookieFilenameMozilla)
    print parsedCookieJarFile  # empty jar before load
    parsedCookieJarFile.load(cookieFilenameMozilla)
    print parsedCookieJarFile  # now holds the cookies loaded from the Mozilla-format file

if __name__ == "__main__":
    pythonAutoHandleCookie()
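For readers on Python 3, the same idea carries over directly; a minimal sketch (cookielib became http.cookiejar and urllib2 became urllib.request, everything else is unchanged):

#!/usr/bin/env python3
# Minimal Python 3 sketch of the same approach; only the module names
# changed (cookielib -> http.cookiejar, urllib2 -> urllib.request).
import http.cookiejar
import urllib.request

cookie_jar = http.cookiejar.LWPCookieJar("localCookiesLWP.txt")
cookie_jar.save()  # create the file up front, as in the demo above

opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar))
urllib.request.install_opener(opener)

response = urllib.request.urlopen("http://www.google.com/")
cookie_jar.save()  # persist whatever cookies the response set
print(list(cookie_jar))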
I'm new to Python and have undertaken my first project to automate something for my role (I'm in the network space, so forgive me if this is terrible!).
I'm required to download a .json file from the below link:
https://www.microsoft.com/en-us/download/confirmation.aspx?id=56519
My script goes through and retrieves the manual download link.
The reason I'm getting the URL in this way is that the download link changes every fortnight when MS updates the file.
My preference is to extract the "addressPrefixes" contents for the names "AzureCloud.australiacentral", "AzureCloud.australiacentral2", "AzureCloud.australiaeast", "AzureCloud.australiasoutheast".
I then want to strip out the " and , characters.
Each of the subnet ranges should then reside on a new line and be placed in a text file.
If I run the code below, I'm able to get the output that I'm after.
Am I correct in thinking that I can use a for loop to achieve this? If so, would it be better to use a Python dictionary as opposed to using JSON-formatted output?
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Script to check Azure IPs

# Import modules for script
import requests
import re
import json
import urllib.request

search = r'https://download.*?\.json'
ms_dl_centre = "https://www.microsoft.com/en-us/download/confirmation.aspx?id=56519"
requests_get = requests.get(ms_dl_centre)
json_url_search = re.search(search, requests_get.text)
json_file = json_url_search.group(0)

with urllib.request.urlopen(json_file) as url:
    contents = json.loads(url.read().decode())

print(json.dumps(contents['values'][1]['properties']['addressPrefixes'], indent=0))  # use this to print contents from json entry 1
I'm not convinced that using re to parse HTML is a good idea. BeautifulSoup is better suited to the task. Upon inspection of the HTML response I note that there's a span element of class file-link-view1 that seems to uniquely identify the URL to the JSON download. Assuming that to be a robust approach (i.e. Microsoft doesn't change the way the download URL is presented), then this is how I'd do it:
import requests
from bs4 import BeautifulSoup

namelist = ["AzureCloud.australiacentral", "AzureCloud.australiacentral2",
            "AzureCloud.australiaeast", "AzureCloud.australiasoutheast"]

baseurl = 'https://www.microsoft.com/en-us/download/confirmation.aspx?id=56519'

with requests.Session() as session:
    response = session.get(baseurl)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    downloadurl = soup.find('span', class_='file-link-view1').find('a')['href']
    response = session.get(downloadurl)
    response.raise_for_status()
    json = response.json()

for n in json['values']:
    if n['name'] in namelist:
        print(n['name'])
        for ap in n['properties']['addressPrefixes']:
            print(ap)
@andyknight, thanks for your direction. I'd upvote you, but as I'm a noob the site doesn't permit me to do so.
I've taken the basis of your Python script and added some additional components.
I removed the print statement for the region name in the .txt file, as this file is referenced by a firewall that is looking for IP addresses.
I've added Try/Except/Else around a portion of the script to identify if there is ever an error with reaching the URL, or some other unspecified error. I've leveraged logging to send an email based on the status of the script. If an exception is thrown, I get an email with traceback information; otherwise I receive an email advising the script was successful.
This writes out the specific prefixes for AU regions into a .txt file.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import logging
import logging.handlers
from bs4 import BeautifulSoup

smtp_handler = logging.handlers.SMTPHandler(mailhost=("sanitised.smtp[.]xyz", 25),
                                            fromaddr="UpdateIPs#sanitised[.]xyz",
                                            toaddrs="FriendlyAdmin#sanitised[.]xyz",
                                            subject=u"Check Azure IP Script completion status.")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.addHandler(smtp_handler)

namelist = ["AzureCloud.australiacentral", "AzureCloud.australiacentral2",
            "AzureCloud.australiaeast", "AzureCloud.australiasoutheast"]

baseurl = 'https://www.microsoft.com/en-us/download/confirmation.aspx?id=56519'

with requests.Session() as session:
    response = session.get(baseurl)
    try:
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        downloadurl = soup.find('span', class_='file-link-view1').find('a')['href']
        response = session.get(downloadurl)
        response.raise_for_status()
        json = response.json()
        for n in json['values']:
            if n['name'] in namelist:
                for ap in n['properties']['addressPrefixes']:
                    with open('Check_Azure_IPs.txt', 'a') as file:
                        file.write(ap + "\n")
    except requests.exceptions.HTTPError as e:
        logger.exception(
            "URL is no longer valid, please check the URL that's defined in this script with MS, as this may have changed.\n\n")
    except Exception as e:
        logger.exception("Unknown error has occurred, please review the script")
    else:
        logger.info("Script has run successfully! Azure IPs have been updated.")
Please let me know if you think there is a better way to handle this, otherwise this is marked as answered. I appreciate your help greatly!
I'm trying to access an authenticated site using a cookies.txt file (generated with a Chrome extension) with Python Requests:
import requests, cookielib
cj = cookielib.MozillaCookieJar('cookies.txt')
cj.load()
r = requests.get(url, cookies=cj)
It doesn't throw any error or exception, but yields the login screen, incorrectly. However, I know that my cookie file is valid, because I can successfully retrieve my content using it with wget. Any idea what I'm doing wrong?
Edit:
I'm tracing cookielib.MozillaCookieJar._really_load and can verify that the cookies are correctly parsed (i.e. they have the correct values for the domain, path, secure, etc. tokens). But as the transaction is still resulting in the login form, it seems that wget must be doing something additional (as the exact same cookies.txt file works for it).
MozillaCookieJar inherits from FileCookieJar which has the following docstring in its constructor:
Cookies are NOT loaded from the named file until either the .load() or
.revert() method is called.
So you need to call the .load() method.
Also, as Jermaine Xu noted, the first line of the file needs to contain either the # Netscape HTTP Cookie File or the # HTTP Cookie File string. Files generated by the plugin you use do not contain such a string, so you have to insert it yourself. I raised an appropriate bug at http://code.google.com/p/cookie-txt-export/issues/detail?id=5
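A minimal sketch of that header fix, assuming the exported file is otherwise in valid Netscape format (the prepending logic is illustrative glue, not part of cookielib):

import cookielib

COOKIE_FILE = 'cookies.txt'

# MozillaCookieJar refuses to load a file whose first line is not the
# magic header, so prepend it if the exporting extension left it out.
with open(COOKIE_FILE) as f:
    content = f.read()
if not content.startswith('# Netscape HTTP Cookie File'):
    with open(COOKIE_FILE, 'w') as f:
        f.write('# Netscape HTTP Cookie File\n' + content)

cj = cookielib.MozillaCookieJar(COOKIE_FILE)
cj.load()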
EDIT
Session cookies are saved with 0 in the 5th column. If you don't pass ignore_expires=True to the load() method, all such cookies are discarded when loading from a file.
File session_cookie.txt:
# Netscape HTTP Cookie File
.domain.com TRUE / FALSE 0 name value
Python script:
import cookielib
cj = cookielib.MozillaCookieJar('session_cookie.txt')
cj.load()
print len(cj)
Output:
0
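Loading with ignore_expires=True keeps those session cookies in the jar; a small sketch continuing the example above:

import cookielib

cj = cookielib.MozillaCookieJar('session_cookie.txt')
# ignore_expires=True stops load() from discarding cookies whose
# expiry time (0 here, i.e. the epoch) is already in the past
cj.load(ignore_expires=True)
print len(cj)  # 1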
EDIT 2
Although we managed to get the cookies into the jar above, they are subsequently discarded by cookielib because they still have a value of 0 in the expires attribute. To prevent this, we have to set the expiry time to some future time, like so:
import time

for cookie in cj:
    # set cookie expiry date to 14 days from now
    cookie.expires = time.time() + 14 * 24 * 3600
EDIT 3
I checked both wget and curl, and both use a 0 expiry time to denote session cookies, which means it's the de facto standard. However, Python's implementation uses an empty string for the same purpose, hence the problem raised in the question. I think Python's behavior in this regard should be in line with what wget and curl do, and that's why I raised the bug at http://bugs.python.org/issue17164
I'll note that replacing 0s with empty strings in the 5th column of the input file and passing ignore_discard=True to load() is the alternate way of solving the problem (no need to change the expiry time in this case).
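A sketch of that alternate route, rewriting the file before loading (the file-rewriting code is illustrative, not part of cookielib):

import cookielib

COOKIE_FILE = 'session_cookie.txt'

# Turn '0' expiry fields into empty strings so cookielib parses the
# entries as session cookies instead of cookies that expired in 1970.
fixed_lines = []
with open(COOKIE_FILE) as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        if len(fields) == 7 and fields[4] == '0':
            fields[4] = ''
            line = '\t'.join(fields) + '\n'
        fixed_lines.append(line)
with open(COOKIE_FILE, 'w') as f:
    f.writelines(fixed_lines)

cj = cookielib.MozillaCookieJar(COOKIE_FILE)
# an empty expiry marks the cookie "discard", so keep such cookies at load time
cj.load(ignore_discard=True)
print len(cj)  # 1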
I tried taking into account everything that Piotr Dobrogost had valiantly figured out about MozillaCookieJar but to no avail. I got fed up and just parsed the damn cookies.txt myself and now all is well:
import re
import requests

def parseCookieFile(cookiefile):
    """Parse a cookies.txt file and return a dictionary of key value pairs
    compatible with requests."""
    cookies = {}
    with open(cookiefile, 'r') as fp:
        for line in fp:
            # skip comments and blank lines, which have no cookie fields
            if not re.match(r'^\#', line) and line.strip():
                lineFields = line.strip().split('\t')
                cookies[lineFields[5]] = lineFields[6]
    return cookies

cookies = parseCookieFile('cookies.txt')

import pprint
pprint.pprint(cookies)

r = requests.get('https://example.com', cookies=cookies)
This worked for me:
from http.cookiejar import MozillaCookieJar
from pathlib import Path
import requests
cookies = Path('/Users/name/cookies.txt')
jar = MozillaCookieJar(cookies)
jar.load()
requests.get('https://path.to.site.com', cookies=jar)
<Response [200]>
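If several requests are involved, the loaded jar can also be attached to a requests.Session, so cookies set by later responses accumulate in the same jar; a sketch (the URL is the placeholder from above):

from http.cookiejar import MozillaCookieJar
import requests

jar = MozillaCookieJar('cookies.txt')
jar.load()

session = requests.Session()
# requests works with cookielib-compatible jars here, though its
# native type is RequestsCookieJar
session.cookies = jar
response = session.get('https://path.to.site.com')  # placeholder URL
jar.save(ignore_discard=True)  # persist any cookies the server updated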
I tried editing Tristan's answer to add some info to it, but it seems the SO edit queue is full; therefore I am writing this answer, since I have struggled badly with using existing cookies with Python requests.
First, get the cookies from Chrome. The easiest way would be to use an extension called 'cookies.txt':
https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/related
After downloading those cookies, use the below code to make sure that you are able to parse the file without any issues.
import re, requests, pprint

def parseCookieFile(cookiefile):
    """Parse a cookies.txt file and return a dictionary of key value pairs
    compatible with requests."""
    cookies = {}
    with open(cookiefile, 'r') as fp:
        for line in fp:
            if not re.match(r'^\#', line):
                lineFields = re.findall(r'[^\s]+', line)  # capturing anything that is not whitespace
                try:
                    cookies[lineFields[5]] = lineFields[6]
                except Exception as e:
                    print(e)
    return cookies

cookies = parseCookieFile('cookies.txt')  # replace the filename
pprint.pprint(cookies)
Next, use those cookies with Python requests:
x = requests.get('your__url', verify=False, cookies=cookies)
print (x.content)
This should save you a day of going through different SO posts and trying cookielib and the other methods, which never worked for me.
I finally found a way to make it work (I got the idea by looking at curl's verbose output): instead of loading my cookies from a file, I simply created a dict with the required name/value pairs:
cd = {'v1': 'n1', 'v2': 'n2'}
r = requests.get(url, cookies=cd)
and it worked (although it doesn't explain why the previous method didn't). Thanks for all the help, it's really appreciated.
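For what it's worth, requests ships a helper that produces exactly this kind of dict from a loaded jar, so the pairs don't have to be typed by hand; a sketch using the cookies.txt from the question:

import cookielib
import requests

cj = cookielib.MozillaCookieJar('cookies.txt')
cj.load()

# flatten the jar into the plain {name: value} dict that requests accepts;
# note that domain and path information is lost in the flattening
cd = requests.utils.dict_from_cookiejar(cj)
r = requests.get(url, cookies=cd)  # url as in the question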
I have the following cookie saved by curl (in test.txt, tab-separated, this editor doesn't preserve tabs):
# Netscape HTTP Cookie File
# http://curlm.haxx.se/rfc/cookie_spec.html
# This file was generated by libcurl! Edit at your own risk.
#HttpOnly_my-example.com FALSE / FALSE 0 _rails-root_session test
I'm trying to read it with the following code:
import sys

if sys.version_info < (3,):
    from cookielib import Cookie, MozillaCookieJar
else:
    from http.cookiejar import Cookie, MozillaCookieJar

def load_cookies_from_mozilla(filename):
    ns_cookiejar = MozillaCookieJar()
    ns_cookiejar.load(filename, ignore_discard=True)
    return ns_cookiejar

cookies = load_cookies_from_mozilla("test.txt")
print(len(cookies))
It outputs 0 (unable to read the cookie).
If I manually modify my cookie to the following line (removing the HttpOnly flag and changing the 0 to an empty string for the expiration time; again, tab-separated):
my-example.com FALSE / FALSE _rails-root_session test
then it outputs 1 (successfully read the cookie).
What needs to be done to my python code to read the original cookie line? And preferably to be able to save it in the same format (with HttpOnly flag and with 0 instead of empty string for never-expiring cookie)?
Thanks.
This appears to be an open bug: https://bugs.python.org/issue2190.
This bug report contains a link to a workaround: https://gerrit.googlesource.com/git-repo/+/master/subcmds/sync.py#995
In that linked code, the developer creates a temporary cookies file, removes the "#HttpOnly_" prefixes, and then creates a cookiejar with that temporary file.
import tempfile
import cookielib  # http.cookiejar on Python 3

tmpcookiefile = tempfile.NamedTemporaryFile(mode='w')
tmpcookiefile.write("# HTTP Cookie File\n")  # the magic header MozillaCookieJar expects on the first line
try:
    with open(cookiefile) as f:
        for line in f:
            if line.startswith("#HttpOnly_"):
                line = line[len("#HttpOnly_"):]
            tmpcookiefile.write(line)
    tmpcookiefile.flush()
    cookiejar = cookielib.MozillaCookieJar(tmpcookiefile.name)
    try:
        cookiejar.load()
    except cookielib.LoadError:
        cookiejar = cookielib.CookieJar()
finally:
    tmpcookiefile.close()
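For the file in the question there is a second hurdle: its cookie also has a 0 expiry, so a plain load() still drops it. Combining the prefix-stripping above with the ignore_discard/ignore_expires flags (covered in the next answer) handles both; a sketch, with test.txt as in the question:

import tempfile
import cookielib  # http.cookiejar on Python 3

# copy the file with the "#HttpOnly_" prefixes stripped, since lines
# starting with '#' are otherwise treated as comments and skipped
with open("test.txt") as src, tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:
    for line in src:
        if line.startswith("#HttpOnly_"):
            line = line[len("#HttpOnly_"):]
        tmp.write(line)

cookiejar = cookielib.MozillaCookieJar(tmp.name)
# keep session cookies (0 expiry) instead of discarding them at load time
cookiejar.load(ignore_discard=True, ignore_expires=True)
print(len(cookiejar))  # 1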
I tested your code and modified it, and it works.
First, in the cookie file you have to remove the '#' before your cookie; a leading '#' turns the line into a comment, so the data after it is ignored.
Second, the 0 in the cookie is the expiry time; 0 means "expire now", so you can change the 0 to an empty string or a later time, but I suggest you use the argument ignore_expires=True. The official docs say:
ignore_discard: save even cookies set to be discarded.
ignore_expires: save even cookies that have expired. The file is overwritten if it already exists.
And the resulting code is:
import sys

if sys.version_info < (3,):
    from cookielib import Cookie, MozillaCookieJar
else:
    from http.cookiejar import Cookie, MozillaCookieJar

def load_cookies_from_mozilla(filename):
    ns_cookiejar = MozillaCookieJar()
    ns_cookiejar.load(filename, ignore_discard=True, ignore_expires=True)
    return ns_cookiejar

cookies = load_cookies_from_mozilla("test.txt")
print(len(cookies))
and you can see this link for more detail:
Using cookies.txt file with Python Requests
I am trying to download the CSV files from this page, via a python script.
But when I try to access the CSV file directly by links in my browser, an agreement form is displayed. I have to agree to this form before I am allowed to download the file.
The exact URLs to the CSV files can't be retrieved; a value is sent to the backend DB, which fetches the file, e.g. PERIOD_ID=2013-0:
https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/DataExports/ExportProductionData.aspx?PERIOD_ID=2013-0
I've tried urllib2.urlopen() and .read(), but it leads to the HTML content of the agreement form, not the file content.
How do I write Python code which handles this redirect, then fetches the CSV file and lets me save it to disk?
You need to set the ASP.NET_SessionId cookie. You can find this by using Chrome's Inspect element option in the context menu, or by using Firefox and the Firebug extension.
With Chrome:
Right-click on the webpage (after you've agreed to the terms) and select Inspect element
Click Resources -> Cookies
Select the only element in the list
Copy the Value of the ASP.NET_SessionId element
With Firebug:
Right-click on the webpage (after you've agreed to the terms), and click Inspect Element with Firebug
Click Cookies
Copy the Value of the ASP.NET_SessionId element
In my case, I got ihbjzynwfcfvq4nzkncbviou - it might work for you, if not you need to perform the above procedure.
Add the cookie to your request, and download the file using the requests module (based on an answer by eladc):
import requests

cookies = {'ASP.NET_SessionId': 'ihbjzynwfcfvq4nzkncbviou'}

r = requests.get(
    url=('https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/'
         'DataExports/ExportProductionData.aspx?PERIOD_ID=2013-0'),
    cookies=cookies
)

with open('2013-0.csv', 'wb') as ofile:
    for chunk in r.iter_content(chunk_size=1024):
        ofile.write(chunk)
    ofile.flush()
Here's my suggestion, for automatically applying the server cookies and basically mimicking standard client session behavior.
(Shamelessly inspired by @pope's answer 554580.)
import urllib2
import urllib
from lxml import etree

_TARGET_URL = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/DataExports/ExportProductionData.aspx?PERIOD_ID=2013-0'
_AGREEMENT_URL = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/Welcome/Agreement.aspx'
_CSV_OUTPUT = 'urllib2_ProdExport2013-0.csv'

class _MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        print 'Follow redirect...'  # Any cookie manipulation in-between redirects should be implemented here.
        return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

cookie_processor = urllib2.HTTPCookieProcessor()

opener = urllib2.build_opener(_MyHTTPRedirectHandler, cookie_processor)
urllib2.install_opener(opener)

response_html = urllib2.urlopen(_TARGET_URL).read()
print 'Cookies collected:', cookie_processor.cookiejar

page_node, submit_form = etree.HTML(response_html), {}  # ElementTree node + dict for storing hidden input fields.
for input_name in ['ctl00$MainContent$AgreeButton', '__EVENTVALIDATION', '__VIEWSTATE']:  # Form `input` fields used on the ``Agreement.aspx`` page.
    submit_form[input_name] = page_node.xpath('//input[@name="%s"][1]' % input_name)[0].attrib['value']
    print 'Form input \'%s\' found (value: \'%s\')' % (input_name, submit_form[input_name])

# Submit the agreement form back to ``_AGREEMENT_URL``, which redirects to the CSV download at ``_TARGET_URL``.
csv_output = opener.open(_AGREEMENT_URL, data=urllib.urlencode(submit_form)).read()
print csv_output

with open(_CSV_OUTPUT, 'wb') as f:  # Dump the CSV output to ``_CSV_OUTPUT``.
    f.write(csv_output)
Good luck!
[Edit]
On the why of things, I think @Steinar Lima is correct with respect to requiring a session cookie. But unless you've already visited the Agreement.aspx page and submitted a response via the provider's website, the cookie you copy from the browser's web inspector will only result in another redirect to the Welcome to the PA DEP Oil & Gas Reporting Website welcome page, which of course defeats the whole point of having a Python script do the job for you.