I am trying to download multiple NetCDF4 files from GES DISC, but I seem to be having trouble with the authorization.
'fpath' is the location of the NetCDF4 file. If I paste it into the address bar, a pop-up box for 'https://urs.earthdata.nasa.gov' appears, requiring a username and password. If they are entered successfully, the file downloads. However, using 'fpath' in requests.get() does not work.
requests.get() successfully connects if I use 'https://urs.earthdata.nasa.gov' instead of fpath, but then I cannot download the NetCDF4 file.
I've tried the solution mentioned here, but no luck.
Any help would be appreciated.
Code example below:
import requests
from requests.auth import HTTPBasicAuth
from datetime import timedelta, date

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2016, 1, 1)
end_date = date(2016, 1, 2)

for single_date in daterange(start_date, end_date):
    YYYY = single_date.strftime("%Y")
    MM = single_date.strftime("%m")
    DD = single_date.strftime("%d")
    fpath1 = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2I1NXASM.5.12.4/' + YYYY + '/' + MM + '/'
    fpath2 = 'MERRA2_400.inst1_2d_asm_Nx.' + YYYY + MM + DD + '.nc4.nc?'
    fpath3 = 'U2M[0:23][94:160][469:534],TROPT[0:23][94:160][469:534],TROPPB[0:23][94:160][469:534],' \
             'T2M[0:23][94:160][469:534],TQL[0:23][94:160][469:534],TOX[0:23][94:160][469:534],' \
             'PS[0:23][94:160][469:534],V50M[0:23][94:160][469:534],DISPH[0:23][94:160][469:534],' \
             'TO3[0:23][94:160][469:534],TS[0:23][94:160][469:534],T10M[0:23][94:160][469:534],' \
             'TROPPT[0:23][94:160][469:534],TQI[0:23][94:160][469:534],SLP[0:23][94:160][469:534],' \
             'TQV[0:23][94:160][469:534],V2M[0:23][94:160][469:534],TROPQ[0:23][94:160][469:534],' \
             'V10M[0:23][94:160][469:534],U50M[0:23][94:160][469:534],U10M[0:23][94:160][469:534],' \
             'QV2M[0:23][94:160][469:534],TROPPV[0:23][94:160][469:534],' \
             'QV10M[0:23][94:160][469:534],time,lat[94:160],lon[469:534]'
    fpath = fpath1 + fpath2 + fpath3
    print(fpath)

    # This successfully connects
    # response = requests.get('https://urs.earthdata.nasa.gov', auth=HTTPBasicAuth('username', 'password'))
    # print(response)

    # This one does not
    response = requests.get(fpath, auth=HTTPBasicAuth('username', 'password'))
    print(response)
Note: anyone can create a free account to access this data by going to this website.
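One way to see why basic auth works against the URS host but not against the data URL is to print the redirect chain: the GES DISC server redirects to urs.earthdata.nasa.gov, and requests drops the Authorization header whenever a redirect crosses to a different hostname. A minimal diagnostic sketch, assuming the fpath and placeholder credentials from the code above:

import requests
from requests.auth import HTTPBasicAuth

# fpath is one of the OPeNDAP URLs built above; 'username'/'password' are placeholders.
response = requests.get(fpath, auth=HTTPBasicAuth('username', 'password'))

# Print each hop; the chain typically bounces through https://urs.earthdata.nasa.gov.
for hop in response.history:
    print(hop.status_code, hop.url)
print(response.status_code, response.url)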
Thank you @Stovfl for pointing me in the right direction.
The guidance led me to this website, which contained information on how to set up a session for Earthdata.
The updated, complete code is below:
import requests
from datetime import timedelta, date

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2016, 1, 1)
end_date = date(2019, 7, 31)

# ***********************
# overriding requests.Session.rebuild_auth to maintain headers when redirected
# ***********************
class SessionWithHeaderRedirection(requests.Session):
    AUTH_HOST = 'urs.earthdata.nasa.gov'

    def __init__(self, username, password):
        super().__init__()
        self.auth = (username, password)

    # Overrides from the library to keep headers when redirected to or from the NASA auth host.
    def rebuild_auth(self, prepared_request, response):
        headers = prepared_request.headers
        url = prepared_request.url
        if 'Authorization' in headers:
            original_parsed = requests.utils.urlparse(response.request.url)
            redirect_parsed = requests.utils.urlparse(url)
            if (original_parsed.hostname != redirect_parsed.hostname) and \
                    redirect_parsed.hostname != self.AUTH_HOST and \
                    original_parsed.hostname != self.AUTH_HOST:
                del headers['Authorization']
        return

# create session with the user credentials that will be used to authenticate access to the data
username = "USERNAME"
password = "PASSWORD"
session = SessionWithHeaderRedirection(username, password)

# ***********************
# Loop through Files
# ***********************
for single_date in daterange(start_date, end_date):
    YYYY = single_date.strftime("%Y")
    MM = single_date.strftime("%m")
    DD = single_date.strftime("%d")
    fpath1 = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2I1NXASM.5.12.4/' + YYYY + '/' + MM + '/'
    fpath2 = 'MERRA2_400.inst1_2d_asm_Nx.' + YYYY + MM + DD + '.nc4.nc?'
    fpath3 = 'U2M[0:23][94:160][469:534],TROPT[0:23][94:160][469:534],TROPPB[0:23][94:160][469:534],' \
             'T2M[0:23][94:160][469:534],TQL[0:23][94:160][469:534],TOX[0:23][94:160][469:534],' \
             'PS[0:23][94:160][469:534],V50M[0:23][94:160][469:534],DISPH[0:23][94:160][469:534],' \
             'TO3[0:23][94:160][469:534],TS[0:23][94:160][469:534],T10M[0:23][94:160][469:534],' \
             'TROPPT[0:23][94:160][469:534],TQI[0:23][94:160][469:534],SLP[0:23][94:160][469:534],' \
             'TQV[0:23][94:160][469:534],V2M[0:23][94:160][469:534],TROPQ[0:23][94:160][469:534],' \
             'V10M[0:23][94:160][469:534],U50M[0:23][94:160][469:534],U10M[0:23][94:160][469:534],' \
             'QV2M[0:23][94:160][469:534],TROPPV[0:23][94:160][469:534],' \
             'QV10M[0:23][94:160][469:534],time,lat[94:160],lon[469:534]'
    url = fpath1 + fpath2 + fpath3
    # print(url)

    # extract the filename from the url to be used when saving the file
    filename = 'MERRA2_400.inst1_2d_asm_Nx.' + YYYY + MM + DD + '.nc4.nc'
    print(filename)

    try:
        # submit the request using the session
        response = session.get(url, stream=True)
        print(response.status_code)

        # raise an exception in case of http errors
        response.raise_for_status()

        # save the file
        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                fd.write(chunk)
    except requests.exceptions.HTTPError as e:
        # handle any errors here
        print(e)
This is a side project I am doing as I am attempting to learn Python.
I am trying to write a Python script that will iterate through a date range and use each returned date in a GET request URL.
The URL uses a LastModified parameter and limits GET requests to a 24-hour period, so I would like to run the GET request for each day from the start date.
Below is what I have currently. The major issue I am having is how to separate the returned dates so that I can use each one separately in the GET; the GET will also need to be looped to use each date, I suppose.
Any pointer in the right direction would be helpful, as I am trying to learn as much as possible.
import datetime

start_date = datetime.date(2020, 1, 1)
end_date = datetime.date.today()
delta = datetime.timedelta(days=1)

while start_date <= end_date:
    last_mod = start_date + delta
    print(last_mod)
    start_date += delta
import requests
from requests.auth import HTTPBasicAuth

vend_key = 'REDACTED'
user_key = 'REDACTED'
metrc_license = 'A12-0000015-LIC'
base_url = 'https://sandbox-api-ca.metrc.com'
last_mod_date = ''

a = HTTPBasicAuth(vend_key, user_key)

def get(path):
    url = '{}/{}/?licenseNumber={}&lastModifiedStart={}'.format(base_url, path, metrc_license, last_mod_date)
    print('URL:', url)
    r = requests.get(url, auth=a)
    print("The server response is: ", r.status_code)
    if r.status_code == 200:
        return r.json()
    # Would like an elif that is r.status_code is 500 wait _ seconds and try again
    elif r.status_code == 500:
        print("500 error, try again.")
    else:
        print("Error")

print(get('/packages/v1/active'))
Here is an example return from the current script. I do not need it to return each date, so I can remove that print, but how can I make each date from the loop its own variable to use in a loop of the GET?
2020-01-02
2020-01-03
2020-01-04
2020-01-05
2020-01-06
etc...
etc...
etc...
2020-05-24
2020-05-25
2020-05-26
2020-05-27
URL: https://sandbox-api-ca.metrc.com//packages/v1/active/?licenseNumber=A12-0000015-LIC&lastModifiedStart=2020-05-27
The server response is: 200
[]
It's super simple: you need to move the while loop that generates all these dates into your get() function. Here is what I mean:
import datetime
import requests
from requests.auth import HTTPBasicAuth

vend_key = 'REDACTED'
user_key = 'REDACTED'
metrc_license = 'A12-0000015-LIC'
base_url = 'https://sandbox-api-ca.metrc.com'

a = HTTPBasicAuth(vend_key, user_key)

def get(path):
    start_date = datetime.date(2020, 1, 1)
    end_date = datetime.date.today()
    delta = datetime.timedelta(days=1)
    results = []
    while start_date <= end_date:
        last_mod_date = start_date + delta
        print(last_mod_date)
        start_date += delta
        url = '{}/{}/?licenseNumber={}&lastModifiedStart={}'.format(base_url, path, metrc_license, last_mod_date)
        print('URL:', url)
        r = requests.get(url, auth=a)
        print("The server response is: ", r.status_code)
        if r.status_code == 200:
            # collect each day's result rather than returning immediately,
            # so the loop actually runs for every date
            results.append(r.json())
        # Would like an elif that is r.status_code is 500 wait _ seconds and try again
        elif r.status_code == 500:
            print("500 error, try again.")
        else:
            print("Error")
    return results

print(get('/packages/v1/active'))
One thing you could do is call your get function inside the while loop. First, modify the get function to take a new parameter, date, and use this parameter when you build your URL.
For instance:
def get(path, date):
    url = '{}/{}/?licenseNumber={}&lastModifiedStart={}'.format(base_url, path, metrc_license, date)
    ...
And then call get inside the while loop.
while start_date <= end_date:
    last_mod = start_date + delta
    get(some_path, last_mod)
    start_date += delta
This would make a lot of GET requests in a short period of time, so you might want to be careful not to overload the server with requests.
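A rough sketch of both ideas (pacing the calls, plus the retry-on-500 behaviour the question's comment asks for) might look like the following; it reuses base_url, metrc_license, a, start_date, end_date and delta from the question's script, get_with_retry is a hypothetical helper, and the sleep lengths and retry count are arbitrary:

import time
import requests

def get_with_retry(path, date, retries=3):
    # Request one day's data, waiting and retrying when the server returns 500.
    for attempt in range(retries):
        url = '{}/{}/?licenseNumber={}&lastModifiedStart={}'.format(base_url, path, metrc_license, date)
        r = requests.get(url, auth=a)
        if r.status_code == 200:
            return r.json()
        if r.status_code == 500:
            print("500 error, waiting before retrying...")
            time.sleep(5)  # arbitrary back-off between retries
        else:
            print("Error", r.status_code)
            return None
    return None

while start_date <= end_date:
    last_mod = start_date + delta
    print(last_mod, get_with_retry('/packages/v1/active', last_mod))
    start_date += delta
    time.sleep(1)  # small pause between days so the API is not hammered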
Okay, here is my code:
from lxml import html
from lxml import etree
from selenium import webdriver
import calendar
import math
import urllib.parse
import progressbar
import requests
Using Selenium:
path_to_driver = '/home/vladislav/Shit/geckodriver'
browser = webdriver.Firefox(executable_path = path_to_driver)
Create a dict where I store data, and create progress bars:
DataDict = {}
barY = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barM = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barW = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
Form the parameters in a loop, construct a URL from them, and send a browser.get request:
for year in range(2014, 2016):
    barY.update(year)
    for month in range(1, 13):
        barM.update(month)
        weeks = math.ceil(calendar.monthrange(year, month)[1] / 4)
        for week in range(weeks):
            barW.update(week)
            if week > 2:
                start_day = 22
                end_day = calendar.monthrange(year, month)[1]
            else:
                start_day = 7 * week + 1
                end_day = 7 * (week + 1)
            start_date = str(year) + '-' + str(month).zfill(2) + '-' + str(start_day).zfill(2)
            end_date = str(year) + '-' + str(month).zfill(2) + '-' + str(end_day).zfill(2)
            params = {'end-date': end_date, 'start-date': start_date}
            url = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'
            url = url + ('&' if urllib.parse.urlparse(url).query else '?') + urllib.parse.urlencode(params)
The request itself:
browser.get(url)
try:
    news_list = browser.find_element_by_class_name('news-list')
    news_list_text = news_list.text
    news_list_text = news_list_text.split('\n')
    for i in range(int(len(news_list_text) / 2)):
        DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
    print("Found! Adding news to the dictionary!")
except:
    pass
But after 2-4 requests it just freezes :(
What's the problem?
Okay, the problem was an advertising banner, which appeared after several requests. The solution is just to wait (time.sleep) until the banner disappears, and then send the request again:
import time  # needed for the sleep below

try:
    browser.get(url)
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
    time.sleep(10)
except:
    print("perhaps this shitty AD?")
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
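A possibly more robust variant of the same idea is to use Selenium's explicit waits instead of a fixed time.sleep, so the loop only pauses while the page is actually blocked. A minimal sketch, assuming the same browser, url and DataDict as above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser.get(url)
try:
    # Wait up to 30 seconds for the news list to appear instead of sleeping blindly.
    news_list = WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'news-list'))
    )
    lines = news_list.text.split('\n')
    for i in range(len(lines) // 2):
        DataDict.update({lines[2 * i]: lines[2 * i + 1]})
except Exception:
    # The element never appeared, e.g. the banner never went away.
    pass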
When using pandas-datareader with Yahoo, if I set start and end to the same date, I get no information returned when I ask on that date. If I ask a day later, it works. But I want today's close today.
import sys
from sqlalchemy import *
import os
import datetime
import pandas_datareader.data as web

end = datetime.datetime(2015, 10, 15)
start = datetime.datetime(2015, 10, 15)
path = 'c:\\python34\\myprojects\\msis\\'
try:
    os.mkdir(path)
except:
    pass
fname = path + 'test.txt'
fhand = open(fname, 'w')
engine = create_engine('mysql+mysqlconnector://root:#localhost /stockinfo')
connection = engine.connect()
result1 = engine.execute("select symbol from equities where daily = 'Y'")
for sqlrow in result1:
    try:
        info = web.DataReader(sqlrow[0], 'yahoo', start, end)
        print(info)
        close = info['Close'].ix['2015-10-14']
        print("=========================" + str(round(close, 4)))
        answer = "Closing price for " + sqlrow[0] + " is " + str(round(close, 4)) + "\n"
    except:
        answer = "No success for " + sqlrow[0] + "\n"
    fhand.write(answer)
    # result2 = engine.execute("update holdings set lasrprice = " + round(close,4) + " where symbol = '" + sqlrow[0] + "'")
    # result2.close()
result1.close()
fhand.close()
The code takes the second "except" route.
What am I doing wrong/what is happening?
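Given the observation that asking a day later works, one thing worth trying (a sketch, not a confirmed fix) is to make end one day after start, and to drop the bare except while debugging so the real error from the data source is visible; 'AAPL' below is just a stand-in for one of the symbols from the database:

import datetime
import pandas_datareader.data as web

start = datetime.datetime(2015, 10, 15)
end = start + datetime.timedelta(days=1)  # end one day later so the 15th is included

# No try/except here, so any underlying error is raised and shown directly.
info = web.DataReader('AAPL', 'yahoo', start, end)  # 'AAPL' is only an example symbol
print(info)
print(info['Close'].loc['2015-10-15'])  # .loc on newer pandas; the post used .ix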
I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter; however, after compiling it (PyInstaller or py2exe) it fails. It appears that both mechanize and requests fail to keep the session alive.
I have hidden my username and password here, but I did put them in correctly in the compiled code.
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}

session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()

resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
       url[url.find("?") + 1:]

# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50

links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)

# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))

epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="

# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the level of the epNum variable, which, as I figured, means the HTML page is not the one I requested. On Linux it works both as a script and compiled; on Windows it works as a script but fails when compiled.
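One hedged way to confirm that suspicion is to dump what the compiled build actually receives to files and inspect them, e.g. to see whether it is still the login page. The sketch below reuses session, payload and url from the code above; the output file names are arbitrary:

login_resp = session.post("http://website.net/login.do", data=payload)
with open("login_response.html", "wb") as f:
    f.write(login_resp.content)

page_resp = session.get(url)
with open("page_response.html", "wb") as f:
    f.write(page_resp.content)

logging.warning("login status %s, page status %s", login_resp.status_code, page_resp.status_code)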
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it's present on the PC?
I have this simple Python code, which returns the content of a URL and stores the result as a JSON text file named "file", but it keeps returning an empty result.
What am I doing wrong here? It is just simple code; I am so disappointed.
I have included all the imports needed: import facebook, import requests, and import json.
url ="https://graph.facebook.com/search?limit=5000&type=page&q=%26&access_token=xx&__after_id=139433456868"
content = requests.get(url).json()
file = open("file.txt" , 'w')
file.write(json.dumps(content, indent=1))
file.close()
But it keeps returning an empty result to me. What am I missing here?
Here is the result:
"data": []
Any help, please?
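A small diagnostic sketch that may help narrow this down: print the HTTP status and the whole parsed body, since Graph API errors and paging cursors would appear alongside the empty "data" list; checking for an "error" key is an assumption about the typical response shape:

import requests
import json

url = "https://graph.facebook.com/search?limit=5000&type=page&q=%26&access_token=xx&__after_id=139433456868"
resp = requests.get(url)
print(resp.status_code)

content = resp.json()
# Dump everything, not just "data"; an "error" or "paging" key would explain the empty list.
print(json.dumps(content, indent=1))
if "error" in content:
    print("Graph API error:", content["error"])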
It's working fine:
import urllib2
accesstoken="CAACEdEose0cBACF6HpTDEuVEwVnjx1sHOJFS3ZBQZBsoWqKKyFl93xwZCxKysqsQMUgOZBLjJoMurSxinn96pgbdfSYbyS9Hh3pULdED8Tv255RgnsYmnlxvR7JZCN7X25zP6fRnRK0ZCNmChfLPejaltwM2JGtPGYBQwnmAL9tQBKBmbZAkGYCEQHAbUf7k1YZD"
urllib2.urlopen("https://graph.facebook.com/search?limit=5000&type=page&q=%26&access_token="+accesstoken+"&__after_id=139433456868").read()
I think you have not requested an access token before making the request.
How do you find the access token?
def getSecretToken(verification_code):
    token_url = ("https://graph.facebook.com/oauth/access_token?" +
                 "client_id=" + app_id +
                 "&redirect_uri=" + my_url +
                 "&client_secret=" + app_secret +
                 "&code=" + verification_code)
    response = requests.get(token_url).content
    params = {}
    result = response.split("&", 1)
    print result
    for p in result:
        (k, v) = p.split("=")
        params[k] = v
    return params['access_token']
How do you get that verification code?
verification_code = ""
if "code" in request.query:
    verification_code = request.query["code"]
if not verification_code:
    dialog_url = ("http://www.facebook.com/dialog/oauth?" +
                  "client_id=" + app_id +
                  "&redirect_uri=" + my_url +
                  "&scope=publish_stream")
    return "<script>top.location.href='" + dialog_url + "'</script>"
else:
    access_token = getSecretToken(verification_code)