I am trying to write a parser based on the `urllib` and `beautifulsoup` libraries, but I don't understand why I sometimes get status 200 and sometimes 404 for a request to the same URL. Moreover, a URL that `urllib` requested and that returned 404 opens perfectly well when visited manually in a browser.
Could anyone explain that behavior?
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

url = 'https://zakupki.gov.ru/epz/order/extendedsearch/results.html'
params = {'searchString': 'Сакубитрил',
          'morphology': 'on',
          'pageNumber': 1,
          'sortDirection': 'false',
          'recordsPerPage': '_10',
          'showLotsInfoHidden': 'false',
          'sortBy': 'UPDATE_DATE',
          'fz44': 'on',
          'fz223': 'on',
          'af': 'on',
          'ca': 'on',
          'pc': 'on',
          'pa': 'on',
          'currencyIdGeneral': -1,
          'publishDateFrom': '01.02.2021',
          'publishDateTo': '21.02.2021'}

def parser(url, params):
    attempt = 0
    while attempt < 10:
        try:
            data = urllib.parse.urlencode(params)
            full_url = url + '?' + data
            with urllib.request.urlopen(full_url, timeout=10) as response:
                the_page = response.read()
                soup = BeautifulSoup(the_page, 'html.parser')
                return soup
        except Exception:  # don't forget to replace Exception with something more specific
            attempt += 1
            time.sleep(5)
            continue

data = parser(url, params=params)
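One thing worth ruling out (an assumption on my part, since this thread doesn't record a confirmed cause): sites like this often reject requests that look automated, and the default urllib User-Agent ("Python-urllib/3.x") is a common trigger, which would explain why the same URL works in a browser. A minimal sketch that sends a browser-like User-Agent and surfaces the real HTTP status instead of swallowing it in the bare retry:

import urllib.error
import urllib.request

# Hypothetical debugging variant: browser-like headers, explicit error logging
def fetch(full_url):
    req = urllib.request.Request(
        full_url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            return response.read()
    except urllib.error.HTTPError as e:
        print('HTTP error:', e.code)  # shows whether it really is a 404
        raise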
I'm new to Python and would like to scrape JSON data from
https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]=1&filter[entity.names]=*&filter[entity.legalAddress.country]=DE without hard-coding the number of pages.
Below is the code that I use, and it works:
import numpy as np
import pandas as pd
import requests

dfs = []
for i in np.arange(1, 20000):
    try:
        URL = f'https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]={i}&filter[entity.names]=*&filter[entity.legalAddress.country]=DE'
        r = requests.get(URL, proxies=proxies).json()  # proxies is defined elsewhere
        v = pd.json_normalize(r['data'])
        dfs.append(v)
        print(f'Page {i}: Done')
    except Exception as e:
        print(f'Page {i}: Error', e)
        break
Here is the response:
URL = f'https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]=1&filter[entity.names]=*&filter[entity.legalAddress.country]=DE'
r = requests.get(URL, proxies=proxies).json()
print(r)
And below is the output:
{'meta': {'goldenCopy': {'publishDate': '2020-09-17T00:00:00Z'}, 'pagination': {'currentPage': 1, 'perPage': 200, 'from': 1, 'to': 200, 'total': 139644, 'lastPage': 699}},
Question: how can I store 'lastPage' = 699 in a variable?
The goal would be to use it in the following loop:
for i in np.arange(1, lastPage):
    ....
Thanks to anyone helping!
lastPage = r.get('meta').get('pagination').get('lastPage')
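One caveat worth adding (my observation, not part of the original answer): np.arange(1, lastPage) stops at lastPage - 1, so the final page would be skipped; the upper bound needs a + 1. A minimal sketch putting it together:

import numpy as np
import pandas as pd
import requests

base = ('https://api.gleif.org/api/v1/lei-records?page[size]=200'
        '&page[number]={}&filter[entity.names]=*'
        '&filter[entity.legalAddress.country]=DE')

# Fetch page 1 once to learn the total page count
first = requests.get(base.format(1)).json()
lastPage = first['meta']['pagination']['lastPage']

dfs = [pd.json_normalize(first['data'])]
for i in np.arange(2, lastPage + 1):  # + 1 so the last page is included
    r = requests.get(base.format(i)).json()
    dfs.append(pd.json_normalize(r['data']))
result = pd.concat(dfs, ignore_index=True)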
I'm using a publicly available method to get the lyrics for tracks via the Musixmatch API. The method has this part:
json_obj = json.loads(raw.decode('utf-8'))
body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
print(json_obj) prints in this format if the track has lyrics associated with it:
{'message': {'header': {'status_code': 200, 'execute_time': 0.0032379627227783}, 'body': {'lyrics': {'lyrics_id': 1290130, 'can_edit': 0, 'locked': 0, 'action_requested': '', 'verified': 0, 'restricted': 1, 'instrumental': 0, 'explicit': 0, 'lyrics_body': '', 'lyrics_language': 'en', 'lyrics_language_description': 'English', 'script_tracking_url': 'https://tracking.musixmatch.com/t1.0/m_js/e_1/sn_0/l_1290130/su_0/tr_3vUCAELvdbv2u68t6EhkMnd-8HjFVDR8ySJtVDvx1wu2t7bIHzQn4QjBnewHmPfaFosJFYkWv3dp_LyRwBhUrXjOsX9TA-28yzdsLebCvaG9shILvno8F94Mm6EuA6PvOiTKdFVL6hW2DSHd2uQQbOF-URNHli0NCOG2mnJS7i_pRvQM6jbzTs8GWGmfkz6SPG6SiRAw7OD1_ALUcvL6DWgqTDzzQyn1mxFzSyUWB5ddJTf3oTmgBoRKQKJJqiO6h2S6T40I73gio9TzqXUMeTjB7jKbL7pm-ma6s4zx-uAlRzVKvyEMxjJGks6p2CKu_NE3MZ26VcV4EVhaL0QcXsmVW49T064RtrZFc4qY9gqHoc_wreBVUPoftX1SWDPGzG4HZOiRXpj-V_YIvaLA2QAVsAVcBkwi9Zmb0onjsHTbknSmzO-6kcez0zrFc44LZOQ45of1eATGIVI/', 'pixel_tracking_url': 'https://tracking.musixmatch.com/t1.0/m_img/e_1/sn_0/l_1290130/su_0/tr_3vUCAAVtFUMcReVeKMMElPrsIdyf5IcOQyLMIduxSpy9U549RTNLfGorynAbS03DhldhcSFwWs_f9EiFdRepDw_HGQFZC6-hNtxts2M9eU1uD2Ga8RXdRq9pgm29WoAj8zcfH45WsO-tAfyA3oajqe-momfl2S4n51WgrBj4RxpmZ3DhrilGRO1Rhq5rqTyXRLG86fM7Hi0Bh2hXrXg115SiPywxjuMzLR2DMmGvu_Q--jjtFU38jTurZCsdtJ8slsdpvUzUo5Rs0EKdxboiopgPdTScdSQ-J3NRlXq8CFlK0j78ZMGo8KI9ZwdfMag5gPo5hI56PdcsAkfLkmvGjE2einZMruGrZVy08cZxAX7_ZdhXuLUY3RvleHog089sBJnzXDVqi7zb2yZSxrxYENQZuSR7KjR1fDN45I18ingQsAHXy18mJ6CUzWkrzwpV0-XAeNBe2d5mG9M/', 'html_tracking_url': 'https://tracking.musixmatch.com/t1.0/m_html/e_1/sn_0/l_1290130/su_0/tr_3vUCAEHn0YKFfsSIWi8uC0aMlSrJxMwAjYKks94IRU-wF13uNtElVESehNa14ZOa5h19bOKSN10QRhFF4V-Jwo6DLkoN5VVBVDpw9pqL5Ff0--pA2EUtqzvjZb3Z35J2V51RY9AFv6VYWgQ6Nm7ijSbRcbnjyqB3rvCbhUNqN_egIfyVPkDCjZZnyFIs5Vt8teA7zLE2ms5EDF6NphXfezUcsDZfN8hBtT8lvd_EjunDSKs7QaCMHmi8YV7aSiLWyFeQKBzKWm-YRq2z9kyXtcreXlGagr6UHazj88UTK_LO_TzT99BO4XKJwmQ1ARm8-c4nug9kxUic3EvDxS0CNclEXAuKdKpzZvE25PQZUb_dCyPNvCVHHV4C1XtlmOVXoRET6fuguSHlA7Hg7TqDo_PP8cbR-Q4_VeromQ4evewwzWrrNQQSA4VX0Z78Ll5nKF9lYVa5wWQTWi8/', 'lyrics_copyright': "Unfortunately we're not authorized to show these lyrics.", 'writer_list': [], 'publisher_list': [], 'backlink_url': 'https://www.musixmatch.com/lyrics/Bruce-Springsteen/No-Surrender?utm_source=application&utm_campaign=api&utm_medium=none%3A1409616623501', 'updated_time': '2010-06-29T03:43:49Z'}}}}
But if there are no lyrics for the track, it returns with the body empty:
{'message': {'header': {'status_code': 404, 'execute_time': 0.0011160373687744}, 'body': []}}
Or:
{'message': {'header': {'status_code': 401, 'execute_time': 0.00022006034851074}, 'body': ''}}
And when the body is empty, this error appears:
Traceback (most recent call last):
  in song_lyric
    body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
TypeError: string indices must be integers
To solve this I'm using:
if json_obj["message"]["body"] != "" and json_obj["message"]["body"] != []:
    print(json_obj)
    body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
But it's not working; I get the same issue. Do you know why?
Complete method:
import json
import socket
import urllib.parse
import urllib.request

# apiurl_musixmatch and apikey_musixmatch are defined elsewhere

def song_lyric(song_name, artist_name):
    while True:
        querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(song_name) + "&q_artist=" + urllib.parse.quote(artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
        # matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent", "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # must include a user agent of some sort, otherwise 403 is returned
        while True:
            try:
                response = urllib.request.urlopen(request, timeout=4)  # timeout set to 4 seconds; automatically retries if it times out
                raw = response.read()
                # print(raw)
            except socket.timeout:
                print("Timeout raised and caught")
                continue
            break
        json_obj = json.loads(raw.decode('utf-8'))
        if json_obj["message"]["body"] != "":
            print(json_obj)
            body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
            copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
            tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
            if tracking_url != "":
                # print(body)
                # print(tracking_url)
                lyrics_tracking(tracking_url)
                return (body + "\n\n" + copyright)
            else:
                return "None"
        else:
            return "None"

def lyrics_tracking(tracking_url):
    while True:
        querystring = tracking_url
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent", "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # must include a user agent of some sort, otherwise 403 is returned
        try:
            response = urllib.request.urlopen(request, timeout=4)  # timeout set to 4 seconds; automatically retries if it times out
            raw = response.read()
        except socket.timeout:
            print("Timeout raised and caught")
            continue
        break
    print(raw)
I think the complete method differs from the previous code; perhaps that's the problem. If that's the case, you may fix it simply by adding the empty-list check you showed before:
if json_obj["message"]["body"] != "" and json_obj["message"]["body"] != []:
You can also do those multiple checks with in:
>>> json_obj = {'message': {'header': {'status_code': 401, 'execute_time': 0.00022006034851074}, 'body': ''}}
>>> json_obj['message']['body']
''
>>> json_obj['message']['body'] not in ('', [])
False
>>> json_obj['message']['body'] in ('', [])
True
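Putting that together, a small helper applying the check (a sketch of my own; the name extract_lyrics is hypothetical, and the rest of song_lyric stays unchanged):

def extract_lyrics(json_obj):
    """Return lyrics_body when lyrics exist, else 'None' (hypothetical helper)."""
    body_field = json_obj["message"]["body"]
    if body_field not in ('', []):  # body is a dict only when lyrics exist
        return body_field["lyrics"]["lyrics_body"]
    return "None"  # 401/404 responses carry '' or [] as the body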
Before I give up and opt for the Selenium route: I was trying to automate this page (yeezysupply.com/products/mens-crepe-boot-oil) with Python requests all the way to checkout, but got stuck at the checkout page, because the credit-card form is loaded in an iframe and submits to a different URL, https://elb.deposit.shopifycs.com/sessions, which starts returning a 500 Internal Server error. Below is the relevant section of the code I tried that did not work:
payment_method_url = r.url.split('?')
payment_method_url = payment_method_url[0]
BILLING_FIRST_NAME = 'Jon'
BILLING_LAST_NAME = 'Norman'
BILLING_ADDRESS_1 = 'G-309'
BILLING_ADDRESS_2 = 'G-309'
BILLING_CITY = 'Chicago'
BILLING_COUNTRY = 'United States'
BILLING_PROVINCE = 'Illinois'
BILLING_ZIP = '60007'
BILLING_PHONE = '149584848485'
TOTAL_PRICE = 66500
# For final checkout
s.options('https://elb.deposit.shopifycs.com/sessions', headers=headers)
session_url = 'https://elb.deposit.shopifycs.com/sessions'
# NOTE: session_data is built here but never actually POSTed below
session_data = '{"credit_card":{"number":"4242 4242 4242 4242","name":"Jon Norman","month":9,"year":2019,"verification_value":"123"}}'
sleep(3)
s.headers.update({
    'referer': 'https://checkout.shopifycs.com/number?identifier=eeb4fe88a0fd4063043eeb5730d460f4&location=https%3A%2F%2Fpurchase.yeezysupply.com%2F17655971%2Fcheckouts%2Feeb4fe88a0fd4063043eeb5730d460f4'})
# NOTE: 'checkout[payment_gateway]' and 'checkout[billing_address][remember_me]'
# each appear twice below; in a Python dict literal only the last value survives
data = {
    'utf8': 'utf8',
    '_method': 'patch',
    'authenticity_token': authenticity_token,
    'previous_step': 'payment_method',
    'step': '',
    's': 'east-50fb8458975b56217d7317847efb9280',
    'checkout[payment_gateway]': '117647559',
    'checkout[credit_card][vault]': 'false',
    'checkout[payment_gateway]': '117647559',
    'checkout[different_billing_address]': 'true',
    'checkout[billing_address][first_name]': BILLING_FIRST_NAME,
    'checkout[billing_address][last_name]': BILLING_LAST_NAME,
    'checkout[billing_address][address1]': BILLING_ADDRESS_1,
    'checkout[billing_address][address2]': BILLING_ADDRESS_2,
    'checkout[billing_address][city]': BILLING_CITY,
    'checkout[billing_address][country]': BILLING_COUNTRY,
    'checkout[billing_address][province]': BILLING_PROVINCE,
    'checkout[billing_address][zip]': BILLING_ZIP,
    'checkout[billing_address][remember_me]': 'false',
    'checkout[billing_address][remember_me]': '0',
    'checkout[billing_address][remember_me_country_code]': '',
    'checkout[billing_address][remember_me_phone]': '',
    'checkout[billing_address][total_price]': TOTAL_PRICE,
    'complete': '1',
    'checkout[client_details][browser_width]': '1280',
    'checkout[client_details][browser_height]': '150',
    'checkout[client_details][javascript_enabled]': '1',
}
sleep(2)
r = s.post(payment_method_url+'/processing', data=data, headers=headers)
r = s.get(payment_method_url, headers=headers)
print(r.text)
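Two things stand out in the snippet above (observations on the posted code, not a verified fix, since this thread records no accepted answer): session_data is never POSTed, and the 's' field in the form is hard-coded even though it looks like the session token the /sessions endpoint is supposed to issue. A hedged sketch of that missing step, assuming the endpoint returns JSON with an 'id' field:

# Hypothetical: obtain a fresh payment session instead of reusing a stale 's'.
# The 'id' response field is an assumption, not documented behavior.
session_resp = s.post(session_url, data=session_data,
                      headers={'Content-Type': 'application/json'})
if session_resp.ok:
    data['s'] = session_resp.json().get('id')  # assumed response shape
else:
    print('sessions endpoint returned', session_resp.status_code)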
I've been trying to make a scraper to get my grades from my school's website. Unfortunately I cannot log in: when I run the program, the returned page validates the user/password fields, and since they are blank, it won't let me proceed.
Also, I am not really sure I am even coding this correctly.
from twill.commands import *
import requests

payload = {
    'ctl00$cphMainContent$lgn$UserName': 'user',
    'ctl00$cphMainContent$lgn$Password': 'pass',
}
cookie = {
    'En_oneTime_ga_tracking_v2': 'true',
    'ASP.NET_SessionId': ''
}
with requests.Session() as s:
    p = s.post('schoolUrl', data=payload, cookies=cookie)
    print(p.text)
Updated payload:
payload = {
    'ctl00$cphMainContent$lgnEaglesNest$UserName': 'user',
    'ctl00$cphMainContent$lgnEaglesNest$Password': 'pass',
    '__LASTFOCUS': '',
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': 'LONG NUMBER',
    '__VIEWSTATEGENERATOR': 'C2EE9ABB',
    '__EVENTVALIDATION': 'LONG NUMBER',
    'ctl00$cphMainContent$lgnEaglesNest$RememberMe': 'on',
    'ctl00$cphMainContent$lgnEaglesNest$LoginButton': 'Log+In'
}
How do I know if my POST was successful?
The returned page was saying that the username/password cannot be blank.
Complete source:
from twill.commands import *
import requests

payload = {
    'ctl00$cphMainContent$lgnEaglesNest$UserName': 'user',
    'ctl00$cphMainContent$lgnEaglesNest$Password': 'pass',
    '__LASTFOCUS': '',
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': 'LONG NUMBER',
    '__VIEWSTATEGENERATOR': 'C2EE9ABB',
    '__EVENTVALIDATION': 'LONG NUMBER',
    'ctl00$cphMainContent$lgnEaglesNest$RememberMe': 'on',
    'ctl00$cphMainContent$lgnEaglesNest$LoginButton': 'Log In'
}
cookie = {
    'En_oneTime_ga_tracking_v2': 'true',
    'ASP.NET_SessionId': ''
}
with requests.Session() as s:
    loginUrl = 'http://eaglesnest.pcci.edu/Login.aspx?ReturnUrl=%2f'
    gradeUrl = 'http://eaglesnest.pcci.edu/StudentServices/ClassGrades/Default.aspx'
    p = s.post(loginUrl, data=payload)
    print(p.text)
Your payload uses the wrong keys; try
ctl00$cphMainContent$lgnEaglesNest$UserName
ctl00$cphMainContent$lgnEaglesNest$Password
You can check the names by watching the network traffic in your browser (e.g. in Firefox: inspect element --> network --> post --> params).
In addition, you need to specify which command you want to perform, i.e. which button was pressed:
payload['ctl00$cphMainContent$lgnEaglesNest$LoginButton'] = 'Log In'
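One more point (my addition, based on how ASP.NET WebForms logins generally work, not something stated in this answer): __VIEWSTATE and __EVENTVALIDATION change on every page load, so the hard-coded 'LONG NUMBER' placeholders will fail server-side validation. A minimal sketch that scrapes the current hidden fields with BeautifulSoup before posting, within the same session:

import requests
from bs4 import BeautifulSoup

loginUrl = 'http://eaglesnest.pcci.edu/Login.aspx?ReturnUrl=%2f'

with requests.Session() as s:
    # GET the login page first so the hidden ASP.NET fields are fresh
    page = s.get(loginUrl)
    soup = BeautifulSoup(page.text, 'html.parser')
    payload = {
        'ctl00$cphMainContent$lgnEaglesNest$UserName': 'user',
        'ctl00$cphMainContent$lgnEaglesNest$Password': 'pass',
        'ctl00$cphMainContent$lgnEaglesNest$LoginButton': 'Log In',
    }
    # Copy every hidden input (__VIEWSTATE, __EVENTVALIDATION, ...) verbatim
    for tag in soup.find_all('input', type='hidden'):
        if tag.get('name'):
            payload.setdefault(tag['name'], tag.get('value', ''))
    p = s.post(loginUrl, data=payload)
    # A login that succeeded usually redirects away from Login.aspx
    print(p.url, p.status_code)

Checking p.url after the POST is also a quick way to tell whether it was successful, since a successful ASP.NET login normally redirects to the ReturnUrl rather than re-rendering the login form.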
I am trying to update a value through the openHAB REST API using requests.put in Python, but I am getting a 404 error.
My code is copied below:
import requests
import json
from pprint import pprint

TemperatureA_FF_Office = 20
headers = {'Content-type': 'application/json'}
payload = {'state': TemperatureA_FF_Office}
payld = json.dumps(payload)
re = requests.put("http://localhost:8080/rest/items/TemperatureA_FF_Office/state/put", params=payld, headers=headers)
pprint(vars(re))
The error I am getting is:
{'_content': '',
'_content_consumed': True,
'connection': <requests.adapters.HTTPAdapter object at 0x7fd3b55ec9d0>,
'cookies': <<class 'requests.cookies.RequestsCookieJar'>[]>,
'elapsed': datetime.timedelta(0, 0, 4019),
'encoding': None,
'history': [],
'raw': <urllib3.response.HTTPResponse object at 0x7fd3b55ecd90>,
'reason': 'Not Found',
'request': <PreparedRequest [PUT]>,
'status_code': 404,
'url': u'http://localhost:8080/rest/items/TemperatureA_FF_Office/state/put?state=21.0'}
Is this the way to use requests.put? Please help.
Try something along these lines:
import requests

req = "http://localhost:8080/rest/items/YOUR_SENSOR_HERE/state"
val = VARIABLE_WITH_YOUR_SENSOR_DATA
try:
    r = requests.put(req, data=val)
except requests.ConnectionError as e:
    r = "Response Error"
    print(e)
print(r)
This is a massively simplified version of what I'm using for some of my presence detection and temperature scripts.
The printing of 'r' and 'e' is useful for debugging and can be removed once you've got your script working properly.
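As a follow-up on the original 404 (my note, not part of the answer above): the failing URL ends in /state/put and sends the value as a query parameter, while the working call above PUTs to /state with the value in the request body. Checking the status explicitly makes that kind of mismatch easy to spot. A small sketch, reusing the item name from the question:

import requests

# Plain-text body, /state endpoint, as in the answer above
r = requests.put("http://localhost:8080/rest/items/TemperatureA_FF_Office/state",
                 data=str(20))
if r.status_code >= 400:
    print("Request failed:", r.status_code, r.reason)
else:
    print("State updated, HTTP", r.status_code)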