I'm building a Python script that gathers data from Instagram, based on a user list stored in my database. However, I'm running into some issues trying to handle an unexpected JSON response.
To give some context, the program fetches a username from my database table (24/7, looping over hundreds of accounts - hence the while True: loop), requests a URL with that username, and expects a certain JSON response (specifically, it looks for ['entry_data']['ProfilePage'][0] in the response).
However, when a username isn't found on Instagram, the JSON is different and the expected part (['entry_data']['ProfilePage'][0]) is missing, so my script crashes.
Here is the current code:
def get_username_from_db():
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT * FROM ig_users_raw WHERE `username` IS NOT NULL ORDER BY `ig_users_raw`.`last_checked` ASC LIMIT 1")
            row = cursor.fetchall()
            username = row[0]['username']
    except pymysql.IntegrityError:
        print('ERROR: ID already exists in PRIMARY KEY column')
    return username

def request_url(url):
    try:
        response = requests.get(url)
    except requests.HTTPError:
        raise requests.HTTPError(f'Received non 200 status code from {url}')
    except requests.RequestException:
        raise requests.RequestException
    else:
        return response.text

def extract_json_data(url):
    try:
        r = requests.get(url, headers=headers)
    except requests.HTTPError:
        raise requests.HTTPError('Received non-200 status code.')
    except requests.RequestException:
        raise requests.RequestException
    else:
        print(url)
        soup = BeautifulSoup(r.content, "html.parser")
        scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
        stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
        j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
        return j
if __name__ == '__main__':
    while True:
        sleep(randint(5, 15))
        username = get_username_from_db()
        url = f'https://www.instagram.com/{username}/'
        j = extract_json_data(url)
        json_string = json.dumps(j)
        user_id = j['graphql']['user']['id']
        username = j['graphql']['user']['username']
        #print(user_id)
        try:
            with connection.cursor() as cursor:
                db_data = (json_string, datetime.datetime.now(), user_id)
                sql = "UPDATE `ig_users_raw` SET json=%s, last_checked=%s WHERE `user_id`= %s "
                cursor.execute(sql, db_data)
                connection.commit()
                print(f'{datetime.datetime.now()} - data inserted for user: {user_id} - {username}')
        except pymysql.Error:
            print('ERROR: ', pymysql.Error)
I'm getting the following error/traceback:
https://www.instagram.com/geloria.itunes/
Traceback (most recent call last):
  File "D:\Python\Ministry\ig_raw.py", line 63, in <module>
    j = extract_json_data(url)
  File "D:\Python\Ministry\ig_raw.py", line 55, in extract_json_data
    j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
  File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\__init__.py", line 354, in loads
    return _default_decoder.decode(s)
  File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\thoma\AppData\Local\Programs\Python\Python36-32\lib\json\decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
Ideally, I want the script to just skip past the account (in this case geloria.itunes) and move on to the next one in the database. I might want to remove the account, or at least remove the username from the row.
In an effort to solve this myself, I experimented with if/else checks, but in the case where it would continue, I'd just be looping over the same account.
Do you have any suggestions on how I can tackle this specific issue?
Thanks!
First of all, you need to figure out why the exception occurred.
You're getting this error because you're telling json to parse an invalid (non-JSON) string.
Just run this example with the URL from your traceback:
import re
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.instagram.com/geloria.itunes/")
print(r.status_code) # outputs 404(!)
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
print(stringified_json)
# j = json.loads(stringified_json) # will raise an exception
Output:
\n(function(){\n function normalizeError(err) {\n...
...
stringify(normalizedError));\n })\n }\n })\n}());
As you can see, stringified_json is not a valid JSON string.
It is invalid because, as you mentioned, this Instagram page is hidden or does not exist (the HTTP status code is 404 Not Found). You're passing the wrong response to json.loads() because your script never checks the response status code.
The following except clauses did not catch the "404 case" because a 404 is still a valid, completed HTTP response; requests raises no exception on its own, so there was nothing for them to catch:
except requests.HTTPError:
    raise requests.HTTPError('Received non-200 status code.')
except requests.RequestException:
    raise requests.RequestException
So basically you have two ways to deal with this issue:
check the response HTTP status code manually, e.g. if r.status_code != 200: ...
or use the raise_for_status() method to throw an exception if 400 <= r.status_code < 600
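For example, a minimal sketch of both options, reusing the URL from your traceback:
import requests

r = requests.get("https://www.instagram.com/geloria.itunes/")

# option 1: check the status code manually and skip the profile
if r.status_code != 200:
    print(f'Got {r.status_code} for {r.url} - skipping')

# option 2: let requests raise HTTPError for any 4xx/5xx response
r.raise_for_status()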
I might want to remove the account, or at least remove the username from the row.
Well, this part of your question sounds a bit vague, so I can only give you an idea.
For example, if a 404 page is encountered, you can raise a custom exception while handling the response, catch it later in __main__, delete the record from the database, and continue with the other pages:
class NotFoundError(Exception):
    """ my custom exception for not found pages """
    pass

... # other functions

def extract_json_data(url):
    r = requests.get(url, headers=headers)
    if r.status_code == 404:
        raise NotFoundError()  # page not found
    # if any other error occurs (network unavailable for example) - an exception will be raised
    soup = BeautifulSoup(r.content, "html.parser")
    scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
    stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
    return json.loads(stringified_json)['entry_data']['ProfilePage'][0]
if __name__ == '__main__':
    while True:
        sleep(randint(5, 15))
        username = get_username_from_db()
        url = f'https://www.instagram.com/{username}/'
        try:
            j = extract_json_data(url)
        except NotFoundError:
            delete_user_from_db(username)  # implement: DELETE FROM t WHERE username = ...
            continue  # proceed to the next user page
        # rest of your code:
        # json_string = json.dumps(j)
        # user_id = j['graphql']['user']['id']
        # ...
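For completeness, a minimal sketch of that helper, assuming the same pymysql connection object used in the rest of your script:
def delete_user_from_db(username):
    # hypothetical helper: remove the record so this username is not fetched again
    with connection.cursor() as cursor:
        cursor.execute("DELETE FROM `ig_users_raw` WHERE `username` = %s", (username,))
    connection.commit()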
When handling a Python dictionary KeyError, my except block is not working, but the try block is.
Below is my code:
def catch_empty_key(a):
    try:
        return 'aaaa'
    except:
        return 'bbbb'
def zohoapicall(accesstoken):
    accesstoken = ""
    if accesstoken == "":
        parameters = {
            "refresh_token": "1000.06f10f49d6f00478887e3820634b928f.c045ff2a9dcb9c99057ec42645bf1e44",
            "client_id": "1000.UKZQIWVQ2A2THKSZ2126Y7E7CAA8CW",
            "client_secret": "91d25fbaeea0e81190a681708cd554a1030a9c4547",
            "redirect_uri": "https://www.google.com",
            "grant_type": "refresh_token",
        }
        response = requests.post("https://accounts.zoho.com/oauth/v2/token?", params=parameters)
        if response.status_code == 200:
            data = response.json()
            accesstoken = data['access_token']
            headers = {
                'Content-Type': 'application/json',
                'Authorization': 'Zoho-oauthtoken ' + str(accesstoken)
            }
            response = requests.get("https://books.zoho.com/api/v3/invoices", headers=headers)
            if response.status_code == 200:
                data1 = response.json()
                data_2 = [catch_empty_key(invoice['not_a_key']) for invoice in data1['invoices']]
                return HttpResponse(data_2, accesstoken)
Here, in the second-last line (data_2 = [catch_empty_key(invoice['not_a_key']) for invoice in data1['invoices']]), the except block of the catch_empty_key function is not working and an error is thrown.
On the other hand, if I replace the second-last line with something that is a key of invoice, the try block works and returns 'aaaa' as output. For example:
data_2 = [catch_empty_key(invoice['is_a_key']) for invoice in data1['invoices']]
I want to understand why this error occurs and how we can solve it.
The error is produced while the argument you are passing is being evaluated, so it fails before the function is ever entered: invoice['not_a_key'] raises the KeyError at the call site, outside the try block. Modifying the function parameters so that the key lookup happens inside the function would work, for example:
def catch_empty_key(dict_to_check, key):
    try:
        dict_to_check[key]  # error will occur if key does not exist
        return 'aaaa'
    except:
        return 'bbbb'
Alternatively you could check if the key exists by using in:
test = {'my':'dictionary'}
print('j' in test)
Outputs:
False
So you could simply have:
def catch_empty_key(dict_to_check, key):
    if key in dict_to_check:
        return 'aaaa'
    else:
        return 'bbbb'
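A related idiom, if a default value is ever more useful to you than the flag strings, is dict.get(), which returns a default instead of raising; a minimal sketch:
def catch_empty_key(dict_to_check, key):
    # .get() returns None (or a supplied default) when the key is missing;
    # note this also treats a key stored with the value None as missing
    return 'aaaa' if dict_to_check.get(key) is not None else 'bbbb'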
You are misunderstanding the concept of try-except: the code that can raise the exception needs to be inside the try block in order for the exception to be caught. In your given code, you can use it for requests.get() like this:
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Zoho-oauthtoken ' + str(accesstoken)
}
try:
    response = requests.get("https://books.zoho.com/api/v3/invoices", headers=headers)
except Exception as e:
    print(e)
if response.status_code == 200:
    data1 = response.json()
    # note: invoice['not_a_key'] is still evaluated here, before catch_empty_key runs,
    # so this lookup also needs to happen inside a try block (or use the 'in' check)
    data_2 = [catch_empty_key(invoice['not_a_key']) for invoice in data1['invoices']]
    return HttpResponse(data_2, accesstoken)
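With the two-argument catch_empty_key from the first answer, the comprehension itself then stops raising:
data_2 = [catch_empty_key(invoice, 'not_a_key') for invoice in data1['invoices']]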
I've created a script for web scraping and I'm using 2Captcha to solve captchas. 2Captcha has a Python library, but I've created my own functions to generate the captcha ID and captcha token code.
My captcha module has 3 functions: get_captcha_id(), get_captcha_response(), and apply_token()
Everything works great, and I'm able to solve a couple dozen captchas until eventually I get the following error:
ERROR_WRONG_CAPTCHA_ID
When this happens, the script first hits the error ERROR_CAPTCHA_UNSOLVABLE, then the loop goes back and generates an entirely new captcha ID. Maybe I should keep the same ID and just generate a new token?
I just want to know if there's a better way to do this anyway...
Here is the code to start the 2Captcha on my main script:
captcha_solved = 0
# Solves recaptcha via 2Captcha API
while captcha_solved == 0:
    captcha_id = captcha.get_captcha_id(browser.current_url)
    if captcha_id != 0 or captcha_id != None:
        print("Captcha ID is: " + str(captcha_id))
        cap_res = captcha.get_captcha_response(captcha_id)
        if cap_res == "ERROR_CAPTCHA_UNSOLVABLE" or cap_res == "ERROR_TOKEN_EXPIRED" or cap_res == "ERROR_WRONG_CAPTCHA_ID":
            print("Captcha failed... Restarting captcha")
            browser.refresh()
            sleep(1)
            continue
        else:
            print("Captcha Token: " + cap_res)
            captcha.apply_token(browser, cap_res)
            solver.report(captcha_id, True)
            captcha_solved = captcha_solved + 1
            break
Once this while loop is complete, the main script will start. After about two dozen captchas, I'll receive this error:
Traceback (most recent call last):
  File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\main.py", line 191, in <module>
    cap_res = captcha.get_captcha_response(captcha_id)
  File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\captcha.py", line 83, in get_captcha_response
    solver.report(cap_id, False)
  File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\solver.py", line 496, in report
    self.api_client.res(key=self.API_KEY, action=rep, id=id_)
  File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\api.py", line 113, in res
    raise ApiException(resp)
twocaptcha.api.ApiException: ERROR_WRONG_CAPTCHA_ID
I thought I had added enough failsafes to be able to regenerate a captcha token.
Here is my captcha.py file code:
from twocaptcha import TwoCaptcha
from random import randint
from time import sleep
from urllib.request import urlopen, Request
import re
from bs4 import BeautifulSoup
from twocaptcha.solver import ValidationException
from twocaptcha.api import NetworkException, ApiException
from selenium.common.exceptions import TimeoutException
#solver = TwoCaptcha('API_KEY')
site_key = "###"
api_key = "###"
config = {
    'server': '2captcha.com',
    'apiKey': api_key,
    'callback': 'https://your.site.com/',
    'defaultTimeout': 120,
    'recaptchaTimeout': 600,
    'pollingInterval': 10,
}

proxy = {
    'type': 'HTTP',
    'uri': '###'
}
user_agent = '###'
solver = TwoCaptcha(**config)
print("2Captcha Balance: $"+str(solver.balance()))
def get_captcha_id(captcha_url):
    try:
        result = solver.recaptcha(sitekey=site_key, url=captcha_url, proxy=proxy)
        #print(result)
        split_string = str(result).split(":", 1)
        substring = split_string[0]
        #print(substring)
        if substring == "{'captchaId'":
            strip_beginning = re.sub("{'captchaId': '", "", str(result))
            captcha_id = re.sub("'}", "", strip_beginning)
            return captcha_id
        else:
            print("could not find captcha ID")
            return 0
    except ValidationException as e:
        # invalid parameters passed
        print(e)
        return e
    except NetworkException as e:
        # network error occurred
        print(e)
        return e
    except ApiException as e:
        # api responded with an error
        print(e)
        return e
    except TimeoutException as e:
        # captcha is not solved so far
        print(e)
        return e
def get_captcha_response(cap_id):
    captcha_ready = 0
    response_url = "https://2captcha.com/res.php?key=" + api_key + "&action=get&id=" + cap_id
    while captcha_ready == 0:
        PageRequest = Request(response_url, data=None, headers={'User-Agent': user_agent})
        PageResponse = urlopen(PageRequest)
        PageHtml = PageResponse.read()
        PageSoup = BeautifulSoup(PageHtml, 'html.parser')
        SoupText = str(PageSoup)
        if SoupText == "ERROR_CAPTCHA_UNSOLVABLE" or SoupText == "ERROR_WRONG_CAPTCHA_ID" or SoupText == "ERROR_TOKEN_EXPIRED":
            solver.report(cap_id, False)
            return SoupText
        elif str(PageSoup) == "CAPCHA_NOT_READY":
            print("Waiting for captcha response...")
            rand = randint(12, 18)
            print("sleeping for " + str(rand) + " seconds")
            sleep(rand)
        else:
            split_string = str(PageSoup).split("|", 1)
            if len(split_string) > 0:
                substring = split_string[1]
                return substring
            captcha_ready = captcha_ready + 1
            #print(PageSoup)
            return PageSoup
def apply_token(browser, token):
    print("Applying token to browser...")
    browser.execute_script('document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(token))
    print("Token applied")
Thanks for your help with this, I really appreciate it!
I need to make a request to an API that responds with a maximum of 200 results. If the total amount of data is more than 200, the API also responds with a parameter lastKey that I need to pass in a new request. When all the data has been returned, the lastKey param is not returned anymore.
My question is how to do this in a simple, clean way. This is how I make the first request, and I can see whether the lastKey param is there or not:
url = 'https://example.com'
moreData = False

with requests.Session() as api:
    data = requests.get(url)
    try:
        data.raise_for_status()
    except HTTPError as e:
        return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
    result = data.json()
    if 'lastKey' in result:
        url = 'https://example.com&lastKey=' + result['lastKey']
        moreData = True
How could I do this whole thing, for example, inside a while loop?
Just do the first request outside the while loop, then keep calling your API while 'lastKey' is in the result:
url = 'https://example.com'
with requests.Session() as api:
    data = requests.get(url)
    try:
        data.raise_for_status()
    except HTTPError as e:
        return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
    result = data.json()

while 'lastKey' in result:
    url = 'https://example.com&lastKey=' + result['lastKey']
    with requests.Session() as api:
        data = requests.get(url)
        try:
            data.raise_for_status()
        except HTTPError as e:
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
        result = data.json()
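If you want to avoid duplicating the request code, here is a sketch that folds everything into a single loop (fetch_all is a hypothetical helper, and the error handling is reduced to raise_for_status so the caller decides what to do with an HTTPError):
import requests

def fetch_all(base_url):
    # keep requesting pages until the response no longer contains 'lastKey'
    pages = []
    last_key = None
    with requests.Session() as session:
        while True:
            url = base_url if last_key is None else base_url + '&lastKey=' + last_key
            response = session.get(url)
            response.raise_for_status()
            result = response.json()
            pages.append(result)
            last_key = result.get('lastKey')
            if last_key is None:  # no lastKey means this was the final page
                break
    return pages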
I wrote the following code where I pass multiple URLs to an API and want the output written to different pandas DataFrames. It (sort of) works, but the outcome is incorrect:
1) It seems to enter the function and print "Success" way too many times. Why?
2) The output for all the DataFrames is the same; I'm not sure where the error is.
See the function:
def data_extract(url):
    payload = {'limit': '200000'}
    # Persists parameters across requests
    s = requests.Session()
    # To determine success of request, and error code
    for url in url:
        try:
            response = s.get(url)
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
        else:
            # Ret
            jsonData = s.get(url, params=payload).json()
            print('Success!')
            df_tmr = pd.DataFrame(jsonData['records'])
    return df_tmr
See the call to the function:
urls = {
    # Rainfall data
    'tot_rain_mth': 'https://data.gov.sg/dataset/5942f8bd-4240-4f68-acd2-a5a276958237/resource/778814b8-1b96-404b-9ac9-68d6c00e637b/data',
    'no_days_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-number-of-rain-days/resource/8b94f596-91fd-4545-bf9e-7a426493b674/data',
    'max_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-maximum-daily-total/resource/df4d391e-6950-4fc6-80cd-c9b9ef6354fe/data',
    # Temperature Data
    'mean_sun_dur_mth': 'https://data.gov.sg/dataset/sunshine-duration-monthly-mean-daily-duration/resource/0230819f-1c83-4980-b738-56136d6dc300/data',
    'wet_bulb_hr': 'https://data.gov.sg/dataset/wet-bulb-temperature-hourly/resource/0195dc7a-2f49-4107-ac7c-3112ca4a09a8/data',
    'min_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-minimum/resource/ad0d8a97-9321-42e9-ac6f-46bf12845d44/data',
    'min_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-absolute-extreme-minimum/resource/0c5b9752-2488-46cc-ae1c-42318d0f8865/data',
    'mean_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-mean/resource/07654ce7-f97f-49c9-81c6-bd41beba4e96/data',
    'max_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-maximum/resource/c7a7d2fd-9d32-4508-92ef-d1019e030a2f/data',
    'max_air_temp_mth': 'https://data.gov.sg/dataset/air-temperature-absolute-extremes-maximum/resource/96e66346-68bb-4ca9-b001-58bbf39e36a7/data',
    # Humidity Data
    'min_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-absolute-extreme-minimum/resource/585c24a5-76cd-4c48-9341-9223de5adc1d/data',
    'mean_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-mean/resource/4631174f-9858-463d-8a88-f3cb21588c67/data',
    'mean_hum_yr': 'https://data.gov.sg/dataset/relative-humidity-annual-mean/resource/77b9059f-cc9a-4f4f-a495-9c268945191b/data'
}

df = {}
for i in range(len(urls.keys())):
    df[str(i)] = pd.DataFrame()
    #print('Name of Dataframe:', df)
    df[str(i)] = data_extract(urls.values())

print(df['0'])
print(df['1'])
Sorry about the bad formatting; I can't quite get it right in SO.
import requests
import pandas as pd

def data_extract(url):
    print(url)
    payload = {'limit': '200000'}
    s = requests.Session()
    try:
        response = s.get(url)
        response.raise_for_status()
        jsonData = s.get(url, params=payload).json()
        print('Success!')
    except Exception as err:
        print(f'Other error occurred: {err}')
    df_tmr = pd.DataFrame(jsonData['records'])
    return df_tmr

urls = {
    # Rainfall data
    'tot_rain_mth': 'https://data.gov.sg/dataset/5942f8bd-4240-4f68-acd2-a5a276958237/resource/778814b8-1b96-404b-9ac9-68d6c00e637b/data',
    'no_days_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-number-of-rain-days/resource/8b94f596-91fd-4545-bf9e-7a426493b674/data',
    'max_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-maximum-daily-total/resource/df4d391e-6950-4fc6-80cd-c9b9ef6354fe/data',
    # Temperature Data
    'mean_sun_dur_mth': 'https://data.gov.sg/dataset/sunshine-duration-monthly-mean-daily-duration/resource/0230819f-1c83-4980-b738-56136d6dc300/data',
    'wet_bulb_hr': 'https://data.gov.sg/dataset/wet-bulb-temperature-hourly/resource/0195dc7a-2f49-4107-ac7c-3112ca4a09a8/data',
    'min_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-minimum/resource/ad0d8a97-9321-42e9-ac6f-46bf12845d44/data',
    'min_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-absolute-extreme-minimum/resource/0c5b9752-2488-46cc-ae1c-42318d0f8865/data',
    'mean_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-mean/resource/07654ce7-f97f-49c9-81c6-bd41beba4e96/data',
    'max_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-maximum/resource/c7a7d2fd-9d32-4508-92ef-d1019e030a2f/data',
    'max_air_temp_mth': 'https://data.gov.sg/dataset/air-temperature-absolute-extremes-maximum/resource/96e66346-68bb-4ca9-b001-58bbf39e36a7/data',
    # Humidity Data
    'min_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-absolute-extreme-minimum/resource/585c24a5-76cd-4c48-9341-9223de5adc1d/data',
    'mean_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-mean/resource/4631174f-9858-463d-8a88-f3cb21588c67/data',
    'mean_hum_yr': 'https://data.gov.sg/dataset/relative-humidity-annual-mean/resource/77b9059f-cc9a-4f4f-a495-9c268945191b/data'
}

df = {}
temp = list(urls.values())
for i in range(len(temp)):
    df[str(i)] = data_extract(temp[i])

print(df['0'])
print(df['1'])

if len(df) == len(temp):
    print('success')
I think this will help you. You were iterating over all the URLs inside the function and returning only the last DataFrame. You just need to remove the for loop from the data_extract method and pass one URL per call, as above.
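As a small follow-up: since urls is a dict with descriptive keys, you could also key the resulting DataFrames by name instead of by index, for example with a dict comprehension (a hypothetical variant of the loop above):
# key each DataFrame by its descriptive name rather than '0', '1', ...
df = {name: data_extract(url) for name, url in urls.items()}
print(df['tot_rain_mth'].head())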
I am currently using the following script to load a list of URLs and then check the source of each for a list of error strings. If no error string is found in the source, the URL is considered valid and written to a text file.
How can I modify this script to check the HTTP status instead? If a URL returns a 404 it would be ignored; if it returns 200, the URL would be written to the text file. Any help would be much appreciated.
import urllib2
import sys

error_strings = ['invalid product number', 'specification not available. please contact customer services.']

def check_link(url):
    if not url:
        return False
    f = urllib2.urlopen(url)
    html = f.read()
    result = False
    if html:
        result = True
        html = html.lower()
        for s in error_strings:
            if s in html:
                result = False
                break
    return result

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print 'Usage: %s <file_containing_urls>' % sys.argv[0]
    else:
        output = open('valid_links.txt', 'w+')
        for url in open(sys.argv[1]):
            if check_link(url.strip()):
                output.write('%s\n' % url.strip())
            output.flush()
        output.close()
You can alter your call to urlopen slightly:
>>> try:
...     f = urllib2.urlopen(url)
... except urllib2.HTTPError, e:
...     print e.code
...
404
Utilizing e.code, you can check whether it 404s on you. If you don't hit the except block, you can use f as you currently do.
urllib2.urlopen gives back a file-like object with some extra methods; one of them, getcode(), is what you're looking for. Just add:
if f.getcode() != 200:
    return False
in the relevant place.
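For instance, a sketch of check_link with that check folded in; note that urllib2.urlopen raises urllib2.HTTPError for 4xx/5xx responses on its own, so it is combined here with the try/except from the other answer:
def check_link(url):
    if not url:
        return False
    try:
        f = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return False  # 4xx/5xx responses arrive here as exceptions
    if f.getcode() != 200:  # anything else that is not a plain 200 OK
        return False
    html = f.read().lower()
    # keep the original error-string check as a second filter
    return not any(s in html for s in error_strings)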
Try this:
def check_link(url):
    if not url:
        return False
    code = None
    try:
        f = urllib2.urlopen(url)
        code = f.getcode()
    except urllib2.HTTPError, e:
        code = e.code
    result = True
    if code != 200:
        result = False
    return result
Alternatively, if you just need to maintain a list of invalid status codes and check against those, it would be something like this:
invalid_code_strings = [404, 500]  # example values; use whichever status codes you treat as invalid

def check_link(url):
    if not url:
        return False
    code = None
    try:
        f = urllib2.urlopen(url)
        code = f.getcode()
    except urllib2.HTTPError, e:
        code = e.code
    result = True
    if code in invalid_code_strings:
        result = False
    return result