The result of the requests module is different between IDLE and a script - python

Why doesn't the Wayback Machine return an answer with this code?
What I tried: (1) running the same request in the Python IDLE shell returned a normal answer; (2) in the script, the status_code is 200 but the function returns None.
import requests

def wayback_search(url):
    res = requests.get("https://web.archive.org/cdx/search/cdx?url=%s&showDupeCount=true&output=json" % url,
                       headers={'User-agent': 'Mozilla/5.0'})
    ### search in requests_module
    urllist = res.url.split('&')
    request_url = urllist[0][:-1] + '&' + urllist[1] + '&' + urllist[2]
    print('timestamps_url:', request_url)
    res = requests.get(request_url)
    if res.raise_for_status():
        cdx = res.json()
        print(res.url)
        print('cdx', cdx)

    res = requests.get("http://archive.org/wayback/available?url=%s" % url,
                       headers={'User-agent': 'Mozilla/5.0'})
    if res.raise_for_status():
        cdx = res.json()
        print(res.url)
        print('cdx', cdx)
Perhaps the wayback isn't working at all.

I do not see where the function wayback_search is called. Also, there is no return statement in the function. In Python, when a function has no return statement, it returns None. Try returning the value you actually want.
Also, the code inside the if res.raise_for_status(): blocks will never run: raise_for_status() returns None on success (which is falsy) and raises an exception on failure, so the condition is never true.
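Putting those two fixes together, here is a minimal sketch of what the function could look like (not tested against the live API); it lets requests build the query string via params, calls raise_for_status() as a plain statement, and returns both JSON payloads instead of falling through to None:
import requests

def wayback_search(url):
    # Let requests encode the query string instead of formatting it by hand.
    res = requests.get("https://web.archive.org/cdx/search/cdx",
                       params={"url": url, "showDupeCount": "true", "output": "json"},
                       headers={"User-agent": "Mozilla/5.0"})
    res.raise_for_status()   # raises for 4xx/5xx, returns None otherwise
    cdx = res.json()

    res = requests.get("http://archive.org/wayback/available",
                       params={"url": url},
                       headers={"User-agent": "Mozilla/5.0"})
    res.raise_for_status()
    available = res.json()

    # Return the data so the caller gets something other than None.
    return cdx, available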

Handling final page in Python paginated API request

I'm querying Microsoft's Graph API, using the following function to request multiple pages. I'm trying to request all pages, merge the JSON responses, and finally write them to a pandas DataFrame.
v = "v1.0"
r = "/users?$filter=userType eq 'Member'&$select=displayName,givenName,jobTitle,mail,department&$top=200"
def query(v, r):
all_records = []
url = uri.format(v=v, r=r)
while True:
if not url:
break
result = requests.get(url, headers=headers)
if result.status_code == 200:
json_data = json.loads(result.text)
all_records = all_records + json_data["value"]
url = json_data["#odata.nextLink"]
return all_records
The while-loop goes through all the pages, but when I run the function I get an error:
KeyError: '#odata.nextLink'
I assume this is because the loop reaches the final page, and thus the '#odata.nextLink' cannot be found. But how can I handle this?
You are doing
url = json_data["#odata.nextLink"]
which suggests json_data is a dict, so you should be able to use the .get method, which returns a default value when the key is not found (None by default). Please try the following and report whether it works as expected:
url = json_data.get("#odata.nextLink")
if url is None:
    print("nextLink not found")
else:
    print("nextLink found")

Call API for each element in list

I have a list with over 1000 IDs and I want to call an API with different endpoints for every element of the list.
Example:
customerlist = [803818, 803808, 803803,803738,803730]
I tried the following:
import json
import requests
import pandas as pd

API_BASEURL = "https://exampleurl.com/"
API_TOKEN = "abc"
HEADERS = {'content-type': 'application/json',
           'Authorization': API_TOKEN}

def get_data(endpoint):
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        res = pd.DataFrame([res])
        return res

get_data(endpointexample)
This works, but it only returns the values for the first element of the list (803818). I want the function to return the values for every ID from customerlist for the endpoint I defined in the function argument.
I found this - possibly related - question, but I couldn't figure my problem out.
There is probably an easy solution for this which I am not seeing, as I am just starting with Python. Thanks.
The moment a function hits a return statement, it immediately finishes. Since your return statement is inside the loop, the later iterations never actually run.
To fix, you can create a list outside the loop, append to it every loop iteration, and then return the DataFrame created with that list:
def get_data(endpoint):
    responses = []
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        responses.append(res)
    return pd.DataFrame(responses)
A much cleaner solution would be to use list comprehension:
def get_data(endpoint, i):
    api_endpoint = endpoint
    params = {'customerid': i}
    response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                            params=params,
                            headers=HEADERS)
    if response.status_code == 200:
        res = json.loads(response.text)
    else:
        raise Exception(f'API error with status code {response.status_code}')
    return res

responses = pd.DataFrame([get_data(endpoint, i) for i in customerlist])
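For example, with a placeholder endpoint name (substitute your real one), the second version is used like this:
# "customers" is only a placeholder endpoint name
df = pd.DataFrame([get_data("customers", i) for i in customerlist])
print(df.head())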

Crawler script runs without error, but there's no Excel output as I expected

I tried to crawl some housing information from a Chinese housing website. The code raises no errors when I run it. However, there's no output file when the process completes.
import requests
from bs4 import BeautifulSoup
import sys
import os
import time
import pandas as pd
import numpy as np
from parsel import Selector
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}

def catchHouseList(url):
    resp = requests.get(url, headers=headers, stream=True)
    if resp.status_code == 200:
        reg = re.compile('<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')
        urls = re.findall(reg, resp.text)
        return urls
    return []

def catchHouseDetail(url):
    resp = requests.get(url, headers=headers)
    print(url)
    if resp.status_code == 200:
        info = {}
        soup = BeautifulSoup(resp.text, 'html.parser')
        info['Title'] = soup.select('.main')[0].text
        info['Total_Price'] = soup.select('.total')[0].text
        info['Unit_Price'] = soup.select('.unit')[0].text
        info['Price_per_square'] = soup.select('.unitPriceValue')[0].text
        # p = soup.select('.tax')
        # info['Reference_price'] = soup.select('.tax')[0].text
        info['Built_time'] = soup.select('.subInfo')[2].text
        info['Place_Name'] = soup.select('.info')[0].text
        info['Area'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
        info['Lianjia_number'] = str(url)[34:].rsplit('.html')[0]
        info['flooring_plan'] = str(soup.select('.content')[2].select('.label')[0].next_sibling)
        info['floor'] = soup.select('.content')[2].select('.label')[1].next_sibling
        info['Area_Size'] = soup.select('.content')[2].select('.label')[2].next_sibling
        info['Flooring_structure'] = soup.select('.content')[2].select('.label')[3].next_sibling
        info['Inner_Area'] = soup.select('.content')[2].select('.label')[4].next_sibling
        info['Building_Category'] = soup.select('.content')[2].select('.label')[5].next_sibling
        info['House_Direction'] = soup.select('.content')[2].select('.label')[6].next_sibling
        info['Building_Structure'] = soup.select('.content')[2].select('.label')[7].next_sibling
        info['Decoration'] = soup.select('.content')[2].select('.label')[8].next_sibling
        info['Stair_Number'] = soup.select('.content')[2].select('.label')[9].next_sibling
        info['Heating'] = soup.select('.content')[2].select('.label')[10].next_sibling
        info['Elevator'] = soup.select('.content')[2].select('.label')[11].next_sibling
        # info['Aseest_Year'] = str(soup.select('.content')[2].select('.label')[12].next_sibling)
        return info
    pass

def appendToXlsx(info):
    fileName = './second_hand_houses.xlsx'
    dfNew = pd.DataFrame([info])
    if os.path.exists(fileName):
        sheet = pd.read_excel(fileName)
        dfOld = pd.DataFrame(sheet)
        df = pd.concat([dfOld, dfNew])
        df.to_excel(fileName)
    else:
        dfNew.to_excel(fileName)

def catch():
    pages = ['https://zs.lianjia.com/ershoufang/guzhenzhen/pg{}/'.format(x) for x in range(1, 21)]
    for page in pages:
        print(page)
        houseListURLs = catchHouseList(page)
        for houseDetailUrl in houseListURLs:
            try:
                info = catchHouseDetail(houseDetailUrl)
                appendToXlsx(info)
            except:
                pass
            time.sleep(2)
        pass

if __name__ == '__main__':
    catch()
I expected an Excel output, but there's nothing in the end; it only tells me that the process finished with exit code 0.
Here's one of your problem areas, with a little rewrite to help you see it. You were returning an empty list when that status code was anything other than 200, without any warning or explanation. The rest of your script requires a list to continue running. When you return an empty list, it exits cleanly.
Now, when you run your code, this function is going to return None when the server response isn't 200, and then a TypeError is going to be raised in your catch() function, which will require further error handling.
def catchHouseList(url):
    try:
        resp = requests.get(url, headers=headers, stream=True)
        if resp.status_code == 200:
            reg = re.compile(
                '<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')
            urls = re.findall(reg, resp.text)
            return urls
        else:
            print('catchHouseList response code:', resp.status_code)
    except Exception as e:
        print('catchHouseList:', e)
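On the caller's side, one way to deal with that (just a sketch, not the only option) is to skip a page when catchHouseList returns None or an empty list, and to print exceptions instead of silently passing over them, so you can see why nothing ever reaches the Excel file:
def catch():
    pages = ['https://zs.lianjia.com/ershoufang/guzhenzhen/pg{}/'.format(x) for x in range(1, 21)]
    for page in pages:
        print(page)
        houseListURLs = catchHouseList(page)
        if not houseListURLs:        # None or empty: nothing usable for this page
            continue
        for houseDetailUrl in houseListURLs:
            try:
                info = catchHouseDetail(houseDetailUrl)
                if info:
                    appendToXlsx(info)
            except Exception as e:
                print('catch:', houseDetailUrl, e)   # do not swallow errors silently
            time.sleep(2)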

How to trace or check the history of redirected URLs with only Python's urllib library

When you go to https://httpbin.org/redirect/6, you end up at https://httpbin.org/get after 6 redirections. I want to check which URLs are visited in between, using only Python's urllib.request.
import urllib.request

def openurl(url):
    headers = {}
    req = urllib.request.Request(url, headers=headers)
    httpResponse = urllib.request.urlopen(req)
    code = httpResponse.getcode()
    httpHeader = httpResponse.info()
    httpBody = httpResponse.read().decode()
    return httpHeader, httpBody, code

url = 'https://httpbin.org/redirect/6'
h, b, c = openurl(url)
print(h)
print(b)
print('http Response Code:', c)
Is there any way to tweak the behavior of urlopen in order to produce a list of the URLs in between?
I agree with georgexsh, but you can also modify the HTTPRedirectHandler by overriding redirect_request, which is shorter because that single method is called for every redirect status code:
class MyHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        print("newurl", newurl)
        return super().redirect_request(req, fp, code, msg, headers, newurl)
It is a simple task if you build your own HTTPRedirectHandler:
import urllib.request

class MyHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        print("newurl", headers["location"])
        return super().http_error_302(req, fp, code, msg, headers)

opener = urllib.request.build_opener(MyHTTPRedirectHandler)
urllib.request.install_opener(opener)

response = urllib.request.urlopen('https://httpbin.org/redirect/6')
response.read()
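If you want the intermediate URLs collected in a list rather than printed, a small variation is to keep them on the handler instance; this sketch overrides redirect_request, which the http_error_301/302/303/307 handlers all call with the already-resolved target URL:
import urllib.request

class RecordingRedirectHandler(urllib.request.HTTPRedirectHandler):
    def __init__(self):
        self.history = []

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        self.history.append(newurl)   # newurl is already absolute at this point
        return super().redirect_request(req, fp, code, msg, headers, newurl)

handler = RecordingRedirectHandler()
opener = urllib.request.build_opener(handler)
opener.open('https://httpbin.org/redirect/6').read()
print(handler.history)               # the URLs visited along the way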

Simpler way to log in to multiple sites/pages

I am not quite happy with the way I coded this. Is there a simpler and more convenient way to code this in one function and return the output of the multiple pages?
import requests

def login():
    url = "http://192.168.2.45/pricelogin.php"
    r = requests.get(url, auth=('pstats', 'pStats'))
    page = r.text
    return page

def loginhighpricingerror():
    pricingerrorurl = "http://192.168.2.45/airline_error.pl"
    peu = requests.get(pricingerrorurl, auth=('pstats', 'pstats'))
    peupage = peu.text
    return peupage

def loginsuccessfullbookings():
    sucurl = "http://192.168.2.45/airlinessucbookings.php"
    suc = requests.get(sucurl, auth=('pstats', 'pstats'))
    sucpage = suc.text
    return sucpage
Use a Session instead of the sessionless module-level functions:
import requests

s = requests.Session()
s.auth = ('pstats', 'pStats')

def login():
    url = "http://192.168.2.45/pricelogin.php"
    r = s.get(url)
    page = r.text
    return page

def loginhighpricingerror():
    pricingerrorurl = "http://192.168.2.45/airline_error.pl"
    peu = s.get(pricingerrorurl)
    peupage = peu.text
    return peupage

def loginsuccessfullbookings():
    sucurl = "http://192.168.2.45/airlinessucbookings.php"
    suc = s.get(sucurl)
    sucpage = suc.text
    return sucpage
Of course this should be refactored, but hopefully you can see what I mean.
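A short usage sketch, just to show that all three calls now reuse the same authenticated session:
price_page = login()
error_page = loginhighpricingerror()
booking_page = loginsuccessfullbookings()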
I would generalize the login function, passing the url as parameter:
def login(url):
    try:
        r = requests.get(url, auth=('pstats', 'pStats'))
    except requests.exceptions.RequestException as e:
        print(e)
        return ''  # but maybe you want to do something else
    page = r.text
    return page
You can then run it for each URL, accumulating the pages in a list, for example:
urls = ["http://192.168.2.45/pricelogin.php",
        "http://192.168.2.45/airline_error.pl",
        "http://192.168.2.45/airlinessucbookings.php"]

pages = []  # resulting list
for url in urls:
    pages.append(login(url))
Note: I added a check on an exception for requests.get since this might fail when there is a connection problem.
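Combining the two answers, a sketch that uses one shared Session together with the generalized login(url), and keeps each page keyed by its URL, could look like this (the dict is just one convenient shape for the result):
import requests

s = requests.Session()
s.auth = ('pstats', 'pStats')

def login(url):
    try:
        r = s.get(url)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return ''
    return r.text

urls = ["http://192.168.2.45/pricelogin.php",
        "http://192.168.2.45/airline_error.pl",
        "http://192.168.2.45/airlinessucbookings.php"]

pages = {url: login(url) for url in urls}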
