Google search returns None 302 on AppEngine - python

I am querying Google Search and it works fine locally, returning the expected results. When the same code is deployed on App Engine, the request comes back as a 302 with no body (logged as "302 None").
The following program extracts the links from the Google Search results.
# The first two imports will be slightly different when deployed on App Engine
from pyquery import PyQuery as pq
import requests
import random

try:
    from urllib.parse import quote as url_quote
except ImportError:
    from urllib import quote as url_quote

USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
               'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',
               'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',)

SEARCH_URL = 'https://www.google.com/search?q=site:foobar.com%20{0}'

def get_result(url):
    return requests.get(url, headers={'User-Agent': random.choice(USER_AGENTS)}).text

def get_links(query):
    result = get_result(SEARCH_URL.format(url_quote(query)))
    html = pq(result)
    return [a.attrib['href'] for a in html('.l')] or \
           [a.attrib['href'] for a in html('.r')('a')]

print get_links('foo bar')
Code deployed on App Engine:
import sys
sys.path[0:0] = ['distlibs']

import lxml
import webapp2
import json
from requests import api
from pyquery.pyquery import PyQuery as pq
import random

try:
    from urllib.parse import quote as url_quote
except ImportError:
    from urllib import quote as url_quote

USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
               'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',
               'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',)

SEARCH_URL = 'https://www.google.com/search?q=site:foobar.com%20{0}'

def get_result(url):
    return api.get(url, headers={'User-Agent': random.choice(USER_AGENTS)}).text

def get_links(query):
    result = get_result(SEARCH_URL.format(url_quote(query)))
    html = pq(result)
    return [a.attrib['href'] for a in html('.l')] or \
           [a.attrib['href'] for a in html('.r')('a')]

form = """
<form action="/process">
    <input name="q">
    <input type="submit">
</form>
"""

class MainHandler(webapp2.RequestHandler):
    def get(self):
        self.response.out.write("<h3>Write something.</h3><br>")
        self.response.out.write(form)

class ProcessHandler(webapp2.RequestHandler):
    def get(self):
        query = self.request.get("q")
        self.response.out.write("Your query : " + query)
        results = get_links(query)
        self.response.out.write(results[0])

app = webapp2.WSGIApplication([('/', MainHandler),
                               ('/process', ProcessHandler)],
                              debug=True)
I have tried querying with both the http and https protocols. The following is the App Engine log for a request.
Starting new HTTP connection (1): www.google.com
D 2013-12-21 13:13:37.217
"GET /search?q=site:foobar.com%20foo%20bar HTTP/1.1" 302 None
I 2013-12-21 13:13:37.218
Starting new HTTP connection (1): ipv4.google.com
D 2013-12-21 13:13:37.508
"GET /sorry/IndexRedirect?continue=http://www.google.com/search%3Fq%3Dsite:foobar.com%20foo%20bar HTTP/1.1" 403 None
E 2013-12-21 20:51:32.090
list index out of range

I'm puzzled as to why you're trying to spoof the User-Agent header, but if it makes you happy, go for it. Just note that if requests.get is using urlfetch under the covers, App Engine appends a string to the User-Agent header your app supplies, identifying your app. (See https://developers.google.com/appengine/docs/python/urlfetch/#Python_Request_headers).
Try passing follow_redirects = False to urlfetch. That's how you make requests to other App Engine Apps. For completely non-obvious reasons, it might help you in this case.
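The log also explains the "list index out of range" at the end: the 302 redirect lands on Google's /sorry/ page, which returns 403, so get_links() presumably returns an empty list and results[0] fails. As a rough sketch (not tested on App Engine), you could call urlfetch directly with redirects disabled to inspect where Google is sending the request; USER_AGENTS here is the tuple from the code above:
from google.appengine.api import urlfetch

# Sketch: fetch the search URL without following the 302, so the
# redirect target (the Location header) can be inspected and logged.
result = urlfetch.fetch(
    'https://www.google.com/search?q=site:foobar.com%20foo%20bar',
    headers={'User-Agent': USER_AGENTS[0]},
    follow_redirects=False)

# result.status_code should be 302 here; result.headers contains the
# redirect target under the Location key.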

Related

Scraping href value, but only for items that are in stock

I'm trying to scrape the href values for the items on the following page, but only for items that show as in stock: https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan
With the following code I've managed to scrape the hrefs, but the out_of_stock flag does not appear to be working: the printed list still contains items that are out of stock. My code:
import ssl
import requests
import sys
import time
import smtplib
from email.message import EmailMessage
import hashlib
from urllib.request import urlopen
from datetime import datetime
import json
import random
import requests
from itertools import cycle
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib3.exceptions import InsecureRequestWarning
from requests_html import HTMLSession

session = HTMLSession()

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

for i in range(1, 4):
    # Pick a random user agent
    user_agent = random.choice(user_agent_list)
    # Set the headers
    headers = {'User-Agent': user_agent}

url = 'https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, features="html.parser")

test = []
for product in soup.find_all('div', class_="productName"):
    out_of_stock = False
    for span in product.parent.find_all('span'):
        if "Out of stock" in span.text:
            out_of_stock = True
            break
    if not out_of_stock:
        test.append(product.a['href'])

print(test)
Could I please have suggestions on how to make the out_of_stock flag work correctly, so that only items that are in stock are printed? Thank you!
Here is one way to differentiate between out of stock/available products:
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}

url = 'https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan'
r = requests.get(url, headers=headers)
soup = bs(r.text, 'html.parser')
cards = soup.select('div[class="productCard"]')

for c in cards:
    product = c.select_one('div[class="productName"] a').text.strip()
    product_url = c.select_one('div[class="productName"] a').get('href')
    availability = 'Product Available' if c.select_one('div[class="productOutOfStock"]').get('style') == 'display:none;' else 'Out of Stock'
    if availability == 'Product Available':
        print(product, product_url, availability)
Result in terminal:
Macallan 12 Year Old Sherry Oak https://www.waitrosecellar.com/macallan-12-year-old-sherry-oak-717201 Product Available
Of course you can get other data points about products as well. See the BeautifulSoup documentation here: https://beautiful-soup-4.readthedocs.io/en/latest/
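For instance, here is a sketch of pulling one more field out of each card, reusing the cards list from the snippet above; the productPrice class name is purely a placeholder for illustration, so check the page's actual markup in the browser inspector first:
for c in cards:
    # "productPrice" is a hypothetical class name used for illustration;
    # replace it with whatever the real price element is called.
    price_el = c.select_one('div[class="productPrice"]')
    if price_el is not None:
        print(price_el.text.strip())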
Also, Requests-HTML seems to be unmaintained; its last release was on Feb 17, 2019, almost four years ago.

Crawling, can't connect to the site

I have a little issue: when I try to crawl a site I get an error like "HTTP Error 404 not found". I tried some ways to fix it, but it didn't work. I can't connect to the site to get the data.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import urllib.request
my_url="https://tabletennis.setkacup.com/en/schedule?date=2021-08-29&hall=4&period=1"
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
headers={'User-Agent':user_agent,}
request = urllib.request.Request(my_url,None,headers)
uClient = uReq(request)
It seems like an SSL error; if so, look here.
Or you can try the requests library:
pip install requests
import requests
my_url="https://tabletennis.setkacup.com/en/schedule?date=2021-08-29&hall=4&period=1"
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
headers={'User-Agent':user_agent,}
response = requests.request(method="GET", url=my_url, headers=headers)
print(response.content)

Python request yields status code 500 even though the website is available

I'm trying to use Python to check whether a list of websites is online. However, for several sites, requests yields the wrong status code. For example, the status code I get for https://signaturehound.com/ is 500, even though the website is online and the Chrome developer tools show a 200 response.
My code looks as follows:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

def url_ok(url):
    r = requests.head(url, timeout=5, allow_redirects=True, headers=headers)
    status_code = r.status_code
    return status_code

print(url_ok("https://signaturehound.com/"))
As suggested by @CaptainDaVinci in the comments, the solution is to replace head with get in the code; some servers do not handle HEAD requests properly and return an error even though the equivalent GET succeeds:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

def url_ok(url):
    r = requests.get(url, timeout=5, allow_redirects=True, headers=headers)
    status_code = r.status_code
    return status_code

print(url_ok("https://signaturehound.com/"))

HTTP Error 406: Not Acceptable Python urllib2

I get the following error with the code below.
HTTP Error 406: Not Acceptable Python urllib2
This is my first step before I use BeautifulSoup to parse the page.
import urllib2
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
url = "http://www.choicemoney.us/retail.php"
response = opener.open(url)
All help greatly appreciated.
The resource identified by the request is only capable of generating
response entities which have content characteristics not acceptable
according to the accept headers sent in the request. [RFC2616]
Based on the code and what the RFC describes, I assume that you need to set both the key and the value of the User-Agent header correctly.
These are correct examples:
Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A
Just replace your addheaders line with the following:
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A')]
I believe @ipinak's answer is correct.
urllib2 actually provides a default User-Agent that works here, so if you delete opener.addheaders = [('User-agent', 'Mozilla/5.0')] the response should have status code 200.
I recommend the popular requests library for such jobs as its API is much easier to use.
url = "http://www.choicemoney.us/retail.php"
resp = requests.get(url)
print resp.status_code # 200
print resp.content # can be used in your beautifulsoup.

Error logging into a HTTP Server

I'm trying to log in to an HTTP server to fetch some tables. The code I'm using is this:
import mechanize
import urllib2

MechBrowser = mechanize.Browser()

LoginUrl = 'http://www.jlrvehiclefeedback.com'
LoginData = "username=my_username&password=my_password&do=login"
LoginHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}

LoginRequest = urllib2.Request(LoginUrl, LoginData, LoginHeader)
LoginResponse = MechBrowser.open(LoginRequest)
however I get this error:
mechanize._response.httperror_seek_wrapper: HTTP Error 403: request disallowed by robots.txt
As you can see, I've defined a User-Agent but still can't get past the robots.txt policy.
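The "request disallowed by robots.txt" error is raised by mechanize itself, which fetches and enforces robots.txt before sending the request, rather than by the server. A minimal sketch of the usual workaround (assuming the site's terms of use allow it) is to disable mechanize's robots handling before opening the request:
import mechanize
import urllib2

MechBrowser = mechanize.Browser()
# Tell mechanize not to fetch and enforce robots.txt before each request.
MechBrowser.set_handle_robots(False)

LoginUrl = 'http://www.jlrvehiclefeedback.com'
LoginData = "username=my_username&password=my_password&do=login"
LoginHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}

LoginRequest = urllib2.Request(LoginUrl, LoginData, LoginHeader)
LoginResponse = MechBrowser.open(LoginRequest)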
