How to compare variables if not HTTP 200 status - Python

I have written a web scraper where I compare two values to see whether anything has been added between the previous request and the new one.
import json
import re
import time
from dataclasses import dataclass
from typing import Optional, List

import requests
from bs4 import BeautifulSoup


@dataclass
class Product:
    name: Optional[str]
    price: Optional[str]
    image: Optional[str]
    sizes: List[str]

    @staticmethod
    def get_sizes(doc: BeautifulSoup) -> List[str]:
        pat = re.compile(
            r'^<script>var JetshopData='
            r'(\{.*\})'
            r';</script>$',
        )
        for script in doc.find_all('script'):
            match = pat.match(str(script))
            if match is not None:
                break
        else:
            return []
        data = json.loads(match[1])
        return [
            variation
            for get_value in data['ProductInfo']['Attributes']['Variations']
            if get_value.get('IsBuyable')
            for variation in get_value['Variation']
        ]

    @classmethod
    def from_page(cls, url: str) -> Optional['Product']:
        with requests.get(url) as response:
            response.raise_for_status()
            doc = BeautifulSoup(response.text, 'html.parser')
        name = doc.select_one('h1.product-page-header')
        price = doc.select_one('span.price')
        image = doc.select_one('meta[property="og:image"]')
        return cls(
            name=name and name.text.strip(),
            price=price and price.text.strip(),
            image=image and image['content'],
            sizes=cls.get_sizes(doc),
        )


def main():
    product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")
    previous_request = product.sizes

    while True:
        product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")

        if set(product.sizes) - set(previous_request):
            print("new changes on the webpage")
            previous_request = product.sizes
        else:
            print("No changes made")

        time.sleep(500)


if __name__ == '__main__':
    main()
The problem I am facing is a scenario where the product can be taken down. For example, say I have found the sizes ['US 9,5/EUR 43', 'US 10,5/EUR 44,5'] and the page is then taken down by the admin, so it returns 404. A few hours later they re-add the page with the same values ['US 9,5/EUR 43', 'US 10,5/EUR 44,5']. In that case nothing would be printed, because those values were already present in the previous valid request.
I wonder what the best way would be to print the values when a webpage goes from 404 back to 200 (even if the same values are re-added)?

The use of response.raise_for_status() is incorrect in this case. It will simply raise an exception if the website returns a 404, 500 or similar, exiting your program. Replace response.raise_for_status() with:
if response.status_code != 200:
    return cls(None, None, None, None)
EDIT, as I misinterpreted the question:
An empty product will now be returned if an error occurred. The only check required now is whether the sizes have changed.
def main():
    url = "https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103"
    previous_product = Product.from_page(url)

    while True:
        product = Product.from_page(url)

        if not product.sizes == previous_product.sizes:
            print("new changes on the webpage")
        else:
            print("No changes made")

        previous_product = product
        time.sleep(500)
previous_product has been moved outside the loop. In this exact case it does not matter, but it improves readability.
The set(...) - set(...) check has been removed because it does not catch when something is removed from the website, only when something is added. If something were first removed and then re-added, it would not have been caught by your program either.
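For clarity, this is roughly how from_page could look with that check in place. It is a sketch based on the class in the question (same imports and selectors), returning an empty list for sizes rather than None so the field matches its List[str] annotation:

    @classmethod
    def from_page(cls, url: str) -> 'Product':
        response = requests.get(url)
        if response.status_code != 200:
            # Page is down (404, 500, ...): return an empty product so the
            # next successful fetch registers as a change in main()
            return cls(None, None, None, [])
        doc = BeautifulSoup(response.text, 'html.parser')
        name = doc.select_one('h1.product-page-header')
        price = doc.select_one('span.price')
        image = doc.select_one('meta[property="og:image"]')
        return cls(
            name=name and name.text.strip(),
            price=price and price.text.strip(),
            image=image and image['content'],
            sizes=cls.get_sizes(doc),
        )

Because sizes becomes an empty list while the page is down, the next successful fetch of the same sizes still counts as a change, which covers the 404-back-to-200 case from the question.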

Related

Handling final page in Python paginated API request

I'm querying Microsoft's Graph API, where I'm using the following function to request multiple pages. I'm trying to request all pages, merge the JSON responses and finally write them to a pandas dataframe.
v = "v1.0"
r = "/users?$filter=userType eq 'Member'&$select=displayName,givenName,jobTitle,mail,department&$top=200"
def query(v, r):
all_records = []
url = uri.format(v=v, r=r)
while True:
if not url:
break
result = requests.get(url, headers=headers)
if result.status_code == 200:
json_data = json.loads(result.text)
all_records = all_records + json_data["value"]
url = json_data["#odata.nextLink"]
return all_records
The while loop goes through all the pages, but when I run the function I get an error:
KeyError: '@odata.nextLink'
I assume this is because the loop reaches the final page, where '@odata.nextLink' cannot be found. But how can I handle this?
You are doing
url = json_data["@odata.nextLink"]
which suggests json_data is a dict, so you should be able to use the .get method, which returns a default value when the key is not found (None by default). Try the following and see whether it works as expected:
url = json_data.get("@odata.nextLink")
if url is None:
    print("nextLink not found")
else:
    print("nextLink found")

Index out of range when sending requests in a loop

I encounter an index out of range error when I try to get the number of contributors of a GitHub project in a loop. After some iterations (which work perfectly) it just throws that exception. I have no clue why ...
for x in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)  # prints the correct number until the exception
Here's the exception.
----> 4 contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
IndexError: list index out of range
It seems likely that you're getting a 429 - Too Many Requests, since you're firing requests one after the other.
You might want to modify your code as such:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)
    time.sleep(3)  # Wait a bit before firing off another request
Better yet would be:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    if r.status_code in [200]:  # Check if the request was successful
        xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
        contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
        print(contributors_number)
    else:
        print("Failed fetching page, status code: " + str(r.status_code))
    time.sleep(3)  # Wait a bit before firing off another request
Now this works perfectly for me while using the API. Probably the cleanest way of doing it.
import requests
import json

url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100'
response = requests.get(url)
commits = json.loads(response.text)
commits_total = len(commits)
page_number = 1

while len(commits) == 100:
    page_number += 1
    url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100' + '&page=' + str(page_number)
    response = requests.get(url)
    commits = json.loads(response.text)
    commits_total += len(commits)
GitHub is blocking your repeated requests. Do not scrape sites in quick succession; many website operators actively block clients that send too many requests. As a result, the content that is returned no longer matches your XPath query.
You should be using the REST API that GitHub provides to retrieve project stats like the number of contributors, and you should implement some kind of rate limiting. There is no need to retrieve the same number 100 times, contributor counts do not change that rapidly.
API responses include information on how many requests you can make in a time window, and you can use conditional requests to only incur rate limit costs when the data actually has changed:
import requests
import time
from urllib.parse import parse_qsl, urlparse

owner, repo = 'tipsy', 'profile-summary-for-github'
github_username = '....'
# token = '....'  # optional Github basic auth token
stats = 'https://api.github.com/repos/{}/{}/contributors'

with requests.session() as sess:
    # GitHub requests you use your username or appname in the header
    sess.headers['User-Agent'] += ' - {}'.format(github_username)
    # Consider logging in! You'll get more quota
    # sess.auth = (github_username, token)

    # start with the first, move to the last when available, include anonymous
    last_page = stats.format(owner, repo) + '?per_page=100&page=1&anon=true'

    while True:
        r = sess.get(last_page)
        if r.status_code == requests.codes.not_found:
            print("No such repo")
            break
        if r.status_code == requests.codes.no_content:
            print("No contributors, repository is empty")
            break

        if r.status_code == requests.codes.accepted:
            print("Stats not yet ready, retrying")
        elif r.status_code == requests.codes.not_modified:
            print("Stats not changed")
        elif r.ok:
            # success! Check for a last page, get that instead of current
            # to get accurate count
            link_last = r.links.get('last', {}).get('url')
            if link_last and r.url != link_last:
                last_page = link_last
            else:
                # this is the last page, report on count
                params = dict(parse_qsl(urlparse(r.url).query))
                page_num = int(params.get('page', '1'))
                per_page = int(params.get('per_page', '100'))
                contributor_count = len(r.json()) + (per_page * (page_num - 1))
                print("Contributor count:", contributor_count)

        # only get us a fresh response next time
        sess.headers['If-None-Match'] = r.headers['ETag']

        # pace ourselves following the rate limit
        window_remaining = int(r.headers['X-RateLimit-Reset']) - time.time()
        rate_remaining = int(r.headers['X-RateLimit-Remaining'])
        # sleep long enough to honour the rate limit or at least 100 milliseconds
        time.sleep(max(window_remaining / rate_remaining, 0.1))
The above uses a requests session object to handle repeated headers and ensure that you get to reuse connections where possible.
A good library such as github3.py (incidentally written by a requests core contributor) will take care of most of those details for you.
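As a rough sketch only, with the caveat that the helper names used here (the top-level github3.repository shortcut and the Repository.contributors iterator) should be checked against the github3.py documentation for your version:

import github3

# Anonymous access works but has a much lower rate limit; consider github3.login()
repo = github3.repository('tipsy', 'profile-summary-for-github')
contributor_count = sum(1 for _ in repo.contributors())
print("Contributor count:", contributor_count)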
If you do want to persist in scraping the site directly, you take the risk that the site operators block you altogether. Try to take some responsibility by not hammering the site continually.
That means that, at the very least, you should honour the Retry-After header that GitHub gives you on a 429 response:
if not r.ok:
    print("Received a response other than 200 OK:", r.status_code, r.reason)
    retry_after = r.headers.get('Retry-After')
    if retry_after is not None:
        print("Response included a Retry-After:", retry_after)
        time.sleep(int(retry_after))
else:
    pass  # parse OK response
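As an illustration only (the fetch_with_retry helper is made up here, not part of the original answer), a small retry wrapper around that idea could look like:

import time
import requests

def fetch_with_retry(url, max_attempts=5):
    """Fetch url, honouring Retry-After when the server throttles us."""
    for attempt in range(max_attempts):
        r = requests.get(url)
        if r.ok:
            return r
        print("Received a response other than 200 OK:", r.status_code, r.reason)
        retry_after = r.headers.get('Retry-After')
        # Fall back to a short fixed pause if no Retry-After header was sent
        time.sleep(int(retry_after) if retry_after else 3)
    return None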

Python: How to retrieve a stock's current price from the dictionary and put it into a variable?

I am trying to obtain a stock's current price, and then put it into a variable to run if / else statements on. I have used the Google API to retrieve current stock prices, but I am unable to figure out how to put it into a variable. Thanks!
import json
import sys

try:
    from urllib.request import Request, urlopen
except ImportError:  # python 2
    from urllib2 import Request, urlopen

googleFinanceKeyToFullName = {
    u'id': u'ID',
    u't': u'StockSymbol',
    u'e': u'Index',
    u'l': u'LastTradePrice',
    u'l_cur': u'LastTradeWithCurrency',
    u'ltt': u'LastTradeTime',
    u'lt_dts': u'LastTradeDateTime',
    u'lt': u'LastTradeDateTimeLong',
    u'div': u'Dividend',
    u'yld': u'Yield'
}

def buildUrl(symbols):
    symbol_list = ','.join([symbol for symbol in symbols])
    # a deprecated but still active & correct api
    return 'http://finance.google.com/finance/info?client=ig&q=' + symbol_list

def request(symbols):
    url = buildUrl(symbols)
    req = Request(url)
    resp = urlopen(req)
    # remove special symbols such as the pound symbol
    content = resp.read().decode('ascii', 'ignore').strip()
    content = content[3:]
    return content

def replaceKeys(quotes):
    global googleFinanceKeyToFullName
    quotesWithReadableKey = []
    for q in quotes:
        qReadableKey = {}
        for k in googleFinanceKeyToFullName:
            if k in q:
                qReadableKey[googleFinanceKeyToFullName[k]] = q[k]
        quotesWithReadableKey.append(qReadableKey)
    return quotesWithReadableKey

def getQuotes(symbols):
    if type(symbols) == type('str'):
        symbols = [symbols]
    content = json.loads(request(symbols))
    return replaceKeys(content)

if __name__ == '__main__':
    try:
        symbols = sys.argv[1]
    except:
        symbols = "GOOG,AAPL,MSFT,AMZN,SBUX"
    symbols = symbols.split(',')
    try:
        print(json.dumps(getQuotes(symbols), indent=2))
    except:
        print("Fail")
You can get the last stock's current price from the dictionary and put it into a variable, say price,
by changing the last part of the code to:
try:
    quotes = getQuotes(symbols)
    price = quotes[-1]['LastTradePrice']  # -1 means last in a list
    print(price)
except Exception as e:
    print(e)
but it is very unreliable, because if the order of the quotes changes you will get the price of a different stock.
What you should do instead is learn how to define a data structure that is suitable for solving your problem.
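For example, a small sketch along those lines, assuming getQuotes returns the list of dicts shown above: key the quotes by StockSymbol so the lookup no longer depends on ordering (the AAPL threshold below is an arbitrary illustration):

quotes = getQuotes(symbols)
# Index the quotes by symbol so the order of the response does not matter
prices_by_symbol = {q['StockSymbol']: q['LastTradePrice'] for q in quotes}

price = float(prices_by_symbol['AAPL'].replace(',', ''))  # LastTradePrice is a string
if price > 150:  # arbitrary threshold, purely for illustration
    print("AAPL is above 150")
else:
    print("AAPL is at or below 150")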

Results not showing after search term entered from keyboard

I am learning to develop an addon for Kodi and need to implement search functionality. I found some resources online to get user input from the keyboard and then call an API with the search term to fetch results. The API is being requested fine, but the results are not being shown through ListItems. Below is my code:
import sys
import json
import requests
import xbmc
import xbmcgui
import xbmcplugin
from urllib.parse import urlencode, parse_qsl

_url = sys.argv[0]
_handle = int(sys.argv[1])

def get_url(**kwargs):
    return '{0}?{1}'.format(_url, urlencode(kwargs))

def display_main_menu():
    list_item = xbmcgui.ListItem(label="Search")
    url = get_url(action='search')
    xbmcplugin.addDirectoryItem(_handle, url, list_item)

def perform_search(search_term):
    link = "api_url_here" + search_term
    r = requests.get(link)
    resp = json.loads(r.text)
    for result in resp:
        list_item = xbmcgui.ListItem(label=result["name"])
        list_item.setArt({'thumb': result["img"], 'icon': result["img"], 'fanart': result["img"]})
        url = ''  # blank url for testing
        is_folder = True
        xbmcplugin.addDirectoryItem(_handle, url, list_item, is_folder)
    xbmcplugin.endOfDirectory(_handle)

def search():
    keyb = xbmc.Keyboard('', "Search for Videos", False)
    keyb.setDefault('')
    keyb.doModal()
    if keyb.isConfirmed() and len(keyb.getText()) > 0:
        perform_search(keyb.getText())

def router(paramstring):
    params = dict(parse_qsl(paramstring))
    if params:
        if params['action'] == 'search':
            search()
        else:
            raise ValueError('Invalid paramstring: {0}!'.format(paramstring))
    else:
        display_main_menu()

if __name__ == '__main__':
    router(sys.argv[2][1:])
When I select Search and type my word to search, the keyboard is dismissed but nothing happens. The same menu is displayed and the new ListItems from the perform_search function are not shown. Also, there is no error. Please help.
Add some logging; the interesting part is whether you actually reach the for loop. So add some xbmc.log('hit') calls, or even log your values.
If you want full-blown logging, check this example: https://github.com/xbmc/generator-kodi-addon/blob/master/generators/app/templates/resources/lib/kodilogging.py
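For instance, a rough sketch of perform_search with such logging added (xbmc.log with xbmc.LOGINFO is the standard Kodi logging call on Kodi 19+; older versions use different level constants, and the API URL is still the placeholder from your code):

def perform_search(search_term):
    link = "api_url_here" + search_term  # placeholder URL from the question
    r = requests.get(link)
    xbmc.log("search status code: {0}".format(r.status_code), xbmc.LOGINFO)
    resp = json.loads(r.text)
    xbmc.log("number of results: {0}".format(len(resp)), xbmc.LOGINFO)
    for result in resp:
        xbmc.log("hit: {0}".format(result.get("name")), xbmc.LOGINFO)
        list_item = xbmcgui.ListItem(label=result["name"])
        list_item.setArt({'thumb': result["img"], 'icon': result["img"], 'fanart': result["img"]})
        xbmcplugin.addDirectoryItem(_handle, '', list_item, True)
    xbmcplugin.endOfDirectory(_handle)

If the status code or result count never appears in the Kodi log, the problem is in the API call; if the hits appear but nothing is listed, the problem is in how the directory items are added.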

Recursive function gives no output

I'm scraping all the URLs of my domain with a recursive function, but it outputs nothing, without any error.
#!/usr/bin/python
from bs4 import BeautifulSoup
import requests
import tldextract


def scrape(url):
    for links in url:
        main_domain = tldextract.extract(links)
        r = requests.get(links)
        data = r.text
        soup = BeautifulSoup(data)
        for href in soup.find_all('a'):
            href = href.get('href')
            if not href:
                continue
            link_domain = tldextract.extract(href)
            if link_domain.domain == main_domain.domain:
                problem.append(href)
            elif not href == '#' and link_domain.tld == '':
                new = 'http://www.' + main_domain.domain + '.' + main_domain.tld + '/' + href
                problem.append(new)
    return len(problem)
    return scrape(problem)


problem = ["http://xyzdomain.com"]
print(scrape(problem))
When I create a new list, it works, but I don't want to make a list every time for every loop.
You need to structure your code so that it meets the pattern for recursion, which your current code doesn't. You also should not name variables the same as libraries, e.g. href = href.get(), because this will usually stop the library working once the name becomes a variable. And your code as it currently stands will only ever return the len(), because that return is unconditionally reached before return scrape(problem):
def Recursive(Factorable_problem):
    if Factorable_problem is Simplest_Case:
        return AnswerToSimplestCase
    else:
        return Rule_For_Generating_From_Simpler_Case(Recursive(Simpler_Case))
for example:

def Factorial(n):
    """ Recursively Generate Factorials """
    if n < 2:
        return 1
    else:
        return n * Factorial(n - 1)
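Applied to your problem, a rough, untested sketch of a recursive structure could look like the following (the visited parameter and the overall shape are an illustration, not a drop-in fix; relative links are ignored for brevity):

from bs4 import BeautifulSoup
import requests
import tldextract

def scrape(to_visit, visited=None):
    """Recursively collect same-domain links until none are left to visit."""
    if visited is None:
        visited = set()
    if not to_visit:  # simplest case: nothing left to crawl
        return visited
    link = to_visit.pop()
    if link not in visited:
        visited.add(link)
        main_domain = tldextract.extract(link)
        soup = BeautifulSoup(requests.get(link).text, 'html.parser')
        for a in soup.find_all('a'):
            href = a.get('href')
            # only follow absolute links on the same domain
            if href and tldextract.extract(href).domain == main_domain.domain:
                to_visit.append(href)
    return scrape(to_visit, visited)  # simpler case: one more link visited

print(scrape(["http://xyzdomain.com"]))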
Hello, I've made a non-recursive version of this that appears to get all the links on the same domain.
I tested the code below using the problem list included in the code. After I'd solved the problems with the recursive version, the next problem was hitting the recursion depth limit, so I rewrote it to run in an iterative fashion. The code and result are below:
from bs4 import BeautifulSoup
import requests
import tldextract


def print_domain_info(d):
    print "Main Domain:{0} \nSub Domain:{1} \nSuffix:{2}".format(d.domain, d.subdomain, d.suffix)


SEARCHED_URLS = []
problem = ["http://Noelkd.neocities.org/", "http://youpi.neocities.org/"]

while problem:
    # Get a link from the stack of links
    link = problem.pop()
    # Check we haven't been to this address before
    if link in SEARCHED_URLS:
        continue
    # We don't want to come back here again after this point
    SEARCHED_URLS.append(link)
    # Try and get the website
    try:
        req = requests.get(link)
    except:
        # If its not working i don't care for it
        print "borked website found: {0}".format(link)
        continue
    # Now we get to this point worth printing something
    print "Trying to parse:{0}".format(link)
    print "Status Code:{0} That's: {1}".format(req.status_code, "A-OK" if req.status_code == 200 else "SOMETHING'S UP")
    # Get the domain info
    dInfo = tldextract.extract(link)
    print_domain_info(dInfo)
    # I like utf-8
    data = req.text.encode("utf-8")
    print "Length Of Data Retrieved:{0}".format(len(data))  # More info
    soup = BeautifulSoup(data)  # This was here before so i left it.
    print "Found {0} link{1}".format(len(soup.find_all('a')), "s" if len(soup.find_all('a')) > 1 else "")
    FOUND_THIS_ITERATION = []  # Getting the same links over and over was boring
    found_links = [x for x in soup.find_all('a') if x.get('href') not in SEARCHED_URLS]  # Find me all the links i don't got
    for href in found_links:
        href = href.get('href')  # You wrote this seems to work well
        if not href:
            continue
        link_domain = tldextract.extract(href)
        if link_domain.domain == dInfo.domain:  # JUST FINDING STUFF ON SAME DOMAIN RIGHT?!
            if href not in FOUND_THIS_ITERATION:  # I'ma check you out next time
                print "Check out this link: {0}".format(href)
                print_domain_info(link_domain)
                FOUND_THIS_ITERATION.append(href)
                problem.append(href)
            else:  # I got you already
                print "DUPE LINK!"
        else:
            print "Not on same domain moving on"
    # Count down
    print "We have {0} more sites to search".format(len(problem))
    if problem:
        continue
    else:
        print "It's been fun"
        print "Let's see the URLs we've visited:"
        for url in SEARCHED_URLS:
            print url
Which prints, after a lot of other logging output, loads of neocities websites!
What's happening is that the script pops a value off the list of websites yet to visit, then gets all the links on that page which are on the same domain. If those links lead to pages we haven't visited, they are added to the list of links to be visited. After that the next page is popped and the same thing happens again until there are no pages left to visit.
I think this is what you're looking for; get back to us in the comments if it doesn't work the way you want, or if anyone can improve it please leave a comment.
