What has changed on the site?

What has changed on the site? - python

i want to know if there is a way i monitor a site for changes that i dont only get the message "something changed", that i get what has changed too.
My code right now is:
url = Request('https://stackoverflow.com',
headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
print("running")
time.sleep(10)
while True:
try:
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
time.sleep(30)
response = urlopen(url).read()
newHash = hashlib.sha224(response).hexdigest()
if newHash == currentHash:
continue
else:
print("something changed")
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
time.sleep(30)
continue
except Exception as e:
print("error")

Related

While Loop doesn't break when error is occurred

I want my while loop to break when there an error, but it doesn't break / close program when it does...
from bs4 import BeautifulSoup
def check_listing_sell():
counter = 0
house_counter = 0
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
r = requests.get(url)
try:
soup = BeautifulSoup(r.text, "html.parser")
for item in soup.select("div.property-address"):
house_counter += 1
address_prospect = item.get_text(strip=True)
print(f"{address_prospect} {house_counter}")
counter += 12
except Exception as e:
print(e)
break
check_listing_sell()

For some reason, soup.select("div.property-address") returns an empty webelements (not an error) even on 'no results' page. Thus, the condition if len(soup.select("div.property-address")) == 0 should be added. Moreover, placing r = requests.get(url) inside the try block is a decent suggestion.
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
try:
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
if len(soup.select("div.property-address")) == 0:
break
for item in soup.select("div.property-address"):
house_counter += 1
address_prospect = item.get_text(strip=True)
print(f"{address_prospect} {house_counter}")
counter += 12
except Exception as e:
print(e)
break

Move the call to requests.get() inside the try.
KeyboardInterrupt is not a subtype of Exception, so you need a separate except block for that.
#from bs4 import BeautifulSoup
import requests
def check_listing_sell():
counter = 0
house_counter = 0
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
try:
print(url)
r = requests.get(url)
print(r.text[:30])
# soup = BeautifulSoup(r.text, "html.parser")
# for item in soup.select("div.property-address"):
# house_counter += 1
# address_prospect = item.get_text(strip=True)
# print(f"{address_prospect} {house_counter}")
counter += 12
except KeyboardInterrupt:
print("Manual interrupt")
break
except Exception as e:
print(f"Exception occurred for counter={counter}, stopping loop: {e}")
break
check_listing_sell()

How to check if file is already in the cache?

i am working on a Tool which can post Ingame News Updates from Games on your Twitter Account, in the last few Days I searched for an solution to find a way to look if an image is already in the cache so it wont be posted again if a News Feed Update of an Game is online, it should only post the new ones and skip the old ones (actually it posts every active News Feed again), the problem is can't do it. I tested it about 100 Times but it wont work. I really hope that one of you can help me with this issue because it would be fantastic if this Tool would work with an method like this. Thanks for every single help in advance.
Here is my code:
import tweepy
import time
from colorama import *
init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
response = requests.get('https://fortnite-api.com/v2/news/br')
if response:
newsDataLoop = response.json()["data"]
print("2 - Checking for change in news feed...")
if newsData != newsDataLoop:
#if loop == True:
print("News Feed has changed...")
for i in newsDataLoop["motds"]:
try:
print("Saving: "+i["id"])
url = i["image"]
r = requests.get(url, allow_redirects=True)
open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
print("Saved: "+i["id"])
try:
api = tweepy.API(auth)
api.update_with_media("NewsImages/"+i["id"]+'.png',"Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
print("Tweeted: "+i["id"])
except:
print("Failed to tweet: "+i["id"])
if saveImages == 'False':
os.remove("NewsImages/"+i["id"]+'.png')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
except:
print("Error in tweeting news feed: skipping")
print("Finished news feed publishing")
else:
print("FAILED TO GRAB NEWS DATA: URL DOWN")
time.sleep(delay)

You need to check each motd to see if it existed in the old dataset.
import tweepy
import time
from colorama import *
init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
response = requests.get('https://fortnite-api.com/v2/news/br')
if response:
newsDataLoop = response.json()["data"]
print("2 - Checking for change in news feed...")
if newsData != newsDataLoop:
#if loop == True:
print("News Feed has changed...")
for i in newsDataLoop["motds"]:
if i in newsData["motds"]:
# has already been posted
print("Already posted")
continue
try:
print("Saving: "+i["id"])
url = i["image"]
r = requests.get(url, allow_redirects=True)
open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
print("Saved: "+i["id"])
try:
api = tweepy.API(auth)
api.update_with_media("NewsImages/"+i["id"]+'.png',"Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
print("Tweeted: "+i["id"])
except:
print("Failed to tweet: "+i["id"])
if saveImages == 'False':
os.remove("NewsImages/"+i["id"]+'.png')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
except:
print("Error in tweeting news feed: skipping")
print("Finished news feed publishing")
else:
print("FAILED TO GRAB NEWS DATA: URL DOWN")
time.sleep(delay)

Python - save images to folder after requests

I have this function that fecthes a bunch of images:
def get_player_images_with_api():
url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
headers = {'Origin': 'https://www.premierleague.com'}
page=0
while True:
try:
data = requests.get(url.format(page=page), headers=headers).json()
for player in data['content']:
print('{:<50} {}'.format(player['name']['display'], img_url.format(player_id=player['altIds']['opta'])))
sleep(2)
page+=1
except:
break
How do I dinamically save each image on a 'path/to/image' folder with player['name'].png format?

Here you go :)
import requests
from time import sleep
import urllib.request
def get_player_images_with_api():
url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
headers = {'Origin': 'https://www.premierleague.com'}
page = 0
while True:
try:
data = requests.get(url.format(page=page), headers=headers).json()
for player in data['content']:
print('{:<50} {}'.format(
player['name']['display'],
img_url.format(player_id=player['altIds']['opta'])))
urllib.request.urlretrieve(
img_url.format(player_id=player['altIds']['opta']),
player['name']['display'] + ".png")
sleep(2)
page += 1
except:
break

How can I prevent duplicates with try and except blocks?

I have a list of urls, which contain JSON files.
The JSON files are all stored differently, therefore I need try and except blocks to cover the different storing methods.
The problem is that this method leads to some duplicates, because some links get requested two times or more in the different blocks.
My code:
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass
Is it possible to write the try/except blocks in a way that a link gets ignored, if it was succesfully requested in a previous block?

The first 2 try/except blocks are explicitly duplicated and there's no functional benefit to repeat them.
Instead, think through 2 consecutive phases:
extracting a remote resource
parsing JSON string and storing the result
So when extracting phase is failed - no sense to move forward, if the 1st parsing phase is failed - try another kind of parsing:
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
except:
continue
try:
try:
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
except:
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
except:
pass

This should solve it for you
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
continue
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
continue
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass

Except open URL BeautifulSoup?

I have function that does request and gets page by URL:
def openUrl(similar_url):
print("Open URL: " + similar_url)
try:
req = urllib.request.Request(similar_url)
return urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
print("HTTP Response: " + str(e.code))
I call this function from another:
def get(url):
content = openUrl(url);
try:
soup = BeautifulSoup(content, "html.parser")
except:
pass
for url in urls:
get(url)
Problem is that if I get exception in openUrl then I get erro in soup = BeautifulSoup(content, "html.parser"), because I try to get empty content.
How can I except skip this iteration when error is?

return *WHAT YOU WANT* inside exception block
try:
req = urllib.request.Request(similar_url)
return urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
print("HTTP Response: " + str(e.code))
return None # or return ""
and you can call like this
def get(url):
try:
content = openUrl(url) # remove semicolon
except:
pass
else:
if contents is not None:
soup = BeautifulSoup(content, "html.parser")

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

What has changed on the site? - python

Related

While Loop doesn't break when error is occurred

How to check if file is already in the cache?

Python - save images to folder after requests

How can I prevent duplicates with try and except blocks?

Except open URL BeautifulSoup?

Categories

Resources