While loop doesn't break when an error occurs - Python

I want my while loop to break when there is an error, but it doesn't break / close the program when one occurs...
import requests
from bs4 import BeautifulSoup

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        r = requests.get(url)
        try:
            soup = BeautifulSoup(r.text, "html.parser")
            for item in soup.select("div.property-address"):
                house_counter += 1
                address_prospect = item.get_text(strip=True)
                print(f"{address_prospect} {house_counter}")
            counter += 12
        except Exception as e:
            print(e)
            break

check_listing_sell()

For some reason, soup.select("div.property-address") returns an empty result set (not an error) even on the 'no results' page, so nothing in the loop ever raises. Thus, a check such as if len(soup.select("div.property-address")) == 0 should be added to break out. Moreover, moving r = requests.get(url) inside the try block is a decent suggestion.
while True:
    url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        if len(soup.select("div.property-address")) == 0:
            break
        for item in soup.select("div.property-address"):
            house_counter += 1
            address_prospect = item.get_text(strip=True)
            print(f"{address_prospect} {house_counter}")
        counter += 12
    except Exception as e:
        print(e)
        break

Move the call to requests.get() inside the try.
KeyboardInterrupt is not a subclass of Exception, so you need a separate except block for it.
#from bs4 import BeautifulSoup
import requests

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        try:
            print(url)
            r = requests.get(url)
            print(r.text[:30])
            # soup = BeautifulSoup(r.text, "html.parser")
            # for item in soup.select("div.property-address"):
            #     house_counter += 1
            #     address_prospect = item.get_text(strip=True)
            #     print(f"{address_prospect} {house_counter}")
            counter += 12
        except KeyboardInterrupt:
            print("Manual interrupt")
            break
        except Exception as e:
            print(f"Exception occurred for counter={counter}, stopping loop: {e}")
            break

check_listing_sell()
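
A related gotcha worth noting (my addition, not part of the original answers): requests.get() does not raise an exception for HTTP error statuses such as 404, so a failing page will not reach the except branch by itself. Calling raise_for_status() turns those statuses into exceptions, for example:

import requests

url = "https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset=0#listing"
r = requests.get(url, timeout=10)
r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
print(r.status_code)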

Related

JSONDecodeError: Extra data: line 1 column 8 (char 7)

I've followed a tutorial to scrape a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my python script:
# Imports required by the script below
import json
import logging
import time
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup

def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')

#To login
def make_login(session, base_url, credentials):
    #Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
        '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email': credentials['email'], 'pass': credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    #Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Function for profile URL and credentials for FB
def json_to_obj(filename):
    #Extracts data from a JSON file and saves it in a Python object
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    #Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profile URLs to scrape
    credentials = json_to_obj('credentials.json')
    profiles_urls = json_to_obj('profiles_urls.json')
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profile URLs to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
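
No answer is recorded in this thread, but for context (my note, not part of the original post): json.loads raises "Extra data" when the input contains one complete JSON value followed by further characters. In this script json.loads is only called in json_to_obj(), so one plausible but unconfirmed culprit is a credentials.json or profiles_urls.json file that is not a single JSON document. A minimal reproduction:

import json

# One complete JSON value followed by anything else triggers "Extra data".
try:
    json.loads('{"a": 1} trailing text')
except json.JSONDecodeError as e:
    print(e)  # Extra data: ... (offset where the trailing text starts)

# A hypothetical credentials file missing its enclosing braces: json.loads
# parses the 7-character string "email" and then stops at char 7, which
# matches the reported "Extra data: line 1 column 8 (char 7)".
try:
    json.loads('"email": "user@example.com", "pass": "secret"')
except json.JSONDecodeError as e:
    print(e)  # Extra data: line 1 column 8 (char 7)

# Wrapping the contents in a single object parses cleanly.
print(json.loads('{"email": "user@example.com", "pass": "secret"}'))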

What has changed on the site?

I want to know if there is a way to monitor a site for changes so that I don't only get the message "something changed", but also see what has changed.
My code right now is:
import hashlib
import time
from urllib.request import Request, urlopen

url = Request('https://stackoverflow.com',
              headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
print("running")
time.sleep(10)
while True:
    try:
        response = urlopen(url).read()
        currentHash = hashlib.sha224(response).hexdigest()
        time.sleep(30)
        response = urlopen(url).read()
        newHash = hashlib.sha224(response).hexdigest()
        if newHash == currentHash:
            continue
        else:
            print("something changed")
            response = urlopen(url).read()
            currentHash = hashlib.sha224(response).hexdigest()
            time.sleep(30)
            continue
    except Exception as e:
        print("error")
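
No answer is included in this thread, but one common approach (a sketch of my own, not from the original poster) is to keep the previous page text and diff it against the new text with difflib, which reports the changed lines instead of just the fact that a change happened:

import difflib
import time
from urllib.request import Request, urlopen

url = Request('https://stackoverflow.com', headers={'User-Agent': 'Mozilla/5.0'})
old_text = urlopen(url).read().decode('utf-8', errors='replace')

while True:
    time.sleep(30)
    new_text = urlopen(url).read().decode('utf-8', errors='replace')
    if new_text != old_text:
        # unified_diff yields lines prefixed with '-' (removed) and '+' (added)
        diff = difflib.unified_diff(old_text.splitlines(), new_text.splitlines(), lineterm='')
        print('\n'.join(diff))
        old_text = new_text

Keep in mind that a dynamic page like stackoverflow.com changes on almost every request, so in practice you would diff only the fragment of the HTML you actually care about.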

How can I prevent duplicates with try and except blocks?

I have a list of URLs that contain JSON files.
The JSON files are all stored differently, therefore I need try and except blocks to cover the different storage formats.
The problem is that this approach leads to some duplicates, because some links get requested two or more times in the different blocks.
My code:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
Is it possible to write the try/except blocks in a way that a link gets ignored if it was successfully requested in a previous block?
The first two try/except blocks are exact duplicates, so there is no functional benefit in repeating them.
Instead, think of the work as two consecutive phases:
extracting the remote resource
parsing the JSON string and storing the result
When the extraction phase fails there is no point in moving forward; when the first kind of parsing fails, try the other kind:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
    except:
        continue
    try:
        try:
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        except:
            textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
This should solve it for you: the continue after each successful append skips the remaining blocks for that URL.
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
        continue  # parsed successfully, skip the remaining blocks for this URL
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
        continue  # parsed successfully, skip the last block for this URL
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
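
As a variant of the first answer's idea (my own sketch, not from the thread), you can fetch each URL once and try the parsing strategies in order, appending only the first result that succeeds; the helper names below are made up for illustration:

import json
import requests

def parse_plain(text):
    # hypothetical helper: parse the first {...} span directly
    return json.loads(text[text.find("{"):text.rfind("}")+1])

def parse_marker(text):
    # hypothetical helper: cut at the __IR_CURRPAGE_DATA_JSON__ marker first
    text = text.split("__IR_CURRPAGE_DATA_JSON__")[1]
    return json.loads(text[text.find("{"):text.rfind("}")+1])

r_urls = []
for line in urls:  # `urls` as in the question
    try:
        textinhalt = requests.get(line).text
    except requests.RequestException:
        continue  # the request failed, move on to the next URL
    for parser in (parse_plain, parse_marker):
        try:
            r_urls.append(parser(textinhalt))
            break  # stop after the first parser that works, so no duplicates
        except (json.JSONDecodeError, IndexError):
            pass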

Why am I getting UnboundLocalError?

import requests
from bs4 import BeautifulSoup
from random import choice

def get_proxy():
    url = "https://free-proxy-list.net/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    return {'https': choice(list(map(lambda x: x[0]+':'+x[1], list(zip(map(lambda x: x.text, soup.findAll('td')[::8]), map(lambda x: x.text, soup.findAll('td')[1::8]))))))}

def proxy_request(request_type, url, **kwargs):
    while 1:
        try:
            proxy = get_proxy()
            print("using proxy: {}".format(proxy))
            r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
            num = soup.find_all('h1')
            print(num.text())
            break
        except:
            pass
        return r

r = proxy_request('get', "https://en.wikipedia.org/wiki/Main_Page")
print(r.text())
and I am getting this error
The problem is that your return is inside your while loop. When you break, you skip the return, so r ends up as None:
def proxy_request(request_type, url, **kwargs):
    while 1:
        try:
            proxy = get_proxy()
            print("using proxy: {}".format(proxy))
            r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
            num = soup.find_all('h1')
            print(num.text())
            break
        except:
            pass
    return num  # return it here, outside of the while loop
You'll also want to consider the case where num is never found, and add a way to break out of the loop so you don't spin forever.
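
A minimal sketch of that suggestion (my addition, not from the answer; the max_retries parameter is assumed and the proxy lookup is left out for brevity): cap the number of attempts and return None once the cap is hit:

import requests

def proxy_request(request_type, url, max_retries=5, **kwargs):
    # max_retries is an assumed parameter, not part of the original answer
    for attempt in range(max_retries):
        try:
            r = requests.request(request_type, url, timeout=7, **kwargs)
            return r  # success: return directly instead of breaking
        except requests.RequestException as e:
            print("attempt {} failed: {}".format(attempt + 1, e))
    return None  # give up after max_retries attempts; the caller must handle None

r = proxy_request('get', "https://en.wikipedia.org/wiki/Main_Page")
if r is not None:
    print(r.text[:100])  # note: .text is a property, not a method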

Except open URL BeautifulSoup?

I have a function that makes a request and gets a page by URL:
def openUrl(similar_url):
    print("Open URL: " + similar_url)
    try:
        req = urllib.request.Request(similar_url)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("HTTP Response: " + str(e.code))
I call this function from another:
def get(url):
    content = openUrl(url);
    try:
        soup = BeautifulSoup(content, "html.parser")
    except:
        pass

for url in urls:
    get(url)
The problem is that if I get an exception in openUrl, then I get an error in soup = BeautifulSoup(content, "html.parser"), because I try to parse empty content.
How can I skip that iteration when an error occurs?
Return *WHAT YOU WANT* inside the exception block:
try:
    req = urllib.request.Request(similar_url)
    return urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
    print("HTTP Response: " + str(e.code))
    return None  # or return ""
and you can call it like this:
def get(url):
    try:
        content = openUrl(url)  # remove semicolon
    except:
        pass
    else:
        if content is not None:
            soup = BeautifulSoup(content, "html.parser")
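
One extra detail worth noting (my note, not from the answer): urllib.error.URLError only carries a .code attribute when the failure is actually an HTTPError; for connection or DNS failures, the e.code lookup itself raises AttributeError. A sketch that splits the two cases:

import urllib.request
import urllib.error

def openUrl(similar_url):
    print("Open URL: " + similar_url)
    try:
        req = urllib.request.Request(similar_url)
        return urllib.request.urlopen(req).read()
    except urllib.error.HTTPError as e:
        print("HTTP Response: " + str(e.code))  # HTTPError always has .code
    except urllib.error.URLError as e:
        print("URL error: " + str(e.reason))    # e.g. DNS failure; no .code here
    return None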
