Skip iteration when opening a URL fails (BeautifulSoup) - python

I have a function that makes a request and fetches a page by URL:
def openUrl(similar_url):
    print("Open URL: " + similar_url)
    try:
        req = urllib.request.Request(similar_url)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("HTTP Response: " + str(e.code))
I call this function from another:
def get(url):
    content = openUrl(url);
    try:
        soup = BeautifulSoup(content, "html.parser")
    except:
        pass

for url in urls:
    get(url)
The problem is that if openUrl raises an exception, I then get an error at soup = BeautifulSoup(content, "html.parser"), because content is empty. How can I skip that iteration when an error occurs?

Return *what you want* inside the exception block:
try:
    req = urllib.request.Request(similar_url)
    return urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
    print("HTTP Response: " + str(e.code))
    return None # or return ""
and you can call it like this:
def get(url):
    try:
        content = openUrl(url) # remove semicolon
    except:
        pass
    else:
        if content is not None:
            soup = BeautifulSoup(content, "html.parser")
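Putting the pieces together, here is a minimal sketch of the whole flow under this answer's approach; the urls list and the print(soup.title) line are placeholders for whatever processing you actually do:

import urllib.request
import urllib.error
from bs4 import BeautifulSoup

def openUrl(similar_url):
    print("Open URL: " + similar_url)
    try:
        req = urllib.request.Request(similar_url)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        # str(e) is used here because a plain URLError has no .code attribute
        print("HTTP Response: " + str(e))
        return None  # signal failure to the caller

def get(url):
    content = openUrl(url)
    if content is None:
        return  # nothing was downloaded, skip this URL
    soup = BeautifulSoup(content, "html.parser")
    print(soup.title)  # placeholder: real processing goes here

urls = ["https://example.com"]  # hypothetical list of URLs
for url in urls:
    get(url)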

Related

While loop doesn't break when an error occurs

I want my while loop to break when there is an error, but it doesn't break / close the program when one occurs...
import requests
from bs4 import BeautifulSoup

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        r = requests.get(url)
        try:
            soup = BeautifulSoup(r.text, "html.parser")
            for item in soup.select("div.property-address"):
                house_counter += 1
                address_prospect = item.get_text(strip=True)
                print(f"{address_prospect} {house_counter}")
            counter += 12
        except Exception as e:
            print(e)
            break

check_listing_sell()
For some reason, soup.select("div.property-address") returns an empty list (not an error) even on the 'no results' page. Thus, the condition if len(soup.select("div.property-address")) == 0 should be added. Moreover, placing r = requests.get(url) inside the try block is a sensible suggestion.
while True:
    url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        if len(soup.select("div.property-address")) == 0:
            break
        for item in soup.select("div.property-address"):
            house_counter += 1
            address_prospect = item.get_text(strip=True)
            print(f"{address_prospect} {house_counter}")
        counter += 12
    except Exception as e:
        print(e)
        break
Move the call to requests.get() inside the try.
KeyboardInterrupt is not a subclass of Exception, so you need a separate except block for it.
# from bs4 import BeautifulSoup
import requests

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        try:
            print(url)
            r = requests.get(url)
            print(r.text[:30])
            # soup = BeautifulSoup(r.text, "html.parser")
            # for item in soup.select("div.property-address"):
            #     house_counter += 1
            #     address_prospect = item.get_text(strip=True)
            #     print(f"{address_prospect} {house_counter}")
            counter += 12
        except KeyboardInterrupt:
            print("Manual interrupt")
            break
        except Exception as e:
            print(f"Exception occurred for counter={counter}, stopping loop: {e}")
            break

check_listing_sell()
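One additional point, not part of either answer above: requests does not raise an exception for HTTP error status codes such as 404 or 500, so an except block alone will never fire on a bad response. A minimal sketch, assuming you want such responses to stop the loop, using Response.raise_for_status():

import requests

url = "https://example.com"  # placeholder; inside the loop above, url is already set
try:
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
except requests.RequestException as e:
    print(f"Request failed: {e}")
    # break or continue here, depending on the desired behaviour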

How to use Funcy "print_durations" in grequests function?

I've never used this type of module or function before. I want to know how I can time and print the time taken to find the "response 200" link.
##################### START MODUL PARSE #######################
def my_except_handler(request, exception):
    request.url

def check_for_errors(response, *args, **kwargs):
    try:
        response
    except response.exceptions.RequestException as e:
        pass
    except response.exceptions.HTTPError as e:
        pass
    except response.ReadTimeout as e:
        pass
    except response.ConnectionError as e:
        pass
    except response.ConnectTimeout:
        pass

def do_parse(response, *args, **kwargs):
    url = response.url
    if response.status_code == 200:
        response.request.url
        url_parse = response.request.url
        response.text
        try:
            if "<font color=#25ff00>" in response.text:
                print("STOP SCAN PARSE Time SCAN HERE NEED PRINT TIME")
        except Exception as e:
            pass

def get_urls_file(site):
    urls = []
    config_file = "Configurations/package.json"
    config = config_file = json.loads(open(f'{config_file}').read())
    par_s = config['parse']
    for x in par_s:
        urls.append(f'{site}{x}')

    def get_data_file(urls):
        actions_list = []
        for url in urls:
            action_item = grequests.get(url, headers=headers, timeout=6, stream=True, allow_redirects=False, hooks={'response': [do_parse, check_for_errors]})
            actions_list.append(action_item)
        grequests.map(actions_list, size=30, exception_handler=my_except_handler)

    get_data_file(urls)
Also, I have this example:
from funcy import print_durations

@print_durations()
def myfunc(n=0):
    for i in range(n):
        pass

myfunc(123)
myfunc(123456789)
but if I try to put @print_durations() like here:
@print_durations()
def do_parse(response, *args, **kwargs):
I still get the timing printed for every call:
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
..... ..... .....
187.51 ms in do_parse(<Response [200]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
but I need to print the time only here:
    print("STOP SCAN PARSE Time SCAN <HERE NEED PRINT TIME>")
Thanks for helping me.
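Since the @print_durations() decorator times every call to do_parse, one way to print only the elapsed time up to the matching response is to record a start time before grequests.map() runs and print the difference inside do_parse when the marker string is found. This is only a sketch built on the question's code; start_time is a name introduced here, and time.perf_counter() is from the standard library:

import time

start_time = time.perf_counter()  # set once, just before grequests.map() is called

def do_parse(response, *args, **kwargs):
    if response.status_code == 200:
        if "<font color=#25ff00>" in response.text:
            elapsed = time.perf_counter() - start_time
            print(f"STOP SCAN PARSE, found after {elapsed:.2f} s: {response.request.url}")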

JSONDecodeError: Extra data: line 1 column 8 (char 7)

I've followed a tutorial to scrape a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my Python script:
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#To login
def make_login(session, base_url, credentials):
    #Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
        '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email':credentials['email'], 'pass':credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break

#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    #Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts

def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Function for profile URL and credentials for FB
def json_to_obj(filename):
    #Extracts data from JSON file and saves it on Python object
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    #Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj('credentials.json')
    profiles_urls = json_to_obj('profiles_urls.json')
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
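For context on the error itself: json.loads() raises "Extra data" when the text contains more than one top-level JSON value, so the traceback most likely comes from json_to_obj() reading a credentials or profile-URLs file that is not a single JSON object or array. A minimal sketch reproducing the message; the string contents are hypothetical, not taken from the question's files:

import json

good = '["https://mobile.facebook.com/someprofile"]'  # one top-level JSON value
bad = '"url1" "url2"'                                  # two top-level JSON values

print(json.loads(good))  # parses fine

try:
    json.loads(bad)
except json.JSONDecodeError as e:
    print(e)  # e.g. "Extra data: line 1 column 8 (char 7)"

If a file holds several JSON documents (for example one per line), either wrap them in a single array or parse them line by line with json.loads.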

What has changed on the site?

I want to know if there is a way to monitor a site for changes so that I don't only get the message "something changed" but also see what has changed.
My code right now is:
from urllib.request import Request, urlopen
import hashlib
import time

url = Request('https://stackoverflow.com',
              headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
print("running")
time.sleep(10)
while True:
    try:
        response = urlopen(url).read()
        currentHash = hashlib.sha224(response).hexdigest()
        time.sleep(30)
        response = urlopen(url).read()
        newHash = hashlib.sha224(response).hexdigest()
        if newHash == currentHash:
            continue
        else:
            print("something changed")
            response = urlopen(url).read()
            currentHash = hashlib.sha224(response).hexdigest()
            time.sleep(30)
            continue
    except Exception as e:
        print("error")

How can I prevent duplicates with try and except blocks?

I have a list of URLs that contain JSON files.
The JSON files are all stored differently, so I need try/except blocks to cover the different storage formats.
The problem is that this approach leads to duplicates, because some links get requested two or more times in the different blocks.
My code:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
Is it possible to write the try/except blocks in such a way that a link is skipped if it was successfully requested in a previous block?
The first two try/except blocks are literal duplicates, so there is no functional benefit in repeating them.
Instead, think of it as two consecutive phases:
extracting the remote resource
parsing the JSON string and storing the result
If the extraction phase fails, there is no point in moving forward; if the first kind of parsing fails, try the other kind:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
    except:
        continue
    try:
        try:
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        except:
            textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
This should solve it for you: the continue added after each successful append skips the remaining blocks for that URL.
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
        continue
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
        continue
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
