I have a list of URLs that point to JSON files.
The JSON files are all stored differently, so I need try/except blocks to cover the different storage formats.
The problem is that this approach leads to duplicates, because some links get requested two or more times across the different blocks.
My code:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
Is it possible to write the try/except blocks so that a link is ignored if it was already successfully requested in a previous block?
The first two try/except blocks are exact duplicates, so there is no functional benefit to repeating them.
Instead, think of it as two consecutive phases:
extracting the remote resource
parsing the JSON string and storing the result
So if the extraction phase fails, there is no sense in moving forward; if the first parsing attempt fails, try the other kind of parsing:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
    except:
        continue
    try:
        try:
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        except:
            textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
    except:
        pass
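A small refinement on top of the answer above (my sketch, not part of the original answer): catching specific exceptions instead of bare except avoids silently swallowing unrelated errors. This assumes the only expected failures are network errors, JSON that does not parse, and a missing __IR_CURRPAGE_DATA_JSON__ marker:

for line in urls:
    try:
        textinhalt = requests.get(line).text
    except requests.RequestException:
        continue  # network/HTTP failure: skip this URL
    try:
        # first attempt: take everything between the outermost braces
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
    except json.JSONDecodeError:
        try:
            # fallback: payload embedded after the __IR_CURRPAGE_DATA_JSON__ marker
            textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
            daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        except (IndexError, json.JSONDecodeError):
            continue  # neither format matched: skip this URL
    r_urls.append(daten)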
This should solve it for you:
for line in urls:
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten)
        continue
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        #textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
        continue
    except:
        pass
    try:
        response = requests.get(line)
        textinhalt = response.text
        textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
        daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
        r_urls.append(daten0)
    except:
        pass
Related
I want my while loop to break when there's an error, but it doesn't break / close the program when it does...
import requests
from bs4 import BeautifulSoup

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        r = requests.get(url)
        try:
            soup = BeautifulSoup(r.text, "html.parser")
            for item in soup.select("div.property-address"):
                house_counter += 1
                address_prospect = item.get_text(strip=True)
                print(f"{address_prospect} {house_counter}")
            counter += 12
        except Exception as e:
            print(e)
            break

check_listing_sell()
For some reason, soup.select("div.property-address") returns an empty result (not an error) even on the 'no results' page. Thus, the condition if len(soup.select("div.property-address")) == 0 should be added. Moreover, placing r = requests.get(url) inside the try block is a decent suggestion.
while True:
    url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        if len(soup.select("div.property-address")) == 0:
            break
        for item in soup.select("div.property-address"):
            house_counter += 1
            address_prospect = item.get_text(strip=True)
            print(f"{address_prospect} {house_counter}")
        counter += 12
    except Exception as e:
        print(e)
        break
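One further hedge worth adding (my suggestion, not part of the original answer): requests does not raise an exception for HTTP error status codes on its own, so calling r.raise_for_status() right after the request routes 4xx/5xx responses into the except branch as well:

r = requests.get(url)
r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
soup = BeautifulSoup(r.text, "html.parser")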
Move the call to requests.get() inside the try.
KeyboardInterrupt is not a subtype of Exception, so you need a separate except block for that.
#from bs4 import BeautifulSoup
import requests

def check_listing_sell():
    counter = 0
    house_counter = 0
    while True:
        url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
        try:
            print(url)
            r = requests.get(url)
            print(r.text[:30])
            # soup = BeautifulSoup(r.text, "html.parser")
            # for item in soup.select("div.property-address"):
            #     house_counter += 1
            #     address_prospect = item.get_text(strip=True)
            #     print(f"{address_prospect} {house_counter}")
            counter += 12
        except KeyboardInterrupt:
            print("Manual interrupt")
            break
        except Exception as e:
            print(f"Exception occurred for counter={counter}, stopping loop: {e}")
            break

check_listing_sell()
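A quick way to verify the class-hierarchy point above (plain Python, independent of this script): KeyboardInterrupt derives from BaseException rather than Exception, which is why except Exception alone never catches Ctrl+C.

print(issubclass(KeyboardInterrupt, Exception))      # False
print(issubclass(KeyboardInterrupt, BaseException))  # True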
I've followed a tutorial to scrape a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my Python script:
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#To login
def make_login(session, base_url, credentials):
    #Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
        '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email':credentials['email'], 'pass':credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    #Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Function for profile URL and credentials for FB
def json_to_obj(filename):
    #Extracts data from a JSON file and saves it in a Python object
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    #Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj('credentials.json')
    profiles_urls = json_to_obj('profiles_urls.json')
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
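For context (a minimal sketch of my own, not taken from the tutorial): json.loads raises "Extra data" when a valid JSON value is followed by additional text, which typically means the file or response contains more than one JSON value. If the offending string is available, json.JSONDecoder().raw_decode parses just the first value and reports where it ends:

import json

raw = '{"a": 1} trailing text'      # hypothetical input containing extra data
try:
    json.loads(raw)
except json.JSONDecodeError as e:
    print(e)                        # Extra data: line 1 column 10 (char 9)

obj, end = json.JSONDecoder().raw_decode(raw)
print(obj, end)                     # {'a': 1} 8 -> the first value ends at index 8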
I want to know if there is a way to monitor a site for changes so that I don't only get the message "something changed", but also see what has changed.
My code right now is:
url = Request('https://stackoverflow.com',
              headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(url).read()
currentHash = hashlib.sha224(response).hexdigest()
print("running")
time.sleep(10)
while True:
    try:
        response = urlopen(url).read()
        currentHash = hashlib.sha224(response).hexdigest()
        time.sleep(30)
        response = urlopen(url).read()
        newHash = hashlib.sha224(response).hexdigest()
        if newHash == currentHash:
            continue
        else:
            print("something changed")
            response = urlopen(url).read()
            currentHash = hashlib.sha224(response).hexdigest()
            time.sleep(30)
            continue
    except Exception as e:
        print("error")
I have a function that makes a request and gets a page by URL:
def openUrl(similar_url):
    print("Open URL: " + similar_url)
    try:
        req = urllib.request.Request(similar_url)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("HTTP Response: " + str(e.code))
I call this function from another:
def get(url):
    content = openUrl(url);
    try:
        soup = BeautifulSoup(content, "html.parser")
    except:
        pass

for url in urls:
    get(url)
The problem is that if I get an exception in openUrl, I then get an error in soup = BeautifulSoup(content, "html.parser"), because I try to parse empty content.
How can I skip this iteration when there is an error?
Return *WHAT YOU WANT* inside the exception block:
try:
    req = urllib.request.Request(similar_url)
    return urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
    print("HTTP Response: " + str(e.code))
    return None  # or return ""
and you can call it like this:
def get(url):
    try:
        content = openUrl(url)  # remove semicolon
    except:
        pass
    else:
        if content is not None:
            soup = BeautifulSoup(content, "html.parser")
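An equivalent, slightly flatter variant (my sketch, assuming openUrl returns None on failure as shown above) skips the URL with an early return instead of the try/except/else:

def get(url):
    content = openUrl(url)
    if content is None:  # openUrl failed, skip this URL
        return
    soup = BeautifulSoup(content, "html.parser")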
I have a rate stream where I need to store and compare the last two lines. For instance, if the new price is higher than the previous one, queue an event. It's my understanding that iter_lines() only yields the latest line. My question is: how could I store the last line, wait for a new line, compare the two, and then queue the event? I know this is simple, but I'm still having trouble, thanks for your help!
Here is my UPDATED(3) stream:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    oldLine = ''
    for line in response.iter_lines(1):
        if line < oldLine:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
                tev = TickEvent(instrument, time, bid, ask)
                self.events_queue.put(tev)
                oldLine = line
The original function:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
The repaired function:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    last_msg = None  # new line
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
                if last_msg is None:  # new line
                    last_msg = msg  # new line
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            # can now compare last msg with current msg
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
            last_msg = msg  # new line (may want to indent 4 more spaces)
It may make sense to move the if last_msg is None check inside the if msg.has_key block if you want last_msg to contain certain information.
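To sketch the actual comparison the question asks for (a higher price than the previous tick queues an event), using the same msg["tick"] fields and the TickEvent/events_queue names from the code above, the check could sit where the tick is unpacked. This is an illustration of the idea, not the original author's code:

if msg.has_key("instrument") or msg.has_key("tick"):
    bid = msg["tick"]["bid"]
    if last_msg is not None and bid > last_msg["tick"]["bid"]:
        # the new bid is higher than the previous one: queue the event
        tev = TickEvent(msg["tick"]["instrument"], msg["tick"]["time"],
                        bid, msg["tick"]["ask"])
        self.events_queue.put(tev)
    last_msg = msg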