import requests
from bs4 import BeautifulSoup
from random import choice
def get_proxy():
url = "https://free-proxy-list.net/"
r = requests.get(url)
soup =BeautifulSoup(r.content , 'lxml')
return{'https': choice(list(map(lambda x:x[0]+':'+x[1],list(zip(map(lambda x:x.text, soup.findAll('td')[::8]),map(lambda x:x.text, soup.findAll('td')[1::8]))))))}
def proxy_request(request_type, url, **kwargs):
while 1:
try:
proxy = get_proxy()
print("using proxy: {}".format(proxy))
r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
num = soup.find_all('h1')
print (num.text())
break
except:
pass
return r
r = proxy_request('get',"https://en.wikipedia.org/wiki/Main_Page")
print (r.text())
and i am getting this error
The problem is your return is in your while loop. When you break, you skip return, therefore r is None:
def proxy_request(request_type, url, **kwargs):
while 1:
try:
proxy = get_proxy()
print("using proxy: {}".format(proxy))
r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
num = soup.find_all('h1')
print (num.text())
break
except:
pass
return num # return it here, outside of the while loop
You'll also want to consider the case where num might not be found and get a case to break out to avoid infinite loops
Related
I want my while loop to break when there an error, but it doesn't break / close program when it does...
from bs4 import BeautifulSoup
def check_listing_sell():
counter = 0
house_counter = 0
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
r = requests.get(url)
try:
soup = BeautifulSoup(r.text, "html.parser")
for item in soup.select("div.property-address"):
house_counter += 1
address_prospect = item.get_text(strip=True)
print(f"{address_prospect} {house_counter}")
counter += 12
except Exception as e:
print(e)
break
check_listing_sell()
For some reason, soup.select("div.property-address") returns an empty webelements (not an error) even on 'no results' page. Thus, the condition if len(soup.select("div.property-address")) == 0 should be added. Moreover, placing r = requests.get(url) inside the try block is a decent suggestion.
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
try:
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
if len(soup.select("div.property-address")) == 0:
break
for item in soup.select("div.property-address"):
house_counter += 1
address_prospect = item.get_text(strip=True)
print(f"{address_prospect} {house_counter}")
counter += 12
except Exception as e:
print(e)
break
Move the call to requests.get() inside the try.
KeyboardInterrupt is not a subtype of Exception, so you need a separate except block for that.
#from bs4 import BeautifulSoup
import requests
def check_listing_sell():
counter = 0
house_counter = 0
while True:
url = f"https://www.remax-quebec.com/fr/courtiers-immobiliers/james.he/index.rmx?offset={counter}#listing"
try:
print(url)
r = requests.get(url)
print(r.text[:30])
# soup = BeautifulSoup(r.text, "html.parser")
# for item in soup.select("div.property-address"):
# house_counter += 1
# address_prospect = item.get_text(strip=True)
# print(f"{address_prospect} {house_counter}")
counter += 12
except KeyboardInterrupt:
print("Manual interrupt")
break
except Exception as e:
print(f"Exception occurred for counter={counter}, stopping loop: {e}")
break
check_listing_sell()
I've never used this type of module or function. I wanted to know how I can time and print the time taken to find the "response 200" link?
##################### START MODUL PARSE #######################
def my_except_handler(request, exception):
request.url
def check_for_errors(response, *args, **kwargs):
try:
response
except response.exceptions.RequestException as e:
pass
except response.exceptions.HTTPError as e:
pass
except response.ReadTimeout as e:
pass
except response.ConnectionError as e:
pass
except response.ConnectTimeout:
pass
def do_parse(response, *args, **kwargs):
url = response.url
if response.status_code == 200:
response.request.url
url_parse = response.request.url
response.text
try:
if "<font color=#25ff00>" in response.text:
print("STOP SCAN PARSE Time SCAN HERE NEED PRINT TIME")
except Exception as e:
pass
def get_urls_file(site):
urls = []
config_file = "Configurations/package.json"
config = config_file = json.loads(open(f'{config_file}').read())
par_s = config['parse']
for x in par_s:
urls.append(f'{site}{x}')
def get_data_file(urls):
actions_list = []
for url in urls:
action_item = grequests.get(url, headers=headers, timeout=6, stream=True, allow_redirects=False, hooks={'response': [do_parse, check_for_errors]})
actions_list.append(action_item)
grequests.map(actions_list,size=30, exception_handler=my_except_handler)
get_data_file(urls)
Also, i have this example
from funcy import print_durations
#print_durations()
def myfunc(n=0):
for i in range(n):
pass
myfunc(123)
myfunc(123456789)
but if i try to put " #print_durations() " like here:
#print_durations()
def do_parse(response, *args, **kwargs):
i get still the print for all file:
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
..... ..... .....
187.51 ms in do_parse(<Response [200]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
but i need to print the time just here:
print(f"STOP SCAN PARSE Time SCAN ("HERE NEED PRINT TIME")
Thanks for help me.
I've followed a tutorial to scrape from a facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my python script:
def get_bs(session, url):
#Makes a GET requests using the given Session objectand returns a BeautifulSoup object.
r = None
while True:
r = session.get(url)
if r.ok:
break
return BeautifulSoup(r.text, 'lxml'
#To login
def make_login(session, base_url, credentials):
#Returns a Session object logged in with credentials.
login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
'%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
params = {'email':credentials['email'], 'pass':credentials['pass']}
while True:
time.sleep(3)
logged_request = session.post(base_url+login_form_url, data=params)
if logged_request.ok:
logging.info('[*] Logged in.')
break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
#Goes to profile URL, crawls it and extracts posts URLs.
profile_bs = get_bs(session, profile_url)
n_scraped_posts = 0
scraped_posts = list()
posts_id = None
while n_scraped_posts < post_limit:
try:
posts_id = 'recent'
posts = profile_bs.find('div', id=posts_id).div.div.contents
except Exception:
posts_id = 'structured_composer_async_container'
posts = profile_bs.find('div', id=posts_id).div.div.contents
posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
for post_url in posts_urls:
# print(post_url)
try:
post_data = scrape_post(session, base_url, post_url)
scraped_posts.append(post_data)
except Exception as e:
logging.info('Error: {}'.format(e))
n_scraped_posts += 1
if posts_completed(scraped_posts, post_limit):
break
show_more_posts_url = None
if not posts_completed(scraped_posts, post_limit):
show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
profile_bs = get_bs(session, base_url+show_more_posts_url)
time.sleep(3)
else:
break
return scraped_posts
def get_bs(session, url):
#Makes a GET requests using the given Session object and returns a BeautifulSoup object.
r = None
while True:
r = session.get(url)
time.sleep(3)
if r.ok:
break
return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
#Goes to post URL and extracts post data.
post_data = OrderedDict()
post_bs = get_bs(session, base_url+post_url)
time.sleep(5)
# Here we populate the OrderedDict object
post_data['url'] = post_url
#Find Post main element
try:
post_text_element = post_bs.find('div', id='u_0_0').div
string_groups = [p.strings for p in post_text_element.find_all('p')]
strings = [repr(string) for group in string_groups for string in group]
post_data['text'] = strings
except Exception:
post_data['text'] = []
#Extract post media URL
try:
post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
except Exception:
post_data['media_url'] = ''
#Extract remaining data
try:
post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
except Exception:
post_data['comments'] = []
return dict(post_data)
#Scraping FB
def scrape_post(session, base_url, post_url):
#Goes to post URL and extracts post data.
post_data = OrderedDict()
post_bs = get_bs(session, base_url+post_url)
time.sleep(5)
# Here we populate the OrderedDict object
post_data['url'] = post_url
#Find Post main element
try:
post_text_element = post_bs.find('div', id='u_0_0').div
string_groups = [p.strings for p in post_text_element.find_all('p')]
strings = [repr(string) for group in string_groups for string in group]
post_data['text'] = strings
except Exception:
post_data['text'] = []
#Extract post media URL
try:
post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
except Exception:
post_data['media_url'] = ''
#Extract remaining data
try:
post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
except Exception:
post_data['comments'] = []
return dict(post_data)
#Function for profile URL and creditials for FB
def json_to_obj(filename):
#Extracts data from JSON file and saves it on Python object
obj = None
with open(filename) as json_file:
obj = json.loads(json_file.read())
return obj
def save_data(data):
#Converts data to JSON.
with open('profile_posts_data.json', 'w') as json_file:
json.dump(data, json_file, indent=4)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj('credentials.json')
profiles_urls = json_to_obj('profiles_urls.json')
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
I wrote a piece of code to rotate proxies for a multithreaded crawler, but it doesn't look very good and I want to see what I can improve.
What I had in mind:
1) Make a number of requests (a random range) with a proxy, then change it
2) If blocked, change the proxy (remove it from proxies list) and retry.
3) If a HTTP error occurs, retry with same proxy
4) If a proxy error occurs, change the proxy (remove it from proxies list), and retry.
Usually, it works pretty decent, though I see some problems which may appear:
1) make_request function is calling itsel which may lead in some cases to an infinte loop
2) proxy errors are not handled properly
Here is my code:
import requests
import threading
import random
import time
import logging
import os
class Crawler():
def __init__(self):
self.user_agents = []
with open('user_agents.txt', 'r') as inpt:
for line in inpt:
if line.strip():
self.user_agents.append(line.strip())
self.proxies = []
with open('proxies.txt', 'r') as inpt:
for line in inpt:
if not line.strip():
continue
self.proxies.append({"http": ''.join(["http://",
line.strip()]),
"https": ''.join(["https://",
line.strip()])})
self.headers = {'User-agent': random.choice(self.user_agents)}
self.session = requests.Session()
self.counter = 0
self.current_proxy = None
self.lock = threading.Lock()
self.set_proxy()
def make_request(self, method, url, **kwargs):
"""Request a page and return its content
#method - string, POST or GET
#url - string
#return: string, HTML page source
or bytes for binary files
"""
# make only 10 to 20 requests using a proxy
with self.lock:
if self.counter > random.randrange(10, 20):
self.set_proxy()
else:
self.counter += 1
try:
if method == 'GET':
if kwargs.get('download', False):
req = self.session.get(url,
headers=self.headers,
stream=True, verify=False)
return req.raw
req = self.session.get(url,
headers=self.headers,
verify=False,
**kwargs)
else:
req = self.session.post(url,
headers=self.headers,
verify=False,
**kwargs)
if req.status_code == 407:
logging.exception('make_request[Proxy Authentication]')
os._exit(1)
if req.encoding not in ['utf8', 'utf-8', None]:
html = req.content.decode(req.encoding)
else:
html = req.text
if 'Access Denied' in html:
# website's error message. proxy blocked
with self.lock:
self.set_proxy(remove=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
else:
return html
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403:
# access forbidden. proxy blocked
with self.lock:
self.set_proxy(remove_proxy=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
elif e.response.status_code == 404:
logging.exception(' '.join([
'make_request[HTTPError]',
url, str(e)]))
return
elif e.response.status_code == 429:
# too many requests. proxy blocked
with self.lock:
self.set_proxy(remove_proxy=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
else:
logging.exception(' '.join([
'make_request[unknown HTTPError]',
url, str(e)]))
return None
except requests.exceptions.InvalidURL as e:
logging.exception(' '.join([
'make_request[InvalidURL]',
url, str(e)]))
return None
except requests.exceptions.Timeout:
time.sleep(1)
return self.make_request(method, url, **kwargs)
except requests.exceptions.ConnectionError as e:
# Connection refused
if '403 Forbidden' in str(e):
logging.exception('make_requests[403 forbidden]')
os._exit(1)
with self.lock:
self.set_proxy()
time.sleep(1)
return self.make_request(method, url, **kwargs)
except Exception as e:
logging.exception(' '.join([
'make_request[unknown Exception]',
url, str(e)]))
return None
def set_proxy(self, remove_proxy=False):
"""Get a random proxy from the list"""
if remove_proxy:
try:
self.proxies.remove(self.current_proxy)
except:
pass
while True:
if self.proxies:
proxy = random.choice(self.proxies)
if not self.is_alive(proxy):
continue
self.current_proxy = proxy
self.session = requests.Session()
self.session.proxies = self.current_proxy
self.headers = {'User-agent': random.choice(self.user_agents)}
self.counter = 0
break
else:
logging.exception('EMPTY PROXY LIST')
os._exit(1)
break
def is_alive(self, proxy):
"""Check if a proxy is alive or not
#proxy - dict
#return: True if alive, False otherwise
"""
try:
requests.get('http://www.google.com',
proxies=proxy, timeout=5)
return True
except:
return False
Thanks
I understand that this is a duplicate, but I havent had that "ah-ha" moment where I understand HOW to access the a classes variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of "results". I've used self.results within def __init__(self, url_list, threads) and I cant seem to pull that variable when I try print(example.results.
If self.results is returning a value, but example.results isn't pulling it from if __name__ == '__main__':, how can you access that? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *
site = 0
class ConcurrentListCrawler(object):
def __init__(self, url_list, threads):
self.urls = url_list
self.results = {}
self.max_threads = threads
def __make_request(self, url):
try:
r = requests.get(url=url, timeout=20)
r.raise_for_status()
print(countit(), r.url)
except requests.exceptions.Timeout:
r = requests.get(url=url, timeout=60)
except requests.exceptions.ConnectionError:
r = requests.get(url=url, timeout=60)
except requests.exceptions.RequestException as e:
raise e
return r.url, r.text
def __parse_results(self, url, html):
try:
print(url)
trip_data = restaurant_parse(url)
except Exception as e:
raise e
if trip_data:
print('here we go')
self.results = trip_data
#print(self.results)
return self.results
def wrapper(self, url):
url, html = self.__make_request(url)
self.__parse_results(url, html)
def run_script(self):
with ThreadPoolExecutor(max_workers=min(len(self.urls),self.max_threads)) as Executor:
jobs = [Executor.submit(self.wrapper, u) for u in self.urls]
if __name__ == '__main__':
listo = loadit()
print(listo)
print(len(listo))
example = ConcurrentListCrawler(listo, 10)
example.run_script()
print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
url, html = self.__make_request(url)
return self.__parse_results(url, html)
After this, I suggest you utilize the self.results as a dictionary, like it was declared.
In the method "__parse_results(..)", append trip_data to self.results as follows, instead of assigning.
def __parse_results(self, url, html):
try:
print(url)
trip_data = restaurant_parse(url)
except Exception as e:
raise e
if trip_data:
print('here we go')
self.results[url] = trip_data
#print(self.results)
return self.results
When you append to self.results, it would retain the older values and you may avoid replacing by reassignment.
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class because print(example.results) because that part of the code isnt access until all jobs are complete. With that I was able to resolve by getting rid of the class (even though the title of this posting indicates that this is the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *
site = 0
def load_url(url):
try:
print(countit(), url)
trip_data = restaurant_parse(url)
return trip_data
except Exception as e:
raise e
if __name__ == '__main__':
URLs = loadit()
#print(URLs)
#print(len(URLs))
with ThreadPoolExecutor(max_workers=10) as executor:
# start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url): url for url in URLs}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
print('this is data', data)
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
Here, I can pull the dictionary by grabbing data.
Thanks for the help, everyone.