Multiprocessing in python/beautifulsoup issues - python

Hi guys, I'm fairly new to Python. What I'm trying to do is move my old code to multiprocessing; however, I'm facing some errors that I hope someone can help me with. My code checks a few thousand links, given in a text file, for certain tags. Once a tag is found, it outputs the link to me. Because I have a few thousand links to check, speed is an issue, hence the need to move to multiprocessing.
Update: I'm getting HTTP 503 errors back. Am I sending too many requests, or am I missing something?
Multiprocessing code:
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
no_stock = []
# Worker called by pool.map with one item from url.txt (the parameter is named
# `lines`, but each call receives a single line/URL). Opens it with the shared
# module-level browser `br`, parses the HTML, and records the item in
# `no_stock` when an "empty_result" div or a red <strong> element is found.
# NOTE(review): indentation was lost in this paste, so the nesting described
# in these comments is inferred -- confirm against the real file.
def main(lines):
done = False
# Only one attempt is allowed: any failure drops `tries` to 0 and ends the loop.
tries = 1
while tries and not done:
try:
r = br.open(lines, timeout=15)
r = r.read()
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except socket.timeout:
print('Failed socket retrying')
tries -= 1 # to exit when tries == 0
except Exception as e:
# Python 2 print statements: report the exception class and its message.
print '%s: %s' % (e.__class__.__name__, e)
print sys.exc_info()[0]
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}\n'.format(lines))
# NOTE(review): `soup` is only bound when the try block succeeds. After a
# failed request execution presumably still reaches the next line, which would
# raise the UnboundLocalError shown in the posted traceback -- confirm, and
# return right after the failure message.
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(lines)
# Script entry point: checks every line of url.txt through a thread pool, then
# writes the collected out-of-stock items to no_stock.txt.
if __name__ == "__main__":
r = br.open('http://www.randomweb.com/') #avoid redirection
fileName = "url.txt"
# Pool is imported from multiprocessing.dummy, so these are 2 threads.
# NOTE(review): every worker uses the single module-level `br` -- confirm
# that mechanize.Browser is safe to share between threads.
pool = Pool(processes=2)
with open(fileName, "r+") as f:
# Iterating the file object hands each raw line to main(), trailing newline
# included. main() has no return statement, so `lines` is a list of None.
lines = pool.map(main, f)
with open('no_stock.txt','w') as f :
f.write('No. of out of stock items : '+str(len(no_stock))+'\n'+'\n')
for i in no_stock:
f.write(i + '\n')
Traceback:
Traceback (most recent call last):
File "test2.py", line 43, in <module>
lines = pool.map(main, f)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
raise self._value
UnboundLocalError: local variable 'soup' referenced before assignment
my txt file is something like this:-
http://www.randomweb.com/item.htm?uuid=44733096229
http://www.randomweb.com/item.htm?uuid=4473309622789
http://www.randomweb.com/item.htm?uuid=447330962291
....etc

from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
no_stock = []
def main(line):
    """Check one URL and record it in ``no_stock`` if it looks out of stock.

    Called by ``pool.map`` once per line of the input file.

    line -- one raw line from the URL file; surrounding whitespace (including
            the trailing newline) is stripped before use. Blank lines are
            ignored.

    Returns None. The URL is appended to the module-level ``no_stock`` list
    when the page contains a ``div.empty_result`` or a red ``<strong>``.
    """
    url = line.strip()
    if not url:
        # Nothing to fetch (blank line in the input file).
        return None

    done = False
    soup = None
    tries = 3
    while tries and not done:
        try:
            r = br.open(url, timeout=15)
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            done = True  # exit the loop
        except socket.timeout:
            print('Failed socket retrying')
            tries -= 1  # to exit when tries == 0
        except Exception:
            # Retry boundary: report whatever went wrong and try again.
            print('Random fail retrying')
            print(sys.exc_info()[0])
            tries -= 1  # to exit when tries == 0

    if not done:
        # Every attempt failed, so there is no parsed page to inspect.
        print('Failed for {}\n'.format(url))
        return None

    table = soup.find_all('div', {'class': "empty_result"})
    results = soup.find_all('strong', style='color: red;')
    if table or results:
        no_stock.append(url)
    return None
if __name__ == "__main__":
    # Entry point: fan the URLs in url.txt out to a thread pool, then write
    # the out-of-stock report.
    fileName = "url.txt"
    pool = Pool(cpu_count() * 2)  # Creates a Pool with cpu_count * 2 threads.
    with open(fileName, "r") as f:
        # Strip the trailing newline from every line and drop blank lines so
        # each worker receives a clean URL.
        urls = [raw.strip() for raw in f if raw.strip()]
    try:
        lines = pool.map(main, urls)
    finally:
        # Release the worker threads even if a worker raised.
        pool.close()
        pool.join()
    with open('no_stock.txt', 'w') as f:
        f.write('No. of out of stock items : ' + str(len(no_stock)) + '\n' + '\n')
        for i in no_stock:
            f.write(i + '\n')
pool.map takes two parameters: the first is a function (in your code, main), and the other is an iterable. Each item of the iterable is passed as the argument to the function (in your code, each line of the file).

Related

multiprocessing ThreadPool stops at the end

I wrote a script that "parses" all domains from the file. After the launch, everything works as it should. But when there are several domains left at the end, it gets stuck. Sometimes it takes a long time to parse the last couple of domains. I can't figure out what the problem is. Who has faced such a situation? Tell me how to cure it.
After the launch, everything works out very quickly (as it should) until the end. At the end, it stops when there are several domains left. There is no difference, 1000 domains or 10 000 domains.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
pool = 100
with open("Rules.json") as file:
REGEX = json.loads(file.read())
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}
# Fetches http://<domain> and tests the response body against every rule set
# loaded from Rules.json into REGEX (company -> rule type -> regex patterns).
# A matching rule is printed and appended to /tmp/__FOUND_DOMAINS__.txt.
# Always returns the domain that was passed in, whether or not it matched.
# NOTE(review): indentation was lost in this paste; the exact nesting of the
# match check relative to the pattern loop cannot be told from here.
def Domain_checker(domain):
try:
# NOTE(review): no `timeout` is passed, so a server that never answers can
# presumably block this worker indefinitely -- this fits the stall described
# in the post.
r = requests.get("http://" + domain, verify=False, headers=ua)
r.encoding = "utf-8"
for company in REGEX.keys():
# `type` shadows the builtin of the same name inside this function.
for type in REGEX[company]:
# Counts how many of this rule's patterns were found in the page.
check_entry = 0
for ph_regex in REGEX[company][type]:
if bool(re.search(ph_regex, r.text)) is True:
check_entry += 1
# The rule counts as a hit only when every one of its patterns matched.
if check_entry == len(REGEX[company][type]):
title = BeautifulSoup(r.text, "lxml")
Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
print(Found_domain)
with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors = 'ignore') as file:
file.write(Found_domain)
# Each of the failures below is silently ignored; the domain is still returned.
except requests.exceptions.ConnectionError:
pass
except requests.exceptions.TooManyRedirects:
pass
except requests.exceptions.InvalidSchema:
pass
except requests.exceptions.InvalidURL:
pass
except UnicodeError:
pass
except requests.exceptions.ChunkedEncodingError:
pass
except requests.exceptions.ContentDecodingError:
pass
# Presumably covers pages without a <title>, where title.title is None.
except AttributeError:
pass
except ValueError:
pass
return domain
# Script entry point: reads whitespace-separated domains from the file named
# in sys.argv[1], checks them concurrently, then prints the accumulated hits.
if __name__ == '__main__':
with open(sys.argv[1], "r", encoding='utf-8', errors = 'ignore') as file:
Domains = file.read().split()
# Number of worker threads handed to ThreadPool below.
pool = 100
print("Pool = ", pool)
# imap_unordered yields each result as soon as its check finishes, so the
# printed order does not have to follow the input order.
results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)
# Running counter printed next to every finished domain.
string_num = 0
for result in results:
print("{0} => {1}".format(string_num, result))
string_num += 1
# NOTE(review): the file is only created when a rule matched (it is opened in
# append mode in Domain_checker), so this open presumably fails if nothing was
# ever found -- confirm.
with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors = 'ignore') as found_domains:
found_domains = found_domains.read()
print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem was resolved after setting a timeout.
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)

Python threadings: threads block

I'm checking my customers' countries so that I know which services I can offer, and so on.
The problem is that the threads block: for example, the script checks 15-20 emails and then blocks. I want a solution that keeps it running.
the code is:
import requests
import re
from sys import argv
from Queue import Queue
from threading import Thread
e = argv[1]
emails = open(e, 'r').readlines()
emails = map(lambda s: s.strip(), emails)
valid=[]
# Requests the PayPal xclick page for `email` and searches the response text
# for the s.eVar36 value. On a non-empty match it prints the pair, appends the
# email to "<value>.txt" and records the email in the global `valid` list.
def base(email):
# No timeout is passed to requests.get, and the call sits outside the try
# below, so a request error propagates to the caller.
xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}).text
x = re.search("s.eVar36=\"(.*?)\";", xo)
try:
if x.group(1) != "":
print "%s === %s" % (email,x.group(1))
# NOTE(review): this file handle is never explicitly closed.
w=open(str(x.group(1))+".txt", 'a')
w.write(email+"\n")
valid.append(email)
# Bare except: any error, including `x` being None when nothing matched, is
# silently dropped.
except:
pass
# Takes one email from the queue `q`, processes it with base() and marks it
# done. There is no loop: each call handles exactly one queue item.
def work():
email=q.get()
base(email)
q.task_done()
# Start the worker threads, then feed every email into the queue.
THREADS = 25
q=Queue()
for i in range(THREADS):
# NOTE(review): `work()` is called right here, in the main thread, and its
# return value (None) becomes the thread target; pass `work` itself to run it
# in the thread. The queue is still empty at this point, so the call
# presumably blocks in q.get() -- confirm.
t=Thread(target=work())
t.daemon=True
t.start()
# argv always contains at least the script name, so this condition holds.
if (len(argv)>0):
for email in emails:
q.put(email)
q.join()
thanks in advance
Your problem is that you call work() instead of passing the work function when creating your threads. Instead of patching your code, consider moving to Python's ThreadPool, which does the heavy lifting for you. Here's an example that implements what you want.
map calls your worker for each email in the iterator and returns the worker's results as an iterator (Python 3) or a list (Python 2). Your worker returns a valid email or None for each email it is given, so you just have to filter out the Nones at the end.
import requests
import re
from sys import argv
import multiprocessing.pool
e = argv[1]
emails = [line.strip() for line in open(e)]
def base(email, timeout=15):
    """Look up ``email`` on the PayPal xclick page and file it by s.eVar36.

    email   -- address to check; an empty value is skipped without a request.
    timeout -- seconds to wait for the HTTP response, so that one stalled
               request cannot block a pool thread forever.

    Returns the email when a non-empty s.eVar36 value was found (after
    appending the email to "<value>.txt"), otherwise None. Request and file
    errors are printed and reported as None instead of being raised, so one
    bad address does not abort the whole pool.map run.
    """
    if not email:
        return None
    print("getting email {}".format(email))
    try:
        xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}, timeout=timeout).text
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    x = re.search("s.eVar36=\"(.*?)\";", xo)
    # re.search returns None when the marker is absent; test for that
    # explicitly instead of relying on a bare except to hide the error.
    if x is None or x.group(1) == "":
        return None

    print("%s === %s" % (email, x.group(1)))
    try:
        with open(str(x.group(1)) + ".txt", 'a') as w:
            w.write(email + "\n")
    except (IOError, OSError) as err:
        # Keep the best-effort behaviour: report the failure, skip this email.
        print(err)
        return None
    return email
THREADS = 25
pool = multiprocessing.pool.ThreadPool(THREADS)
valid = [email for email in pool.map(base, emails, chunksize=1) if email]
print(valid)
pool.close()

Python Multi-threaded App does not terminate

This my code which basically just takes a list of 94,000+ URLs, and collects the http_status codes for them:
#!/usr/bin/python3
import threading
from queue import Queue
import urllib.request
import urllib.parse
from http.client import HTTPConnection
import socket
import http.client
#import httplib
url_input = open("urls_prod_sort.txt", "r").read()
urls = url_input[:url_input.rfind('\n')].split('\n')
#urls = urls[:100]
url_502 = []
url_logs = []
url_502_lock = threading.Lock()
print_lock = threading.Lock()
# Requests http://<url_u> through the configured HTTP proxy and returns the
# resulting status code. Handled failures are mapped to custom codes 700-703.
# Every call appends "<url>,<code>" to url_all_logs.csv; URLs answering 502 are
# also appended to url_502_file.txt.
#   url_u       -- host/path without the "http://" prefix
#   http_method -- HTTP verb reported by the request object (default 'GET')
#   data        -- optional request body passed to urllib.request.Request
def sendRequest(url_u, http_method = 'GET', data = None):
use_proxy = "http://xxxxxxxx:8080"
proxies = {"http": use_proxy}
proxy = urllib.request.ProxyHandler(proxies)
handler = urllib.request.HTTPHandler()
url = "http://" + url_u
with print_lock:
print(url)
# The opener is rebuilt and installed again on every call.
opener = urllib.request.build_opener(proxy,handler)
urllib.request.install_opener(opener)
request = urllib.request.Request(url,data)
request.add_header("User-agent","| MSIE |")
# Override the method lookup so the request uses `http_method`.
request.get_method = lambda: http_method
# NOTE(review): urlopen is called without a timeout, and any exception type
# not listed below propagates to the caller.
try:
response = urllib.request.urlopen(request)
response_code = response.code
except urllib.error.HTTPError as error:
response_code = error.code
except urllib.error.URLError as e2:
response_code = 701
except socket.timeout as e3:
response_code = 702
except socket.error as e4:
response_code = 703
except http.client.IncompleteRead as e:
response_code = 700
if response_code == 502:
with url_502_lock:
#url_502.append(url)
url_502_file = open("url_502_file.txt", "a")
url_502_file.write(url + "\n")
url_502_file.close()
# print_lock is reused here to serialise appends to the CSV log.
with print_lock:
#url_logs.append(url + "," + str(response_code))
url_all_logs_file = open("url_all_logs.csv", "a")
url_all_logs_file.write(url + "," + str(response_code) + '\n')
url_all_logs_file.close()
#print (url + "," + str(response_code))
#print (response_code)
return response_code
# Thread body: pulls URLs from the queue `q` and passes each one to
# sendRequest until the sentinel string ":::::" is received.
def worker():
while True:
url = q.get()
# NOTE(review): the `if` line below has no trailing colon as posted, which is
# a syntax error.
if url == ":::::"
# NOTE(review): breaking here appears to skip q.task_done() for the sentinel
# item, and q.join() waits until every queued item has been marked done --
# confirm against the real indentation.
break
else:
sendRequest(url)
q.task_done()
#======================================
q = Queue()
# Start 1000 daemon threads, each running worker().
for threads in range(1000):
t = threading.Thread(target = worker)
t.daemon = True
t.start()
for url in urls:
q.put(url)
# Only one sentinel is queued, so only one worker can ever receive it.
q.put(":::::")
# Blocks until task_done() has been called once for every queued item.
q.join()
However, the program never seems to terminate (even though the URLs have all been iterated through), which forces me to Ctrl-C the program - and then I get the following error:
Traceback (most recent call last):
File "./url_sc_checker.py", line 120, in <module>
q.join()
File "/usr/lib/python3.2/queue.py", line 82, in join
self.all_tasks_done.wait()
File "/usr/lib/python3.2/threading.py", line 235, in wait
waiter.acquire()
KeyboardInterrupt
The reason that your program doesn't terminate is simple, your worker creates an infinite loop:
def worker():
while True:
...
You need to either throw an exception, break, or have a terminating condition in your while statement. Otherwise your program would remain trying to get the next job from the queue, without knowing that there will never be the next job.
A common way to do this is to put a sentinel value in your queue, when checking out a job from the queue, the worker checks if it is the sentinel value and breaks out the loop.
Another way is to have a global condition variable that you check in the while condition. When the job producer has pushed all items to the queue, the job producer joins the queue, and when all jobs are done, the job producer unblocks and terminates the threads or processes.
Another possible reason why your process doesn't terminate is if your sendRequest produces an unexpected exception, then the thread terminates and you'll be left with some jobs that are never marked as done.

Reload Webpage when timeout Mechanize

Hi guys, my code checks a number of links I've given it to find certain tags in each webpage. Once a tag is found, it gives me back the link. However, sometimes mechanize gets stuck forever trying to open/read the page unless I set a timeout. Is there any way to reload/retry the webpage upon timeout?
import mechanize
from mechanize import Browser
from bs4 import BeautifulSoup
import urllib2
import time
import os
from tqdm import tqdm
import socket
# Flat script: reads the URLs from url.txt, opens each with mechanize, and
# collects into `no_stock` the URLs whose page has an "empty_result" div or a
# red <strong> element.
br = Browser()
with open("url.txt", 'r+') as f:
lines = f.read().splitlines()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
no_stock = []
for i in tqdm(lines):
# The page is opened and read here, before the try block below, so an
# exception raised by open()/read() (such as the socket.timeout in the posted
# traceback) is not caught by the retry loop.
r = br.open(i, timeout=200)
r = r.read()
done = False
tries = 3
# The retry loop only repeats the parse of the already-downloaded `r`.
while tries and not done:
try:
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except:
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}'.format(i))
continue # skip this and continue with the next
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(i)
Updated error:
File "/usr/local/lib/python2.7/dist-packages/mechanize/_response.py", line 190, in read
self.__cache.write(self.wrapped.read())
File "/usr/lib/python2.7/socket.py", line 355, in read
data = self._sock.recv(rbufsize)
File "/usr/lib/python2.7/httplib.py", line 587, in read
return self._read_chunked(amt)
File "/usr/lib/python2.7/httplib.py", line 656, in _read_chunked
value.append(self._safe_read(chunk_left))
File "/usr/lib/python2.7/httplib.py", line 702, in _safe_read
chunk = self.fp.read(min(amt, MAXAMOUNT))
File "/usr/lib/python2.7/socket.py", line 384, in read
data = self._sock.recv(left)
socket.timeout: timed out
Any help is appreciated!
catch the socket.timeout exception and retry there:
try:
    # first try: the timeout is raised while the page is opened/read, so the
    # request itself has to sit inside the try block
    r = br.open(i, timeout=200).read()
except socket.timeout:
    # try a second time
    r = br.open(i, timeout=200).read()
soup = BeautifulSoup(r, 'html.parser')
You can even try many times, and if a line fails, continue with the next:
for i in tqdm(lines):
    done = False
    tries = 3
    while tries and not done:
        try:
            # Open and read inside the try: this is where the timeout is
            # raised, so a retry has to repeat the request, not just the parse.
            r = br.open(i, timeout=200)
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            done = True  # exit the loop
        except Exception as err:  # retry boundary: report, then try again
            print('Retrying {}: {}'.format(i, err))
            tries -= 1  # to exit when tries == 0
    if not done:
        print('Failed for {}'.format(i))
        continue  # skip this and continue with the next
    table = soup.find_all('div', {'class': "empty_result"})
    results = soup.find_all('strong', style='color: red;')
    if table or results:
        no_stock.append(i)

Multi-threaded Python Web Crawler Got Stuck

I'm writing a Python web crawler and I want to make it multi-threaded. Now I have finished the basic part, below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks if the links exist in a pool (a set), and puts the new links to the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually, not exiting properly. I have gone through the official document of Python but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8
import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup
#custom modules and files
from setting import config
# Fetches `url` on construction and stores the outcome:
#   status  -- the status code, the first entry of the redirect history, or
#              the raised RequestException
#   rawdata -- the requests response object ("" when the request failed)
#   error   -- True when the request raised
class Page:
def __init__(self, url):
self.url = url
self.status = ""
self.rawdata = ""
self.error = False
r = ""
try:
# NOTE(review): no timeout is passed, so a stalled server can presumably
# block the calling thread indefinitely -- confirm.
r = requests.get(self.url, headers={'User-Agent': 'random spider'})
except requests.exceptions.RequestException as e:
self.status = e
self.error = True
else:
# With redirects, report the first hop rather than the final response.
if not r.history:
self.status = r.status_code
else:
self.status = r.history[0]
self.rawdata = r
# Collects the page's <a href> links into self.outlinks as dicts with the keys
# "url" (absolute), "anchortext" and "follow" ("yes"/"no").
def outlinks(self):
# Assigning to self.outlinks replaces this method with a list on the
# instance, so it can only be called once per Page.
self.outlinks = []
#links, contains URL, anchor text, nofollow
# The whole document is lower-cased before parsing, extracted URLs included.
raw = self.rawdata.text.lower()
soup = BeautifulSoup(raw)
outlinks = soup.find_all('a', href=True)
for link in outlinks:
d = {"follow":"yes"}
d['url'] = urlparse.urljoin(self.url, link.get('href'))
d['anchortext'] = link.text
if link.get('rel'):
if "nofollow" in link.get('rel'):
d["follow"] = "no"
# Skip exact duplicates (same url, anchor text and follow flag).
if d not in self.outlinks:
self.outlinks.append(d)
pool = Queue()
exist = set()
thread_num = 10
lock = threading.Lock()
output = open("final.csv", "a")
#the domain is the start point
domain = config["domain"]
pool.put(domain)
exist.add(domain)
# Thread body: repeatedly takes a URL from the queue `pool`, fetches it via
# Page, logs the URL and status to `output`, and queues unseen outgoing links
# when the page belongs to the start domain.
def crawl():
# There is no exit condition: once the queue is empty, pool.get() blocks this
# thread forever.
while True:
p = Page(pool.get())
#write data to output file
lock.acquire()
output.write(p.url+" "+str(p.status)+"\n")
print "%s crawls %s" % (threading.currentThread().getName(), p.url)
lock.release()
if not p.error:
p.outlinks()
outlinks = p.outlinks
# Only follow links found on pages of the start domain.
if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1] :
for link in outlinks:
# NOTE(review): the membership test runs before the lock is taken, so two
# threads could presumably both queue the same URL -- confirm.
if link['url'] not in exist:
lock.acquire()
pool.put(link['url'])
exist.add(link['url'])
lock.release()
# NOTE(review): if anything above raises, task_done() is never reached for
# this item and pool.join() would wait forever -- confirm.
pool.task_done()
# Start the crawler threads and wait until every queued URL has been handled.
for i in range(thread_num):
# The threads are not marked as daemon and crawl() never returns, so they
# presumably keep the process alive even after pool.join() has returned.
t = threading.Thread(target = crawl)
t.start()
pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus
Your crawl function has an infinite while loop with no possible exit path.
The condition True always evaluates to True and the loop continues, as you say,
not exiting properly
Modify the crawl function's while loop to include a condition. For instance, when the number of links saved to the csv file exceeds a certain minimum number, then exit the while loop.
i.e.,
def crawl():
while len(exist) <= min_links:
...

Categories

Resources