Python threadings: threads block - python

im checking my customers countries so as to i know which service i can offer bla bla......
so the the problems is the threads blocks for example it check 15-20 and block, i want a solution to keep it continuing
the code is:
import requests
import re
from sys import argv
from Queue import Queue
from threading import Thread
e = argv[1]
emails = open(e, 'r').readlines()
emails = map(lambda s: s.strip(), emails)
valid=[]
def base(email):
xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}).text
x = re.search("s.eVar36=\"(.*?)\";", xo)
try:
if x.group(1) != "":
print "%s === %s" % (email,x.group(1))
w=open(str(x.group(1))+".txt", 'a')
w.write(email+"\n")
valid.append(email)
except:
pass
def work():
email=q.get()
base(email)
q.task_done()
THREADS = 25
q=Queue()
for i in range(THREADS):
t=Thread(target=work())
t.daemon=True
t.start()
if (len(argv)>0):
for email in emails:
q.put(email)
q.join()`enter code here
thanks in advance

Your problem is that you call work() instead of passing the work function when creating your threads. Instead of putting changes in your code, consider moving the python's ThreadPool which does the heavy lifting for you. Here's an example that implements what you want.
map calls your worker for each email in the iterator and returns the worker's result as an iterator (python 3) or list (python 2). Your worker returns a valid email or None for each email its given, so you just have to filter out the Nones at the end.
import requests
import re
from sys import argv
import multiprocessing.pool
e = argv[1]
emails = [line.strip() for line in open(e)]
def base(email):
print("getting email {}".format(email))
try:
xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}).text
x = re.search("s.eVar36=\"(.*?)\";", xo)
try:
if x.group(1) != "":
print "%s === %s" % (email,x.group(1))
with open(str(x.group(1))+".txt", 'a') as w:
w.write(email+"\n")
return email
except:
pass
except requests.exceptions.RequestException as e:
print(e)
THREADS = 25
pool = multiprocessing.pool.ThreadPool(THREADS)
valid = [email for email in pool.map(base, emails, chunksize=1) if email]
print(valid)
pool.close()

Related

Python requests.get and threading with different servers

I am working on a simple web scraper and rn trying to implement some multithreading. While my code works as intended with some servers(reducing time of execution vastly), my primary goal is to make it work with few specific ones. So when I try it with the ones in sites list, I get performance like I am still using sequential code. Any guesses what can cause this?
import requests, time
from bs4 import BeautifulSoup
from threading import Thread
from random import choice
# Enable to get some logging info
#---------------------------------
# import logging
# import http.client
# http.client.HTTPConnection.debuglevel = 1
# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
# requests_log = logging.getLogger("requests.packages.urllib3")
# requests_log.setLevel(logging.DEBUG)
# requests_log.propagate = True
sites = [
"https://pikabu.ru/community/blackhumour",
"https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]
class Pikabu_Downloader(Thread):
def __init__(self, url, name, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = url
self.name = name
self.begin = time.time()
def run(self):
print("Beginning with thread number",self.name, ",", round(time.time()-self.begin, 4), " seconds has passed")
html_data = self._get_html()
print("After requests.get with thread number", self.name, ",", round(time.time()-self.begin, 4), " seconds has passed")
if html_data is None:
return
self.soup = BeautifulSoup(html_data, "html.parser")
print("After making soup with thread number", self.name, ",", round(time.time() - self.begin, 4), " seconds has passed")
def _get_html(self):
try:
user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
print(f"Go {self.url}...")
res = requests.get(self.url, headers={'User-Agent': choice(user_agents)}, stream = True)#, allow_redirects=False)
except Exception as exc:
print(exc)
else:
return res.text
test = "https://readingbooks.site/read/?name=1984&"
def download():
pikabu_urls = []
for url in sites:
pikabu = [url + "?page=" + str(x) for x in range(1, 10)]
pikabu_urls = pikabu_urls + pikabu
pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(pikabu_urls)]
# Comment the string above and enable 2 underlying strings to get result from test server
# tests = [test + "page=" + str(x) for x in range(1, pages)]
# pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(tests)]
for pikabu_dl in pikabu_dls:
pikabu_dl.start()
for pikabu_dl in pikabu_dls:
pikabu_dl.join()
download()
And the result is something like
...
After requests.get with thread number 1 , 1.6904 seconds has passed
After making soup with thread number 1 , 1.7554 seconds has passed
After requests.get with thread number 2 , 2.9805 seconds has passed
After making soup with thread number 2 , 3.0455 seconds has passed
After requests.get with thread number 3 , 4.3225 seconds has passed
After making soup with thread number 3 , 4.3895 seconds has passed
...
What can cause such latency between thread executions? I was hoping to get each thread to finish almost simultaneously and to get more...asynchronous output, like with server from test. If I set a timeout of 5 sec inside requests.get, most of the requests wont even work.
After I investigated your case, I would point out some issues that you have encountered:
Do not print when it is on parallel tasks, it will cause the bottle-neck on the way of rendering to screen
The large of tasks are not always good for performance, it depends on how much your memory will process. Imagine that you have 1000 links, you have to create 1000 task objects? No, only place-holder for 5-20 by leveraging ThreadPool
Server also is a problem to deal with when taking request. Downloaded size, low bandwidth, network, distancing,.. caused response late will affect your physic machine. Your sites are weight, it seems consuming 1-3000ms each request so when you test it with small size (20 links), it makes you feel it runs sequentially
Your code is running parallel, since you do a little bit trick to put it on different threads, it is not quite right because we need a fully async library, such like asyncio and aiohttp. The aiohttp will take care numerous async requests on the Coroutine whereas asyncio will support syntax and operate on your main thread.
I did a small experiment on colab, please be noticed that I didn't use asyncio and aiohttp on colab because of stuck, but I have implemented on several projects before and it worked faster than below fastest method.
The second function is your implementation
import urllib.request
from threading import Thread
import time, requests
from random import choice
user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
timeout = 5
sites = [
"https://pikabu.ru/community/blackhumour",
"https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]
URLS = []
for url in sites:
pikabu = [url + "?page=" + str(x) for x in range(25)]
URLS.extend(pikabu)
def convert_to_threads():
return [Thread(target=load_url, args=(page, timeout)) for page in URLS]
def running_threads():
threads = convert_to_threads()
start = time.time()
for i in threads:
i.start()
for i in threads:
i.join()
print(f'Finish with {len(URLS)} requests {time.time() - start}')
def load_url(url, timeout):
res = requests.get(url, headers={'User-Agent': choice(user_agents)}, stream = True)#, allow_redirects=False)
return res.text
def running_sequence():
start = time.time()
for url in URLS:
load_url(url, timeout)
print(f'Finish with {len(URLS)} requests {time.time() - start}')
def running_thread_pool():
start = time.time()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url, timeout): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
# else:
# print('%r page is %d length' % (url, len(data)))
print(f'Finish with {len(URLS)} requests {time.time() - start}')
In short, I recommend you use ThreadPool (prefer in colab), or asyncio and aiohttp (not in colab) to gain speed

multiprocessing ThreadPool stops at the end

I wrote a script that "parses" all domains from the file. After the launch, everything works as it should. But when there are several domains left at the end, it gets stuck. Sometimes it takes a long time to parse the last couple of domains. I can't figure out what the problem is. Who has faced such a situation? Tell me how to cure it.
After the launch, everything works out very quickly (as it should) until the end. At the end, it stops when there are several domains left. There is no difference, 1000 domains or 10 000 domains.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
pool = 100
with open("Rules.json") as file:
REGEX = json.loads(file.read())
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}
def Domain_checker(domain):
try:
r = requests.get("http://" + domain, verify=False, headers=ua)
r.encoding = "utf-8"
for company in REGEX.keys():
for type in REGEX[company]:
check_entry = 0
for ph_regex in REGEX[company][type]:
if bool(re.search(ph_regex, r.text)) is True:
check_entry += 1
if check_entry == len(REGEX[company][type]):
title = BeautifulSoup(r.text, "lxml")
Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
print(Found_domain)
with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors = 'ignore') as file:
file.write(Found_domain)
except requests.exceptions.ConnectionError:
pass
except requests.exceptions.TooManyRedirects:
pass
except requests.exceptions.InvalidSchema:
pass
except requests.exceptions.InvalidURL:
pass
except UnicodeError:
pass
except requests.exceptions.ChunkedEncodingError:
pass
except requests.exceptions.ContentDecodingError:
pass
except AttributeError:
pass
except ValueError:
pass
return domain
if __name__ == '__main__':
with open(sys.argv[1], "r", encoding='utf-8', errors = 'ignore') as file:
Domains = file.read().split()
pool = 100
print("Pool = ", pool)
results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)
string_num = 0
for result in results:
print("{0} => {1}".format(string_num, result))
string_num += 1
with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors = 'ignore') as found_domains:
found_domains = found_domains.read()
print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem is resolved after installing timeout
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)

Multiprocessing in python/beautifulsoup issues

Hi guys i'm fairly new in python. what i'm trying to do is to move my old code into multiprocessing however i'm facing some errors that i hope anyone could help me out. My code is used to check a few thousand links given in a text form to check for certain tags. Once found it will output it to me. Due to the reason i have a few thousand links to check, speed is an issue and hence the need for me to move to multi processing.
Update: i'm having return errors of HTTP 503 errors. Am i sending too much request or am i missin gout something?
Multiprocessing code:
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
no_stock = []
def main(lines):
done = False
tries = 1
while tries and not done:
try:
r = br.open(lines, timeout=15)
r = r.read()
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except socket.timeout:
print('Failed socket retrying')
tries -= 1 # to exit when tries == 0
except Exception as e:
print '%s: %s' % (e.__class__.__name__, e)
print sys.exc_info()[0]
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}\n'.format(lines))
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(lines)
if __name__ == "__main__":
r = br.open('http://www.randomweb.com/') #avoid redirection
fileName = "url.txt"
pool = Pool(processes=2)
with open(fileName, "r+") as f:
lines = pool.map(main, f)
with open('no_stock.txt','w') as f :
f.write('No. of out of stock items : '+str(len(no_stock))+'\n'+'\n')
for i in no_stock:
f.write(i + '\n')
Traceback:
Traceback (most recent call last):
File "test2.py", line 43, in <module>
lines = pool.map(main, f)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
raise self._value
UnboundLocalError: local variable 'soup' referenced before assignment
my txt file is something like this:-
http://www.randomweb.com/item.htm?uuid=44733096229
http://www.randomweb.com/item.htm?uuid=4473309622789
http://www.randomweb.com/item.htm?uuid=447330962291
....etc
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool # This is a thread-based Pool
from multiprocessing import cpu_count
br = Browser()
no_stock = []
def main(line):
done = False
tries = 3
while tries and not done:
try:
r = br.open(line, timeout=15)
r = r.read()
soup = BeautifulSoup(r,'html.parser')
done = True # exit the loop
except socket.timeout:
print('Failed socket retrying')
tries -= 1 # to exit when tries == 0
except:
print('Random fail retrying')
print sys.exc_info()[0]
tries -= 1 # to exit when tries == 0
if not done:
print('Failed for {}\n'.format(i))
table = soup.find_all('div', {'class' : "empty_result"})
results = soup.find_all('strong', style = 'color: red;')
if table or results:
no_stock.append(i)
if __name__ == "__main__":
fileName = "url.txt"
pool = Pool(cpu_count() * 2) # Creates a Pool with cpu_count * 2 threads.
with open(fileName, "rb") as f:
lines = pool.map(main, f)
with open('no_stock.txt','w') as f :
f.write('No. of out of stock items : '+str(len(no_stock))+'\n'+'\n')
for i in no_stock:
f.write(i + '\n')
pool.map takes two parameters, the fist is a function(in your code, is main), the other is an iterable, each item of iterable will be a parameter of the function(in your code, is each line of the file)

Python, send a stop notification to a blocking loop within a thread

I've read many answers, however I have not found a proper solution.
The problem, I'm reading mixed/replace HTTP streams that will not expire or end by default.
You can try it by yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So, now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
thread_id = []
def http_handler(thread_id, url, flag):
print 'Starting task %s' % thread_id
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
print line
if flag and line.endswith('</MTConnectStreams>'):
# Wait until XML message end is reached to receive the full message
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
for task in tasks:
uid = str(uuid.uuid4())
thread_id.append(uid)
t = Thread(target=http_handler, args=(uid, task, False), name=uid)
t.start()
print thread_id
# Wait Time X or until user is doing something
# Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread because I would like to read the ending to have a full XML message.
I found a solution by using threading module and threading.events. Maybe not the best solution, but it works fine currently.
import logging
import threading
import time
import uuid
import requests
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
d = dict()
def http_handler(e, url):
logging.debug('wait_for_event starting')
message_buffer = []
filter_namespace = True
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
message_buffer.append(line)
if e.isSet() and line.endswith('</MTConnectStreams>'):
logging.debug(len(message_buffer))
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
logging.debug('Waiting before calling Event.set()')
for task in tasks:
uid = str(uuid.uuid4())
e = threading.Event()
d[uid] = {"stop_event": e}
t = threading.Event(uid)
t = threading.Thread(name=uid,
target=http_handler,
args=(e, task))
t.start()
logging.debug('Waiting 3 seconds before calling Event.set()')
for key in d:
time.sleep(3)
logging.debug(threading.enumerate())
logging.debug(d[key])
d[key]['stop_event'].set()
logging.debug('bye')

Python Threading Error when executing a MSSQL statement

I have a producer, consumer application where the producer reads a database and puts the results into a website. On success, The consumer then gets the id of the transaction and updates the db.
The program runs as required until it attempts to execute the update. It fails sometimes with this error 'HY000', 'The driver did not supply an error!'
The code can comfortably write to file without any issues.
What could i do to fix this? We need to update the db.
Thanks
Notes...using python 2.7 with pyodbc on mssql 2008.
code below
#!/usr/bin/env python
from urlparse import urlparse
from threading import Thread
import httplib, sys
import Queue
import urllib2
import urllib
from time import localtime, strftime
from ConfigParser import SafeConfigParser
from functions import Functions
from pyodbcclass import db_mssql
now = strftime("%y-%m-%d-%H%M%S", localtime())
k = db_mssql()
thread_list = []
thread_list2 = []
getFromDBQueue = Queue.Queue()
updateDBQueue = Queue.Queue()
number_of_consumer_threads = 3
def putURL():
querySql = "select distinct top 3 id,url from tblURL where processed=0 order by id asc"
re = k.query2(querySql)
if re:
for r in re:
id = r.id
params = urllib.urlencode({'user': user, 'password': password})
ourl = urlini + "?%s" % params
urlplusid = {'url':ourl.strip(),'id':id}
getFromDBQueue.put(urlplusid)
def getURL(thread_id):
while 1:
try:
URL_toget = getFromDBQueue.get(block=False)
url2 = URL_toget['url']
msgid2 = URL_toget['id']
except Queue.Empty:
print "thread exiting, id: " + str(thread_id) + "++getFromDB++"
sys.exit()
status,url = getStatus(url2)
if status == 200:
updateDBQueue.put(msgid2)
print(status)
def updateDB(thread_id):
while 1:
try:
id2 = updateDBQueue.get(block=False)
if id2:
params = ['true',id2]
sqlupdate = "UPDATE tblURL SET processed=? WHERE id=?"
k.execute3(sqlupdate,params)
except Queue.Empty:
print "thread exiting, id: " + str(thread_id) + "**update**"
sys.exit()
# fill the queue with work and block until we are done filling the queue
producer_thread = Thread(target=putURL)
producer_thread.start()
producer_thread.join()
# we can now start consumers
for i in range(number_of_consumer_threads):
getfromDB = Thread(target=getURL, args=(i,))
getfromDB.start()
thread_list.append(getfromDB)
for i in range(number_of_consumer_threads):
update = Thread(target=updateDB, args=(i,))
update.start()
thread_list2.append(update)
for thread in thread_list:
thread.join()
for thread2 in thread_list2:
thread2.join()

Categories

Resources