I have a producer/consumer application where the producer reads from a database and posts the results to a website. On success, the consumer then gets the id of the transaction and updates the database.
The program runs as required until it attempts to execute the update. It sometimes fails with this error: 'HY000', 'The driver did not supply an error!'
The code can comfortably write to a file without any issues.
What could I do to fix this? We need to update the database.
Thanks
Notes: using Python 2.7 with pyodbc against MS SQL Server 2008.
Code below:
#!/usr/bin/env python
from urlparse import urlparse
from threading import Thread
import httplib, sys
import Queue
import urllib2
import urllib
from time import localtime, strftime
from ConfigParser import SafeConfigParser
from functions import Functions
from pyodbcclass import db_mssql

now = strftime("%y-%m-%d-%H%M%S", localtime())
k = db_mssql()
thread_list = []
thread_list2 = []
getFromDBQueue = Queue.Queue()
updateDBQueue = Queue.Queue()
number_of_consumer_threads = 3

def putURL():
    querySql = "select distinct top 3 id,url from tblURL where processed=0 order by id asc"
    re = k.query2(querySql)
    if re:
        for r in re:
            id = r.id
            params = urllib.urlencode({'user': user, 'password': password})
            ourl = urlini + "?%s" % params
            urlplusid = {'url': ourl.strip(), 'id': id}
            getFromDBQueue.put(urlplusid)

def getURL(thread_id):
    while 1:
        try:
            URL_toget = getFromDBQueue.get(block=False)
            url2 = URL_toget['url']
            msgid2 = URL_toget['id']
        except Queue.Empty:
            print "thread exiting, id: " + str(thread_id) + "++getFromDB++"
            sys.exit()
        status, url = getStatus(url2)
        if status == 200:
            updateDBQueue.put(msgid2)
        print(status)

def updateDB(thread_id):
    while 1:
        try:
            id2 = updateDBQueue.get(block=False)
            if id2:
                params = ['true', id2]
                sqlupdate = "UPDATE tblURL SET processed=? WHERE id=?"
                k.execute3(sqlupdate, params)
        except Queue.Empty:
            print "thread exiting, id: " + str(thread_id) + "**update**"
            sys.exit()

# fill the queue with work and block until we are done filling the queue
producer_thread = Thread(target=putURL)
producer_thread.start()
producer_thread.join()

# we can now start consumers
for i in range(number_of_consumer_threads):
    getfromDB = Thread(target=getURL, args=(i,))
    getfromDB.start()
    thread_list.append(getfromDB)

for i in range(number_of_consumer_threads):
    update = Thread(target=updateDB, args=(i,))
    update.start()
    thread_list2.append(update)

for thread in thread_list:
    thread.join()
for thread2 in thread_list2:
    thread2.join()
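One detail worth checking: pyodbc reports threadsafety level 1, meaning threads may share the module but not connections, and here the single module-level connection (k = db_mssql()) is used from both the producer and the updater threads, which can surface opaque driver errors such as this HY000. Below is a minimal sketch of giving each updater thread its own connection; the connection string is a placeholder and db_mssql's internals are not shown in the question, so treat the details as assumptions:

import pyodbc

# hypothetical connection string; replace with whatever db_mssql uses
CONN_STR = "DRIVER={SQL Server};SERVER=myserver;DATABASE=mydb;UID=me;PWD=secret"

def updateDB(thread_id):
    conn = pyodbc.connect(CONN_STR)   # one connection owned by this thread
    cursor = conn.cursor()
    while 1:
        try:
            id2 = updateDBQueue.get(block=False)
            cursor.execute("UPDATE tblURL SET processed=? WHERE id=?", (1, id2))
            conn.commit()
        except Queue.Empty:
            conn.close()
            sys.exit()

Note the sketch also passes 1 rather than the string 'true' for processed, on the assumption that it is a bit column.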
I need to execute code inside a while loop every x seconds without stopping the loop's work.
I have tried threading and lock combinations, but it is still not working. I am working with Python 3.7.4 and PyCharm 2019.2.
#!/usr/bin/env python3
import configparser
import logging
import threading
import time

import ts3

__all__ = ["notify_bot"]

logging.basicConfig(filename='ts3bot.log',
                    level=logging.INFO,
                    format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s",
                    )
logging.getLogger().addHandler(logging.StreamHandler())

def notify_bot(ts3conn, config, lock):
    logging.info("Start Notify Bot ...")
    lock.acquire()
    ts3conn.exec_("servernotifyregister", event="server")
    lock.release()
    while True:
        event = ts3conn.wait_for_event()
        try:
            reasonid_ = event[0]["reasonid"]
        except KeyError:
            continue
        if reasonid_ == "0":
            logging.info("User joined Lobby:")
            logging.info(event[0])
            servergroups = event[0]['client_servergroups']
            guestname = event[0]['client_nickname']
            lock.acquire()
            if not set(servergroups):
                print(f"s1 {guestname}")
            else:
                print(f"s2{guestname}")
            lock.release()
    return None

def keep_alive(ts3conn, lock):
    while True:
        logging.info("Send keep alive!")
        lock.acquire()
        ts3conn.send_keepalive()
        lock.release()
        time.sleep(5)

if __name__ == "__main__":
    logging.info("Start TS Bot ...")
    config = configparser.ConfigParser()
    config.sections()
    config.read("settings_test.ini")
    logging.info("Config loaded!")
    HOST = config['server']['url']
    PORT = config['server']['query_port']
    USER = config['server']['query_user']
    PASS = config['server']['query_pw']
    SID = config['server']['sid']
    NAME = config['bot']['name']
    logging.info("Connecting to query interface ...")
    URI = f"telnet://{USER}:{PASS}@{HOST}:{PORT}"
    try:
        with ts3.query.TS3ServerConnection(URI) as ts3conn:
            ts3conn.exec_("use", sid=SID)
            ts3conn.query("clientupdate", client_nickname="x123d")
            logging.info("Connected!")
            lock = threading.Lock()
            notify_thread = threading.Thread(target=notify_bot, args=(ts3conn, config, lock), daemon=True,
                                             name="notify")
            keep_alive_thread = threading.Thread(target=keep_alive, args=(ts3conn, lock), daemon=True,
                                                 name="keep_alive")
            notify_thread.start()
            keep_alive_thread.start()
            keep_alive_thread.join()
            notify_thread.join()
    except KeyboardInterrupt:
        logging.info(60 * "=")
        logging.info("TS Bot terminated by user!")
        logging.info(60 * "=")
After starting, it works for one person who joins the server and then does nothing: it doesn't send the keep-alive and doesn't work at all.
You can use the built-in time library.
You can check it on the official Python website (https://docs.python.org/3/library/time.html).
Personally, for simple things, I find the _thread library easier. Here's a function that you can run in a thread, and an example of starting that thread:
import _thread
import time

def mythread(arg1):
    while True:
        time.sleep(arg1)
        print("doing the periodic work")  # replace with your own work

_thread.start_new_thread(mythread, (5,))
The important thing to note is the second argument I passed to the _thread.start_new_thread function. It must be a tuple, which is why there is a comma after the 5. Even if your function doesn't require any arguments, you have to pass a tuple.
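For example, a zero-argument function is still started with an empty tuple (the heartbeat function here is purely illustrative):

import _thread
import time

def heartbeat():
    while True:
        print("tick")
        time.sleep(1)

# an empty tuple is still required when the target takes no arguments
_thread.start_new_thread(heartbeat, ())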
I am using the time module and threading.
I've made some changes and it seems to work:
#!/usr/bin/env python3
import configparser
import logging
import threading
import time

import ts3

logging.basicConfig(filename='ts3bot.log',
                    level=logging.INFO,
                    format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s",
                    )
logging.getLogger().addHandler(logging.StreamHandler())

def notify_bot(ts3conn):
    logging.info("Start Notify Bot ...")
    ts3conn.exec_("servernotifyregister", event="server")
    while True:
        event = ts3conn.wait_for_event()
        try:
            reasonid_ = event[0]["reasonid"]
        except KeyError:
            continue
        if reasonid_ == "0":
            logging.info("User joined Lobby:")
            logging.info(event[0])
            servergroups = event[0]['client_servergroups']
            guestname = event[0]['client_nickname']
            if not set(servergroups):
                print(f"s1 {guestname}")
            else:
                print(f"s2{guestname}")
    return None

def keep_alive(ts3conn, time):
    while True:
        logging.info("Send keep alive!")
        ts3conn.send_keepalive()
        time.sleep(20)

if __name__ == "__main__":
    logging.info("Start TS Bot ...")
    config = configparser.ConfigParser()
    config.sections()
    config.read("settings_test.ini")
    logging.info("Config loaded!")
    HOST = config['server']['url']
    PORT = config['server']['query_port']
    USER = config['server']['query_user']
    PASS = config['server']['query_pw']
    SID = config['server']['sid']
    NAME = config['bot']['name']
    logging.info("Connecting to query interface ...")
    URI = f"telnet://{USER}:{PASS}@{HOST}:{PORT}"
    try:
        with ts3.query.TS3ServerConnection(URI) as ts3conn:
            ts3conn.exec_("use", sid=SID)
            ts3conn.query("clientupdate", client_nickname="x123d")
            logging.info("Connected!")
            notify_thread = threading.Thread(target=notify_bot, args=(ts3conn,), daemon=True,
                                             name="notify")
            keep_alive_thread = threading.Thread(target=keep_alive, args=(ts3conn, time), daemon=True,
                                                 name="keep_alive")
            notify_thread.start()
            keep_alive_thread.start()
            keep_alive_thread.join()
            notify_thread.join()
    except KeyboardInterrupt:
        logging.info(60 * "=")
        logging.info("TS Bot terminated by user!")
        logging.info(60 * "=")
It looks like ts3conn.send_keepalive() is causing the error: when I delete it, the code works fine; when I add it back, the code stops working after ts3conn.send_keepalive() has been sent once.
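A plausible explanation (worth verifying against the py-ts3 docs): both threads write to the same query connection, so a keep-alive sent while wait_for_event() is blocked in the other thread interleaves two commands on one socket. The library's own event example avoids threads entirely by giving wait_for_event() a timeout and sending the keep-alive whenever it expires. A minimal single-threaded sketch:

import ts3

# assumes ts3conn is an established ts3.query.TS3ServerConnection
ts3conn.exec_("servernotifyregister", event="server")
while True:
    ts3conn.send_keepalive()
    try:
        # the server drops idle query connections after ~10 minutes,
        # so wake up at least every 9 minutes
        event = ts3conn.wait_for_event(timeout=540)
    except ts3.query.TS3TimeoutError:
        continue
    # handle event[0] here, as in notify_bot()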
I wrote this crawler in Python; it dumps several parameters to a JSON output file based on an input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is ca. 5 million items. It starts out processing at a rate of ca. 50 iterations per second, but after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this code-related or network-related? I suspect the software, since when I restart it, it starts again at a high rate (ca. 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially regarding speed and crawling throughput.
Code in question:
import re
import sys
from threading import Thread
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        # print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better; it no longer seems to hang after some time. Performance is 50 req/sec on a home laptop and ca. 100 req/sec on a VPS.
import re
from threading import Thread
import httplib, sys
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        realsock = response.fp._sock.fp._sock
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be automatically garbage collected, but you will be better off closing them yourself, especially as you are doing this in a tight loop.
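One tidy way to do that is contextlib.closing (a sketch; the URL and 10-second timeout are my own):

import urllib2
from contextlib import closing

# closing() guarantees response.close() runs even if parsing raises
with closing(urllib2.urlopen("http://example.com", timeout=10)) as response:
    html = response.read()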
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
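For illustration, one worker's fetch could look roughly like this with requests (the fetch name and timeout value are my own, and this only covers the header fields, not the peer-IP lookup):

import requests

def fetch(ourl):
    # a Session per thread reuses connections and closes them cleanly
    with requests.Session() as s:
        r = s.get("http://" + ourl, timeout=10)
        server = r.headers.get("Server", "")
        powered_by = r.headers.get("X-Powered-By", "")
        return r.text, server, powered_by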
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow only one connection per IP address in parallel.
2.) Try to send randomized browser-like HTTP headers (user-agent, referrer, ...) to get past web server scraping protection, if set; see the sketch after this list.
3.) Use a mature parallel HTTP library, like pycurl (has MultiCurl) or requests (grequests). They perform faster for sure.
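As a sketch of point 2, randomizing the User-Agent per request might look like this with the urllib2 the crawler already uses (the header strings and timeout are just examples):

import random
import urllib2

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]

def open_with_headers(ourl):
    req = urllib2.Request("http://" + ourl)
    req.add_header("User-Agent", random.choice(USER_AGENTS))
    return urllib2.urlopen(req, timeout=10)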
I'm writing a parallel crawler using Python and I'm storing some information in MongoDB. After testing I realized that my code, even though it uses threading, is not parallel. It makes no difference whether I use a single thread or 10 or 50 threads. I can't figure out why.
EDIT: From what I can see, most of the processing time is taken up by soup = BeautifulSoup(html). Could it be that this command can't be parallelized using threads?
from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib

start_time = time.time()
level = 1
client = MongoClient()
db = client.crawler
visited = {}

def doWork():
    while True:
        try:
            myUrl = q_start.get()
        except:
            continue
        try:
            c = urllib2.urlopen(myUrl)
        except:
            q_start.task_done()
            continue
        parsed_url = urlparse(myUrl)
        html = c.read()
        try:
            soup = BeautifulSoup(html)
        except:
            q_start.task_done()
            continue
        txt = soup.prettify()
        links = soup('a')
        m = hashlib.md5(myUrl)
        db.urls.insert(
            {
                "url": myUrl,
                "HTML": txt,
                "level": level,
                "domain": parsed_url.netloc,
                "md5": m.hexdigest()
            }
        )
        for link in links:
            if 'href' in dict(link.attrs):
                url = urljoin(myUrl, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]
                if url[0:4] == 'http':
                    if url in visited:
                        continue
                    else:
                        visited[url] = True
                        q_new.put(url)
        q_start.task_done()

q_start = Queue.Queue()
q_new = Queue.Queue()

for i in range(50):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

q_start.put("http://google.com")
q_start.join()

for i in range(2, 5):
    print "Depth: "
    print i
    print time.time() - start_time
    level += 1
    print q_new.qsize()
    q_aux = q_new
    q_new = Queue.Queue()
    while q_aux.empty() != True:
        x = q_aux.get()
        q_start.put(x)
    q_start.join()

print "end"
print time.time() - start_time
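On the edit: BeautifulSoup's parsing is pure-Python CPU work, so under CPython's GIL only one thread can execute it at a time; the threads only overlap on network I/O. A quick way to test that hypothesis is to move the parsing into worker processes, sketched below (the sample HTML is a placeholder, and this is not the full crawler):

from multiprocessing import Pool
from BeautifulSoup import BeautifulSoup

def parse(html):
    # CPU-bound parsing runs in its own process, sidestepping the GIL
    soup = BeautifulSoup(html)
    return soup.prettify()

if __name__ == '__main__':
    pages = ["<html><body><a href='http://example.com'>x</a></body></html>"]
    pool = Pool(processes=4)
    print pool.map(parse, pages)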
Recently, I have been working on a gevent demo, and I am trying to compare the efficiency of gevent against threads. Generally speaking, the gevent code should be more efficient than the thread code. But when I use the time command to profile the program, I get an unusual result (my command is time python FILENAME.py 50 1000; the last two parameters mean the pool size or thread count, so I vary those two numbers in the table below). The results show that the threads are more efficient than the gevent code, so I want to know why this happens and what's wrong with my program. Thanks.
[table: gevent vs. thread timing results]
My code is below (the main idea is to use threads or gevent to send multiple HTTP requests):
******This is the thread version code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import requests
import threading
import time
import urllib2

finished = 0

def GetUrl(pagenum):
    url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
          str(pagenum*20) + '&rn=20'
    return url

def setUrlSet():
    for i in xrange(requestnum):
        urlnum = i % 38
        urlset.append(GetUrl(urlnum))

def GetResponse(pagenum):
    try:
        r = requests.get(urlset[pagenum])
    except Exception, e:
        print e
        pass

def DigJobByPagenum(pagenum, requestnum):
    init_num = pagenum
    print '%d begin' % init_num
    while pagenum < requestnum:
        GetResponse(pagenum)
        pagenum += threadnum
    print '%d over' % init_num

def NormalThread(threadnum):
    startime = time.time()
    print "%s is running..." % threading.current_thread().name
    threads = []
    global finished, requestnum
    for i in xrange(threadnum):
        thread = threading.Thread(target=DigJobByPagenum, args=(i, requestnum))
        threads.append(thread)
    for t in threads:
        t.daemon = True
        t.start()
    for t in threads:
        t.join()
        finished += 1
    endtime = time.time()
    print "%s is stop.The total time is %0.2f" % \
          (threading.current_thread().name, (endtime - startime))

def GetAvageTime(array):
    alltime = 0.0
    for i in array:
        alltime += i
    avageTime = alltime / len(array)
    return avageTime

if __name__ == '__main__':
    threadnum = int(sys.argv[1])
    requestnum = int(sys.argv[2])
    print 'threadnum : %s,requestnum %s ' % (threadnum, requestnum)
    originStartTime = time.time()
    urlset = []
    setUrlSet()
    NormalThread(threadnum)
******This is the gevent version code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import gevent
from gevent import pool
import requests
import time

finished = 0

def GetUrl(pagenum):
    url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
          str(pagenum*20) + '&rn=20'
    return url

def setUrlSet():
    for i in xrange(requestnum):
        urlnum = i % 38
        urlset.append(GetUrl(urlnum))

def GetResponse(url):
    startime = time.time()
    r = requests.get(url)
    print url
    endtime = time.time()
    spendtime = endtime - startime
    NormalSpendTime.append(spendtime)
    global finished
    finished += 1
    print finished

def GetAvageTime(array):
    alltime = 0.0
    for i in array:
        alltime += i
    avageTime = alltime / len(array)
    return avageTime

def RunAsyncJob():
    jobpool = pool.Pool(concurrent)
    for url in urlset:
        jobpool.spawn(GetResponse, url)
    jobpool.join()
    endtime = time.time()
    allSpendTime = endtime - originStartime
    print 'Total spend time is %0.3f, total request num is %s within %s \
seconds' % (allSpendTime, finished, timeoutNum)
    print 'Each request time is %0.3f' % (GetAvageTime(NormalSpendTime))

if __name__ == '__main__':
    concurrent = int(sys.argv[1])
    requestnum = int(sys.argv[2])
    timeoutNum = 100
    NormalSpendTime = []
    urlset = []
    urlActionList = []
    setUrlSet()
    originStartime = time.time()
    RunAsyncJob()
Try
gevent.monkey.patch_all(httplib=True)
It seems that by default gevent does not patch httplib (have a look at http://www.gevent.org/gevent.monkey.html : httplib=False) so you are actually doing blocking requests and you lose all advantages of the asynchronous framework. Although I'm not sure whether requests uses httplib.
If that doesn't work, then have a look at this lib:
https://github.com/kennethreitz/grequests
Re: httplib=False
You are already using the requests library to make web calls. It has a gevent flavour called grequests:
https://github.com/kennethreitz/grequests
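A minimal usage sketch, assuming grequests is installed (the URL list is a placeholder):

import grequests

urls = ["http://example.com/page%d" % i for i in range(10)]
reqs = (grequests.get(u) for u in urls)
# map() dispatches all requests concurrently on gevent greenlets
for response in grequests.map(reqs):
    if response is not None:  # failed requests come back as None
        print response.status_code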
Overall, I don't immediately see much reason to prefer one style of threading over the other if your pool is so small. Of course real threads are relatively heavy (they start with an 8 MB stack), but you have to weigh that against the size of your job.
My take: try both (done), verify you are doing both right (to do), and let the numbers do the talking.
The purpose of my program is to download files with threads. I define a unit size and use len/unit threads, where len is the length of the file to be downloaded.
Using my program, the file can be downloaded, but the threads are not stopping. I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime

class MyThread(threading.Thread):
    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        apply(self.func, self.args)

url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF'
request = urllib2.Request(url)
response = urllib2.urlopen(request)
meta = response.info()
response.close()

unit = 1000000
flen = int(meta.getheaders('Content-Length')[0])
print flen

if flen % unit == 0:
    bs = flen / unit
else:
    bs = flen / unit + 1

blocks = range(bs)
cnt = {}
for i in blocks:
    cnt[i] = i

def getStr(i):
    try:
        print 'Thread %d start.' % (i,)
        fout = open('a.zip', 'wb')
        fout.seek(i * unit, 0)
        if (i + 1) * unit > flen:
            request.add_header('Range', 'bytes=%d-%d' % (i * unit, flen - 1))
        else:
            request.add_header('Range', 'bytes=%d-%d' % (i * unit, (i + 1) * unit - 1))
        # opener = urllib2.build_opener()
        # buf = opener.open(request).read()
        resp = urllib2.urlopen(request)
        buf = resp.read()
        fout.write(buf)
    except BaseException:
        print 'Error'
    finally:
        # opener.close()
        fout.flush()
        fout.close()
        del cnt[i]
        # filelen = os.path.getsize('a.zip')
        print 'Thread %d ended.' % (i),
        print cnt
        # print 'progress : %4.2f' % (filelen * 100.0 / flen,), '%'

def main():
    print 'download at:', ctime()
    threads = []
    for i in blocks:
        t = MyThread(getStr, (blocks[i],), getStr.__name__)
        threads.append(t)
    for i in blocks:
        threads[i].start()
    for i in blocks:
        # print 'this is the %d thread;' % (i,)
        threads[i].join()
    # print 'size:', os.path.getsize('a.zip')
    print 'download done at:', ctime()

if __name__ == '__main__':
    main()
Could someone please help me understand why the threads aren't stopping.
I can't really address your code example because it is quite messy and hard to follow, but a potential reason you are seeing the threads not end is that a request will stall out and never finish. urllib2 allows you to specify timeouts for how long you will allow the request to take.
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Make the HTTP requests have a timeout. If the timeout expires, try again or put the work back into the queue.
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue

def worker(queue, results, lock):
    local_results = []
    while True:
        val = queue.get()
        if val is None:
            break
        # pretend to do work
        time.sleep(.1)
        local_results.append(val)
    with lock:
        results.extend(local_results)
    print threading.current_thread().name, "Done!"

num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []

for i in xrange(100):
    queue.put(i)

for _ in xrange(num_workers):
    # Use None as a sentinel to signal the threads to end
    queue.put(None)
    t = threading.Thread(target=worker, args=(queue, results, lock))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print sorted(results)
print "All done"