I'm trying to run this code:
def VideoHandler(id):
    try:
        cursor = conn.cursor()
        print "Doing {0}".format(id)
        data = urllib2.urlopen("http://myblogfms2.fxp.co.il/video" + str(id) + "/").read()
        title = re.search("<span class=\"style5\"><strong>([\\s\\S]+?)</strong></span>", data).group(1)
        picture = re.search("#4F9EFF;\"><img src=\"(.+?)\" width=\"120\" height=\"90\"", data).group(1)
        link = re.search("flashvars=\"([\\s\\S]+?)\" width=\"612\"", data).group(1)
        id = id
        print "Done with {0}".format(id)
        cursor.execute("insert into videos (`title`, `picture`, `link`, `vid_id`) values('{0}', '{1}', '{2}', {3})".format(title, picture, link, id))
        print "Added {0} to the database".format(id)
    except:
        pass

x = 1
while True:
    if x != 945719:
        currentX = x
        thread.start_new_thread(VideoHandler, (currentX))
    else:
        break
    x += 1
and it says "can't start new thread"
The real reason for the error is most likely that you create way too many threads (close to a million!) and hit an OS-level limit.
Your code can be improved in many ways besides this:
don't use the low-level thread module; use the Thread class in the threading module.
join the threads at the end of your code
limit the number of threads you create to something reasonable: to process all elements, create a small number of threads and let each one process a subset of the whole data (this is what I propose below, but you could also adopt a producer-consumer pattern with worker threads getting their data from a queue.Queue instance; see the sketch after the proposal)
and never, ever have an except: pass statement in your code. Or if you do, don't come crying here when your code does not work and you cannot figure out why. :-)
Here's a proposal:
from threading import Thread
import urllib2
import re

def VideoHandler(id_list):
    for id in id_list:
        try:
            cursor = conn.cursor()
            print "Doing {0}".format(id)
            data = urllib2.urlopen("http://myblogfms2.fxp.co.il/video" + str(id) + "/").read()
            title = re.search("<span class=\"style5\"><strong>([\\s\\S]+?)</strong></span>", data).group(1)
            picture = re.search("#4F9EFF;\"><img src=\"(.+?)\" width=\"120\" height=\"90\"", data).group(1)
            link = re.search("flashvars=\"([\\s\\S]+?)\" width=\"612\"", data).group(1)
            id = id
            print "Done with {0}".format(id)
            cursor.execute("insert into videos (`title`, `picture`, `link`, `vid_id`) values('{0}', '{1}', '{2}', {3})".format(title, picture, link, id))
            print "Added {0} to the database".format(id)
        except:
            import traceback
            traceback.print_exc()

conn = get_some_dbapi_connection()

threads = []
nb_threads = 8
max_id = 945718
for i in range(nb_threads):
    id_range = range(i*max_id//nb_threads, (i+1)*max_id//nb_threads + 1)
    thread = Thread(target=VideoHandler, args=(id_range,))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()  # wait for completion
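For completeness, here is a minimal sketch of the producer-consumer variant mentioned above. It assumes a hypothetical handle_one_video(id) function that does the per-id fetching and inserting (in Python 2 the module is named Queue rather than queue):

from threading import Thread
from queue import Queue

def worker(q):
    while True:
        id = q.get()
        if id is None:            # sentinel value: no more work
            q.task_done()
            break
        handle_one_video(id)      # hypothetical: fetch, parse and insert one id
        q.task_done()

q = Queue(maxsize=1000)           # bounded queue so the producer can't run away
workers = [Thread(target=worker, args=(q,)) for _ in range(8)]
for w in workers:
    w.start()

for id in range(1, 945719):       # producer: push every id onto the queue
    q.put(id)
for _ in workers:                 # one sentinel per worker so each one exits
    q.put(None)

q.join()
for w in workers:
    w.join()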
The OS has a limit on the number of threads, so you can't create more threads than that limit allows.
A ThreadPool should be a good choice for this kind of high-concurrency work.
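As a rough sketch (assuming Python 3 and a hypothetical handle(id) function that does the per-id fetching and inserting), a fixed-size pool lets you feed it all 900k+ ids while only ever running a bounded number of threads:

from multiprocessing.pool import ThreadPool

def handle(id):
    # hypothetical per-id work: fetch the page, parse it, insert the row
    pass

pool = ThreadPool(processes=16)     # only 16 OS threads, no matter how many ids there are
pool.map(handle, range(1, 945719))  # blocks until every id has been processed
pool.close()
pool.join()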
I have a function that executes a SELECT SQL query (using PostgreSQL).
Now I want to INSERT into some table in my DB the execution time of this query; however, I want to do it in parallel, so that even if my INSERT query is still running I will be able to continue my program and call other functions.
I tried to use multiprocessing.Process; however, my function waits for the process to finish and I'm actually losing the effect of the parallelism I wanted.
My code in a nutshell:
def select_func():
    with connection.cursor() as cursor:
        query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'"
        start = time.time()
        cursor.execute(query)
        end = time.time()
        process = Process(target = insert_func, args = (query, (end-start)))
        process.start()
        process.join()
        return cursor.fetchall()

def insert_func(query, time):
    with connection.cursor() as cursor:
        query = "INSERT INTO infoTable (\"query\", \"exec_time\")
                 VALUES (\"" + query + "\", \"" + time + "\")"
        cursor.execute(query)
        connection.commit()
Now the problem is that this operation is not really async, since select_func waits until insert_func is finished. I want the execution of these functions to be independent, so that select_func can return even while insert_func is still running and I can continue and call other functions in my script.
Thanks!
Quite a lot of issues with your code snippet, but let's try to at least give a structure to implement.
def select_func():
    with connection.cursor() as cursor: #I don't think the same global variable connection should be used for read/write simultaneously
        query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'" #quotation issues
        start = time.time()
        cursor.execute(query)
        end = time.time()
        process = Process(target = insert_func, args = (query, (end-start)))
        process.start() #you start the process here BUT
        process.join() #you force python to wait for it here....
        return cursor.fetchall()

def insert_func(query, time):
    with connection.cursor() as cursor:
        query = "INSERT INTO infoTable (\"query\", \"exec_time\")
                 VALUES (\"" + query + "\", \"" + time + "\")"
        cursor.execute(query)
        connection.commit()
Consider an alternative:
import time
from multiprocessing import Process, Queue
# 'sql' below stands in for whatever DB driver you use (sqlite3, psycopg2, ...)

def select_func():
    read_con = sql.connect() #sqlite syntax but use your connection
    with read_con.cursor() as cursor:
        query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'" #where does Alice come from?
        start = time.time()
        cursor.execute(query)
        end = time.time()
        return cursor.fetchall(), (query, (end-start)) #Our tuple has query at position 0 and time at position 1

def insert_function(insert_queue): #The insert you want to parallelize
    connection = sql.connect("db") #initialize your 'writer'. Note: May be good to initialize the connection on each insert. Not sure if optimal.
    while True: #We keep pulling from the pipe
        data = insert_queue.get() # we pull from our pipe
        if data == 'STOP': #Example of a kill instruction to stop our process
            break #breaks the while loop and the function can 'exit'
        with connection.cursor() as cursor:
            query_data = data #I assume you would want to pass your query through the pipe
            query = query_data[0] #see how we stored the tuple
            time_taken = query_data[1] #as above
            insert_query = ("INSERT INTO infoTable (\"query\", \"exec_time\") "
                            "VALUES ('" + query + "', '" + str(time_taken) + "')") #Somehow query and time goes into the insert_query
            cursor.execute(insert_query)
            connection.commit()

if __name__ == '__main__': #Typical python main thread
    query_pipe = Queue() #we initialize a Queue here to feed into your inserting function
    process = Process(target=insert_function, args=(query_pipe,))
    process.start()

    stuff = []
    for i in range(5):
        data, insert_query = select_func() #select function lets say it gets the data you want to insert.
        stuff.append(data)
        query_pipe.put(insert_query)
    #
    #Do other stuff and even put more stuff into the pipe.
    #
    query_pipe.put('STOP') #we wanna kill our process so we send the stop command
    process.join()
I got a table from MSSQL with 5M rows, and when I fetch all the rows of this table it takes me 2~3 minutes. I want (if possible) to optimize that.
That's my code:
cursor.execute("SELECT * FROM MyTable")
rows = cursor.fetchall() # that takes 2~3 minutes
# some code to set up the output, which takes only a few seconds
I already tried to use:
while True:
    rows = cursor.fetchmany(500000)
    if not rows:
        break
    # Do some stuff
And also with fetchone.
But again I'm between 2 and 3 minutes :/ How can I optimize that? Maybe using threads, but I don't know how.
Thanks for your help.
I think you can limit the number of rows returned by your query, even if you have to make several calls to your database.
About the Threads, you have several solutions:
A single connection but a different cursor for each Thread
One connection for each Thread and one cursor from that connection
In any case you need a ThreadedConnectionPool. Here is a small example of one of the ways to do it
import psycopg2
from psycopg2 import pool
from threading import Thread
from time import sleep

threaded_connection_pool = None
thread_table = list()

def get_new_connection():
    global threaded_connection_pool
    connection = None
    while not isinstance(connection, psycopg2.extensions.connection):
        try:
            connection = threaded_connection_pool.getconn()
        except pool.PoolError:
            sleep(10)  # Wait for a free connection
    return connection, connection.cursor()

def thread_target():
    connection, cursor = get_new_connection()
    with connection, cursor:
        # Do some stuff
        pass

threaded_connection_pool = psycopg2.pool.ThreadedConnectionPool(
    # YOUR PARAM
)

for counter_thread in range(10):
    thread = Thread(
        target=thread_target,
        name=f"Thread n°{counter_thread}"
    )
    thread_table.append(thread)
    thread.start()

#
# Do many more stuff
#

for thread in thread_table:
    thread.join()
# End
I prefer to use the first solution, "A single connection but a different cursor for each Thread".
For that, do I have to do something like this?
result = []
cursor = connection.cursor()

def fetch_cursor(cursor):
    global result
    rows = cursor.fetchall()
    if rows:
        result += beautify_output(rows)

######### THIS CODE BELOW IS INSIDE A FUNCTION ######

thread_table = []
limit = 1000000
offset = 0
sql = "SELECT * FROM myTABLE"

while True:
    try:
        cursor.execute(f"{sql} LIMIT {limit} OFFSET {offset}")
    except Exception as e:
        break
    offset += limit
    thread = Thread(target=fetch_cursor, args=(cursor,))
    thread_table.append(thread)
    thread.start()

for thread in thread_table:
    thread.join()

print(result)
So something like that should work? (I will try that tomorrow)
I have a link that I want to test for robustness, for lack of a better word. What I have is code that pings the URL multiple times, sequentially:
# Testing for robustness
for i in range(100000):
    city = 'New York'
    city = '%20'.join(city.split(' '))
    res = requests.get(f'http://example.com/twofishes?query={city}')
    data = res.json()
    geo = data['interpretations'][0]['feature']['geometry']['center']
    print('pinging xtime: %s ' % str(i))
    print(geo['lat'], geo['lng'])
I want to take this code but ping the link, say, 10 or 12 times at once. I don't mind the sequential pinging, but it's not as efficient as pinging multiple times at once. I feel like this is a quick modification, where the for loop comes out and a PULL function goes in?
Here is an example program which should work for this task. Given that I do not want to be blacklisted, I have not actually tested the code to see if it works. Regardless, it should at least be in the ballpark of what you're looking for. If you want to actually have all of the threads execute at the same time, I would look into adding events. Hope this helps.
Code
import threading
import requests
import requests.exceptions as exceptions

def stress_test(s):
    for i in range(100000):
        try:
            city = 'New York'
            city = '%20'.join(city.split(' '))
            res = s.get(f'http://example.com/twofishes?query={city}')
            data = res.json()
            geo = data['interpretations'][0]['feature']['geometry']['center']
            print('pinging xtime: %s ' % str(i))
            print(geo['lat'], geo['lng'])
        except (exceptions.ConnectionError, exceptions.HTTPError, exceptions.Timeout):
            pass

if __name__ == '__main__':
    for i in range(1, 12):
        s = requests.session()
        t = threading.Thread(target=stress_test, args=(s,))
        t.start()

    for th in threading.enumerate():
        if th != threading.current_thread():
            th.join()
I have made a script which constructs a checkout URL for Shopify websites. This is done by appending each unique product 'variant' ID to the checkout URL and then opening said URL in a web browser. To find the variant ID, I need to parse the website's sitemap to obtain the ID, which I am currently doing in separate threads for each product I am parsing; however, with each thread added, the time it takes increases by quite a lot (nearly one second).
Why is this the case? Shouldn't it take around the same time, since each thread basically does the same exact thing?
For reference, one thread takes around 2.0s, two threads 2.8s, and three threads around 3.8s.
Here is my code:
import time
import requests
from bs4 import BeautifulSoup
import webbrowser
import threading

sitemap2 = 'https://deadstock.ca/sitemap_products_1.xml'
atc_url = 'https://deadstock.ca/cart/'

# CHANGE SITEMAP TO THE CORRECT ONE (THE SITE YOU ARE SCRAPING)

variant_list = []

def add_to_cart(keywords, size):
    init = time.time()

    # Initialize session
    product_url = ''
    parse_session = requests.Session()
    response = parse_session.get(sitemap2)
    soup = BeautifulSoup(response.content, 'lxml')
    variant_id = 0

    # Find Item
    for urls in soup.find_all('url'):
        for images in urls.find_all('image:image'):
            if all(i in images.find('image:title').text.lower() for i in keywords):
                now = time.time()
                product_name = images.find('image:title').text
                print('FOUND: ' + product_name + ' - ' + str(format(now-init, '.3g')) + 's')
                product_url = urls.find("loc").text

    if product_url != '':
        response1 = parse_session.get(product_url+".xml")
        soup = BeautifulSoup(response1.content, 'lxml')

        for variants in soup.find_all('variant'):
            if size in variants.find('title').text.lower():
                variant_id = variants.find('id', type='integer').text
                atc_link = str(variant_id)+':1'
                print(atc_link)
                variant_list.append(atc_link)

    try:
        print("PARSED PRODUCT: " + product_name)
    except UnboundLocalError:
        print("Retrying")
        add_to_cart(keywords, size)

def open_checkout():
    url = 'https://deadstock.ca/cart/'
    for var in variant_list:
        url = url + var + ','
    webbrowser.open_new_tab(url)

# When initializing a new thread, only change the keywords in the args, and make sure you start and join the thread.
# Change sitemap in scraper.py to your websites' sitemap
# If the script finds multiple items, the first item will be opened so please try to be very specific yet accurate.

def main():
    print("Starting Script")
    init = time.time()
    try:
        t1 = threading.Thread(target=add_to_cart, args=(['alltimers','relations','t-shirt','white'],'s',))
        t2 = threading.Thread(target=add_to_cart, args=(['alltimers', 'relations', 'maroon'],'s',))
        t3 = threading.Thread(target=add_to_cart, args=(['brain', 'dead','melter'], 's',))

        t1.start()
        t2.start()
        t3.start()

        t1.join()
        t2.join()
        t3.join()

        print(variant_list)
        open_checkout()
    except:
        print("Product not found / not yet live. Retrying..")
        main()

    print("Time taken: " + str(time.time()-init))

if __name__ == '__main__':
    main()
Question: ... one thread takes around 2.0s, two threads 2.8s and three threads around 3.8s
Regarding your example code, you are measuring the total time of all threads together, not the time each thread takes on its own.
As #asettouf pointed out, there is an overhead, meaning you have to pay for it.
But I assume that doing these 3 tasks threaded will still be faster than doing them one after the other.
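A minimal sketch of what I mean (with a hypothetical task() standing in for add_to_cart) times each thread on its own as well as the overall wall-clock time:

import time
import threading

def task(name):
    start = time.time()
    time.sleep(2.0)  # stands in for the ~2 s of network and parsing work per product
    print(name + " finished in " + format(time.time() - start, '.2f') + "s")

overall = time.time()
threads = [threading.Thread(target=task, args=("thread-" + str(i),)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("total wall-clock time: " + format(time.time() - overall, '.2f') + "s")

With purely I/O-bound work each thread still reports roughly 2 s while the total stays well below the ~6 s a sequential run would take; the extra time per added thread that you measured is that overhead (thread start-up plus the threads contending for the interpreter while parsing).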
I'm finding the boto DynamoDB documentation almost completely lacking in examples.
In Python, I simply want to output the contents of a table with a limit on the number of records, say the latest 500 from a certain date.
Here is what I have...
import boto.dynamodb
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb.connect_to_region(
    'us-east-1',
    aws_access_key_id='somekey',
    aws_secret_access_key='somesecretkey')
#----------------------------------------------------#

def info():
    print('#########################_TABLE_NAMES_#########################')
    #get and print list of tables
    tablenames = connection.list_tables()
    for table in tablenames:
        print('DynamoDB table: %s' % table)
        #print(connection.describe_table(table))
    print('###############################################################' + '\n')

def main():
    print('###########################_RESULTS_###########################')
    scan = myTable.scan(scan_filter=None, attributes_to_get=['SomeField'])
    results = []
    for x in scan:
        results.append(x['SomeField'])
    print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error) # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        info()
        main()
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
The table I have hasn't got any custom indexes so I'd be looking for something pretty basic as an example.
I'm sorry I don't have a better attempt, but I've researched and not found a lot to go on.
I've modified your script to print out the first 500 scan results for each table. Don't forget to correct the field name (I put someField):
import boto.dynamodb2
from boto.dynamodb2.table import Table
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb2.connect_to_region(
    'us-east-1')
#----------------------------------------------------#

def getTableNames():
    '''get list of tables'''
    tablenames = connection.list_tables()["TableNames"]
    return tablenames

def main(tablenames=[]):
    print('###########################_RESULTS_###########################')
    for table in tablenames:
        print "Table Name: " + table
        myTable = Table(table)
        scan = myTable.scan()
        results = []
        for item in scan:
            if len(results) >= 500:
                break
            results.append(item.get('someField'))
        for result in results:
            print result
        print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error) # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        tablenames = getTableNames()
        main(tablenames)
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
Please note that DynamoDB doesn't return scan results in any particular order. If you want them ordered by the latest changes, you can use a solution based on DynamoDB Streams (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html) or add a secondary index (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html).
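For reference, if you do add such a global secondary index (here a hypothetical index named date-gsi with a constant partition key pk and a createdAt sort key), the newer boto3 library can fetch the latest 500 items after a given date directly; this is only a sketch under those assumptions, not something your current table supports as-is:

import boto3
from boto3.dynamodb.conditions import Key

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('myTable')  # hypothetical table name

# Assumes a GSI 'date-gsi' whose partition key 'pk' holds a constant value
# and whose sort key 'createdAt' is an ISO-8601 date string.
response = table.query(
    IndexName='date-gsi',
    KeyConditionExpression=Key('pk').eq('ALL') & Key('createdAt').gte('2015-01-01'),
    ScanIndexForward=False,  # newest first
    Limit=500,
)
items = response['Items']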