Python - run part of code multiple times at once

I have this code:
import csv
import requests

configurationsFile = "file.csv"
configurations = []

def loadConfigurations():
    with open(configurationsFile) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            url = row[0]
            line_count += 1
            configurations.append({"url": url})
    print(f'{line_count} urls loaded.')

loadConfigurations()

failedConfigs = []
session_requests = requests.session()

for config in configurations:
    try:
        "Do something with the url loaded from file.csv"
    except Exception as e:
        print(e)
        failedConfigs.append(config)

if len(failedConfigs) > 0:
    print("These errored out:")
    for theConfig in failedConfigs:
        print("ERROR: {}".format(theConfig['url']))
It reads urls from a csv file, then runs some code for each url listed there.
The only "problem" is that if the csv file contains a lot of urls, it takes a long time to run through them all, so I'm looking for a way to process more than one url at a time.
I'm not that good with Python, so I don't even know if this is possible.
But the question is: is there some way to tell the code to run, say, 5 urls at once instead of just 1?

You can use the threading.Thread class. Here is an example:
from threading import Thread

def read(file, start, end):
    with open(file, 'r') as r:
        for i, v in enumerate(r):
            if start <= i < end:
                print(v)

file = "file.txt"
t1 = Thread(target=read, args=(file, 0, 100))
t2 = Thread(target=read, args=(file, 100, 200))
t3 = Thread(target=read, args=(file, 200, 300))
t4 = Thread(target=read, args=(file, 300, 400))
t5 = Thread(target=read, args=(file, 400, 500))
t1.start()
t2.start()
t3.start()
t4.start()
t5.start()
t1.join()
t2.join()
t3.join()
t4.join()
t5.join()
Or use a loop:
from threading import Thread

def read(file, start, end):
    with open(file, 'r') as r:
        for i, v in enumerate(r):
            if start <= i < end:
                print(v)

file = "file.txt"
threads = []
for i in range(5):
    threads.append(Thread(target=read, args=(file, i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()
Basically, the read() function defined above reads the file from line start to line end. The work is split into 5 segments of 100 lines each, so that 5 threads can read the file simultaneously.
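If the file doesn't have exactly 500 lines, the boundaries can be computed instead of hardcoded. A minimal sketch, where n_lines and n_threads are illustrative assumptions:

# compute (start, end) line ranges for any file length
n_lines = 537    # assumed total; in practice count the file's lines first
n_threads = 5
chunk = -(-n_lines // n_threads)  # ceiling division so no lines are dropped
ranges = [(i * chunk, min((i + 1) * chunk, n_lines)) for i in range(n_threads)]
print(ranges)  # [(0, 108), (108, 216), (216, 324), (324, 432), (432, 537)]

Each (start, end) pair can then be passed to read() in place of the fixed multiples of 100.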
UPDATE UPON REQUEST
For your code, the loop

for config in configurations:
    try:
        "Do something with the url loaded from file.csv"
    except Exception as e:
        print(e)
        failedConfigs.append(config)

can be converted into a function which lets you specify from which index to which index of the configurations you want to process:
def process(start, end):
    for i in range(start, end):
        config = configurations[i]
        try:
            "Do something with the url loaded from file.csv"
        except Exception as e:
            print(e)
            failedConfigs.append(config)
You can then run it across five threads:
threads = []
for i in range(5):
    # note: these hardcoded ranges assume exactly 500 configurations;
    # adjust them to the length of your list
    threads.append(Thread(target=process, args=(i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()
So you might end up with something like:
import csv
import requests
from threading import Thread

configurationsFile = "file.csv"
configurations = []

def loadConfigurations():
    with open(configurationsFile) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            url = row[0]
            line_count += 1
            configurations.append({"url": url})
    print(f'{line_count} urls loaded.')

loadConfigurations()

failedConfigs = []
session_requests = requests.session()

def process(start, end):
    for i in range(start, end):
        config = configurations[i]
        try:
            "Do something with the url loaded from file.csv"
        except Exception as e:
            print(e)
            failedConfigs.append(config)

threads = []
for i in range(5):
    threads.append(Thread(target=process, args=(i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()

if len(failedConfigs) > 0:
    print("These errored out:")
    for theConfig in failedConfigs:
        print("ERROR: {}".format(theConfig['url']))

Related

Python pass arguments from a .csv file to threads

the csv:
email,password
test0#test.com,SuperSecretpassword123!
test1#test.com,SuperSecretpassword123!
test2#test.com,SuperSecretpassword123!
test3#test.com,SuperSecretpassword123!
the example printing function
def start_printer(row):
    email = row["email"]
    password = row["password"]
    print(f"{email}:{password}")
the threading starting example
number_of_threads = 10
for _ in range(number_of_threads):
    t = Thread(target=start_printer, args=(row,))
    time.sleep(0.1)
    t.start()
    threads.append(t)
for t in threads:
    t.join()
how do I pass the values from the csv to the threading example?
I guess you could go about it this way:
from threading import Thread
from csv import DictReader

def returnPartionedList(inputlist: list, x: int = 100) -> list:
    # returns inputlist split into chunks of size x, default is 100
    return [inputlist[i:i + x] for i in range(0, len(inputlist), x)]

def start_printer(row) -> None:
    email: str = row["email"]
    password: str = row["password"]
    print(f"{email}:{password}")

def main() -> None:
    path: str = r"tasks.csv"
    list_tasks: list = []
    with open(path) as csv_file:
        csv_reader: DictReader = DictReader(csv_file, delimiter=',')
        for row in csv_reader:
            list_tasks.append(row)
    list_tasks_partitions: list = returnPartionedList(list_tasks, 10)  # Run 10 threads per partition
    for partition in list_tasks_partitions:
        threads: list = [Thread(target=start_printer, args=(row,)) for row in partition]
        # start the whole partition first, then join, so the 10 threads
        # actually run concurrently instead of one after another
        for t in threads:
            t.start()
        for t in threads:
            t.join()

if __name__ == "__main__":
    main()
Result:
test0#test.com:SuperSecretpassword123!
test1#test.com:SuperSecretpassword123!
test2#test.com:SuperSecretpassword123!
test3#test.com:SuperSecretpassword123!
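An equivalent sketch with a thread pool, assuming the same tasks.csv and reusing start_printer from above; max_workers=10 caps concurrency at 10 rows at a time without partitioning the list by hand:

from concurrent.futures import ThreadPoolExecutor
from csv import DictReader

with open("tasks.csv") as csv_file:
    rows = list(DictReader(csv_file))

with ThreadPoolExecutor(max_workers=10) as pool:
    pool.map(start_printer, rows)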

Queue.put inside a worker thread failing

Inside a worker thread I am generating a data frame. Trying to put this data frame into the queue passed to the worker thread is failing. In fact, trying to put any value into the queue is failing.
The part of the code that is failing inside the worker thread task1() is given below:
df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
qmdlvalues.put(df)
mdltiming = time.time() - start
qmdlparams.put(paramval)
qtiming.put(mdltiming)
Complete code
import threading
import queue
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import os
import time

def write_tsne_op(opdata, fname, header):
    with open(fname, 'w') as outfile:
        outfile.write(header)
        for data_slice in opdata:
            np.savetxt(outfile, data_slice, delimiter=",")

def task1(qmdlvalues, qmdlparams, qtiming, paramval):
    start = time.time()
    #tmpmdl1 = TSNE(perplexity=100,early_exaggeration=1, n_components=2,random_state=0,verbose=1)
    #qmdlvalues.put(tmpmdl1.fit_transform(dense_mx))
    df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
    qmdlvalues.put(df)
    mdltiming = time.time() - start
    qmdlparams.put(paramval)
    qtiming.put(mdltiming)
    print(df)
    print(str(mdltiming))
    print(paramval)

def task2(qmdlvalues, qmdlparams, qtiming, paramval):
    start = time.time()
    #tmpmdl2 = TSNE(perplexity=100,early_exaggeration=10, n_components=2,random_state=0,verbose=1)
    #qmdlvalues.put(tmpmdl2.fit_transform(dense_mx2))
    qmdlvalues.put(pd.DataFrame([[1,2,3,4],[3,4,5,6]]))
    qmdlparams.put(paramval)
    mdltiming = time.time() - start
    qtiming.put(mdltiming)

if __name__ == "__main__":
    dense_mx2 = dense_mx  # dense_mx is assumed to be defined elsewhere
    dense_mx3 = dense_mx
    qmdlvl = queue.Queue()
    qmdlch = queue.Queue()
    qtme = queue.Queue()
    mdlvalues = pd.DataFrame()
    t1 = threading.Thread(target=task1, args=(qmdlvl,qmdlch,qtme,"#perplex: 100 early exag: 1 timing:$_plex100_exag1.csv"), name='t1')
    t2 = threading.Thread(target=task2, args=(qmdlvl,qmdlch,qtme,"#perplex: 100 early exag: 10 timing:$_plex100_exag10.cv"), name='t2')
    # starting threads
    t1.start()
    t2.start()
    while True:
        if qmdlvl.empty():
            print("Queue closed. Exiting thread.")
            break
        try:
            item = qmdlvl.get(timeout=.5)
        except:
            continue
        print("Got item:", item)
    # wait until all threads finish
    t1.join()
    t2.join()
Below is the actual output I am getting from this loop in main:
while True:
    if qmdlvl.empty():
        print("Queue closed. Exiting thread.")
        break
    try:
        item = qmdlvl.get(timeout=.5)
    except:
        continue
    print("Got item:", item)
ID of process running main program: 6456
Main thread name: MainThread
Queue closed. Exiting thread.
I want to be able to put the data frame into a queue inside the worker thread and access the same data frame in the main thread.
There were parameter mismatches in my earlier code; those have been corrected, and a full working code is presented below. The original main loop also checked qmdlvl.empty() immediately after starting the threads, before the workers had put anything on the queue, so it exited straight away; the working version joins the threads first and only then drains the queues.
I stored the output of t-SNE directly into the queue and retrieved the same in the main thread. The next progression would be to convert this to a thread pool and sub-classing.
import threading
import queue
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import os
import time

def write_tsne_op(opdata, fname, header):
    with open(fname, 'w') as outfile:
        outfile.write(header)
        for data_slice in opdata:
            np.savetxt(outfile, data_slice, delimiter=",")

def task1(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl1 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl1.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start  # was misspelled "mdltimig", which left mdltiming at 0
    print(str(mdltiming) + "time")
    qmdltime.put(mdltiming)

def task2(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl2 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl2.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

def task3(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl3 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl3.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

def task4(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl4 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl4.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

if __name__ == "__main__":
    # print ID of current process
    print("ID of process running main program: {}".format(os.getpid()))
    # print name of main thread
    print("Main thread name: {}".format(threading.main_thread().name))
    dense_mx2 = dense_mx  # dense_mx is assumed to be defined elsewhere
    dense_mx3 = dense_mx
    dense_mx4 = dense_mx
    qmdlvl = queue.Queue()
    qmdlch = queue.Queue()
    qmdltme = queue.Queue()
    qmdlhdrfname = queue.Queue()
    perplex = 200
    # creating threads
    exag = 10
    t1 = threading.Thread(target=task1, args=(dense_mx,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 10 timing:$_plex200_exag10.csv"), name='t1')
    exag = 30
    t2 = threading.Thread(target=task2, args=(dense_mx2,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 30 timing:$_plex200_exag30.cv"), name='t2')
    exag = 50
    t3 = threading.Thread(target=task3, args=(dense_mx3,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 50 timing:$_plex200_exag50.csv"), name='t3')
    exag = 100
    t4 = threading.Thread(target=task4, args=(dense_mx4,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 100 timing:$_plex200_exag100.cv"), name='t4')
    # starting threads
    t1.start()
    t2.start()
    t3.start()
    t4.start()
    # wait until all threads finish
    t1.join()
    t2.join()
    t3.join()
    t4.join()
    while True:
        if qmdlvl.empty():
            print("Queue closed. Exiting thread.")
            break
        try:
            item1 = qmdlvl.get(timeout=.5)
            item2 = qmdlch.get(timeout=.5)
            item3 = qmdltme.get(timeout=.5)
            header, fname = qmdlhdrfname.get(timeout=.5).split('$')
        except:
            continue
        write_tsne_op(item1, fname, header)
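The essential pattern, reduced to a minimal self-contained sketch: join the workers first, then drain the queue. Polling qmdlvl.empty() right after start(), as in the original, races with workers that haven't put anything yet.

import queue
import threading

def worker(q, n):
    q.put(n * n)  # stand-in for the DataFrame the real task produces

q = queue.Queue()
threads = [threading.Thread(target=worker, args=(q, n)) for n in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()          # every result is on the queue once join returns
while not q.empty():  # safe now: no producer is still running
    print(q.get())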

python for loop in parallel

I am trying to read data from an input file and, for each line, perform a task in a while loop. The problem is that when I create the first process, its loop executes and control never returns to the for loop above it. Bottom line: there is no parallelism. What am I doing wrong?
Here is the relevant code:
import time
import multiprocessing
from multiprocessing import Process

# GenerateRandomID() and generate() are assumed to be defined elsewhere

def work_line(list1Line, jobId):
    while True:
        print list1Line
        tenant = list1Line[0]
        module = list1Line[1]
        endTime = int(time.time())
        startTime = endTime - startTimeDelta
        generate(jobId, startTime, endTime, tenantServiceAddress, tenant, module)
        print ("tenant {} will sleep for {} seconds").format(tenant, sleepBetweenLoops)
        time.sleep(sleepBetweenLoops)

def openFiles():
    file = open(CLOUD_INPUT_FILE, 'r')
    lines = file.readlines()
    file.close()
    linesLen = len(lines)
    processes = []
    for linesIndex in range(0, linesLen):
        jobId = GenerateRandomID()
        line = lines[linesIndex]
        list1Line = line.split()
        p = Process(target=work_line(list1Line, jobId))
        p.start()
        processes.append(p)
    print processes
    for p in processes:
        p.join()

if __name__ == '__main__':
    CLOUD_INPUT_FILE = r'C:\CF\input_file.txt'
    tenantServiceAddress = 'address.address'
    startTimeDelta = 300
    sleepBetweenLoops = 1800
    print multiprocessing.cpu_count()
    openFiles()
You are actually calling the function rather than passing it to Process. Change it to:
p = Process(target=work_line, args=(list1Line,jobId))
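To illustrate the difference with a minimal, hypothetical f (a sketch, not the asker's code): target=f(1) runs f in the parent immediately and hands Process its return value, while target=f with args=(1,) hands Process the callable itself.

from multiprocessing import Process

def f(x):
    print(x)

if __name__ == '__main__':
    # wrong: f(1) executes here, in the parent, and Process receives
    # its return value (None) as the target
    # p = Process(target=f(1))
    # right: pass the callable and its arguments separately, so the
    # child process is the one that calls f
    p = Process(target=f, args=(1,))
    p.start()
    p.join()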

Do locks block the access to an entire array/list/dict in python?

I have 4 threads, each calling a common function with different arguments.
But all 4 of these threads make changes to the same list.
They may or may not be accessing the same index/key in the list.
Do I need to implement a lock, and how do I proceed?
Here is my code:
import itertools
import json
import urllib2
from threading import Thread

winr = [{}, {}, {}, {}, {}]

def openfile():
    for i in range(0, 5):
        filename = 'C:\\Users\\Rishabh\\Desktop\\' + str(i+1) + '.txt'
        with open(filename) as f:
            lines = f.readlines()
            for x in lines:
                xt = x.split(" ")
                winr[i][str(xt[0])] = [str(xt[1]), str(xt[2])]

def getmatch(start, thread_id, last_match):
    match_id = start*4 + thread_id
    while match_id < last_match:
        while True:
            try:
                response = urllib2.urlopen('https://api.steampowered.com/IDOTA2Match_570/GetMatchDetails/V001/?match_id='+str(match_id)+'&key=C4B5D001E352AB612DCECABBFAB949B1')
            except urllib2.HTTPError, err:
                continue
            break
        r = []
        d = []
        team = {}
        html = response.read()
        dic = json.loads(html)
        if 'result' in dic and 'radiant_win' in dic['result'] and dic['result']['human_players'] == 10:
            print match_id
            for a in dic['result']['players']:
                if a['player_slot'] > 127:
                    d.append(str(a['hero_id']))
                else:
                    r.append(str(a['hero_id']))
            if dic['result']['radiant_win'] == 'true':
                team[0] = r
                team[1] = d
            else:
                team[0] = d
                team[1] = r
            updatearrays(team)
        else:
            print "error", match_id
        match_id = match_id + 4
        # do stuff

def updatearrays(team):
    team[0].sort()
    team[1].sort()
    winner = []
    loser = []
    for i in range(1, 6):
        w = []
        r = []
        winner = list(itertools.combinations(team[0], i))
        loser = list(itertools.combinations(team[1], i))
        for d in winner:
            w.append('.'.join(str(a) for a in d))
        for d in loser:
            r.append('.'.join(str(a) for a in d))
        for k in w:
            if k in winr[i-1]:
                winr[i-1][k][0] = int(winr[i-1][k][0]) + 1
                winr[i-1][k][1] = int(winr[i-1][k][1]) + 1
            else:
                winr[i-1][k] = [1, 1]
        for k in r:
            if k in winr[i-1]:
                winr[i-1][k][0] = int(winr[i-1][k][0]) + 1
            else:
                winr[i-1][k] = [1, 0]
        winner = []
        loser = []

def updatefile():
    # print winr
    for i in range(0, 5):
        filename = 'C:\\Users\\Rishabh\\Desktop\\' + str(i+1) + '.txt'
        with open(filename, "w") as f:
            for j in winr[i]:
                tstring = str(j) + " " + str(winr[i][j][0]) + " " + str(winr[i][j][1]) + " \n"
                f.write(tstring)

openfile()
t1 = Thread(target=getmatch, args=(0, 1, 1000))
t2 = Thread(target=getmatch, args=(0, 2, 1000))
t3 = Thread(target=getmatch, args=(0, 3, 1000))
t4 = Thread(target=getmatch, args=(0, 4, 1000))
t1.start()
t2.start()
t3.start()
t4.start()
t1.join()
t2.join()
t3.join()
t4.join()
updatefile()
The common function I was referring to is updatearrays().
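On the lock question itself: a threading.Lock doesn't freeze the whole list or dict; it only serializes the code that runs while it is held. The read-modify-write updates in updatearrays() (check membership, then increment) are not atomic, so guarding them is the safe choice. A minimal sketch of what that could look like (safe_update is a hypothetical helper, not part of the original code):

import threading

winr_lock = threading.Lock()

def safe_update(key, i, won):
    # only one thread at a time may run this block, so the
    # check-then-increment on the shared winr cannot interleave
    with winr_lock:
        if key in winr[i-1]:
            winr[i-1][key][0] = int(winr[i-1][key][0]) + 1
            if won:
                winr[i-1][key][1] = int(winr[i-1][key][1]) + 1
        else:
            winr[i-1][key] = [1, 1 if won else 0]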

Only 1 Thread started in for loop

So I'm trying to code a really simple Internet Download Manager spoof with Python 2.7.
It is supposed to query a file's HTTP header, get the byte range, spread the download among a number of threads (I hard-coded 2 for simplicity) according to the byte range, and later join the file parts together again.
The problem is that my console log tells me only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []

# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)

# define file
file_name = "test.mp4"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    end = stream_size
    return end

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start': 0, 'end': (int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start': int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
    thread_id = id
    #percentage = (current_size * 100 / total_size)
    status = str(thread_id) + "_" + str(current_size) + "_" + str(total_size)
    print(status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==1):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        #if second loop, start thread 1
        if(i==2):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()

#start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
parts = {}

# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)

# define file
file_name = "test.mp3"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start': 0, 'end': (int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start': int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range (note: this ranged request is never opened;
    # both threads actually read sequentially from the shared stream u below)
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
    thread_id = id
    status = "Thread ID_" + str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" + str(total_size)
    print(status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        #if second loop, start thread 2
        if(i==1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

#start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
You have:
for i in range(2):
    if(i==1):
        ...
    if(i==2):
        ...
But range(2) iterates over [0,1] not [1,2].
Save some trouble and just remove those 3 lines (the for and the two if checks); the code that starts the two threads can simply run serially, one block after the other.
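For completeness, a sketch of a genuinely parallel ranged download in Python 3, assuming the server honors Range requests: each thread fetches its own byte range into the parts dict (the idea behind the commented-out reassembly code above), and the parts are written out in order at the end.

import threading
import urllib.request

url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"

with urllib.request.urlopen(url) as resp:
    size = int(resp.headers["Content-Length"])

parts = {}

def fetch_range(idx, first, last):
    # last is inclusive, per the HTTP Range header convention
    req = urllib.request.Request(url, headers={"Range": "bytes=%d-%d" % (first, last)})
    with urllib.request.urlopen(req) as resp:
        parts[idx] = resp.read()  # distinct keys per thread, so no lock needed

mid = size // 2
threads = [
    threading.Thread(target=fetch_range, args=(0, 0, mid - 1)),
    threading.Thread(target=fetch_range, args=(1, mid, size - 1)),
]
for t in threads:
    t.start()
for t in threads:
    t.join()

with open("test.mp4", "wb") as f:
    for idx in sorted(parts):
        f.write(parts[idx])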
