If I run the script step by step works perfectly, but when I'm using threading misses 50-60%. I'm using Python + mechanize module
#setting up the browser
mySite = 'http://example.com/managament.php?'
postData = {'UserID' : '', 'Action':'Delete'}
job_tab1_user1 = [1,2,3]
job_tab2_user1 = [4,5,6]
job_tab1_user2 = [7,8,9]
job_tab2_user2 = [10,12,13]
.... till user1000
#i want to point out that the lists are 100% different
def user1_jobs:
for i in job_tab1_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
def user2_jobs:
for i in job_tab1_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
... and so on till user 1000
And I call them in the end like this:
t_user1 = threading.Thread(target=user1_jobs, args=[])
t_user1.start()
t_user2 = threading.Thread(target=user2_jobs, args=[])
t_user2.start()
I have a similar script that sends like 200 request per second and all of them are processed. I also tried using time.sleep(2), but again is missing a lot.
Another question besides what is wrong with my script is if its way to compact this code, because I'm using 1000 users and the script reaches thousands of lines. Thank you in advance.
from threading import *
submits = [[1,2,3], [3,4,5], [6,7,8]]
class worker(Thread):
def __init__(self, site, postdata, data):
Thread.__init__(self)
self.data = data
self.site = site
self.postdata = postdata
self.start()
def run(self):
for i in self.data:
browser.open("http://example.com/jobs.php?actions="+str(i))
browser.open(self.site, self.postdata)
for obj in submits:
worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, submits)
Since the OP asked for it, here's a condensed/compressed version of the code.
or:
for index in range(0,1000):
worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, [i for i in range(1,4)])
If the data you want to send actually is a sequence of 3 integers (1,2,3) that inclines in a perfect order.
Here is a full script that you can easily modify by changing the initial variables.
It creates a list dynamically and uses a generator to create the functions for each thread.
Currently it creates 1000 users, each with 2 tabs and 3 jobs.
# define your variables here
NUM_USERS = 1000
NUM_JOBS_PER_USER = 3
NUM_TABS_PER_USER = 2
URL_PART = "http://example.com/jobs.php?actions="
# populate our list of jobs
# the structure is like this: jobs[user][tab][job]
jobs = [[[0 for y in range(NUM_JOBS_PER_USER)] \
for x in range(NUM_TABS_PER_USER)] \
for x in range(NUM_USERS)]
p = 1
for i in range(NUM_USERS):
for j in range(NUM_TABS_PER_USER):
for k in range(NUM_JOBS_PER_USER):
jobs[i][j][k] = p
p += 1
# create a generator that builds our thread functions
def generateFunctions(jobs):
for user in jobs:
for tab in user:
for job in tab:
def f():
browser.open(URL_PART + str(job))
browser.open(mySite, Post_data)
yield f
# create and start threads, add them to a list
# if we need to preserve handlers for later use
threads = []
for f in generateFunctions(jobs):
thr = threading.Thread(target = f, args=[])
thr.start()
threads.append(thr)
Related
I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing is made to automatically archive every single article in a list of newsgroups that exist, but I wanna be able to utilize my newsgroup plan and make it up to 20 threads. I've never coded threading before and my attempts were in vein.
Here's my code, excluding the username and pass ( but you can get a free account with max 5 threads if you really want to at https://my.xsusenet.com )
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
resp, count, first, last, name = s.group(group)
for message_id in range(first, last):
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {message_id}")
outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
Tried threading using a ThreadPoolExecutor, to cause it to use 20 threads, and failed, caused it to repeat the same process to the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading, mind you I did like 6-8 variations of it to try and get it to work, this was the last one before I gave up to ask on here.
import nntplib
import sys
import datetime
import os
import concurrent.futures
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
def download_nntp_file(mess_id):
resp, count, first, last, name = s.group(group)
message_id = range(first, last)
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {mess_id}")
outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = executor.submit(download_nntp_file)
I can't test it with XSUseNet.
I wouldn't use global variables because when processes work at the same time then they may get the same values from these variables.
You should rather send values as parameters to functions.
Something like this:
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
for g_tuple in groups_list_tuple:
executor.submit(download_nntp_file, g_tuple)
But I would be simpler to use map() instead of submit() because it gets list with arguments and it doesn't need for-loop
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
executor.map(download_nntp_file, groups_list_tuple)
I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe which contains rows of the comments with the parent and child id, as well as the child comment. The data is loaded using the following block of code:
import os
import multiprocessing as mp
import numpy as np
import pandas as pd
sourcePATH = r'C:\'
workingFILE = r'\output-pt1.csv'
# filepaths
input_file = sourcePATH + workingFILE
data_df = pd.read_csv(input_file,header=None,names=['PostIDX','ParentIDX','Comment','Score','Controversiality'])
The aim is to scan through each row in the dataframe and using the parent id to search through the rest of the dataframe to see if their is a parent comment present. If it is I then store the child and parent comments in a tuple with some other information. This will then be added to a list which will then be written out to a csv file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr,ChildComment_curr,ChildScore_curr,ChildCont_curr):
idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
if idx.empty is False:
ParentComment = idx.iloc[0,2]
ParentScore = idx.iloc[0,3]
ParentCont = idx.iloc[0,4]
outPut.put([ParentIDX_curr[0], ParentComment,ParentScore,ParentCont,ChildIDX_curr[0], ChildComment_curr[0],ChildScore_curr[0],ChildCont_curr[0]])
if __name__ == '__main__':
print('Process started')
t_start_init = time.time()
t_start = time.time()
noCores = 1
#pool = mp.Pool(processes=noCores)
update_freq = 100
n = 1000
#n = round(len(data_df)/8)
flag_create = 0
flag_run = 0
i = 0
outPut = mp.Queue()
#parent_child_df = pd.DataFrame()
#parent_child_df.coumns = ['PostIDX','ParentIDX']
while i < n:
#print(i)
procs = []
ParentIDX = []
ParentComment = []
ParentScore = []
ParentCont = []
ChildIDX = []
ChildComment = []
ChildScore = []
ChildCont = []
for worker in range(0,noCores):
ParentIDX.append(data_df.iloc[i,1])
ChildIDX.append(data_df.iloc[i,0])
ChildComment.append(data_df.iloc[i,2])
ChildScore.append(data_df.iloc[i,3])
ChildCont.append(data_df.iloc[i,4])
i = i + 1
#when I call the function this way it returns the expected matches
#checkChildParent(ParentIDX,ChildIDX,ChildComment,
# ChildScore,ChildCont)
#when I call the function with Process function nothing appears to be happening
for proc in range(0,noCores):
p = mp.Process(target = checkChildParent, args=(ParentIDX[proc],ChildIDX[proc],ChildComment[proc],ChildScore[proc],ChildCont[proc]))
procs.append(p)
p.start()
#for p in procs:
# p.join()
if outPut.empty() is False:
print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns the tuple of the matched parent and child comment if it was found. If I call this function normally then it works fine, however when I call the function using the Process function it doesn't match anything!. I'm guessing its the form the arguments that are being passed to the function is being passed to the function that is causing the issue, but I have been trying to debug this all afternoon and have failed so far. If anyone has any suggestions then please let me know!
Thanks!
I am making a web scraper to build a database. The site I plan to use has index pages each containing 50 links. The amount of pages to be parsed is estimated to be around 60K and up, this is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
Now what I want to do with that is to have a certain number of worker function to gather data in parallel on different pages. This data would then be appended to a big dictionary located in harvester or written directly in a csv file by the worker function.
I'm wondering how I can implement parallelism. I have done a faire
amount of research on using gevent, threading and multiprocessing but
I am not sure how to implement it.
I am also not sure if appending data to a large dictionary or writing
directly in a csv using DictWriter will be stable with that many input at the same time.
Thanks
I propose you to split your work into separate workers which communicate via Queues.
Here you mostly have IO wait time (crawling, csv writing)
So you can do the following (not tested, just see the idea):
import threading
import Queue
class CsvWriter(threading.Thread):
def __init__(self, resultq):
super(CsvWriter, self).__init__()
self.resultq = resultq
self.writer = csv.DictWriter(open('results.csv', 'wb'))
def run(self):
done = False
while not done:
row = self.requltq.get()
if row != -1:
self.writer.writerow(row)
else:
done = True
class Crawler(threading.Thread):
def __init__(self, inputqueue, resultq):
super(Crawler, self).__init__()
self.iq = inputq
self.oq = resultq
def run(self):
done = False
while not done:
link = self.iq.get()
if link != -1:
result = self.extract_data(link)
self.oq.put(result)
else:
done = True
def extract_data(self, link):
# crawl and extract what you need and return a dict
pass
def main():
linkq = Queue.Queue()
for url in your_urls:
linkq.put(url)
resultq = Queue.Queue()
writer = CsvWriter(resultq)
writer.start()
crawlers = [Crawler(linkq, resultq) for _ in xrange(10)]
[c.start() for c in crawlers]
[linkq.put(-1) for _ in crawlers]
[c.join() for c in crawlers]
resultq.put(-1)
writer.join()
This code should work (fix possible typos) and make it to exit when all the urls are finished
for testing reasons I start only 1 process. One given argument is an array that shall be changed from that process.
class Engine():
Ready = Value('i', False)
def movelisttoctypemovelist(self, movelist):
ctML = []
for zug in movelist:
ctZug = ctypeZug()
ctZug.VonReihe = zug.VonReihe
ctZug.VonLinie = zug.VonLinie
ctZug.NachReihe = zug.NachReihe
ctZug.NachLinie = zug.NachLinie
ctZug.Bewertung = zug.Bewertung
ctML.append(ctZug)
return ctML
def findbestmove(self, board, settings, enginesettings):
print ("Computer using", multiprocessing.cpu_count(),"Cores.")
movelist = Array(ctypeZug, [], lock = True)
movelist = self.movelisttoctypemovelist(board.movelist)
bd = board.boardtodictionary()
process = []
for i in range(1):
p = Process(target=self.calculatenullmoves, args=(bd, movelist, i, self.Ready))
process.append(p)
p.start()
for p in process:
p.join()
self.printctypemovelist(movelist, settings)
print ("Ready:", self.Ready.value)
def calculatenullmoves(self, boarddictionary, ml, processindex, ready):
currenttime = time()
print ("Process", processindex, "begins to work...")
board = Board()
board.dictionarytoboard(boarddictionary)
...
ml[processindex].Bewertung = 2.4
ready.value = True
print ("Process", processindex, "finished work in", time()-currenttime, "sec")
def printctypemovelist(self, ml):
for zug in ml:
print (zug.VonReihe, zug.VonLinie, zug.NachReihe, zug.NachLinie, zug.Bewertung)
I try to write 2.4 directly in the list, but no changing is shown when calling "printctypemovelist".
I set "Ready" to True and it works.
I used information from http://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.sharedctypes
I hope someone can find my mistake, if it is too difficult to read, please let me know.
The problem is that you're trying to share a plain Python list:
ctML = []
Use a proxy object instead:
from multiprocessing import Manager
ctML = Manager().list()
See Python doc on Sharing state between processes for more detail.
I have some python code to read a file and push data to a list. Then put this list to queue, use threading to process the list, say 20 items a time. After processing, I save the result into a new file. What was put in the new file was actually different order than the original file. For example, I have in input,
1 a
2 b
3 c
4 a
5 d
But the output looks like:
2 aa
1 ba
4 aa
5 da
3 ca
Is there any way to preserve the original order?
Here is my code:
import threading,Queue,time,sys
class eSS(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
self.queue = queue
self.lock = threading.Lock()
def ess(self,email,code,suggested,comment,reason,dlx_score):
#do something
def run(self):
while True:
info = self.queue.get()
infolist = info.split('\t')
email = infolist[1]
code = infolist[2]
suggested = infolist[3]
comment = infolist[4]
reason = infolist[5]
dlx_score = (0 if infolist[6] == 'NULL' else int(infolist[6]))
g.write(info + '\t' + self.ess(email,code,suggested,comment,reason,dlx_score) +'\r\n')
self.queue.task_done()
if __name__ == "__main__":
queue = Queue.Queue()
filename = sys.argv[1]
#Define number of threads
threads = 20
f = open(filename,'r')
g = open(filename+'.eSS','w')
lines = f.read().splitlines()
f.close()
start = time.time()
for i in range(threads):
t = eSS(queue)
t.setDaemon(True)
t.start()
for line in lines:
queue.put(line)
queue.join()
print time.time()-start
g.close()
Three thoughts come to mind. Common to all is to include an index with the packet that is queued for processing.
One thought then is to use the controller/workers/output framework in which the output thread de-queues the worker-processed data, assembles, and outputs it.
The second thought is to employ a memory-mapped file for output, and use the index to calculate the offset to write into the file (assumes fixed-length writes probably).
The third is to use the index to put processed data in a new list, and when the list is completed write the items out at the end rather than on the fly.