Basically, I pull a series of links from my database, and want to scrape them for specific links I'm looking for. I then re-feed those links into my link queue that my multiple QWebViews reference, and they continue to pull those down for processing/storage.
My issue is that as this runs for... say 200 or 500 links, it starts to use up more and more RAM.
I have exhaustively looked into this, using heapy, memory_profiler, and objgraph to figure out what's causing the memory leak... The Python heap's objects stay about the same in both count AND size over time. This made me think the C++ objects weren't getting removed. Sure enough, using memory_profiler, the RAM only goes up when the self.load(self.url) lines of code are called. I've tried to fix this, but to no avail.
Code:
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebView, QWebSettings
from PyQt4.QtGui import QApplication
from lxml.etree import HTMLParser
# My functions
from util import dump_list2queue, parse_doc
class ThreadFlag:
def __init__(self, threads, jid, db):
self.threads = threads
self.job_id = jid
self.db_direct = db
self.xml_parser = HTMLParser()
class WebView(QWebView):
def __init__(self, thread_flag, id_no):
super(QWebView, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
self.settings().globalSettings().setAttribute(QWebSettings.AutoLoadImages, False)
# This is actually a dict with a few additional details about the url we want to pull
self.url = None
# doing one instance of this to avoid memory leaks
self.qurl = QUrl()
# id of the webview instance
self.id = id_no
# Status of the webview instance; green means it isn't working and yellow means it is.
self.status = 'GREEN'
# Reference to a single universal object all the webview instances can see.
self.thread_flag = thread_flag
def handleLoadFinished(self):
try:
self.processCurrentPage()
except Exception as e:
print e
self.status = 'GREEN'
if not self.fetchNext():
# We're finished!
self.loadFinished.disconnect()
self.stop()
else:
# We're not finished! Do next url.
self.qurl.setUrl(self.url['url'])
self.load(self.qurl)
def processCurrentPage(self):
self.frame = str(self.page().mainFrame().toHtml().toUtf8())
# This is the case for the initial web pages I want to gather links from.
if 'name' in self.url:
# Parse html string for links I'm looking for.
new_links = parse_doc(self.thread_flag.xml_parser, self.url, self.frame)
if len(new_links) == 0: return 0
fkid = self.url['pkid']
new_links = map(lambda x: (fkid, x['title'],x['url'], self.thread_flag.job_id), new_links)
# Post links to database, db de-dupes and then repull ones that made it.
self.thread_flag.db_direct.post_links(new_links)
added_links = self.thread_flag.db_direct.get_links(self.thread_flag.job_id,fkid)
# Add the pulled links to central queue all the qwebviews pull from
dump_list2queue(added_links, self._urls)
del added_links
else:
# Process one of the links I pulled from the initial set of data that was originally in the queue.
print "Processing target link!"
# Get next url from the universal queue!
def fetchNext(self):
if self._urls and self._urls.empty():
self.status = 'GREEN'
return False
else:
self.status = 'YELLOW'
self.url = self._urls.get()
return True
def start(self, urls):
# This is where the reference to the universal queue gets made.
self._urls = urls
if self.fetchNext():
self.qurl.setUrl(self.url['url'])
self.load(self.qurl)
# uq = central url queue shared between webview instances
# ta = array of webview objects
# tf - thread flag (basically just a custom universal object that all the webviews can access).
# This main "program" is started by another script elsewhere.
def main_program(uq, ta, tf):
app = QApplication([])
webviews = ta
threadflag = tf
tf.app = app
print "Beginning the multiple async web calls..."
# Create n "threads" (really just webviews) that each will make asynchronous calls.
for n in range(0,threadflag.threads):
webviews.append(WebView(threadflag, n+1))
webviews[n].start(uq)
app.exec_()
Here's what my memory tools say (they all stay roughly constant through the whole program):
RAM: resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
2491(MB)
Objgraph most common types:
methoddescriptor 9959
function 8342
weakref 6440
tuple 6418
dict 4982
wrapper_descriptor 4380
getset_descriptor 2314
list 1890
method_descriptor 1445
builtin_function_or_method 1298
Heapy:
Partition of a set of 9879 objects. Total size = 1510000 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
0 2646 27 445216 29 445216 29 str
1 563 6 262088 17 707304 47 dict (no owner)
2 2267 23 199496 13 906800 60 __builtin__.weakref
3 2381 24 179128 12 1085928 72 tuple
4 212 2 107744 7 1193672 79 dict of guppy.etc.Glue.Interface
5 50 1 52400 3 1246072 83 dict of guppy.etc.Glue.Share
6 121 1 40200 3 1286272 85 list
7 116 1 32480 2 1318752 87 dict of guppy.etc.Glue.Owner
8 240 2 30720 2 1349472 89 types.CodeType
9 42 0 24816 2 1374288 91 dict of class
Your program is indeed experiencing growth due to C++ code, but it is not an actual leak in terms of the creation of objects that are no longer referenced. What is happening, at least in part, is that your QWebView holds a QWebPage, which holds a QWebHistory. Each time you call self.load, the history gets a bit longer.
Note that QWebHistory has a clear() function.
Documentation is available: http://pyqt.sourceforge.net/Docs/PyQt4/qwebview.html#history
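For example, a minimal sketch (untested, PyQt4, using the WebView class from the question) of two places this could be applied:
# Option 1: keep history items from being stored at all (e.g. in WebView.__init__):
self.history().setMaximumItemCount(0)

# Option 2: drop whatever has accumulated after each page,
# e.g. at the top of handleLoadFinished():
self.page().history().clear()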
App startup memory
Partition of a set of 249162 objects. Total size = 28889880 bytes.
Index Count % Size % Cumulative % Referrers by Kind (class / dict of class)
0 77463 31 5917583 20 5917583 20 types.CodeType
1 30042 12 3774404 13 9691987 34 function
2 51799 21 3070789 11 12762776 44 tuple
3 15106 6 2061017 7 14823793 51 dict of type
4 5040 2 1928939 7 16752732 58 function, tuple
5 6627 3 1459448 5 18212180 63 type
6 5227 2 1346136 5 19558316 68 dict of module
7 16466 7 1026538 4 20584854 71 dict (no owner)
8 734 0 685897 2 21270751 74 dict of module, tuple
9 420 0 626760 2 21897511 76 function, module
App memory after 100 subsequent calls (interacting with SQLAlchemy)
Partition of a set of 628910 objects. Total size = 107982928 bytes.
Index Count % Size % Cumulative % Referrers by Kind (class / dict of class)
0 23373 4 27673632 26 27673632 26 sqlalchemy.sql.schema.Column
1 141175 22 20904408 19 48578040 45 dict of sqlalchemy.sql.schema.Column
2 78401 12 5984371 6 54562411 51 types.CodeType
3 34133 5 4239726 4 58802137 54 function
4 64371 10 3661978 3 62464115 58 tuple
5 20034 3 2971710 3 65435825 61 dict of sqlalchemy.sql.schema.Table
6 13356 2 2297232 2 67733057 63 sqlalchemy.sql.base.ColumnCollection
7 15924 3 2133374 2 69866431 65 dict of type
8 5095 1 1946855 2 71813286 67 function, tuple
9 8714 1 1793696 2 73606982 68 type
Helper function that detects memory usage by rcs
def heap_results():
from guppy import hpy
hp = hpy()
h = hp.heap()
return Response(response=str(h.bytype),
status=200,
mimetype='application/json')
The implementation of SQLAlchemy is fairly straightforward. Using db.Model, we create a class for the ORM and break the tables into subfunctions of the ORM class.
We call gc.collect() just before returning the final response to the user. We also use db.session.flush(), db.session.expunge_all(), and db.session.close().
We have tried removing the db.session.* calls, as well as the gc.collect(). Nothing changes.
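For illustration, a simplified sketch of where those calls sit in a handler (names are placeholders; db, Response, json, and gc come from the rest of the app):
def finish_request(payload):
    # Cleanup performed before returning each response (simplified sketch).
    db.session.flush()
    db.session.expunge_all()
    db.session.close()
    gc.collect()
    return Response(response=json.dumps(payload),
                    status=200,
                    mimetype='application/json')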
Here is a time-series graph of our app's memory usage; the application being restarted is where you see the memory reset back to a stable state.
Code to simulate an HAProxy (determine the current master node) and reconnect:
def reconnect():
hostnames = [Settings.SECRETS.get('PATRONI_HOST_C', ''), Settings.SECRETS.get('PATRONI_HOST_E', '')]
try:
master_node = HAProxy(hostnames=hostnames)
except (ValueError, TypeError, BaseException) as e:
# send an alert here though, use the informant!
raise e
else:
if master_node in ['None', None]:
raise ValueError("Failed to determined which server is acting as the master node")
my_app.config['SQLALCHEMY_DATABASE_URI'] = "postgresql://{}:{}@{}/{}".format(Settings.SECRETS['PATRONI_USER'],
Settings.SECRETS['PATRONI_PASSWORD'],
master_node,
Settings.SECRETS['PATRONI_DB'])
my_app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {
'pool_recycle': 1800
}
new_db = SQLAlchemy(my_app)
new_db_orm = DBORM(new_db)
return new_db_orm
What the DBORM (modified to hide full functionality) looks like:
class DBORM(object):
def __init__(self, database):
self.database = database
self.owner_model = self.create_owner_model()
def create_owner_model(self):
db = self.database
class OwnerModel(db.Model):
__tablename__ = "owners"
owner_id = db.Column(UUID(as_uuid=True), unique=True,
nullable=False, primary_key=True)
client_owner = db.Column(db.String(255), unique=False, nullable=False)
admin_owner = db.Column(db.String(255), unique=False, nullable=False)
@staticmethod
def owner_validation(owner_model, owner=None):
if owner is not None:
owner_model = OwnerModel.get_owner_by_id(owner_id=owner_id,
return_as_model=True)
if owner_model is not None:
client_owner = owner_model.client_owner
admin_owner = owner_model.admin_owner
if client_owner is None and admin_owner is None:
return False
elif client_owner.lower() == owner.lower():
return True
elif admin_owner.lower() == owner.lower():
return True
else:
return False
else:
return None
else:
return None
Example of Using OwnerModel from API
@api.route('/owners/{owner_id}')
def my_function(owner_id):
try:
dborm = reconnect()
except (AttributeError, KeyError, ValueError, BaseException) as e:
logger.error(f'Unable to get an owner model.')
logger.info(f'Garbage collector, collected: {gc.collect()}')
return Response(response=Exception.database_reconnect_failure(),
status=503,
mimetype='application/json')
else:
response = dborm.get_owner_by_id(owner_id=owner_id)
logger.info(f'Garbage collector, collected: {gc.collect()}')
return Response(response=json.dumps(response),
status=200,
mimetype='application/json')
SQLAlchemy MetaData holds references to Table objects, and the declarative base class also has an internal registry for lookups, for example for use as context for lazily evaluated relationship() arguments. When you repeatedly create new versions of model classes, which also creates the required metadata such as Table objects, you likely consume more and more memory if the references are kept around. The fact that Column objects dominate your memory usage supports this view.
You should aim to create your models and their metadata just once during your application's life cycle. You only need to be able to change the connection parameters dynamically. SQLAlchemy versions up to 1.3 provide the creator argument of Engine for exactly this, and version 1.4 introduced the DialectEvents.do_connect() event hook for even finer control.
Using creator:
import psycopg2
db = SQLAlchemy()
dborm = DBORM(db)
def initialize(app):
"""
Setup `db` configuration and initialize the application. Call this once and
once only, before your application starts using the database.
The `creator` creates a new connection every time the connection pool
requires one, due to all connections being in use, or old ones having been
recycled, etc.
"""
# Placeholder that lets the Engine know which dialect it will be speaking
app.config['SQLALCHEMY_DATABASE_URI'] = "postgresql+psycopg2://"
app.config['SQLALCHEMY_ENGINE_OPTIONS'] = {
'pool_recycle': 1800,
'creator': lambda: psycopg2.connect(
dbname=Settings.SECRETS['PATRONI_DB'],
user=Settings.SECRETS['PATRONI_USER'],
password=Settings.SECRETS['PATRONI_PASSWORD'],
host=HAProxy([
Settings.SECRETS.get('PATRONI_HOST_C', ''),
Settings.SECRETS.get('PATRONI_HOST_E', ''),
]))
}
db.init_app(app)
class OwnerModel(db.Model):
__tablename__ = "owners"
...
Note that you need to change DBORM to use the global db and model classes, and that your controllers no longer call reconnect(), which no longer exists, but instead use db, dborm, and the classes directly.
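For reference, a rough sketch of the DialectEvents.do_connect() hook mentioned above (SQLAlchemy 1.4+, untested; it reuses the Settings and HAProxy helpers from the question, and with Flask-SQLAlchemy the engine is only available inside an application context):
from sqlalchemy import event

with app.app_context():
    @event.listens_for(db.engine, "do_connect")
    def resolve_master_node(dialect, conn_rec, cargs, cparams):
        # Re-resolve the current master every time the pool opens a new
        # DBAPI connection; mutating cparams changes the psycopg2 connect() call.
        cparams["host"] = HAProxy(hostnames=[
            Settings.SECRETS.get('PATRONI_HOST_C', ''),
            Settings.SECRETS.get('PATRONI_HOST_E', ''),
        ])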
I'm trying to do some scraping, but I get blocked every 4 requests. I have tried changing proxies, but the error is the same. What should I do to change them properly?
Here is some code where I try it. First I get proxies from a free website. Then I make the request with the new proxy, but it doesn't work because I get blocked.
from fake_useragent import UserAgent
import requests
def get_player(id,proxy):
ua=UserAgent()
headers = {'User-Agent':ua.random}
url='https://www.transfermarkt.es/jadon-sancho/profil/spieler/'+str(id)
try:
print(proxy)
r=requests.get(url,headers=headers,proxies=proxy)
except:
....
code to manage the data
....
Getting proxies
from bs4 import BeautifulSoup  # needed for the parsing below
def get_proxies():
ua=UserAgent()
headers = {'User-Agent':ua.random}
url='https://free-proxy-list.net/'
r=requests.get(url,headers=headers)
page = BeautifulSoup(r.text, 'html.parser')
proxies=[]
for proxy in page.find_all('tr'):
i=ip=port=0
for data in proxy.find_all('td'):
if i==0:
ip=data.get_text()
if i==1:
port=data.get_text()
i+=1
if ip!=0 and port!=0:
proxies+=[{'http':'http://'+ip+':'+port}]
return proxies
Calling functions
proxies=get_proxies()
for i in range(1,100):
player=get_player(i,proxies[i//4])
....
code to manage the data
....
I know that the proxy scraping works because when I print them I see something like:
{'http': 'http://88.12.48.61:42365'}
I would like to avoid getting blocked.
I recently had this same issue, but using online proxy servers as recommended in other answers is always risky (from a privacy standpoint), slow, or unreliable.
Instead, you can use the requests-ip-rotator python library to proxy traffic through AWS API Gateway, which gives you a new IP each time:
pip install requests-ip-rotator
This can be used as follows (for your site specifically):
import requests
from requests_ip_rotator import ApiGateway, EXTRA_REGIONS
gateway = ApiGateway("https://www.transfermarkt.es")
gateway.start()
session = requests.Session()
session.mount("https://www.transfermarkt.es", gateway)
response = session.get("https://www.transfermarkt.es/jadon-sancho/profil/spieler/your_id")
print(response.status_code)
# Only run this line if you are no longer going to run the script, as it takes longer to boot up again next time.
gateway.shutdown()
Combined with multithreading/multiprocessing, you'll be able to scrape the site in no time.
The AWS free tier provides you with 1 million requests per region, so this option will be free for all reasonable scraping.
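For example, a minimal sketch of that combination (untested; fetch() and the worker count are placeholder names, and it assumes the session and gateway set up above):
from concurrent.futures import ThreadPoolExecutor

def fetch(player_id):
    # Reuse the session mounted on the API Gateway above.
    return session.get(
        "https://www.transfermarkt.es/jadon-sancho/profil/spieler/" + str(player_id)
    ).status_code

with ThreadPoolExecutor(max_workers=8) as pool:
    print(list(pool.map(fetch, range(1, 101))))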
import requests
from itertools import cycle
list_proxy = ['socks5://Username:Password@IP1:20000',
'socks5://Username:Password@IP2:20000',
'socks5://Username:Password@IP3:20000',
'socks5://Username:Password@IP4:20000',
]
proxy_cycle = cycle(list_proxy)
# Prime the pump
proxy = next(proxy_cycle)
for i in range(1, 10):
proxy = next(proxy_cycle)
print(proxy)
proxies = {
"http": proxy,
"https":proxy
}
r = requests.get(url='https://ident.me/', proxies=proxies)
print(r.text)
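A possible follow-up (untested sketch): on top of the cycle, retry with the next proxy when one fails, so a single dead proxy doesn't break the loop:
def get_with_rotation(url, proxy_cycle, max_tries=5):
    last_error = None
    for _ in range(max_tries):
        proxy = next(proxy_cycle)
        try:
            return requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=10)
        except requests.RequestException as e:
            last_error = e  # try the next proxy
    raise last_error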
The problem with using free proxies from sites like this is that:
- websites know about these and may block you just because you're using one of them
- you don't know that other people haven't already gotten them blacklisted by doing bad things with them
- the site is likely using some other identifier to track you across proxies based on other characteristics (device fingerprinting, proxy-piercing, etc.)
Unfortunately, there's not a lot you can do other than be more sophisticated (distribute across multiple devices, use a VPN/Tor, etc.) and risk your IP being blocked for attempting DDoS-like traffic, or, preferably, see if the site has an API for access.
Presumably you have your own pool of proxies - what is the best way to rotate them?
First, if we blindly pick a random proxy we risk repeating a connection from the same proxy multiple times in a row. In addition, most connection-pattern-based blocking works at the proxy subnet level (the 3rd octet) rather than the host, so it's best to prevent repeats at the subnet level.
It's also a good idea to track proxy performance, as not all proxies are equal; we want to use our better-performing proxies more often and let dead proxies cool down.
All of this can be done with weighted randomization, which is implemented by Python's random.choices() function:
import random
from time import time
from typing import List, Literal
class Proxy:
"""container for a proxy"""
def __init__(self, ip, type_="datacenter") -> None:
self.ip: str = ip
self.type: Literal["datacenter", "residential"] = type_
_, _, self.subnet, self.host = ip.split(":")[0].split('.')
self.status: Literal["alive", "unchecked", "dead"] = "unchecked"
self.last_used: int = None
def __repr__(self) -> str:
return self.ip
def __str__(self) -> str:
return self.ip
class Rotator:
"""weighted random proxy rotator"""
def __init__(self, proxies: List[Proxy]):
self.proxies = proxies
self._last_subnet = None
def weigh_proxy(self, proxy: Proxy):
weight = 1_000
if proxy.subnet == self._last_subnet:
weight -= 500
if proxy.status == "dead":
weight -= 500
if proxy.status == "unchecked":
weight += 250
if proxy.type == "residential":
weight += 250
if proxy.last_used:
_seconds_since_last_use = time() - proxy.last_used
weight += _seconds_since_last_use
return weight
def get(self):
proxy_weights = [self.weigh_proxy(p) for p in self.proxies]
proxy = random.choices(
self.proxies,
weights=proxy_weights,
k=1,
)[0]
proxy.last_used = time()
self._last_subnet = proxy.subnet
return proxy
If we do a mock run of this Rotator, we can see how the weighted randomization distributes our connections:
from collections import Counter
if __name__ == "__main__":
proxies = [
# these will be used more often
Proxy("xx.xx.121.1", "residential"),
Proxy("xx.xx.121.2", "residential"),
Proxy("xx.xx.121.3", "residential"),
# these will be used less often
Proxy("xx.xx.122.1"),
Proxy("xx.xx.122.2"),
Proxy("xx.xx.123.1"),
Proxy("xx.xx.123.2"),
]
rotator = Rotator(proxies)
# let's mock some runs:
_used = Counter()
_failed = Counter()
def mock_scrape():
proxy = rotator.get()
_used[proxy.ip] += 1
if proxy.host == "1": # simulate proxies with .1 being significantly worse
_fail_rate = 60
else:
_fail_rate = 20
if random.randint(0, 100) < _fail_rate: # simulate some failure
_failed[proxy.ip] += 1
proxy.status = "dead"
mock_scrape()
else:
proxy.status = "alive"
return
for i in range(10_000):
mock_scrape()
for proxy, count in _used.most_common():
print(f"{proxy} was used {count:>5} times")
print(f" failed {_failed[proxy]:>5} times")
# will print:
# xx.xx.121.2 was used 2629 times
# failed 522 times
# xx.xx.121.3 was used 2603 times
# failed 508 times
# xx.xx.123.2 was used 2321 times
# failed 471 times
# xx.xx.122.2 was used 2302 times
# failed 433 times
# xx.xx.121.1 was used 1941 times
# failed 1187 times
# xx.xx.122.1 was used 1629 times
# failed 937 times
# xx.xx.123.1 was used 1572 times
# failed 939 times
By using weighted randomization we can create a connection pattern that appears random but is smart. We can apply generic patterns, like never using proxies from the same IP family twice in a row, as well as custom per-target logic, like prioritizing North American IPs for North American targets.
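For example, a hypothetical per-target tweak to weigh_proxy() (the target_region parameter and the country attribute are assumptions; the Proxy class above does not track a country):
def weigh_proxy(self, proxy: Proxy, target_region: str = None):
    weight = 1_000
    # ... same subnet/status/type/last_used checks as above ...
    # Boost proxies whose (assumed) country matches the target's region.
    if target_region == "NA" and getattr(proxy, "country", None) in ("US", "CA"):
        weight += 250
    return weight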
For more on this see my blog How to Rotate Proxies in Web Scraping
I am making a web scraper to build a database. The site I plan to use has index pages each containing 50 links. The number of pages to be parsed is estimated at around 60K and up, which is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
Now what I want to do is have a certain number of worker functions gathering data in parallel from different pages. This data would then be appended to a big dictionary located in harvester, or written directly to a CSV file by the worker function.
I'm wondering how I can implement parallelism. I have done a fair amount of research on using gevent, threading and multiprocessing, but I am not sure how to implement it.
I am also not sure whether appending data to a large dictionary or writing directly to a CSV using DictWriter will be stable with that many inputs at the same time.
Thanks
I propose you split your work into separate workers which communicate via queues.
Here you mostly have IO wait time (crawling, CSV writing), so threads are sufficient.
So you can do the following (not tested, just see the idea):
import threading
import Queue
import csv
class CsvWriter(threading.Thread):
def __init__(self, resultq):
super(CsvWriter, self).__init__()
self.resultq = resultq
self.writer = csv.DictWriter(open('results.csv', 'wb'))  # note: DictWriter also needs a fieldnames argument listing your columns
def run(self):
done = False
while not done:
row = self.resultq.get()
if row != -1:
self.writer.writerow(row)
else:
done = True
class Crawler(threading.Thread):
def __init__(self, inputqueue, resultq):
super(Crawler, self).__init__()
self.iq = inputqueue
self.oq = resultq
def run(self):
done = False
while not done:
link = self.iq.get()
if link != -1:
result = self.extract_data(link)
self.oq.put(result)
else:
done = True
def extract_data(self, link):
# crawl and extract what you need and return a dict
pass
def main():
linkq = Queue.Queue()
for url in your_urls:
linkq.put(url)
resultq = Queue.Queue()
writer = CsvWriter(resultq)
writer.start()
crawlers = [Crawler(linkq, resultq) for _ in xrange(10)]
[c.start() for c in crawlers]
[linkq.put(-1) for _ in crawlers]
[c.join() for c in crawlers]
resultq.put(-1)
writer.join()
This code should work (fix possible typos), and it exits once all the URLs are finished.
Is it possible to use kernprof.py, line_profiler.py, or something similar to profile a QGIS plugin? I can't run the plugin outside of QGIS because the plugin requires state from QGIS and will make calls to the QGIS API.
It seems like I might be able to modify the plugin's initializer to call kernprof, to call back to the plugin and pass the state all the way through, but I can't wrap my head around it.
Does anyone have experience with running a Python profiler from inside another tool?
I used a simpler way to profile my plugin, using cProfile. In the constructor of the plugin's main class (the one returned in classFactory), I used this code:
self.pr = cProfile.Profile()
self.pr.enable()
and in the unload method of the class, or anywhere you need to print the profile stats:
self.pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(self.pr, stream=s).sort_stats(sortby)
ps.print_stats()
remember to use the following imports:
import cProfile, pstats, io
from pstats import SortKey
It's possible to use line_profiler while running your script inside QGIS.
You need to import it inside the main file of your plugin along with your other imports, then add profile = line_profiler.LineProfiler() before your main class, add the decorator @profile just before the main function you want to profile, and finally add profile.print_stats(stream=stream) just before the return of the function.
I suppose there are other ways to do it, but this is the way I found that works well enough for me.
Below is an example for a Processing plugin:
import os
import line_profiler
profile = line_profiler.LineProfiler()
class processingScriptExample(QgsProcessingAlgorithm):
INPUT_directory = 'INPUT_directory'
def initAlgorithm(self, config):
self.addParameter(QgsProcessingParameterNumber(self.INPUT_directory,
self.tr('Output directory'),
QgsProcessingParameterFile.Folder))
@profile
def processAlgorithm(self, parameters, context, feedback):
directory = self.parameterAsInt(parameters, self.INPUT_directory, context)
ls = []
for ii in range(1000000):
ls.append(ii)
ls = [ii for ii in range(1000000)]
path_profiling = os.path.join(directory, "line_profiling.txt")
with open(path_profiling, 'w') as stream:
profile.print_stats(stream=stream)
return {'Profiling file': path_profiling}
The resulting file:
Timer unit: 1e-07 s
Total time: 1.31260 s
File: C:\OSGeo4W\profiles\default/python/plugins\test\algo_test.py
Function: processAlgorithm at line 70
Line # Hits Time Per Hit % Time Line Contents
==============================================================
70 @profile
71 def processAlgorithm(self, parameters, context, feedback):
72 1 248.0 248.0 0.0 directory = self.parameterAsInt(parameters, self.INPUT_directory, context)
73
74 1 8.0 8.0 0.0 ls = []
75 1000001 5054594.0 5.1 38.5 for ii in range(1000000):
76 1000000 6633146.0 6.6 50.5 ls.append(ii)
77
78 1 1418416.0 1418416.0 10.8 ls = [ii for ii in range(1000000)]
79
80 1 561.0 561.0 0.0 path_profiling = os.path.join(directory, "line_profiling.txt")
81 1 19001.0 19001.0 0.1 with open(path_profiling, 'w') as stream:
82 profile.print_stats(stream=stream)
83
84 return {"Profiling file":path_profiling}
If I run the script step by step it works perfectly, but when I'm using threading it misses 50-60% of the requests. I'm using Python + the mechanize module.
#setting up the browser
mySite = 'http://example.com/managament.php?'
postData = {'UserID' : '', 'Action':'Delete'}
job_tab1_user1 = [1,2,3]
job_tab2_user1 = [4,5,6]
job_tab1_user2 = [7,8,9]
job_tab2_user2 = [10,12,13]
.... till user1000
#i want to point out that the lists are 100% different
def user1_jobs():
for i in job_tab1_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
def user2_jobs():
for i in job_tab1_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
... and so on till user 1000
And I call them in the end like this:
t_user1 = threading.Thread(target=user1_jobs, args=[])
t_user1.start()
t_user2 = threading.Thread(target=user2_jobs, args=[])
t_user2.start()
I have a similar script that sends around 200 requests per second and all of them are processed. I also tried using time.sleep(2), but it still misses a lot.
Another question, besides what is wrong with my script, is whether there is a way to compact this code, because I'm using 1000 users and the script reaches thousands of lines. Thank you in advance.
from threading import *
submits = [[1,2,3], [3,4,5], [6,7,8]]
class worker(Thread):
def __init__(self, site, postdata, data):
Thread.__init__(self)
self.data = data
self.site = site
self.postdata = postdata
self.start()
def run(self):
for i in self.data:
browser.open("http://example.com/jobs.php?actions="+str(i))
browser.open(self.site, self.postdata)
for obj in submits:
worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, obj)
Since the OP asked for it, here's a condensed/compressed version of the code.
or:
for index in range(0,1000):
worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, [i for i in range(1,4)])
That works if the data you want to send really is a sequence of three integers (1, 2, 3) that increases in perfect order.
Here is a full script that you can easily modify by changing the initial variables.
It creates a list dynamically and uses a generator to create the functions for each thread.
Currently it creates 1000 users, each with 2 tabs and 3 jobs.
# define your variables here
NUM_USERS = 1000
NUM_JOBS_PER_USER = 3
NUM_TABS_PER_USER = 2
URL_PART = "http://example.com/jobs.php?actions="
# populate our list of jobs
# the structure is like this: jobs[user][tab][job]
jobs = [[[0 for y in range(NUM_JOBS_PER_USER)] \
for x in range(NUM_TABS_PER_USER)] \
for x in range(NUM_USERS)]
p = 1
for i in range(NUM_USERS):
for j in range(NUM_TABS_PER_USER):
for k in range(NUM_JOBS_PER_USER):
jobs[i][j][k] = p
p += 1
# create a generator that builds our thread functions
def generateFunctions(jobs):
for user in jobs:
for tab in user:
for job in tab:
def f(job=job):  # bind job now; otherwise every thread would see the loop's latest value
browser.open(URL_PART + str(job))
browser.open(mySite, Post_data)
yield f
# create and start threads, add them to a list
# if we need to preserve handlers for later use
threads = []
for f in generateFunctions(jobs):
thr = threading.Thread(target = f, args=[])
thr.start()
threads.append(thr)
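A possible follow-up (untested sketch): with 1000 users x 2 tabs x 3 jobs this starts 6000 threads at once, which is usually too many. A thread pool caps concurrency while reusing the same generated functions:
from multiprocessing.pool import ThreadPool

pool = ThreadPool(processes=20)  # tune the number of concurrent workers
results = [pool.apply_async(f) for f in generateFunctions(jobs)]
pool.close()
pool.join()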