I want to receive a list of content when POSTing data from an input. The input comes from a text file that is opened in Python. To speed the process up, I'd like to send several requests at once on separate threads. How would I do that? Here's a rough idea of what I'm talking about:
import requests
userdata = open("data.txt", "r")
usercodes = [x.strip() for x in userdata]
for i in range(len(usercodes)):
    thread_one = requests.post(url='https://test.com/input', params=usercodes[i])
    thread_two = requests.post(url='https://test.com/input', params=usercodes[i+1])
    thread_three = requests.post(url='https://test.com/input', params=usercodes[i+2])
I want all the requests to run at the same time; as written, the program carries out the requests one after the next.
With multiprocessing.Pool:
import requests
from multiprocessing import Pool, cpu_count

def make_request(usercode):
    return requests.post(url='https://test.com/input', params=usercode)

if __name__ == '__main__':
    with open("data.txt", "r") as userdata:
        usercodes = [x.strip() for x in userdata]
    with Pool(cpu_count()) as p:
        print(p.map(make_request, usercodes))
With concurrent.futures.ThreadPoolExecutor:
from concurrent.futures import ThreadPoolExecutor
import requests

def make_request(usercode):
    return requests.post(url='https://test.com/input', params=usercode)

with open("data.txt", "r") as userdata:
    usercodes = (x.strip() for x in userdata)  # keep as a generator
    with ThreadPoolExecutor() as pool:
        pool.map(make_request, usercodes)
# the file is closed automatically when the with block exits
Async is definitely your friend here.
from gevent import monkey
monkey.patch_all()  # patch the standard library before importing requests

import requests
from gevent import joinall, spawn

with open("data.txt", "r") as userdata:
    usercodes = [x.strip() for x in userdata]

send_url = 'https://test.com/input'
threads = []

def send(usercode):
    requests.post(url=send_url, params=usercode)

for code in usercodes:
    threads.append(spawn(send, code))

joinall(threads)
Related
I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing automatically archives every single article in a list of existing newsgroups, but I want to make full use of my newsgroup plan and run up to 20 threads. I've never written threaded code before, and my attempts were in vain.
Here's my code, excluding the username and password (but you can get a free account with a maximum of 5 threads at https://my.xsusenet.com if you really want to).
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os

basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]

s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD')  # I am only allowed 5 connections at a time, so try for 4.

groups = []
resp, groups_list_tuple = s.list()

def remove_non_ascii_2(string):
    return string.encode('ascii', errors='ignore').decode()

for g_tuple in groups_list_tuple:
    #print(g_tuple) # DEBUG_LINE
    # Parse group_list info
    group = g_tuple[0]
    last = g_tuple[1]
    first = g_tuple[2]
    flag = g_tuple[3]
    # Parse newsgroup info
    resp, count, first, last, name = s.group(group)
    for message_id in range(first, last):
        resp, number, mes_id = s.next()
        resp, info = s.article(mes_id)
        if os.path.exists('.\\' + group):
            pass
        else:
            os.mkdir('.\\' + group)
        print(f"Downloading: {message_id}")
        outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
        for line in info.lines:
            outfile.write(remove_non_ascii_2(str(line)) + '\n')
        outfile.close()
I tried threading with a ThreadPoolExecutor to make it use 20 threads, but it failed: it kept repeating the same process on the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading; mind you, I went through 6-8 variations of it to try to get it to work. This was the last one before I gave up and asked here.
import nntplib
import sys
import datetime
import os
import concurrent.futures

basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]

s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD')  # I am only allowed 5 connections at a time, so try for 4.

groups = []
resp, groups_list_tuple = s.list()

def remove_non_ascii_2(string):
    return string.encode('ascii', errors='ignore').decode()

def download_nntp_file(mess_id):
    resp, count, first, last, name = s.group(group)
    message_id = range(first, last)
    resp, number, mes_id = s.next()
    resp, info = s.article(mes_id)
    if os.path.exists('.\\' + group):
        pass
    else:
        os.mkdir('.\\' + group)
    print(f"Downloading: {mess_id}")
    outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
    for line in info.lines:
        outfile.write(remove_non_ascii_2(str(line)) + '\n')
    outfile.close()

for g_tuple in groups_list_tuple:
    #print(g_tuple) # DEBUG_LINE
    # Parse group_list info
    group = g_tuple[0]
    last = g_tuple[1]
    first = g_tuple[2]
    flag = g_tuple[3]
    # Parse newsgroup info
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = executor.submit(download_nntp_file)
I can't test it with XSUseNet.
I wouldn't use global variables, because when threads work at the same time they may all read the same values from those variables.
You should instead pass the values as parameters to the function.
Something like this:
def download_nntp_file(g_tuple):
    # ... code which uses `g_tuple` instead of global variables ...

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for g_tuple in groups_list_tuple:
        executor.submit(download_nntp_file, g_tuple)
But it would be simpler to use map() instead of submit(), because it takes the whole list of arguments and doesn't need a for-loop:
def download_nntp_file(g_tuple):
    # ... code which uses `g_tuple` instead of global variables ...

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_nntp_file, groups_list_tuple)
I'm currently writing a script that reads Reddit comments from a large file (5 GB compressed, ~30 GB of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a separate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735, if that matters in the slightest), and I can't seem to handle the error: Windows just keeps shutting down Python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time

nlp = spacy.load('en')

def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield ujson.loads(line)['body']

objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')

def worker():
    while True:
        item = q.get()
        f.write(item)
        q.task_done()

for i in range(0, 2):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

def finish_parse(comment):
    global q
    try:
        comment_parse = nlp(unicode(comment))
        comment_bytes = comment_parse.to_bytes()
        q.put(comment_bytes)
    except MemoryError:
        print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
        time.sleep(2)
    except AssertionError:
        print "AssertionError with comment {0}, skipping".format(comment)

for comment in tqdm(objects):
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
        c_parse_thread.start()

q.join()
f.close()
Does anybody know what I'm doing wrong?
It looks like the problem is not in your code but in the data. Have you tried skipping that iteration?
x = 0
for comment in tqdm(objects):
    x += 1
    if x != 8162735:
        comment = str(comment.encode('ascii', 'ignore'))
        if ">" in comment:
            c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
            c_parse_thread.start()
I have a script that parses XML files using the lxml ElementTree XPath evaluator. It works fine as it is, but it takes a long time to finish, so I tried to make a multithreaded implementation:
import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue

STOPWORDS = stopwords.words('dutch')
STOPWORDS.extend(stopwords.words('english'))
DIR_NAME = 'A_DIRNAME'
PATTERN = '*.A_PATTERN'

def loadData(dir_name, pattern):
    nohyphen_files = []
    dir_names = []
    dir_paths = []
    for root, dirnames, filenames in os.walk(dir_name):
        dir_names.append(dirnames)
        dir_paths.append(root)
        for filename in fnmatch.filter(filenames, pattern):
            nohyphen_files.append(os.path.join(root, filename))
    return nohyphen_files, dir_names, dir_paths

def freq(element_list, descending=True):
    agglomerated = defaultdict(int)
    for e in element_list:
        agglomerated[e] += 1
    return sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=descending)

def lexDiv(amount_words):
    return 1.0 * len(set(amount_words)) / len(amount_words)

def anotherFreq(list_types, list_words):
    fd = FreqDist(list_types)
    print 'top 10 most frequent types:'
    for t, freq in fd.items()[:10]:
        print t, freq
    print '\ntop 10 most frequent words:'
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_dict[:10]

def extractor(f):
    print "check file: {}".format(f)
    try:
        # doc = lxml.etree.ElementTree(lxml.etree.XML(f))
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError, e:
        print e
        return
    doc_evaluator = lxml.etree.XPathEvaluator(doc)
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',
                      namespaces={"re": "http://exslt.org/regular-expressions"})
    unique_words = set(words)
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',
                             namespaces={"re": "http://exslt.org/regular-expressions"})
    amount_of_sentences = doc_evaluator('text/wf/@sent')[-1]
    types = doc_evaluator('//term/@morphofeat')
    longest_sentence = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',
                            namespaces={"re": "http://exslt.org/regular-expressions"}))[0]
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]

def worker():
    while 1:
        job_number = q.get()
        extractor(job_number)
        q.task_done()  # this thread is complete, move on

if __name__ == '__main__':
    startTime = datetime.now()
    files, dirs, path = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()
    q = Queue.Queue()  # job queue
    for f in files:
        q.put(f)
    for i in range(20):  # make 20 worker threads ready
        worker_thread = threading.Thread(target=worker)
        worker_thread.daemon = True
        worker_thread.start()
    q.join()
    print datetime.now() - startTime
This does something, but when I time it, it isn't any faster than the normal version. I think it has something to do with opening and reading files, which keeps the threads from running in parallel. If I use a function that, instead of parsing the XML file, just sleeps for a couple of seconds and prints something, it does work and it is a lot faster. What do I have to account for to have a multithreaded XML parser?
Threading in Python doesn't work the way it does in other languages. It relies on the Global Interpreter Lock (GIL), which makes sure only one thread is active at a time (running bytecode, to be exact).
What you want to do is use the multiprocessing library instead.
You can read more about the GIL and threading here:
https://docs.python.org/2/glossary.html#term-global-interpreter-lock
https://docs.python.org/2/library/threading.html
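For example, a minimal sketch of what that could look like for the code above, assuming extractor() and loadData() are defined as in the question and each worker process parses whole files (the process count here is a guess to tune):

from multiprocessing import Pool
from datetime import datetime

if __name__ == '__main__':
    startTime = datetime.now()
    files, dirs, path = loadData(DIR_NAME, PATTERN)

    pool = Pool(processes=4)    # number of worker processes; tune to your machine
    pool.map(extractor, files)  # each file is parsed in its own process
    pool.close()
    pool.join()

    print(datetime.now() - startTime)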
Is there a way I can multithread the function so that it takes 5 URLs from the list at a time? Please see my code below; it's Python 2.7.
import requests, csv, time, json, threading
from lxml import html
from csv import DictWriter
All_links = ['http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.343097&longitude=-71.123046&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.398588&longitude=-71.24505&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.394319&longitude=-71.218049&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.365396&longitude=-71.23165&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.356719&longitude=-71.250479&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.385096&longitude=-71.208399&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.334146&longitude=-71.183298&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.374296&longitude=-71.182371&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA']
target = open('completedlinks.txt', 'ab')

def get_data(each):
    each = each.strip('\n')
    r = requests.get(each)
    source = json.loads(r.content)
    the_file = open("output.csv", "ab")
    writer = DictWriter(the_file, source[1].keys())
    writer.writeheader()
    writer.writerows(source)
    the_file.close()
    target.write(each + '\n')
    print each + "\n--------------------------"

for each in All_links:
    try:
        get_data(each)
    except:
        pass
Check out the multiprocessing package. Its worker pools would accomplish this.
Update:
Adding something like this should work:
from multiprocessing import Pool

def chunks(l, n):
    """ Yield successive n-sized chunks from l. """
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

def threadit(threads, links):
    for part in chunks(links, threads):
        pool = Pool(threads)
        for link in part:
            pool.apply_async(get_data, args=(link,))
        pool.close()
        pool.join()

threadit(5, All_links)
This source is just an example:
import urllib2

inputf = open('input', 'r')
outputf = open('output', 'a')
for x in inputf:
    x = x.strip('\n')
    result = urllib2.urlopen('http://test.com/' + x).getcode()
    outputf.write(x + ' - ' + str(result) + '\n')
I want to add threading to this to check a few URLs at the same time.
The user should decide each time how many threads to use.
The order of the output is not important.
What is the best and most elegant way to do that?
I like multiprocessing.pool.ThreadPool (or multiprocessing.pool.Pool), like so:
from multiprocessing.pool import ThreadPool
n_threads = 5
pool = ThreadPool(processes=n_threads)
threads = [pool.apply_async(some_function, args=(arg1,)) for arg1 in args]
pool.close()
pool.join()
results = [result.get() for result in threads]
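Applied to the question's example, a sketch might look like the following. check() is a hypothetical helper wrapping the original loop body; error handling is left out, and n_threads is whatever the user chooses.

import urllib2
from multiprocessing.pool import ThreadPool

def check(x):
    # hypothetical helper: fetch one URL and return the line to write
    x = x.strip('\n')
    code = urllib2.urlopen('http://test.com/' + x).getcode()
    return x + ' - ' + str(code)

n_threads = 5  # chosen by the user
pool = ThreadPool(processes=n_threads)

with open('input', 'r') as inputf:
    threads = [pool.apply_async(check, args=(x,)) for x in inputf]
pool.close()
pool.join()

with open('output', 'a') as outputf:
    for result in threads:
        outputf.write(result.get() + '\n')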