MultiThreading/Optimization Python Requests? - python

I am trying to optimize this code, as of right now it runs 340 Requests in 10 mins. I have trying to get 1800 requests in 30 mins. Since I can run a request every second, according to amazon api. Can I use multithreading with this code to increase the number of runs??
However, I was reading in the full data to the main function, should I split it now, how can I figure out how many each thread should take?
def newhmac():
return hmac.new(AWS_SECRET_ACCESS_KEY, digestmod=sha256)
def getSignedUrl(params):
hmac = newhmac()
action = 'GET'
server = "webservices.amazon.com"
path = "/onca/xml"
params['Version'] = '2013-08-01'
params['AWSAccessKeyId'] = AWS_ACCESS_KEY_ID
params['Service'] = 'AWSECommerceService'
params['Timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
key_values = [(urllib.quote(k), urllib.quote(v)) for k,v in params.items()]
key_values.sort()
paramstring = '&'.join(['%s=%s' % (k, v) for k, v in key_values])
urlstring = "http://" + server + path + "?" + \
('&'.join(['%s=%s' % (k, v) for k, v in key_values]))
hmac.update(action + "\n" + server + "\n" + path + "\n" + paramstring)
urlstring = urlstring + "&Signature="+\
urllib.quote(base64.encodestring(hmac.digest()).strip())
return urlstring
def readData():
data = []
with open("ASIN.csv") as f:
reader = csv.reader(f)
for row in reader:
data.append(row[0])
return data
def writeData(data):
with open("data.csv", "a") as f:
writer = csv.writer(f)
writer.writerows(data)
def main():
data = readData()
filtData = []
i = 0
count = 0
while(i < len(data) -10 ):
if (count %4 == 0):
time.sleep(1)
asins = ','.join([data[x] for x in range(i,i+10)])
params = {'ResponseGroup':'OfferFull,Offers',
'AssociateTag':'4chin-20',
'Operation':'ItemLookup',
'IdType':'ASIN',
'ItemId':asins}
url = getSignedUrl(params)
resp = requests.get(url)
responseSoup=BeautifulSoup(resp.text)
quantity = ['' if product.amount is None else product.amount.text for product in responseSoup.findAll("offersummary")]
price = ['' if product.lowestnewprice is None else product.lowestnewprice.formattedprice.text for product in responseSoup.findAll("offersummary")]
prime = ['' if product.iseligibleforprime is None else product.iseligibleforprime.text for product in responseSoup("offer")]
for zz in zip(asins.split(","), price,quantity,prime):
print zz
filtData.append(zz)
print i, len(filtData)
i+=10
count +=1
writeData(filtData)
threading.Timer(1.0, main).start()

If you are using python 3.2 you can use concurrent.futures library to make it easy to launch tasks in multiple threads. e.g. here I am simulating running 10 url parsing job in parallel, each one of which takes 1 sec, if run synchronously it would have taken 10 seconds but with thread pool of 10 should take about 1 seconds
import time
from concurrent.futures import ThreadPoolExecutor
def parse_url(url):
time.sleep(1)
print(url)
return "done."
st = time.time()
with ThreadPoolExecutor(max_workers=10) as executor:
for i in range(10):
future = executor.submit(parse_url, "http://google.com/%s"%i)
print("total time: %s"%(time.time() - st))
Output:
http://google.com/0
http://google.com/1
http://google.com/2
http://google.com/3
http://google.com/4
http://google.com/5
http://google.com/6
http://google.com/7
http://google.com/8
http://google.com/9
total time: 1.0066466331481934

Related

How to return a dictionary from a process in Python?

I want to make an inverted index using multiprocessing to speed up its work. My idea is to split the files into groups, and each process will build its own inverted index, and then I want to merge all these indexes into one inverted index. But I don't know how to return them to the main process that will merge them.
import multiprocessing as mp
from pathlib import Path
import re
import time
class InvertedIndex:
def __init__(self):
self.index = dict()
def createIndex(self, path='data', threads_num=4):
pathList = list(Path(path).glob('**/*.txt'))
fileNum = len(pathList)
oneProcessNum = fileNum / threads_num
processes = []
for i in range(threads_num):
startIndex = int(i * oneProcessNum)
endIndex = int((i + 1) * oneProcessNum)
currLi = pathList[startIndex:endIndex]
p = mp.Process(target=self.oneProcessTask, args=(currLi,))
processes.append(p)
[x.start() for x in processes]
[x.join() for x in processes]
#staticmethod
def oneProcessTask(listOfDoc):
#print(f'Start: {list[0]}, end: {list[-1]}') # temp
tempDict = dict()
for name in listOfDoc:
with open(name) as f:
text = f.read()
li = re.findall(r'\b\w+\b', text)
for w in li:
if tempDict.get(w) is None:
tempDict[w] = set()
tempDict[w].add(str(name))
def getListOfDoc(self, keyWord):
return self.index[keyWord]
if __name__ == '__main__':
ii = InvertedIndex()
start_time = time.time()
ii.createIndex()
print("--- %s seconds ---" % (time.time() - start_time))
I used multiprocessing.manager to write everything in one dictionary, but that solution was too slow. So I went back to the idea of creating own inverted index for each process and then merging them. But I don't know how to return all indexes to one process.
Take a look at concurrent.futures (native library) with either ThreadPoolExecutor or ProcessPoolExecutor. FYI: I wrote on that in here and did not test but, this is more or less the jist of what I use all the time.
from concurrent.futures import ThreadPoolExecutor, as_completed
def foo(stuff: int) -> dict:
return {}
things_to_analyze = [1,2,3]
threads = []
results = []
with ThreadPoolExecutor() as executor:
for things in things_to_analyze:
threads.append(executor.submit(foo, thing))
for job in as_completed(threads):
results.append(job.results())
I found a solution. I used pool.starmap to return a list of indexes.
My code:
class InvertedIndex:
def __init__(self):
self.smallIndexes = None
self.index = dict()
def createIndex(self, path='data', threads_num=4):
pathList = list(Path(path).glob('**/*.txt')) # Рекурсивно проходимо по всіх текстових файлах і робимо з них список
fileNum = len(pathList)
oneProcessNum = fileNum / threads_num # Розраховуємо скільки файлів має обробити один процес
processes_args = []
for i in range(threads_num):
startIndex = int(i * oneProcessNum)
endIndex = int((i + 1) * oneProcessNum)
processes_args.append((path, startIndex, endIndex))
pool = mp.Pool(threads_num)
self.smallIndexes = pool.starmap(self.oneProcessTask, processes_args)
self.mergeIndex()
#staticmethod
def oneProcessTask(path, startIndex, endIndex):
pathList = list(Path(path).glob('**/*.txt'))
listOfDoc = pathList[startIndex:endIndex]
tempDict = dict()
for name in listOfDoc:
with open(name) as f:
text = f.read()
li = re.findall(r'\b\w+\b', text)
for w in li:
if tempDict.get(w) is None:
tempDict[w] = set()
tempDict[w].add(str(name))
return tempDict
Execution time decreased from 200 seconds (when I used shared memory and menger.dict ) to 0.8 seconds (when I used pool.starmap).

Why isn't my Simpy resource keeping a queue?

I've been working for around a week to learn SimPy for a discrete simulation I have to run. I've done my best, but I'm just not experienced enough to figure it out quickly. I am dying. Please help.
The system in question goes like this:
order arrives -> resource_1 (there are 2) performs take_order -> order broken into items -> resource_2 (there are 10) performs process_item
My code runs and performs the simulation, but I'm having a lot of trouble getting the queues on the resources to function. As in, queues do not build up on either resource when I run it, and I cannot find the reason why. I try resource.get_queue and get empty lists. There should absolutely be queues, as the orders arrive faster than they can be processed.
I think it has something to do with the logic for requesting resources, but I can't figure it out. Here's how I've structured the code:
import simpy
import random
import numpy as np
total_items = []
total_a = []
total_b = []
total_c = []
order_Q = []
item_Q = []
skipped_visits = []
order_time_dict = {}
order_time_dict2 = {}
total_order_time_dict = {}
var = []
class System:
def __init__(self,env,num_resource_1,num_resource_2):
self.env = env
self.resource_1 = simpy.Resource(env,num_resource_1)
self.resource_2 = simpy.Resource(env,num_resource_2)
def take_order(self, order):
self.time_to_order = random.triangular(30/60,60/60,120/60)
arrive = self.env.now
yield self.env.timeout(self.time_to_order)
def process_item(self,item):
total_process_time = 0
current = env.now
order_num = item[1][0]
for i in range(1,item[1][1]):
if 'a' in item[0]:
total_process_time += random.triangular(.05,7/60,1/6) #bagging time only
#here edit order time w x
if 'b' in item[0]:
total_process_time += random.triangular(.05,.3333,.75)
if 'c' in item[0]:
total_process_time += random.triangular(.05,7/60,1/6)
#the following is handling time: getting to station, waiting on car to arrive at window after finished, handing to cust
total_process_time += random.triangular(.05, 10/60, 15/60)
item_finish_time = current + total_process_time
if order_num in order_time_dict2.keys():
start = order_time_dict2[order_num][0]
if order_time_dict2[order_num][1] < item_finish_time:
order_time_dict2[order_num] = (start, item_finish_time)
else:
order_time_dict2[order_num] = (current, item_finish_time)
yield self.env.timeout(total_process_time)
class Order:
def __init__(self, order_dict,order_num):
self.order_dict = order_dict
self.order_num = order_num
self.order_stripped = {}
for x,y in list(self.order_dict.items()):
if x != 'total':
if y != 0:
self.order_stripped[x] = (order_num,y) #this gives dictionary format {item: (order number, number items) } but only including items in order
self.order_list = list(self.order_stripped.items())
def generate_order(num_orders):
print('running generate_order')
a_demand = .1914 ** 3
a_stdev = 43.684104
b_demand = .1153
b_stdev = 28.507782
c_demand = .0664
c_stdev = 15.5562624349
num_a = abs(round(np.random.normal(a_demand)))
num_b = abs(round(np.random.normal(b_demand)))
num_c = abs(round(np.random.normal(c_demand)))
total = num_orders
total_a.append(num_a)
total_b.append(num_b)
total_c.append(num_c)
total_num_items = num_a + num_b + num_c
total_items.append(total_num_items)
order_dict = {'num_a':num_a, 'num_b':num_b,'num_c':num_c, 'total': total}
return order_dict
def order_process(order_instance,system):
enter_system_at = system.env.now
print("order " + str(order_instance.order_num) + " arrives at " + str(enter_system_at))
if len(system.resource_1.get_queue) > 1:
print("WORKING HERE ******************")
if len(system.resource_1.get_queue) <= 25:
with system.resource_1.request() as req:
order_Q.append(order_instance)
yield req
yield env.process(system.take_order(order_instance))
order_Q.pop()
enter_workstation_at = system.env.now
print("order num " + str(order_instance.order_num) + " enters workstation at " + str(enter_workstation_at))
for item in order_instance.order_list:
item_Q.append(item)
with system.resource_2.request() as req:
yield req
yield env.process(system.process_item(item))
if len(system.resource_2.get_queue) >1:
var.append(1)
item_Q.pop()
leave_workstation_at = system.env.now
print("Order num " + str(order_instance.order_num) + " leaves at " + str(leave_workstation_at))
order_time_dict[order_instance.order_num] = leave_workstation_at-enter_workstation_at
total_order_time_dict[order_instance.order_num]=leave_workstation_at-enter_system_at
else:
skipped_visits.append(1)
def setup(env):
system = System(env,2,15)
order_num = 0
while True:
next_order = random.expovariate(3.5) #where 20 is order arrival mean (lambda)
yield env.timeout(next_order)
order_num+=1
env.process(order_process(Order(generate_order(order_num),order_num),system))
env = simpy.Environment()
env.process(setup(env))
env.run(until=15*60)
print("1: \n", order_time_dict)
I think you are looking at the wrong queue.
the api for getting queued requests for resources is just attribute queue so try using
len(system.resource_1.queue)
get_queue and put_queue is from the base class and used to derive new resource classes.
but wait they are not what any reasonable person would assume, and I find this confusing too, but the doc says
Requesting a resources is modeled as “putting a process’ token into the resources” which means when you call request() the process is put into the put_queue, not the get_queue. And with resource, release always succeeds immediately so its queue (which is the get_queue) is always empty
I think queue is just a alias for the put_queue, but queue is much less confussing

Python multi connection downloader resuming after pausing makes download run endlessly

I have written a Python script that downloads a single file using 32 connections if available.
I have written a multiconnection downloader that works fine without pausing, but won't stop downloading after resuming, the progress would go beyond 100%...
Like this:
Download mode: Multi-thread (press Space to pause/resume, press Escape to stop)
[████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████] 120% completed, paused: False
Download mode: Multi-thread (press Space to pause/resume, press Escape to stop)
1798.08 MiB downloaded, 1489.83 MiB total, -308.25 MiB remaining, download speed: 22.73 MiB/s
Minimum speed: 0.00 MiB/s, average speed: 4.54 MiB/s, maximum speed: 75.00 MiB/s
Task started on 2021-08-09 16:57:03, 00:06:35 elapsed, ETA: -1:59:47
After progress exceeds 100%, there will be error messages like this:
Exception in thread Thread-78:
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\threading.py", line 973, in _bootstrap_inner
self.run()
File "C:\Program Files\Python39\lib\threading.py", line 910, in run
self._target(*self._args, **self._kwargs)
File "D:\MyScript\downloader.py", line 70, in multidown
mm[position: position+len(chunk)] = chunk
IndexError: mmap slice assignment is wrong size
(The above doesn't include all of the error message)
I have encountered all sorts of errors after resuming, but most importantly, the server will often send extra bytes from previous request, whose connection is dead and needless to say this breaks the whole code.
How should I implement pause and resume correctly?
I am thinking about multiprocessing, I assume the sessions and connections are all PID and port number related, and so far I haven't encountered a new run of the script that received extra bytes from previous runs of the script, so I guess using another process with a new PID and new port number plus requests.session() plus {'connection': 'close'} for each download should guarantee that no extra bytes from previous connections will be received, I just don't know how to share variables between processes...
The code:
downloader.py
import json
import keyboard
import os
import re
import requests
import sys
import time
import validators
from collections import deque
from datetime import datetime, timedelta
from math import inf
from mmap import mmap
from pathlib import Path
from ping3 import ping
from reprint import output
from threading import Thread
def timestring(sec):
sec = int(sec)
m, s = divmod(sec, 60)
h, m = divmod(m, 60)
return f'{h:02d}:{m:02d}:{s:02d}'
class Downloader:
def __init__(self):
self.recent = deque([0] * 12, maxlen=12)
self.recentspeeds = deque([0] * 200, maxlen=200)
self.paused = False
self.progress = dict()
class Multidown:
def __init__(self, obj, id):
self.count = 0
self.position = 0
self.completed = False
self.id = id
self.parent = obj
def multidown(self, url, start, end):
interrupted = False
s = requests.session()
s.headers.update({'connection': 'close', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'})
r = s.get(
url, headers={'range': 'bytes={0}-{1}'.format(start, end)}, stream=True)
length = int(r.headers['content-length'])
while end - length + (self.id != self.parent.progress['connections'] - 1) != start or r.status_code != 206:
r.close()
s.close()
del r
del s
time.sleep(0.02)
s = requests.session()
r = s.get(
url, headers={'range': 'bytes={0}-{1}'.format(start, end)}, stream=True)
length = int(r.headers['content-length'])
self.position = start
for chunk in r.iter_content(1048576):
if self.parent.paused:
self.parent.mm.flush()
r.connection.close()
r.close()
s.close()
del r
del s
interrupted = True
break
if chunk:
self.parent.mm[self.position: self.position+len(chunk)] = chunk
self.count += len(chunk)
self.position += len(chunk)
self.parent.progress[self.id]['count'] = self.count
self.parent.progress[self.id]['position'] = self.position
if not interrupted:
r.close()
s.close()
if self.count == self.parent.progress[self.id]['length']:
self.completed = True
self.parent.progress[self.id]['completed'] = True
self.parent.mm.flush()
class Singledown:
def __init__(self):
self.count = 0
def singledown(self, url, path):
with requests.get(url, stream=True) as r:
with path.open('wb') as file:
for chunk in r.iter_content(1048576):
if chunk:
self.count += len(chunk)
file.write(chunk)
def download(self, url, filepath, num_connections=32, overwrite=False):
singlethread = False
threads = []
bcontinue = False
filepath = filepath.replace('\\', '/')
if (not re.match('^[a-zA-Z]:/(((?![<>:"/|?*]).)+((?<![ .])/)?)*$', filepath) or
not Path(filepath[:3]).exists()):
print('Invalid windows file path has been inputted, process will now stop.')
return
if not validators.url(url):
print('Invalid url been inputted, process will now stop.')
return
if url.lower().startswith('ftp://'):
print(
"`requests` module doesn't suport File Transfer Protocol, process will now stop")
return
path = Path(filepath)
if not path.exists():
bcontinue = True
else:
if path.is_file():
if overwrite:
bcontinue = True
else:
while True:
answer = input(
f'`{filepath}` already exists, do you want to overwrite it? \n(Yes, No):').lower()
if answer in ['y', 'yes', 'n', 'no']:
if answer.startswith('y'):
os.remove(filepath)
bcontinue = True
break
else:
print('Invalid input detected, retaking input.')
if not bcontinue:
print(
f'Overwritting {filepath} has been aborted, process will now stop.')
return
bcontinue = False
server = url.split('/')[2]
ok = ping(server, timeout=2)
if ok == False:
print(
'The server of the inputted url is non-existent, process will now stop.')
return
if ok:
bcontinue = True
if not ok:
print('Connection has timed out, will reattempt to ping server 5 times.')
for i in range(5):
print(
f'Reattempting to ping server, retrying {i + 1} out of 5')
ok = ping(server, timeout=2)
if ok:
print(
f'Connection successful on retry {i + 1}, process will now continue.')
bcontinue = True
break
else:
print(f'Retry {i + 1} out of 5 timed out' + (i != 4)
* ', reattempting in 1 second.' + (i == 4) * '.')
time.sleep(1)
if not bcontinue:
print('Failed to connect server, connection timed out, process will now stop')
return
bcontinue = False
head = requests.head(url)
if head.status_code == 200:
bcontinue = True
else:
for i in range(5):
print(f'Server responce is invalid, retrying {i + 1} out of 5')
head = requests.head(url)
if head.status_code == 200:
print(
f'Connection successful on retry {i + 1}, process will now continue.')
bcontinue = True
break
else:
print(f'Retry {i + 1} out of 5 failed to access data' +
(i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
time.sleep(1)
if not bcontinue:
print("Can't establish a connection with access to data, can't download target file, process will now stop.")
return
folder = '/'.join(filepath.split('/')[:-1])
Path(folder).mkdir(parents=True, exist_ok=True)
headers = head.headers
total = headers.get('content-length')
if not total:
print(
f'Cannot find the total length of the content of {url}, the file will be downloaded using a single thread.')
started = datetime.now()
print('Task started on %s.' %
started.strftime('%Y-%m-%d %H:%M:%S'))
sd = self.Singledown()
th = Thread(target=sd.singledown, args=(url, path))
threads.append(sd)
th.start()
total = inf
singlethread = True
else:
total = int(total)
if not headers.get('accept-ranges'):
print(
'Server does not support the `range` parameter, the file will be downloaded using a single thread.')
started = datetime.now()
print('Task started on %s.' %
started.strftime('%Y-%m-%d %H:%M:%S'))
sd = self.Singledown()
th = Thread(target=sd.singledown, args=(url, path))
threads.append(sd)
th.start()
singlethread = True
else:
segment = total / num_connections
started = datetime.now()
lastpressed = started
path.touch()
file = path.open('wb')
file.seek(total - 1)
file.write(b'\0')
file.close()
file = path.open(mode='r+b')
self.mm = mmap(file.fileno(), 0)
print('Task started on %s.' %
started.strftime('%Y-%m-%d %H:%M:%S'))
self.progress['total'] = total
self.progress['connections'] = num_connections
for i in range(num_connections):
md = self.Multidown(self, i)
start = int(segment * i)
end = int(segment * (i + 1)) - (i != num_connections - 1)
length = end - start + (i != num_connections - 1)
th = Thread(target=md.multidown, args=(
url, start, end))
threads.append(md)
self.progress[i] = dict()
self.progress[i]['start'] = start
self.progress[i]['position'] = start
self.progress[i]['end'] = end
self.progress[i]['count'] = 0
self.progress[i]['length'] = length
self.progress[i]['completed'] = False
th.start()
Path(filepath + '.progress.json').write_text(json.dumps(self.progress, indent=4))
downloaded = 0
totalMiB = total / 1048576
speeds = []
interval = 0.04
with output(initial_len=5, interval=0) as dynamic_print:
while True:
Path(filepath + '.progress.json').write_text(json.dumps(self.progress, indent=4))
status = sum([i.completed for i in threads])
downloaded = sum(i.count for i in threads)
self.recent.append(downloaded)
done = int(100 * downloaded / total)
doneMiB = downloaded / 1048576
gt0 = len([i for i in self.recent if i])
if not gt0:
speed = 0
else:
recent = list(self.recent)[12 - gt0:]
if len(recent) == 1:
speed = recent[0] / 1048576 / interval
else:
diff = [b - a for a, b in zip(recent, recent[1:])]
speed = sum(diff) / len(diff) / 1048576 / interval
speeds.append(speed)
self.recentspeeds.append(speed)
nzspeeds = [i for i in speeds if i]
if nzspeeds:
minspeed = min(nzspeeds)
else:
minspeed = 0
maxspeed = max(speeds)
now = datetime.now()
elapsed = (now - started).total_seconds()
meanspeed = downloaded / elapsed / 1048576
remaining = totalMiB - doneMiB
dynamic_print[0] = '[{0}{1}] {2}'.format(
'\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed' + (not singlethread) * ', paused: {0}'.format(self.paused)
dynamic_print[1] = 'Download mode: ' + singlethread * \
'Single-thread' + (not singlethread) * 'Multi-thread (press Space to pause/resume, press Escape to stop)'
dynamic_print[2] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
doneMiB, totalMiB, remaining, speed)
if speed and total != inf:
eta = timestring(remaining / speed)
else:
eta = '99:59:59'
dynamic_print[3] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(
minspeed, meanspeed, maxspeed)
dynamic_print[4] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
started.strftime('%Y-%m-%d %H:%M:%S'), timestring(elapsed), eta)
if keyboard.is_pressed('space'):
if not singlethread:
pressed = datetime.now()
if (pressed - lastpressed).total_seconds() > 0.5:
lastpressed = pressed
if self.paused:
for i, md in enumerate(threads):
if not md.completed:
th = Thread(target=md.multidown, args=(
url, self.progress[i]['position'], self.progress[i]['end']))
th.start()
self.paused = not self.paused
if keyboard.is_pressed('esc'):
if not singlethread:
ended = datetime.now()
self.paused = True
break
if status == len(threads):
if not singlethread:
self.mm.close()
ended = datetime.now()
break
time.sleep(interval)
time_spent = (ended - started).total_seconds()
meanspeed = total / time_spent / 1048576
status = sum([i.completed for i in threads])
if status == len(threads):
print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))
else:
print('Task interrupted on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))
if __name__ == '__main__':
d = Downloader()
d.download(*sys.argv[1:])
For testing purposes this is a dumbed-down version of the script, with all checks removed while retaining the same functionality (sorry it really takes all these lines to show the download information):
import json
import os
import requests
import sys
import time
from collections import deque
from datetime import datetime, timedelta
from math import inf
from mmap import mmap
from pathlib import Path
from reprint import output
from threading import Thread
def timestring(sec):
sec = int(sec)
m, s = divmod(sec, 60)
h, m = divmod(m, 60)
return f'{h:02d}:{m:02d}:{s:02d}'
class Downloader:
def __init__(self):
self.recent = deque([0] * 12, maxlen=12)
self.recentspeeds = deque([0] * 200, maxlen=200)
self.paused = False
self.progress = dict()
self.UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
class Multidown:
def __init__(self, obj, id):
self.count = 0
self.position = 0
self.completed = False
self.id = id
self.parent = obj
self.UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
def multidown(self, url, start, end):
interrupted = False
s = requests.session()
s.headers.update({'connection': 'close', 'user-agent': self.UA})
r = s.get(
url, headers={'range': 'bytes={0}-{1}'.format(start, end)}, stream=True)
length = int(r.headers['content-length'])
while end - length + (self.id != self.parent.progress['connections'] - 1) != start or r.status_code != 206:
r.close()
s.close()
del r
del s
time.sleep(0.02)
s = requests.session()
r = s.get(
url, headers={'range': 'bytes={0}-{1}'.format(start, end)}, stream=True)
length = int(r.headers['content-length'])
self.position = start
for chunk in r.iter_content(1048576):
if self.parent.paused:
self.parent.mm.flush()
r.connection.close()
r.close()
s.close()
del r
del s
interrupted = True
break
if chunk:
self.parent.mm[self.position: self.position+len(chunk)] = chunk
self.count += len(chunk)
self.position += len(chunk)
self.parent.progress[self.id]['count'] = self.count
self.parent.progress[self.id]['position'] = self.position
if not interrupted:
r.close()
s.close()
if self.count == self.parent.progress[self.id]['length']:
self.completed = True
self.parent.progress[self.id]['completed'] = True
self.parent.mm.flush()
def download(self, url, filepath, num_connections=32, overwrite=False):
singlethread = False
threads = []
bcontinue = False
filepath = filepath.replace('\\', '/')
if Path(filepath).exists():
os.remove(filepath)
folder = '/'.join(filepath.split('/')[:-1])
Path(folder).mkdir(parents=True, exist_ok=True)
head = requests.head(url, headers={'user-agent': self.UA})
path = Path(filepath)
headers = head.headers
total = headers.get('content-length')
if total:
total = int(total)
if headers.get('accept-ranges'):
segment = total / num_connections
started = datetime.now()
lastpressed = started
path.touch()
file = path.open('wb')
file.seek(total - 1)
file.write(b'\0')
file.close()
file = path.open(mode='r+b')
self.mm = mmap(file.fileno(), 0)
print('Task started on %s.' %
started.strftime('%Y-%m-%d %H:%M:%S'))
self.progress['total'] = total
self.progress['connections'] = num_connections
for i in range(num_connections):
md = self.Multidown(self, i)
start = int(segment * i)
end = int(segment * (i + 1)) - (i != num_connections - 1)
length = end - start + (i != num_connections - 1)
th = Thread(target=md.multidown, args=(
url, start, end))
threads.append(md)
self.progress[i] = dict()
self.progress[i]['start'] = start
self.progress[i]['position'] = start
self.progress[i]['end'] = end
self.progress[i]['count'] = 0
self.progress[i]['length'] = length
self.progress[i]['completed'] = False
th.start()
Path(filepath + '.progress.json').write_text(json.dumps(self.progress, indent=4))
downloaded = 0
totalMiB = total / 1048576
speeds = []
interval = 0.04
with output(initial_len=5, interval=0) as dynamic_print:
while True:
Path(filepath + '.progress.json').write_text(json.dumps(self.progress, indent=4))
status = sum([i.completed for i in threads])
downloaded = sum(i.count for i in threads)
self.recent.append(downloaded)
done = int(100 * downloaded / total)
doneMiB = downloaded / 1048576
gt0 = len([i for i in self.recent if i])
if not gt0:
speed = 0
else:
recent = list(self.recent)[12 - gt0:]
if len(recent) == 1:
speed = recent[0] / 1048576 / interval
else:
diff = [b - a for a, b in zip(recent, recent[1:])]
speed = sum(diff) / len(diff) / 1048576 / interval
speeds.append(speed)
self.recentspeeds.append(speed)
nzspeeds = [i for i in speeds if i]
if nzspeeds:
minspeed = min(nzspeeds)
else:
minspeed = 0
maxspeed = max(speeds)
now = datetime.now()
elapsed = (now - started).total_seconds()
meanspeed = downloaded / elapsed / 1048576
remaining = totalMiB - doneMiB
dynamic_print[0] = '[{0}{1}] {2}'.format(
'\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed' + (not singlethread) * ', paused: {0}'.format(self.paused)
dynamic_print[1] = 'Download mode: ' + singlethread * \
'Single-thread' + (not singlethread) * 'Multi-thread (press Space to pause/resume, press Escape to stop)'
dynamic_print[2] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
doneMiB, totalMiB, remaining, speed)
if speed and total != inf:
eta = timestring(remaining / speed)
else:
eta = '99:59:59'
dynamic_print[3] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(
minspeed, meanspeed, maxspeed)
dynamic_print[4] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
started.strftime('%Y-%m-%d %H:%M:%S'), timestring(elapsed), eta)
if PAUSE:
if not singlethread:
pressed = datetime.now()
if (pressed - lastpressed).total_seconds() > 0.5:
lastpressed = pressed
if self.paused:
for i, md in enumerate(threads):
if not md.completed:
th = Thread(target=md.multidown, args=(
url, self.progress[i]['position'], self.progress[i]['end']))
th.start()
self.paused = not self.paused
if status == len(threads):
if not singlethread:
self.mm.close()
ended = datetime.now()
break
time.sleep(interval)
time_spent = (ended - started).total_seconds()
meanspeed = total / time_spent / 1048576
status = sum([i.completed for i in threads])
if status == len(threads):
print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))
else:
print('Task interrupted on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))
if __name__ == '__main__':
import hashlib
global PAUSE
PAUSE = False
chash = '5674E59283D95EFE8C88770515A9BBC80CBB77CB67602389FD91DEF26D26AED2'
d = Downloader()
if sys.argv[1] == '0':
d.download('http://ipv4.download.thinkbroadband.com/1GB.zip', 'C:/test/1GB.zip')
elif sys.argv[1] == '1':
th1 = Thread(target=d.download, args=('http://ipv4.download.thinkbroadband.com/1GB.zip', 'C:/test/1GB.zip'))
th1.start()
def test():
while th1.is_alive():
global PAUSE
PAUSE = not PAUSE
time.sleep(10)
th2 = Thread(target=test)
th2.start()
while th1.is_alive():
pass
sha256_hash = hashlib.sha256()
with open('C:/test/1GB.zip',"rb") as f:
for byte_block in iter(lambda: f.read(1048576),b""):
sha256_hash.update(byte_block)
print(sha256_hash.hexdigest().lower() == chash.lower())
The url isn't accessible without a VPN in my locale, and test 0 always results True, that is, if the connection hasn't gone dead during the download, and test 1 sometimes results True, sometimes results False, sometimes it doesn't finish(progress bar goes beyond 100%)...
How can my code be salvaged?
This might not be your only problem but you have a race condition that could show up if you pause and resume quickly (where the definition of quickly varies greatly depending on your circumstances). Consider that you've got 32 threads each requesting a MB chunk, let's call them threads 0-31. They are sitting their downloading and you pause. The threads do not know that you paused until they get a chunk of data as they are sitting in blocking io. Not sure what speed your connection is or how many cores your machine has (threads will sometimes act in parallel when they don't need the GIL,) but this process could take a lot longer than you expect. Then you unpause and your code creates new threads 32-63 but some or all of threads 0-31 are still waiting for the next chunk. You set threads 32-63 in motion and then you turn off your pause flag. Those threads that didn't end from 0-31 then wake up and see that things aren't paused. Now you have multiple threads accessing the same state variables
self.parent.mm[self.position: self.position + len(chunk)] = chunk
self.count += len(chunk)
self.position += len(chunk)
self.parent.progress[self.id]['count'] = self.count
self.parent.progress[self.id]['position'] = self.position
so if thread 0 is downloading the same chunk as thread 31 they both keep updating all the same state and they add to position and count even though they are downloading overlapping parts of the file. You even reuse the objects that the threads live inside of so that state can get really really messed up.
for i, md in enumerate(threads):
if not md.completed:
th = Thread(target=md.multidown, args=(url, self.progress[i]['position'], self.progress[i]['end']))
th.start()
There might be some other problems in your code and it is a lot to sort through so I suggest taking the time to do some refactoring to eliminate duplicate code and organise things into more functions. I don't believe in crazy tiny functions, but you could use a few sub functions like download_multi(download_state) and download_single maybe. I am relatively confident however that your current problem will be solved if you ensure the threads you have running actually end after you pause. To do so you need to actually hold references to your threads
somewhere:
actual_threads = []
When you create your threads (the first time and after you unpause, or preferably this would be in a function and you'd do it there and return the list):
th = Thread(target=md.multidown, args=(
url, start, end))
threads.append(md)
actual_threads.append(th)
Then when you unpause:
self.paused = not self.paused
for th in actual_threads:
th.join()
This way you have the threads working, they quit when you pause and you rebuild them. So join should return as soon as they break out of the blocking io call to iter_content. This way those threads are always dead before you make the new ones.
What I would do myself however would be to create sockets from each thread to the main process. When pause is detected the threads shut down the request and save any data that's already waiting in the OS buffer then go into a blocking receive on the socket (there might be a way to use select with a socket and requests to allow you to even break out of the blocking io involved in r.iter_content immediately but I leave that for your research). When the program is unpaused the main process would send some value to indicate the program should restart (you'd want at least two signals the threads would recognise, one for quitting gracefully and one to resume. The codes can be single characters.) When the value is sent to each thread that thread will unblock and can then restart the download using requests and its previous state like nothing happened.

Get a result of a log file parsing speed every 10 seconds in python

I have a python code to parse a 1 TB log file, but the problem is my result is shown after the parsing process is finished. So for that I need to wait for 12 hours, and after 12 hours only then the result is shown. I want to know how can I parse a log file and know the result of the parsing speed every 10 seconds.
This is my code:
import re
import timeit
log_file = '/Users/kiya/Desktop/mysql/ipscan/ip.txt'
output_file ='/Users/kiya/Desktop/mysql/ipscan/k2u.csv'
name_to_check = 'MBX_AUTHENTICATION_FAILED'
class Log_minning:
def __init__(self):
self.counter = 0
def get_userdata(self):
user_att = []
list_usr = []
counterr = 0
with open(log_file, encoding='utf-8') as infile:
for line in infile:
if name_to_check in line:
username = re.search(r'(?<=userName=)(.*)(?=,)', line)
username = username.group()
date = re.search(r"([12]\d{3}(0[1-9]|1[0-2])+"
"(0[1-9]|[12]\d|3[01]))", line)
date = date.group()
time = re.search(r"(\d{9}\+\d{4})", line)
time = time.group()
ip = re.search(
r'(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)'
'{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])',
line
)
ip = ip.group()
user_att.append(username)
user_att.append(date)
user_att.append(time)
user_att.append(ip)
list_usr.append(user_att)
counterr = counterr + 1
self.counter = counterr
return list_usr
if __name__ == "__main__":
lm = Log_minning()
the_time = timeit.Timer(lm.get_userdata).repeat(1, 1000)
sing_time = min(the_time)/1000
speed = 600 / sing_time * lm.counter
# for line in lm.get_userdata():
# print(line)
print(
"Processing " + str(lm.counter) + " in " + str(the_time) +
"\nThe speed aproximately " + str(speed) + " data in 10 sec"
)
This is the fully scanned seconds
Processing 117 in [6.646515152002394]

Processing big files using requests library and DictReader- Python

So I'm trying to process huge data files (> 1.6GB) using the requests library to deal with chunks of data.
import urllib2, json, csv
import requests
import multiprocessing
def getTaxiTrips(date):
"""
Gets the taxi trips occurred in NY from a starting date.
:param date: (Y-m-d).
:return: list of tuples (long, lat, drop off date).
"""
today = str(datetime.date(datetime.now())).split('-')
today_y = today[0]
today_m = today[1]
start = date.split('-')
start_y = start[0]
start_m = start[1]
print start_m+"-"+start_y +" / "+today_m+"-"+today_y
data = []
y = int(start_y)
m = int(start_m)
while int(start_y) <= int(today_y):
# Month transformation
if m > 12:
m %= 12
y += 1
mt = str(m) if m > 9 else '0' + str(m)
# Green cabs
if readCSV("https://storage.googleapis.com/tlc-trip-data/" + str(y) + \
"/green_tripdata_" + str(y) + "-" + mt + ".csv") is not None:
data.append("https://storage.googleapis.com/tlc-trip-data/" + str(y) + \
"/green_tripdata_" + str(y) + "-" + mt + ".csv")
if m == int(today_m):
break
m += 1
pool = multiprocessing.Pool(mps-1)
result = pool.map(consumeTaxiData, data)
pool.close()
pool.join()
return list(itertools.chain(*result))
def consumeTaxiData(url):
"""
Given a url, reads its content and process its data.
:param url: the url to be readen.
:return: a list of tuples in the form (long, lat, hour).
"""
print "Processing", url
points = []
r = requests.get(url, stream=True)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
reader = csv.DictReader(chunk.splitlines(), delimiter=',')
for line in reader:
print line
latitude = line.get('dropoff_latitude', None)
if latitude is None:
latitude = line.get('Dropoff_latitude', None)
longitude = line.get('dropoff_longitude', None)
if longitude is None:
longitude = line.get('Dropoff_longitude', None)
time = line.get('tpep_dropoff_datetime', None)
if time is None:
time = line.get('Lpep_dropoff_datetime', None)
if time is not None and latitude is not None and longitude is not None and \
datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= datetime.strptime(date, '%Y-%m-%d'):
time = roundTime(datetime.strptime(time, '%Y-%m-%d %H:%M:%S'), roundTo=60 * 60).hour
points.append((longitude, latitude, time))
return points
This is one file example:
https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv
I'm not sure how to use this idea properly because the first line of the file is a header that specifies the key of some content that I need to capture. This may change among files, so I though about reading it using csv.DictRead. But I don't know if this works among chunks, because the header would be captured just in the first chunk, right? Is there a way to preserver the keys and be able to use csv.DictReader?

Categories

Resources