How to assign luigi parameter using list in task wrapper - python

I am using luigi to extract different user actions and save each one as a CSV simultaneously.
The idea is to look at my source data, find the unique actions, and create a CSV named after each of those actions.
class data_filter(luigi.Task):
    task = luigi.Parameter()

    def run(self):
        data_filter = full_file[full_file['properties_url'].str.contains(task)]
        data_filter.to_csv('/Users/Documents/Data/' + str(task) + '.csv')

    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget('/Users/Documents/Data/' + str(task) + '.csv')
# chaining tasks with wrapper
class wrapper(luigi.WrapperTask):
    def requires(self):
        file = pd.read_csv('/Users/Desktop/attr.csv')
        actions = file.utm_source.unique()
        task_list = []
        for current_task in actions:
            task_list.append(data_filter(task=current_task))
        return task_list

    def run(self):
        print('Wrapper has ended')
        pd.DataFrame().to_csv('/Users/Documents/Data/wrangle.csv')

    def output(self):
        return luigi.LocalTarget('/Users/Documents/Data/dwrangle.csv')

if __name__ == '__main__':
    luigi.run(wrapper())
The wrapper should tie everything up by looking at all unique actions, assigning them to task_list, and running task_list, while assigning the current action I am iterating over to task = luigi.Parameter() in my data_filter class.
However, this returns the error message:
return luigi.LocalTarget('/Users/emmanuels/Documents/GitHub/Springboard-DSC/Springboard-DSC/Capstone 1 - Attribution Model/Data/'+str(task)+'.csv')
NameError: name 'task' is not defined
and
===== Luigi Execution Summary =====
Scheduled 1 tasks of which:
* 1 failed scheduling:
- 1 wrapper()
Did not run any tasks
This progress looks :( because there were tasks whose scheduling failed
I just want to figure out what I am doing wrong.
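For what it's worth, the NameError points at the parameter being referenced as a bare name: inside run() and output(), a luigi.Parameter is exposed as an instance attribute, so it has to be read as self.task rather than task. A minimal sketch of the corrected data_filter, assuming full_file is the source DataFrame loaded elsewhere as in the question:
class data_filter(luigi.Task):
    task = luigi.Parameter()

    def output(self):
        # Parameters become instance attributes: self.task, not task
        return luigi.LocalTarget('/Users/Documents/Data/' + str(self.task) + '.csv')

    def run(self):
        filtered = full_file[full_file['properties_url'].str.contains(str(self.task))]
        filtered.to_csv(self.output().path)
The wrapper's requires() can keep building data_filter(task=current_task) exactly as it already does.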

Related

How to determine the status after a celery task has been completed inside code?

Sample logic
logic.py
@shared_task
def run_create_or_update_google_creative():
    return create_or_update_google_creative()

def create_or_update_google_creative():
    # do some logic
    ...

def run_db_sinc():
    result = run_create_or_update_google_creative.delay()
    job = CeleryJobResult(job_id=result.task_id, status=result.status)
    job.save()
    return 201, job.id
This is the structure of my celery task call logic. First I call run_db_sinc; a new celery task is generated and I immediately get the task_id value, which I save in the database and send as a response to the frontend. As long as the status is PENDING, the frontend keeps polling an endpoint that looks up the task_id status in the database.
My question is: how do I know that the task has completed and the status has changed to SUCCESS? At what point, and how, should I do that? I know that it is possible to use a function similar to this:
from celery.result import AsyncResult

def get_task_status(task_id):
    task = AsyncResult(task_id)
    if task.status == 'SUCCESS':  # or the task has ended already
        job = CeleryJobResult.objects.get(job_id=task_id)
        job.status = task.status
        job.save()
    return task.status
But I can't understand at what point in time and where in my code to call it.
I think I've come up with an option that works: using threading inside run_db_sinc.
import threading
import time

from celery import shared_task
from celery.result import AsyncResult

@shared_task
def run_create_or_update_google_creative():
    return create_or_update_google_creative()

def create_or_update_google_creative():
    # do some logic
    ...

def get_task_status(task_id: str) -> str:
    task = AsyncResult(task_id)
    return task.status

def check_task_status(task_id: str) -> None:
    status = get_task_status(task_id)
    while status not in ('custom status', ):
        time.sleep(1)
        status = get_task_status(task_id)
    job = CeleryJobResult.objects.get(job_id=task_id)
    job.status = status
    job.save()
    logger.info(f"Task {task_id} completed with status: {status}")
    return None

def run_db_sinc():
    result = run_create_or_update_google_creative.delay()
    job = CeleryJobResult(job_id=result.task_id, status=result.status)
    job.save()
    t = threading.Thread(target=check_task_status, args=(result.task_id,))
    t.start()
    return 201, job.id
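A thread per submitted task does work, but as a sketch of an alternative (assuming the same CeleryJobResult model and the polling endpoint described above; the view name and URL pattern here are made up), the status check can simply live in the endpoint the frontend already polls, so the database row is updated by the request that first observes a terminal state:
from celery.result import AsyncResult
from django.http import JsonResponse

# Hypothetical polling view, e.g. GET /jobs/<task_id>/status/
def job_status(request, task_id):
    task = AsyncResult(task_id)
    if task.status in ('SUCCESS', 'FAILURE'):
        # Persist the terminal state the first time it is seen
        job = CeleryJobResult.objects.get(job_id=task_id)
        job.status = task.status
        job.save()
    return JsonResponse({'task_id': task_id, 'status': task.status})
This keeps the check on the request path instead of holding a background thread open in the web worker for every task.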

Django: asynchronous service development through task list sharing

I am creating a web application with Django 3.1.0 and Python 3.8. This application has to run a very long back-office operation, and I want to know its percentage of progress. To do this, the server exposes two services:
www.contoso.com/process_start: returns a string to indicate the id assigned to the task;
www.contoso.com/process_status/taskid: returns a number from 0 to 1 to indicate the progress (as a taskid I use the id returned by the process_start call);
To do this I have created the following Python class:
TASKS = dict()  # list of tasks in progress

class Task():
    def __init__(self):
        id = get_uid()  # generation of a random string
        TASKS[id] = self  # added to the list of tasks in progress
        self.id = id
        self.percentage = 0.0
        self.thread = Thread(name=self.id, target=self.execute)
        self.thread.start()

    @staticmethod
    def get_status(id: str) -> float:
        task = TASKS.get(id, None)
        return (1.0 if task == None else task.percentage)

    def execute(self) -> None:
        try:
            ...
        finally:
            TASKS.pop(self.id, None)  # removal from the list of tasks in progress
The first service creates the task like this:
def post(self, request: Request) -> Response:
    task = TaskReslice()
    return Response(status=status.HTTP_200_OK, data=task.id)
The second service returns the status of the task in this way:
def get(self, request: Request, *args, **kwargs) -> Response:
    task_id: str = kwargs["id"]
    return Response(status=status.HTTP_200_OK, data=Task.get_status(task_id))
The problem is that the second service can't find any running tasks. What's the problem?
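One common explanation (an assumption about the deployment, not something stated in the question) is that a module-level TASKS dict only exists inside one process: if the server runs several worker processes, or reloads, the request that starts the task and the request that polls it can land in different processes, so the poller sees an empty dict and get_status falls back to 1.0. A minimal sketch of sharing progress through a store every process can reach, here Django's cache framework (this only helps if the cache backend is shared, e.g. Redis or Memcached, not the per-process LocMemCache; the key names are made up for illustration):
from threading import Thread
from django.core.cache import cache

class Task():
    def __init__(self):
        self.id = get_uid()  # get_uid() as in the question
        cache.set(f"task_progress:{self.id}", 0.0, timeout=3600)
        self.thread = Thread(name=self.id, target=self.execute)
        self.thread.start()

    @staticmethod
    def get_status(id: str) -> float:
        # A missing or expired key is treated as "finished", as in the original
        value = cache.get(f"task_progress:{id}")
        return 1.0 if value is None else value

    def execute(self) -> None:
        try:
            ...  # long-running work, periodically calling cache.set(f"task_progress:{self.id}", pct)
        finally:
            cache.delete(f"task_progress:{self.id}")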

How to check what coroutine is completed after asyncio.wait

Consider the following code:
import random
import asyncio

class RandomLife(object):
    def __init__(self, name: str):
        self.name = name
        self.coro = asyncio.sleep(random.randrange(0, 5))

    def __await__(self):
        return self.coro.__await__()

async def main():
    objects = [RandomLife("one"), RandomLife("two"), RandomLife("three")]
    finished, unfinished = await asyncio.wait(objects, return_when=asyncio.FIRST_COMPLETED)
    print(finished)
    await asyncio.wait(unfinished)

if __name__ == "__main__":
    asyncio.run(main())
After the first asyncio.wait I want to know which instance of RandomLife has completed, but the finished variable is a set of Tasks rather than RandomLife instances. How do I map a task back to its RandomLife? Is it possible?
As the documentation warns:
Note wait() schedules coroutines as Tasks automatically and later returns those implicitly created Task objects in (done, pending) sets. Therefore the following code won’t work as expected:
async def foo():
    return 42

coro = foo()
done, pending = await asyncio.wait({coro})

if coro in done:
    # This branch will never be run!
Here is how the above snippet can be fixed:
async def foo():
    return 42

task = asyncio.create_task(foo())
done, pending = await asyncio.wait({task})

if task in done:
    # Everything will work as expected now.
We can employ the same trick. First, we wrap all the coroutines in tasks, and then set up a mapping from each created task to its RandomLife instance:
import random
import asyncio

class RandomLife(object):
    def __init__(self, name: str):
        self.name = name
        self.coro = asyncio.sleep(random.randrange(0, 5))

    def __await__(self):
        return self.coro.__await__()

async def main():
    objects = [RandomLife("one"), RandomLife("two"), RandomLife("three")]
    # Wrap all the coros in tasks, as the documentation suggests.
    tasks = [asyncio.create_task(o.coro) for o in objects]
    # Set up a mapping from the tasks created to the RandomLife instances.
    task2life = dict(zip(tasks, objects))
    finished, unfinished = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    # Get the first finished task.
    finished_task = list(finished)[0]
    # Map it back to its RandomLife instance.
    finished_life = task2life[finished_task]
    print(finished_life.name)
    await asyncio.wait(unfinished)

if __name__ == "__main__":
    asyncio.run(main())
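Since return_when=asyncio.FIRST_COMPLETED can hand back more than one finished task when several complete in the same event-loop iteration, the same mapping also covers that case:
# Map every finished task back to its RandomLife instance.
for finished_task in finished:
    print(task2life[finished_task].name)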

How to track progress of job worker threads when threads are initiated from a Job Processor?

I have a scenario where I get a list of jobs to be processed (e.g. a list of web pages to be crawled from the internet). Each job is independent and the jobs can be processed in any order. Individual jobs may fail or succeed and may have to be handled accordingly (e.g. temporary data for a failed crawl may have to be deleted and recrawled in the next round).
I am trying to implement this using thread-based processing in Python. To mimic the actual task, let's say I have a huge list of integer arrays and each individual job is to compute the Sum or Product of an array. What I am trying to do is use a JobsProcessor object to instantiate threads of JobWorker objects, which perform the actual processing by creating objects of other classes (Sum and Product here). A snippet is shown below:
from queue import Queue, Empty
from threading import Thread
import time

class Product:
    def __init__(self, data):
        self.data = data

    def doOperation(self):
        try:
            product = self.data[0]
            for d in self.data[1:]:
                if d > 100000:
                    raise Exception("Forcefully throwing exception")
                product *= d
                time.sleep(1)
            return product
        except:
            return "product computation failed"

class Sum:
    def __init__(self, data):
        self.data = data

    def doOperation(self):
        try:
            sum = 0
            for d in self.data:
                sum += d
                time.sleep(1)
            return sum
        except:
            return "sum computation failed"

class JobWorker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                jobitem = self.queue.get_nowait()
                if jobitem is None:
                    break
                jobdata, optype = jobitem
                if optype == 'sum':
                    opobj = Sum(jobdata)
                    jobresult = opobj.doOperation()
                elif optype == 'product':
                    opobj = Product(jobdata)
                    jobresult = opobj.doOperation()
                else:
                    print("Invalid op type")
                    jobresult = 'Failed'
                print("job result", jobresult)
                self.queue.task_done()
            except Empty:
                break
            except:
                print("Some exception occurred")
                # How to pass it up to the main jobs processor?

class JobsProcessor(object):
    def __init__(self, joblist):
        self.joblist = joblist
        self.job_queue = Queue()

    def process_resources(self):
        try:
            for job in self.joblist:
                self.job_queue.put(job)
            for i in range(2):
                jobthread = JobWorker(self.job_queue)
                jobthread.start()
            '''
            Write code here to monitor current status for all running jobs
            '''
            self.job_queue.join()
            '''I want to write code here to track progress status for all jobs.
            Some jobs may have failed or not completed, and based on that I may
            want to take further action such as retry or flag them'''
            print("Finished Jobs")
        except:
            pass

orgjobList = [([1, 5, 9, 4], 'sum'),
              ([5, 4, 5, 8], 'product'),
              ([100, 45, 678, 999], 'product'),
              ([3743, 34, 44324, 543], 'sum'),
              ([100001, 100002, 9876, 83989], 'product')]

mainprocessor = JobsProcessor(orgjobList)
mainprocessor.process_resources()
I want to add 2 functionalities to this process.
Consolidation: when all the job threads complete, I want to know the status of all the JobWorker objects (e.g. whether they completed successfully or failed). A failure/exception may occur in the JobWorker object or even in the Sum or Product object. The failure/success status should be propagated back to JobsProcessor, where I want to perform further actions such as reprocess/delete/send_elsewhere based on the returned status.
Monitoring: I also want a Monitor functionality which can continuously check the status of currently running/completed jobs and take the requisite actions, such as deleting immediately rather than waiting until the end for Consolidation.
Please advise how I can add the above functionalities, and whether only one of them would suffice for cases such as crawling pages. Any other suggestions are also welcome.
You can add both functionalities to your code in either of two ways:
Using global variables (simplest approach)
Using getProgress and getStatus methods in your class (elegant approach)
You can create two threads: one thread does the actual work and updates the progress variable, while the other reads it.
For the second approach, you can set a few instance variables in __init__, like the following:
def __init__(self):
    self.progress = 0
    self.success = True
    self.isDone = False
    self.error = "No Error Occurred"
Then you can include the logic in your code like the following:
def actualWork(self):
    self.isDone = False
    try:
        for i in range(1000):
            self.progress = i
            time.sleep(0.01)
        self.isDone = True
    except Exception as e:
        self.success = False
        self.error = str(e)

def getProgress(self):
    return self.progress

def getError(self):
    return self.error
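To connect this to the Consolidation requirement from the question, here is a minimal sketch (my own variation, reusing the Thread, Empty, Sum and Product names from the question's code, and assuming doOperation raises on failure instead of swallowing the exception) in which every worker appends a small status record to a shared list that JobsProcessor inspects after job_queue.join(); the names job_statuses and status_lock are made up for illustration:
import threading

# Shared between JobsProcessor and its workers
job_statuses = []
status_lock = threading.Lock()

class JobWorker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                jobitem = self.queue.get_nowait()
            except Empty:
                break
            jobdata, optype = jobitem
            try:
                opobj = Sum(jobdata) if optype == 'sum' else Product(jobdata)
                status = {'job': jobitem, 'ok': True, 'result': opobj.doOperation()}
            except Exception as e:
                status = {'job': jobitem, 'ok': False, 'error': str(e)}
            with status_lock:
                job_statuses.append(status)  # consolidation data
            self.queue.task_done()

# In JobsProcessor.process_resources(), after self.job_queue.join():
#     failed = [s for s in job_statuses if not s['ok']]
#     # retry, flag or delete the failed jobs here
For the Monitoring part, a separate thread (or the main thread, instead of blocking on join()) can periodically read job_statuses while the workers are still running and act on failures as soon as they appear.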

Is it possible to inherit multiprocessing.Process to communicate with the main process

I'm trying to inherit a subclass from multiprocessing.Process which will have a queue for each instance, so that the queue can be used to catch the return value of the target.
The problem is that multiprocessing.Process.start() uses a Popen (https://github.com/python/cpython/blob/master/Lib/multiprocessing/process.py) to create a process and run the target inside it. Is there a way to overload this without defining/overloading the entire Process module?
This is what I'm trying to do:
class Mprocessor(multiprocessing.Process):
    def __init__(self, **kwargs):
        multiprocessing.Process.__init__(self, **kwargs)
        self._ret = Queue.Queue()

    def run(self):
        self._ret.put(multiprocessing.Process.run(self))

    def getReturn(self):
        if self._ret.empty():
            return None
        return self._ret.get()
Here I try to create a multiprocessing.Queue inside the class.
I override the run method so that, when it is executed, the return value(s) of the target are put inside the queue.
I have a getReturn method which is called in the main function using the Mprocessor object. This method should only be called once Mprocessor.is_alive() (which is defined for multiprocessing.Process) returns False.
But this mechanism is not working, because when I call Mprocessor.start() it creates a subprocess which runs the target in its own environment.
I want to know if there's a way to use the queue in the start method to get the return value, and avoid requiring the target to take a queue argument to communicate.
I wanted to generalize this module.
I don't want my methods to be defined to take a queue just to return a value.
I want a module that can be applied to any function, because I am planning to have a manager method which takes a dict["process_name/ID" : methods/targets] and a dict["process_name/ID" : [argument_list]], creates a process for each of these targets, and returns a dict["process_name/ID" : (return tuple, )].
Any ideas will be welcomed.
EDIT
Manager function:
def Processor_call(func = None, func_args = None):
    if sorted(func.keys()) != sorted(func_args.keys()):
        print "Names in func dict and args dict doesn't match"
        return None
    process_list = multiprocessing.Queue()
    for i in func.keys():
        p = Mprocessor(name = i, target = func[i], args = tuple(func_args[i]))
        process_list.put(p)
        p.start()
    return_dict = {}
    while not process_list.empty():
        process_wait = process_list.get()
        if not process_wait.is_alive():
            process_wait.join()
            if process_wait.exitcode == 0:
                return_dict[process_wait.name] = process_wait.getReturn()
            else:
                print "Error in process %s, status not available" % process_wait.name
        else:
            process_list.put(process_wait)
    return return_dict
EDIT: The target function should look like this.
def sum(a, b):
    return a + b
I don't want to pass a queue into the function and return values through the queue.
I want to make a common module so that any existing method can use multiprocessing without any change to its definition, so the interface with other modules is maintained.
I don't want a function to be designed only to be run as a process; I want a common interface so that other modules can also use the function as a normal method, without having to read from a queue to get the return value.
Comment: ... so that I'll get the return value from the process started from start method
This will work for me, for instance:
class Mprocessor
class Mprocessor(multiprocessing.Process):
    def __init__(self, queue, **kwargs):
        multiprocessing.Process.__init__(self, **kwargs)
        self._ret = queue

    def run(self):
        return_value = self._target(*self._args)
        self._ret.put((self.name, return_value))
        time.sleep(0.25)
        exit(0)
Start processes and wait for return values
def Processor_call(func=None, func_args=None):
    print('func=%s, func_args=%s' % (func, func_args))
    ret_q = multiprocessing.Manager().Queue()
    process_list = []
    for i in func.keys():
        p = Mprocessor(name=i, target=func[i], args=(func_args[i],), queue=ret_q)
        p.start()
        process_list.append(p)
        time.sleep(0.1)
    print('Block __main__ until all process terminated')
    for p in process_list:
        p.join()
    print('Aggregate alle return values')
    return_dict = {}
    while not ret_q.empty():
        p_name, value = ret_q.get()
        return_dict[p_name] = value
    return return_dict
__main__
if __name__ == '__main__':
    rd = Processor_call({'f1':f1, 'f2':f1}, {'f1':1, 'f2':2})
    print('rd=%s' % rd)
Output:
func={'f1': <function f1 at 0x...>, 'f2': <function f1 at 0x...>}, func_args={'f1': 1, 'f2': 2}
pid:4501 start 2
pid:4501 running
pid:4500 start 1
pid:4500 running
Block __main__ until all process terminated
pid:4501 running
pid:4500 running
pid:4501 running
pid:4500 running
pid:4501 Terminate
pid:4500 Terminate
Aggregate alle return values
rd={'f1': 1, 'f2': 2}
Tested with Python 3.4.2 and 2.7.9
Question: Is it possible to inherit multiprocessing.Process to communicate with the main process
Yes, it's possible. But not by using a class object, as your process uses its own copy of the class object.
You have to use a global Queue object and pass it to your process.
