Broadcast a user defined class in Spark

Broadcast a user defined class in Spark - python

I am trying to broadcast a user-defined variable in a PySpark application, but I always get the following error:
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/home/.../sparkbroad.py", line 29, in <lambda>
output = input_.map(lambda item: b.value.map(item))
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/broadcast.py", line 106, in value
self._value = self.load(self._path)
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/broadcast.py", line 97, in load
return pickle.load(f)
AttributeError: 'module' object has no attribute 'FooMap'
The code, in the module sparkbroad.py, is the following:
import random
import pyspark as spark
class FooMap(object):
    """Lookup table that maps each integer key 0..9 to twice its value."""

    def __init__(self):
        # Keys 0..9, each mapped to 2 * key.
        self._map = {key: 2 * key for key in range(10)}

    def map(self, value):
        """Return the entry stored for *value*, or -1 when *value* is not a key."""
        return self._map.get(value, -1)
class FooMapJob(object):
def __init__(self, inputs):
self._inputs = inputs
self._foomap = FooMap()
def run(self):
sc = spark.SparkContext('local', 'FooMap')
input_ = sc.parallelize(self._inputs, 4)
b = sc.broadcast(self._foomap)
output = input_.map(lambda item: b.value.map(item))
b.unpersist()
result = list(output.toLocalIterator())
sc.stop()
return result
def main():
inputs = [random.randint(0, 10) for _ in range(10)]
job = FooMapJob(inputs)
print(job.run())
if __name__ == '__main__':
main()
and I am running it with:
:~$ spark-submit --master local[4] --py-files sparkbroad.py sparkbroad.py
where I have added the --py-files argument, but it doesn't seem to make any difference. Unfortunately, I could not find any example online dealing with broadcasting of complex classes (just lists or dictionaries). Any hint is appreciated. Thanks in advance.
UPDATE: after placing the FooMap class in a separate module, everything seems to work fine, even without the --py-files directive.

After placing the FooMap class in a separate module, everything works fine.

This is an addition to the previous answer.
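You should import FooMap from a separate module instead of defining it in the script that is run as __main__. A class defined in __main__ is pickled by reference to that module, and on the worker __main__ is a different module that has no FooMap, hence the AttributeError.
For example: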
in foo_map.py:
class FooMap(object):
    """Lookup table that maps each integer key 0..9 to twice its value.

    Kept in its own importable module (foo_map.py) so that a pickled
    instance refers to ``foo_map.FooMap`` rather than to a class in the
    submitted script.
    """

    def __init__(self):
        # Keys 0..9, each mapped to 2 * key.
        self._map = {key: 2 * key for key in range(10)}

    def map(self, value):
        """Return the entry stored for *value*, or -1 when *value* is not a key."""
        return self._map.get(value, -1)
then in sparkbroad.py:
import random

import pyspark as spark

from foo_map import FooMap


class FooMapJob(object):
    """Run every input through a broadcast ``FooMap`` on a local SparkContext."""

    def __init__(self, inputs):
        self._inputs = inputs
        self._foomap = FooMap()

    def run(self):
        """Return the list of ``FooMap.map`` results, one per input item."""
        sc = spark.SparkContext('local', 'FooMap')
        try:
            input_ = sc.parallelize(self._inputs, 4)
            b = sc.broadcast(self._foomap)
            output = input_.map(lambda item: b.value.map(item))
            # map() is lazy: the broadcast value is only read while the
            # results are being collected, so release it afterwards.
            result = list(output.toLocalIterator())
            b.unpersist()
            return result
        finally:
            # Always shut the context down, even when the job fails.
            sc.stop()


def main():
    # randint is inclusive, so 10 (not a FooMap key) can occur and maps to -1.
    inputs = [random.randint(0, 10) for _ in range(10)]
    job = FooMapJob(inputs)
    print(job.run())


if __name__ == '__main__':
    main()

Related

raise AttributeError(name) AttributeError: LCC_GetChannelHandle

I am very new to Python cffi. I have to access my temperature module either by its index or by its channel name. I am trying both, as you can see in my QmixTC class, and I am getting an AttributeError. My other classes do not produce any errors. Can someone help me understand where the problem is? I am including my code as well as the error traceback. Thanks.
main code with name qmix.py (importing it in to sample code):
class QmixTC (object):
"""
"""
def __init__(self, index=0, handle=None,name=''):
self.dll_dir = DLL_DIR
self.dll_file = os.path.join(self.dll_dir,
'labbCAN_Controller_API.dll')
self._ffi = FFI()
self._ffi.cdef(CONTROLLER_HEADER)
self._dll = self._ffi.dlopen(self.dll_file)
self._handle = self._ffi.new('dev_hdl *', 0)
if handle is None:
self.index = index
self._handle = self._ffi.new('dev_hdl *', 0)
self._call('LCC_GetChannelHandle', self.index, self._handle)
else:
self.index = None
self._handle = handle
self._ch_name="QmixTC_1_DO0_INA"
self._channel = self._ch_name + str(index)
self._call('LCC_LookupChanByName',
bytes(self._channel,'utf8'),
self._handle)
self.name = name
def _call(self, func_name, *args):
func = getattr(self._dll, func_name)
r = func(*args)
r = CHK(r, func_name, *args)
return r
def Setpoint_write (self, setpoint):
"""
Write setpoint value to controller device.
Parameters
[in] ChanHdl Valid handle of open controller channel
[in] fSetPointValue The setpoint value to write
Returns
Error code - ERR_NOERR indicates success
"""
self._call('LCC_WriteSetPoint', self._handle[0], setpoint)
def enable_controllLoop (self, enable):
"""
Enables / disables a control loop.
If the control loop is enabled, then the output value is calculated periodically.
Parameters
ChanHdl Valid handle of a controller channel
Enable 1 = enable, 0 = disable
Returns
Error code - ERR_NOERR indicates success
"""
self._call('LCC_EnableControlLoop', self._handle[0], enable)
def read_temp_value (self, actualvalue):
"""
Read actual value from device.
Parameters
[in] ChanHdl Valid handle of open controller channel
[out] pfActualValue Returns the actual controller value
Returns
Error code - ERR_NOERR indicates success
"""
self._call('LCC_ReadActualValue', self._handle[0], actualvalue)
if __name__ == '__main__':
import os.path as op
dll_dir = op.normpath('C:\\Users\\Ravikumar\\AppData\\Local\\QmixSDK')
config_dir = op.normpath('C:\\Users\\Public\\Documents\\QmixElements\\Projects\\QmixTC_Pump\\Configurations\\QmixTC_pump')
bus = QmixBus(config_dir=config_dir)
bus.open()
bus.start()
controller_0 = QmixTC(index=0)
controller_0.enable_controllLoop(1)
sample program:
from __future__ import division, print_function
from win32api import GetSystemMetrics
import numpy as np
import os
import qmix
import pandas as pd
#%% CHANNEL INITIALIZATION
if __name__ == '__main__':
dll_dir = ('C:\\Users\\Ravikumar\\AppData\\Local\\QmixSDK')
config_dir = ('C:\\Users\\Public\\Documents\\QmixElements\\Projects\\QmixTC_test1\\Configurations\\QmixTC_test1')
qmix_bus = qmix.QmixBus(config_dir=config_dir,dll_dir=dll_dir)
qmix_bus.open()
qmix_bus.start()
controller_0 = qmix.QmixTC(index=0)
controller_0.Setpoint_write(50)
error:
Traceback (most recent call last):
File "<ipython-input-5-40d4a3db9493>", line 17, in <module>
controller_0 = qmix.QmixTC(index=0)
File "qmix.py", line 921, in __init__
self._call('LCC_GetChannelHandle', self.index, self._handle)
File "qmix.py", line 937, in _call
func = getattr(self._dll, func_name)
File "C:\Users\Ravikumar\Anaconda2\lib\site-packages\cffi\api.py", line 875, in __getattr__
make_accessor(name)
File "C:\Users\Ravikumar\Anaconda2\lib\site-packages\cffi\api.py", line 870, in make_accessor
raise AttributeError(name)
AttributeError: LCC_GetChannelHandle

python multiprocessing with multiple arguments

I'm trying to multiprocess a function that does multiple actions for a large file, but I'm getting the well-known pickling error even though I'm using partial.
The function looks something like this:
def process(r,intermediate_file,record_dict,record_id):
res=0
record_str = str(record_dict[record_id]).upper()
start = record_str[0:100]
end= record_str[len(record_seq)-100:len(record_seq)]
print sample, record_id
if r=="1":
if something:
res = something...
intermediate_file.write("...")
if something:
res = something
intermediate_file.write("...")
if r == "2":
if something:
res = something...
intermediate_file.write("...")
if something:
res = something
intermediate_file.write("...")
return res
The way I'm calling it from another function is the following:
def call_func():
intermediate_file = open("inter.txt","w")
record_dict = get_record_dict() ### get infos about each record as a dict based on the record_id
results_dict = {}
pool = Pool(10)
for a in ["a","b","c",...]:
if not results_dict.has_key(a):
results_dict[a] = {}
for b in ["1","2","3",...]:
if not results_dict[a].has_key(b):
results_dict[a][b] = {}
results_dict[a][b]['res'] = []
infile = open(a+b+".txt","r")
...parse the file and return values in a list called "record_ids"...
### now call the function based on for each record_id in record_ids
if b=="1":
func = partial(process,"1",intermediate_file,record_dict)
res=pool.map(func, record_ids)
## append the results for each pair (a,b) for EACH RECORD in the results_dict
results_dict[a][b]['res'].append(res)
if b=="2":
func = partial(process,"2",intermediate_file,record_dict)
res = pool.map(func, record_ids)
## append the results for each pair (a,b) for EACH RECORD in the results_dict
results_dict[a][b]['res'].append(res)
... do something with results_dict...
The idea is that for each record inside the record_ids, I want to save the results for each pair (a,b).
I'm not sure what is giving me this error:
File "/code/Python/Python-2.7.9/Lib/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/code/Python/Python-2.7.9/Lib/multiprocessing/pool.py", line 558, in get
raise self._value
cPickle.PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed

func is not defined at the top level of the code so it can't be pickled.
You can use pathos.multiprocessing, which is not a standard module, but it will work.
Or use something different from Pool.map, maybe a queue of workers?
https://docs.python.org/2/library/queue.html
At the end of that page there is an example you can use; it is written for threading, but multiprocessing is very similar and also provides queues:
https://docs.python.org/2/library/multiprocessing.html#pipes-and-queues

Returning multiple lists from pool.map processes?

Win 7, x64, Python 2.7.12
In the following code I am setting off some pool processes to do a trivial multiplication via the multiprocessing.Pool.map() method. The output data is collected in myList_1.
NOTE: this is a stripped down simplification of my actual code. There are multiple lists involved in the real application, all huge.
import multiprocessing
import numpy as np
def createLists(branches):
firstList = branches[:] * node
return firstList
def init_process(lNodes):
global node
node = lNodes
print 'Starting', multiprocessing.current_process().name
if __name__ == '__main__':
mgr = multiprocessing.Manager()
nodes = mgr.list()
pool_size = multiprocessing.cpu_count()
branches = [i for i in range(1, 21)]
lNodes = 10
splitBranches = np.array_split(branches, int(len(branches)/pool_size))
pool = multiprocessing.Pool(processes=pool_size, initializer=init_process, initargs=[lNodes])
myList_1 = pool.map(createLists, splitBranches)
pool.close()
pool.join()
I now add an extra calculation to createLists() & try to pass back both lists.
import multiprocessing
import numpy as np
def createLists(branches):
firstList = branches[:] * node
secondList = branches[:] * node * 2
return firstList, secondList
def init_process(lNodes):
global node
node = lNodes
print 'Starting', multiprocessing.current_process().name
if __name__ == '__main__':
mgr = multiprocessing.Manager()
nodes = mgr.list()
pool_size = multiprocessing.cpu_count()
branches = [i for i in range(1, 21)]
lNodes = 10
splitBranches = np.array_split(branches, int(len(branches)/pool_size))
pool = multiprocessing.Pool(processes=pool_size, initializer=init_process, initargs=[lNodes])
myList_1, myList_2 = pool.map(createLists, splitBranches)
pool.close()
pool.join()
This raises the following error and traceback:
Traceback (most recent call last):
File "<ipython-input-6-ff188034c708>", line 1, in <module>
runfile('C:/Users/nr16508/Local Documents/Inter Trab Angle/Parallel/scratchpad.py', wdir='C:/Users/nr16508/Local Documents/Inter Trab Angle/Parallel')
File "C:\Users\nr16508\AppData\Local\Continuum\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\Users\nr16508\AppData\Local\Continuum\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/nr16508/Local Documents/Inter Trab Angle/Parallel/scratchpad.py", line 36, in <module>
myList_1, myList_2 = pool.map(createLists, splitBranches)
ValueError: too many values to unpack
When I tried to put both lists into one to pass them back, i.e. ...
return [firstList, secondList]
......
myList = pool.map(createLists, splitBranches)
...the output becomes too jumbled for further processing.
Is there a method of collecting more than one list from pooled processes?

This question has nothing to do with multiprocessing or threadpooling. It is simply about how to unzip lists, which can be done with the standard zip(*...) idiom.
myList_1, myList_2 = zip(*pool.map(createLists, splitBranches))

TypeError: Popen not iterable

I am rewriting a program from a previous question, and I am having trouble with it. Please see the code:
#!/usr/bin/python
import subprocess,time, timeit
from multiprocessing import Process, Queue
import re, os, pprint, math
from collections import defaultdict
Dict = {}
identifier = ""
hexbits = []
count = defaultdict(int)
def __ReadRX__(RX_info):
lines = iter(RX_info.stdout.readline, "")
try:
start = time.clock()
for line in lines:
if re.match(r"^\d+.*$",line):
splitline = line.split()
del splitline[1:4]
identifier = splitline[1]
count[identifier] += 1
end = time.clock()
timing = round((end - start) * 10000, 100)
dlc = splitline[2]
hexbits = splitline[3:]
Dict[identifier] = [dlc, hexbits, count[identifier],int(timing)]
start = end
except keyboardinterrupt:
pass
procRX = subprocess.Popen('receivetest -f=/dev/pcan32'.split(), stdout=subprocess.PIPE)
if __name__ == '__main__':
munchCan = Process(target=__ReadRX__, args=(procRX))
munchCan.start()
munchCan.join()
print Dict
When trying to run the code I get the following error:
File "./cancheck2.py", line 36, in <module>
munchCan = Process(target=__ReadRx__, args=(procRX))
File "/usr/lib/python2.7/multiprocessing/process.py", line 104, in __init__
self._args = tuple(args)
TypeError: 'Popen' object is not iterable
This code worked before I separated the subprocess and set up __ReadRX__ as a separate process.
Could anyone explain what is happening, as I don't quite understand?

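(procRX) doesn't create a tuple: the parentheses only group the expression, so Process ends up calling tuple() on the Popen object itself, which is the line that fails in the traceback. A one-element tuple needs a trailing comma, so you have to use args=(procRX,).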

buildbot doesn't accept my MailNotifier's IEMailLookup object

A number of people in my organization have different email names from perforce names, so I need to create an IEmailLookup derivation that overrides getAddress to do my evil bidding:
(From my master.cfg)
class MyIEmailLookup:
from buildbot import interfaces
__implements__ = interfaces.IEmailLookup
def getAddresses(user):
address_dict = {"user1", "user_one#our_domain.com"}
try:
address = address_dict[user]
except KeyError:
address = user + "#our_domain.com"
return address
maillookup = MyIEmailLookup()
from buildbot.status import mail
c['status'].append(mail.MailNotifier(....
....
lookup=maillookup
))
I've tried any number of permutations, but I either get:
Traceback (most recent call last):
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/runner.py", line 1071, in doCheckConfig
ConfigLoader(configFileName=configFileName)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/checkconfig.py", line 46, in __init__
self.loadConfig(configFile, check_synchronously_only=True)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/master.py", line 727, in loadConfig
exec f in localDict
File "/Users/playbuilder/buildbot/master.cfg", line 207, in <module>
lookup=maillookup
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/status/mail.py", line 293, in __init__
assert interfaces.IEmailLookup.providedBy(lookup)
AssertionError
...or any number of other issues, depending on how I try to implement the IEmailLookup interface.
I'm using buildbot 0.8.3p1 and python 2.6.1.
I see precious few examples of how to do this, and every one of them fails in my context. What am I missing here?

I just solved this problem myself.
First you need to add (somewhere at the top of the file)
from zope.interface import implements
and then change
__implements__ = interfaces.IEmailLookup
to
if implements:
implements( interfaces.IEmailLookup )
else:
__implements__ = interfaces.IEmailLookup

If you want to fetch email from perforce user, you can use this class:
# .-----------------------.
# | Perforce Email Lookup |
# `-----------------------'
from twisted.internet import defer, utils
from buildbot.util import ComparableMixin
from buildbot.interfaces import IEmailLookup
from zope.interface import implements
import os
import re
class PerforceEmailLookup(ComparableMixin):
    """Resolve a Perforce user name to an email address via ``p4 user -o``.

    Declares ``IEmailLookup`` so the instance can be passed as the
    ``lookup`` argument of ``MailNotifier``.
    """
    implements(IEmailLookup)

    compare_attrs = ["p4port", "p4user", "p4passwd", "p4bin"]
    # Perforce-related variables forwarded from this process's environment
    # to the ``p4`` child process when they are set.
    env_vars = ["P4CLIENT", "P4PORT", "P4PASSWD", "P4USER",
                "P4CHARSET"]

    def __init__(self,
                 p4port=None,
                 p4user=None,
                 p4passwd=None,
                 p4bin='p4'):
        """Store the optional connection settings and the ``p4`` executable name."""
        self.p4port = p4port
        self.p4user = p4user
        self.p4passwd = p4passwd
        self.p4bin = p4bin
        # Matches the "Email:" line of the ``p4 user -o`` output.
        self.email_re = re.compile(r"Email:\s+(?P<email>\S+@\S+)\s*$")

    def _get_process_output(self, args):
        """Run ``p4`` with *args* and return what ``utils.getProcessOutput`` returns."""
        # Forward only the variables that are actually set (and non-empty).
        env = dict((e, os.environ[e]) for e in self.env_vars if os.environ.get(e))
        return utils.getProcessOutput(self.p4bin, args, env)

    # NOTE(review): deferredGenerator/waitForDeferred are Twisted's legacy
    # generator API; under it the last value yielded before the generator
    # ends becomes the result of the returned Deferred -- confirm against
    # the Twisted version in use.
    @defer.deferredGenerator
    def getAddress(self, name):
        """Look up the email address for *name*.

        A *name* that already contains ``@`` is yielded unchanged. Otherwise
        ``p4 user -o <name>`` is run and the address from its ``Email:``
        line is yielded; when no such line is found, *name* itself is
        yielded.
        """
        if '@' in name:
            yield name
            return

        args = []
        if self.p4port:
            args.extend(['-p', self.p4port])
        if self.p4user:
            args.extend(['-u', self.p4user])
        if self.p4passwd:
            args.extend(['-P', self.p4passwd])
        args.extend(['user', '-o', name])

        wfd = defer.waitForDeferred(self._get_process_output(args))
        yield wfd
        result = wfd.getResult()

        for line in result.split('\n'):
            line = line.strip()
            if not line:
                continue
            m = self.email_re.match(line)
            if m:
                yield m.group('email')
                return

        # No "Email:" line in the output: fall back to the bare user name.
        yield name
usage would look like:
c['status'].append(
MailNotifier(
sendToInterestedUsers = True,
mode = 'failing',
lookup = PerforceEmailLookup(
p4port = "perforce:1666",
p4user = "buildbot",
p4passwd = "buildbot")))

Try this:
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implements
class lookup_example_email(ComparableMixin):
    """IEmailLookup provider that appends ``@example.com`` to the user name."""
    implements(IEmailLookup)

    def getAddress(self, user):
        """Return ``<user>@example.com``."""
        return "%s@example.com" % (user)
...
mn = MailNotifier(..., lookup=lookup_example_email(), extraRecipients=m)

Here's the piece of code I use which works with buildbot 2.3.1 in python3.6.
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implementer
@implementer(IEmailLookup)
class EmailMap(ComparableMixin):
    """IEmailLookup provider that appends a fixed domain to the user name."""

    def getAddress(self, name):
        """Return ``<name>@xxxxx`` (``xxxxx`` stands in for the real domain)."""
        return f'{name}@xxxxx'

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Broadcast a user defined class in Spark - python

Placing the FooMap class in a separate module, everything works fine.

Related

raise AttributeError(name) AttributeError: LCC_GetChannelHandle

python multiprocessing with multiple arguments

Returning multiple lists from pool.map processes?

TypeError: Popen not iterable

buildbot doesn't accept my MailNotifier's IEMailLookup object

Categories

Resources