Left join in Apache Beam - Python

What is the best way to left join the following PCollections in Apache Beam?
pcoll1 = [('key1', [[('a', 1)],[('b', 2)], [('c', 3)], [('d', 4)],[('e', 5)], [('f', 6)]]), ('key2',[[('a', 12)],[('b', 21)], [('c', 13)]]), ('key3',[[('a', 21)],[('b', 23)], [('c', 31)]])]
pcoll2 = [('key1', [[('x', 10)]]), ('key2', [[('x', 20)]])]
The expected output is:
[('a', 1), ('x', 10)]
[('b', 2), ('x', 10)]
[('c', 3), ('x', 10)]
[('d', 4), ('x', 10)]
[('e', 5), ('x', 10)]
[('f', 6), ('x', 10)]
[('a', 12), ('x', 20)]
[('b', 21), ('x', 20)]
[('c', 13), ('x', 20)]
[('a', 21)]
[('b', 23)]
[('c', 31)]
I have implemented a left join using CoGroupByKey() and ParDo(). Is there any other way to implement a left join in the Beam Python SDK?
left_joined = (
    {'left': pcoll1, 'right': pcoll2}
    | 'LeftJoiner: Combine' >> beam.CoGroupByKey()
    | 'LeftJoiner: ExtractValues' >> beam.Values()
    | 'LeftJoiner: JoinValues' >> beam.ParDo(LeftJoinerFn())
)
class LeftJoinerFn(beam.DoFn):

    def __init__(self):
        super(LeftJoinerFn, self).__init__()

    def process(self, row, **kwargs):
        left = row['left']
        right = row['right']
        if left and right:
            for each in left:
                yield each + right[0]
        elif left:
            for each in left:
                yield each

You can use the following code to pass the right side of the join as a side input, assuming the right side always has exactly one element per key, which means it is always much smaller than the left PCollection.
Also, if your PCollection is created by reading from an external source instead of from an in-memory list, you will need to pass right_list=beam.pvalue.AsList(pcoll2) instead of right_list=pcoll2 to the ParDo. Check here for more info.
class LeftJoinerFn(beam.DoFn):

    def __init__(self):
        super(LeftJoinerFn, self).__init__()

    def process(self, row, **kwargs):
        right_dict = dict(kwargs['right_list'])
        left_key = row[0]
        if left_key in right_dict:
            for each in row[1]:
                yield each + right_dict[left_key]
        else:
            for each in row[1]:
                yield each
import logging

class Display(beam.DoFn):
    def process(self, element):
        logging.info(str(element))
        yield element

p = beam.Pipeline(options=pipeline_options)

# pcoll1 must be an actual PCollection, so create it from the in-memory list
pcoll1 = p | 'Create left' >> beam.Create(
    [('key1', [[('a', 1)], [('b', 2)], [('c', 3)], [('d', 4)], [('e', 5)], [('f', 6)]]),
     ('key2', [[('a', 12)], [('b', 21)], [('c', 13)]]),
     ('key3', [[('a', 21)], [('b', 23)], [('c', 31)]])])

pcoll2 = [('key1', [[('x', 10)]]), ('key2', [[('x', 20)]])]

left_joined = (
    pcoll1
    | 'LeftJoiner: JoinValues' >> beam.ParDo(LeftJoinerFn(), right_list=pcoll2)
    | 'Display' >> beam.ParDo(Display())
)

p.run()

If the second collection is always smaller, an alternative approach is to use side inputs: make the right collection a side input that is broadcast to all the workers, then write a ParDo that processes elements from the left collection and looks up the matching entry in the right collection.
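A minimal sketch of that approach, assuming pcoll1 and pcoll2 are the in-memory lists from the question; beam.pvalue.AsDict materializes the right-hand PCollection as a dict side input (the helper name left_join is mine):

import apache_beam as beam

def left_join(element, right_dict):
    key, lefts = element
    right = right_dict.get(key)  # None when the key has no match on the right
    for left in lefts:
        if right is not None:
            yield left + right[0]
        else:
            yield left

with beam.Pipeline() as p:
    left = p | 'CreateLeft' >> beam.Create(pcoll1)
    right = p | 'CreateRight' >> beam.Create(pcoll2)
    joined = (
        left
        | 'LeftJoin' >> beam.FlatMap(left_join, right_dict=beam.pvalue.AsDict(right))
        | 'Print' >> beam.Map(print)
    )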

left.txt
149633CM,Marco,10
212539MU,Rebekah,10
231555ZZ,Itoe,10
right.txt
149633CM,Australia
212539MU,India
Code for the left join:
import apache_beam as beam

def retTuple(element):
    thisTuple = element.split(',')
    return (thisTuple[0], thisTuple[1:])

def jstr(j):
    jlist = []
    for k in j[1]['left_data']:
        if len(j[1]['right_data']) == 0:
            # no match on the right side: emit the left row only
            id, name, rank = [j[0]] + k
            json_str = {"id": id, "name": name, "rank": rank}
            jlist.append(json_str)
        else:
            for l in j[1]['right_data']:
                id, name, rank, country = [j[0]] + k + l
                json_str = {"id": id, "name": name, "rank": rank, "country": country}
                jlist.append(json_str)
    return jlist
table_spec = 'project:dataset.table_name'
table_schema = 'id:STRING,name:STRING,rank:INTEGER,country:STRING'
gcs = 'gs://dataflow4bigquery/temp'

p1 = beam.Pipeline()

left_rows = (
    p1
    | "Reading File 1" >> beam.io.ReadFromText('left.txt')
    | 'Pair each employee with key' >> beam.Map(retTuple)  # ('149633CM', ['Marco', '10'])
)

right_rows = (
    p1
    | "Reading File 2" >> beam.io.ReadFromText('right.txt')
    | 'Pair each country with key' >> beam.Map(retTuple)  # ('149633CM', ['Australia'])
)

results = (
    {'left_data': left_rows, 'right_data': right_rows}
    | beam.CoGroupByKey()
    | beam.FlatMap(jstr)
    | beam.io.WriteToBigQuery(
        custom_gcs_temp_location=gcs,
        table=table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    )
)

p1.run().wait_until_finish()
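For reference, this is roughly what CoGroupByKey hands to jstr for the sample files above (the grouped values are iterables; shown here as lists for readability):

('149633CM', {'left_data': [['Marco', '10']], 'right_data': [['Australia']]})
('212539MU', {'left_data': [['Rebekah', '10']], 'right_data': [['India']]})
('231555ZZ', {'left_data': [['Itoe', '10']], 'right_data': []})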

Related

Implementing Dijkstra's algorithm in Python but a Key Error is received when using a different graph

I'm a university student and we were tasked with implementing Dijkstra's algorithm on the given graph below.
[Image: graph to implement Dijkstra's algorithm on]
We were given code to use and/or modify to help answer the question.
import heapq
import math

def dijkstra(G, S):
    pq = []
    entry_finder = {}
    costs = {}
    pred = {S: None}
    REMOVED = 'removed'

    def add_entry(label, priority):
        if label in entry_finder:
            remove_entry(label)
        entry = [priority, label]
        entry_finder[label] = entry
        heapq.heappush(pq, entry)

    def remove_entry(label):
        entry = entry_finder.pop(label)
        entry[-1] = REMOVED

    def pop_entry():
        while pq:
            priority, label = heapq.heappop(pq)
            if label != REMOVED:
                del entry_finder[label]
                return priority, label
        return None, None

    for v in G:
        if v == S:
            add_entry(S, 0)
        else:
            add_entry(v, math.inf)

    while pq:
        d_u, u = pop_entry()
        if u is not None and u != REMOVED:
            costs[u] = d_u
            for e in G[u]:
                v, w = e
                entry_v = entry_finder[v]
                d_v = entry_v[0]
                if d_v > d_u + w:
                    add_entry(v, d_u + w)
                    pred[v] = u
    return costs, pred
This code was shown to work for a separate graph that was used as an example in our lectures. That graph was converted into code as follows.
G = {
    '0': [('1', 2), ('2', 6), ('3', 7)],
    '1': [('3', 3), ('4', 6)],
    '2': [('4', 1)],
    '3': [('4', 5)],
    '4': []
}
costs, pred = dijkstra(G, '0')
print(costs, pred)
So I know for a fact that the given code works. The problem arose when I translated my graph into code: it gave me a KeyError: 'D'. My implementation of the graph is as follows.
G = {
    'A': [('B', 56), ('C', 96), ('D', 78)],
    'B': [('D', 18), ('F', 208), ('E', 110)],
    'C': [('D', 20), ('F', 90)],
    'D': [('F', 112)],
    'E': [('F', 16), ('G', 46), ('I', 108)],
    'F': [('G', 20), ('H', 62)],
    'G': [('H', 40)],
    'H': [('I', 29), ('J', 56)],
    'I': [('J', 21)],
    'J': []
}
costs, pred = dijkstra(G, 'A')
print(costs, pred)
The error also comes with:

line 41, in dijkstra
    entry_v = entry_finder[v]

I'd like to know if the error came from my wrong implementation of the graph or if the given sample code itself had errors.

python multiprocessing and queue access issues

I've got the following code:
import multiprocessing
import queue
import time

to_delete = queue.Queue()

def parallel(args):
    return para_print(*args)

def para_print(sngl, dbl, iter):
    to_delete.put(f"{sngl}, {dbl}, {iter}")

if __name__ == '__main__':
    multiprocessing.freeze_support()
    expression_list = [('a', 'aa', 1), ('b', 'bb', 2), ('c', 'cc', 3), ('d', 'dd', 4)]
    pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
    result = pool.map(parallel, expression_list)
    print(to_delete.qsize())
    while not to_delete.empty():
        print(to_delete.get())
The result is '0' printed as the queue size, and nothing gets put into the queue or pulled from it. What on earth am I doing wrong here?
You should be using the queue from multiprocessing; the standard one doesn't work correctly between processes. Here's the revised code:
import multiprocessing

to_delete = multiprocessing.Queue()

def parallel(args):
    return para_print(*args)

def para_print(sngl, dbl, iter):
    to_delete.put(f"{sngl}, {dbl}, {iter}")

if __name__ == '__main__':
    multiprocessing.freeze_support()
    expression_list = [('a', 'aa', 1), ('b', 'bb', 2), ('c', 'cc', 3), ('d', 'dd', 4)]
    pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
    result = pool.map(parallel, expression_list)
    print(to_delete.qsize())
    while not to_delete.empty():
        print(to_delete.get())
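One caveat: a global multiprocessing.Queue only works where workers inherit it via fork; on spawn-based platforms such as Windows, each worker gets its own fresh queue. A more portable sketch (my own variant, not part of the answer above) passes a Manager().Queue() proxy to the workers explicitly:

import multiprocessing

def para_print(args, q):
    sngl, dbl, itr = args
    q.put(f"{sngl}, {dbl}, {itr}")

if __name__ == '__main__':
    multiprocessing.freeze_support()
    expression_list = [('a', 'aa', 1), ('b', 'bb', 2), ('c', 'cc', 3), ('d', 'dd', 4)]
    with multiprocessing.Manager() as manager:
        q = manager.Queue()  # proxy object, safe to pass to pool workers
        with multiprocessing.Pool(multiprocessing.cpu_count() - 1) as pool:
            pool.starmap(para_print, [(args, q) for args in expression_list])
        # drain the queue in the parent process
        while not q.empty():
            print(q.get())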

How to dump and save function arguments without embedding all of the container's content?

I am trying to save all function arguments, as the function runs, to a container. The container is shared by all functions run in the script. How do I ensure the container's entire contents are NOT re-saved every time I save one function's arguments?
The decorator below saves function arguments:
import inspect
from datetime import datetime
import time

def func_logger(method):
    def wrapper(*args, **kw):
        method_args = inspect.signature(method).bind(*args, **kw).arguments
        runtime = str(datetime.now())
        name = method.__name__
        module = method.__module__
        signature = runtime + ': ' + '.'.join([module, name])
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        kw['log'][signature] = {}
        kw['log'][signature]['time'] = round(te - ts, 2)
        kw['log'][signature]['args'] = method_args
        return result
    return wrapper
And an example function:
@func_logger
def test(a, b=4, c='blah-blah', *args, **kwargs):
    return 4**4**8
When I am running the following snippet:
log = {}
output = test(1,4,2,4,1,par=1, log=log)
output = test(1,4,2,4,1,par=1, log=log)
log
I receive this output:
{'2019-05-17 13:48:25.214094: __main__.test': {'time': 0.0,
'args': OrderedDict([('a', 1),
('b', 4),
('c', 2),
('args', (4, 1)),
('kwargs', {'par': 1, 'log': {...}})])},
'2019-05-17 13:48:25.215092: __main__.test': {'time': 0.0,
'args': OrderedDict([('a', 1),
('b', 4),
('c', 2),
('args', (4, 1)),
('kwargs', {'par': 1, 'log': {...}})])}}
I already tried a workaround: a function that removes the 'log' entry from the dictionary. However, each new item in the log stores the log's contents as they were at call time. So when I try this:
list( log.items() )[-1][-1]['args']
The output is this:
OrderedDict([('a', 1),
('b', 4),
('c', 2),
('args', (4, 1)),
('kwargs',
{'par': 1,
'log': {'2019-05-17 13:45:45.748722: __main__.test': {'time': 0.0,
'args': OrderedDict([('a', 1),
('b', 4),
('c', 2),
('args', (4, 1)),
('kwargs', {'par': 1, 'log': {...}})])},
'2019-05-17 13:45:45.749221: __main__.test': {'time': 0.0,
'args': OrderedDict([('a', 1),
('b', 4),
('c', 2),
('args', (4, 1)),
('kwargs', {'par': 1, 'log': {...}})])},
'2019-05-17 13:45:45.750218: __main__.test': {'time': 0.0,
'args': OrderedDict(...)}}})])
So essentially, such a workaround won't work, because over time memory would get clogged quickly.
Is there any way the decorator could avoid saving the log's contents every time I save function arguments? What I would rather avoid is creating a new 'log = {}' container every time I want to dump arguments from a new function.
You could simply store the log parameter if present and remove it from **kw:
def func_logger(method):
    def wrapper(*args, **kw):
        try:
            log = kw['log']
            del kw['log']
        except KeyError:
            log = None
        method_args = inspect.signature(method).bind(*args, **kw).arguments
        runtime = str(datetime.now())
        name = method.__name__
        module = method.__module__
        signature = runtime + ': ' + '.'.join([module, name])
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if log is not None:
            log[signature] = {}
            log[signature]['time'] = round(te - ts, 2)
            log[signature]['args'] = method_args
        return result
    return wrapper
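A quick check of the fix, reusing the test function and call pattern from the question; since 'log' is stripped from kw before the arguments are bound, the recorded kwargs no longer embed the log itself:

log = {}
output = test(1, 4, 2, 4, 1, par=1, log=log)
output = test(1, 4, 2, 4, 1, par=1, log=log)

# each entry's 'args' now contains only the call's own arguments:
# OrderedDict([('a', 1), ('b', 4), ('c', 2), ('args', (4, 1)),
#              ('kwargs', {'par': 1})])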
Alternatively, use a global log in func_logger:
log = {}

def func_logger(method):
    def wrapper(*args, **kw):
        # ... compute signature, time the call, etc., as before ...
        log[signature] = {...}
        return result
    return wrapper
Then call it as output = test(1,4,2,4,1,par=1).

write data in to a csv according to headers names, which indicate occurrences of items

I need to write data to a CSV according to header names, putting a value under a header if that flag occurred in the event, and zero otherwise. Required output is:
[Image: required output]
I am currently getting:
[Image: current output]
This is my current code; I would like to know how to generate my desired output.
The inputs to the code are the counters shown below:
OrderedDict([('flags=40', 3971), ('flags=10004', 6244), ('flags=10100', 236), ('flags=90002', 2), ('flags=80', 2009), ('flags=10080', 5421), ('flags=4', 2886), ('flags=100', 227), ('flags=80002', 58), ('flags=10040', 8990), ('flags=0', 5)])
OrderedDict([('flags=40', 16), ('flags=10004', 6244), ('flags=10100', 236), ('flags=90002', 2), ('flags=10080', 5421), ('flags=4', 16), ('flags=80002', 11), ('flags=10040', 8990), ('flags=0', 4), ('Total', 20940)])
OrderedDict([('flags=4', 1332), ('flags=40', 1839), ('flags=80002', 3), ('flags=100', 197), ('flags=80', 935), ('Total', 4306)])
OrderedDict([('Total', 0)])
OrderedDict([('flags=40', 2116), ('flags=80', 1074), ('flags=4', 1538), ('flags=100', 30), ('flags=80002', 44), ('flags=0', 1), ('Total', 4803)])
dat = 1
with open(outputcsv, 'wb') as outcsv:
    writer = csv.writer(outcsv, delimiter=',')
    appname = inputfile[:-3]
    writer.writerow(appname.split(','))
    for x in threads:
        writer.writerows([x.split(',')])
        # w.writeheader([x.split(',')])
        if dat == 1:
            w = csv.DictWriter(outcsv, counter1.keys())
            w.writeheader()
            w.writerow(counter1)
        elif dat == 2:
            w = csv.DictWriter(outcsv, counter2.keys())
            w.writeheader()
            w.writerow(counter2)
        elif dat == 3:
            w = csv.DictWriter(outcsv, counter3.keys())
            w.writeheader()
            w.writerow(counter3)
        elif dat == 4:
            w = csv.DictWriter(outcsv, counter4.keys())
            w.writeheader()
            w.writerow(counter4)
        dat = dat + 1
    writer.writerows('\n')
Code for how the threads are read:
exampleFile = open('top_tasks.csv')
exampleReader = csv.reader(exampleFile)
exampleData = list(exampleReader)
thread1 = exampleData[11][0]
thread2 = exampleData[12][0]
thread3 = exampleData[13][0]
thread4 = exampleData[14][0]
threads = [thread1,thread2,thread3,thread4]
I think this code meets your requirements:
from collections import OrderedDict
import csv

# build an OrderedDict of all keys
all_keys = OrderedDict()

# the first column gets the name of the data set
all_keys[data_set_name] = data_set_name

# collect all of the known keys, and insert the thread name into each row
for counter, thread in zip(counters, threads):
    all_keys.update(counter)
    counter[data_set_name] = thread

with open(outputcsv, 'wb') as outcsv:
    # using all known keys, create a csv writer;
    # restval=0 writes a zero for any flag a row doesn't have
    w = csv.DictWriter(outcsv, fieldnames=all_keys.keys(), restval=0)

    # output the header and data rows
    w.writeheader()
    w.writerows(counters)
Data Used:
outputcsv = 'output.csv'
counters = [
OrderedDict(
[('flags=40', 3971), ('flags=10004', 6244), ('flags=10100', 236),
('flags=90002', 2), ('flags=80', 2009), ('flags=10080', 5421),
('flags=4', 2886), ('flags=100', 227), ('flags=80002', 58),
('flags=10040', 8990), ('flags=0', 5)]),
OrderedDict(
[('flags=40', 16), ('flags=10004', 6244), ('flags=10100', 236),
('flags=90002', 2), ('flags=10080', 5421), ('flags=4', 16),
('flags=80002', 11), ('flags=10040', 8990), ('flags=0', 4),
('Total', 20940)]),
OrderedDict([('flags=4', 1332), ('flags=40', 1839), ('flags=80002', 3),
('flags=100', 197), ('flags=80', 935), ('Total', 4306)]),
OrderedDict([('Total', 0)]),
OrderedDict([('flags=40', 2116), ('flags=80', 1074), ('flags=4', 1538),
('flags=100', 30), ('flags=80002', 44), ('flags=0', 1),
('Total', 4803)]),
]
# code assumes thread names are in a list, make some sample names
threads = ['thread%d' % (i+1) for i in range(len(counters))]
# the first column header is the name of the data set
data_set_name = 'CandyCrush 1'
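For reference, with the sample data above (and restval=0, so absent flags appear as zeros), the output should come out roughly as follows; the column order follows the order in which keys are first seen across the counters:

CandyCrush 1,flags=40,flags=10004,flags=10100,flags=90002,flags=80,flags=10080,flags=4,flags=100,flags=80002,flags=10040,flags=0,Total
thread1,3971,6244,236,2,2009,5421,2886,227,58,8990,5,0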

How to convert python list of tuples into tree?

I have a list of tuples like
list_of_tuples = [(number, name, id, parent_id),
                  (number, name, id, parent_id),
                  ]
I am trying to sort it into a nested structure like:
{
    parent: [(id, name), (id, name)],
    parent: {parent: [(id, name)]},
}
So any node could have a parent and/or children.
I tried with:
tree = defaultdict(lambda: [None, ()])
ancestors = set([item[3] for item in list_of_tuples])
for items in list_of_tuples:
    children_root = {}
    descendants = []
    number, name, id, parent = items
    if parent is None:
        tree[id] = [(id, name)]
    elif parent:
        if parent not in tree.keys():
            node = tree.get(parent)
            node.append((id, name))
        children = (id, name)
        tree[parent].append(children)
But I'm losing the deep hierarchy when a node has both a parent and children.
How do I make the ordering work correctly?
I propose to represent the tree nodes as tuples ((id, name), dict_of_children).
list_of_tuples = [(1, 'name1', 1, None),
                  (2, 'name2', 2, 1),
                  (3, 'name3', 3, 1),
                  (4, 'name4', 4, 2),
                  (5, 'name5', 5, 2),
                  (6, 'name5', 6, None),
                  (7, 'name5', 7, 6),
                  ]
def build_tree(list_of_tuples):
    """
    >>> import pprint
    >>> pprint.pprint(build_tree(list_of_tuples))
    {1: ((1, 'name1'),
         {2: ((2, 'name2'), {4: ((4, 'name4'), {}), 5: ((5, 'name5'), {})}),
          3: ((3, 'name3'), {})}),
     6: ((6, 'name5'), {7: ((7, 'name5'), {})})}
    """
    all_nodes = {n[2]: ((n[2], n[1]), {}) for n in list_of_tuples}
    root = {}
    for item in list_of_tuples:
        number, name, id, parent = item
        if parent is not None:
            all_nodes[parent][1][id] = all_nodes[id]
        else:
            root[id] = all_nodes[id]
    return root
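To inspect the result, a small recursive helper (my own addition, not part of the answer above) can print the hierarchy with indentation:

def print_tree(nodes, indent=0):
    # nodes maps id -> ((id, name), dict_of_children)
    for (node_id, name), children in nodes.values():
        print(' ' * indent + '%s: %s' % (node_id, name))
        print_tree(children, indent + 2)

print_tree(build_tree(list_of_tuples))
# 1: name1
#   2: name2
#     4: name4
#     5: name5
#   3: name3
# 6: name5
#   7: name5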
