How to delete a dictionary after processing it in Python - python

I am merging 4 files which have billions of records; this script is just an example of what I want.
In this script I'm adding records into four dictionaries and at last merging those 4 dictionaries into one dictionary.
I am trying to delete the 2nd dictionary after it has been processed (after it has been merged into the final dictionary), but it is throwing an error.
class project(object):
    def one(self):
        self.hash_1 = {}
        self.hash_1["ramu"] = ["10","20"]
        self.hash_1["se"] = ["1","2"]
    def two(self):
        self.hash_2 = {}
        self.hash_2["ramu"] = ["0","2"]
        self.hash_2["se"] = ["111","2w"]
    def three(self):
        self.hash_3 = {}
        self.hash_3["ramu"] = ["44","22"]
        self.hash_3["se"] = ["111121","25"]
    def four(self):
        self.hash_4 = {}
        self.hash_4["ramu"] = ["4433","222"]
        self.hash_4["se"] = ["16621","2532"]
    def process(self):
        self.final_hash = {}
        for k in self.hash_1:
            self.final_hash[k] = self.hash_1[k]
            print k
            if k in self.hash_2:
                print self.hash_2[k]
            else:
                print "no"
            del self.hash_2
            if k in self.hash_3:
                print self.hash_3[k]
            else:
                print "no"
            del self.hash_3
            if k in self.hash_4:
                print self.hash_4[k]
            else:
                print "no"
            del self.hash_4
        print self.final_hash
e_obj = project()
e_obj.one()
e_obj.two()
e_obj.three()
e_obj.four()
e_obj.process()
Error:
e_obj.process()
File "hash_remove_test.py", line 31, in process
if k in self.hash_2:
AttributeError: 'project' object has no attribute 'hash_2'
I want to delete every dictionary after processing it, or else it throws a MemoryError (since the data is big).
How can I solve this problem?
Note: the whole idea is to delete every dictionary after merging it.

Your for loop runs del self.hash_2 during its first iteration (after examining self.hash_2), so it will be gone when the second iteration starts.
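If you only want the original code to stop raising the AttributeError, a minimal fix (a sketch of your own process(), not the rewrite further below) is to move the del statements out of the loop, so each dictionary is dropped only after every key has been checked against it:

def process(self):
    self.final_hash = {}
    for k in self.hash_1:
        self.final_hash[k] = self.hash_1[k]
        print k
        if k in self.hash_2:
            print self.hash_2[k]
        else:
            print "no"
        if k in self.hash_3:
            print self.hash_3[k]
        else:
            print "no"
        if k in self.hash_4:
            print self.hash_4[k]
        else:
            print "no"
    # delete the dictionaries only after the loop has finished with them
    del self.hash_2
    del self.hash_3
    del self.hash_4
    print self.final_hash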

If I can rephrase your question, what you want is a dictionary final_hash with "ramu" and "se" as keys and two corresponding value arrays that have all the values of ramu and se from hash_1, hash_2, hash_3 and hash_4, right? Here's how I'd do it:
def process(self):
    final_hash = dict()
    for key in self.hash_1:
        if key not in final_hash:
            final_hash[key] = []
        final_hash[key].append(self.hash_1[key])
    for key in self.hash_2:
        final_hash[key].append(self.hash_2[key])
    for key in self.hash_3:
        final_hash[key].append(self.hash_3[key])
    for key in self.hash_4:
        final_hash[key].append(self.hash_4[key])
    del self.hash_1
    del self.hash_2
    del self.hash_3
    del self.hash_4
    print final_hash["ramu"], final_hash["se"]
[['10', '20'], ['0', '2'], ['44', '22'], ['4433', '222']]
[['1', '2'], ['111', '2w'], ['111121', '25'], ['16621', '2532']]
I have just hard-coded the process() function. If you have a lot of other dictionaries that have to be merged together, you might want to consider automating that part, as sketched below.
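One way to automate it (a sketch only; it assumes each source dictionary maps keys to lists, and that nothing else keeps a reference to the dictionaries, otherwise the memory is not actually released) is to hold them in a list and pop each one off as it is merged:

def merge_and_free(dicts):
    # dicts: a list of dictionaries, each mapping a key to a list of values
    final_hash = {}
    while dicts:
        current = dicts.pop(0)  # remove the next dictionary from the list
        for key, value in current.iteritems():
            final_hash.setdefault(key, []).append(value)
        # once nothing else references 'current', its memory can be reclaimed
    return final_hash

# example with throwaway dictionaries (not the attributes stored on e_obj)
chunks = [{"ramu": ["10", "20"], "se": ["1", "2"]},
          {"ramu": ["0", "2"], "se": ["111", "2w"]}]
merged = merge_and_free(chunks)
print merged["ramu"]   # [['10', '20'], ['0', '2']]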

Related

for loop not accessing all of the elements in a list python

The class below is driving me crazy. It is stuck on the for loop. I'm unsure why it will not access the last element in self.job_ids. This should be working. Any ideas why this for loop does not work inside the class but works perfectly fine outside the class?
import subprocess

class job_runner():
    def __init__(self, user_id='staffner'):
        self.job_ids = ['12054807', '12054808', '12054809', '12054810', '12054811', '12054812', '12054813', '10', '100']
        self.user_id = user_id

    def update_running_jobs(self):
        '''
        () ->
        Remove job ids from self.job_ids which completed
        '''
        # find currently running jobs
        curr_running = self.find_running_jobs()
        # iterate over self.job_ids and see if they are in the currently running jobs
        working_ids = self.job_ids
        print 'start working ids:'
        print working_ids
        for j_id in working_ids:
            # for some reason I can not access the last id in the list
            print j_id
            if j_id not in curr_running:
                self.job_ids.remove(j_id)
        print 'job_ids'
        print self.job_ids

    def find_running_jobs(self):
        '''
        () -> list running ids
        Find what job ids are still running on the high performance cluster
        '''
        proc = subprocess.Popen(['squeue -u %s --Format=arrayjobid' % (self.user_id)], stdout=subprocess.PIPE, shell=True)
        out, err = proc.communicate()
        if err == None:
            out_list = out.replace('ARRAY_JOB_ID', '').replace(' ', '').split('\n')
            # filter out any empty strings
            out_list = filter(None, out_list)
            return out_list
        else:
            return False

curr_jobs = job_runner()
curr_jobs.update_running_jobs()
Here's the output (as you can see 100 is never accessed):
start working ids:
['12054807', '12054808', '12054809', '12054810', '12054811', '12054812', '12054813', '10', '100']
12054807
12054808
12054809
12054810
12054811
12054812
12054813
10
job_ids
['12054807', '12054808', '12054809', '12054810', '12054811', '12054812', '12054813', '100']
You should change:
working_ids = self.job_ids
to:
working_ids = self.job_ids[:] # create a copy of job_ids and use it
Explanation: you're modifying the list while iterating over it, which causes unexpected results. With this change you iterate over a copy of the list instead.
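A tiny standalone demonstration (with made-up values, unrelated to the cluster job ids) of why removing items from the list you are iterating skips elements, and how iterating a copy avoids it:

jobs = ['a', 'b', 'c']
for j in jobs:        # iterating the same list we mutate
    jobs.remove(j)    # each removal shifts the remaining items left,
                      # so the iterator skips every other element
print jobs            # -> ['b']  ('b' was never visited)

jobs = ['a', 'b', 'c']
for j in jobs[:]:     # iterate over a copy instead
    jobs.remove(j)
print jobs            # -> []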

Python List of Dictionaries Only See Last Element

Struggling to figure out why this doesn't work. It should. But when I create a list of dictionaries and then look through that list, I only ever see the final entry from the list:
import re

alerts = []
alertDict = {}
af = open("C:\snort.txt")
for line in af:
    m = re.match(r'([0-9/]+)-([0-9:.]+)\s+.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})\s+->\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})', line)
    if m:
        attacktime = m.group(2)
        srcip = m.group(3)
        srcprt = m.group(4)
        dstip = m.group(5)
        dstprt = m.group(6)
        alertDict['Time'] = attacktime
        alertDict['Source IP'] = srcip
        alertDict['Destination IP'] = dstip
        alerts.append(alertDict)
for alert in alerts:
    if alert["Time"] == "13:13:42.443062":
        print "Found Time"
You create exactly one dict at the beginning of the script, and then append that one dict to the list multiple times.
Try creating multiple individual dicts, by moving the initialization to the inside of the loop.
alerts = []
af = open("C:\snort.txt")
for line in af:
    alertDict = {}
    # rest of loop goes here
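Put together, the matching part of the original loop would then look roughly like this (a sketch only; the regex and file path are copied from the question unchanged):

import re

alerts = []
af = open("C:\snort.txt")
for line in af:
    m = re.match(r'([0-9/]+)-([0-9:.]+)\s+.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})\s+->\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})', line)
    if m:
        alertDict = {}                        # a fresh dict for every matching line
        alertDict['Time'] = m.group(2)
        alertDict['Source IP'] = m.group(3)
        alertDict['Destination IP'] = m.group(5)
        alerts.append(alertDict)              # each list entry is now a distinct dict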

Python multiprocessing: Reading a file and updating a dictionary

Lets assume that I have a text file with only 2 rows as follows:
File.txt:
100022441 #DavidBartonWB Guarding Constitution
100022441 RT #frankgaffney 2nd Amendment Guy.
First column is user id and second column is user tweet. I'd like to read the above text file and update the following dictionary:
d={'100022441':{'#frankgaffney': 0, '#DavidBartonWB': 0}}.
Here is my code:
from multiprocessing import Pool

def f(line):
    data = line.split('\t')
    uid = data[0]
    tweet = data[1]
    if uid in d.keys():
        for gn in d[uid].keys():
            if gn in tweet:
                return uid, gn, 1
            else:
                return uid, gn, 0

p = Pool(4)
with open('~/File.txt') as source_file:
    for uid, gn, r in p.map(f, source_file):
        d[uid][gn] += r
So basically I need to read each line of the file and determine whether the user is in my dictionary, and if it is, whether the tweet contains the user's keys in the dictionary (e.g. '#frankgaffney' and '#DavidBartonWB'). So based on the two lines I wrote above, the code should result in:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 1}}
But it gives:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 0}}
For some reason the code always loses one of the keys for all users. Any idea what is wrong in my code?
Your file is tab delimited, and you are always checking the third column for the mention; it works correctly for the first mention because you are passing in the entire file to the function, not each line. So effectively you are doing this:
>>> s = '100022441\t#DavidBartonWB Guarding Constitution\n100022441\tRT#frankgaffney 2nd Amendment Guy.'
>>> s.split('\t')
['100022441', '#DavidBartonWB Guarding Constitution\n100022441', 'RT#frankgaffney 2nd Amendment Guy.']
I recommend two approaches:
Map your function to each line in the file.
Use regular expressions for a more robust search.
Try this version:
import re
d = {'100022441':{'#frankgaffney': 0, '#DavidBartonWB': 0}}
e = r'(#\w+)'
def parser(line):
key, tweet = line.split('\t')
data = d.get(key)
if data:
mentions = re.findall(e, tweet)
for mention in mentions:
if mention in data.keys():
d[key][mention] += 1
with open('~/File.txt') as f:
for line in f:
parser(line)
print(d)
Once you've confirmed it's working correctly, you can multi-process it:
import itertools, re
from multiprocessing import Process, Manager

def parse(queue, d, m):
    while True:
        line = queue.get()
        if line is None:
            return  # we are done with this thread
        key, tweet = line.split('\t')
        data = d.get(key)
        e = r'(#\w+)'
        if data:
            mentions = re.findall(e, tweet)
            for mention in mentions:
                if mention in data:
                    if mention not in m:
                        m[mention] = 1
                    else:
                        m[mention] += 1

if __name__ == '__main__':
    workers = 2
    manager = Manager()
    d = manager.dict()
    d2 = manager.dict()
    d = {'100022441': ['#frankgaffney', '#DavidBartonWB']}
    queue = manager.Queue(workers)
    worker_pool = []
    for i in range(workers):
        p = Process(target=parse, args=(queue, d, d2))
        p.start()
        worker_pool.append(p)

    # Fill the queue with data for the workers
    with open(r'tweets2.txt') as f:
        iters = itertools.chain(f, (None,) * workers)
        for line in iters:
            queue.put(line)

    for p in worker_pool:
        p.join()

    for i, data in d.iteritems():
        print('For ID: {}'.format(i))
        for key in data:
            print(' {} - {}'.format(key, d2[key]))
The second column is data[1], not data[2].
The fact that data[2] works means that you are splitting into words, not columns.
If you want to search for the user key as a separate word (as opposed to a substring), you need tweet = data[1:].
If you want to search for a substring, you need to split into exactly two pieces: uid, tweet = line.split(None, 1).
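A small sketch of the difference, using a made-up line in the same format as File.txt:

line = '100022441\t#DavidBartonWB Guarding Constitution'

# splitting on whitespace gives individual words, so data[1] is only the first word
data = line.split()
# data == ['100022441', '#DavidBartonWB', 'Guarding', 'Constitution']

# splitting into at most two pieces keeps the whole tweet as the second column
uid, tweet = line.split(None, 1)
# uid == '100022441', tweet == '#DavidBartonWB Guarding Constitution'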

Creating nested Json structure with multiple key values in Python from Json

My code is as follows:
import json

def reformat(importscompanies):
    #print importscompanies
    container = {}
    child = []
    item_dict = {}
    for name, imports in importscompanies.iteritems():
        item_dict['name'] = imports
        item_dict['size'] = '500'
        child.append(dict(item_dict))
        container['name'] = name
        container['children'] = child

if __name__ == '__main__':
    raw_data = json.load(open('data/bricsinvestorsfirst.json'))
    run(raw_data)

def run(raw_data):
    raw_data2 = raw_data[0]
    the_output = reformat(raw_data2)
My issue is, the code isn't going through the whole file. It's only outputting one entry. Why is this? Am I rewriting something and do I need another dict that appends with every loop?
Also, it seems as though the for loop is going through the iteritems for each dict key. Is there a way to make it pass only once?
The issue is indeed
raw_data2 = raw_data[0]
I ended up creating an iterator to access the dict values.
Thanks.
Lastly, I'm hoping my final Json file looks this way, using the data I provided above:
{'name': u'name', 'children': [{'name': u'500 Startups', 'size': '500'}, {'name': u'AffinityChina', 'size': '500'}]}
Try this. Though your sample input and output data don't really give many clues as to where the "name" fields should come from. I've assumed you wanted the name of the original item in your list.
original_json = json.load(open('data/bricsinvestorsfirst.json', 'r'))

response_json = {}
response_json["name"] = "analytics"

# where your children list will go
children = []
size = 500  # or whatever else you want

# For each item in your original list
for item in original_json:
    children.append({"name": item["name"],
                     "size": size})

response_json["children"] = children
print json.dumps(response_json, indent=2)
"It's only outputting one entry" because you only select the first dictionary in the JSON file when you say raw_data2 = raw_data[0]
Try something like this as a starting point (I haven't tested or run it):
import json

def run():
    with open('data/bricsinvestorsfirst.json') as input_file:
        raw_data = json.load(input_file)

    children = []
    for item in raw_data:
        children.append({
            'name': item['name'],
            'size': '500'
        })

    container = {}
    container['name'] = 'name'
    container['children'] = children
    return json.dumps(container)

if __name__ == '__main__':
    print run()

Why do I keep getting this error in map-reduce while using mincemeat?

I just want to calculate word counts from some 7500 files, with a condition on which words to count. The program goes like this.
import glob
import mincemeat

text_files = glob.glob('../fldr/2/*')

def file_contents(file_name):
    f = open(file_name)
    try:
        return f.read()
    finally:
        f.close()

source = dict((file_name, file_contents(file_name))
              for file_name in text_files)

def mapfn(key, value):
    for line in value.splitlines():
        list2 = []
        for temp in line.split("::::"):
            list2.append(temp)
        if (list2[0] == '5'):
            for review in list2[1].split():
                yield [review.lower(), 1]

def reducefn(key, value):
    return key, len(value)

s = mincemeat.Server()
s.datasource = source
s.mapfn = mapfn
s.reducefn = reducefn

results = s.run_server(password="wola")
print results
The error I get while running this program is
error: uncaptured python exception, closing channel <__main__.Client connected at 0x250f990>
(<type 'exceptions.IndexError'>:list index out of range
[C:\Python27\lib\asyncore.py|read|83]
[C:\Python27\lib\asyncore.py|handle_read_event|444]
[C:\Python27\lib\asynchat.py|handle_read|140]
[mincemeat.py|found_terminator|96]
[mincemeat.py|process_command|194]
[mincemeat.py|call_mapfn|170]
[projminc2.py|mapfn|21])
Take a look at what's in list2, e.g. by doing
print(list2)
or with a debugger. If you do this you'll see that list2 only has one element, so list2[1] isn't valid.
(You don't really want to split on "::::" - that's a typo in your script.)
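A defensive version of the mapper (a sketch only, keeping the question's "::::" delimiter and assuming that lines without it should simply be skipped) would check the length of the split result before indexing into it:

def mapfn(key, value):
    for line in value.splitlines():
        list2 = line.split("::::")
        if len(list2) < 2:        # no delimiter on this line; skip it
            continue
        if list2[0] == '5':
            for review in list2[1].split():
                yield [review.lower(), 1]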
