Python multiprocessing: Reading a file and updating a dictionary - python

Let's assume that I have a text file with only 2 rows, as follows:
File.txt:
100022441 #DavidBartonWB Guarding Constitution
100022441 RT #frankgaffney 2nd Amendment Guy.
The first column is a user id and the second column is that user's tweet. I'd like to read the above text file and update the following dictionary:
d={'100022441':{'#frankgaffney': 0, '#DavidBartonWB': 0}}.
Here is my code:
def f(line):
    data = line.split('\t')
    uid = data[0]
    tweet = data[1]
    if uid in d.keys():
        for gn in d[uid].keys():
            if gn in tweet:
                return uid, gn, 1
            else:
                return uid, gn, 0

p = Pool(4)
with open('~/File.txt') as source_file:
    for uid, gn, r in p.map(f, source_file):
        d[uid][gn] += r
So basically I need to read each line of the file and determine whether the user is in my dictionary and, if it is, whether the tweet contains any of that user's keys in the dictionary (e.g. '#frankgaffney' and '#DavidBartonWB'). So based on the two lines I wrote above, the code should produce:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 1}}
But it gives:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 0}}
For some reason the code always loses one of the keys for all users. Any idea what is wrong with my code?

Your file is tab delimited, and you are always checking the third column for the mention; it works correctly for the first mention because you are passing in the entire file to the function, not each line. So effectively you are doing this:
>>> s = '100022441\t#DavidBartonWB Guarding Constitution\n100022441\tRT#frankgaffney 2nd Amendment Guy.'
>>> s.split('\t')
['100022441', '#DavidBartonWB Guarding Constitution\n100022441', 'RT#frankgaffney 2nd Amendment Guy.']
I recommend two approaches:
Map your function to each line in the file.
Use regular expressions for a more robust search.
Try this version:
import re

d = {'100022441': {'#frankgaffney': 0, '#DavidBartonWB': 0}}
e = r'(#\w+)'

def parser(line):
    key, tweet = line.split('\t')
    data = d.get(key)
    if data:
        mentions = re.findall(e, tweet)
        for mention in mentions:
            if mention in data.keys():
                d[key][mention] += 1

with open('~/File.txt') as f:
    for line in f:
        parser(line)

print(d)
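With the two example lines from the question, and assuming the file really is tab-delimited, this prints {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 1}}, which is the expected result.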
Once you've confirmed it's working correctly, you can multi-process it:
import itertools, re
from multiprocessing import Process, Manager

def parse(queue, d, m):
    while True:
        line = queue.get()
        if line is None:
            return  # we are done with this thread
        key, tweet = line.split('\t')
        data = d.get(key)
        e = r'(#\w+)'
        if data:
            mentions = re.findall(e, tweet)
            for mention in mentions:
                if mention in data:
                    if mention not in m:
                        m[mention] = 1
                    else:
                        m[mention] += 1

if __name__ == '__main__':
    workers = 2
    manager = Manager()
    d = manager.dict()
    d2 = manager.dict()
    d = {'100022441': ['#frankgaffney', '#DavidBartonWB']}
    queue = manager.Queue(workers)
    worker_pool = []
    for i in range(workers):
        p = Process(target=parse, args=(queue, d, d2))
        p.start()
        worker_pool.append(p)

    # Fill the queue with data for the workers
    with open(r'tweets2.txt') as f:
        iters = itertools.chain(f, (None,) * workers)
        for line in iters:
            queue.put(line)

    for p in worker_pool:
        p.join()

    for i, data in d.iteritems():
        print('For ID: {}'.format(i))
        for key in data:
            print(' {} - {}'.format(key, d2[key]))

The second column is data[1], not data[2].
The fact that data[2] works means that you are splitting into words, not columns.
If you want to search for the user key as a separate word (as opposed to a substring), you need tweet = data[1:].
If you want to search for a substring, you need to split into exactly two pieces: uid, tweet = line.split(None, 1).
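To make the difference concrete, here is a quick interpreter session using the first line from the question (a sketch illustrating the last two points):
>>> line = '100022441\t#DavidBartonWB Guarding Constitution'
>>> line.split()                       # words, not columns
['100022441', '#DavidBartonWB', 'Guarding', 'Constitution']
>>> uid, tweet = line.split(None, 1)   # exactly two pieces: the id and the full tweet
>>> tweet
'#DavidBartonWB Guarding Constitution'
>>> '#DavidBartonWB' in tweet          # substring search now sees the whole tweet
True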

Related

Modify a loop based on key and value in dictionary

I'm new to Python.
I wrote the code below to search in a dictionary, do something, clear the old items in the dictionary, update the dictionary with new keys and values, and break when there is nothing left to add to the dictionary (i.e. it is empty). How can I modify my code to do this?
#since_id - Returns results with an ID greater than
#(that is, more recent than) the specified ID. There are limits to the
#number of Tweets which can be accessed through the API.
# If the limit of Tweets has occurred since the since_id,
# the since_id will be forced to the oldest ID available.
# max_id - Returns results with an ID less than (that is, older than)
#or equal to the specified ID.
Dict2 = dict({'#TweeetLorraine': 1392217841680764931})
d2 = {}
rep = []
from tqdm import tqdm

for key, value in tqdm(Dict2.items()):
    for i in tweepy.Cursor(api.search,
                           q='to:{} -filter:retweets"'.format(key), lang="en",
                           since_id=value, tweet_mode='extended',
                           wait_on_rate_limit=True,
                           wait_on_rate_limit_notify=True).items(50):
        if (i.in_reply_to_status_id == value):
            rep.append(i)

from pandas.io.json import json_normalize

dfflat = pd.DataFrame()
for tweet in rep:
    df_for_tweet = json_normalize(tweet._json)
    dfflat = dfflat.append(df_for_tweet, ignore_index=True, sort=True)

d2.update(zip(dfflat["user.screen_name"].tolist(), dfflat["id"].tolist()))
d2
You can use a while loop for that:
For your use case, here is roughly the code that does what you describe. There are better ways to do it using map; I'll let you look into that if you want to know more.
Also, I'm not sure whether you want to completely clear the dict or only clear the current "i", but I think you can adapt the following snippet to your actual needs.
mydict = initial_dict
# while there is something in the dictionary
while mydict:
    value_searched = None
    for key, value in mydict.items():
        for i in tweepy.Cursor(api.search,
                               q='to:{} -filter:retweets"'.format(key), lang="en",
                               since_id=value, tweet_mode='extended',
                               wait_on_rate_limit=True,
                               wait_on_rate_limit_notify=True).items(50):
            if (i.in_reply_to_status_id == value):
                replies3.append(i)
                value_searched = i
                break
        break
    # create new dict from value retrieved
    mydict = {"#" + value_searched.user.screen_name: value_searched.id_str}
Edit 2: Using recursion
def tweepy_stub(key, value):
    if key == "TweeetLorraine" and value == 1392217841680764931:
        return [
            ("AlexBC997", 1392385334155956226),
            ("ChapinDolores", 1392432099945238529),
        ]
    elif key == "AlexBC997" and value == 1392385334155956226:
        return [("test", 139238533415595852)]
    elif key == "ChapinDolores" and value == 1392432099945238529:
        return []

def recursive(list_values, nb_recursion):
    mydict = {}
    if list_values == None or nb_recursion == 0:
        return mydict
    else:
        for name_user, tweet_id in list_values:
            mydict[(name_user, tweet_id)] = recursive(
                retrieve_direct_reply_stub(name_user, tweet_id), nb_recursion - 1
            )
        return mydict

class stub_tweepy_answer:
    def __init__(self, status_id) -> None:
        self.in_reply_to_status_id = status_id

def retrieve_direct_reply_stub(name_user, tweepy_id):
    rep = []
    d2 = []
    return tweepy_stub(name_user, tweepy_id)

def retrieve_direct_reply(name_user, tweet_id):
    rep = []
    d2 = []
    for i in tweepy_stub(name_user, tweet_id):
        val = i
        if (i.in_reply_to_status_id == tweet_id):
            rep.append(i)
    from pandas.io.json import json_normalize
    dfflat = pd.DataFrame()
    for tweet in rep:
        df_for_tweet = json_normalize(tweet._json)
        dfflat = dfflat.append(df_for_tweet, ignore_index=True, sort=True)
    d2.append(zip(dfflat["user.screen_name"].tolist(), dfflat["id"].tolist()))
    return d2

#print(retrieve_direct_reply_stub("TweeetLorraine", 1392217841680764931))
elem = [("TweeetLorraine", 1392217841680764931)]
print(recursive(elem, 3))

Python 3.6 script surprisingly slow on Windows 10 but not on Ubuntu 17.10

I recently had to write a coding challenge for a company, which was to merge 3 CSV files into one based on the first attribute of each (the attributes were repeated in all files).
I wrote the code and sent it to them, but they said it took 2 minutes to run. That was funny, because it ran for 10 seconds on my machine. My machine had the same processor, 16GB of RAM, and an SSD as well. Very similar environments.
I tried optimising it and resubmitted it. This time they said they ran it on an Ubuntu machine and got 11 seconds, while the code still ran for 100 seconds on Windows 10.
Another peculiar thing was that when I tried profiling it with the Profile module, it went on forever and I had to terminate it after 450 seconds. I switched to cProfile and it finished in 7 seconds.
EDIT: The exact formulation of the problem is
Write a console program to merge the files provided in a timely and
efficient manner. File paths should be supplied as arguments so that
the program can be evaluated on different data sets. The merged file
should be saved as CSV; use the id column as the unique key for
merging; the program should do any necessary data cleaning and error
checking.
Feel free to use any language you’re comfortable with – only
restriction is no external libraries as this defeats the purpose of
the test. If the language provides CSV parsing libraries (like
Python), please avoid using them as well as this is a part of the
test.
Without further ado here's the code:
#!/usr/bin/python3

import sys
from multiprocessing import Pool

HEADERS = ['id']

def csv_tuple_quotes_valid(a_tuple):
    """
    checks if a quotes in each attribute of a entry (i.e. a tuple) agree with the csv format
    returns True or False
    """
    for attribute in a_tuple:
        in_quotes = False
        attr_len = len(attribute)
        skip_next = False
        for i in range(0, attr_len):
            if not skip_next and attribute[i] == '\"':
                if i < attr_len - 1 and attribute[i + 1] == '\"':
                    skip_next = True
                    continue
                elif i == 0 or i == attr_len - 1:
                    in_quotes = not in_quotes
                else:
                    return False
            else:
                skip_next = False
        if in_quotes:
            return False
    return True

def check_and_parse_potential_tuple(to_parse):
    """
    receives a string and returns an array of the attributes of the csv line
    if the string was not a valid csv line, then returns False
    """
    a_tuple = []
    attribute_start_index = 0
    to_parse_len = len(to_parse)
    in_quotes = False
    i = 0
    #iterate through the string (line from the csv)
    while i < to_parse_len:
        current_char = to_parse[i]
        #this works the following way: if we meet a quote ("), it must be in one
        #of five cases: "" | ", | ," | "\0 | (start_of_string)"
        #in case we are inside a quoted attribute (i.e. "123"), then commas are ignored
        #the following code also extracts the tuples' attributes
        if current_char == '\"':
            if i == 0 or (to_parse[i - 1] == ',' and not in_quotes): # (start_of_string)" and ," case
                #not including the quote in the next attr
                attribute_start_index = i + 1
                #starting a quoted attr
                in_quotes = True
            elif i + 1 < to_parse_len:
                if to_parse[i + 1] == '\"': # "" case
                    i += 1 #skip the next " because it is part of a ""
                elif to_parse[i + 1] == ',' and in_quotes: # ", case
                    a_tuple.append(to_parse[attribute_start_index:i].strip())
                    #not including the quote and comma in the next attr
                    attribute_start_index = i + 2
                    in_quotes = False #the quoted attr has ended
                    #skip the next comma - we know what it is for
                    i += 1
                else:
                    #since we cannot have a random " in the middle of an attr
                    return False
            elif i == to_parse_len - 1: # "\0 case
                a_tuple.append(to_parse[attribute_start_index:i].strip())
                #reached end of line, so no more attr's to extract
                attribute_start_index = to_parse_len
                in_quotes = False
            else:
                return False
        elif current_char == ',':
            if not in_quotes:
                a_tuple.append(to_parse[attribute_start_index:i].strip())
                attribute_start_index = i + 1
        i += 1
    #in case the last attr was left empty or unquoted
    if attribute_start_index < to_parse_len or (not in_quotes and to_parse[-1] == ','):
        a_tuple.append(to_parse[attribute_start_index:])
    #line ended while parsing; i.e. a quote was openned but not closed
    if in_quotes:
        return False
    return a_tuple

def parse_tuple(to_parse, no_of_headers):
    """
    parses a string and returns an array with no_of_headers number of headers
    raises an error if the string was not a valid CSV line
    """
    #get rid of the newline at the end of every line
    to_parse = to_parse.strip()
    # return to_parse.split(',') #if we assume the data is in a valid format
    #the following checking of the format of the data increases the execution
    #time by a factor of 2; if the data is know to be valid, uncomment 3 lines above here
    #if there are more commas than fields, then we must take into consideration
    #how the quotes parse and then extract the attributes
    if to_parse.count(',') + 1 > no_of_headers:
        result = check_and_parse_potential_tuple(to_parse)
        if result:
            a_tuple = result
        else:
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
    else:
        a_tuple = to_parse.split(',')
        if not csv_tuple_quotes_valid(a_tuple):
            raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
    #if the format is correct but more data fields were provided
    #the following works faster than an if statement that checks the length of a_tuple
    try:
        a_tuple[no_of_headers - 1]
    except IndexError:
        raise TypeError('Error while parsing CSV line %s. Unknown reason' % to_parse)
    #this replaces the use my own hashtables to store the duplicated values for the attributes
    for i in range(1, no_of_headers):
        a_tuple[i] = sys.intern(a_tuple[i])
    return a_tuple

def read_file(path, file_number):
    """
    reads the csv file and returns (dict, int)
    the dict is the mapping of id's to attributes
    the integer is the number of attributes (headers) for the csv file
    """
    global HEADERS
    try:
        file = open(path, 'r');
    except FileNotFoundError as e:
        print("error in %s:\n%s\nexiting...")
        exit(1)
    main_table = {}
    headers = file.readline().strip().split(',')
    no_of_headers = len(headers)
    HEADERS.extend(headers[1:]) #keep the headers from the file
    lines = file.readlines()
    file.close()
    args = []
    for line in lines:
        args.append((line, no_of_headers))
    #pool is a pool of worker processes parsing the lines in parallel
    with Pool() as workers:
        try:
            all_tuples = workers.starmap(parse_tuple, args, 1000)
        except TypeError as e:
            print('Error in file %s:\n%s\nexiting thread...' % (path, e.args))
            exit(1)
    for a_tuple in all_tuples:
        #add quotes to key if needed
        key = a_tuple[0] if a_tuple[0][0] == '\"' else ('\"%s\"' % a_tuple[0])
        main_table[key] = a_tuple[1:]
    return (main_table, no_of_headers)

def merge_files():
    """
    produces a file called merged.csv
    """
    global HEADERS
    no_of_files = len(sys.argv) - 1
    processed_files = [None] * no_of_files
    for i in range(0, no_of_files):
        processed_files[i] = read_file(sys.argv[i + 1], i)
    out_file = open('merged.csv', 'w+')
    merged_str = ','.join(HEADERS)
    all_keys = {}
    #this is to ensure that we include all keys in the final file.
    #even those that are missing from some files and present in others
    for processed_file in processed_files:
        all_keys.update(processed_file[0])
    for key in all_keys:
        merged_str += '\n%s' % key
        for i in range(0, no_of_files):
            (main_table, no_of_headers) = processed_files[i]
            try:
                for attr in main_table[key]:
                    merged_str += ',%s' % attr
            except KeyError:
                print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
                merged_str += ',' * (no_of_headers - 1)
    out_file.write(merged_str)
    out_file.close()

if __name__ == '__main__':
    # merge_files()
    import cProfile
    cProfile.run('merge_files()')
    # import time
    # start = time.time()
    # print(time.time() - start);
Here is the profiler report I got on my Windows machine.
EDIT: The rest of the csv data provided is here. Pastebin was taking too long to process the files, so...
It might not be the best code, and I know that, but my question is: what slows Windows down so much that doesn't slow down Ubuntu? The merge_files() function takes the longest, with 94 seconds just for itself, not including the calls to other functions. And there doesn't seem to be anything too obvious to me about why it is so slow.
Thanks
EDIT: Note: We both used the same dataset to run the code with.
It turns out that Windows and Linux handle very long strings differently. When I moved the out_file.write(merged_str) inside the outer for loop (for key in all_keys:) and stopped appending to merged_str, it ran for 11 seconds as expected. I don't have enough knowledge of either OS's memory management to predict why it is so different.
But I would say that the second way (the one that behaves well on Windows) is the more fail-safe method, because it is unreasonable to keep a 30 MB string in memory. It just turns out that Linux handles that better, and doesn't always try to keep the string in cache or rebuild it every time.
Funnily enough, I initially ran it a few times on my Linux machine with both writing strategies, and the one with the large string seemed to go faster, so I stuck with it. I guess you never know.
Here's the modified code:
for key in all_keys:
    merged_str = '%s' % key
    for i in range(0, no_of_files):
        (main_table, no_of_headers) = processed_files[i]
        try:
            for attr in main_table[key]:
                merged_str += ',%s' % attr
        except KeyError:
            print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
            merged_str += ',' * (no_of_headers - 1)
    out_file.write(merged_str + '\n')

out_file.close()
When I run your solution on Ubuntu 16.04 with the three given files, it seems to take ~8 seconds to complete. The only modification I made was to uncomment the timing code at the bottom and use it.
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
8.039648056030273
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
7.78482985496521
I rewrote my first attempt without using csv from the standard library and am now getting times of ~4.3 seconds.
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.332579612731934
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.305467367172241
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.27345871925354
This is my solution code (lettuce_merge.py):
from collections import defaultdict

def split_row(csv_row):
    return [col.strip('"') for col in csv_row.rstrip().split(',')]

def merge_csv_files(files):
    file_headers = []
    merged_headers = []
    for i, file in enumerate(files):
        current_header = split_row(next(file))
        unique_key, *current_header = current_header
        if i == 0:
            merged_headers.append(unique_key)
        merged_headers.extend(current_header)
        file_headers.append(current_header)

    result = defaultdict(lambda: [''] * (len(merged_headers) - 1))
    for file_header, file in zip(file_headers, files):
        for line in file:
            key, *values = split_row(line)
            for col_name, col_value in zip(file_header, values):
                result[key][merged_headers.index(col_name) - 1] = col_value
        file.close()

    quotes = '"{}"'.format
    with open('lettuce_merged.csv', 'w') as f:
        f.write(','.join(quotes(a) for a in merged_headers) + '\n')
        for key, values in result.items():
            f.write(','.join(quotes(b) for b in [key] + values) + '\n')

if __name__ == '__main__':
    from argparse import ArgumentParser, FileType
    from time import time

    parser = ArgumentParser()
    parser.add_argument('files', nargs='*', type=FileType('r'))
    args = parser.parse_args()

    start_time = time()
    merge_csv_files(args.files)
    print(time() - start_time)
I'm sure this code could be optimized even further but sometimes just seeing another way to solve a problem can help spark new ideas.

Shared memory dictionary creation too slow using multiprocessing.Manager()

I have some code in which I need to read an Excel file and store the information in dictionaries.
I have to use multiprocessing.Manager() to create the dictionaries in order to be able to retrieve calculation output from a function that I run using multiprocessing.Process.
The problem is that when multiprocessing.Manager() and manager.dict() are used to create a dictionary, it takes ~400 times longer than using only dict() (and dict() is not a shared memory structure).
Here is some sample code to verify the difference:
import xlrd
import multiprocessing
import time

def DictManager(inp1, inp2):
    manager = multiprocessing.Manager()
    Dict = manager.dict()
    Dict['input1'] = inp1
    Dict['input2'] = inp2
    Dict['Output1'] = None
    Dict['Output2'] = None
    return Dict

def DictNoManager(inp1, inp2):
    Dict = dict()
    Dict['input1'] = inp1
    Dict['input2'] = inp2
    Dict['Output1'] = None
    Dict['Output2'] = None
    return Dict

def ReadFileManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictManager(inp1, inp2)
        DictList.append(dictionary)
    print 'Done!'

def ReadFileNoManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictNoManager(inp1, inp2)
        DictList.append(dictionary)
    print 'Done!'

if __name__ == '__main__':
    excelfile = 'MyFile.xlsx'

    start = time.time()
    ReadFileNoManager(excelfile)
    end = time.time()
    print 'Run time NoManager:', end - start, 's'

    start = time.time()
    ReadFileManager(excelfile)
    end = time.time()
    print 'Run time Manager:', end - start, 's'
Is there a way to improve the performance of multiprocessing.Manager()?
If the answer is No, is there any other shared memory structure that I can use to replace what I am doing and improve performance?
I would appreciate your help!
EDIT:
My main function uses the following code:
def MyFunction(Dictionary, otherdata):
    #Perform calculation and save results in the dictionary
    Dictionary['Output1'] = Value1
    Dictionary['Output2'] = Value2

ListOfProcesses = []
for Dict in DictList:
    p = multiprocessing.Process(target=MyFunction, args=(Dict, otherdata))
    p.start()
    ListOfProcesses.append(p)

for p in ListOfProcesses:
    p.join()

If I do not use the manager, I will not be able to retrieve the Outputs.
As I mentioned in the comments, I recommend using the main process to read in the Excel file, then using multiprocessing for the function calls. Just add your function body to apply_function and make sure it returns whatever you want; results will contain a list of your results.
Update: I changed map to starmap to include your extra argument
from itertools import repeat

def ReadFileNoManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictNoManager(inp1, inp2)
        DictList.append(dictionary)
    print 'Done!'
    return DictList

def apply_function(your_dict, otherdata):
    pass

if __name__ == '__main__':
    excelfile = 'MyFile.xlsx'
    dict_list = ReadFileNoManager(excelfile)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    results = pool.starmap(apply_function, zip(dict_list, repeat(otherdata)))
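To close the loop on retrieving the outputs without a Manager: since each worker returns an ordinary dictionary, the parent simply collects them from results. Here is a minimal, self-contained sketch of that idea in Python 3 (the Output1/Output2 fields and the otherdata argument come from the question; the sample input dicts and the doubling "calculation" are placeholders):

from itertools import repeat
from multiprocessing import Pool

def apply_function(your_dict, otherdata):
    # placeholder calculation; the key point is to return the dict
    # instead of mutating shared state
    your_dict['Output1'] = your_dict['input1'] * 2   # hypothetical computation
    your_dict['Output2'] = your_dict['input2'] * 2   # hypothetical computation
    return your_dict

if __name__ == '__main__':
    dict_list = [
        {'input1': 1, 'input2': 2, 'Output1': None, 'Output2': None},
        {'input1': 3, 'input2': 4, 'Output1': None, 'Output2': None},
    ]
    otherdata = None  # stands in for the extra argument from the question
    with Pool() as pool:
        results = pool.starmap(apply_function, zip(dict_list, repeat(otherdata)))
    # results is a plain list of dicts with the outputs filled in, in input order
    print(results)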

Python List of Dictionaries Only See Last Element

Struggling to figure out why this doesn't work. It should. But when I create a list of dictionaries and then look through that list, I only ever see the final entry from the list:
alerts = []
alertDict = {}
af = open("C:\snort.txt")
for line in af:
    m = re.match(r'([0-9/]+)-([0-9:.]+)\s+.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})\s+->\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})', line)
    if m:
        attacktime = m.group(2)
        srcip = m.group(3)
        srcprt = m.group(4)
        dstip = m.group(5)
        dstprt = m.group(6)
        alertDict['Time'] = attacktime
        alertDict['Source IP'] = srcip
        alertDict['Destination IP'] = dstip
        alerts.append(alertDict)

for alert in alerts:
    if alert["Time"] == "13:13:42.443062":
        print "Found Time"
You create exactly one dict at the beginning of the script, and then append that one dict to the list multiple times.
Try creating multiple individual dicts, by moving the initialization to the inside of the loop.
alerts = []
af = open("C:\snort.txt")
for line in af:
    alertDict = {}
    #rest of loop goes here
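For completeness, here is a sketch of the full loop with the initialization moved inside; the regex and field names are taken from the question, and only the placement of alertDict = {} changes:

import re

alerts = []
af = open("C:\snort.txt")
for line in af:
    # a fresh dict for every line, so each appended alert is a distinct object
    alertDict = {}
    m = re.match(r'([0-9/]+)-([0-9:.]+)\s+.*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})\s+->\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})', line)
    if m:
        alertDict['Time'] = m.group(2)
        alertDict['Source IP'] = m.group(3)
        alertDict['Destination IP'] = m.group(5)
        alerts.append(alertDict)
af.close()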

How to preserve file write order when using threading in python

I have some Python code that reads a file and pushes the data to a list. I then put this list into a queue and use threading to process the list, say 20 items at a time. After processing, I save the results to a new file. What was put in the new file was in a different order than the original file. For example, my input is:
1 a
2 b
3 c
4 a
5 d
But the output looks like:
2 aa
1 ba
4 aa
5 da
3 ca
Is there any way to preserve the original order?
Here is my code:
import threading, Queue, time, sys

class eSS(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.lock = threading.Lock()

    def ess(self, email, code, suggested, comment, reason, dlx_score):
        #do something

    def run(self):
        while True:
            info = self.queue.get()
            infolist = info.split('\t')
            email = infolist[1]
            code = infolist[2]
            suggested = infolist[3]
            comment = infolist[4]
            reason = infolist[5]
            dlx_score = (0 if infolist[6] == 'NULL' else int(infolist[6]))
            g.write(info + '\t' + self.ess(email, code, suggested, comment, reason, dlx_score) + '\r\n')
            self.queue.task_done()

if __name__ == "__main__":
    queue = Queue.Queue()
    filename = sys.argv[1]

    #Define number of threads
    threads = 20

    f = open(filename, 'r')
    g = open(filename + '.eSS', 'w')
    lines = f.read().splitlines()
    f.close()

    start = time.time()

    for i in range(threads):
        t = eSS(queue)
        t.setDaemon(True)
        t.start()

    for line in lines:
        queue.put(line)
    queue.join()

    print time.time() - start
    g.close()
Three thoughts come to mind. Common to all is to include an index with the packet that is queued for processing.
One thought then is to use the controller/workers/output framework in which the output thread de-queues the worker-processed data, assembles, and outputs it.
The second thought is to employ a memory-mapped file for output, and use the index to calculate the offset to write into the file (assumes fixed-length writes probably).
The third is to use the index to put processed data in a new list, and when the list is completed write the items out at the end rather than on the fly.
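For example, the third idea can be sketched like this (Python 3 here; process_line stands in for the real ess() work, and the sample lines come from the question):

import threading
import queue  # the Queue module in Python 2

def process_line(line):
    # placeholder for the real per-line work
    return line + ' a'

def worker(in_queue, results):
    while True:
        item = in_queue.get()
        if item is None:                        # sentinel: no more work
            in_queue.task_done()
            return
        index, line = item
        results[index] = process_line(line)     # slot the result by its original index
        in_queue.task_done()

if __name__ == '__main__':
    lines = ['1 a', '2 b', '3 c', '4 a', '5 d']
    threads = 4

    in_queue = queue.Queue()
    results = [None] * len(lines)               # pre-sized list indexed by line number

    pool = [threading.Thread(target=worker, args=(in_queue, results)) for _ in range(threads)]
    for t in pool:
        t.start()

    for index, line in enumerate(lines):        # queue (index, line) pairs
        in_queue.put((index, line))
    for _ in pool:
        in_queue.put(None)                      # one sentinel per worker

    in_queue.join()
    for t in pool:
        t.join()

    with open('output.txt', 'w') as g:          # write once, in the original order
        for processed in results:
            g.write(processed + '\n')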
